##// END OF EJS Templates
encoding: use raw strings for encoding arguments...
Gregory Szorc -
r42002:25694a78 default
parent child Browse files
Show More
@@ -1,616 +1,616 b''
1 # encoding.py - character transcoding support for Mercurial
1 # encoding.py - character transcoding support for Mercurial
2 #
2 #
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8 from __future__ import absolute_import, print_function
8 from __future__ import absolute_import, print_function
9
9
10 import locale
10 import locale
11 import os
11 import os
12 import unicodedata
12 import unicodedata
13
13
14 from . import (
14 from . import (
15 error,
15 error,
16 policy,
16 policy,
17 pycompat,
17 pycompat,
18 )
18 )
19
19
20 from .pure import (
20 from .pure import (
21 charencode as charencodepure,
21 charencode as charencodepure,
22 )
22 )
23
23
24 charencode = policy.importmod(r'charencode')
24 charencode = policy.importmod(r'charencode')
25
25
26 isasciistr = charencode.isasciistr
26 isasciistr = charencode.isasciistr
27 asciilower = charencode.asciilower
27 asciilower = charencode.asciilower
28 asciiupper = charencode.asciiupper
28 asciiupper = charencode.asciiupper
29 _jsonescapeu8fast = charencode.jsonescapeu8fast
29 _jsonescapeu8fast = charencode.jsonescapeu8fast
30
30
31 _sysstr = pycompat.sysstr
31 _sysstr = pycompat.sysstr
32
32
33 if pycompat.ispy3:
33 if pycompat.ispy3:
34 unichr = chr
34 unichr = chr
35
35
36 # These unicode characters are ignored by HFS+ (Apple Technote 1150,
36 # These unicode characters are ignored by HFS+ (Apple Technote 1150,
37 # "Unicode Subtleties"), so we need to ignore them in some places for
37 # "Unicode Subtleties"), so we need to ignore them in some places for
38 # sanity.
38 # sanity.
39 _ignore = [unichr(int(x, 16)).encode("utf-8") for x in
39 _ignore = [unichr(int(x, 16)).encode("utf-8") for x in
40 "200c 200d 200e 200f 202a 202b 202c 202d 202e "
40 "200c 200d 200e 200f 202a 202b 202c 202d 202e "
41 "206a 206b 206c 206d 206e 206f feff".split()]
41 "206a 206b 206c 206d 206e 206f feff".split()]
42 # verify the next function will work
42 # verify the next function will work
43 assert all(i.startswith(("\xe2", "\xef")) for i in _ignore)
43 assert all(i.startswith(("\xe2", "\xef")) for i in _ignore)
44
44
45 def hfsignoreclean(s):
45 def hfsignoreclean(s):
46 """Remove codepoints ignored by HFS+ from s.
46 """Remove codepoints ignored by HFS+ from s.
47
47
48 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
48 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
49 '.hg'
49 '.hg'
50 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
50 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
51 '.hg'
51 '.hg'
52 """
52 """
53 if "\xe2" in s or "\xef" in s:
53 if "\xe2" in s or "\xef" in s:
54 for c in _ignore:
54 for c in _ignore:
55 s = s.replace(c, '')
55 s = s.replace(c, '')
56 return s
56 return s
57
57
58 # encoding.environ is provided read-only, which may not be used to modify
58 # encoding.environ is provided read-only, which may not be used to modify
59 # the process environment
59 # the process environment
60 _nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ)
60 _nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ)
61 if not pycompat.ispy3:
61 if not pycompat.ispy3:
62 environ = os.environ # re-exports
62 environ = os.environ # re-exports
63 elif _nativeenviron:
63 elif _nativeenviron:
64 environ = os.environb # re-exports
64 environ = os.environb # re-exports
65 else:
65 else:
66 # preferred encoding isn't known yet; use utf-8 to avoid unicode error
66 # preferred encoding isn't known yet; use utf-8 to avoid unicode error
67 # and recreate it once encoding is settled
67 # and recreate it once encoding is settled
68 environ = dict((k.encode(u'utf-8'), v.encode(u'utf-8'))
68 environ = dict((k.encode(r'utf-8'), v.encode(r'utf-8'))
69 for k, v in os.environ.items()) # re-exports
69 for k, v in os.environ.items()) # re-exports
70
70
71 _encodingrewrites = {
71 _encodingrewrites = {
72 '646': 'ascii',
72 '646': 'ascii',
73 'ANSI_X3.4-1968': 'ascii',
73 'ANSI_X3.4-1968': 'ascii',
74 }
74 }
75 # cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
75 # cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
76 # No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
76 # No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
77 # https://bugs.python.org/issue13216
77 # https://bugs.python.org/issue13216
78 if pycompat.iswindows and not pycompat.ispy3:
78 if pycompat.iswindows and not pycompat.ispy3:
79 _encodingrewrites['cp65001'] = 'utf-8'
79 _encodingrewrites['cp65001'] = 'utf-8'
80
80
81 try:
81 try:
82 encoding = environ.get("HGENCODING")
82 encoding = environ.get("HGENCODING")
83 if not encoding:
83 if not encoding:
84 encoding = locale.getpreferredencoding().encode('ascii') or 'ascii'
84 encoding = locale.getpreferredencoding().encode('ascii') or 'ascii'
85 encoding = _encodingrewrites.get(encoding, encoding)
85 encoding = _encodingrewrites.get(encoding, encoding)
86 except locale.Error:
86 except locale.Error:
87 encoding = 'ascii'
87 encoding = 'ascii'
88 encodingmode = environ.get("HGENCODINGMODE", "strict")
88 encodingmode = environ.get("HGENCODINGMODE", "strict")
89 fallbackencoding = 'ISO-8859-1'
89 fallbackencoding = 'ISO-8859-1'
90
90
91 class localstr(bytes):
91 class localstr(bytes):
92 '''This class allows strings that are unmodified to be
92 '''This class allows strings that are unmodified to be
93 round-tripped to the local encoding and back'''
93 round-tripped to the local encoding and back'''
94 def __new__(cls, u, l):
94 def __new__(cls, u, l):
95 s = bytes.__new__(cls, l)
95 s = bytes.__new__(cls, l)
96 s._utf8 = u
96 s._utf8 = u
97 return s
97 return s
98 def __hash__(self):
98 def __hash__(self):
99 return hash(self._utf8) # avoid collisions in local string space
99 return hash(self._utf8) # avoid collisions in local string space
100
100
101 class safelocalstr(bytes):
101 class safelocalstr(bytes):
102 """Tagged string denoting it was previously an internal UTF-8 string,
102 """Tagged string denoting it was previously an internal UTF-8 string,
103 and can be converted back to UTF-8 losslessly
103 and can be converted back to UTF-8 losslessly
104
104
105 >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
105 >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
106 >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
106 >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
107 >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
107 >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
108 >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
108 >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
109 """
109 """
110
110
111 def tolocal(s):
111 def tolocal(s):
112 """
112 """
113 Convert a string from internal UTF-8 to local encoding
113 Convert a string from internal UTF-8 to local encoding
114
114
115 All internal strings should be UTF-8 but some repos before the
115 All internal strings should be UTF-8 but some repos before the
116 implementation of locale support may contain latin1 or possibly
116 implementation of locale support may contain latin1 or possibly
117 other character sets. We attempt to decode everything strictly
117 other character sets. We attempt to decode everything strictly
118 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
118 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
119 replace unknown characters.
119 replace unknown characters.
120
120
121 The localstr class is used to cache the known UTF-8 encoding of
121 The localstr class is used to cache the known UTF-8 encoding of
122 strings next to their local representation to allow lossless
122 strings next to their local representation to allow lossless
123 round-trip conversion back to UTF-8.
123 round-trip conversion back to UTF-8.
124
124
125 >>> u = b'foo: \\xc3\\xa4' # utf-8
125 >>> u = b'foo: \\xc3\\xa4' # utf-8
126 >>> l = tolocal(u)
126 >>> l = tolocal(u)
127 >>> l
127 >>> l
128 'foo: ?'
128 'foo: ?'
129 >>> fromlocal(l)
129 >>> fromlocal(l)
130 'foo: \\xc3\\xa4'
130 'foo: \\xc3\\xa4'
131 >>> u2 = b'foo: \\xc3\\xa1'
131 >>> u2 = b'foo: \\xc3\\xa1'
132 >>> d = { l: 1, tolocal(u2): 2 }
132 >>> d = { l: 1, tolocal(u2): 2 }
133 >>> len(d) # no collision
133 >>> len(d) # no collision
134 2
134 2
135 >>> b'foo: ?' in d
135 >>> b'foo: ?' in d
136 False
136 False
137 >>> l1 = b'foo: \\xe4' # historical latin1 fallback
137 >>> l1 = b'foo: \\xe4' # historical latin1 fallback
138 >>> l = tolocal(l1)
138 >>> l = tolocal(l1)
139 >>> l
139 >>> l
140 'foo: ?'
140 'foo: ?'
141 >>> fromlocal(l) # magically in utf-8
141 >>> fromlocal(l) # magically in utf-8
142 'foo: \\xc3\\xa4'
142 'foo: \\xc3\\xa4'
143 """
143 """
144
144
145 if isasciistr(s):
145 if isasciistr(s):
146 return s
146 return s
147
147
148 try:
148 try:
149 try:
149 try:
150 # make sure string is actually stored in UTF-8
150 # make sure string is actually stored in UTF-8
151 u = s.decode('UTF-8')
151 u = s.decode('UTF-8')
152 if encoding == 'UTF-8':
152 if encoding == 'UTF-8':
153 # fast path
153 # fast path
154 return s
154 return s
155 r = u.encode(_sysstr(encoding), u"replace")
155 r = u.encode(_sysstr(encoding), r"replace")
156 if u == r.decode(_sysstr(encoding)):
156 if u == r.decode(_sysstr(encoding)):
157 # r is a safe, non-lossy encoding of s
157 # r is a safe, non-lossy encoding of s
158 return safelocalstr(r)
158 return safelocalstr(r)
159 return localstr(s, r)
159 return localstr(s, r)
160 except UnicodeDecodeError:
160 except UnicodeDecodeError:
161 # we should only get here if we're looking at an ancient changeset
161 # we should only get here if we're looking at an ancient changeset
162 try:
162 try:
163 u = s.decode(_sysstr(fallbackencoding))
163 u = s.decode(_sysstr(fallbackencoding))
164 r = u.encode(_sysstr(encoding), u"replace")
164 r = u.encode(_sysstr(encoding), r"replace")
165 if u == r.decode(_sysstr(encoding)):
165 if u == r.decode(_sysstr(encoding)):
166 # r is a safe, non-lossy encoding of s
166 # r is a safe, non-lossy encoding of s
167 return safelocalstr(r)
167 return safelocalstr(r)
168 return localstr(u.encode('UTF-8'), r)
168 return localstr(u.encode('UTF-8'), r)
169 except UnicodeDecodeError:
169 except UnicodeDecodeError:
170 u = s.decode("utf-8", "replace") # last ditch
170 u = s.decode("utf-8", "replace") # last ditch
171 # can't round-trip
171 # can't round-trip
172 return u.encode(_sysstr(encoding), u"replace")
172 return u.encode(_sysstr(encoding), r"replace")
173 except LookupError as k:
173 except LookupError as k:
174 raise error.Abort(k, hint="please check your locale settings")
174 raise error.Abort(k, hint="please check your locale settings")
175
175
176 def fromlocal(s):
176 def fromlocal(s):
177 """
177 """
178 Convert a string from the local character encoding to UTF-8
178 Convert a string from the local character encoding to UTF-8
179
179
180 We attempt to decode strings using the encoding mode set by
180 We attempt to decode strings using the encoding mode set by
181 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
181 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
182 characters will cause an error message. Other modes include
182 characters will cause an error message. Other modes include
183 'replace', which replaces unknown characters with a special
183 'replace', which replaces unknown characters with a special
184 Unicode character, and 'ignore', which drops the character.
184 Unicode character, and 'ignore', which drops the character.
185 """
185 """
186
186
187 # can we do a lossless round-trip?
187 # can we do a lossless round-trip?
188 if isinstance(s, localstr):
188 if isinstance(s, localstr):
189 return s._utf8
189 return s._utf8
190 if isasciistr(s):
190 if isasciistr(s):
191 return s
191 return s
192
192
193 try:
193 try:
194 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
194 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
195 return u.encode("utf-8")
195 return u.encode("utf-8")
196 except UnicodeDecodeError as inst:
196 except UnicodeDecodeError as inst:
197 sub = s[max(0, inst.start - 10):inst.start + 10]
197 sub = s[max(0, inst.start - 10):inst.start + 10]
198 raise error.Abort("decoding near '%s': %s!"
198 raise error.Abort("decoding near '%s': %s!"
199 % (sub, pycompat.bytestr(inst)))
199 % (sub, pycompat.bytestr(inst)))
200 except LookupError as k:
200 except LookupError as k:
201 raise error.Abort(k, hint="please check your locale settings")
201 raise error.Abort(k, hint="please check your locale settings")
202
202
203 def unitolocal(u):
203 def unitolocal(u):
204 """Convert a unicode string to a byte string of local encoding"""
204 """Convert a unicode string to a byte string of local encoding"""
205 return tolocal(u.encode('utf-8'))
205 return tolocal(u.encode('utf-8'))
206
206
207 def unifromlocal(s):
207 def unifromlocal(s):
208 """Convert a byte string of local encoding to a unicode string"""
208 """Convert a byte string of local encoding to a unicode string"""
209 return fromlocal(s).decode('utf-8')
209 return fromlocal(s).decode('utf-8')
210
210
211 def unimethod(bytesfunc):
211 def unimethod(bytesfunc):
212 """Create a proxy method that forwards __unicode__() and __str__() of
212 """Create a proxy method that forwards __unicode__() and __str__() of
213 Python 3 to __bytes__()"""
213 Python 3 to __bytes__()"""
214 def unifunc(obj):
214 def unifunc(obj):
215 return unifromlocal(bytesfunc(obj))
215 return unifromlocal(bytesfunc(obj))
216 return unifunc
216 return unifunc
217
217
218 # converter functions between native str and byte string. use these if the
218 # converter functions between native str and byte string. use these if the
219 # character encoding is not aware (e.g. exception message) or is known to
219 # character encoding is not aware (e.g. exception message) or is known to
220 # be locale dependent (e.g. date formatting.)
220 # be locale dependent (e.g. date formatting.)
221 if pycompat.ispy3:
221 if pycompat.ispy3:
222 strtolocal = unitolocal
222 strtolocal = unitolocal
223 strfromlocal = unifromlocal
223 strfromlocal = unifromlocal
224 strmethod = unimethod
224 strmethod = unimethod
225 else:
225 else:
226 strtolocal = pycompat.identity
226 strtolocal = pycompat.identity
227 strfromlocal = pycompat.identity
227 strfromlocal = pycompat.identity
228 strmethod = pycompat.identity
228 strmethod = pycompat.identity
229
229
230 if not _nativeenviron:
230 if not _nativeenviron:
231 # now encoding and helper functions are available, recreate the environ
231 # now encoding and helper functions are available, recreate the environ
232 # dict to be exported to other modules
232 # dict to be exported to other modules
233 environ = dict((tolocal(k.encode(u'utf-8')), tolocal(v.encode(u'utf-8')))
233 environ = dict((tolocal(k.encode(r'utf-8')), tolocal(v.encode(r'utf-8')))
234 for k, v in os.environ.items()) # re-exports
234 for k, v in os.environ.items()) # re-exports
235
235
236 if pycompat.ispy3:
236 if pycompat.ispy3:
237 # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
237 # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
238 # returns bytes.
238 # returns bytes.
239 if pycompat.iswindows:
239 if pycompat.iswindows:
240 # Python 3 on Windows issues a DeprecationWarning about using the bytes
240 # Python 3 on Windows issues a DeprecationWarning about using the bytes
241 # API when os.getcwdb() is called.
241 # API when os.getcwdb() is called.
242 getcwd = lambda: strtolocal(os.getcwd()) # re-exports
242 getcwd = lambda: strtolocal(os.getcwd()) # re-exports
243 else:
243 else:
244 getcwd = os.getcwdb # re-exports
244 getcwd = os.getcwdb # re-exports
245 else:
245 else:
246 getcwd = os.getcwd # re-exports
246 getcwd = os.getcwd # re-exports
247
247
248 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
248 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
249 _wide = _sysstr(environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
249 _wide = _sysstr(environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
250 and "WFA" or "WF")
250 and "WFA" or "WF")
251
251
252 def colwidth(s):
252 def colwidth(s):
253 "Find the column width of a string for display in the local encoding"
253 "Find the column width of a string for display in the local encoding"
254 return ucolwidth(s.decode(_sysstr(encoding), u'replace'))
254 return ucolwidth(s.decode(_sysstr(encoding), r'replace'))
255
255
256 def ucolwidth(d):
256 def ucolwidth(d):
257 "Find the column width of a Unicode string for display"
257 "Find the column width of a Unicode string for display"
258 eaw = getattr(unicodedata, 'east_asian_width', None)
258 eaw = getattr(unicodedata, 'east_asian_width', None)
259 if eaw is not None:
259 if eaw is not None:
260 return sum([eaw(c) in _wide and 2 or 1 for c in d])
260 return sum([eaw(c) in _wide and 2 or 1 for c in d])
261 return len(d)
261 return len(d)
262
262
263 def getcols(s, start, c):
263 def getcols(s, start, c):
264 '''Use colwidth to find a c-column substring of s starting at byte
264 '''Use colwidth to find a c-column substring of s starting at byte
265 index start'''
265 index start'''
266 for x in pycompat.xrange(start + c, len(s)):
266 for x in pycompat.xrange(start + c, len(s)):
267 t = s[start:x]
267 t = s[start:x]
268 if colwidth(t) == c:
268 if colwidth(t) == c:
269 return t
269 return t
270
270
271 def trim(s, width, ellipsis='', leftside=False):
271 def trim(s, width, ellipsis='', leftside=False):
272 """Trim string 's' to at most 'width' columns (including 'ellipsis').
272 """Trim string 's' to at most 'width' columns (including 'ellipsis').
273
273
274 If 'leftside' is True, left side of string 's' is trimmed.
274 If 'leftside' is True, left side of string 's' is trimmed.
275 'ellipsis' is always placed at trimmed side.
275 'ellipsis' is always placed at trimmed side.
276
276
277 >>> from .node import bin
277 >>> from .node import bin
278 >>> def bprint(s):
278 >>> def bprint(s):
279 ... print(pycompat.sysstr(s))
279 ... print(pycompat.sysstr(s))
280 >>> ellipsis = b'+++'
280 >>> ellipsis = b'+++'
281 >>> from . import encoding
281 >>> from . import encoding
282 >>> encoding.encoding = b'utf-8'
282 >>> encoding.encoding = b'utf-8'
283 >>> t = b'1234567890'
283 >>> t = b'1234567890'
284 >>> bprint(trim(t, 12, ellipsis=ellipsis))
284 >>> bprint(trim(t, 12, ellipsis=ellipsis))
285 1234567890
285 1234567890
286 >>> bprint(trim(t, 10, ellipsis=ellipsis))
286 >>> bprint(trim(t, 10, ellipsis=ellipsis))
287 1234567890
287 1234567890
288 >>> bprint(trim(t, 8, ellipsis=ellipsis))
288 >>> bprint(trim(t, 8, ellipsis=ellipsis))
289 12345+++
289 12345+++
290 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
290 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
291 +++67890
291 +++67890
292 >>> bprint(trim(t, 8))
292 >>> bprint(trim(t, 8))
293 12345678
293 12345678
294 >>> bprint(trim(t, 8, leftside=True))
294 >>> bprint(trim(t, 8, leftside=True))
295 34567890
295 34567890
296 >>> bprint(trim(t, 3, ellipsis=ellipsis))
296 >>> bprint(trim(t, 3, ellipsis=ellipsis))
297 +++
297 +++
298 >>> bprint(trim(t, 1, ellipsis=ellipsis))
298 >>> bprint(trim(t, 1, ellipsis=ellipsis))
299 +
299 +
300 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
300 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
301 >>> t = u.encode(pycompat.sysstr(encoding.encoding))
301 >>> t = u.encode(pycompat.sysstr(encoding.encoding))
302 >>> bprint(trim(t, 12, ellipsis=ellipsis))
302 >>> bprint(trim(t, 12, ellipsis=ellipsis))
303 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
303 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
304 >>> bprint(trim(t, 10, ellipsis=ellipsis))
304 >>> bprint(trim(t, 10, ellipsis=ellipsis))
305 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
305 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
306 >>> bprint(trim(t, 8, ellipsis=ellipsis))
306 >>> bprint(trim(t, 8, ellipsis=ellipsis))
307 \xe3\x81\x82\xe3\x81\x84+++
307 \xe3\x81\x82\xe3\x81\x84+++
308 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
308 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
309 +++\xe3\x81\x88\xe3\x81\x8a
309 +++\xe3\x81\x88\xe3\x81\x8a
310 >>> bprint(trim(t, 5))
310 >>> bprint(trim(t, 5))
311 \xe3\x81\x82\xe3\x81\x84
311 \xe3\x81\x82\xe3\x81\x84
312 >>> bprint(trim(t, 5, leftside=True))
312 >>> bprint(trim(t, 5, leftside=True))
313 \xe3\x81\x88\xe3\x81\x8a
313 \xe3\x81\x88\xe3\x81\x8a
314 >>> bprint(trim(t, 4, ellipsis=ellipsis))
314 >>> bprint(trim(t, 4, ellipsis=ellipsis))
315 +++
315 +++
316 >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
316 >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
317 +++
317 +++
318 >>> t = bin(b'112233445566778899aa') # invalid byte sequence
318 >>> t = bin(b'112233445566778899aa') # invalid byte sequence
319 >>> bprint(trim(t, 12, ellipsis=ellipsis))
319 >>> bprint(trim(t, 12, ellipsis=ellipsis))
320 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
320 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
321 >>> bprint(trim(t, 10, ellipsis=ellipsis))
321 >>> bprint(trim(t, 10, ellipsis=ellipsis))
322 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
322 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
323 >>> bprint(trim(t, 8, ellipsis=ellipsis))
323 >>> bprint(trim(t, 8, ellipsis=ellipsis))
324 \x11\x22\x33\x44\x55+++
324 \x11\x22\x33\x44\x55+++
325 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
325 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
326 +++\x66\x77\x88\x99\xaa
326 +++\x66\x77\x88\x99\xaa
327 >>> bprint(trim(t, 8))
327 >>> bprint(trim(t, 8))
328 \x11\x22\x33\x44\x55\x66\x77\x88
328 \x11\x22\x33\x44\x55\x66\x77\x88
329 >>> bprint(trim(t, 8, leftside=True))
329 >>> bprint(trim(t, 8, leftside=True))
330 \x33\x44\x55\x66\x77\x88\x99\xaa
330 \x33\x44\x55\x66\x77\x88\x99\xaa
331 >>> bprint(trim(t, 3, ellipsis=ellipsis))
331 >>> bprint(trim(t, 3, ellipsis=ellipsis))
332 +++
332 +++
333 >>> bprint(trim(t, 1, ellipsis=ellipsis))
333 >>> bprint(trim(t, 1, ellipsis=ellipsis))
334 +
334 +
335 """
335 """
336 try:
336 try:
337 u = s.decode(_sysstr(encoding))
337 u = s.decode(_sysstr(encoding))
338 except UnicodeDecodeError:
338 except UnicodeDecodeError:
339 if len(s) <= width: # trimming is not needed
339 if len(s) <= width: # trimming is not needed
340 return s
340 return s
341 width -= len(ellipsis)
341 width -= len(ellipsis)
342 if width <= 0: # no enough room even for ellipsis
342 if width <= 0: # no enough room even for ellipsis
343 return ellipsis[:width + len(ellipsis)]
343 return ellipsis[:width + len(ellipsis)]
344 if leftside:
344 if leftside:
345 return ellipsis + s[-width:]
345 return ellipsis + s[-width:]
346 return s[:width] + ellipsis
346 return s[:width] + ellipsis
347
347
348 if ucolwidth(u) <= width: # trimming is not needed
348 if ucolwidth(u) <= width: # trimming is not needed
349 return s
349 return s
350
350
351 width -= len(ellipsis)
351 width -= len(ellipsis)
352 if width <= 0: # no enough room even for ellipsis
352 if width <= 0: # no enough room even for ellipsis
353 return ellipsis[:width + len(ellipsis)]
353 return ellipsis[:width + len(ellipsis)]
354
354
355 if leftside:
355 if leftside:
356 uslice = lambda i: u[i:]
356 uslice = lambda i: u[i:]
357 concat = lambda s: ellipsis + s
357 concat = lambda s: ellipsis + s
358 else:
358 else:
359 uslice = lambda i: u[:-i]
359 uslice = lambda i: u[:-i]
360 concat = lambda s: s + ellipsis
360 concat = lambda s: s + ellipsis
361 for i in pycompat.xrange(1, len(u)):
361 for i in pycompat.xrange(1, len(u)):
362 usub = uslice(i)
362 usub = uslice(i)
363 if ucolwidth(usub) <= width:
363 if ucolwidth(usub) <= width:
364 return concat(usub.encode(_sysstr(encoding)))
364 return concat(usub.encode(_sysstr(encoding)))
365 return ellipsis # no enough room for multi-column characters
365 return ellipsis # no enough room for multi-column characters
366
366
367 def lower(s):
367 def lower(s):
368 "best-effort encoding-aware case-folding of local string s"
368 "best-effort encoding-aware case-folding of local string s"
369 try:
369 try:
370 return asciilower(s)
370 return asciilower(s)
371 except UnicodeDecodeError:
371 except UnicodeDecodeError:
372 pass
372 pass
373 try:
373 try:
374 if isinstance(s, localstr):
374 if isinstance(s, localstr):
375 u = s._utf8.decode("utf-8")
375 u = s._utf8.decode("utf-8")
376 else:
376 else:
377 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
377 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
378
378
379 lu = u.lower()
379 lu = u.lower()
380 if u == lu:
380 if u == lu:
381 return s # preserve localstring
381 return s # preserve localstring
382 return lu.encode(_sysstr(encoding))
382 return lu.encode(_sysstr(encoding))
383 except UnicodeError:
383 except UnicodeError:
384 return s.lower() # we don't know how to fold this except in ASCII
384 return s.lower() # we don't know how to fold this except in ASCII
385 except LookupError as k:
385 except LookupError as k:
386 raise error.Abort(k, hint="please check your locale settings")
386 raise error.Abort(k, hint="please check your locale settings")
387
387
388 def upper(s):
388 def upper(s):
389 "best-effort encoding-aware case-folding of local string s"
389 "best-effort encoding-aware case-folding of local string s"
390 try:
390 try:
391 return asciiupper(s)
391 return asciiupper(s)
392 except UnicodeDecodeError:
392 except UnicodeDecodeError:
393 return upperfallback(s)
393 return upperfallback(s)
394
394
395 def upperfallback(s):
395 def upperfallback(s):
396 try:
396 try:
397 if isinstance(s, localstr):
397 if isinstance(s, localstr):
398 u = s._utf8.decode("utf-8")
398 u = s._utf8.decode("utf-8")
399 else:
399 else:
400 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
400 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
401
401
402 uu = u.upper()
402 uu = u.upper()
403 if u == uu:
403 if u == uu:
404 return s # preserve localstring
404 return s # preserve localstring
405 return uu.encode(_sysstr(encoding))
405 return uu.encode(_sysstr(encoding))
406 except UnicodeError:
406 except UnicodeError:
407 return s.upper() # we don't know how to fold this except in ASCII
407 return s.upper() # we don't know how to fold this except in ASCII
408 except LookupError as k:
408 except LookupError as k:
409 raise error.Abort(k, hint="please check your locale settings")
409 raise error.Abort(k, hint="please check your locale settings")
410
410
411 class normcasespecs(object):
411 class normcasespecs(object):
412 '''what a platform's normcase does to ASCII strings
412 '''what a platform's normcase does to ASCII strings
413
413
414 This is specified per platform, and should be consistent with what normcase
414 This is specified per platform, and should be consistent with what normcase
415 on that platform actually does.
415 on that platform actually does.
416
416
417 lower: normcase lowercases ASCII strings
417 lower: normcase lowercases ASCII strings
418 upper: normcase uppercases ASCII strings
418 upper: normcase uppercases ASCII strings
419 other: the fallback function should always be called
419 other: the fallback function should always be called
420
420
421 This should be kept in sync with normcase_spec in util.h.'''
421 This should be kept in sync with normcase_spec in util.h.'''
422 lower = -1
422 lower = -1
423 upper = 1
423 upper = 1
424 other = 0
424 other = 0
425
425
426 def jsonescape(s, paranoid=False):
426 def jsonescape(s, paranoid=False):
427 '''returns a string suitable for JSON
427 '''returns a string suitable for JSON
428
428
429 JSON is problematic for us because it doesn't support non-Unicode
429 JSON is problematic for us because it doesn't support non-Unicode
430 bytes. To deal with this, we take the following approach:
430 bytes. To deal with this, we take the following approach:
431
431
432 - localstr/safelocalstr objects are converted back to UTF-8
432 - localstr/safelocalstr objects are converted back to UTF-8
433 - valid UTF-8/ASCII strings are passed as-is
433 - valid UTF-8/ASCII strings are passed as-is
434 - other strings are converted to UTF-8b surrogate encoding
434 - other strings are converted to UTF-8b surrogate encoding
435 - apply JSON-specified string escaping
435 - apply JSON-specified string escaping
436
436
437 (escapes are doubled in these tests)
437 (escapes are doubled in these tests)
438
438
439 >>> jsonescape(b'this is a test')
439 >>> jsonescape(b'this is a test')
440 'this is a test'
440 'this is a test'
441 >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
441 >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
442 'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
442 'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
443 >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
443 >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
444 'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
444 'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
445 >>> jsonescape(b'a weird byte: \\xdd')
445 >>> jsonescape(b'a weird byte: \\xdd')
446 'a weird byte: \\xed\\xb3\\x9d'
446 'a weird byte: \\xed\\xb3\\x9d'
447 >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
447 >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
448 'utf-8: caf\\xc3\\xa9'
448 'utf-8: caf\\xc3\\xa9'
449 >>> jsonescape(b'')
449 >>> jsonescape(b'')
450 ''
450 ''
451
451
452 If paranoid, non-ascii and common troublesome characters are also escaped.
452 If paranoid, non-ascii and common troublesome characters are also escaped.
453 This is suitable for web output.
453 This is suitable for web output.
454
454
455 >>> s = b'escape characters: \\0 \\x0b \\x7f'
455 >>> s = b'escape characters: \\0 \\x0b \\x7f'
456 >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
456 >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
457 >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
457 >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
458 >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
458 >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
459 >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
459 >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
460 'escape boundary: ~ \\\\u007f \\\\u0080'
460 'escape boundary: ~ \\\\u007f \\\\u0080'
461 >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
461 >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
462 'a weird byte: \\\\udcdd'
462 'a weird byte: \\\\udcdd'
463 >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
463 >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
464 'utf-8: caf\\\\u00e9'
464 'utf-8: caf\\\\u00e9'
465 >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
465 >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
466 'non-BMP: \\\\ud834\\\\udd1e'
466 'non-BMP: \\\\ud834\\\\udd1e'
467 >>> jsonescape(b'<foo@example.org>', paranoid=True)
467 >>> jsonescape(b'<foo@example.org>', paranoid=True)
468 '\\\\u003cfoo@example.org\\\\u003e'
468 '\\\\u003cfoo@example.org\\\\u003e'
469 '''
469 '''
470
470
471 u8chars = toutf8b(s)
471 u8chars = toutf8b(s)
472 try:
472 try:
473 return _jsonescapeu8fast(u8chars, paranoid)
473 return _jsonescapeu8fast(u8chars, paranoid)
474 except ValueError:
474 except ValueError:
475 pass
475 pass
476 return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
476 return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
477
477
# We need to decode/encode U+DCxx codes transparently since invalid UTF-8
# bytes are mapped to that range.
#
# On Python 3 we use the 'surrogatepass' error handler so that lone
# surrogates (the U+DCxx escapes produced by toutf8b below) survive
# encode/decode round-trips; Python 2's codecs pass them through already,
# so plain 'strict' suffices there.
#
# NOTE(review): the r'' prefixes presumably keep these literals as native
# str under Mercurial's py3 source transformer (which rewrites plain
# string literals to bytes) — confirm before changing them.
if pycompat.ispy3:
    _utf8strict = r'surrogatepass'
else:
    _utf8strict = r'strict'

# UTF-8 sequence length indexed by the high nibble of the lead byte:
# 0x0-0x7 -> 0 (plain ASCII, returned directly by getutf8char),
# 0x8-0xB -> 1 (continuation bytes; invalid as a lead byte, the 1-byte
#               decode attempt in getutf8char then raises),
# 0xC-0xD -> 2, 0xE -> 3, 0xF -> 4 (multi-byte lead bytes).
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
486
486
487 def getutf8char(s, pos):
487 def getutf8char(s, pos):
488 '''get the next full utf-8 character in the given string, starting at pos
488 '''get the next full utf-8 character in the given string, starting at pos
489
489
490 Raises a UnicodeError if the given location does not start a valid
490 Raises a UnicodeError if the given location does not start a valid
491 utf-8 character.
491 utf-8 character.
492 '''
492 '''
493
493
494 # find how many bytes to attempt decoding from first nibble
494 # find how many bytes to attempt decoding from first nibble
495 l = _utf8len[ord(s[pos:pos + 1]) >> 4]
495 l = _utf8len[ord(s[pos:pos + 1]) >> 4]
496 if not l: # ascii
496 if not l: # ascii
497 return s[pos:pos + 1]
497 return s[pos:pos + 1]
498
498
499 c = s[pos:pos + l]
499 c = s[pos:pos + l]
500 # validate with attempted decode
500 # validate with attempted decode
501 c.decode("utf-8", _utf8strict)
501 c.decode("utf-8", _utf8strict)
502 return c
502 return c
503
503
def toutf8b(s):
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    if isinstance(s, localstr):
        # assume that the original UTF-8 sequence would never contain
        # invalid characters in U+DCxx range
        return s._utf8
    elif isinstance(s, safelocalstr):
        # already verified that s is non-lossy in legacy encoding, which
        # shouldn't contain characters in U+DCxx range
        return fromlocal(s)
    elif isasciistr(s):
        # pure ASCII is trivially valid UTF-8b
        return s
    # 0xED is the lead byte of every UTF-8-encoded U+Dxxx code point; if
    # it is absent there can be no pre-existing surrogate escapes, so a
    # plain validity check is enough to accept the string as-is
    if "\xed" not in s:
        try:
            s.decode('utf-8', _utf8strict)
            return s
        except UnicodeDecodeError:
            pass

    # slow path: walk the string character by character, escaping every
    # invalid byte (and re-escaping literal U+DCxx sequences) as U+DCxx
    s = pycompat.bytestr(s)
    r = ""
    pos = 0
    l = len(s)
    while pos < l:
        try:
            c = getutf8char(s, pos)
            if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
                # have to re-escape existing U+DCxx characters
                c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict)
                pos += 1
            else:
                pos += len(c)
        except UnicodeDecodeError:
            # invalid byte: map byte value 0xXX to surrogate U+DCXX
            c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict)
            pos += 1
        r += c
    return r
570
570
def fromutf8b(s):
    '''Given a UTF-8b string, return a local, possibly-binary string.

    return the original binary string. This
    is a round-trip process for strings like filenames, but metadata
    that was passed through tolocal will remain in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = b"\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip(b"\\xc2\\xc2\\x80")
    True
    >>> roundtrip(b"\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
    True
    '''

    if isasciistr(s):
        # pure ASCII cannot contain surrogate escapes
        return s
    # fast path - look for uDxxx prefixes in s
    if "\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    s = pycompat.bytestr(s)
    r = ""
    pos = 0
    l = len(s)
    while pos < l:
        c = getutf8char(s, pos)
        pos += len(c)
        # unescape U+DCxx characters
        if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
            # surrogate U+DCXX encodes original byte 0xXX; the & 0xff
            # recovers that byte value from the decoded code point
            c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xff)
        r += c
    return r
General Comments 0
You need to be logged in to leave comments. Login now