##// END OF EJS Templates
encoding: define local identify functions with explicit type comments...
Augie Fackler -
r43770:5f2a8dab default
parent child Browse files
Show More
@@ -1,653 +1,660
1 # encoding.py - character transcoding support for Mercurial
1 # encoding.py - character transcoding support for Mercurial
2 #
2 #
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8 from __future__ import absolute_import, print_function
8 from __future__ import absolute_import, print_function
9
9
10 import locale
10 import locale
11 import os
11 import os
12 import unicodedata
12 import unicodedata
13
13
14 from .pycompat import getattr
14 from .pycompat import getattr
15 from . import (
15 from . import (
16 error,
16 error,
17 policy,
17 policy,
18 pycompat,
18 pycompat,
19 )
19 )
20
20
21 from .pure import charencode as charencodepure
21 from .pure import charencode as charencodepure
22
22
charencode = policy.importmod(r'charencode')

isasciistr = charencode.isasciistr
asciilower = charencode.asciilower
asciiupper = charencode.asciiupper
_jsonescapeu8fast = charencode.jsonescapeu8fast

_sysstr = pycompat.sysstr

if pycompat.ispy3:
    unichr = chr

# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
_ignore = [
    unichr(int(x, 16)).encode("utf-8")
    for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e "
    b"206a 206b 206c 206d 206e 206f feff".split()
]
# verify the next function will work
assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)
45
45
46
46
def hfsignoreclean(s):
    """Remove codepoints ignored by HFS+ from s.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # All ignorable codepoints encode to UTF-8 starting with 0xe2 or 0xef,
    # so a quick containment check avoids the replace loop for most input.
    if b"\xe2" in s or b"\xef" in s:
        for ignored in _ignore:
            s = s.replace(ignored, b'')
    return s
59
59
60
60
# encoding.environ is provided read-only, which may not be used to modify
# the process environment
_nativeenviron = not pycompat.ispy3 or os.supports_bytes_environ
if not pycompat.ispy3:
    environ = os.environ  # re-exports
elif _nativeenviron:
    environ = os.environb  # re-exports
else:
    # preferred encoding isn't known yet; use utf-8 to avoid unicode error
    # and recreate it once encoding is settled
    environ = dict(
        (k.encode(r'utf-8'), v.encode(r'utf-8'))
        for k, v in os.environ.items()  # re-exports
    )

_encodingrewrites = {
    b'646': b'ascii',
    b'ANSI_X3.4-1968': b'ascii',
}
# cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
# No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
# https://bugs.python.org/issue13216
if pycompat.iswindows and not pycompat.ispy3:
    _encodingrewrites[b'cp65001'] = b'utf-8'

try:
    encoding = environ.get(b"HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding().encode('ascii') or b'ascii'
    encoding = _encodingrewrites.get(encoding, encoding)
except locale.Error:
    encoding = b'ascii'
encodingmode = environ.get(b"HGENCODINGMODE", b"strict")
fallbackencoding = b'ISO-8859-1'
95
95
96
96
class localstr(bytes):
    '''This class allows strings that are unmodified to be
    round-tripped to the local encoding and back'''

    def __new__(cls, u, l):
        # The bytes value of the instance is the local-encoding form;
        # the original UTF-8 form is cached on the instance.
        obj = bytes.__new__(cls, l)
        obj._utf8 = u
        return obj

    def __hash__(self):
        return hash(self._utf8)  # avoid collisions in local string space
108
108
109
109
class safelocalstr(bytes):
    """Tagged string denoting it was previously an internal UTF-8 string,
    and can be converted back to UTF-8 losslessly

    >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
    >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
    >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
    >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
    """
119
119
120
120
def tolocal(s):
    """Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.
    """
    # Pure-ASCII input is identical in every encoding we care about.
    if isasciistr(s):
        return s

    try:
        try:
            # make sure string is actually stored in UTF-8
            u = s.decode('UTF-8')
            if encoding == b'UTF-8':
                # fast path
                return s
            r = u.encode(_sysstr(encoding), r"replace")
            if u == r.decode(_sysstr(encoding)):
                # r is a safe, non-lossy encoding of s
                return safelocalstr(r)
            return localstr(s, r)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                u = s.decode(_sysstr(fallbackencoding))
                r = u.encode(_sysstr(encoding), r"replace")
                if u == r.decode(_sysstr(encoding)):
                    # r is a safe, non-lossy encoding of s
                    return safelocalstr(r)
                return localstr(u.encode('UTF-8'), r)
            except UnicodeDecodeError:
                u = s.decode("utf-8", "replace")  # last ditch
                # can't round-trip
                return u.encode(_sysstr(encoding), r"replace")
    except LookupError as k:
        raise error.Abort(k, hint=b"please check your locale settings")
185
185
186
186
def fromlocal(s):
    """Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.
    """
    # can we do a lossless round-trip?
    if isinstance(s, localstr):
        return s._utf8
    if isasciistr(s):
        return s

    try:
        u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
        return u.encode("utf-8")
    except UnicodeDecodeError as inst:
        # show a window of context around the offending byte
        sub = s[max(0, inst.start - 10) : inst.start + 10]
        raise error.Abort(
            b"decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
        )
    except LookupError as k:
        raise error.Abort(k, hint=b"please check your locale settings")
214
214
215
215
def unitolocal(u):
    """Convert a unicode string to a byte string of local encoding"""
    return tolocal(u.encode('utf-8'))
219
219
220
220
def unifromlocal(s):
    """Convert a byte string of local encoding to a unicode string"""
    return fromlocal(s).decode('utf-8')
224
224
225
225
def unimethod(bytesfunc):
    """Create a proxy method that forwards __unicode__() and __str__() of
    Python 3 to __bytes__()"""

    def unifunc(obj):
        # call the byte-returning method, then decode to a native unicode str
        return unifromlocal(bytesfunc(obj))

    return unifunc
234
234
235
235
# converter functions between native str and byte string. use these if the
# character encoding is not aware (e.g. exception message) or is known to
# be locale dependent (e.g. date formatting.)
if pycompat.ispy3:
    strtolocal = unitolocal
    strfromlocal = unifromlocal
    strmethod = unimethod
else:
    # on Python 2 the native str already is bytes; spell the identity
    # conversions out as defs so they can carry explicit type comments

    def strtolocal(s):
        # type: (str) -> bytes
        return s

    def strfromlocal(s):
        # type: (bytes) -> str
        return s

    strmethod = pycompat.identity
247
254
if not _nativeenviron:
    # now encoding and helper functions are available, recreate the environ
    # dict to be exported to other modules
    environ = dict(
        (tolocal(k.encode(r'utf-8')), tolocal(v.encode(r'utf-8')))
        for k, v in os.environ.items()  # re-exports
    )
255
262
if pycompat.ispy3:
    # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
    # returns bytes.
    if pycompat.iswindows:
        # Python 3 on Windows issues a DeprecationWarning about using the bytes
        # API when os.getcwdb() is called.
        getcwd = lambda: strtolocal(os.getcwd())  # re-exports
    else:
        getcwd = os.getcwdb  # re-exports
else:
    getcwd = os.getcwd  # re-exports
267
274
# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
_wide = _sysstr(
    environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
    and b"WFA"
    or b"WF"
)
274
281
275
282
def colwidth(s):
    b"Find the column width of a string for display in the local encoding"
    # decode with 'replace' so undecodable bytes still count as one column
    return ucolwidth(s.decode(_sysstr(encoding), r'replace'))
279
286
280
287
def ucolwidth(d):
    b"Find the column width of a Unicode string for display"
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is not None:
        # wide/fullwidth (and optionally ambiguous) characters take 2 columns
        return sum([eaw(c) in _wide and 2 or 1 for c in d])
    return len(d)
287
294
288
295
def getcols(s, start, c):
    '''Use colwidth to find a c-column substring of s starting at byte
    index start'''
    # grow the byte slice until it renders at exactly c display columns
    for end in pycompat.xrange(start + c, len(s)):
        candidate = s[start:end]
        if colwidth(candidate) == c:
            return candidate
296
303
297
304
def trim(s, width, ellipsis=b'', leftside=False):
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    's' is measured in display columns (via ucolwidth) when it decodes
    cleanly in the local encoding; otherwise it falls back to plain byte
    slicing. When even the ellipsis does not fit, a prefix of the
    ellipsis of exactly 'width' bytes is returned.
    """
    try:
        u = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        # not decodable in the local encoding: trim byte-wise
        if len(s) <= width:  # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0:  # no enough room even for ellipsis
            return ellipsis[: width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width:  # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0:  # no enough room even for ellipsis
        return ellipsis[: width + len(ellipsis)]

    if leftside:
        uslice = lambda i: u[i:]
        concat = lambda t: ellipsis + t
    else:
        uslice = lambda i: u[:-i]
        concat = lambda t: t + ellipsis
    # drop characters one at a time from the trimmed side until it fits
    for dropped in pycompat.xrange(1, len(u)):
        usub = uslice(dropped)
        if ucolwidth(usub) <= width:
            return concat(usub.encode(_sysstr(encoding)))
    return ellipsis  # no enough room for multi-column characters
393
400
394
401
def lower(s):
    b"best-effort encoding-aware case-folding of local string s"
    try:
        return asciilower(s)
    except UnicodeDecodeError:
        pass
    try:
        if isinstance(s, localstr):
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        lu = u.lower()
        if u == lu:
            return s  # preserve localstring
        return lu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.lower()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint=b"please check your locale settings")
415
422
416
423
def upper(s):
    b"best-effort encoding-aware case-folding of local string s"
    # fast path: pure-ASCII uppercasing in C; fall back for non-ASCII
    try:
        return asciiupper(s)
    except UnicodeDecodeError:
        return upperfallback(s)
423
430
424
431
def upperfallback(s):
    # encoding-aware uppercasing for strings asciiupper() could not handle
    try:
        if isinstance(s, localstr):
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        uu = u.upper()
        if u == uu:
            return s  # preserve localstring
        return uu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.upper()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint=b"please check your locale settings")
440
447
441
448
class normcasespecs(object):
    '''what a platform's normcase does to ASCII strings

    This is specified per platform, and should be consistent with what normcase
    on that platform actually does.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h.'''

    lower = -1
    upper = 1
    other = 0
457
464
458
465
def jsonescape(s, paranoid=False):
    '''returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr/safelocalstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    If paranoid, non-ascii and common troublesome characters are also
    escaped. This is suitable for web output.
    '''
    u8chars = toutf8b(s)
    try:
        # C-accelerated escaper; raises ValueError on input it cannot handle
        return _jsonescapeu8fast(u8chars, paranoid)
    except ValueError:
        pass
    return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
510
517
511
518
# Invalid UTF-8 bytes get mapped into the U+DC00-U+DCFF surrogate range,
# so on Python 3 the decode/encode round-trips below must let surrogate
# code points pass through untouched.
_utf8strict = r'surrogatepass' if pycompat.ispy3 else r'strict'

# Total byte length of a UTF-8 sequence, indexed by the high nibble of
# its lead byte.  0 marks a plain ASCII byte (nibbles 0-7); nibbles 8-B
# are continuation bytes and map to 1 so the validating one-byte decode
# in getutf8char fails fast.
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
520
527
521
528
def getutf8char(s, pos):
    '''return the complete utf-8 character starting at ``pos`` in ``s``

    Raises a UnicodeError if the bytes at the given location do not
    begin a well-formed utf-8 sequence.
    '''

    # the high nibble of the lead byte tells us how many bytes to take
    width = _utf8len[ord(s[pos : pos + 1]) >> 4]
    if width == 0:
        # plain ASCII byte; nothing further to validate
        return s[pos : pos + 1]

    char = s[pos : pos + width]
    # a throwaway decode validates the sequence (raises on bad input)
    char.decode("utf-8", _utf8strict)
    return char
538
545
539
546
def toutf8b(s):
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can have
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    if isinstance(s, localstr):
        # assume that the original UTF-8 sequence would never contain
        # invalid characters in U+DCxx range
        return s._utf8
    elif isinstance(s, safelocalstr):
        # already verified that s is non-lossy in legacy encoding, which
        # shouldn't contain characters in U+DCxx range
        return fromlocal(s)
    elif isasciistr(s):
        return s
    # \xed is the lead byte of every 3-byte U+DCxx (and other surrogate)
    # encoding; if it is absent and the string decodes cleanly, s is
    # already valid UTF-8 and can pass through unchanged
    if b"\xed" not in s:
        try:
            s.decode('utf-8', _utf8strict)
            return s
        except UnicodeDecodeError:
            pass

    # slow path: walk the string one utf-8 character at a time, escaping
    # invalid bytes into the U+DCxx surrogate range
    # NOTE(review): relies on pycompat.bytestr indexing yielding values
    # that ord() accepts on both py2 and py3 — confirm against pycompat
    s = pycompat.bytestr(s)
    r = b""
    pos = 0
    l = len(s)
    while pos < l:
        try:
            c = getutf8char(s, pos)
            if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
                # have to re-escape existing U+DCxx characters
                c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
                pos += 1
            else:
                pos += len(c)
        except UnicodeDecodeError:
            # not valid utf-8 at this position: smuggle the raw byte
            # through as U+DC00 + byte value
            c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
            pos += 1
        r += c
    return r
606
613
607
614
def fromutf8b(s):
    '''Given a UTF-8b string, return a local, possibly-binary string.

    This undoes toutf8b(): bytes that were smuggled into the
    U+DC00-U+DCFF surrogate range come back out as the original raw
    bytes. That makes this a round-trip for strings like filenames,
    while metadata that was passed through tolocal stays UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = b"\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip(b"\\xc2\\xc2\\x80")
    True
    >>> roundtrip(b"\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
    True
    '''

    if isasciistr(s):
        return s
    # fast path: without an \xed lead byte there can be no U+DCxx
    # escapes to undo
    if b"\xed" not in s:
        return s

    # Walking the bytes with getutf8char avoids going through the
    # unicode type: some Python builds use UTF-16 internally
    # (issue5031), which would escape non-BMP code points.
    s = pycompat.bytestr(s)
    parts = []
    pos = 0
    end = len(s)
    while pos < end:
        c = getutf8char(s, pos)
        pos += len(c)
        if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
            # U+DCxx escape: recover the original byte from the low bits
            c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF)
        parts.append(c)
    return b"".join(parts)
General Comments 0
You need to be logged in to leave comments. Login now