##// END OF EJS Templates
typing: suppress error of py2 encoding.strtolocal() and .strfromlocal()...
Yuya Nishihara -
r44078:7f51bc36 default
parent child Browse files
Show More
@@ -1,696 +1,696 b''
1 # encoding.py - character transcoding support for Mercurial
1 # encoding.py - character transcoding support for Mercurial
2 #
2 #
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8 from __future__ import absolute_import, print_function
8 from __future__ import absolute_import, print_function
9
9
10 import locale
10 import locale
11 import os
11 import os
12 import unicodedata
12 import unicodedata
13
13
14 from .pycompat import getattr
14 from .pycompat import getattr
15 from . import (
15 from . import (
16 error,
16 error,
17 policy,
17 policy,
18 pycompat,
18 pycompat,
19 )
19 )
20
20
21 from .pure import charencode as charencodepure
21 from .pure import charencode as charencodepure
22
22
# Import typing names only for static analysis: `not globals()` is always
# False at runtime, so normal execution (including py2 without a typing
# module) never executes this block, while pytype still sees the imports.
if not globals():  # hide this from non-pytype users
    from typing import (
        Any,
        Callable,
        List,
        Text,
        Type,
        TypeVar,
        Union,
    )

    # keep pyflakes happy
    for t in (Any, Callable, List, Text, Type, Union):
        assert t

    # TypeVar so localstr.__new__ is typed as returning the receiver
    # (sub)class rather than plain localstr.
    _Tlocalstr = TypeVar('_Tlocalstr', bound='localstr')
39
39
# Fast character-encoding primitives; policy.importmod picks the C
# implementation when available, else the pure-Python fallback.
charencode = policy.importmod('charencode')

isasciistr = charencode.isasciistr
asciilower = charencode.asciilower
asciiupper = charencode.asciiupper
_jsonescapeu8fast = charencode.jsonescapeu8fast

# shorthand: convert bytes to this interpreter's native str type
_sysstr = pycompat.sysstr
48
48
# Python 3 has no unichr(); chr() already produces a unicode character.
if pycompat.ispy3:
    unichr = chr

# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
_ignore = [
    unichr(int(x, 16)).encode("utf-8")
    for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e "
    b"206a 206b 206c 206d 206e 206f feff".split()
]
# verify the next function will work
assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)
62
62
63
63
def hfsignoreclean(s):
    # type: (bytes) -> bytes
    """Remove codepoints ignored by HFS+ from s.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # Every ignored codepoint's UTF-8 form starts with 0xe2 or 0xef (this
    # is asserted at module load), so a cheap two-byte scan lets the vast
    # majority of strings pass through untouched.
    if b"\xe2" not in s and b"\xef" not in s:
        return s
    for junk in _ignore:
        s = s.replace(junk, b'')
    return s
77
77
78
78
# encoding.environ is provided read-only, which may not be used to modify
# the process environment
_nativeenviron = not pycompat.ispy3 or os.supports_bytes_environ
if not pycompat.ispy3:
    # Python 2: os.environ is already bytes
    environ = os.environ  # re-exports
elif _nativeenviron:
    # Python 3 with bytes-environ support (POSIX)
    environ = os.environb  # re-exports
else:
    # preferred encoding isn't known yet; use utf-8 to avoid unicode error
    # and recreate it once encoding is settled
    environ = dict(
        (k.encode('utf-8'), v.encode('utf-8'))
        for k, v in os.environ.items()  # re-exports
    )
93
93
# locale names that must be rewritten to a codec name Python understands
_encodingrewrites = {
    b'646': b'ascii',
    b'ANSI_X3.4-1968': b'ascii',
}
# cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
# No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
# https://bugs.python.org/issue13216
if pycompat.iswindows and not pycompat.ispy3:
    _encodingrewrites[b'cp65001'] = b'utf-8'

try:
    # HGENCODING overrides the locale-derived encoding entirely; the
    # rewrite table is only consulted for locale-derived names.
    encoding = environ.get(b"HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding().encode('ascii') or b'ascii'
        encoding = _encodingrewrites.get(encoding, encoding)
except locale.Error:
    encoding = b'ascii'
# how decode errors are handled ('strict', 'replace', 'ignore')
encodingmode = environ.get(b"HGENCODINGMODE", b"strict")
# encoding tried for pre-locale-support repository data (see tolocal())
fallbackencoding = b'ISO-8859-1'
113
113
114
114
class localstr(bytes):
    """A bytes subclass that remembers the UTF-8 form it was built from.

    An instance behaves as the local-encoding bytes ``l`` for all normal
    purposes, while keeping the original UTF-8 bytes ``u`` attached so a
    later conversion back to UTF-8 is lossless even though the local
    encoding lost information.
    """

    def __new__(cls, u, l):
        # type: (Type[_Tlocalstr], Text, bytes) -> _Tlocalstr
        obj = bytes.__new__(cls, l)
        obj._utf8 = u  # cached UTF-8 form consumed by fromlocal()
        return obj

    def __hash__(self):
        return hash(self._utf8)  # avoid collisions in local string space
127
127
128
128
class safelocalstr(bytes):
    """Tagged string denoting it was previously an internal UTF-8 string,
    and can be converted back to UTF-8 losslessly

    >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
    >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
    >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
    >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
    """

    # Intentionally empty: the subclass is only a marker. Unlike localstr,
    # it keeps bytes' own __hash__/__eq__, so it interoperates with plain
    # bytes in dicts and sets (demonstrated by the doctests above).
138
138
139
139
def tolocal(s):
    # type: (bytes) -> bytes
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    >>> u = b'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = b'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> b'foo: ?' in d
    False
    >>> l1 = b'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    # ASCII is a subset of every supported local encoding: nothing to do
    if isasciistr(s):
        return s

    try:
        try:
            # make sure string is actually stored in UTF-8
            u = s.decode('UTF-8')
            if encoding == b'UTF-8':
                # fast path
                return s
            r = u.encode(_sysstr(encoding), "replace")
            if u == r.decode(_sysstr(encoding)):
                # r is a safe, non-lossy encoding of s
                return safelocalstr(r)
            # lossy: tag the result with the original UTF-8 bytes so
            # fromlocal() can restore them exactly
            return localstr(s, r)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                u = s.decode(_sysstr(fallbackencoding))
                r = u.encode(_sysstr(encoding), "replace")
                if u == r.decode(_sysstr(encoding)):
                    # r is a safe, non-lossy encoding of s
                    return safelocalstr(r)
                return localstr(u.encode('UTF-8'), r)
            except UnicodeDecodeError:
                u = s.decode("utf-8", "replace")  # last ditch
                # can't round-trip
                return u.encode(_sysstr(encoding), "replace")
    except LookupError as k:
        # the configured encoding name is unknown to Python
        raise error.Abort(k, hint=b"please check your locale settings")
205
205
206
206
def fromlocal(s):
    # type: (bytes) -> bytes
    """
    Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.
    """

    # can we do a lossless round-trip?
    if isinstance(s, localstr):
        # localstr carries the original UTF-8 bytes (set by tolocal())
        return s._utf8
    if isasciistr(s):
        return s

    try:
        u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
        return u.encode("utf-8")
    except UnicodeDecodeError as inst:
        # show a window of context around the offending byte
        sub = s[max(0, inst.start - 10) : inst.start + 10]
        raise error.Abort(
            b"decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
        )
    except LookupError as k:
        # the configured encoding name is unknown to Python
        raise error.Abort(k, hint=b"please check your locale settings")
235
235
236
236
def unitolocal(u):
    # type: (Text) -> bytes
    """Convert a unicode string to a byte string of local encoding"""
    # go through the internal UTF-8 representation so tolocal() can apply
    # its round-trip caching
    utf8bytes = u.encode('utf-8')
    return tolocal(utf8bytes)
241
241
242
242
def unifromlocal(s):
    # type: (bytes) -> Text
    """Convert a byte string of local encoding to a unicode string"""
    # fromlocal() yields internal UTF-8, which always decodes cleanly
    utf8bytes = fromlocal(s)
    return utf8bytes.decode('utf-8')
247
247
248
248
def unimethod(bytesfunc):
    # type: (Callable[[Any], bytes]) -> Callable[[Any], Text]
    """Create a proxy method that forwards __unicode__() and __str__() of
    Python 3 to __bytes__()"""

    def unifunc(obj):
        raw = bytesfunc(obj)
        return unifromlocal(raw)

    return unifunc
258
258
259
259
# converter functions between native str and byte string. use these if the
# character encoding is not aware (e.g. exception message) or is known to
# be locale dependent (e.g. date formatting.)
if pycompat.ispy3:
    strtolocal = unitolocal
    strfromlocal = unifromlocal
    strmethod = unimethod
else:
    # On Python 2 native str is already bytes, so both conversions are
    # identity functions. The annotations describe the Python 3 signatures,
    # hence the pytype suppressions on the py2 bodies.

    def strtolocal(s):
        # type: (str) -> bytes
        return s  # pytype: disable=bad-return-type

    def strfromlocal(s):
        # type: (bytes) -> str
        return s  # pytype: disable=bad-return-type

    strmethod = pycompat.identity
278
278
if not _nativeenviron:
    # now encoding and helper functions are available, recreate the environ
    # dict to be exported to other modules
    environ = dict(
        (tolocal(k.encode('utf-8')), tolocal(v.encode('utf-8')))
        for k, v in os.environ.items()  # re-exports
    )
286
286
if pycompat.ispy3:
    # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
    # returns bytes.
    if pycompat.iswindows:
        # Python 3 on Windows issues a DeprecationWarning about using the bytes
        # API when os.getcwdb() is called.
        getcwd = lambda: strtolocal(os.getcwd())  # re-exports
    else:
        getcwd = os.getcwdb  # re-exports
else:
    getcwd = os.getcwd  # re-exports
298
298
299 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
299 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
300 _wide = _sysstr(
300 _wide = _sysstr(
301 environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
301 environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
302 and b"WFA"
302 and b"WFA"
303 or b"WF"
303 or b"WF"
304 )
304 )
305
305
306
306
def colwidth(s):
    # type: (bytes) -> int
    """Find the column width of a string for display in the local encoding"""
    # Fix: the docstring was a b"..." bytes literal, which on Python 3 is a
    # plain constant statement and never populates __doc__; a str docstring
    # works on both Python 2 and 3.
    return ucolwidth(s.decode(_sysstr(encoding), 'replace'))
311
311
312
312
def ucolwidth(d):
    # type: (Text) -> int
    """Find the column width of a Unicode string for display"""
    # Fix: the docstring was a b"..." bytes literal, which on Python 3 never
    # populates __doc__; a str docstring works on both Python 2 and 3.
    # east_asian_width may be missing on narrow/old unicodedata builds.
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is not None:
        # characters in the _wide categories occupy two display columns;
        # generator form avoids materializing a throwaway list
        return sum(2 if eaw(c) in _wide else 1 for c in d)
    return len(d)
320
320
321
321
def getcols(s, start, c):
    # type: (bytes, int, int) -> bytes
    '''Use colwidth to find a c-column substring of s starting at byte
    index start'''
    # grow the candidate one byte at a time (starting from the smallest
    # span that could possibly span c columns) until it measures exactly c
    for end in pycompat.xrange(start + c, len(s)):
        candidate = s[start:end]
        if colwidth(candidate) == c:
            return candidate
    raise ValueError('substring not found')
331
331
332
332
def trim(s, width, ellipsis=b'', leftside=False):
    # type: (bytes, int, bytes, bool) -> bytes
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    >>> from .node import bin
    >>> def bprint(s):
    ...     print(pycompat.sysstr(s))
    >>> ellipsis = b'+++'
    >>> from . import encoding
    >>> encoding.encoding = b'utf-8'
    >>> t = b'1234567890'
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    12345+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++67890
    >>> bprint(trim(t, 8))
    12345678
    >>> bprint(trim(t, 8, leftside=True))
    34567890
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(pycompat.sysstr(encoding.encoding))
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 5))
    \xe3\x81\x82\xe3\x81\x84
    >>> bprint(trim(t, 5, leftside=True))
    \xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 4, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
    +++
    >>> t = bin(b'112233445566778899aa') # invalid byte sequence
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8))
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> bprint(trim(t, 8, leftside=True))
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    """
    try:
        u = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        # undecodable input: fall back to byte-count trimming instead of
        # column-count trimming (see the "invalid byte sequence" doctests)
        if len(s) <= width:  # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0:  # no enough room even for ellipsis
            return ellipsis[: width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width:  # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0:  # no enough room even for ellipsis
        return ellipsis[: width + len(ellipsis)]

    if leftside:
        uslice = lambda i: u[i:]
        concat = lambda s: ellipsis + s
    else:
        uslice = lambda i: u[:-i]
        concat = lambda s: s + ellipsis
    # drop one character at a time from the trimmed side until the rest
    # fits in the available columns
    for i in pycompat.xrange(1, len(u)):
        usub = uslice(i)
        if ucolwidth(usub) <= width:
            return concat(usub.encode(_sysstr(encoding)))
    return ellipsis  # no enough room for multi-column characters
429
429
430
430
def lower(s):
    # type: (bytes) -> bytes
    """best-effort encoding-aware case-folding of local string s"""
    # Fix: the docstring was a b"..." bytes literal, which on Python 3 is a
    # plain constant statement and never populates __doc__; a str docstring
    # works on both Python 2 and 3.
    try:
        # fast path for pure-ASCII input (C implementation when available)
        return asciilower(s)
    except UnicodeDecodeError:
        pass
    try:
        if isinstance(s, localstr):
            # use the cached, known-good UTF-8 form
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        lu = u.lower()
        if u == lu:
            return s  # preserve localstring
        return lu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.lower()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint=b"please check your locale settings")
452
452
453
453
def upper(s):
    # type: (bytes) -> bytes
    """best-effort encoding-aware case-folding of local string s"""
    # Fix: the docstring was a b"..." bytes literal, which on Python 3 is a
    # plain constant statement and never populates __doc__; a str docstring
    # works on both Python 2 and 3.
    try:
        # fast path for pure-ASCII input (C implementation when available)
        return asciiupper(s)
    except UnicodeDecodeError:
        return upperfallback(s)
461
461
462
462
def upperfallback(s):
    # type: (Any) -> Any
    # NOTE(review): annotation is loose; the only caller (upper) passes
    # bytes and the body returns bytes — consider (bytes) -> bytes.
    # Encoding-aware uppercasing for non-ASCII local strings.
    try:
        if isinstance(s, localstr):
            # use the cached, known-good UTF-8 form
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        uu = u.upper()
        if u == uu:
            return s  # preserve localstring
        return uu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.upper()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint=b"please check your locale settings")
479
479
480
480
class normcasespecs(object):
    '''what a platform's normcase does to ASCII strings

    This is specified per platform, and should be consistent with what normcase
    on that platform actually does.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h.'''

    lower = -1  # e.g. case-insensitive filesystems that fold to lowercase
    upper = 1  # e.g. platforms whose normcase uppercases
    other = 0  # no simple ASCII rule; always use the fallback function
496
496
497
497
def jsonescape(s, paranoid=False):
    # type: (Any, Any) -> Any
    '''returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr/safelocalstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape(b'this is a test')
    'this is a test'
    >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
    'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
    >>> jsonescape(b'a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape(b'')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> s = b'escape characters: \\0 \\x0b \\x7f'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape(b'<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    '''

    u8chars = toutf8b(s)
    try:
        # fast path: the C implementation raises ValueError when it hits
        # input it cannot handle, in which case we fall back below
        return _jsonescapeu8fast(u8chars, paranoid)
    except ValueError:
        pass
    return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
550
550
551
551
# We need to decode/encode U+DCxx codes transparently since invalid UTF-8
# bytes are mapped to that range.
if pycompat.ispy3:
    _utf8strict = r'surrogatepass'
else:
    _utf8strict = r'strict'

# byte length of a UTF-8 sequence, indexed by the high nibble of its first
# byte: 0x0-0x7 = ASCII (0 means "single ASCII byte"); 0x8-0xB are
# continuation bytes, mapped to 1 so the validating decode in getutf8char
# fails on them; 0xC-0xD = 2-byte, 0xE = 3-byte, 0xF = 4-byte sequences
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
560
560
561
561
def getutf8char(s, pos):
    # type: (bytes, int) -> bytes
    '''get the next full utf-8 character in the given string, starting at pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    '''

    # find how many bytes to attempt decoding from first nibble
    l = _utf8len[ord(s[pos : pos + 1]) >> 4]
    if not l:  # ascii
        return s[pos : pos + 1]

    c = s[pos : pos + l]
    # validate with attempted decode; raises UnicodeDecodeError for
    # truncated or malformed sequences (and for bare continuation bytes,
    # which _utf8len maps to length 1)
    c.decode("utf-8", _utf8strict)
    return c
579
579
580
580
def toutf8b(s):
    # type: (bytes) -> bytes
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can have
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    if isinstance(s, localstr):
        # assume that the original UTF-8 sequence would never contain
        # invalid characters in U+DCxx range
        return s._utf8
    elif isinstance(s, safelocalstr):
        # already verified that s is non-lossy in legacy encoding, which
        # shouldn't contain characters in U+DCxx range
        return fromlocal(s)
    elif isasciistr(s):
        return s
    if b"\xed" not in s:
        # no bytes that could start a U+DCxx surrogate sequence, so a
        # plain decode check suffices for the valid-UTF-8 fast path
        try:
            s.decode('utf-8', _utf8strict)
            return s
        except UnicodeDecodeError:
            pass

    # slow path: walk the string character by character, escaping invalid
    # bytes (and pre-existing U+DCxx characters) into the surrogate range
    s = pycompat.bytestr(s)
    r = b""
    pos = 0
    l = len(s)
    while pos < l:
        try:
            c = getutf8char(s, pos)
            if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
                # have to re-escape existing U+DCxx characters
                c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
                pos += 1
            else:
                pos += len(c)
        except UnicodeDecodeError:
            # invalid byte: map it into the U+DCxx surrogate range
            c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
            pos += 1
        r += c
    return r
648
648
649
649
def fromutf8b(s):
    # type: (bytes) -> bytes
    '''Given a UTF-8b string, return a local, possibly-binary string.

    return the original binary string. This
    is a round-trip process for strings like filenames, but metadata
    that's was passed through tolocal will remain in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = b"\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip(b"\\xc2\\xc2\\x80")
    True
    >>> roundtrip(b"\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
    True
    '''

    if isasciistr(s):
        return s
    # fast path - look for uDxxx prefixes in s
    if b"\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    s = pycompat.bytestr(s)
    r = b""
    pos = 0
    l = len(s)
    while pos < l:
        c = getutf8char(s, pos)
        pos += len(c)
        # unescape U+DCxx characters back to the original single byte
        if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
            c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF)
        r += c
    return r
General Comments 0
You need to be logged in to leave comments. Login now