##// END OF EJS Templates
typing: add pseudo localstr.__init__() to help pytype...
Yuya Nishihara -
r44080:da925257 default
parent child Browse files
Show More
@@ -1,696 +1,705 b''
1 # encoding.py - character transcoding support for Mercurial
1 # encoding.py - character transcoding support for Mercurial
2 #
2 #
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8 from __future__ import absolute_import, print_function
8 from __future__ import absolute_import, print_function
9
9
10 import locale
10 import locale
11 import os
11 import os
12 import unicodedata
12 import unicodedata
13
13
14 from .pycompat import getattr
14 from .pycompat import getattr
15 from . import (
15 from . import (
16 error,
16 error,
17 policy,
17 policy,
18 pycompat,
18 pycompat,
19 )
19 )
20
20
21 from .pure import charencode as charencodepure
21 from .pure import charencode as charencodepure
22
22
23 _TYPE_CHECKING = False
24
23 if not globals(): # hide this from non-pytype users
25 if not globals(): # hide this from non-pytype users
24 from typing import (
26 from typing import (
25 Any,
27 Any,
26 Callable,
28 Callable,
27 List,
29 List,
30 TYPE_CHECKING as _TYPE_CHECKING,
28 Text,
31 Text,
29 Type,
32 Type,
30 TypeVar,
33 TypeVar,
31 Union,
34 Union,
32 )
35 )
33
36
34 # keep pyflakes happy
37 # keep pyflakes happy
35 for t in (Any, Callable, List, Text, Type, Union):
38 for t in (Any, Callable, List, Text, Type, Union):
36 assert t
39 assert t
37
40
38 _Tlocalstr = TypeVar('_Tlocalstr', bound='localstr')
41 _Tlocalstr = TypeVar('_Tlocalstr', bound='localstr')
39
42
40 charencode = policy.importmod('charencode')
43 charencode = policy.importmod('charencode')
41
44
42 isasciistr = charencode.isasciistr
45 isasciistr = charencode.isasciistr
43 asciilower = charencode.asciilower
46 asciilower = charencode.asciilower
44 asciiupper = charencode.asciiupper
47 asciiupper = charencode.asciiupper
45 _jsonescapeu8fast = charencode.jsonescapeu8fast
48 _jsonescapeu8fast = charencode.jsonescapeu8fast
46
49
47 _sysstr = pycompat.sysstr
50 _sysstr = pycompat.sysstr
48
51
49 if pycompat.ispy3:
52 if pycompat.ispy3:
50 unichr = chr
53 unichr = chr
51
54
52 # These unicode characters are ignored by HFS+ (Apple Technote 1150,
55 # These unicode characters are ignored by HFS+ (Apple Technote 1150,
53 # "Unicode Subtleties"), so we need to ignore them in some places for
56 # "Unicode Subtleties"), so we need to ignore them in some places for
54 # sanity.
57 # sanity.
55 _ignore = [
58 _ignore = [
56 unichr(int(x, 16)).encode("utf-8")
59 unichr(int(x, 16)).encode("utf-8")
57 for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e "
60 for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e "
58 b"206a 206b 206c 206d 206e 206f feff".split()
61 b"206a 206b 206c 206d 206e 206f feff".split()
59 ]
62 ]
60 # verify the next function will work
63 # verify the next function will work
61 assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)
64 assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)
62
65
63
66
64 def hfsignoreclean(s):
67 def hfsignoreclean(s):
65 # type: (bytes) -> bytes
68 # type: (bytes) -> bytes
66 """Remove codepoints ignored by HFS+ from s.
69 """Remove codepoints ignored by HFS+ from s.
67
70
68 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
71 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
69 '.hg'
72 '.hg'
70 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
73 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
71 '.hg'
74 '.hg'
72 """
75 """
73 if b"\xe2" in s or b"\xef" in s:
76 if b"\xe2" in s or b"\xef" in s:
74 for c in _ignore:
77 for c in _ignore:
75 s = s.replace(c, b'')
78 s = s.replace(c, b'')
76 return s
79 return s
77
80
78
81
79 # encoding.environ is provided read-only, which may not be used to modify
82 # encoding.environ is provided read-only, which may not be used to modify
80 # the process environment
83 # the process environment
81 _nativeenviron = not pycompat.ispy3 or os.supports_bytes_environ
84 _nativeenviron = not pycompat.ispy3 or os.supports_bytes_environ
82 if not pycompat.ispy3:
85 if not pycompat.ispy3:
83 environ = os.environ # re-exports
86 environ = os.environ # re-exports
84 elif _nativeenviron:
87 elif _nativeenviron:
85 environ = os.environb # re-exports
88 environ = os.environb # re-exports
86 else:
89 else:
87 # preferred encoding isn't known yet; use utf-8 to avoid unicode error
90 # preferred encoding isn't known yet; use utf-8 to avoid unicode error
88 # and recreate it once encoding is settled
91 # and recreate it once encoding is settled
89 environ = dict(
92 environ = dict(
90 (k.encode('utf-8'), v.encode('utf-8'))
93 (k.encode('utf-8'), v.encode('utf-8'))
91 for k, v in os.environ.items() # re-exports
94 for k, v in os.environ.items() # re-exports
92 )
95 )
93
96
94 _encodingrewrites = {
97 _encodingrewrites = {
95 b'646': b'ascii',
98 b'646': b'ascii',
96 b'ANSI_X3.4-1968': b'ascii',
99 b'ANSI_X3.4-1968': b'ascii',
97 }
100 }
98 # cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
101 # cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
99 # No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
102 # No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
100 # https://bugs.python.org/issue13216
103 # https://bugs.python.org/issue13216
101 if pycompat.iswindows and not pycompat.ispy3:
104 if pycompat.iswindows and not pycompat.ispy3:
102 _encodingrewrites[b'cp65001'] = b'utf-8'
105 _encodingrewrites[b'cp65001'] = b'utf-8'
103
106
104 try:
107 try:
105 encoding = environ.get(b"HGENCODING")
108 encoding = environ.get(b"HGENCODING")
106 if not encoding:
109 if not encoding:
107 encoding = locale.getpreferredencoding().encode('ascii') or b'ascii'
110 encoding = locale.getpreferredencoding().encode('ascii') or b'ascii'
108 encoding = _encodingrewrites.get(encoding, encoding)
111 encoding = _encodingrewrites.get(encoding, encoding)
109 except locale.Error:
112 except locale.Error:
110 encoding = b'ascii'
113 encoding = b'ascii'
111 encodingmode = environ.get(b"HGENCODINGMODE", b"strict")
114 encodingmode = environ.get(b"HGENCODINGMODE", b"strict")
112 fallbackencoding = b'ISO-8859-1'
115 fallbackencoding = b'ISO-8859-1'
113
116
114
117
115 class localstr(bytes):
118 class localstr(bytes):
116 '''This class allows strings that are unmodified to be
119 '''This class allows strings that are unmodified to be
117 round-tripped to the local encoding and back'''
120 round-tripped to the local encoding and back'''
118
121
119 def __new__(cls, u, l):
122 def __new__(cls, u, l):
120 # type: (Type[_Tlocalstr], bytes, bytes) -> _Tlocalstr
121 s = bytes.__new__(cls, l)
123 s = bytes.__new__(cls, l)
122 s._utf8 = u
124 s._utf8 = u
123 return s
125 return s
124
126
127 if _TYPE_CHECKING:
128 # pseudo implementation to help pytype see localstr() constructor
129 def __init__(self, u, l):
130 # type: (bytes, bytes) -> None
131 super(localstr, self).__init__(l)
132 self._utf8 = u
133
125 def __hash__(self):
134 def __hash__(self):
126 return hash(self._utf8) # avoid collisions in local string space
135 return hash(self._utf8) # avoid collisions in local string space
127
136
128
137
129 class safelocalstr(bytes):
138 class safelocalstr(bytes):
130 """Tagged string denoting it was previously an internal UTF-8 string,
139 """Tagged string denoting it was previously an internal UTF-8 string,
131 and can be converted back to UTF-8 losslessly
140 and can be converted back to UTF-8 losslessly
132
141
133 >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
142 >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
134 >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
143 >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
135 >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
144 >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
136 >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
145 >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
137 """
146 """
138
147
139
148
140 def tolocal(s):
149 def tolocal(s):
141 # type: (bytes) -> bytes
150 # type: (bytes) -> bytes
142 """
151 """
143 Convert a string from internal UTF-8 to local encoding
152 Convert a string from internal UTF-8 to local encoding
144
153
145 All internal strings should be UTF-8 but some repos before the
154 All internal strings should be UTF-8 but some repos before the
146 implementation of locale support may contain latin1 or possibly
155 implementation of locale support may contain latin1 or possibly
147 other character sets. We attempt to decode everything strictly
156 other character sets. We attempt to decode everything strictly
148 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
157 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
149 replace unknown characters.
158 replace unknown characters.
150
159
151 The localstr class is used to cache the known UTF-8 encoding of
160 The localstr class is used to cache the known UTF-8 encoding of
152 strings next to their local representation to allow lossless
161 strings next to their local representation to allow lossless
153 round-trip conversion back to UTF-8.
162 round-trip conversion back to UTF-8.
154
163
155 >>> u = b'foo: \\xc3\\xa4' # utf-8
164 >>> u = b'foo: \\xc3\\xa4' # utf-8
156 >>> l = tolocal(u)
165 >>> l = tolocal(u)
157 >>> l
166 >>> l
158 'foo: ?'
167 'foo: ?'
159 >>> fromlocal(l)
168 >>> fromlocal(l)
160 'foo: \\xc3\\xa4'
169 'foo: \\xc3\\xa4'
161 >>> u2 = b'foo: \\xc3\\xa1'
170 >>> u2 = b'foo: \\xc3\\xa1'
162 >>> d = { l: 1, tolocal(u2): 2 }
171 >>> d = { l: 1, tolocal(u2): 2 }
163 >>> len(d) # no collision
172 >>> len(d) # no collision
164 2
173 2
165 >>> b'foo: ?' in d
174 >>> b'foo: ?' in d
166 False
175 False
167 >>> l1 = b'foo: \\xe4' # historical latin1 fallback
176 >>> l1 = b'foo: \\xe4' # historical latin1 fallback
168 >>> l = tolocal(l1)
177 >>> l = tolocal(l1)
169 >>> l
178 >>> l
170 'foo: ?'
179 'foo: ?'
171 >>> fromlocal(l) # magically in utf-8
180 >>> fromlocal(l) # magically in utf-8
172 'foo: \\xc3\\xa4'
181 'foo: \\xc3\\xa4'
173 """
182 """
174
183
175 if isasciistr(s):
184 if isasciistr(s):
176 return s
185 return s
177
186
178 try:
187 try:
179 try:
188 try:
180 # make sure string is actually stored in UTF-8
189 # make sure string is actually stored in UTF-8
181 u = s.decode('UTF-8')
190 u = s.decode('UTF-8')
182 if encoding == b'UTF-8':
191 if encoding == b'UTF-8':
183 # fast path
192 # fast path
184 return s
193 return s
185 r = u.encode(_sysstr(encoding), "replace")
194 r = u.encode(_sysstr(encoding), "replace")
186 if u == r.decode(_sysstr(encoding)):
195 if u == r.decode(_sysstr(encoding)):
187 # r is a safe, non-lossy encoding of s
196 # r is a safe, non-lossy encoding of s
188 return safelocalstr(r)
197 return safelocalstr(r)
189 return localstr(s, r)
198 return localstr(s, r)
190 except UnicodeDecodeError:
199 except UnicodeDecodeError:
191 # we should only get here if we're looking at an ancient changeset
200 # we should only get here if we're looking at an ancient changeset
192 try:
201 try:
193 u = s.decode(_sysstr(fallbackencoding))
202 u = s.decode(_sysstr(fallbackencoding))
194 r = u.encode(_sysstr(encoding), "replace")
203 r = u.encode(_sysstr(encoding), "replace")
195 if u == r.decode(_sysstr(encoding)):
204 if u == r.decode(_sysstr(encoding)):
196 # r is a safe, non-lossy encoding of s
205 # r is a safe, non-lossy encoding of s
197 return safelocalstr(r)
206 return safelocalstr(r)
198 return localstr(u.encode('UTF-8'), r)
207 return localstr(u.encode('UTF-8'), r)
199 except UnicodeDecodeError:
208 except UnicodeDecodeError:
200 u = s.decode("utf-8", "replace") # last ditch
209 u = s.decode("utf-8", "replace") # last ditch
201 # can't round-trip
210 # can't round-trip
202 return u.encode(_sysstr(encoding), "replace")
211 return u.encode(_sysstr(encoding), "replace")
203 except LookupError as k:
212 except LookupError as k:
204 raise error.Abort(k, hint=b"please check your locale settings")
213 raise error.Abort(k, hint=b"please check your locale settings")
205
214
206
215
207 def fromlocal(s):
216 def fromlocal(s):
208 # type: (bytes) -> bytes
217 # type: (bytes) -> bytes
209 """
218 """
210 Convert a string from the local character encoding to UTF-8
219 Convert a string from the local character encoding to UTF-8
211
220
212 We attempt to decode strings using the encoding mode set by
221 We attempt to decode strings using the encoding mode set by
213 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
222 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
214 characters will cause an error message. Other modes include
223 characters will cause an error message. Other modes include
215 'replace', which replaces unknown characters with a special
224 'replace', which replaces unknown characters with a special
216 Unicode character, and 'ignore', which drops the character.
225 Unicode character, and 'ignore', which drops the character.
217 """
226 """
218
227
219 # can we do a lossless round-trip?
228 # can we do a lossless round-trip?
220 if isinstance(s, localstr):
229 if isinstance(s, localstr):
221 return s._utf8
230 return s._utf8
222 if isasciistr(s):
231 if isasciistr(s):
223 return s
232 return s
224
233
225 try:
234 try:
226 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
235 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
227 return u.encode("utf-8")
236 return u.encode("utf-8")
228 except UnicodeDecodeError as inst:
237 except UnicodeDecodeError as inst:
229 sub = s[max(0, inst.start - 10) : inst.start + 10]
238 sub = s[max(0, inst.start - 10) : inst.start + 10]
230 raise error.Abort(
239 raise error.Abort(
231 b"decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
240 b"decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
232 )
241 )
233 except LookupError as k:
242 except LookupError as k:
234 raise error.Abort(k, hint=b"please check your locale settings")
243 raise error.Abort(k, hint=b"please check your locale settings")
235
244
236
245
237 def unitolocal(u):
246 def unitolocal(u):
238 # type: (Text) -> bytes
247 # type: (Text) -> bytes
239 """Convert a unicode string to a byte string of local encoding"""
248 """Convert a unicode string to a byte string of local encoding"""
240 return tolocal(u.encode('utf-8'))
249 return tolocal(u.encode('utf-8'))
241
250
242
251
243 def unifromlocal(s):
252 def unifromlocal(s):
244 # type: (bytes) -> Text
253 # type: (bytes) -> Text
245 """Convert a byte string of local encoding to a unicode string"""
254 """Convert a byte string of local encoding to a unicode string"""
246 return fromlocal(s).decode('utf-8')
255 return fromlocal(s).decode('utf-8')
247
256
248
257
249 def unimethod(bytesfunc):
258 def unimethod(bytesfunc):
250 # type: (Callable[[Any], bytes]) -> Callable[[Any], Text]
259 # type: (Callable[[Any], bytes]) -> Callable[[Any], Text]
251 """Create a proxy method that forwards __unicode__() and __str__() of
260 """Create a proxy method that forwards __unicode__() and __str__() of
252 Python 3 to __bytes__()"""
261 Python 3 to __bytes__()"""
253
262
254 def unifunc(obj):
263 def unifunc(obj):
255 return unifromlocal(bytesfunc(obj))
264 return unifromlocal(bytesfunc(obj))
256
265
257 return unifunc
266 return unifunc
258
267
259
268
260 # converter functions between native str and byte string. use these if the
269 # converter functions between native str and byte string. use these if the
261 # character encoding is not aware (e.g. exception message) or is known to
270 # character encoding is not aware (e.g. exception message) or is known to
262 # be locale dependent (e.g. date formatting.)
271 # be locale dependent (e.g. date formatting.)
263 if pycompat.ispy3:
272 if pycompat.ispy3:
264 strtolocal = unitolocal
273 strtolocal = unitolocal
265 strfromlocal = unifromlocal
274 strfromlocal = unifromlocal
266 strmethod = unimethod
275 strmethod = unimethod
267 else:
276 else:
268
277
269 def strtolocal(s):
278 def strtolocal(s):
270 # type: (str) -> bytes
279 # type: (str) -> bytes
271 return s # pytype: disable=bad-return-type
280 return s # pytype: disable=bad-return-type
272
281
273 def strfromlocal(s):
282 def strfromlocal(s):
274 # type: (bytes) -> str
283 # type: (bytes) -> str
275 return s # pytype: disable=bad-return-type
284 return s # pytype: disable=bad-return-type
276
285
277 strmethod = pycompat.identity
286 strmethod = pycompat.identity
278
287
279 if not _nativeenviron:
288 if not _nativeenviron:
280 # now encoding and helper functions are available, recreate the environ
289 # now encoding and helper functions are available, recreate the environ
281 # dict to be exported to other modules
290 # dict to be exported to other modules
282 environ = dict(
291 environ = dict(
283 (tolocal(k.encode('utf-8')), tolocal(v.encode('utf-8')))
292 (tolocal(k.encode('utf-8')), tolocal(v.encode('utf-8')))
284 for k, v in os.environ.items() # re-exports
293 for k, v in os.environ.items() # re-exports
285 )
294 )
286
295
287 if pycompat.ispy3:
296 if pycompat.ispy3:
288 # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
297 # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
289 # returns bytes.
298 # returns bytes.
290 if pycompat.iswindows:
299 if pycompat.iswindows:
291 # Python 3 on Windows issues a DeprecationWarning about using the bytes
300 # Python 3 on Windows issues a DeprecationWarning about using the bytes
292 # API when os.getcwdb() is called.
301 # API when os.getcwdb() is called.
293 getcwd = lambda: strtolocal(os.getcwd()) # re-exports
302 getcwd = lambda: strtolocal(os.getcwd()) # re-exports
294 else:
303 else:
295 getcwd = os.getcwdb # re-exports
304 getcwd = os.getcwdb # re-exports
296 else:
305 else:
297 getcwd = os.getcwd # re-exports
306 getcwd = os.getcwd # re-exports
298
307
299 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
308 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
300 _wide = _sysstr(
309 _wide = _sysstr(
301 environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
310 environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
302 and b"WFA"
311 and b"WFA"
303 or b"WF"
312 or b"WF"
304 )
313 )
305
314
306
315
307 def colwidth(s):
316 def colwidth(s):
308 # type: (bytes) -> int
317 # type: (bytes) -> int
309 b"Find the column width of a string for display in the local encoding"
318 b"Find the column width of a string for display in the local encoding"
310 return ucolwidth(s.decode(_sysstr(encoding), 'replace'))
319 return ucolwidth(s.decode(_sysstr(encoding), 'replace'))
311
320
312
321
313 def ucolwidth(d):
322 def ucolwidth(d):
314 # type: (Text) -> int
323 # type: (Text) -> int
315 b"Find the column width of a Unicode string for display"
324 b"Find the column width of a Unicode string for display"
316 eaw = getattr(unicodedata, 'east_asian_width', None)
325 eaw = getattr(unicodedata, 'east_asian_width', None)
317 if eaw is not None:
326 if eaw is not None:
318 return sum([eaw(c) in _wide and 2 or 1 for c in d])
327 return sum([eaw(c) in _wide and 2 or 1 for c in d])
319 return len(d)
328 return len(d)
320
329
321
330
322 def getcols(s, start, c):
331 def getcols(s, start, c):
323 # type: (bytes, int, int) -> bytes
332 # type: (bytes, int, int) -> bytes
324 '''Use colwidth to find a c-column substring of s starting at byte
333 '''Use colwidth to find a c-column substring of s starting at byte
325 index start'''
334 index start'''
326 for x in pycompat.xrange(start + c, len(s)):
335 for x in pycompat.xrange(start + c, len(s)):
327 t = s[start:x]
336 t = s[start:x]
328 if colwidth(t) == c:
337 if colwidth(t) == c:
329 return t
338 return t
330 raise ValueError('substring not found')
339 raise ValueError('substring not found')
331
340
332
341
333 def trim(s, width, ellipsis=b'', leftside=False):
342 def trim(s, width, ellipsis=b'', leftside=False):
334 # type: (bytes, int, bytes, bool) -> bytes
343 # type: (bytes, int, bytes, bool) -> bytes
335 """Trim string 's' to at most 'width' columns (including 'ellipsis').
344 """Trim string 's' to at most 'width' columns (including 'ellipsis').
336
345
337 If 'leftside' is True, left side of string 's' is trimmed.
346 If 'leftside' is True, left side of string 's' is trimmed.
338 'ellipsis' is always placed at trimmed side.
347 'ellipsis' is always placed at trimmed side.
339
348
340 >>> from .node import bin
349 >>> from .node import bin
341 >>> def bprint(s):
350 >>> def bprint(s):
342 ... print(pycompat.sysstr(s))
351 ... print(pycompat.sysstr(s))
343 >>> ellipsis = b'+++'
352 >>> ellipsis = b'+++'
344 >>> from . import encoding
353 >>> from . import encoding
345 >>> encoding.encoding = b'utf-8'
354 >>> encoding.encoding = b'utf-8'
346 >>> t = b'1234567890'
355 >>> t = b'1234567890'
347 >>> bprint(trim(t, 12, ellipsis=ellipsis))
356 >>> bprint(trim(t, 12, ellipsis=ellipsis))
348 1234567890
357 1234567890
349 >>> bprint(trim(t, 10, ellipsis=ellipsis))
358 >>> bprint(trim(t, 10, ellipsis=ellipsis))
350 1234567890
359 1234567890
351 >>> bprint(trim(t, 8, ellipsis=ellipsis))
360 >>> bprint(trim(t, 8, ellipsis=ellipsis))
352 12345+++
361 12345+++
353 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
362 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
354 +++67890
363 +++67890
355 >>> bprint(trim(t, 8))
364 >>> bprint(trim(t, 8))
356 12345678
365 12345678
357 >>> bprint(trim(t, 8, leftside=True))
366 >>> bprint(trim(t, 8, leftside=True))
358 34567890
367 34567890
359 >>> bprint(trim(t, 3, ellipsis=ellipsis))
368 >>> bprint(trim(t, 3, ellipsis=ellipsis))
360 +++
369 +++
361 >>> bprint(trim(t, 1, ellipsis=ellipsis))
370 >>> bprint(trim(t, 1, ellipsis=ellipsis))
362 +
371 +
363 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
372 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
364 >>> t = u.encode(pycompat.sysstr(encoding.encoding))
373 >>> t = u.encode(pycompat.sysstr(encoding.encoding))
365 >>> bprint(trim(t, 12, ellipsis=ellipsis))
374 >>> bprint(trim(t, 12, ellipsis=ellipsis))
366 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
375 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
367 >>> bprint(trim(t, 10, ellipsis=ellipsis))
376 >>> bprint(trim(t, 10, ellipsis=ellipsis))
368 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
377 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
369 >>> bprint(trim(t, 8, ellipsis=ellipsis))
378 >>> bprint(trim(t, 8, ellipsis=ellipsis))
370 \xe3\x81\x82\xe3\x81\x84+++
379 \xe3\x81\x82\xe3\x81\x84+++
371 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
380 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
372 +++\xe3\x81\x88\xe3\x81\x8a
381 +++\xe3\x81\x88\xe3\x81\x8a
373 >>> bprint(trim(t, 5))
382 >>> bprint(trim(t, 5))
374 \xe3\x81\x82\xe3\x81\x84
383 \xe3\x81\x82\xe3\x81\x84
375 >>> bprint(trim(t, 5, leftside=True))
384 >>> bprint(trim(t, 5, leftside=True))
376 \xe3\x81\x88\xe3\x81\x8a
385 \xe3\x81\x88\xe3\x81\x8a
377 >>> bprint(trim(t, 4, ellipsis=ellipsis))
386 >>> bprint(trim(t, 4, ellipsis=ellipsis))
378 +++
387 +++
379 >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
388 >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
380 +++
389 +++
381 >>> t = bin(b'112233445566778899aa') # invalid byte sequence
390 >>> t = bin(b'112233445566778899aa') # invalid byte sequence
382 >>> bprint(trim(t, 12, ellipsis=ellipsis))
391 >>> bprint(trim(t, 12, ellipsis=ellipsis))
383 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
392 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
384 >>> bprint(trim(t, 10, ellipsis=ellipsis))
393 >>> bprint(trim(t, 10, ellipsis=ellipsis))
385 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
394 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
386 >>> bprint(trim(t, 8, ellipsis=ellipsis))
395 >>> bprint(trim(t, 8, ellipsis=ellipsis))
387 \x11\x22\x33\x44\x55+++
396 \x11\x22\x33\x44\x55+++
388 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
397 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
389 +++\x66\x77\x88\x99\xaa
398 +++\x66\x77\x88\x99\xaa
390 >>> bprint(trim(t, 8))
399 >>> bprint(trim(t, 8))
391 \x11\x22\x33\x44\x55\x66\x77\x88
400 \x11\x22\x33\x44\x55\x66\x77\x88
392 >>> bprint(trim(t, 8, leftside=True))
401 >>> bprint(trim(t, 8, leftside=True))
393 \x33\x44\x55\x66\x77\x88\x99\xaa
402 \x33\x44\x55\x66\x77\x88\x99\xaa
394 >>> bprint(trim(t, 3, ellipsis=ellipsis))
403 >>> bprint(trim(t, 3, ellipsis=ellipsis))
395 +++
404 +++
396 >>> bprint(trim(t, 1, ellipsis=ellipsis))
405 >>> bprint(trim(t, 1, ellipsis=ellipsis))
397 +
406 +
398 """
407 """
399 try:
408 try:
400 u = s.decode(_sysstr(encoding))
409 u = s.decode(_sysstr(encoding))
401 except UnicodeDecodeError:
410 except UnicodeDecodeError:
402 if len(s) <= width: # trimming is not needed
411 if len(s) <= width: # trimming is not needed
403 return s
412 return s
404 width -= len(ellipsis)
413 width -= len(ellipsis)
405 if width <= 0: # no enough room even for ellipsis
414 if width <= 0: # no enough room even for ellipsis
406 return ellipsis[: width + len(ellipsis)]
415 return ellipsis[: width + len(ellipsis)]
407 if leftside:
416 if leftside:
408 return ellipsis + s[-width:]
417 return ellipsis + s[-width:]
409 return s[:width] + ellipsis
418 return s[:width] + ellipsis
410
419
411 if ucolwidth(u) <= width: # trimming is not needed
420 if ucolwidth(u) <= width: # trimming is not needed
412 return s
421 return s
413
422
414 width -= len(ellipsis)
423 width -= len(ellipsis)
415 if width <= 0: # no enough room even for ellipsis
424 if width <= 0: # no enough room even for ellipsis
416 return ellipsis[: width + len(ellipsis)]
425 return ellipsis[: width + len(ellipsis)]
417
426
418 if leftside:
427 if leftside:
419 uslice = lambda i: u[i:]
428 uslice = lambda i: u[i:]
420 concat = lambda s: ellipsis + s
429 concat = lambda s: ellipsis + s
421 else:
430 else:
422 uslice = lambda i: u[:-i]
431 uslice = lambda i: u[:-i]
423 concat = lambda s: s + ellipsis
432 concat = lambda s: s + ellipsis
424 for i in pycompat.xrange(1, len(u)):
433 for i in pycompat.xrange(1, len(u)):
425 usub = uslice(i)
434 usub = uslice(i)
426 if ucolwidth(usub) <= width:
435 if ucolwidth(usub) <= width:
427 return concat(usub.encode(_sysstr(encoding)))
436 return concat(usub.encode(_sysstr(encoding)))
428 return ellipsis # no enough room for multi-column characters
437 return ellipsis # no enough room for multi-column characters
429
438
430
439
431 def lower(s):
440 def lower(s):
432 # type: (bytes) -> bytes
441 # type: (bytes) -> bytes
433 b"best-effort encoding-aware case-folding of local string s"
442 b"best-effort encoding-aware case-folding of local string s"
434 try:
443 try:
435 return asciilower(s)
444 return asciilower(s)
436 except UnicodeDecodeError:
445 except UnicodeDecodeError:
437 pass
446 pass
438 try:
447 try:
439 if isinstance(s, localstr):
448 if isinstance(s, localstr):
440 u = s._utf8.decode("utf-8")
449 u = s._utf8.decode("utf-8")
441 else:
450 else:
442 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
451 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
443
452
444 lu = u.lower()
453 lu = u.lower()
445 if u == lu:
454 if u == lu:
446 return s # preserve localstring
455 return s # preserve localstring
447 return lu.encode(_sysstr(encoding))
456 return lu.encode(_sysstr(encoding))
448 except UnicodeError:
457 except UnicodeError:
449 return s.lower() # we don't know how to fold this except in ASCII
458 return s.lower() # we don't know how to fold this except in ASCII
450 except LookupError as k:
459 except LookupError as k:
451 raise error.Abort(k, hint=b"please check your locale settings")
460 raise error.Abort(k, hint=b"please check your locale settings")
452
461
453
462
454 def upper(s):
463 def upper(s):
455 # type: (bytes) -> bytes
464 # type: (bytes) -> bytes
456 b"best-effort encoding-aware case-folding of local string s"
465 b"best-effort encoding-aware case-folding of local string s"
457 try:
466 try:
458 return asciiupper(s)
467 return asciiupper(s)
459 except UnicodeDecodeError:
468 except UnicodeDecodeError:
460 return upperfallback(s)
469 return upperfallback(s)
461
470
462
471
463 def upperfallback(s):
472 def upperfallback(s):
464 # type: (Any) -> Any
473 # type: (Any) -> Any
465 try:
474 try:
466 if isinstance(s, localstr):
475 if isinstance(s, localstr):
467 u = s._utf8.decode("utf-8")
476 u = s._utf8.decode("utf-8")
468 else:
477 else:
469 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
478 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
470
479
471 uu = u.upper()
480 uu = u.upper()
472 if u == uu:
481 if u == uu:
473 return s # preserve localstring
482 return s # preserve localstring
474 return uu.encode(_sysstr(encoding))
483 return uu.encode(_sysstr(encoding))
475 except UnicodeError:
484 except UnicodeError:
476 return s.upper() # we don't know how to fold this except in ASCII
485 return s.upper() # we don't know how to fold this except in ASCII
477 except LookupError as k:
486 except LookupError as k:
478 raise error.Abort(k, hint=b"please check your locale settings")
487 raise error.Abort(k, hint=b"please check your locale settings")
479
488
480
489
481 class normcasespecs(object):
490 class normcasespecs(object):
482 '''what a platform's normcase does to ASCII strings
491 '''what a platform's normcase does to ASCII strings
483
492
484 This is specified per platform, and should be consistent with what normcase
493 This is specified per platform, and should be consistent with what normcase
485 on that platform actually does.
494 on that platform actually does.
486
495
487 lower: normcase lowercases ASCII strings
496 lower: normcase lowercases ASCII strings
488 upper: normcase uppercases ASCII strings
497 upper: normcase uppercases ASCII strings
489 other: the fallback function should always be called
498 other: the fallback function should always be called
490
499
491 This should be kept in sync with normcase_spec in util.h.'''
500 This should be kept in sync with normcase_spec in util.h.'''
492
501
493 lower = -1
502 lower = -1
494 upper = 1
503 upper = 1
495 other = 0
504 other = 0
496
505
497
506
def jsonescape(s, paranoid=False):
    # type: (Any, Any) -> Any
    '''returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr/safelocalstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape(b'this is a test')
    'this is a test'
    >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
    'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
    >>> jsonescape(b'a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape(b'')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> s = b'escape characters: \\0 \\x0b \\x7f'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape(b'<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    '''

    # normalize to UTF-8b first so the escapers only ever see utf-8 bytes
    encoded = toutf8b(s)
    try:
        # fast path: the C implementation rejects input it can't handle
        # by raising ValueError
        return _jsonescapeu8fast(encoded, paranoid)
    except ValueError:
        # slow but complete pure-Python fallback
        return charencodepure.jsonescapeu8fallback(encoded, paranoid)
550
559
551
560
# We need to decode/encode U+DCxx codes transparently since invalid UTF-8
# bytes are mapped to that range.
if pycompat.ispy3:
    _utf8strict = r'surrogatepass'
else:
    _utf8strict = r'strict'

# number of bytes in a utf-8 sequence, indexed by the high nibble of its
# first byte; 0 marks a single-byte (ascii) character
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
560
569
561
570
def getutf8char(s, pos):
    # type: (bytes, int) -> bytes
    '''get the next full utf-8 character in the given string, starting at pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    '''

    # the high nibble of the lead byte determines the sequence length
    nbytes = _utf8len[ord(s[pos : pos + 1]) >> 4]
    if not nbytes:
        # single-byte ascii character; nothing to validate
        return s[pos : pos + 1]

    char = s[pos : pos + nbytes]
    # an attempted decode validates the sequence (raises UnicodeError
    # on a truncated or malformed character)
    char.decode("utf-8", _utf8strict)
    return char
579
588
580
589
def toutf8b(s):
    # type: (bytes) -> bytes
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    if isinstance(s, localstr):
        # assume that the original UTF-8 sequence would never contain
        # invalid characters in U+DCxx range
        return s._utf8
    elif isinstance(s, safelocalstr):
        # already verified that s is non-lossy in legacy encoding, which
        # shouldn't contain characters in U+DCxx range
        return fromlocal(s)
    elif isasciistr(s):
        return s
    # fast path: no potential surrogate bytes (all start with 0xed), so a
    # successful whole-string decode proves s is already valid UTF-8
    if b"\xed" not in s:
        try:
            s.decode('utf-8', _utf8strict)
            return s
        except UnicodeDecodeError:
            pass

    # slow path: walk the string character by character, escaping each
    # byte that can't be decoded into the U+DCxx surrogate range
    s = pycompat.bytestr(s)
    r = b""
    pos = 0
    l = len(s)
    while pos < l:
        try:
            c = getutf8char(s, pos)
            if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
                # have to re-escape existing U+DCxx characters
                c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
                pos += 1
            else:
                pos += len(c)
        except UnicodeDecodeError:
            # invalid byte: map it into the U+DCxx surrogate range
            c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
            pos += 1
        r += c
    return r
648
657
649
658
def fromutf8b(s):
    # type: (bytes) -> bytes
    '''Given a UTF-8b string, return a local, possibly-binary string.

    return the original binary string. This
    is a round-trip process for strings like filenames, but metadata
    that was passed through tolocal will remain in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = b"\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip(b"\\xc2\\xc2\\x80")
    True
    >>> roundtrip(b"\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
    True
    '''

    if isasciistr(s):
        return s
    # fast path - look for uDxxx prefixes in s
    # (every U+DCxx escape encodes to a sequence starting with 0xed)
    if b"\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    s = pycompat.bytestr(s)
    r = b""
    pos = 0
    l = len(s)
    while pos < l:
        c = getutf8char(s, pos)
        pos += len(c)
        # unescape U+DCxx characters
        if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
            # recover the original escaped byte from the low 8 bits of
            # the surrogate code point
            c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF)
        r += c
    return r
General Comments 0
You need to be logged in to leave comments. Login now