##// END OF EJS Templates
windows: wrap `os.getcwd()` in `os.path.realpath()` on py3...
Matt Harbison -
r47037:3dfebba9 default
parent child Browse files
Show More
@@ -1,704 +1,709 b''
1 # encoding.py - character transcoding support for Mercurial
1 # encoding.py - character transcoding support for Mercurial
2 #
2 #
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8 from __future__ import absolute_import, print_function
8 from __future__ import absolute_import, print_function
9
9
10 import locale
10 import locale
11 import os
11 import os
12 import unicodedata
12 import unicodedata
13
13
14 from .pycompat import getattr
14 from .pycompat import getattr
15 from . import (
15 from . import (
16 error,
16 error,
17 policy,
17 policy,
18 pycompat,
18 pycompat,
19 )
19 )
20
20
21 from .pure import charencode as charencodepure
21 from .pure import charencode as charencodepure
22
22
23 if pycompat.TYPE_CHECKING:
23 if pycompat.TYPE_CHECKING:
24 from typing import (
24 from typing import (
25 Any,
25 Any,
26 Callable,
26 Callable,
27 List,
27 List,
28 Text,
28 Text,
29 Type,
29 Type,
30 TypeVar,
30 TypeVar,
31 Union,
31 Union,
32 )
32 )
33
33
34 # keep pyflakes happy
34 # keep pyflakes happy
35 for t in (Any, Callable, List, Text, Type, Union):
35 for t in (Any, Callable, List, Text, Type, Union):
36 assert t
36 assert t
37
37
38 _Tlocalstr = TypeVar('_Tlocalstr', bound='localstr')
38 _Tlocalstr = TypeVar('_Tlocalstr', bound='localstr')
39
39
40 charencode = policy.importmod('charencode')
40 charencode = policy.importmod('charencode')
41
41
42 isasciistr = charencode.isasciistr
42 isasciistr = charencode.isasciistr
43 asciilower = charencode.asciilower
43 asciilower = charencode.asciilower
44 asciiupper = charencode.asciiupper
44 asciiupper = charencode.asciiupper
45 _jsonescapeu8fast = charencode.jsonescapeu8fast
45 _jsonescapeu8fast = charencode.jsonescapeu8fast
46
46
47 _sysstr = pycompat.sysstr
47 _sysstr = pycompat.sysstr
48
48
49 if pycompat.ispy3:
49 if pycompat.ispy3:
50 unichr = chr
50 unichr = chr
51
51
52 # These unicode characters are ignored by HFS+ (Apple Technote 1150,
52 # These unicode characters are ignored by HFS+ (Apple Technote 1150,
53 # "Unicode Subtleties"), so we need to ignore them in some places for
53 # "Unicode Subtleties"), so we need to ignore them in some places for
54 # sanity.
54 # sanity.
55 _ignore = [
55 _ignore = [
56 unichr(int(x, 16)).encode("utf-8")
56 unichr(int(x, 16)).encode("utf-8")
57 for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e "
57 for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e "
58 b"206a 206b 206c 206d 206e 206f feff".split()
58 b"206a 206b 206c 206d 206e 206f feff".split()
59 ]
59 ]
60 # verify the next function will work
60 # verify the next function will work
61 assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)
61 assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)
62
62
63
63
64 def hfsignoreclean(s):
64 def hfsignoreclean(s):
65 # type: (bytes) -> bytes
65 # type: (bytes) -> bytes
66 """Remove codepoints ignored by HFS+ from s.
66 """Remove codepoints ignored by HFS+ from s.
67
67
68 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
68 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
69 '.hg'
69 '.hg'
70 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
70 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
71 '.hg'
71 '.hg'
72 """
72 """
73 if b"\xe2" in s or b"\xef" in s:
73 if b"\xe2" in s or b"\xef" in s:
74 for c in _ignore:
74 for c in _ignore:
75 s = s.replace(c, b'')
75 s = s.replace(c, b'')
76 return s
76 return s
77
77
78
78
79 # encoding.environ is provided read-only, which may not be used to modify
79 # encoding.environ is provided read-only, which may not be used to modify
80 # the process environment
80 # the process environment
81 _nativeenviron = not pycompat.ispy3 or os.supports_bytes_environ
81 _nativeenviron = not pycompat.ispy3 or os.supports_bytes_environ
82 if not pycompat.ispy3:
82 if not pycompat.ispy3:
83 environ = os.environ # re-exports
83 environ = os.environ # re-exports
84 elif _nativeenviron:
84 elif _nativeenviron:
85 environ = os.environb # re-exports
85 environ = os.environb # re-exports
86 else:
86 else:
87 # preferred encoding isn't known yet; use utf-8 to avoid unicode error
87 # preferred encoding isn't known yet; use utf-8 to avoid unicode error
88 # and recreate it once encoding is settled
88 # and recreate it once encoding is settled
89 environ = {
89 environ = {
90 k.encode('utf-8'): v.encode('utf-8')
90 k.encode('utf-8'): v.encode('utf-8')
91 for k, v in os.environ.items() # re-exports
91 for k, v in os.environ.items() # re-exports
92 }
92 }
93
93
94 _encodingrewrites = {
94 _encodingrewrites = {
95 b'646': b'ascii',
95 b'646': b'ascii',
96 b'ANSI_X3.4-1968': b'ascii',
96 b'ANSI_X3.4-1968': b'ascii',
97 }
97 }
98 # cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
98 # cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
99 # No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
99 # No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
100 # https://bugs.python.org/issue13216
100 # https://bugs.python.org/issue13216
101 if pycompat.iswindows and not pycompat.ispy3:
101 if pycompat.iswindows and not pycompat.ispy3:
102 _encodingrewrites[b'cp65001'] = b'utf-8'
102 _encodingrewrites[b'cp65001'] = b'utf-8'
103
103
104 try:
104 try:
105 encoding = environ.get(b"HGENCODING")
105 encoding = environ.get(b"HGENCODING")
106 if not encoding:
106 if not encoding:
107 encoding = locale.getpreferredencoding().encode('ascii') or b'ascii'
107 encoding = locale.getpreferredencoding().encode('ascii') or b'ascii'
108 encoding = _encodingrewrites.get(encoding, encoding)
108 encoding = _encodingrewrites.get(encoding, encoding)
109 except locale.Error:
109 except locale.Error:
110 encoding = b'ascii'
110 encoding = b'ascii'
111 encodingmode = environ.get(b"HGENCODINGMODE", b"strict")
111 encodingmode = environ.get(b"HGENCODINGMODE", b"strict")
112 fallbackencoding = b'ISO-8859-1'
112 fallbackencoding = b'ISO-8859-1'
113
113
114
114
115 class localstr(bytes):
115 class localstr(bytes):
116 """This class allows strings that are unmodified to be
116 """This class allows strings that are unmodified to be
117 round-tripped to the local encoding and back"""
117 round-tripped to the local encoding and back"""
118
118
119 def __new__(cls, u, l):
119 def __new__(cls, u, l):
120 s = bytes.__new__(cls, l)
120 s = bytes.__new__(cls, l)
121 s._utf8 = u
121 s._utf8 = u
122 return s
122 return s
123
123
124 if pycompat.TYPE_CHECKING:
124 if pycompat.TYPE_CHECKING:
125 # pseudo implementation to help pytype see localstr() constructor
125 # pseudo implementation to help pytype see localstr() constructor
126 def __init__(self, u, l):
126 def __init__(self, u, l):
127 # type: (bytes, bytes) -> None
127 # type: (bytes, bytes) -> None
128 super(localstr, self).__init__(l)
128 super(localstr, self).__init__(l)
129 self._utf8 = u
129 self._utf8 = u
130
130
131 def __hash__(self):
131 def __hash__(self):
132 return hash(self._utf8) # avoid collisions in local string space
132 return hash(self._utf8) # avoid collisions in local string space
133
133
134
134
135 class safelocalstr(bytes):
135 class safelocalstr(bytes):
136 """Tagged string denoting it was previously an internal UTF-8 string,
136 """Tagged string denoting it was previously an internal UTF-8 string,
137 and can be converted back to UTF-8 losslessly
137 and can be converted back to UTF-8 losslessly
138
138
139 >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
139 >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
140 >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
140 >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
141 >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
141 >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
142 >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
142 >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
143 """
143 """
144
144
145
145
146 def tolocal(s):
146 def tolocal(s):
147 # type: (bytes) -> bytes
147 # type: (bytes) -> bytes
148 """
148 """
149 Convert a string from internal UTF-8 to local encoding
149 Convert a string from internal UTF-8 to local encoding
150
150
151 All internal strings should be UTF-8 but some repos before the
151 All internal strings should be UTF-8 but some repos before the
152 implementation of locale support may contain latin1 or possibly
152 implementation of locale support may contain latin1 or possibly
153 other character sets. We attempt to decode everything strictly
153 other character sets. We attempt to decode everything strictly
154 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
154 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
155 replace unknown characters.
155 replace unknown characters.
156
156
157 The localstr class is used to cache the known UTF-8 encoding of
157 The localstr class is used to cache the known UTF-8 encoding of
158 strings next to their local representation to allow lossless
158 strings next to their local representation to allow lossless
159 round-trip conversion back to UTF-8.
159 round-trip conversion back to UTF-8.
160
160
161 >>> u = b'foo: \\xc3\\xa4' # utf-8
161 >>> u = b'foo: \\xc3\\xa4' # utf-8
162 >>> l = tolocal(u)
162 >>> l = tolocal(u)
163 >>> l
163 >>> l
164 'foo: ?'
164 'foo: ?'
165 >>> fromlocal(l)
165 >>> fromlocal(l)
166 'foo: \\xc3\\xa4'
166 'foo: \\xc3\\xa4'
167 >>> u2 = b'foo: \\xc3\\xa1'
167 >>> u2 = b'foo: \\xc3\\xa1'
168 >>> d = { l: 1, tolocal(u2): 2 }
168 >>> d = { l: 1, tolocal(u2): 2 }
169 >>> len(d) # no collision
169 >>> len(d) # no collision
170 2
170 2
171 >>> b'foo: ?' in d
171 >>> b'foo: ?' in d
172 False
172 False
173 >>> l1 = b'foo: \\xe4' # historical latin1 fallback
173 >>> l1 = b'foo: \\xe4' # historical latin1 fallback
174 >>> l = tolocal(l1)
174 >>> l = tolocal(l1)
175 >>> l
175 >>> l
176 'foo: ?'
176 'foo: ?'
177 >>> fromlocal(l) # magically in utf-8
177 >>> fromlocal(l) # magically in utf-8
178 'foo: \\xc3\\xa4'
178 'foo: \\xc3\\xa4'
179 """
179 """
180
180
181 if isasciistr(s):
181 if isasciistr(s):
182 return s
182 return s
183
183
184 try:
184 try:
185 try:
185 try:
186 # make sure string is actually stored in UTF-8
186 # make sure string is actually stored in UTF-8
187 u = s.decode('UTF-8')
187 u = s.decode('UTF-8')
188 if encoding == b'UTF-8':
188 if encoding == b'UTF-8':
189 # fast path
189 # fast path
190 return s
190 return s
191 r = u.encode(_sysstr(encoding), "replace")
191 r = u.encode(_sysstr(encoding), "replace")
192 if u == r.decode(_sysstr(encoding)):
192 if u == r.decode(_sysstr(encoding)):
193 # r is a safe, non-lossy encoding of s
193 # r is a safe, non-lossy encoding of s
194 return safelocalstr(r)
194 return safelocalstr(r)
195 return localstr(s, r)
195 return localstr(s, r)
196 except UnicodeDecodeError:
196 except UnicodeDecodeError:
197 # we should only get here if we're looking at an ancient changeset
197 # we should only get here if we're looking at an ancient changeset
198 try:
198 try:
199 u = s.decode(_sysstr(fallbackencoding))
199 u = s.decode(_sysstr(fallbackencoding))
200 r = u.encode(_sysstr(encoding), "replace")
200 r = u.encode(_sysstr(encoding), "replace")
201 if u == r.decode(_sysstr(encoding)):
201 if u == r.decode(_sysstr(encoding)):
202 # r is a safe, non-lossy encoding of s
202 # r is a safe, non-lossy encoding of s
203 return safelocalstr(r)
203 return safelocalstr(r)
204 return localstr(u.encode('UTF-8'), r)
204 return localstr(u.encode('UTF-8'), r)
205 except UnicodeDecodeError:
205 except UnicodeDecodeError:
206 u = s.decode("utf-8", "replace") # last ditch
206 u = s.decode("utf-8", "replace") # last ditch
207 # can't round-trip
207 # can't round-trip
208 return u.encode(_sysstr(encoding), "replace")
208 return u.encode(_sysstr(encoding), "replace")
209 except LookupError as k:
209 except LookupError as k:
210 raise error.Abort(
210 raise error.Abort(
211 pycompat.bytestr(k), hint=b"please check your locale settings"
211 pycompat.bytestr(k), hint=b"please check your locale settings"
212 )
212 )
213
213
214
214
215 def fromlocal(s):
215 def fromlocal(s):
216 # type: (bytes) -> bytes
216 # type: (bytes) -> bytes
217 """
217 """
218 Convert a string from the local character encoding to UTF-8
218 Convert a string from the local character encoding to UTF-8
219
219
220 We attempt to decode strings using the encoding mode set by
220 We attempt to decode strings using the encoding mode set by
221 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
221 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
222 characters will cause an error message. Other modes include
222 characters will cause an error message. Other modes include
223 'replace', which replaces unknown characters with a special
223 'replace', which replaces unknown characters with a special
224 Unicode character, and 'ignore', which drops the character.
224 Unicode character, and 'ignore', which drops the character.
225 """
225 """
226
226
227 # can we do a lossless round-trip?
227 # can we do a lossless round-trip?
228 if isinstance(s, localstr):
228 if isinstance(s, localstr):
229 return s._utf8
229 return s._utf8
230 if isasciistr(s):
230 if isasciistr(s):
231 return s
231 return s
232
232
233 try:
233 try:
234 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
234 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
235 return u.encode("utf-8")
235 return u.encode("utf-8")
236 except UnicodeDecodeError as inst:
236 except UnicodeDecodeError as inst:
237 sub = s[max(0, inst.start - 10) : inst.start + 10]
237 sub = s[max(0, inst.start - 10) : inst.start + 10]
238 raise error.Abort(
238 raise error.Abort(
239 b"decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
239 b"decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
240 )
240 )
241 except LookupError as k:
241 except LookupError as k:
242 raise error.Abort(k, hint=b"please check your locale settings")
242 raise error.Abort(k, hint=b"please check your locale settings")
243
243
244
244
245 def unitolocal(u):
245 def unitolocal(u):
246 # type: (Text) -> bytes
246 # type: (Text) -> bytes
247 """Convert a unicode string to a byte string of local encoding"""
247 """Convert a unicode string to a byte string of local encoding"""
248 return tolocal(u.encode('utf-8'))
248 return tolocal(u.encode('utf-8'))
249
249
250
250
251 def unifromlocal(s):
251 def unifromlocal(s):
252 # type: (bytes) -> Text
252 # type: (bytes) -> Text
253 """Convert a byte string of local encoding to a unicode string"""
253 """Convert a byte string of local encoding to a unicode string"""
254 return fromlocal(s).decode('utf-8')
254 return fromlocal(s).decode('utf-8')
255
255
256
256
257 def unimethod(bytesfunc):
257 def unimethod(bytesfunc):
258 # type: (Callable[[Any], bytes]) -> Callable[[Any], Text]
258 # type: (Callable[[Any], bytes]) -> Callable[[Any], Text]
259 """Create a proxy method that forwards __unicode__() and __str__() of
259 """Create a proxy method that forwards __unicode__() and __str__() of
260 Python 3 to __bytes__()"""
260 Python 3 to __bytes__()"""
261
261
262 def unifunc(obj):
262 def unifunc(obj):
263 return unifromlocal(bytesfunc(obj))
263 return unifromlocal(bytesfunc(obj))
264
264
265 return unifunc
265 return unifunc
266
266
267
267
268 # converter functions between native str and byte string. use these if the
268 # converter functions between native str and byte string. use these if the
269 # character encoding is not aware (e.g. exception message) or is known to
269 # character encoding is not aware (e.g. exception message) or is known to
270 # be locale dependent (e.g. date formatting.)
270 # be locale dependent (e.g. date formatting.)
271 if pycompat.ispy3:
271 if pycompat.ispy3:
272 strtolocal = unitolocal
272 strtolocal = unitolocal
273 strfromlocal = unifromlocal
273 strfromlocal = unifromlocal
274 strmethod = unimethod
274 strmethod = unimethod
275 else:
275 else:
276
276
277 def strtolocal(s):
277 def strtolocal(s):
278 # type: (str) -> bytes
278 # type: (str) -> bytes
279 return s # pytype: disable=bad-return-type
279 return s # pytype: disable=bad-return-type
280
280
281 def strfromlocal(s):
281 def strfromlocal(s):
282 # type: (bytes) -> str
282 # type: (bytes) -> str
283 return s # pytype: disable=bad-return-type
283 return s # pytype: disable=bad-return-type
284
284
285 strmethod = pycompat.identity
285 strmethod = pycompat.identity
286
286
287 if not _nativeenviron:
287 if not _nativeenviron:
288 # now encoding and helper functions are available, recreate the environ
288 # now encoding and helper functions are available, recreate the environ
289 # dict to be exported to other modules
289 # dict to be exported to other modules
290 environ = {
290 environ = {
291 tolocal(k.encode('utf-8')): tolocal(v.encode('utf-8'))
291 tolocal(k.encode('utf-8')): tolocal(v.encode('utf-8'))
292 for k, v in os.environ.items() # re-exports
292 for k, v in os.environ.items() # re-exports
293 }
293 }
294
294
295 if pycompat.ispy3:
295 if pycompat.ispy3:
296 # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
296 # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
297 # returns bytes.
297 # returns bytes.
298 if pycompat.iswindows:
298 if pycompat.iswindows:
299 # Python 3 on Windows issues a DeprecationWarning about using the bytes
299 # Python 3 on Windows issues a DeprecationWarning about using the bytes
300 # API when os.getcwdb() is called.
300 # API when os.getcwdb() is called.
301 getcwd = lambda: strtolocal(os.getcwd()) # re-exports
301 #
302 # Additionally, py3.8+ uppercases the drive letter when calling
303 # os.path.realpath(), which is used on ``repo.root``. Since those
304 # strings are compared in various places as simple strings, also call
305 # realpath here. See https://bugs.python.org/issue40368
306 getcwd = lambda: strtolocal(os.path.realpath(os.getcwd())) # re-exports
302 else:
307 else:
303 getcwd = os.getcwdb # re-exports
308 getcwd = os.getcwdb # re-exports
304 else:
309 else:
305 getcwd = os.getcwd # re-exports
310 getcwd = os.getcwd # re-exports
306
311
307 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
312 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
308 _wide = _sysstr(
313 _wide = _sysstr(
309 environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
314 environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
310 and b"WFA"
315 and b"WFA"
311 or b"WF"
316 or b"WF"
312 )
317 )
313
318
314
319
315 def colwidth(s):
320 def colwidth(s):
316 # type: (bytes) -> int
321 # type: (bytes) -> int
317 """Find the column width of a string for display in the local encoding"""
322 """Find the column width of a string for display in the local encoding"""
318 return ucolwidth(s.decode(_sysstr(encoding), 'replace'))
323 return ucolwidth(s.decode(_sysstr(encoding), 'replace'))
319
324
320
325
321 def ucolwidth(d):
326 def ucolwidth(d):
322 # type: (Text) -> int
327 # type: (Text) -> int
323 """Find the column width of a Unicode string for display"""
328 """Find the column width of a Unicode string for display"""
324 eaw = getattr(unicodedata, 'east_asian_width', None)
329 eaw = getattr(unicodedata, 'east_asian_width', None)
325 if eaw is not None:
330 if eaw is not None:
326 return sum([eaw(c) in _wide and 2 or 1 for c in d])
331 return sum([eaw(c) in _wide and 2 or 1 for c in d])
327 return len(d)
332 return len(d)
328
333
329
334
330 def getcols(s, start, c):
335 def getcols(s, start, c):
331 # type: (bytes, int, int) -> bytes
336 # type: (bytes, int, int) -> bytes
332 """Use colwidth to find a c-column substring of s starting at byte
337 """Use colwidth to find a c-column substring of s starting at byte
333 index start"""
338 index start"""
334 for x in pycompat.xrange(start + c, len(s)):
339 for x in pycompat.xrange(start + c, len(s)):
335 t = s[start:x]
340 t = s[start:x]
336 if colwidth(t) == c:
341 if colwidth(t) == c:
337 return t
342 return t
338 raise ValueError('substring not found')
343 raise ValueError('substring not found')
339
344
340
345
341 def trim(s, width, ellipsis=b'', leftside=False):
346 def trim(s, width, ellipsis=b'', leftside=False):
342 # type: (bytes, int, bytes, bool) -> bytes
347 # type: (bytes, int, bytes, bool) -> bytes
343 """Trim string 's' to at most 'width' columns (including 'ellipsis').
348 """Trim string 's' to at most 'width' columns (including 'ellipsis').
344
349
345 If 'leftside' is True, left side of string 's' is trimmed.
350 If 'leftside' is True, left side of string 's' is trimmed.
346 'ellipsis' is always placed at trimmed side.
351 'ellipsis' is always placed at trimmed side.
347
352
348 >>> from .node import bin
353 >>> from .node import bin
349 >>> def bprint(s):
354 >>> def bprint(s):
350 ... print(pycompat.sysstr(s))
355 ... print(pycompat.sysstr(s))
351 >>> ellipsis = b'+++'
356 >>> ellipsis = b'+++'
352 >>> from . import encoding
357 >>> from . import encoding
353 >>> encoding.encoding = b'utf-8'
358 >>> encoding.encoding = b'utf-8'
354 >>> t = b'1234567890'
359 >>> t = b'1234567890'
355 >>> bprint(trim(t, 12, ellipsis=ellipsis))
360 >>> bprint(trim(t, 12, ellipsis=ellipsis))
356 1234567890
361 1234567890
357 >>> bprint(trim(t, 10, ellipsis=ellipsis))
362 >>> bprint(trim(t, 10, ellipsis=ellipsis))
358 1234567890
363 1234567890
359 >>> bprint(trim(t, 8, ellipsis=ellipsis))
364 >>> bprint(trim(t, 8, ellipsis=ellipsis))
360 12345+++
365 12345+++
361 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
366 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
362 +++67890
367 +++67890
363 >>> bprint(trim(t, 8))
368 >>> bprint(trim(t, 8))
364 12345678
369 12345678
365 >>> bprint(trim(t, 8, leftside=True))
370 >>> bprint(trim(t, 8, leftside=True))
366 34567890
371 34567890
367 >>> bprint(trim(t, 3, ellipsis=ellipsis))
372 >>> bprint(trim(t, 3, ellipsis=ellipsis))
368 +++
373 +++
369 >>> bprint(trim(t, 1, ellipsis=ellipsis))
374 >>> bprint(trim(t, 1, ellipsis=ellipsis))
370 +
375 +
371 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
376 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
372 >>> t = u.encode(pycompat.sysstr(encoding.encoding))
377 >>> t = u.encode(pycompat.sysstr(encoding.encoding))
373 >>> bprint(trim(t, 12, ellipsis=ellipsis))
378 >>> bprint(trim(t, 12, ellipsis=ellipsis))
374 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
379 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
375 >>> bprint(trim(t, 10, ellipsis=ellipsis))
380 >>> bprint(trim(t, 10, ellipsis=ellipsis))
376 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
381 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
377 >>> bprint(trim(t, 8, ellipsis=ellipsis))
382 >>> bprint(trim(t, 8, ellipsis=ellipsis))
378 \xe3\x81\x82\xe3\x81\x84+++
383 \xe3\x81\x82\xe3\x81\x84+++
379 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
384 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
380 +++\xe3\x81\x88\xe3\x81\x8a
385 +++\xe3\x81\x88\xe3\x81\x8a
381 >>> bprint(trim(t, 5))
386 >>> bprint(trim(t, 5))
382 \xe3\x81\x82\xe3\x81\x84
387 \xe3\x81\x82\xe3\x81\x84
383 >>> bprint(trim(t, 5, leftside=True))
388 >>> bprint(trim(t, 5, leftside=True))
384 \xe3\x81\x88\xe3\x81\x8a
389 \xe3\x81\x88\xe3\x81\x8a
385 >>> bprint(trim(t, 4, ellipsis=ellipsis))
390 >>> bprint(trim(t, 4, ellipsis=ellipsis))
386 +++
391 +++
387 >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
392 >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
388 +++
393 +++
389 >>> t = bin(b'112233445566778899aa') # invalid byte sequence
394 >>> t = bin(b'112233445566778899aa') # invalid byte sequence
390 >>> bprint(trim(t, 12, ellipsis=ellipsis))
395 >>> bprint(trim(t, 12, ellipsis=ellipsis))
391 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
396 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
392 >>> bprint(trim(t, 10, ellipsis=ellipsis))
397 >>> bprint(trim(t, 10, ellipsis=ellipsis))
393 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
398 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
394 >>> bprint(trim(t, 8, ellipsis=ellipsis))
399 >>> bprint(trim(t, 8, ellipsis=ellipsis))
395 \x11\x22\x33\x44\x55+++
400 \x11\x22\x33\x44\x55+++
396 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
401 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
397 +++\x66\x77\x88\x99\xaa
402 +++\x66\x77\x88\x99\xaa
398 >>> bprint(trim(t, 8))
403 >>> bprint(trim(t, 8))
399 \x11\x22\x33\x44\x55\x66\x77\x88
404 \x11\x22\x33\x44\x55\x66\x77\x88
400 >>> bprint(trim(t, 8, leftside=True))
405 >>> bprint(trim(t, 8, leftside=True))
401 \x33\x44\x55\x66\x77\x88\x99\xaa
406 \x33\x44\x55\x66\x77\x88\x99\xaa
402 >>> bprint(trim(t, 3, ellipsis=ellipsis))
407 >>> bprint(trim(t, 3, ellipsis=ellipsis))
403 +++
408 +++
404 >>> bprint(trim(t, 1, ellipsis=ellipsis))
409 >>> bprint(trim(t, 1, ellipsis=ellipsis))
405 +
410 +
406 """
411 """
407 try:
412 try:
408 u = s.decode(_sysstr(encoding))
413 u = s.decode(_sysstr(encoding))
409 except UnicodeDecodeError:
414 except UnicodeDecodeError:
410 if len(s) <= width: # trimming is not needed
415 if len(s) <= width: # trimming is not needed
411 return s
416 return s
412 width -= len(ellipsis)
417 width -= len(ellipsis)
413 if width <= 0: # no enough room even for ellipsis
418 if width <= 0: # no enough room even for ellipsis
414 return ellipsis[: width + len(ellipsis)]
419 return ellipsis[: width + len(ellipsis)]
415 if leftside:
420 if leftside:
416 return ellipsis + s[-width:]
421 return ellipsis + s[-width:]
417 return s[:width] + ellipsis
422 return s[:width] + ellipsis
418
423
419 if ucolwidth(u) <= width: # trimming is not needed
424 if ucolwidth(u) <= width: # trimming is not needed
420 return s
425 return s
421
426
422 width -= len(ellipsis)
427 width -= len(ellipsis)
423 if width <= 0: # no enough room even for ellipsis
428 if width <= 0: # no enough room even for ellipsis
424 return ellipsis[: width + len(ellipsis)]
429 return ellipsis[: width + len(ellipsis)]
425
430
426 if leftside:
431 if leftside:
427 uslice = lambda i: u[i:]
432 uslice = lambda i: u[i:]
428 concat = lambda s: ellipsis + s
433 concat = lambda s: ellipsis + s
429 else:
434 else:
430 uslice = lambda i: u[:-i]
435 uslice = lambda i: u[:-i]
431 concat = lambda s: s + ellipsis
436 concat = lambda s: s + ellipsis
432 for i in pycompat.xrange(1, len(u)):
437 for i in pycompat.xrange(1, len(u)):
433 usub = uslice(i)
438 usub = uslice(i)
434 if ucolwidth(usub) <= width:
439 if ucolwidth(usub) <= width:
435 return concat(usub.encode(_sysstr(encoding)))
440 return concat(usub.encode(_sysstr(encoding)))
436 return ellipsis # no enough room for multi-column characters
441 return ellipsis # no enough room for multi-column characters
437
442
438
443
439 def lower(s):
444 def lower(s):
440 # type: (bytes) -> bytes
445 # type: (bytes) -> bytes
441 """best-effort encoding-aware case-folding of local string s"""
446 """best-effort encoding-aware case-folding of local string s"""
442 try:
447 try:
443 return asciilower(s)
448 return asciilower(s)
444 except UnicodeDecodeError:
449 except UnicodeDecodeError:
445 pass
450 pass
446 try:
451 try:
447 if isinstance(s, localstr):
452 if isinstance(s, localstr):
448 u = s._utf8.decode("utf-8")
453 u = s._utf8.decode("utf-8")
449 else:
454 else:
450 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
455 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
451
456
452 lu = u.lower()
457 lu = u.lower()
453 if u == lu:
458 if u == lu:
454 return s # preserve localstring
459 return s # preserve localstring
455 return lu.encode(_sysstr(encoding))
460 return lu.encode(_sysstr(encoding))
456 except UnicodeError:
461 except UnicodeError:
457 return s.lower() # we don't know how to fold this except in ASCII
462 return s.lower() # we don't know how to fold this except in ASCII
458 except LookupError as k:
463 except LookupError as k:
459 raise error.Abort(k, hint=b"please check your locale settings")
464 raise error.Abort(k, hint=b"please check your locale settings")
460
465
461
466
462 def upper(s):
467 def upper(s):
463 # type: (bytes) -> bytes
468 # type: (bytes) -> bytes
464 """best-effort encoding-aware case-folding of local string s"""
469 """best-effort encoding-aware case-folding of local string s"""
465 try:
470 try:
466 return asciiupper(s)
471 return asciiupper(s)
467 except UnicodeDecodeError:
472 except UnicodeDecodeError:
468 return upperfallback(s)
473 return upperfallback(s)
469
474
470
475
471 def upperfallback(s):
476 def upperfallback(s):
472 # type: (Any) -> Any
477 # type: (Any) -> Any
473 try:
478 try:
474 if isinstance(s, localstr):
479 if isinstance(s, localstr):
475 u = s._utf8.decode("utf-8")
480 u = s._utf8.decode("utf-8")
476 else:
481 else:
477 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
482 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
478
483
479 uu = u.upper()
484 uu = u.upper()
480 if u == uu:
485 if u == uu:
481 return s # preserve localstring
486 return s # preserve localstring
482 return uu.encode(_sysstr(encoding))
487 return uu.encode(_sysstr(encoding))
483 except UnicodeError:
488 except UnicodeError:
484 return s.upper() # we don't know how to fold this except in ASCII
489 return s.upper() # we don't know how to fold this except in ASCII
485 except LookupError as k:
490 except LookupError as k:
486 raise error.Abort(k, hint=b"please check your locale settings")
491 raise error.Abort(k, hint=b"please check your locale settings")
487
492
488
493
489 class normcasespecs(object):
494 class normcasespecs(object):
490 """what a platform's normcase does to ASCII strings
495 """what a platform's normcase does to ASCII strings
491
496
492 This is specified per platform, and should be consistent with what normcase
497 This is specified per platform, and should be consistent with what normcase
493 on that platform actually does.
498 on that platform actually does.
494
499
495 lower: normcase lowercases ASCII strings
500 lower: normcase lowercases ASCII strings
496 upper: normcase uppercases ASCII strings
501 upper: normcase uppercases ASCII strings
497 other: the fallback function should always be called
502 other: the fallback function should always be called
498
503
499 This should be kept in sync with normcase_spec in util.h."""
504 This should be kept in sync with normcase_spec in util.h."""
500
505
501 lower = -1
506 lower = -1
502 upper = 1
507 upper = 1
503 other = 0
508 other = 0
504
509
505
510
def jsonescape(s, paranoid=False):
    # type: (Any, Any) -> Any
    """returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr/safelocalstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape(b'this is a test')
    'this is a test'
    >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
    'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
    >>> jsonescape(b'a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape(b'')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> s = b'escape characters: \\0 \\x0b \\x7f'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape(b'<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    """

    # Normalize the input to UTF-8b first so arbitrary bytes survive.
    u8chars = toutf8b(s)
    try:
        # fast-path escaper; signals unsupported input with ValueError
        escaped = _jsonescapeu8fast(u8chars, paranoid)
    except ValueError:
        # fall back to the pure-Python implementation
        escaped = charencodepure.jsonescapeu8fallback(u8chars, paranoid)
    return escaped
558
563
559
564
# We need to decode/encode U+DCxx codes transparently since invalid UTF-8
# bytes are mapped to that range.
_utf8strict = r'surrogatepass' if pycompat.ispy3 else r'strict'

# Total byte length of a UTF-8 sequence, indexed by the lead byte's high
# nibble: 0 for ASCII (handled on a fast path by getutf8char), 1 for
# continuation/invalid lead bytes, 2-4 for multi-byte sequence leads.
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
568
573
569
574
def getutf8char(s, pos):
    # type: (bytes, int) -> bytes
    """get the next full utf-8 character in the given string, starting at pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    """

    # The lead byte's high nibble tells us how many bytes to attempt
    # decoding.
    width = _utf8len[ord(s[pos : pos + 1]) >> 4]
    if width == 0:
        # single-byte (ASCII) character
        return s[pos : pos + 1]

    char = s[pos : pos + width]
    # an attempted decode validates the sequence (raises UnicodeError if bad)
    char.decode("utf-8", _utf8strict)
    return char
587
592
588
593
def toutf8b(s):
    # type: (bytes) -> bytes
    """convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can have
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    """

    if isinstance(s, localstr):
        # assume that the original UTF-8 sequence would never contain
        # invalid characters in U+DCxx range
        return s._utf8
    elif isinstance(s, safelocalstr):
        # already verified that s is non-lossy in legacy encoding, which
        # shouldn't contain characters in U+DCxx range
        return fromlocal(s)
    elif isasciistr(s):
        return s
    if b"\xed" not in s:
        # no surrogate bytes present; a clean decode means the string is
        # already valid UTF-8 and can pass through untouched
        try:
            s.decode('utf-8', _utf8strict)
            return s
        except UnicodeDecodeError:
            pass

    # slow path: walk the string character by character, escaping as we go
    s = pycompat.bytestr(s)
    chunks = []
    pos = 0
    end = len(s)
    while pos < end:
        try:
            c = getutf8char(s, pos)
            if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
                # have to re-escape existing U+DCxx characters
                c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
                pos += 1
            else:
                pos += len(c)
        except UnicodeDecodeError:
            # invalid byte: map it into the U+DCxx surrogate range
            c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
            pos += 1
        chunks.append(c)
    return b"".join(chunks)
656
661
657
662
def fromutf8b(s):
    # type: (bytes) -> bytes
    """Given a UTF-8b string, return a local, possibly-binary string.

    return the original binary string. This
    is a round-trip process for strings like filenames, but metadata
    that's was passed through tolocal will remain in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = b"\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip(b"\\xc2\\xc2\\x80")
    True
    >>> roundtrip(b"\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
    True
    """

    # fast paths: pure ASCII, or no uDxxx (\xed) prefixes anywhere in s
    if isasciistr(s) or b"\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    s = pycompat.bytestr(s)
    chunks = []
    pos = 0
    end = len(s)
    while pos < end:
        c = getutf8char(s, pos)
        pos += len(c)
        # unescape U+DCxx characters back to their original single byte
        if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
            c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF)
        chunks.append(c)
    return b"".join(chunks)
General Comments 0
You need to be logged in to leave comments. Login now