##// END OF EJS Templates
encoding: avoid quadratic time complexity when json-encoding non-UTF8 strings...
Arseniy Alekseyev -
r51214:95acba2c default
parent child Browse files
Show More
@@ -1,725 +1,725 b''
1 # encoding.py - character transcoding support for Mercurial
1 # encoding.py - character transcoding support for Mercurial
2 #
2 #
3 # Copyright 2005-2009 Olivia Mackall <olivia@selenic.com> and others
3 # Copyright 2005-2009 Olivia Mackall <olivia@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8
8
9 import locale
9 import locale
10 import os
10 import os
11 import re
11 import re
12 import unicodedata
12 import unicodedata
13
13
14 from .pycompat import getattr
14 from .pycompat import getattr
15 from . import (
15 from . import (
16 error,
16 error,
17 policy,
17 policy,
18 pycompat,
18 pycompat,
19 )
19 )
20
20
21 from .pure import charencode as charencodepure
21 from .pure import charencode as charencodepure
22
22
23 if pycompat.TYPE_CHECKING:
23 if pycompat.TYPE_CHECKING:
24 from typing import (
24 from typing import (
25 Any,
25 Any,
26 Callable,
26 Callable,
27 List,
27 List,
28 Text,
28 Text,
29 Type,
29 Type,
30 TypeVar,
30 TypeVar,
31 Union,
31 Union,
32 )
32 )
33
33
34 # keep pyflakes happy
34 # keep pyflakes happy
35 for t in (Any, Callable, List, Text, Type, Union):
35 for t in (Any, Callable, List, Text, Type, Union):
36 assert t
36 assert t
37
37
38 _Tlocalstr = TypeVar('_Tlocalstr', bound='localstr')
38 _Tlocalstr = TypeVar('_Tlocalstr', bound='localstr')
39
39
# Load the C implementation of charencode when available, falling back to
# the pure-Python version via the policy importer.
charencode = policy.importmod('charencode')

# Re-export the fast ASCII helpers at module level.
isasciistr = charencode.isasciistr
asciilower = charencode.asciilower
asciiupper = charencode.asciiupper
_jsonescapeu8fast = charencode.jsonescapeu8fast

_sysstr = pycompat.sysstr

# Python 3 dropped unichr(); keep the familiar name as an alias for chr().
unichr = chr
50
50
# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
_ignore = [
    unichr(int(x, 16)).encode("utf-8")
    for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e "
    b"206a 206b 206c 206d 206e 206f feff".split()
]
# verify the next function will work: every ignored codepoint's UTF-8
# encoding must begin with 0xe2 or 0xef, which hfsignoreclean() uses as
# a cheap pre-filter before running the replace loop
assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)
61
61
62
62
def hfsignoreclean(s):
    # type: (bytes) -> bytes
    """Strip out the codepoints that HFS+ silently ignores.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # Every entry in _ignore encodes to bytes starting with 0xe2 or 0xef
    # (asserted at module load), so a quick scan for those lead bytes lets
    # us skip the replace loop for the common case.
    if b"\xe2" not in s and b"\xef" not in s:
        return s
    for junk in _ignore:
        s = s.replace(junk, b'')
    return s
76
76
77
77
# encoding.environ is provided read-only, which may not be used to modify
# the process environment
_nativeenviron = os.supports_bytes_environ
if _nativeenviron:
    environ = os.environb  # re-exports
else:
    # preferred encoding isn't known yet; use utf-8 to avoid unicode error
    # and recreate it once encoding is settled
    environ = {
        k.encode('utf-8'): v.encode('utf-8')
        for k, v in os.environ.items()  # re-exports
    }

# map misleading locale-reported names to their canonical codec names
_encodingrewrites = {
    b'646': b'ascii',
    b'ANSI_X3.4-1968': b'ascii',
}
# cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
# No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
# https://bugs.python.org/issue13216
if pycompat.iswindows:
    _encodingrewrites[b'cp65001'] = b'utf-8'

try:
    # HGENCODING overrides the locale-derived encoding when set
    encoding = environ.get(b"HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding().encode('ascii') or b'ascii'
    encoding = _encodingrewrites.get(encoding, encoding)
except locale.Error:
    encoding = b'ascii'
encodingmode = environ.get(b"HGENCODINGMODE", b"strict")
fallbackencoding = b'ISO-8859-1'
110
110
111
111
class localstr(bytes):
    """This class allows strings that are unmodified to be
    round-tripped to the local encoding and back"""

    def __new__(cls, u, l):
        # u: the original UTF-8 bytes; l: the (possibly lossy) local form.
        # The instance *is* the local form, with the UTF-8 form cached on it.
        s = bytes.__new__(cls, l)
        s._utf8 = u
        return s

    if pycompat.TYPE_CHECKING:
        # pseudo implementation to help pytype see localstr() constructor
        def __init__(self, u, l):
            # type: (bytes, bytes) -> None
            super(localstr, self).__init__(l)
            self._utf8 = u

    def __hash__(self):
        return hash(self._utf8)  # avoid collisions in local string space
130
130
131
131
class safelocalstr(bytes):
    """A bytes subclass marking a string that started life as internal
    UTF-8 and converts back to UTF-8 without loss.

    Unlike localstr, no extra state is carried, so instances hash and
    compare exactly like ordinary bytes:

    >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
    >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
    >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
    >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
    """
141
141
142
142
def tolocal(s):
    # type: (bytes) -> bytes
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    >>> u = b'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = b'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> b'foo: ?' in d
    False
    >>> l1 = b'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    # pure-ASCII bytes are valid in any encoding; return unchanged
    if isasciistr(s):
        return s

    try:
        try:
            # make sure string is actually stored in UTF-8
            u = s.decode('UTF-8')
            if encoding == b'UTF-8':
                # fast path
                return s
            r = u.encode(_sysstr(encoding), "replace")
            if u == r.decode(_sysstr(encoding)):
                # r is a safe, non-lossy encoding of s
                return safelocalstr(r)
            # lossy conversion: keep the original UTF-8 cached alongside
            return localstr(s, r)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                u = s.decode(_sysstr(fallbackencoding))
                r = u.encode(_sysstr(encoding), "replace")
                if u == r.decode(_sysstr(encoding)):
                    # r is a safe, non-lossy encoding of s
                    return safelocalstr(r)
                return localstr(u.encode('UTF-8'), r)
            except UnicodeDecodeError:
                u = s.decode("utf-8", "replace")  # last ditch
                # can't round-trip
                return u.encode(_sysstr(encoding), "replace")
    except LookupError as k:
        # the configured encoding name is unknown to the codecs registry
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
210
210
211
211
def fromlocal(s):
    # type: (bytes) -> bytes
    """
    Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.
    """

    # localstr carries its original UTF-8 form: lossless round-trip
    if isinstance(s, localstr):
        return s._utf8
    # ASCII is a subset of UTF-8, so nothing to convert
    if isasciistr(s):
        return s

    try:
        decoded = s.decode(_sysstr(encoding), _sysstr(encodingmode))
    except UnicodeDecodeError as inst:
        # show a short window of bytes around the offending position
        sub = s[max(0, inst.start - 10) : inst.start + 10]
        raise error.Abort(
            b"decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
        )
    except LookupError as k:
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
    return decoded.encode("utf-8")
242
242
243
243
def unitolocal(u):
    # type: (Text) -> bytes
    """Convert a unicode string to a byte string of local encoding"""
    utf8bytes = u.encode('utf-8')
    return tolocal(utf8bytes)
248
248
249
249
def unifromlocal(s):
    # type: (bytes) -> Text
    """Convert a byte string of local encoding to a unicode string"""
    utf8bytes = fromlocal(s)
    return utf8bytes.decode('utf-8')
254
254
255
255
def unimethod(bytesfunc):
    # type: (Callable[[Any], bytes]) -> Callable[[Any], Text]
    """Create a proxy method that forwards __unicode__() and __str__() of
    Python 3 to __bytes__()"""

    def unifunc(obj):
        raw = bytesfunc(obj)
        return unifromlocal(raw)

    return unifunc
265
265
266
266
# converter functions between native str and byte string. use these if the
# character encoding is not aware (e.g. exception message) or is known to
# be locale dependent (e.g. date formatting.)
# On Python 3 the native str type is unicode, so these are simply the
# unicode converters under str-flavored names.
strtolocal = unitolocal
strfromlocal = unifromlocal
strmethod = unimethod
273
273
274
274
def lower(s):
    # type: (bytes) -> bytes
    """best-effort encoding-aware case-folding of local string s"""
    try:
        # fast path for pure-ASCII input
        return asciilower(s)
    except UnicodeDecodeError:
        pass
    try:
        if isinstance(s, localstr):
            # use the cached UTF-8 form to avoid a lossy re-decode
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        lu = u.lower()
        if u == lu:
            return s  # preserve localstring
        return lu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.lower()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        # unknown encoding name: abort with a locale hint
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
298
298
299
299
def upper(s):
    # type: (bytes) -> bytes
    """best-effort encoding-aware case-folding of local string s"""
    try:
        # fast path for pure-ASCII input
        return asciiupper(s)
    except UnicodeDecodeError:
        # non-ASCII bytes present; take the slower unicode-aware path
        return upperfallback(s)
307
307
308
308
def upperfallback(s):
    # type: (Any) -> Any
    # slow path for non-ASCII input; structured as the mirror of lower()
    try:
        if isinstance(s, localstr):
            # use the cached UTF-8 form to avoid a lossy re-decode
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        uu = u.upper()
        if u == uu:
            return s  # preserve localstring
        return uu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.upper()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        # unknown encoding name: abort with a locale hint
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
327
327
328
328
if not _nativeenviron:
    # now encoding and helper functions are available, recreate the environ
    # dict to be exported to other modules
    if pycompat.iswindows:

        class WindowsEnviron(dict):
            """`os.environ` normalizes environment variables to uppercase on windows"""

            def get(self, key, default=None):
                # normalize lookups the same way, so mixed-case keys work
                return super().get(upper(key), default)

        environ = WindowsEnviron()

    for k, v in os.environ.items():  # re-exports
        environ[tolocal(k.encode('utf-8'))] = tolocal(v.encode('utf-8'))
344
344
345
345
# matches a lowercase Windows drive-letter prefix like b'c:'
DRIVE_RE = re.compile(b'^[a-z]:')

# os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
# returns bytes.
if pycompat.iswindows:
    # Python 3 on Windows issues a DeprecationWarning about using the bytes
    # API when os.getcwdb() is called.
    #
    # Additionally, py3.8+ uppercases the drive letter when calling
    # os.path.realpath(), which is used on ``repo.root``. Since those
    # strings are compared in various places as simple strings, also call
    # realpath here. See https://bugs.python.org/issue40368
    #
    # However this is not reliable, so lets explicitly make this drive
    # letter upper case.
    #
    # note: we should consider dropping realpath here since it seems to
    # change the semantic of `getcwd`.

    def getcwd():
        cwd = os.getcwd()  # re-exports
        cwd = os.path.realpath(cwd)
        cwd = strtolocal(cwd)
        if DRIVE_RE.match(cwd):
            # force the drive letter to upper case for consistent comparisons
            cwd = cwd[0:1].upper() + cwd[1:]
        return cwd


else:
    getcwd = os.getcwdb  # re-exports

# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
# "WF" / "WFA" are the unicodedata.east_asian_width() categories counted as
# two columns by ucolwidth(): Wide, Fullwidth, and optionally Ambiguous.
_wide = _sysstr(
    environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
    and b"WFA"
    or b"WF"
)
383
383
384
384
def colwidth(s):
    # type: (bytes) -> int
    """Find the column width of a string for display in the local encoding"""
    decoded = s.decode(_sysstr(encoding), 'replace')
    return ucolwidth(decoded)
389
389
390
390
def ucolwidth(d):
    # type: (Text) -> int
    """Find the column width of a Unicode string for display

    Characters whose east-asian-width category is in _wide occupy two
    terminal columns; everything else counts as one.
    """
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is not None:
        # generator expression avoids materializing an intermediate list,
        # and the conditional expression replaces the dated `and/or` idiom
        return sum(2 if eaw(c) in _wide else 1 for c in d)
    # no east_asian_width available: assume one column per character
    return len(d)
398
398
399
399
def getcols(s, start, c):
    # type: (bytes, int, int) -> bytes
    """Use colwidth to find a c-column substring of s starting at byte
    index start"""
    # grow the candidate slice one byte at a time until it renders as
    # exactly c display columns
    end = start + c
    limit = len(s)
    while end < limit:
        candidate = s[start:end]
        if colwidth(candidate) == c:
            return candidate
        end += 1
    raise ValueError('substring not found')
409
409
410
410
def trim(s, width, ellipsis=b'', leftside=False):
    # type: (bytes, int, bytes, bool) -> bytes
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    >>> from .node import bin
    >>> def bprint(s):
    ...     print(pycompat.sysstr(s))
    >>> ellipsis = b'+++'
    >>> from . import encoding
    >>> encoding.encoding = b'utf-8'
    >>> t = b'1234567890'
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    12345+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++67890
    >>> bprint(trim(t, 8))
    12345678
    >>> bprint(trim(t, 8, leftside=True))
    34567890
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(pycompat.sysstr(encoding.encoding))
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 5))
    \xe3\x81\x82\xe3\x81\x84
    >>> bprint(trim(t, 5, leftside=True))
    \xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 4, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
    +++
    >>> t = bin(b'112233445566778899aa') # invalid byte sequence
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8))
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> bprint(trim(t, 8, leftside=True))
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    """
    try:
        u = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        # undecodable bytes: fall back to trimming by byte count instead
        # of display columns
        if len(s) <= width:  # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0:  # no enough room even for ellipsis
            return ellipsis[: width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width:  # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0:  # no enough room even for ellipsis
        return ellipsis[: width + len(ellipsis)]

    # trim from the right by default; reversing first (and re-reversing
    # after) makes the same loop trim from the left
    chars = list(u)
    if leftside:
        chars.reverse()
    width_so_far = 0
    # accumulate display columns until the budget is exceeded; 'i' then
    # indexes the first character that no longer fits
    for i, c in enumerate(chars):
        width_so_far += ucolwidth(c)
        if width_so_far > width:
            break
    chars = chars[:i]
    if leftside:
        chars.reverse()
    u = u''.join(chars).encode(_sysstr(encoding))
    if leftside:
        return ellipsis + u
    return u + ellipsis
511
511
512
512
class normcasespecs:
    """Describes what a platform's normcase does to ASCII strings.

    Each platform declares one of these values, and it must agree with
    what that platform's normcase implementation actually does:

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h."""

    lower = -1
    upper = 1
    other = 0
528
528
529
529
def jsonescape(s, paranoid=False):
    # type: (Any, Any) -> Any
    """returns a string suitable for JSON

    JSON cannot represent arbitrary non-Unicode bytes, so we proceed as
    follows:

    - localstr/safelocalstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape(b'this is a test')
    'this is a test'
    >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
    'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
    >>> jsonescape(b'a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape(b'')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> s = b'escape characters: \\0 \\x0b \\x7f'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape(b'<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    """

    u8chars = toutf8b(s)
    try:
        # fast path: the C helper raises ValueError on input it can't handle
        return _jsonescapeu8fast(u8chars, paranoid)
    except ValueError:
        # slow but fully general pure-Python fallback
        return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
582
582
583
583
584 # We need to decode/encode U+DCxx codes transparently since invalid UTF-8
584 # We need to decode/encode U+DCxx codes transparently since invalid UTF-8
585 # bytes are mapped to that range.
585 # bytes are mapped to that range.
586 _utf8strict = r'surrogatepass'
586 _utf8strict = r'surrogatepass'
587
587
588 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
588 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
589
589
590
590
591 def getutf8char(s, pos):
591 def getutf8char(s, pos):
592 # type: (bytes, int) -> bytes
592 # type: (bytes, int) -> bytes
593 """get the next full utf-8 character in the given string, starting at pos
593 """get the next full utf-8 character in the given string, starting at pos
594
594
595 Raises a UnicodeError if the given location does not start a valid
595 Raises a UnicodeError if the given location does not start a valid
596 utf-8 character.
596 utf-8 character.
597 """
597 """
598
598
599 # find how many bytes to attempt decoding from first nibble
599 # find how many bytes to attempt decoding from first nibble
600 l = _utf8len[ord(s[pos : pos + 1]) >> 4]
600 l = _utf8len[ord(s[pos : pos + 1]) >> 4]
601 if not l: # ascii
601 if not l: # ascii
602 return s[pos : pos + 1]
602 return s[pos : pos + 1]
603
603
604 c = s[pos : pos + l]
604 c = s[pos : pos + l]
605 # validate with attempted decode
605 # validate with attempted decode
606 c.decode("utf-8", _utf8strict)
606 c.decode("utf-8", _utf8strict)
607 return c
607 return c
608
608
609
609
def toutf8b(s):
    # type: (bytes) -> bytes
    """convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can have
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    """

    if isinstance(s, localstr):
        # assume that the original UTF-8 sequence would never contain
        # invalid characters in U+DCxx range
        return s._utf8
    if isinstance(s, safelocalstr):
        # already verified that s is non-lossy in legacy encoding, which
        # shouldn't contain characters in U+DCxx range
        return fromlocal(s)
    if isasciistr(s):
        return s
    if b"\xed" not in s:
        # no surrogate lead byte: if the whole string decodes, it is
        # already valid UTF-8 and can pass through untouched
        try:
            s.decode('utf-8', _utf8strict)
            return s
        except UnicodeDecodeError:
            pass

    def _escapebyte(b):
        # map a single raw byte into the U+DC00-U+DCFF surrogate range
        return unichr(0xDC00 + ord(b)).encode('utf-8', _utf8strict)

    s = pycompat.bytestr(s)
    buf = bytearray()  # bytearray keeps appending amortized O(1)
    i, end = 0, len(s)
    while i < end:
        try:
            ch = getutf8char(s, i)
        except UnicodeDecodeError:
            ch = None
        if ch is None or b"\xed\xb0\x80" <= ch <= b"\xed\xb3\xbf":
            # either an invalid byte, or a pre-existing U+DCxx character
            # that has to be re-escaped to keep the mapping reversible;
            # in both cases escape exactly one raw byte
            ch = _escapebyte(s[i])
            i += 1
        else:
            i += len(ch)
        buf += ch
    return bytes(buf)
677
677
678
678
def fromutf8b(s):
    # type: (bytes) -> bytes
    """Given a UTF-8b string, return a local, possibly-binary string.

    return the original binary string. This
    is a round-trip process for strings like filenames, but metadata
    that's was passed through tolocal will remain in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = b"\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip(b"\\xc2\\xc2\\x80")
    True
    >>> roundtrip(b"\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
    True
    """

    if isasciistr(s):
        return s
    # fast path - without an \xed lead byte there can be no uDCxx escapes
    if b"\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    s = pycompat.bytestr(s)
    out = bytearray()  # accumulate in a bytearray to stay linear-time
    i, end = 0, len(s)
    while i < end:
        ch = getutf8char(s, i)
        i += len(ch)
        if b"\xed\xb0\x80" <= ch <= b"\xed\xb3\xbf":
            # a U+DCxx escape: recover the original raw byte it encodes
            ch = pycompat.bytechr(ord(ch.decode("utf-8", _utf8strict)) & 0xFF)
        out += ch
    return bytes(out)
General Comments 0
You need to be logged in to leave comments. Login now