##// END OF EJS Templates
typing: restore `encoding.encoding` and `encoding.encodingmode` to bytes...
Matt Harbison -
r52566:f70f61a8 default
parent child Browse files
Show More
@@ -1,718 +1,719
1 # encoding.py - character transcoding support for Mercurial
1 # encoding.py - character transcoding support for Mercurial
2 #
2 #
3 # Copyright 2005-2009 Olivia Mackall <olivia@selenic.com> and others
3 # Copyright 2005-2009 Olivia Mackall <olivia@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8
8
9 import locale
9 import locale
10 import os
10 import os
11 import re
11 import re
12 import typing
12 import typing
13 import unicodedata
13 import unicodedata
14
14
15 from typing import (
15 from typing import (
16 Any,
16 Any,
17 Callable,
17 Callable,
18 Text,
18 Text,
19 TypeVar,
19 TypeVar,
20 )
20 )
21
21
22 from . import (
22 from . import (
23 error,
23 error,
24 policy,
24 policy,
25 pycompat,
25 pycompat,
26 )
26 )
27
27
28 from .pure import charencode as charencodepure
28 from .pure import charencode as charencodepure
29
29
# TypeVar bound to localstr so helpers returning the concrete subclass
# can be annotated precisely.
_Tlocalstr = TypeVar('_Tlocalstr', bound='localstr')

# Load the charencode helpers per the cext/pure module policy (C
# implementation when available, pure Python otherwise).
charencode = policy.importmod('charencode')

# Fast-path helpers implemented in charencode (C or pure Python).
isasciistr = charencode.isasciistr
asciilower = charencode.asciilower
asciiupper = charencode.asciiupper
_jsonescapeu8fast = charencode.jsonescapeu8fast

# Shorthand for converting bytes to the native str type.
_sysstr = pycompat.sysstr

unichr = chr

# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
_ignore = [
    unichr(int(x, 16)).encode("utf-8")
    for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e "
    b"206a 206b 206c 206d 206e 206f feff".split()
]
# verify the next function will work
assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)
53
53
54
54
def hfsignoreclean(s: bytes) -> bytes:
    """Remove codepoints ignored by HFS+ from s.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # Every ignorable codepoint encodes to a UTF-8 sequence starting with
    # 0xe2 or 0xef (asserted at module load), so anything without those
    # lead bytes can be returned untouched.
    if b"\xe2" not in s and b"\xef" not in s:
        return s
    for junk in _ignore:
        s = s.replace(junk, b'')
    return s
67
67
68
68
# encoding.environ is provided read-only, which may not be used to modify
# the process environment
_nativeenviron = os.supports_bytes_environ
if _nativeenviron:
    environ = os.environb  # re-exports
    if pycompat.sysplatform == b'OpenVMS':
        # workaround for a bug in VSI 3.10 port
        # os.environb is only populated with a few Predefined symbols
        def newget(self, key, default=None):
            # pytype on linux does not understand OpenVMS special modules
            import _decc  # pytype: disable=import-error

            v = _decc.getenv(key, None)
            if isinstance(key, bytes):
                # bytes key: encode the (str) value to match
                return default if v is None else v.encode('latin-1')
            else:
                return default if v is None else v

        # monkey-patch the lookup onto the environb mapping class
        environ.__class__.get = newget
else:
    # preferred encoding isn't known yet; use utf-8 to avoid unicode error
    # and recreate it once encoding is settled
    environ = {
        k.encode('utf-8'): v.encode('utf-8')
        for k, v in os.environ.items()  # re-exports
    }
95
95
# Map locale-reported encoding names to the canonical codec name Python
# understands.
_encodingrewrites = {
    b'646': b'ascii',
    b'ANSI_X3.4-1968': b'ascii',
}
# cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
# No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
# https://bugs.python.org/issue13216
if pycompat.iswindows:
    _encodingrewrites[b'cp65001'] = b'utf-8'

encoding: bytes = b''  # help pytype avoid seeing None value
try:
    # HGENCODING overrides the locale; otherwise fall back to the locale's
    # preferred encoding, defaulting to ascii when that is empty.
    encoding = environ.get(b"HGENCODING", b'')
    if not encoding:
        encoding = locale.getpreferredencoding().encode('ascii') or b'ascii'
    encoding = _encodingrewrites.get(encoding, encoding)
except locale.Error:
    encoding = b'ascii'
# codec error handler used when decoding local strings: 'strict',
# 'replace' or 'ignore' (see fromlocal()).
encodingmode: bytes = environ.get(b"HGENCODINGMODE", b"strict")
fallbackencoding = b'ISO-8859-1'
115
116
116
117
class localstr(bytes):
    """This class allows strings that are unmodified to be
    round-tripped to the local encoding and back"""

    def __new__(cls, u, l):
        # ``l`` is the local-encoding payload (the bytes value itself);
        # ``u`` caches the original UTF-8 form so fromlocal() can recover
        # it losslessly.
        s = bytes.__new__(cls, l)
        s._utf8 = u
        return s

    if typing.TYPE_CHECKING:
        # pseudo implementation to help pytype see localstr() constructor
        def __init__(self, u: bytes, l: bytes) -> None:
            super(localstr, self).__init__(l)
            self._utf8 = u

    def __hash__(self):
        return hash(self._utf8)  # avoid collisions in local string space
134
135
135
136
class safelocalstr(bytes):
    """Tagged string denoting it was previously an internal UTF-8 string,
    and can be converted back to UTF-8 losslessly

    >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
    >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
    >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
    >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
    """

    # Pure marker type: no behavior is added beyond bytes, so instances
    # compare and hash exactly like plain byte strings (see doctests).
145
146
146
147
def tolocal(s: bytes) -> bytes:
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    >>> u = b'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = b'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> b'foo: ?' in d
    False
    >>> l1 = b'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    # ASCII is valid in any supported encoding: no conversion needed.
    if isasciistr(s):
        return s

    try:
        try:
            # make sure string is actually stored in UTF-8
            u = s.decode('UTF-8')
            if encoding == b'UTF-8':
                # fast path
                return s
            r = u.encode(_sysstr(encoding), "replace")
            if u == r.decode(_sysstr(encoding)):
                # r is a safe, non-lossy encoding of s
                return safelocalstr(r)
            # lossy conversion: tag the result with the original UTF-8
            # bytes so fromlocal() can round-trip it.
            return localstr(s, r)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                u = s.decode(_sysstr(fallbackencoding))
                r = u.encode(_sysstr(encoding), "replace")
                if u == r.decode(_sysstr(encoding)):
                    # r is a safe, non-lossy encoding of s
                    return safelocalstr(r)
                return localstr(u.encode('UTF-8'), r)
            except UnicodeDecodeError:
                u = s.decode("utf-8", "replace")  # last ditch
                # can't round-trip
                return u.encode(_sysstr(encoding), "replace")
    except LookupError as k:
        # the configured encoding name is unknown to Python
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
213
214
214
215
def fromlocal(s: bytes) -> bytes:
    """
    Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.
    """
    # Lossless round-trip: a localstr carries its original UTF-8 form.
    if isinstance(s, localstr):
        return s._utf8
    # ASCII is a subset of UTF-8, so such strings pass through unchanged.
    if isasciistr(s):
        return s

    try:
        decoded = s.decode(_sysstr(encoding), _sysstr(encodingmode))
    except UnicodeDecodeError as inst:
        # show a small window of context around the offending bytes
        sub = s[max(0, inst.start - 10) : inst.start + 10]
        raise error.Abort(
            b"decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
        )
    except LookupError as k:
        # the configured encoding name is unknown to Python
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
    return decoded.encode("utf-8")
244
245
245
246
def unitolocal(u: str) -> bytes:
    """Convert a unicode string to a byte string of local encoding"""
    # Internal representation is UTF-8; tolocal() handles the rest.
    utf8_bytes = u.encode('utf-8')
    return tolocal(utf8_bytes)
249
250
250
251
def unifromlocal(s: bytes) -> str:
    """Convert a byte string of local encoding to a unicode string"""
    # fromlocal() yields UTF-8 bytes, which always decode cleanly.
    utf8_bytes = fromlocal(s)
    return utf8_bytes.decode('utf-8')
254
255
255
256
def unimethod(bytesfunc: Callable[[Any], bytes]) -> Callable[[Any], str]:
    """Create a proxy method that forwards __unicode__() and __str__() of
    Python 3 to __bytes__()"""

    def unifunc(obj):
        # render via the bytes function, then decode to the native str type
        return unifromlocal(bytesfunc(obj))

    return unifunc
264
265
265
266
# converter functions between native str and byte string. use these if the
# character encoding is not aware (e.g. exception message) or is known to
# be locale dependent (e.g. date formatting.)
# These are plain aliases: str<->bytes conversion is the same operation
# as unicode<->local-bytes conversion on Python 3.
strtolocal = unitolocal
strfromlocal = unifromlocal
strmethod = unimethod
272
273
273
274
def lower(s: bytes) -> bytes:
    """best-effort encoding-aware case-folding of local string s"""
    # fast C path for pure-ASCII input
    try:
        return asciilower(s)
    except UnicodeDecodeError:
        pass
    try:
        uni = (
            s._utf8.decode("utf-8")
            if isinstance(s, localstr)
            else s.decode(_sysstr(encoding), _sysstr(encodingmode))
        )

        folded = uni.lower()
        if folded == uni:
            # nothing changed: return the original object so a localstr
            # tag (if any) survives
            return s
        return folded.encode(_sysstr(encoding))
    except UnicodeError:
        return s.lower()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
296
297
297
298
def upper(s: bytes) -> bytes:
    """best-effort encoding-aware case-folding of local string s"""
    # fast C path for pure-ASCII input; anything else takes the slow
    # decode/upper/encode route
    try:
        return asciiupper(s)
    except UnicodeDecodeError:
        return upperfallback(s)
304
305
305
306
def upperfallback(s: Any) -> Any:
    """Slow-path upper-casing for strings containing non-ASCII bytes."""
    try:
        if isinstance(s, localstr):
            uni = s._utf8.decode("utf-8")
        else:
            uni = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        upped = uni.upper()
        if upped == uni:
            # unchanged: hand back the original object to preserve any
            # localstr tagging
            return s
        return upped.encode(_sysstr(encoding))
    except UnicodeError:
        return s.upper()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
323
324
324
325
if not _nativeenviron:
    # now encoding and helper functions are available, recreate the environ
    # dict to be exported to other modules
    if pycompat.iswindows:

        class WindowsEnviron(dict):
            """`os.environ` normalizes environment variables to uppercase on windows"""

            def get(self, key, default=None):
                # match os.environ semantics by normalizing the key the
                # same way Windows does before lookup
                return super().get(upper(key), default)

        environ = WindowsEnviron()

    # convert every key/value through tolocal() so the exported mapping
    # is in the local encoding
    for k, v in os.environ.items():  # re-exports
        environ[tolocal(k.encode('utf-8'))] = tolocal(v.encode('utf-8'))
340
341
341
342
# matches a lowercase drive-letter prefix such as b'c:'
DRIVE_RE = re.compile(b'^[a-z]:')

# os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
# returns bytes.
if pycompat.iswindows:
    # Python 3 on Windows issues a DeprecationWarning about using the bytes
    # API when os.getcwdb() is called.
    #
    # Additionally, py3.8+ uppercases the drive letter when calling
    # os.path.realpath(), which is used on ``repo.root``. Since those
    # strings are compared in various places as simple strings, also call
    # realpath here. See https://bugs.python.org/issue40368
    #
    # However this is not reliable, so lets explicitly make this drive
    # letter upper case.
    #
    # note: we should consider dropping realpath here since it seems to
    # change the semantic of `getcwd`.

    def getcwd():
        # return the cwd as local-encoding bytes with the drive letter
        # forced to upper case
        cwd = os.getcwd()  # re-exports
        cwd = os.path.realpath(cwd)
        cwd = strtolocal(cwd)
        if DRIVE_RE.match(cwd):
            cwd = cwd[0:1].upper() + cwd[1:]
        return cwd


else:
    getcwd = os.getcwdb  # re-exports
372
373
# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
# The result is the set of east_asian_width() categories counted as two
# columns: W(ide) and F(ullwidth), plus A(mbiguous) when requested.
_wide = _sysstr(
    environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
    and b"WFA"
    or b"WF"
)
379
380
380
381
def colwidth(s: bytes) -> int:
    """Find the column width of a string for display in the local encoding"""
    # decode leniently ('replace') so invalid byte sequences still get a
    # width, then measure the unicode result
    decoded = s.decode(_sysstr(encoding), 'replace')
    return ucolwidth(decoded)
384
385
385
386
def ucolwidth(d: Text) -> int:
    """Find the column width of a Unicode string for display"""
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        # no east-asian-width data available: assume one column per char
        return len(d)
    # characters whose width category is in _wide occupy two columns
    return sum(2 if eaw(ch) in _wide else 1 for ch in d)
392
393
393
394
def getcols(s: bytes, start: int, c: int) -> bytes:
    """Use colwidth to find a c-column substring of s starting at byte
    index start

    Raises ValueError when no prefix of ``s[start:]`` renders at exactly
    ``c`` display columns.
    """
    # Try successively longer byte substrings until one renders at the
    # requested column width.  The smallest candidate is c bytes long
    # (one column needs at least one byte).  The end bound is len(s) + 1
    # so that a substring running to the very end of ``s`` is also
    # considered; the original ``range(start + c, len(s))`` excluded it
    # and spuriously raised for e.g. getcols(b'ab', 0, 2).
    for end in range(start + c, len(s) + 1):
        candidate = s[start:end]
        if colwidth(candidate) == c:
            return candidate
    raise ValueError('substring not found')
402
403
403
404
def trim(
    s: bytes,
    width: int,
    ellipsis: bytes = b'',
    leftside: bool = False,
) -> bytes:
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    >>> from .node import bin
    >>> def bprint(s):
    ...     print(pycompat.sysstr(s))
    >>> ellipsis = b'+++'
    >>> from . import encoding
    >>> encoding.encoding = b'utf-8'
    >>> t = b'1234567890'
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    12345+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++67890
    >>> bprint(trim(t, 8))
    12345678
    >>> bprint(trim(t, 8, leftside=True))
    34567890
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(pycompat.sysstr(encoding.encoding))
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 5))
    \xe3\x81\x82\xe3\x81\x84
    >>> bprint(trim(t, 5, leftside=True))
    \xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 4, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
    +++
    >>> t = bin(b'112233445566778899aa') # invalid byte sequence
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8))
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> bprint(trim(t, 8, leftside=True))
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    """
    try:
        u = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        # undecodable bytes: fall back to trimming by byte count rather
        # than display columns
        if len(s) <= width:  # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0:  # no enough room even for ellipsis
            return ellipsis[: width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width:  # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0:  # no enough room even for ellipsis
        return ellipsis[: width + len(ellipsis)]

    # accumulate characters (from the untrimmed side) until the column
    # budget is exceeded
    chars = list(u)
    if leftside:
        chars.reverse()
    width_so_far = 0
    for i, c in enumerate(chars):
        width_so_far += ucolwidth(c)
        if width_so_far > width:
            break
    chars = chars[:i]
    if leftside:
        chars.reverse()
    u = u''.join(chars).encode(_sysstr(encoding))
    if leftside:
        return ellipsis + u
    return u + ellipsis
508
509
509
510
class normcasespecs:
    """Describe how a platform's normcase treats ASCII strings.

    One value is specified per platform, and it should be consistent with
    what normcase on that platform actually does:

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h."""

    lower = -1
    upper = 1
    other = 0
525
526
526
527
def jsonescape(s: bytes, paranoid: bool = False) -> bytes:
    """returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr/safelocalstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape(b'this is a test')
    'this is a test'
    >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
    'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
    >>> jsonescape(b'a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape(b'')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> s = b'escape characters: \\0 \\x0b \\x7f'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape(b'<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    """

    # Normalize to UTF-8b first so arbitrary bytes survive the escaping.
    u8chars = toutf8b(s)
    try:
        # Fast path: accelerated implementation; it signals with ValueError
        # when it cannot handle the input, in which case we fall back.
        return _jsonescapeu8fast(u8chars, paranoid)
    except ValueError:
        # Slow path: pure-Python fallback that handles everything.
        return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
578
579
579
580
# We need to decode/encode U+DCxx codes transparently since invalid UTF-8
# bytes are mapped to that range.
_utf8strict = r'surrogatepass'

# Length in bytes of a UTF-8 sequence, indexed by the high nibble of its
# first byte: 0x0-0x7 => ASCII (reported as 0), 0x8-0xB => lone continuation
# byte (length 1; rejected by the decode check below), 0xC-0xD => 2 bytes,
# 0xE => 3 bytes, 0xF => 4 bytes.
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]


def getutf8char(s: bytes, pos: int) -> bytes:
    """get the next full utf-8 character in the given string, starting at pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    """
    # Determine how many bytes to take from the first byte's high nibble.
    seqlen = _utf8len[ord(s[pos : pos + 1]) >> 4]
    if not seqlen:
        # Plain ASCII byte; always valid on its own.
        return s[pos : pos + 1]

    char = s[pos : pos + seqlen]
    # Validate by attempting a decode; raises UnicodeDecodeError on
    # malformed or truncated sequences.
    char.decode("utf-8", _utf8strict)
    return char
603
604
604
605
def toutf8b(s: bytes) -> bytes:
    """convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can have
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    """

    if isinstance(s, localstr):
        # assume that the original UTF-8 sequence would never contain
        # invalid characters in U+DCxx range
        return s._utf8
    elif isinstance(s, safelocalstr):
        # already verified that s is non-lossy in legacy encoding, which
        # shouldn't contain characters in U+DCxx range
        return fromlocal(s)
    elif isasciistr(s):
        # pure ASCII is valid UTF-8 as-is
        return s
    # 0xed is the UTF-8 lead byte for U+D000..U+DFFF, so without it the
    # string cannot already contain U+DCxx escapes; if it also decodes
    # cleanly as UTF-8 it can be passed through untouched.
    if b"\xed" not in s:
        try:
            s.decode('utf-8', _utf8strict)
            return s
        except UnicodeDecodeError:
            pass

    # Slow path: walk the string character by character, escaping anything
    # that is not valid UTF-8 into the U+DCxx surrogate range.
    s = pycompat.bytestr(s)
    r = bytearray()
    pos = 0
    l = len(s)
    while pos < l:
        try:
            c = getutf8char(s, pos)
            if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
                # have to re-escape existing U+DCxx characters
                c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
                pos += 1
            else:
                pos += len(c)
        except UnicodeDecodeError:
            # invalid byte: map it into the U+DCxx surrogate range
            c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
            pos += 1
        r += c
    return bytes(r)
671
672
672
673
def fromutf8b(s: bytes) -> bytes:
    """Given a UTF-8b string, return a local, possibly-binary string.

    Reverses toutf8b: bytes that were escaped into the U+DCxx surrogate
    range are mapped back to their original raw values, restoring the
    original binary string. This is a round-trip for strings such as
    filenames; metadata that went through tolocal stays UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = b"\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip(b"\\xc2\\xc2\\x80")
    True
    >>> roundtrip(b"\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
    True
    """

    # Fast paths: pure ASCII needs no work, and 0xed is the only UTF-8
    # lead byte that can introduce a U+DCxx escape, so its absence means
    # the string is already in its original form.
    if isasciistr(s) or b"\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    s = pycompat.bytestr(s)
    out = bytearray()
    i = 0
    end = len(s)
    while i < end:
        char = getutf8char(s, i)
        i += len(char)
        if b"\xed\xb0\x80" <= char <= b"\xed\xb3\xbf":
            # unescape a U+DCxx character back to its original raw byte
            char = pycompat.bytechr(ord(char.decode("utf-8", _utf8strict)) & 0xFF)
        out += char
    return bytes(out)
General Comments 0
You need to be logged in to leave comments. Login now