##// END OF EJS Templates
windows: replicate the normalizing behavior of os.environ...
Raphaël Gomès -
r48360:af633293 default
parent child Browse files
Show More
@@ -1,710 +1,719 b''
1 # encoding.py - character transcoding support for Mercurial
1 # encoding.py - character transcoding support for Mercurial
2 #
2 #
3 # Copyright 2005-2009 Olivia Mackall <olivia@selenic.com> and others
3 # Copyright 2005-2009 Olivia Mackall <olivia@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8 from __future__ import absolute_import, print_function
8 from __future__ import absolute_import, print_function
9
9
10 import locale
10 import locale
11 import os
11 import os
12 import unicodedata
12 import unicodedata
13
13
14 from .pycompat import getattr
14 from .pycompat import getattr
15 from . import (
15 from . import (
16 error,
16 error,
17 policy,
17 policy,
18 pycompat,
18 pycompat,
19 )
19 )
20
20
21 from .pure import charencode as charencodepure
21 from .pure import charencode as charencodepure
22
22
23 if pycompat.TYPE_CHECKING:
23 if pycompat.TYPE_CHECKING:
24 from typing import (
24 from typing import (
25 Any,
25 Any,
26 Callable,
26 Callable,
27 List,
27 List,
28 Text,
28 Text,
29 Type,
29 Type,
30 TypeVar,
30 TypeVar,
31 Union,
31 Union,
32 )
32 )
33
33
34 # keep pyflakes happy
34 # keep pyflakes happy
35 for t in (Any, Callable, List, Text, Type, Union):
35 for t in (Any, Callable, List, Text, Type, Union):
36 assert t
36 assert t
37
37
38 _Tlocalstr = TypeVar('_Tlocalstr', bound='localstr')
38 _Tlocalstr = TypeVar('_Tlocalstr', bound='localstr')
39
39
40 charencode = policy.importmod('charencode')
40 charencode = policy.importmod('charencode')
41
41
42 isasciistr = charencode.isasciistr
42 isasciistr = charencode.isasciistr
43 asciilower = charencode.asciilower
43 asciilower = charencode.asciilower
44 asciiupper = charencode.asciiupper
44 asciiupper = charencode.asciiupper
45 _jsonescapeu8fast = charencode.jsonescapeu8fast
45 _jsonescapeu8fast = charencode.jsonescapeu8fast
46
46
47 _sysstr = pycompat.sysstr
47 _sysstr = pycompat.sysstr
48
48
if pycompat.ispy3:
    unichr = chr

# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
_hfs_ignored_codepoints = (
    b"200c 200d 200e 200f 202a 202b 202c 202d 202e "
    b"206a 206b 206c 206d 206e 206f feff"
)
_ignore = [
    unichr(int(cp, 16)).encode("utf-8")
    for cp in _hfs_ignored_codepoints.split()
]
# verify the next function will work
assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)
62
62
63
63
def hfsignoreclean(s):
    # type: (bytes) -> bytes
    """Remove codepoints ignored by HFS+ from s.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # Every ignored codepoint encodes in UTF-8 starting with 0xe2 or 0xef
    # (asserted above next to _ignore), so a cheap containment check lets
    # clean strings skip the replace loop entirely.
    if b"\xe2" not in s and b"\xef" not in s:
        return s
    for junk in _ignore:
        s = s.replace(junk, b'')
    return s
77
77
78
78
# encoding.environ is provided read-only, which may not be used to modify
# the process environment
_nativeenviron = not pycompat.ispy3 or os.supports_bytes_environ
if not pycompat.ispy3:
    # Python 2: os.environ is already byte strings
    environ = os.environ  # re-exports
elif _nativeenviron:
    # Python 3 with bytes environ support (POSIX)
    environ = os.environb  # re-exports
else:
    # preferred encoding isn't known yet; use utf-8 to avoid unicode error
    # and recreate it once encoding is settled
    environ = {
        k.encode('utf-8'): v.encode('utf-8')
        for k, v in os.environ.items()  # re-exports
    }
93
93
# rewrite legacy/nonstandard codec names reported by the platform to names
# Python's codec machinery understands
_encodingrewrites = {
    b'646': b'ascii',
    b'ANSI_X3.4-1968': b'ascii',
}
# cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
# No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
# https://bugs.python.org/issue13216
if pycompat.iswindows and not pycompat.ispy3:
    _encodingrewrites[b'cp65001'] = b'utf-8'
103
103
try:
    # HGENCODING overrides the locale-derived encoding
    encoding = environ.get(b"HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding().encode('ascii') or b'ascii'
    # normalize nonstandard codec names (see _encodingrewrites above)
    encoding = _encodingrewrites.get(encoding, encoding)
except locale.Error:
    encoding = b'ascii'
# how undecodable bytes are handled: 'strict', 'replace' or 'ignore'
encodingmode = environ.get(b"HGENCODINGMODE", b"strict")
# encoding tried for old changesets whose metadata isn't valid UTF-8
fallbackencoding = b'ISO-8859-1'
113
113
114
114
class localstr(bytes):
    """This class allows strings that are unmodified to be
    round-tripped to the local encoding and back"""

    def __new__(cls, u, l):
        # type: (bytes, bytes) -> localstr
        # the bytes payload is the local-encoding form ``l``; the original
        # UTF-8 form ``u`` is stashed on the instance so fromlocal() can
        # recover it losslessly
        s = bytes.__new__(cls, l)
        s._utf8 = u
        return s

    if pycompat.TYPE_CHECKING:
        # pseudo implementation to help pytype see localstr() constructor
        def __init__(self, u, l):
            # type: (bytes, bytes) -> None
            super(localstr, self).__init__(l)
            self._utf8 = u

    def __hash__(self):
        return hash(self._utf8)  # avoid collisions in local string space
133
133
134
134
class safelocalstr(bytes):
    """Tagged string denoting it was previously an internal UTF-8 string,
    and can be converted back to UTF-8 losslessly

    Unlike localstr, this type hashes and compares exactly like a plain
    bytes object, so it mixes freely with untagged strings:

    >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
    >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
    >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
    >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
    """
144
144
145
145
def tolocal(s):
    # type: (bytes) -> bytes
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    >>> u = b'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = b'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> b'foo: ?' in d
    False
    >>> l1 = b'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    if isasciistr(s):
        # ASCII is valid in every supported encoding; nothing to do
        return s

    try:
        try:
            # make sure string is actually stored in UTF-8
            u = s.decode('UTF-8')
            if encoding == b'UTF-8':
                # fast path
                return s
            r = u.encode(_sysstr(encoding), "replace")
            if u == r.decode(_sysstr(encoding)):
                # r is a safe, non-lossy encoding of s
                return safelocalstr(r)
            # lossy conversion: tag the result with the original UTF-8
            # bytes so fromlocal() can round-trip it
            return localstr(s, r)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                u = s.decode(_sysstr(fallbackencoding))
                r = u.encode(_sysstr(encoding), "replace")
                if u == r.decode(_sysstr(encoding)):
                    # r is a safe, non-lossy encoding of s
                    return safelocalstr(r)
                return localstr(u.encode('UTF-8'), r)
            except UnicodeDecodeError:
                u = s.decode("utf-8", "replace")  # last ditch
                # can't round-trip
                return u.encode(_sysstr(encoding), "replace")
    except LookupError as k:
        # unknown codec name in `encoding`/`fallbackencoding`
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
213
213
214
214
def fromlocal(s):
    # type: (bytes) -> bytes
    """
    Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.
    """

    # can we do a lossless round-trip?
    if isinstance(s, localstr):
        # tolocal() cached the original UTF-8 bytes on the instance
        return s._utf8
    if isasciistr(s):
        return s

    try:
        u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
        return u.encode("utf-8")
    except UnicodeDecodeError as inst:
        # include a little context around the offending byte in the message
        sub = s[max(0, inst.start - 10) : inst.start + 10]
        raise error.Abort(
            b"decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
        )
    except LookupError as k:
        # unknown codec name in `encoding`
        raise error.Abort(k, hint=b"please check your locale settings")
243
243
244
244
def unitolocal(u):
    # type: (Text) -> bytes
    """Convert a unicode string to a byte string of local encoding"""
    utf8bytes = u.encode('utf-8')
    return tolocal(utf8bytes)
249
249
250
250
def unifromlocal(s):
    # type: (bytes) -> Text
    """Convert a byte string of local encoding to a unicode string"""
    utf8bytes = fromlocal(s)
    return utf8bytes.decode('utf-8')
255
255
256
256
def unimethod(bytesfunc):
    # type: (Callable[[Any], bytes]) -> Callable[[Any], Text]
    """Create a proxy method that forwards __unicode__() and __str__() of
    Python 3 to __bytes__()"""

    def unifunc(obj):
        # delegate to the bytes producer, then convert at the boundary
        result = bytesfunc(obj)
        return unifromlocal(result)

    return unifunc
266
266
267
267
# converter functions between native str and byte string. use these if the
# character encoding is not aware (e.g. exception message) or is known to
# be locale dependent (e.g. date formatting.)
if pycompat.ispy3:
    strtolocal = unitolocal
    strfromlocal = unifromlocal
    strmethod = unimethod
else:

    def strtolocal(s):
        # type: (str) -> bytes
        # on Python 2 a native str is already a byte string
        return s  # pytype: disable=bad-return-type

    def strfromlocal(s):
        # type: (bytes) -> str
        # on Python 2 a byte string is already a native str
        return s  # pytype: disable=bad-return-type

    strmethod = pycompat.identity
286
286
287
287
def lower(s):
    # type: (bytes) -> bytes
    """best-effort encoding-aware case-folding of local string s"""
    try:
        # fast path for ASCII-only strings
        return asciilower(s)
    except UnicodeDecodeError:
        pass
    try:
        if isinstance(s, localstr):
            # use the cached UTF-8 form so folding sees the real characters
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        lu = u.lower()
        if u == lu:
            return s  # preserve localstring
        return lu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.lower()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        # unknown codec name in `encoding`
        raise error.Abort(k, hint=b"please check your locale settings")
309
309
310
310
def upper(s):
    # type: (bytes) -> bytes
    """best-effort encoding-aware case-folding of local string s"""
    # fast path for ASCII-only strings; non-ASCII input raises
    # UnicodeDecodeError and takes the encoding-aware fallback
    try:
        result = asciiupper(s)
    except UnicodeDecodeError:
        result = upperfallback(s)
    return result
318
318
319
319
def upperfallback(s):
    # type: (Any) -> Any
    # encoding-aware uppercasing for strings that aren't pure ASCII;
    # mirrors the slow path of lower() above
    try:
        if isinstance(s, localstr):
            # use the cached UTF-8 form so folding sees the real characters
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        uu = u.upper()
        if u == uu:
            return s  # preserve localstring
        return uu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.upper()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        # unknown codec name in `encoding`
        raise error.Abort(k, hint=b"please check your locale settings")
336
336
337
337
if not _nativeenviron:
    # now encoding and helper functions are available, recreate the environ
    # dict to be exported to other modules
    if pycompat.iswindows and pycompat.ispy3:

        class WindowsEnviron(dict):
            """`os.environ` normalizes environment variables to uppercase on windows"""

            # NOTE(review): only get() normalizes the key to uppercase;
            # __getitem__, __contains__ and item assignment still use the
            # key as given -- confirm callers rely on get() alone for
            # case-insensitive lookups.
            def get(self, key, default=None):
                return super().get(upper(key), default)

        environ = WindowsEnviron()

    for k, v in os.environ.items():  # re-exports
        environ[tolocal(k.encode('utf-8'))] = tolocal(v.encode('utf-8'))
354
if pycompat.ispy3:
    # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
    # returns bytes.
    if pycompat.iswindows:
        # Python 3 on Windows issues a DeprecationWarning about using the bytes
        # API when os.getcwdb() is called.
        #
        # Additionally, py3.8+ uppercases the drive letter when calling
        # os.path.realpath(), which is used on ``repo.root``. Since those
        # strings are compared in various places as simple strings, also call
        # realpath here. See https://bugs.python.org/issue40368
        getcwd = lambda: strtolocal(os.path.realpath(os.getcwd()))  # re-exports
    else:
        getcwd = os.getcwdb  # re-exports
else:
    # Python 2: os.getcwd() already returns bytes
    getcwd = os.getcwd  # re-exports
362
371
# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
# The letters are unicodedata.east_asian_width() categories -- W(ide),
# F(ullwidth) and optionally A(mbiguous); ucolwidth() counts characters in
# these categories as two columns.
_wide = _sysstr(
    environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
    and b"WFA"
    or b"WF"
)
369
378
370
379
def colwidth(s):
    # type: (bytes) -> int
    """Find the column width of a string for display in the local encoding"""
    decoded = s.decode(_sysstr(encoding), 'replace')
    return ucolwidth(decoded)
375
384
376
385
def ucolwidth(d):
    # type: (Text) -> int
    """Find the column width of a Unicode string for display"""
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        # east_asian_width unavailable: assume one column per character
        return len(d)
    return sum(2 if eaw(c) in _wide else 1 for c in d)
384
393
385
394
def getcols(s, start, c):
    # type: (bytes, int, int) -> bytes
    """Use colwidth to find a c-column substring of s starting at byte
    index start"""
    # grow the candidate byte range until it renders as exactly c columns
    for end in pycompat.xrange(start + c, len(s)):
        candidate = s[start:end]
        if colwidth(candidate) == c:
            return candidate
    raise ValueError('substring not found')
395
404
396
405
def trim(s, width, ellipsis=b'', leftside=False):
    # type: (bytes, int, bytes, bool) -> bytes
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    >>> from .node import bin
    >>> def bprint(s):
    ...     print(pycompat.sysstr(s))
    >>> ellipsis = b'+++'
    >>> from . import encoding
    >>> encoding.encoding = b'utf-8'
    >>> t = b'1234567890'
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    12345+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++67890
    >>> bprint(trim(t, 8))
    12345678
    >>> bprint(trim(t, 8, leftside=True))
    34567890
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(pycompat.sysstr(encoding.encoding))
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 5))
    \xe3\x81\x82\xe3\x81\x84
    >>> bprint(trim(t, 5, leftside=True))
    \xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 4, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
    +++
    >>> t = bin(b'112233445566778899aa') # invalid byte sequence
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8))
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> bprint(trim(t, 8, leftside=True))
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    """
    try:
        u = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        # undecodable input: fall back to trimming by bytes, not columns
        if len(s) <= width:  # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0:  # no enough room even for ellipsis
            return ellipsis[: width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width:  # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0:  # no enough room even for ellipsis
        return ellipsis[: width + len(ellipsis)]

    if leftside:
        uslice = lambda i: u[i:]
        concat = lambda s: ellipsis + s
    else:
        uslice = lambda i: u[:-i]
        concat = lambda s: s + ellipsis
    # drop characters one at a time from the trimmed side until the rest
    # fits in `width` display columns
    for i in pycompat.xrange(1, len(u)):
        usub = uslice(i)
        if ucolwidth(usub) <= width:
            return concat(usub.encode(_sysstr(encoding)))
    return ellipsis  # no enough room for multi-column characters
493
502
494
503
class normcasespecs(object):
    """what a platform's normcase does to ASCII strings

    This is specified per platform, and should be consistent with what
    normcase on that platform actually does.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h."""

    # numeric codes mirror normcase_spec in util.h
    lower = -1
    upper = 1
    other = 0
510
519
511
520
def jsonescape(s, paranoid=False):
    # type: (Any, Any) -> Any
    """returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr/safelocalstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape(b'this is a test')
    'this is a test'
    >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
    'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
    >>> jsonescape(b'a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape(b'')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> s = b'escape characters: \\0 \\x0b \\x7f'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape(b'<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    """

    # normalize to UTF-8b first so all inputs share one escaping path
    u8chars = toutf8b(s)
    try:
        # the accelerated escaper signals unsupported input with
        # ValueError, in which case we fall back to the pure-Python one
        return _jsonescapeu8fast(u8chars, paranoid)
    except ValueError:
        return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
564
573
565
574
# We need to decode/encode U+DCxx codes transparently since invalid UTF-8
# bytes are mapped to that range.
_utf8strict = r'surrogatepass' if pycompat.ispy3 else r'strict'

# Total byte length of a UTF-8 sequence, indexed by the high nibble of its
# first byte.  0 means plain ASCII (handled separately); entries 8-11 cover
# bare continuation bytes, for which the 1-byte "sequence" will then fail
# the validating decode in getutf8char().
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
574
583
575
584
def getutf8char(s, pos):
    # type: (bytes, int) -> bytes
    """get the next full utf-8 character in the given string, starting at pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    """

    # the first nibble of the lead byte tells us how many bytes to take
    width = _utf8len[ord(s[pos : pos + 1]) >> 4]
    if width == 0:
        # ASCII byte: trivially a complete character
        return s[pos : pos + 1]

    char = s[pos : pos + width]
    # validate the sequence with an attempted decode (result discarded)
    char.decode("utf-8", _utf8strict)
    return char
593
602
594
603
def toutf8b(s):
    # type: (bytes) -> bytes
    """convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can have
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    """

    if isinstance(s, localstr):
        # assume that the original UTF-8 sequence would never contain
        # invalid characters in U+DCxx range
        return s._utf8
    if isinstance(s, safelocalstr):
        # already verified that s is non-lossy in legacy encoding, which
        # shouldn't contain characters in U+DCxx range
        return fromlocal(s)
    if isasciistr(s):
        return s
    if b"\xed" not in s:
        # no bytes from the surrogate area present: if the string
        # decodes cleanly it is already valid UTF-8 and passes through
        try:
            s.decode('utf-8', _utf8strict)
            return s
        except UnicodeDecodeError:
            pass

    s = pycompat.bytestr(s)

    def _surrogate(index):
        # map the raw byte at *index* into the U+DC00-U+DCFF range
        return unichr(0xDC00 + ord(s[index])).encode('utf-8', _utf8strict)

    out = b""
    i = 0
    end = len(s)
    while i < end:
        try:
            ch = getutf8char(s, i)
            if b"\xed\xb0\x80" <= ch <= b"\xed\xb3\xbf":
                # have to re-escape existing U+DCxx characters
                ch = _surrogate(i)
                i += 1
            else:
                i += len(ch)
        except UnicodeDecodeError:
            # invalid sequence: escape the single offending byte
            ch = _surrogate(i)
            i += 1
        out += ch
    return out
662
671
663
672
def fromutf8b(s):
    # type: (bytes) -> bytes
    """Given a UTF-8b string, return a local, possibly-binary string.

    return the original binary string. This
    is a round-trip process for strings like filenames, but metadata
    that's was passed through tolocal will remain in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = b"\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip(b"\\xc2\\xc2\\x80")
    True
    >>> roundtrip(b"\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
    True
    """

    if isasciistr(s):
        return s
    # fast path - look for uDxxx prefixes in s
    if b"\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    s = pycompat.bytestr(s)
    pieces = []
    i = 0
    end = len(s)
    while i < end:
        ch = getutf8char(s, i)
        i += len(ch)
        # unescape U+DCxx characters back to the raw byte they encode
        if b"\xed\xb0\x80" <= ch <= b"\xed\xb3\xbf":
            ch = pycompat.bytechr(ord(ch.decode("utf-8", _utf8strict)) & 0xFF)
        pieces.append(ch)
    return b"".join(pieces)
General Comments 0
You need to be logged in to leave comments. Login now