##// END OF EJS Templates
encoding: remove Python 2 support code...
Gregory Szorc -
r49747:fa2b1a46 default
parent child Browse files
Show More
@@ -1,745 +1,725 b''
1 # encoding.py - character transcoding support for Mercurial
1 # encoding.py - character transcoding support for Mercurial
2 #
2 #
3 # Copyright 2005-2009 Olivia Mackall <olivia@selenic.com> and others
3 # Copyright 2005-2009 Olivia Mackall <olivia@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8
8
9 import locale
9 import locale
10 import os
10 import os
11 import re
11 import re
12 import unicodedata
12 import unicodedata
13
13
14 from .pycompat import getattr
14 from .pycompat import getattr
15 from . import (
15 from . import (
16 error,
16 error,
17 policy,
17 policy,
18 pycompat,
18 pycompat,
19 )
19 )
20
20
21 from .pure import charencode as charencodepure
21 from .pure import charencode as charencodepure
22
22
if pycompat.TYPE_CHECKING:
    from typing import (
        Any,
        Callable,
        List,
        Text,
        Type,
        TypeVar,
        Union,
    )

    # keep pyflakes happy
    for t in (Any, Callable, List, Text, Type, Union):
        assert t

    _Tlocalstr = TypeVar('_Tlocalstr', bound='localstr')

# character-encoding helpers; policy.importmod picks the C implementation
# when available, falling back to the pure-Python one
charencode = policy.importmod('charencode')

isasciistr = charencode.isasciistr
asciilower = charencode.asciilower
asciiupper = charencode.asciiupper
_jsonescapeu8fast = charencode.jsonescapeu8fast

# shorthand for converting bytes to the native str type
_sysstr = pycompat.sysstr
48
48
49 if pycompat.ispy3:
49 unichr = chr
50 unichr = chr
51
50
52 # These unicode characters are ignored by HFS+ (Apple Technote 1150,
51 # These unicode characters are ignored by HFS+ (Apple Technote 1150,
53 # "Unicode Subtleties"), so we need to ignore them in some places for
52 # "Unicode Subtleties"), so we need to ignore them in some places for
54 # sanity.
53 # sanity.
55 _ignore = [
54 _ignore = [
56 unichr(int(x, 16)).encode("utf-8")
55 unichr(int(x, 16)).encode("utf-8")
57 for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e "
56 for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e "
58 b"206a 206b 206c 206d 206e 206f feff".split()
57 b"206a 206b 206c 206d 206e 206f feff".split()
59 ]
58 ]
60 # verify the next function will work
59 # verify the next function will work
61 assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)
60 assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)
62
61
63
62
def hfsignoreclean(s):
    # type: (bytes) -> bytes
    """Remove codepoints ignored by HFS+ from s.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # every ignorable sequence starts with 0xe2 or 0xef, so skip the
    # replacement loop entirely when neither lead byte is present
    if b"\xe2" not in s and b"\xef" not in s:
        return s
    for junk in _ignore:
        s = s.replace(junk, b'')
    return s
77
76
78
77
79 # encoding.environ is provided read-only, which may not be used to modify
78 # encoding.environ is provided read-only, which may not be used to modify
80 # the process environment
79 # the process environment
81 _nativeenviron = not pycompat.ispy3 or os.supports_bytes_environ
80 _nativeenviron = os.supports_bytes_environ
82 if not pycompat.ispy3:
81 if _nativeenviron:
83 environ = os.environ # re-exports
84 elif _nativeenviron:
85 environ = os.environb # re-exports
82 environ = os.environb # re-exports
86 else:
83 else:
87 # preferred encoding isn't known yet; use utf-8 to avoid unicode error
84 # preferred encoding isn't known yet; use utf-8 to avoid unicode error
88 # and recreate it once encoding is settled
85 # and recreate it once encoding is settled
89 environ = {
86 environ = {
90 k.encode('utf-8'): v.encode('utf-8')
87 k.encode('utf-8'): v.encode('utf-8')
91 for k, v in os.environ.items() # re-exports
88 for k, v in os.environ.items() # re-exports
92 }
89 }
93
90
# Map locale names reported by the platform to names Python's codec
# machinery understands.
_encodingrewrites = {
    b'646': b'ascii',
    b'ANSI_X3.4-1968': b'ascii',
}
# cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
# No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
# https://bugs.python.org/issue13216
if pycompat.iswindows:
    _encodingrewrites[b'cp65001'] = b'utf-8'
103
100
try:
    # HGENCODING overrides the locale-derived encoding
    encoding = environ.get(b"HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding().encode('ascii') or b'ascii'
    encoding = _encodingrewrites.get(encoding, encoding)
except locale.Error:
    # locale setup is broken; fall back to the safest choice
    encoding = b'ascii'
# how decode errors are handled: 'strict' (abort), 'replace' or 'ignore'
encodingmode = environ.get(b"HGENCODINGMODE", b"strict")
# encoding assumed for pre-locale-support repository data
fallbackencoding = b'ISO-8859-1'
113
110
114
111
class localstr(bytes):
    """This class allows strings that are unmodified to be
    round-tripped to the local encoding and back"""

    def __new__(cls, u, l):
        # the bytes value is the local-encoding form; the original UTF-8
        # form is stashed on the instance for lossless round-tripping
        obj = bytes.__new__(cls, l)
        obj._utf8 = u
        return obj

    def __hash__(self):
        return hash(self._utf8)  # avoid collisions in local string space

    if pycompat.TYPE_CHECKING:
        # pseudo implementation to help pytype see localstr() constructor
        def __init__(self, u, l):
            # type: (bytes, bytes) -> None
            super(localstr, self).__init__(l)
            self._utf8 = u
133
130
134
131
class safelocalstr(bytes):
    """Tagged string denoting it was previously an internal UTF-8 string,
    and can be converted back to UTF-8 losslessly

    Unlike localstr, it compares and hashes exactly like plain bytes:

    >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
    >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
    >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
    >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
    """
144
141
145
142
def tolocal(s):
    # type: (bytes) -> bytes
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    >>> u = b'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = b'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> b'foo: ?' in d
    False
    >>> l1 = b'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    if isasciistr(s):
        # ASCII is valid in every supported encoding; nothing to convert
        return s

    try:
        try:
            # make sure string is actually stored in UTF-8
            u = s.decode('UTF-8')
            if encoding == b'UTF-8':
                # fast path
                return s
            r = u.encode(_sysstr(encoding), "replace")
            if u == r.decode(_sysstr(encoding)):
                # r is a safe, non-lossy encoding of s
                return safelocalstr(r)
            # lossy: keep the UTF-8 original alongside for round-tripping
            return localstr(s, r)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                u = s.decode(_sysstr(fallbackencoding))
                r = u.encode(_sysstr(encoding), "replace")
                if u == r.decode(_sysstr(encoding)):
                    # r is a safe, non-lossy encoding of s
                    return safelocalstr(r)
                return localstr(u.encode('UTF-8'), r)
            except UnicodeDecodeError:
                u = s.decode("utf-8", "replace")  # last ditch
                # can't round-trip
                return u.encode(_sysstr(encoding), "replace")
    except LookupError as k:
        # the configured locale names an encoding Python does not know
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
213
210
214
211
def fromlocal(s):
    # type: (bytes) -> bytes
    """
    Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.
    """
    # can we do a lossless round-trip?
    if isinstance(s, localstr):
        return s._utf8
    if isasciistr(s):
        return s

    try:
        decoded = s.decode(_sysstr(encoding), _sysstr(encodingmode))
        return decoded.encode("utf-8")
    except UnicodeDecodeError as inst:
        # show a little context around the undecodable bytes
        sub = s[max(0, inst.start - 10) : inst.start + 10]
        raise error.Abort(
            b"decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
        )
    except LookupError as err:
        raise error.Abort(
            pycompat.bytestr(err), hint=b"please check your locale settings"
        )
245
242
246
243
def unitolocal(u):
    # type: (Text) -> bytes
    """Convert a unicode string to a byte string of local encoding"""
    as_utf8 = u.encode('utf-8')
    return tolocal(as_utf8)
251
248
252
249
def unifromlocal(s):
    # type: (bytes) -> Text
    """Convert a byte string of local encoding to a unicode string"""
    as_utf8 = fromlocal(s)
    return as_utf8.decode('utf-8')
257
254
258
255
def unimethod(bytesfunc):
    # type: (Callable[[Any], bytes]) -> Callable[[Any], Text]
    """Create a proxy method that forwards __unicode__() and __str__() of
    Python 3 to __bytes__()"""

    def unifunc(instance):
        # delegate to the bytes producer, then decode to native str
        return unifromlocal(bytesfunc(instance))

    return unifunc
268
265
269
266
# converter functions between native str and byte string. use these if the
# character encoding is not aware (e.g. exception message) or is known to
# be locale dependent (e.g. date formatting.)
# native str is unicode here, so these are plain aliases of the uni* helpers
strtolocal = unitolocal
strfromlocal = unifromlocal
strmethod = unimethod
273
289
274
def lower(s):
    # type: (bytes) -> bytes
    """best-effort encoding-aware case-folding of local string s"""
    try:
        return asciilower(s)
    except UnicodeDecodeError:
        pass
    try:
        if isinstance(s, localstr):
            text = s._utf8.decode("utf-8")
        else:
            text = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        folded = text.lower()
        if folded == text:
            # nothing changed: hand back the original (possibly a localstr)
            return s
        return folded.encode(_sysstr(encoding))
    except UnicodeError:
        return s.lower()  # we don't know how to fold this except in ASCII
    except LookupError as err:
        raise error.Abort(
            pycompat.bytestr(err), hint=b"please check your locale settings"
        )
313
298
314
299
def upper(s):
    # type: (bytes) -> bytes
    """best-effort encoding-aware case-folding of local string s"""
    try:
        # fast path for pure-ASCII input
        return asciiupper(s)
    except UnicodeDecodeError:
        return upperfallback(s)
322
307
323
308
def upperfallback(s):
    # type: (Any) -> Any
    # encoding-aware uppercase for strings that are not pure ASCII
    try:
        if isinstance(s, localstr):
            text = s._utf8.decode("utf-8")
        else:
            text = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        folded = text.upper()
        if folded == text:
            return s  # preserve localstring
        return folded.encode(_sysstr(encoding))
    except UnicodeError:
        return s.upper()  # we don't know how to fold this except in ASCII
    except LookupError as err:
        raise error.Abort(
            pycompat.bytestr(err), hint=b"please check your locale settings"
        )
342
327
343
328
if not _nativeenviron:
    # now encoding and helper functions are available, recreate the environ
    # dict to be exported to other modules
    if pycompat.iswindows:

        class WindowsEnviron(dict):
            """`os.environ` normalizes environment variables to uppercase on windows"""

            def get(self, key, default=None):
                # look up the uppercased key so lookups are case-insensitive
                return super().get(upper(key), default)

        environ = WindowsEnviron()

    # convert the OS's str keys/values into local-encoding bytes
    for k, v in os.environ.items():  # re-exports
        environ[tolocal(k.encode('utf-8'))] = tolocal(v.encode('utf-8'))
359
344
360
345
# matches a lower-case Windows drive-letter prefix such as b'c:'
DRIVE_RE = re.compile(b'^[a-z]:')
362
347
# os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
# returns bytes.
if pycompat.iswindows:
    # Python 3 on Windows issues a DeprecationWarning about using the bytes
    # API when os.getcwdb() is called.
    #
    # Additionally, py3.8+ uppercases the drive letter when calling
    # os.path.realpath(), which is used on ``repo.root``. Since those
    # strings are compared in various places as simple strings, also call
    # realpath here. See https://bugs.python.org/issue40368
    #
    # However this is not reliable, so lets explicitly make this drive
    # letter upper case.
    #
    # note: we should consider dropping realpath here since it seems to
    # change the semantic of `getcwd`.

    def getcwd():
        cwd = os.getcwd()  # re-exports
        cwd = os.path.realpath(cwd)
        cwd = strtolocal(cwd)
        if DRIVE_RE.match(cwd):
            # force an upper-case drive letter for stable comparisons
            cwd = cwd[0:1].upper() + cwd[1:]
        return cwd

else:
    getcwd = os.getcwdb  # re-exports
393
376
# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
_wide = _sysstr(
    b"WFA"
    if environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
    else b"WF"
)
400
383
401
384
def colwidth(s):
    # type: (bytes) -> int
    """Find the column width of a string for display in the local encoding"""
    decoded = s.decode(_sysstr(encoding), 'replace')
    return ucolwidth(decoded)
406
389
407
390
def ucolwidth(d):
    # type: (Text) -> int
    """Find the column width of a Unicode string for display"""
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        # no width data available; assume one column per character
        return len(d)
    return sum(2 if eaw(c) in _wide else 1 for c in d)
415
398
416
399
def getcols(s, start, c):
    # type: (bytes, int, int) -> bytes
    """Use colwidth to find a c-column substring of s starting at byte
    index start"""
    for end in pycompat.xrange(start + c, len(s)):
        candidate = s[start:end]
        if colwidth(candidate) == c:
            return candidate
    raise ValueError('substring not found')
426
409
427
410
def trim(s, width, ellipsis=b'', leftside=False):
    # type: (bytes, int, bytes, bool) -> bytes
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    >>> from .node import bin
    >>> def bprint(s):
    ...     print(pycompat.sysstr(s))
    >>> ellipsis = b'+++'
    >>> from . import encoding
    >>> encoding.encoding = b'utf-8'
    >>> t = b'1234567890'
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    12345+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++67890
    >>> bprint(trim(t, 8))
    12345678
    >>> bprint(trim(t, 8, leftside=True))
    34567890
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(pycompat.sysstr(encoding.encoding))
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 5))
    \xe3\x81\x82\xe3\x81\x84
    >>> bprint(trim(t, 5, leftside=True))
    \xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 4, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
    +++
    >>> t = bin(b'112233445566778899aa') # invalid byte sequence
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8))
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> bprint(trim(t, 8, leftside=True))
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    """
    try:
        u = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        # undecodable input: fall back to trimming by raw byte count
        if len(s) <= width:  # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0:  # no enough room even for ellipsis
            return ellipsis[: width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width:  # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0:  # no enough room even for ellipsis
        return ellipsis[: width + len(ellipsis)]

    # accumulate characters (from the kept side) until the column budget
    # would be exceeded
    chars = list(u)
    if leftside:
        chars.reverse()
    width_so_far = 0
    for i, c in enumerate(chars):
        width_so_far += ucolwidth(c)
        if width_so_far > width:
            break
    chars = chars[:i]
    if leftside:
        chars.reverse()
    u = u''.join(chars).encode(_sysstr(encoding))
    if leftside:
        return ellipsis + u
    return u + ellipsis
528
511
529
512
class normcasespecs(object):
    """what a platform's normcase does to ASCII strings

    This is specified per platform, and should be consistent with what normcase
    on that platform actually does.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h."""

    other = 0
    lower = -1
    upper = 1
545
528
546
529
def jsonescape(s, paranoid=False):
    # type: (Any, Any) -> Any
    """returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr/safelocalstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape(b'this is a test')
    'this is a test'
    >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
    'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
    >>> jsonescape(b'a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape(b'')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> s = b'escape characters: \\0 \\x0b \\x7f'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape(b'<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    """

    # Normalize to UTF-8b first so arbitrary bytes survive the escaping.
    encoded = toutf8b(s)
    try:
        # Fast path: the (possibly C-accelerated) escaper; it signals
        # inputs it cannot handle by raising ValueError.
        return _jsonescapeu8fast(encoded, paranoid)
    except ValueError:
        pass
    # Slow path: pure-Python fallback for strings the fast path rejected.
    return charencodepure.jsonescapeu8fallback(encoded, paranoid)
599
582
600
583
601 # We need to decode/encode U+DCxx codes transparently since invalid UTF-8
584 # We need to decode/encode U+DCxx codes transparently since invalid UTF-8
602 # bytes are mapped to that range.
585 # bytes are mapped to that range.
603 if pycompat.ispy3:
586 _utf8strict = r'surrogatepass'
604 _utf8strict = r'surrogatepass'
605 else:
606 _utf8strict = r'strict'
607
587
608 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
588 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
609
589
610
590
def getutf8char(s, pos):
    # type: (bytes, int) -> bytes
    """get the next full utf-8 character in the given string, starting at pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    """

    # The high nibble of the first byte tells us how many bytes to
    # attempt decoding.
    first = s[pos : pos + 1]
    seqlen = _utf8len[ord(first) >> 4]
    if seqlen == 0:
        # ascii: single byte, no validation needed
        return first

    seq = s[pos : pos + seqlen]
    # validate with attempted decode (raises on a malformed sequence)
    seq.decode("utf-8", _utf8strict)
    return seq
628
608
629
609
def toutf8b(s):
    # type: (bytes) -> bytes
    """convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can have
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    """

    if isinstance(s, localstr):
        # assume that the original UTF-8 sequence would never contain
        # invalid characters in U+DCxx range
        return s._utf8
    elif isinstance(s, safelocalstr):
        # already verified that s is non-lossy in legacy encoding, which
        # shouldn't contain characters in U+DCxx range
        return fromlocal(s)
    elif isasciistr(s):
        return s
    if b"\xed" not in s:
        # no surrogate-range lead byte present: a plain UTF-8 validity
        # check is sufficient for the fast path
        try:
            s.decode('utf-8', _utf8strict)
            return s
        except UnicodeDecodeError:
            pass

    s = pycompat.bytestr(s)
    r = b""
    pos = 0
    l = len(s)
    while pos < l:
        try:
            c = getutf8char(s, pos)
            if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
                # have to re-escape existing U+DCxx characters
                # (fix: use chr() — unichr is the Python 2 builtin and is
                # undefined on Python 3 with py2 support removed)
                c = chr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
                pos += 1
            else:
                pos += len(c)
        except UnicodeDecodeError:
            # invalid byte: map it into the U+DCxx surrogate range
            c = chr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
            pos += 1
        r += c
    return r
697
677
698
678
def fromutf8b(s):
    # type: (bytes) -> bytes
    """Given a UTF-8b string, return a local, possibly-binary string.

    return the original binary string. This
    is a round-trip process for strings like filenames, but metadata
    that was passed through tolocal will remain in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = b"\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip(b"\\xc2\\xc2\\x80")
    True
    >>> roundtrip(b"\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
    True
    """

    if isasciistr(s):
        return s
    # fast path - look for uDxxx prefixes in s
    if b"\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    s = pycompat.bytestr(s)
    parts = []
    end = len(s)
    idx = 0
    while idx < end:
        ch = getutf8char(s, idx)
        idx += len(ch)
        # unescape U+DCxx characters back to the raw byte they encode
        if b"\xed\xb0\x80" <= ch <= b"\xed\xb3\xbf":
            ch = pycompat.bytechr(ord(ch.decode("utf-8", _utf8strict)) & 0xFF)
        parts.append(ch)
    return b"".join(parts)
General Comments 0
You need to be logged in to leave comments. Login now