##// END OF EJS Templates
windows: enforce upper case drive letter for getcwd in mercurial too...
marmoute -
r48421:d6ee6456 default
parent child Browse files
Show More
@@ -1,719 +1,736 b''
1 # encoding.py - character transcoding support for Mercurial
1 # encoding.py - character transcoding support for Mercurial
2 #
2 #
3 # Copyright 2005-2009 Olivia Mackall <olivia@selenic.com> and others
3 # Copyright 2005-2009 Olivia Mackall <olivia@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8 from __future__ import absolute_import, print_function
8 from __future__ import absolute_import, print_function
9
9
10 import locale
10 import locale
11 import os
11 import os
12 import re
12 import unicodedata
13 import unicodedata
13
14
14 from .pycompat import getattr
15 from .pycompat import getattr
15 from . import (
16 from . import (
16 error,
17 error,
17 policy,
18 policy,
18 pycompat,
19 pycompat,
19 )
20 )
20
21
21 from .pure import charencode as charencodepure
22 from .pure import charencode as charencodepure
22
23
if pycompat.TYPE_CHECKING:
    from typing import (
        Any,
        Callable,
        List,
        Text,
        Type,
        TypeVar,
        Union,
    )

    # keep pyflakes happy
    for t in (Any, Callable, List, Text, Type, Union):
        assert t

    # type variable bound to localstr, for annotating factory-style helpers
    _Tlocalstr = TypeVar('_Tlocalstr', bound='localstr')

# C-accelerated or pure-Python character-encoding helpers, chosen by the
# policy module at import time.
charencode = policy.importmod('charencode')

isasciistr = charencode.isasciistr
asciilower = charencode.asciilower
asciiupper = charencode.asciiupper
_jsonescapeu8fast = charencode.jsonescapeu8fast

_sysstr = pycompat.sysstr

if pycompat.ispy3:
    unichr = chr

# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
_ignore = [
    unichr(int(x, 16)).encode("utf-8")
    for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e "
    b"206a 206b 206c 206d 206e 206f feff".split()
]
# verify the next function will work
assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)
62
63
63
64
def hfsignoreclean(s):
    # type: (bytes) -> bytes
    """Remove codepoints ignored by HFS+ from s.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # Every ignorable codepoint's UTF-8 form starts with 0xe2 or 0xef
    # (asserted next to _ignore), so a cheap containment probe lets the
    # common case skip the replace loop entirely.
    if b"\xe2" not in s and b"\xef" not in s:
        return s
    for junk in _ignore:
        s = s.replace(junk, b'')
    return s
77
78
78
79
# encoding.environ is provided read-only, which may not be used to modify
# the process environment
_nativeenviron = not pycompat.ispy3 or os.supports_bytes_environ
if not pycompat.ispy3:
    environ = os.environ  # re-exports
elif _nativeenviron:
    environ = os.environb  # re-exports
else:
    # preferred encoding isn't known yet; use utf-8 to avoid unicode error
    # and recreate it once encoding is settled
    environ = {
        k.encode('utf-8'): v.encode('utf-8')
        for k, v in os.environ.items()  # re-exports
    }

# map platform-reported locale codec names to their canonical Python names
_encodingrewrites = {
    b'646': b'ascii',
    b'ANSI_X3.4-1968': b'ascii',
}
# cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
# No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
# https://bugs.python.org/issue13216
if pycompat.iswindows and not pycompat.ispy3:
    _encodingrewrites[b'cp65001'] = b'utf-8'

try:
    # HGENCODING overrides the locale-derived default encoding
    encoding = environ.get(b"HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding().encode('ascii') or b'ascii'
    encoding = _encodingrewrites.get(encoding, encoding)
except locale.Error:
    encoding = b'ascii'
encodingmode = environ.get(b"HGENCODINGMODE", b"strict")
fallbackencoding = b'ISO-8859-1'
113
114
114
115
class localstr(bytes):
    """This class allows strings that are unmodified to be
    round-tripped to the local encoding and back"""

    def __new__(cls, u, l):
        # type: (bytes, bytes) -> localstr
        # The bytes value of the instance is the local-encoding form `l`;
        # the known-good UTF-8 form `u` is stashed on the instance so
        # fromlocal() can recover it losslessly.
        s = bytes.__new__(cls, l)
        s._utf8 = u
        return s

    if pycompat.TYPE_CHECKING:
        # pseudo implementation to help pytype see localstr() constructor
        def __init__(self, u, l):
            # type: (bytes, bytes) -> None
            super(localstr, self).__init__(l)
            self._utf8 = u

    def __hash__(self):
        return hash(self._utf8)  # avoid collisions in local string space
133
134
134
135
class safelocalstr(bytes):
    """Tagged string denoting it was previously an internal UTF-8 string,
    and can be converted back to UTF-8 losslessly

    >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
    >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
    >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
    >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
    """
144
145
145
146
def tolocal(s):
    # type: (bytes) -> bytes
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    >>> u = b'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = b'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> b'foo: ?' in d
    False
    >>> l1 = b'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    if isasciistr(s):
        return s

    try:
        try:
            # make sure string is actually stored in UTF-8
            u = s.decode('UTF-8')
            if encoding == b'UTF-8':
                # fast path
                return s
            r = u.encode(_sysstr(encoding), "replace")
            if u == r.decode(_sysstr(encoding)):
                # r is a safe, non-lossy encoding of s
                return safelocalstr(r)
            return localstr(s, r)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                u = s.decode(_sysstr(fallbackencoding))
                r = u.encode(_sysstr(encoding), "replace")
                if u == r.decode(_sysstr(encoding)):
                    # r is a safe, non-lossy encoding of s
                    return safelocalstr(r)
                return localstr(u.encode('UTF-8'), r)
            except UnicodeDecodeError:
                u = s.decode("utf-8", "replace")  # last ditch
                # can't round-trip
                return u.encode(_sysstr(encoding), "replace")
    except LookupError as k:
        # the configured encoding name itself is unknown to Python
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
213
214
214
215
def fromlocal(s):
    # type: (bytes) -> bytes
    """
    Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.
    """

    # can we do a lossless round-trip?
    if isinstance(s, localstr):
        return s._utf8
    if isasciistr(s):
        return s

    try:
        u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
        return u.encode("utf-8")
    except UnicodeDecodeError as inst:
        # show a small window of context around the undecodable bytes
        sub = s[max(0, inst.start - 10) : inst.start + 10]
        raise error.Abort(
            b"decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
        )
    except LookupError as k:
        raise error.Abort(k, hint=b"please check your locale settings")
243
244
244
245
def unitolocal(u):
    # type: (Text) -> bytes
    """Convert a unicode string to a byte string of local encoding"""
    utf8bytes = u.encode('utf-8')
    return tolocal(utf8bytes)
249
250
250
251
def unifromlocal(s):
    # type: (bytes) -> Text
    """Convert a byte string of local encoding to a unicode string"""
    utf8bytes = fromlocal(s)
    return utf8bytes.decode('utf-8')
255
256
256
257
def unimethod(bytesfunc):
    # type: (Callable[[Any], bytes]) -> Callable[[Any], Text]
    """Create a proxy method that forwards __unicode__() and __str__() of
    Python 3 to __bytes__()"""

    def unifunc(obj):
        raw = bytesfunc(obj)
        return unifromlocal(raw)

    return unifunc
266
267
267
268
# converter functions between native str and byte string. use these if the
# character encoding is not aware (e.g. exception message) or is known to
# be locale dependent (e.g. date formatting.)
if pycompat.ispy3:
    strtolocal = unitolocal
    strfromlocal = unifromlocal
    strmethod = unimethod
else:

    def strtolocal(s):
        # type: (str) -> bytes
        # on Python 2 the native str is already a byte string
        return s  # pytype: disable=bad-return-type

    def strfromlocal(s):
        # type: (bytes) -> str
        return s  # pytype: disable=bad-return-type

    strmethod = pycompat.identity
286
287
287
288
def lower(s):
    # type: (bytes) -> bytes
    """best-effort encoding-aware case-folding of local string s"""
    try:
        # fast path for pure-ASCII input
        return asciilower(s)
    except UnicodeDecodeError:
        pass
    try:
        if isinstance(s, localstr):
            # use the cached lossless UTF-8 form
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        lu = u.lower()
        if u == lu:
            return s  # preserve localstring
        return lu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.lower()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint=b"please check your locale settings")
309
310
310
311
def upper(s):
    # type: (bytes) -> bytes
    """best-effort encoding-aware case-folding of local string s"""
    try:
        # fast path for pure-ASCII input
        return asciiupper(s)
    except UnicodeDecodeError:
        # non-ASCII bytes: take the slower locale-aware path
        return upperfallback(s)
318
319
319
320
def upperfallback(s):
    # type: (Any) -> Any
    """locale-aware uppercasing for strings containing non-ASCII bytes"""
    try:
        if isinstance(s, localstr):
            # use the cached lossless UTF-8 form
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        uu = u.upper()
        if u == uu:
            return s  # preserve localstring
        return uu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.upper()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint=b"please check your locale settings")
336
337
337
338
if not _nativeenviron:
    # now encoding and helper functions are available, recreate the environ
    # dict to be exported to other modules
    if pycompat.iswindows and pycompat.ispy3:

        class WindowsEnviron(dict):
            """`os.environ` normalizes environment variables to uppercase on windows"""

            def get(self, key, default=None):
                # match os.environ's case-insensitive lookup semantics
                return super().get(upper(key), default)

        environ = WindowsEnviron()

    for k, v in os.environ.items():  # re-exports
        environ[tolocal(k.encode('utf-8'))] = tolocal(v.encode('utf-8'))
353
354
354
355
# matches a cwd starting with a lower-case drive letter (e.g. b'c:\\...');
# already-upper-case letters deliberately don't match and need no rewrite
DRIVE_RE = re.compile(b'^[a-z]:')

if pycompat.ispy3:
    # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
    # returns bytes.
    if pycompat.iswindows:
        # Python 3 on Windows issues a DeprecationWarning about using the bytes
        # API when os.getcwdb() is called.
        #
        # Additionally, py3.8+ uppercases the drive letter when calling
        # os.path.realpath(), which is used on ``repo.root``. Since those
        # strings are compared in various places as simple strings, also call
        # realpath here. See https://bugs.python.org/issue40368
        #
        # However this is not reliable, so lets explicitly make this drive
        # letter upper case.
        #
        # note: we should consider dropping realpath here since it seems to
        # change the semantic of `getcwd`.

        def getcwd():
            # type: () -> bytes
            """Return the cwd as local bytes with an upper-cased drive letter."""
            cwd = os.getcwd()  # re-exports
            cwd = os.path.realpath(cwd)
            cwd = strtolocal(cwd)
            if DRIVE_RE.match(cwd):
                # normalize e.g. b'c:\\repo' to b'C:\\repo' so repo.root
                # comparisons are consistent across Python versions
                cwd = cwd[0:1].upper() + cwd[1:]
            return cwd

    else:
        getcwd = os.getcwdb  # re-exports
else:
    getcwd = os.getcwd  # re-exports
371
388
# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
# The value is the set of east_asian_width() categories counted as 2 columns:
# W(ide), F(ullwidth), and optionally A(mbiguous).
_wide = _sysstr(
    environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
    and b"WFA"
    or b"WF"
)
378
395
379
396
def colwidth(s):
    # type: (bytes) -> int
    """Find the column width of a string for display in the local encoding"""
    decoded = s.decode(_sysstr(encoding), 'replace')
    return ucolwidth(decoded)
384
401
385
402
def ucolwidth(d):
    # type: (Text) -> int
    """Find the column width of a Unicode string for display"""
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        # no East-Asian width data available; assume one cell per character
        return len(d)
    return sum([2 if eaw(c) in _wide else 1 for c in d])
393
410
394
411
def getcols(s, start, c):
    # type: (bytes, int, int) -> bytes
    """Use colwidth to find a c-column substring of s starting at byte
    index start"""
    for end in pycompat.xrange(start + c, len(s)):
        candidate = s[start:end]
        if colwidth(candidate) == c:
            return candidate
    raise ValueError('substring not found')
404
421
405
422
def trim(s, width, ellipsis=b'', leftside=False):
    # type: (bytes, int, bytes, bool) -> bytes
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    >>> from .node import bin
    >>> def bprint(s):
    ...     print(pycompat.sysstr(s))
    >>> ellipsis = b'+++'
    >>> from . import encoding
    >>> encoding.encoding = b'utf-8'
    >>> t = b'1234567890'
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    12345+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++67890
    >>> bprint(trim(t, 8))
    12345678
    >>> bprint(trim(t, 8, leftside=True))
    34567890
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(pycompat.sysstr(encoding.encoding))
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 5))
    \xe3\x81\x82\xe3\x81\x84
    >>> bprint(trim(t, 5, leftside=True))
    \xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 4, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
    +++
    >>> t = bin(b'112233445566778899aa') # invalid byte sequence
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8))
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> bprint(trim(t, 8, leftside=True))
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    """
    try:
        u = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        # undecodable input: fall back to byte-based trimming
        if len(s) <= width:  # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0:  # no enough room even for ellipsis
            return ellipsis[: width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width:  # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0:  # no enough room even for ellipsis
        return ellipsis[: width + len(ellipsis)]

    if leftside:
        uslice = lambda i: u[i:]
        concat = lambda s: ellipsis + s
    else:
        uslice = lambda i: u[:-i]
        concat = lambda s: s + ellipsis
    # drop characters one at a time from the trimmed side until the
    # remainder fits in the available display columns
    for i in pycompat.xrange(1, len(u)):
        usub = uslice(i)
        if ucolwidth(usub) <= width:
            return concat(usub.encode(_sysstr(encoding)))
    return ellipsis  # no enough room for multi-column characters
502
519
503
520
class normcasespecs(object):
    """what a platform's normcase does to ASCII strings

    This is specified per platform, and should be consistent with what normcase
    on that platform actually does.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h."""

    # normcase lowercases ASCII (e.g. Windows, macOS)
    lower = -1
    # normcase uppercases ASCII
    upper = 1
    # no simple ASCII rule; always use the fallback function
    other = 0
519
536
520
537
521 def jsonescape(s, paranoid=False):
538 def jsonescape(s, paranoid=False):
522 # type: (Any, Any) -> Any
539 # type: (Any, Any) -> Any
523 """returns a string suitable for JSON
540 """returns a string suitable for JSON
524
541
525 JSON is problematic for us because it doesn't support non-Unicode
542 JSON is problematic for us because it doesn't support non-Unicode
526 bytes. To deal with this, we take the following approach:
543 bytes. To deal with this, we take the following approach:
527
544
528 - localstr/safelocalstr objects are converted back to UTF-8
545 - localstr/safelocalstr objects are converted back to UTF-8
529 - valid UTF-8/ASCII strings are passed as-is
546 - valid UTF-8/ASCII strings are passed as-is
530 - other strings are converted to UTF-8b surrogate encoding
547 - other strings are converted to UTF-8b surrogate encoding
531 - apply JSON-specified string escaping
548 - apply JSON-specified string escaping
532
549
533 (escapes are doubled in these tests)
550 (escapes are doubled in these tests)
534
551
535 >>> jsonescape(b'this is a test')
552 >>> jsonescape(b'this is a test')
536 'this is a test'
553 'this is a test'
537 >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
554 >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
538 'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
555 'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
539 >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
556 >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
540 'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
557 'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
541 >>> jsonescape(b'a weird byte: \\xdd')
558 >>> jsonescape(b'a weird byte: \\xdd')
542 'a weird byte: \\xed\\xb3\\x9d'
559 'a weird byte: \\xed\\xb3\\x9d'
543 >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
560 >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
544 'utf-8: caf\\xc3\\xa9'
561 'utf-8: caf\\xc3\\xa9'
545 >>> jsonescape(b'')
562 >>> jsonescape(b'')
546 ''
563 ''
547
564
548 If paranoid, non-ascii and common troublesome characters are also escaped.
565 If paranoid, non-ascii and common troublesome characters are also escaped.
549 This is suitable for web output.
566 This is suitable for web output.
550
567
551 >>> s = b'escape characters: \\0 \\x0b \\x7f'
568 >>> s = b'escape characters: \\0 \\x0b \\x7f'
552 >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
569 >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
553 >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
570 >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
554 >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
571 >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
555 >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
572 >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
556 'escape boundary: ~ \\\\u007f \\\\u0080'
573 'escape boundary: ~ \\\\u007f \\\\u0080'
557 >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
574 >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
558 'a weird byte: \\\\udcdd'
575 'a weird byte: \\\\udcdd'
559 >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
576 >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
560 'utf-8: caf\\\\u00e9'
577 'utf-8: caf\\\\u00e9'
561 >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
578 >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
562 'non-BMP: \\\\ud834\\\\udd1e'
579 'non-BMP: \\\\ud834\\\\udd1e'
563 >>> jsonescape(b'<foo@example.org>', paranoid=True)
580 >>> jsonescape(b'<foo@example.org>', paranoid=True)
564 '\\\\u003cfoo@example.org\\\\u003e'
581 '\\\\u003cfoo@example.org\\\\u003e'
565 """
582 """
566
583
567 u8chars = toutf8b(s)
584 u8chars = toutf8b(s)
568 try:
585 try:
569 return _jsonescapeu8fast(u8chars, paranoid)
586 return _jsonescapeu8fast(u8chars, paranoid)
570 except ValueError:
587 except ValueError:
571 pass
588 pass
572 return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
589 return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
573
590
574
591
575 # We need to decode/encode U+DCxx codes transparently since invalid UTF-8
592 # We need to decode/encode U+DCxx codes transparently since invalid UTF-8
576 # bytes are mapped to that range.
593 # bytes are mapped to that range.
577 if pycompat.ispy3:
594 if pycompat.ispy3:
578 _utf8strict = r'surrogatepass'
595 _utf8strict = r'surrogatepass'
579 else:
596 else:
580 _utf8strict = r'strict'
597 _utf8strict = r'strict'
581
598
582 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
599 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
583
600
584
601
585 def getutf8char(s, pos):
602 def getutf8char(s, pos):
586 # type: (bytes, int) -> bytes
603 # type: (bytes, int) -> bytes
587 """get the next full utf-8 character in the given string, starting at pos
604 """get the next full utf-8 character in the given string, starting at pos
588
605
589 Raises a UnicodeError if the given location does not start a valid
606 Raises a UnicodeError if the given location does not start a valid
590 utf-8 character.
607 utf-8 character.
591 """
608 """
592
609
593 # find how many bytes to attempt decoding from first nibble
610 # find how many bytes to attempt decoding from first nibble
594 l = _utf8len[ord(s[pos : pos + 1]) >> 4]
611 l = _utf8len[ord(s[pos : pos + 1]) >> 4]
595 if not l: # ascii
612 if not l: # ascii
596 return s[pos : pos + 1]
613 return s[pos : pos + 1]
597
614
598 c = s[pos : pos + l]
615 c = s[pos : pos + l]
599 # validate with attempted decode
616 # validate with attempted decode
600 c.decode("utf-8", _utf8strict)
617 c.decode("utf-8", _utf8strict)
601 return c
618 return c
602
619
603
620
604 def toutf8b(s):
621 def toutf8b(s):
605 # type: (bytes) -> bytes
622 # type: (bytes) -> bytes
606 """convert a local, possibly-binary string into UTF-8b
623 """convert a local, possibly-binary string into UTF-8b
607
624
608 This is intended as a generic method to preserve data when working
625 This is intended as a generic method to preserve data when working
609 with schemes like JSON and XML that have no provision for
626 with schemes like JSON and XML that have no provision for
610 arbitrary byte strings. As Mercurial often doesn't know
627 arbitrary byte strings. As Mercurial often doesn't know
611 what encoding data is in, we use so-called UTF-8b.
628 what encoding data is in, we use so-called UTF-8b.
612
629
613 If a string is already valid UTF-8 (or ASCII), it passes unmodified.
630 If a string is already valid UTF-8 (or ASCII), it passes unmodified.
614 Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
631 Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
615 uDC00-uDCFF.
632 uDC00-uDCFF.
616
633
617 Principles of operation:
634 Principles of operation:
618
635
619 - ASCII and UTF-8 data successfully round-trips and is understood
636 - ASCII and UTF-8 data successfully round-trips and is understood
620 by Unicode-oriented clients
637 by Unicode-oriented clients
621 - filenames and file contents in arbitrary other encodings can have
638 - filenames and file contents in arbitrary other encodings can have
622 be round-tripped or recovered by clueful clients
639 be round-tripped or recovered by clueful clients
623 - local strings that have a cached known UTF-8 encoding (aka
640 - local strings that have a cached known UTF-8 encoding (aka
624 localstr) get sent as UTF-8 so Unicode-oriented clients get the
641 localstr) get sent as UTF-8 so Unicode-oriented clients get the
625 Unicode data they want
642 Unicode data they want
626 - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
643 - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
627 - because we must preserve UTF-8 bytestring in places such as
644 - because we must preserve UTF-8 bytestring in places such as
628 filenames, metadata can't be roundtripped without help
645 filenames, metadata can't be roundtripped without help
629
646
630 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
647 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
631 arbitrary bytes into an internal Unicode format that can be
648 arbitrary bytes into an internal Unicode format that can be
632 re-encoded back into the original. Here we are exposing the
649 re-encoded back into the original. Here we are exposing the
633 internal surrogate encoding as a UTF-8 string.)
650 internal surrogate encoding as a UTF-8 string.)
634 """
651 """
635
652
636 if isinstance(s, localstr):
653 if isinstance(s, localstr):
637 # assume that the original UTF-8 sequence would never contain
654 # assume that the original UTF-8 sequence would never contain
638 # invalid characters in U+DCxx range
655 # invalid characters in U+DCxx range
639 return s._utf8
656 return s._utf8
640 elif isinstance(s, safelocalstr):
657 elif isinstance(s, safelocalstr):
641 # already verified that s is non-lossy in legacy encoding, which
658 # already verified that s is non-lossy in legacy encoding, which
642 # shouldn't contain characters in U+DCxx range
659 # shouldn't contain characters in U+DCxx range
643 return fromlocal(s)
660 return fromlocal(s)
644 elif isasciistr(s):
661 elif isasciistr(s):
645 return s
662 return s
646 if b"\xed" not in s:
663 if b"\xed" not in s:
647 try:
664 try:
648 s.decode('utf-8', _utf8strict)
665 s.decode('utf-8', _utf8strict)
649 return s
666 return s
650 except UnicodeDecodeError:
667 except UnicodeDecodeError:
651 pass
668 pass
652
669
653 s = pycompat.bytestr(s)
670 s = pycompat.bytestr(s)
654 r = b""
671 r = b""
655 pos = 0
672 pos = 0
656 l = len(s)
673 l = len(s)
657 while pos < l:
674 while pos < l:
658 try:
675 try:
659 c = getutf8char(s, pos)
676 c = getutf8char(s, pos)
660 if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
677 if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
661 # have to re-escape existing U+DCxx characters
678 # have to re-escape existing U+DCxx characters
662 c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
679 c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
663 pos += 1
680 pos += 1
664 else:
681 else:
665 pos += len(c)
682 pos += len(c)
666 except UnicodeDecodeError:
683 except UnicodeDecodeError:
667 c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
684 c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
668 pos += 1
685 pos += 1
669 r += c
686 r += c
670 return r
687 return r
671
688
672
689
673 def fromutf8b(s):
690 def fromutf8b(s):
674 # type: (bytes) -> bytes
691 # type: (bytes) -> bytes
675 """Given a UTF-8b string, return a local, possibly-binary string.
692 """Given a UTF-8b string, return a local, possibly-binary string.
676
693
677 return the original binary string. This
694 return the original binary string. This
678 is a round-trip process for strings like filenames, but metadata
695 is a round-trip process for strings like filenames, but metadata
679 that's was passed through tolocal will remain in UTF-8.
696 that's was passed through tolocal will remain in UTF-8.
680
697
681 >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
698 >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
682 >>> m = b"\\xc3\\xa9\\x99abcd"
699 >>> m = b"\\xc3\\xa9\\x99abcd"
683 >>> toutf8b(m)
700 >>> toutf8b(m)
684 '\\xc3\\xa9\\xed\\xb2\\x99abcd'
701 '\\xc3\\xa9\\xed\\xb2\\x99abcd'
685 >>> roundtrip(m)
702 >>> roundtrip(m)
686 True
703 True
687 >>> roundtrip(b"\\xc2\\xc2\\x80")
704 >>> roundtrip(b"\\xc2\\xc2\\x80")
688 True
705 True
689 >>> roundtrip(b"\\xef\\xbf\\xbd")
706 >>> roundtrip(b"\\xef\\xbf\\xbd")
690 True
707 True
691 >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
708 >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
692 True
709 True
693 >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
710 >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
694 True
711 True
695 """
712 """
696
713
697 if isasciistr(s):
714 if isasciistr(s):
698 return s
715 return s
699 # fast path - look for uDxxx prefixes in s
716 # fast path - look for uDxxx prefixes in s
700 if b"\xed" not in s:
717 if b"\xed" not in s:
701 return s
718 return s
702
719
703 # We could do this with the unicode type but some Python builds
720 # We could do this with the unicode type but some Python builds
704 # use UTF-16 internally (issue5031) which causes non-BMP code
721 # use UTF-16 internally (issue5031) which causes non-BMP code
705 # points to be escaped. Instead, we use our handy getutf8char
722 # points to be escaped. Instead, we use our handy getutf8char
706 # helper again to walk the string without "decoding" it.
723 # helper again to walk the string without "decoding" it.
707
724
708 s = pycompat.bytestr(s)
725 s = pycompat.bytestr(s)
709 r = b""
726 r = b""
710 pos = 0
727 pos = 0
711 l = len(s)
728 l = len(s)
712 while pos < l:
729 while pos < l:
713 c = getutf8char(s, pos)
730 c = getutf8char(s, pos)
714 pos += len(c)
731 pos += len(c)
715 # unescape U+DCxx characters
732 # unescape U+DCxx characters
716 if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
733 if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
717 c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF)
734 c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF)
718 r += c
735 r += c
719 return r
736 return r
General Comments 0
You need to be logged in to leave comments. Login now