##// END OF EJS Templates
typing: add type hints to the `charencode` module...
Matt Harbison -
r52615:43adbe03 default
parent child Browse files
Show More
@@ -1,718 +1,728
1 # encoding.py - character transcoding support for Mercurial
1 # encoding.py - character transcoding support for Mercurial
2 #
2 #
3 # Copyright 2005-2009 Olivia Mackall <olivia@selenic.com> and others
3 # Copyright 2005-2009 Olivia Mackall <olivia@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8
8
9 import locale
9 import locale
10 import os
10 import os
11 import re
11 import re
12 import typing
12 import typing
13 import unicodedata
13 import unicodedata
14
14
15 from typing import (
15 from typing import (
16 Any,
16 Any,
17 Callable,
17 Callable,
18 Text,
18 Text,
19 TypeVar,
19 TypeVar,
20 )
20 )
21
21
22 from . import (
22 from . import (
23 error,
23 error,
24 policy,
24 policy,
25 pycompat,
25 pycompat,
26 )
26 )
27
27
28 from .pure import charencode as charencodepure
28 from .pure import charencode as charencodepure
29
29
# TypeVar bound to localstr; presumably used to type methods returning the
# invoking subclass — the usage is not visible in this chunk (TODO confirm).
_Tlocalstr = TypeVar('_Tlocalstr', bound='localstr')

# Load charencode through the policy loader, which selects the C extension
# when available and falls back to the pure Python implementation.
charencode = policy.importmod('charencode')

isasciistr = charencode.isasciistr
asciilower = charencode.asciilower
asciiupper = charencode.asciiupper
_jsonescapeu8fast = charencode.jsonescapeu8fast

_sysstr = pycompat.sysstr

# Python 3 dropped unichr(); keep the historical name as an alias of chr().
unichr = chr

if typing.TYPE_CHECKING:
    # TODO: make a stub file for .cext.charencode, and import here
    # The pure implementations carry type hints; re-import them for the type
    # checker only — at runtime the policy-selected bindings above remain.
    from .pure.charencode import (
        asciilower,
        asciiupper,
        isasciistr,
        jsonescapeu8fast as _jsonescapeu8fast,
    )
51
52
# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
_ignore = [
    chr(int(codepoint, 16)).encode("utf-8")
    for codepoint in b"200c 200d 200e 200f 202a 202b 202c 202d 202e "
    b"206a 206b 206c 206d 206e 206f feff".split()
]
# verify the next function will work
assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)


def hfsignoreclean(s: bytes) -> bytes:
    """Remove codepoints ignored by HFS+ from s.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # Every ignorable codepoint encodes to UTF-8 starting with 0xe2 or 0xef
    # (asserted above), so a cheap scan short-circuits the common case.
    if b"\xe2" not in s and b"\xef" not in s:
        return s
    for junk in _ignore:
        s = s.replace(junk, b'')
    return s
67
77
68
78
# encoding.environ is provided read-only, which may not be used to modify
# the process environment
_nativeenviron = os.supports_bytes_environ
if _nativeenviron:
    environ = os.environb  # re-exports
    if pycompat.sysplatform == b'OpenVMS':
        # workaround for a bug in VSI 3.10 port
        # os.environb is only populated with a few Predefined symbols
        def newget(self, key, default=None):
            # pytype on linux does not understand OpenVMS special modules
            import _decc  # pytype: disable=import-error

            raw = _decc.getenv(key, None)
            if raw is None:
                return default
            # bytes keys get latin-1 encoded values; str keys pass through
            return raw.encode('latin-1') if isinstance(key, bytes) else raw

        environ.__class__.get = newget
else:
    # preferred encoding isn't known yet; use utf-8 to avoid unicode error
    # and recreate it once encoding is settled
    environ = {
        key.encode('utf-8'): value.encode('utf-8')
        for key, value in os.environ.items()  # re-exports
    }
95
105
# Map codec names reported by the environment/locale to names Python's
# codec registry understands.
_encodingrewrites = {
    b'646': b'ascii',
    b'ANSI_X3.4-1968': b'ascii',
}
# cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
# No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
# https://bugs.python.org/issue13216
if pycompat.iswindows:
    _encodingrewrites[b'cp65001'] = b'utf-8'

encoding: bytes = b''  # help pytype avoid seeing None value
try:
    # HGENCODING overrides the locale; otherwise use the locale's preferred
    # encoding, falling back to ascii when that is empty.
    encoding = environ.get(b"HGENCODING", b'')
    if not encoding:
        encoding = locale.getpreferredencoding().encode('ascii') or b'ascii'
    encoding = _encodingrewrites.get(encoding, encoding)
except locale.Error:
    encoding = b'ascii'
# codec error handler used when decoding local strings: 'strict' by default,
# overridable to e.g. 'replace' or 'ignore' via HGENCODINGMODE.
encodingmode: bytes = environ.get(b"HGENCODINGMODE", b"strict")
fallbackencoding = b'ISO-8859-1'
116
126
117
127
class localstr(bytes):
    """A local-encoding byte string that remembers its original UTF-8 form,
    so the conversion can be round-tripped without loss."""

    def __new__(cls, u, l):
        inst = bytes.__new__(cls, l)
        inst._utf8 = u
        return inst

    if typing.TYPE_CHECKING:
        # pseudo implementation to help pytype see localstr() constructor
        def __init__(self, u: bytes, l: bytes) -> None:
            super(localstr, self).__init__(l)
            self._utf8 = u

    def __hash__(self):
        # hash the cached UTF-8 form to avoid collisions in local string space
        return hash(self._utf8)
135
145
136
146
class safelocalstr(bytes):
    """Marker subclass: the string was an internal UTF-8 string whose local
    encoding is lossless, so it can be converted back to UTF-8 directly.

    Behaves exactly like plain bytes for comparison and hashing:

    >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
    >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
    >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
    >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
    """
146
156
147
157
def tolocal(s: bytes) -> bytes:
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    >>> u = b'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = b'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> b'foo: ?' in d
    False
    >>> l1 = b'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """
    if isasciistr(s):
        return s

    # the codec name used for every encode/decode below
    enc = _sysstr(encoding)
    try:
        try:
            # make sure the string is actually valid UTF-8
            u = s.decode('UTF-8')
            if encoding == b'UTF-8':
                # fast path: the local encoding already is UTF-8
                return s
            r = u.encode(enc, "replace")
            if u == r.decode(enc):
                # r is a safe, non-lossy encoding of s
                return safelocalstr(r)
            return localstr(s, r)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                u = s.decode(_sysstr(fallbackencoding))
                r = u.encode(enc, "replace")
                if u == r.decode(enc):
                    # r is a safe, non-lossy encoding of s
                    return safelocalstr(r)
                return localstr(u.encode('UTF-8'), r)
            except UnicodeDecodeError:
                u = s.decode("utf-8", "replace")  # last ditch
                # can't round-trip
                return u.encode(enc, "replace")
    except LookupError as k:
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
214
224
215
225
def fromlocal(s: bytes) -> bytes:
    """
    Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.
    """
    # can we do a lossless round-trip?
    if isinstance(s, localstr):
        return s._utf8
    if isasciistr(s):
        return s

    try:
        decoded = s.decode(_sysstr(encoding), _sysstr(encodingmode))
        return decoded.encode("utf-8")
    except UnicodeDecodeError as inst:
        # show a window of context around the offending byte
        start = inst.start
        sub = s[max(0, start - 10) : start + 10]
        raise error.Abort(
            b"decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
        )
    except LookupError as k:
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
245
255
246
256
def unitolocal(u: str) -> bytes:
    """Convert a unicode string to a byte string of local encoding"""
    utf8 = u.encode('utf-8')
    return tolocal(utf8)
250
260
251
261
def unifromlocal(s: bytes) -> str:
    """Convert a byte string of local encoding to a unicode string"""
    utf8 = fromlocal(s)
    return utf8.decode('utf-8')
255
265
256
266
def unimethod(bytesfunc: Callable[[Any], bytes]) -> Callable[[Any], str]:
    """Create a proxy method that forwards __unicode__() and __str__() of
    Python 3 to __bytes__()"""

    def unifunc(obj):
        raw = bytesfunc(obj)
        return unifromlocal(raw)

    return unifunc
265
275
266
276
# converter functions between native str and byte string. use these if the
# character encoding is not aware (e.g. exception message) or is known to
# be locale dependent (e.g. date formatting.)
strtolocal = unitolocal
strfromlocal = unifromlocal
strmethod = unimethod
273
283
274
284
def lower(s: bytes) -> bytes:
    """best-effort encoding-aware case-folding of local string s"""
    try:
        return asciilower(s)
    except UnicodeDecodeError:
        pass  # not pure ASCII; fall through to the encoding-aware path
    try:
        if isinstance(s, localstr):
            # decode the cached UTF-8 form losslessly
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        folded = u.lower()
        if folded == u:
            return s  # preserve localstring
        return folded.encode(_sysstr(encoding))
    except UnicodeError:
        return s.lower()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
297
307
298
308
def upper(s: bytes) -> bytes:
    """best-effort encoding-aware case-folding of local string s"""
    try:
        return asciiupper(s)
    except UnicodeDecodeError:
        # not pure ASCII; take the encoding-aware slow path
        return upperfallback(s)
305
315
306
316
def upperfallback(s: bytes) -> bytes:
    """encoding-aware uppercasing for strings that are not pure ASCII

    Mirrors lower(): decode using the local encoding (or the cached UTF-8
    form for localstr), uppercase, and re-encode; fall back to ASCII folding
    when the string cannot be decoded.

    Typed bytes -> bytes (instead of Any -> Any) for consistency with
    lower()/upper(): the only caller, upper(s: bytes) -> bytes, passes
    bytes in and returns this function's result unchanged.
    """
    try:
        if isinstance(s, localstr):
            # decode the cached UTF-8 form losslessly
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        uu = u.upper()
        if u == uu:
            return s  # preserve localstring
        return uu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.upper()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
324
334
325
335
if not _nativeenviron:
    # now encoding and helper functions are available, recreate the environ
    # dict to be exported to other modules
    if pycompat.iswindows:

        class WindowsEnviron(dict):
            """`os.environ` normalizes environment variables to uppercase on windows"""

            def get(self, key, default=None):
                # normalize lookups the same way the OS normalized the keys
                return super().get(upper(key), default)

        environ = WindowsEnviron()

    # convert each name/value through tolocal() now that `encoding` is known
    for k, v in os.environ.items():  # re-exports
        environ[tolocal(k.encode('utf-8'))] = tolocal(v.encode('utf-8'))
341
351
342
352
DRIVE_RE = re.compile(b'^[a-z]:')

# os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
# returns bytes.
if pycompat.iswindows:
    # Python 3 on Windows issues a DeprecationWarning about using the bytes
    # API when os.getcwdb() is called.
    #
    # Additionally, py3.8+ uppercases the drive letter when calling
    # os.path.realpath(), which is used on ``repo.root``. Since those
    # strings are compared in various places as simple strings, also call
    # realpath here. See https://bugs.python.org/issue40368
    #
    # However this is not reliable, so lets explicitly make this drive
    # letter upper case.
    #
    # note: we should consider dropping realpath here since it seems to
    # change the semantic of `getcwd`.

    def getcwd():
        cwd = strtolocal(os.path.realpath(os.getcwd()))  # re-exports
        if DRIVE_RE.match(cwd):
            # force the drive letter to upper case
            cwd = cwd[:1].upper() + cwd[1:]
        return cwd

else:
    getcwd = os.getcwdb  # re-exports
372
382
# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
# The letters are east_asian_width() categories: W(ide), F(ullwidth), and
# optionally A(mbiguous).
_wide = _sysstr(
    b"WFA"
    if environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
    else b"WF"
)
379
389
380
390
def colwidth(s: bytes) -> int:
    """Find the column width of a string for display in the local encoding"""
    decoded = s.decode(_sysstr(encoding), 'replace')
    return ucolwidth(decoded)
384
394
385
395
def ucolwidth(d: Text) -> int:
    """Find the column width of a Unicode string for display"""
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        # no width data available; assume one column per character
        return len(d)
    # characters in the _wide categories occupy two columns, the rest one
    return sum(2 if eaw(ch) in _wide else 1 for ch in d)
392
402
393
403
def getcols(s: bytes, start: int, c: int) -> bytes:
    """Use colwidth to find a c-column substring of s starting at byte
    index start"""
    # widen the byte window until it renders as exactly c display columns
    for end in range(start + c, len(s)):
        candidate = s[start:end]
        if colwidth(candidate) == c:
            return candidate
    raise ValueError('substring not found')
402
412
403
413
def trim(
    s: bytes,
    width: int,
    ellipsis: bytes = b'',
    leftside: bool = False,
) -> bytes:
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    Width is measured in display columns (see ucolwidth()) when 's' decodes
    in the local encoding, and in bytes when it does not.

    >>> def bprint(s):
    ...     print(pycompat.sysstr(s))
    >>> ellipsis = b'+++'
    >>> from . import encoding
    >>> encoding.encoding = b'utf-8'
    >>> t = b'1234567890'
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    12345+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++67890
    >>> bprint(trim(t, 8))
    12345678
    >>> bprint(trim(t, 8, leftside=True))
    34567890
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    """
    try:
        u = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        # invalid byte sequence: fall back to byte-wise trimming
        if len(s) <= width:  # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0:  # no enough room even for ellipsis
            return ellipsis[: width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width:  # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0:  # no enough room even for ellipsis
        return ellipsis[: width + len(ellipsis)]

    glyphs = list(u)
    if leftside:
        glyphs.reverse()
    used = 0
    # the break is guaranteed to fire because the total width exceeds 'width'
    for idx, ch in enumerate(glyphs):
        used += ucolwidth(ch)
        if used > width:
            break
    glyphs = glyphs[:idx]
    if leftside:
        glyphs.reverse()
    u = u''.join(glyphs).encode(_sysstr(encoding))
    if leftside:
        return ellipsis + u
    return u + ellipsis
508
518
509
519
class normcasespecs:
    """Describe what a platform's normcase does to ASCII strings.

    One of these constants is specified per platform and must be consistent
    with what normcase on that platform actually does:

      lower: normcase lowercases ASCII strings
      upper: normcase uppercases ASCII strings
      other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h."""

    lower = -1
    upper = 1
    other = 0
525
535
526
536
def jsonescape(s: bytes, paranoid: bool = False) -> bytes:
    """returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr/safelocalstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape(b'this is a test')
    'this is a test'
    >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
    'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
    >>> jsonescape(b'a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape(b'')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> s = b'escape characters: \\0 \\x0b \\x7f'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape(b'<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    """

    # normalize to UTF-8b first so escaping only has to deal with UTF-8
    u8chars = toutf8b(s)
    try:
        # fast path (possibly the C extension); raises ValueError when a
        # non-ASCII character would need escaping
        return _jsonescapeu8fast(u8chars, paranoid)
    except ValueError:
        pass
    # pure-Python slow path that can escape non-ASCII characters as well
    return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
578
588
579
589
# We need to decode/encode U+DCxx codes transparently since invalid UTF-8
# bytes are mapped to that range.
_utf8strict = r'surrogatepass'

# total byte length of a UTF-8 sequence, indexed by the high nibble of its
# lead byte: 0x0-0x7 -> ASCII (handled as length 1 by getutf8char),
# 0x8-0xB -> 1 (continuation byte, invalid as a lead; decoding will raise),
# 0xC-0xD -> 2-byte, 0xE -> 3-byte, 0xF -> 4-byte sequence
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
585
595
586
596
def getutf8char(s: bytes, pos: int) -> bytes:
    """get the next full utf-8 character in the given string, starting at pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    """
    # the lead byte's high nibble determines how many bytes the sequence
    # should span (see the _utf8len table)
    seqlen = _utf8len[ord(s[pos : pos + 1]) >> 4]
    if not seqlen:
        # plain ASCII: the byte is a complete character on its own
        return s[pos : pos + 1]

    char = s[pos : pos + seqlen]
    # decode purely as validation; the decoded value is discarded and a
    # UnicodeDecodeError propagates for malformed sequences
    char.decode("utf-8", _utf8strict)
    return char
603
613
604
614
def toutf8b(s: bytes) -> bytes:
    """convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can have
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    """

    if isinstance(s, localstr):
        # assume that the original UTF-8 sequence would never contain
        # invalid characters in U+DCxx range
        return s._utf8
    elif isinstance(s, safelocalstr):
        # already verified that s is non-lossy in legacy encoding, which
        # shouldn't contain characters in U+DCxx range
        return fromlocal(s)
    elif isasciistr(s):
        return s
    if b"\xed" not in s:
        # fast path: without a 0xED byte there can be no U+DCxx sequences
        # to re-escape, so already-valid UTF-8 passes through unmodified
        try:
            s.decode('utf-8', _utf8strict)
            return s
        except UnicodeDecodeError:
            pass

    # slow path: walk the string one UTF-8 character at a time
    s = pycompat.bytestr(s)
    r = bytearray()
    pos = 0
    l = len(s)
    while pos < l:
        try:
            c = getutf8char(s, pos)
            if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
                # have to re-escape existing U+DCxx characters
                c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
                pos += 1
            else:
                pos += len(c)
        except UnicodeDecodeError:
            # invalid byte: map it into the U+DC00-U+DCFF surrogate range
            c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
            pos += 1
        r += c
    return bytes(r)
671
681
672
682
def fromutf8b(s: bytes) -> bytes:
    """Given a UTF-8b string, return a local, possibly-binary string.

    return the original binary string. This
    is a round-trip process for strings like filenames, but metadata
    that was passed through tolocal will remain in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = b"\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip(b"\\xc2\\xc2\\x80")
    True
    >>> roundtrip(b"\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
    True
    """

    if isasciistr(s):
        return s
    # fast path - look for uDxxx prefixes in s; without a 0xED byte there
    # are no escaped characters to undo
    if b"\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    s = pycompat.bytestr(s)
    r = bytearray()
    pos = 0
    l = len(s)
    while pos < l:
        c = getutf8char(s, pos)
        pos += len(c)
        # unescape U+DCxx characters back to the original single byte
        if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
            c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF)
        r += c
    return bytes(r)
@@ -1,86 +1,86
1 # charencode.py - miscellaneous character encoding
1 # charencode.py - miscellaneous character encoding
2 #
2 #
3 # Copyright 2005-2009 Olivia Mackall <olivia@selenic.com> and others
3 # Copyright 2005-2009 Olivia Mackall <olivia@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8
8
9 import array
9 import array
10
10
11 from .. import pycompat
11 from .. import pycompat
12
12
13
13
def isasciistr(s: bytes) -> bool:
    """Report whether *s* consists entirely of ASCII bytes."""
    # decoding is used purely as a validity probe; the result is discarded
    try:
        s.decode('ascii')
    except UnicodeDecodeError:
        return False
    return True
20
20
21
21
def asciilower(s: bytes) -> bytes:
    """convert a string to lowercase if ASCII

    Raises UnicodeDecodeError if non-ASCII characters are found."""
    # decode() both validates (raising UnicodeDecodeError on any non-ASCII
    # byte) and yields a str whose lower() agrees with bytes.lower() for
    # ASCII input; re-encoding restores the bytes type callers expect
    return s.decode('ascii').lower().encode('ascii')
28
28
29
29
def asciiupper(s: bytes) -> bytes:
    """convert a string to uppercase if ASCII

    Raises UnicodeDecodeError if non-ASCII characters are found."""
    # decode() both validates (raising UnicodeDecodeError on any non-ASCII
    # byte) and yields a str whose upper() agrees with bytes.upper() for
    # ASCII input; re-encoding restores the bytes type callers expect
    return s.decode('ascii').upper().encode('ascii')
36
36
37
37
# Byte-indexed JSON escape tables: _jsonmap[b] is the JSON representation of
# byte value b. Control characters get \uXXXX (or their short escapes),
# printable ASCII passes through, and 0x80-0xFF pass through raw (they are
# assumed to be part of valid UTF-8 sequences by jsonescapeu8fast's caller).
_jsonmap = []
_jsonmap.extend(b"\\u%04x" % x for x in range(32))
_jsonmap.extend(pycompat.bytechr(x) for x in range(32, 127))
_jsonmap.append(b'\\u007f')
# JSON's dedicated short escapes override the generic \uXXXX forms
_jsonmap[0x09] = b'\\t'
_jsonmap[0x0A] = b'\\n'
_jsonmap[0x22] = b'\\"'
_jsonmap[0x5C] = b'\\\\'
_jsonmap[0x08] = b'\\b'
_jsonmap[0x0C] = b'\\f'
_jsonmap[0x0D] = b'\\r'
# the paranoid variant additionally escapes HTML-sensitive characters and,
# by stopping at 0x7F, forces the fast path to reject non-ASCII input
_paranoidjsonmap = _jsonmap[:]
_paranoidjsonmap[0x3C] = b'\\u003c'  # '<' (e.g. escape "</script>")
_paranoidjsonmap[0x3E] = b'\\u003e'  # '>'
_jsonmap.extend(pycompat.bytechr(x) for x in range(128, 256))
53
53
54
54
def jsonescapeu8fast(u8chars: bytes, paranoid: bool) -> bytes:
    """Convert a UTF-8 byte string to JSON-escaped form (fast path)

    Raises ValueError if non-ASCII characters have to be escaped.
    """
    table = _paranoidjsonmap if paranoid else _jsonmap
    # bytearray iteration yields ints, which index straight into the table;
    # the paranoid table only covers 0x00-0x7F, so any non-ASCII byte
    # falls off the end and surfaces as the documented ValueError
    try:
        return b''.join(table[byte] for byte in bytearray(u8chars))
    except IndexError:
        raise ValueError
68
68
69
69
# 'surrogatepass' lets lone U+DCxx surrogates (UTF-8b's encoding of invalid
# bytes) survive decode/encode instead of raising
_utf8strict = r'surrogatepass'
71
71
72
72
def jsonescapeu8fallback(u8chars: bytes, paranoid: bool) -> bytes:
    """Convert a UTF-8 byte string to JSON-escaped form (slow path)

    Escapes all non-ASCII characters no matter if paranoid is False.
    """
    table = _paranoidjsonmap if paranoid else _jsonmap
    # round-trip through UTF-16 so that non-BMP characters come out as
    # surrogate pairs, matching JSON's \uXXXX escape model
    u16b = u8chars.decode('utf-8', _utf8strict).encode('utf-16', _utf8strict)
    u16codes = array.array('H', u16b)
    u16codes.pop(0)  # drop BOM prepended by encode('utf-16')
    return b''.join(
        table[code] if code < 128 else b'\\u%04x' % code for code in u16codes
    )
General Comments 0
You need to be logged in to leave comments. Login now