##// END OF EJS Templates
typing: add type hints to the `charencode` module...
Matt Harbison -
r52615:43adbe03 default
parent child Browse files
Show More
@@ -1,718 +1,728
1 1 # encoding.py - character transcoding support for Mercurial
2 2 #
3 3 # Copyright 2005-2009 Olivia Mackall <olivia@selenic.com> and others
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8
9 9 import locale
10 10 import os
11 11 import re
12 12 import typing
13 13 import unicodedata
14 14
15 15 from typing import (
16 16 Any,
17 17 Callable,
18 18 Text,
19 19 TypeVar,
20 20 )
21 21
22 22 from . import (
23 23 error,
24 24 policy,
25 25 pycompat,
26 26 )
27 27
28 28 from .pure import charencode as charencodepure
29 29
30 30 _Tlocalstr = TypeVar('_Tlocalstr', bound='localstr')
31 31
32 32 charencode = policy.importmod('charencode')
33 33
34 34 isasciistr = charencode.isasciistr
35 35 asciilower = charencode.asciilower
36 36 asciiupper = charencode.asciiupper
37 37 _jsonescapeu8fast = charencode.jsonescapeu8fast
38 38
39 39 _sysstr = pycompat.sysstr
40 40
41 41 unichr = chr
42 42
43 if typing.TYPE_CHECKING:
44 # TODO: make a stub file for .cext.charencode, and import here
45 from .pure.charencode import (
46 asciilower,
47 asciiupper,
48 isasciistr,
49 jsonescapeu8fast as _jsonescapeu8fast,
50 )
51
52
43 53 # These unicode characters are ignored by HFS+ (Apple Technote 1150,
44 54 # "Unicode Subtleties"), so we need to ignore them in some places for
45 55 # sanity.
46 56 _ignore = [
47 57 unichr(int(x, 16)).encode("utf-8")
48 58 for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e "
49 59 b"206a 206b 206c 206d 206e 206f feff".split()
50 60 ]
51 61 # verify the next function will work
52 62 assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)
53 63
54 64
def hfsignoreclean(s: bytes) -> bytes:
    """Strip the codepoints that HFS+ ignores out of ``s``.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # every ignorable sequence starts with 0xe2 or 0xef (asserted above),
    # so a cheap containment test skips the replace loop in the common case
    if b"\xe2" not in s and b"\xef" not in s:
        return s
    for seq in _ignore:
        s = s.replace(seq, b'')
    return s
67 77
68 78
69 79 # encoding.environ is provided read-only, which may not be used to modify
70 80 # the process environment
71 81 _nativeenviron = os.supports_bytes_environ
72 82 if _nativeenviron:
73 83 environ = os.environb # re-exports
74 84 if pycompat.sysplatform == b'OpenVMS':
75 85 # workaround for a bug in VSI 3.10 port
76 86 # os.environb is only populated with a few Predefined symbols
77 87 def newget(self, key, default=None):
78 88 # pytype on linux does not understand OpenVMS special modules
79 89 import _decc # pytype: disable=import-error
80 90
81 91 v = _decc.getenv(key, None)
82 92 if isinstance(key, bytes):
83 93 return default if v is None else v.encode('latin-1')
84 94 else:
85 95 return default if v is None else v
86 96
87 97 environ.__class__.get = newget
88 98 else:
89 99 # preferred encoding isn't known yet; use utf-8 to avoid unicode error
90 100 # and recreate it once encoding is settled
91 101 environ = {
92 102 k.encode('utf-8'): v.encode('utf-8')
93 103 for k, v in os.environ.items() # re-exports
94 104 }
95 105
96 106 _encodingrewrites = {
97 107 b'646': b'ascii',
98 108 b'ANSI_X3.4-1968': b'ascii',
99 109 }
100 110 # cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
101 111 # No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
102 112 # https://bugs.python.org/issue13216
103 113 if pycompat.iswindows:
104 114 _encodingrewrites[b'cp65001'] = b'utf-8'
105 115
106 116 encoding: bytes = b'' # help pytype avoid seeing None value
107 117 try:
108 118 encoding = environ.get(b"HGENCODING", b'')
109 119 if not encoding:
110 120 encoding = locale.getpreferredencoding().encode('ascii') or b'ascii'
111 121 encoding = _encodingrewrites.get(encoding, encoding)
112 122 except locale.Error:
113 123 encoding = b'ascii'
114 124 encodingmode: bytes = environ.get(b"HGENCODINGMODE", b"strict")
115 125 fallbackencoding = b'ISO-8859-1'
116 126
117 127
class localstr(bytes):
    """Bytes subclass remembering the UTF-8 form of a local-encoding string.

    An instance compares and behaves as the local-encoding bytes, while
    the original UTF-8 bytes are kept on ``_utf8`` so the value can be
    round-tripped back to UTF-8 losslessly.
    """

    def __new__(cls, u, l):
        obj = bytes.__new__(cls, l)
        obj._utf8 = u
        return obj

    if typing.TYPE_CHECKING:
        # pseudo implementation to help pytype see localstr() constructor
        def __init__(self, u: bytes, l: bytes) -> None:
            super(localstr, self).__init__(l)
            self._utf8 = u

    def __hash__(self):
        # hash the UTF-8 form to avoid collisions in local string space
        return hash(self._utf8)
135 145
136 146
class safelocalstr(bytes):
    """Tagged bytes marking a lossless conversion from internal UTF-8.

    The tag records that the value was previously an internal UTF-8
    string and can be converted back to UTF-8 without loss.

    >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
    >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
    >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
    >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
    """
146 156
147 157
def tolocal(s: bytes) -> bytes:
    """Convert a string from internal UTF-8 to the local encoding.

    All internal strings should be UTF-8, but repos predating locale
    support may contain latin1 (or possibly other) data.  Decoding is
    attempted strictly as UTF-8, then as Latin-1, and as a last resort
    UTF-8 with unknown characters replaced.

    The localstr class caches the known UTF-8 form next to the local
    representation to allow lossless round-trip conversion back to UTF-8;
    safelocalstr tags conversions already known to be lossless.
    """
    if isasciistr(s):
        return s

    enc = _sysstr(encoding)
    try:
        try:
            # make sure the string is actually stored in UTF-8
            u = s.decode('UTF-8')
            if encoding == b'UTF-8':
                # fast path
                return s
            local = u.encode(enc, "replace")
            if u == local.decode(enc):
                # local is a safe, non-lossy encoding of s
                return safelocalstr(local)
            return localstr(s, local)
        except UnicodeDecodeError:
            # only reached for changesets predating locale support
            try:
                u = s.decode(_sysstr(fallbackencoding))
                local = u.encode(enc, "replace")
                if u == local.decode(enc):
                    # local is a safe, non-lossy encoding of s
                    return safelocalstr(local)
                return localstr(u.encode('UTF-8'), local)
            except UnicodeDecodeError:
                # last ditch: can't round-trip
                u = s.decode("utf-8", "replace")
                return u.encode(enc, "replace")
    except LookupError as k:
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
214 224
215 225
def fromlocal(s: bytes) -> bytes:
    """Convert a string from the local character encoding to UTF-8.

    Decoding honours the mode set by HGENCODINGMODE, which defaults to
    'strict' (unknown characters abort with an error message); 'replace'
    substitutes a replacement character and 'ignore' drops the character.
    """
    # lossless round-trip available?
    if isinstance(s, localstr):
        return s._utf8
    if isasciistr(s):
        return s

    try:
        decoded = s.decode(_sysstr(encoding), _sysstr(encodingmode))
        return decoded.encode("utf-8")
    except UnicodeDecodeError as inst:
        # show a little context around the offending byte
        sub = s[max(0, inst.start - 10) : inst.start + 10]
        raise error.Abort(
            b"decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
        )
    except LookupError as k:
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
245 255
246 256
def unitolocal(u: str) -> bytes:
    """Convert a unicode string to a byte string in the local encoding."""
    return tolocal(u.encode('utf-8'))
250 260
251 261
def unifromlocal(s: bytes) -> str:
    """Convert a byte string in the local encoding to a unicode string."""
    return fromlocal(s).decode('utf-8')
255 265
256 266
def unimethod(bytesfunc: Callable[[Any], bytes]) -> Callable[[Any], str]:
    """Create a proxy that lets __unicode__() and __str__() of Python 3
    delegate to __bytes__()."""

    def proxy(obj):
        return unifromlocal(bytesfunc(obj))

    return proxy
265 275
266 276
267 277 # converter functions between native str and byte string. use these if the
268 278 # character encoding is not aware (e.g. exception message) or is known to
269 279 # be locale dependent (e.g. date formatting.)
270 280 strtolocal = unitolocal
271 281 strfromlocal = unifromlocal
272 282 strmethod = unimethod
273 283
274 284
def lower(s: bytes) -> bytes:
    """best-effort encoding-aware case-folding of local string s"""
    try:
        # fast path for pure-ASCII input
        return asciilower(s)
    except UnicodeDecodeError:
        pass  # non-ASCII: fall through to the slow path
    try:
        if isinstance(s, localstr):
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        folded = u.lower()
        if folded == u:
            return s  # unchanged: preserve localstr tagging
        return folded.encode(_sysstr(encoding))
    except UnicodeError:
        # we don't know how to fold this except in ASCII
        return s.lower()
    except LookupError as k:
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
297 307
298 308
def upper(s: bytes) -> bytes:
    """best-effort encoding-aware upper-casing of local string s"""
    try:
        # fast path for pure-ASCII input
        return asciiupper(s)
    except UnicodeDecodeError:
        return upperfallback(s)
305 315
306 316
def upperfallback(s: Any) -> Any:
    """Encoding-aware upper-casing for input that is not pure ASCII."""
    try:
        if isinstance(s, localstr):
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        folded = u.upper()
        if folded == u:
            return s  # unchanged: preserve localstr tagging
        return folded.encode(_sysstr(encoding))
    except UnicodeError:
        # we don't know how to fold this except in ASCII
        return s.upper()
    except LookupError as k:
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
324 334
325 335
326 336 if not _nativeenviron:
327 337 # now encoding and helper functions are available, recreate the environ
328 338 # dict to be exported to other modules
329 339 if pycompat.iswindows:
330 340
331 341 class WindowsEnviron(dict):
332 342 """`os.environ` normalizes environment variables to uppercase on windows"""
333 343
334 344 def get(self, key, default=None):
335 345 return super().get(upper(key), default)
336 346
337 347 environ = WindowsEnviron()
338 348
339 349 for k, v in os.environ.items(): # re-exports
340 350 environ[tolocal(k.encode('utf-8'))] = tolocal(v.encode('utf-8'))
341 351
342 352
343 353 DRIVE_RE = re.compile(b'^[a-z]:')
344 354
# os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
# returns bytes.
if pycompat.iswindows:
    # On Windows, Python 3 issues a DeprecationWarning when the bytes API
    # os.getcwdb() is used, so go through the str API instead.
    #
    # Additionally, py3.8+ uppercases the drive letter in
    # os.path.realpath(), which is applied to ``repo.root``.  Those
    # strings are compared as plain strings in various places, so call
    # realpath here too.  See https://bugs.python.org/issue40368
    #
    # That alone is not reliable, so also force the drive letter to upper
    # case explicitly.
    #
    # note: we should consider dropping realpath here since it seems to
    # change the semantic of `getcwd`.

    def getcwd():
        cwd = strtolocal(os.path.realpath(os.getcwd()))  # re-exports
        if DRIVE_RE.match(cwd):
            cwd = cwd[0:1].upper() + cwd[1:]
        return cwd

else:
    getcwd = os.getcwdb  # re-exports
372 382
373 383 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
374 384 _wide = _sysstr(
375 385 environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
376 386 and b"WFA"
377 387 or b"WF"
378 388 )
379 389
380 390
def colwidth(s: bytes) -> int:
    """Find the display column width of ``s`` in the local encoding."""
    return ucolwidth(s.decode(_sysstr(encoding), 'replace'))
384 394
385 395
def ucolwidth(d: Text) -> int:
    """Find the display column width of a Unicode string."""
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        # no east-asian-width data: every character counts as one column
        return len(d)
    # characters whose width class appears in _wide occupy two columns
    return sum(2 if eaw(c) in _wide else 1 for c in d)
392 402
393 403
def getcols(s: bytes, start: int, c: int) -> bytes:
    """Use colwidth to find a c-column substring of s starting at byte
    index start.

    Raises ValueError if no prefix of ``s[start:]`` occupies exactly
    ``c`` display columns.
    """
    # Each character occupies at least one byte, so candidates shorter
    # than ``c`` bytes need not be considered.  The bound must be
    # ``len(s) + 1`` so the substring running to the very end of ``s``
    # is also tried; ``range(start + c, len(s))`` used to skip it,
    # spuriously raising ValueError when the answer was the whole tail.
    for end in range(start + c, len(s) + 1):
        t = s[start:end]
        if colwidth(t) == c:
            return t
    raise ValueError('substring not found')
402 412
403 413
def trim(
    s: bytes,
    width: int,
    ellipsis: bytes = b'',
    leftside: bool = False,
) -> bytes:
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, the left side of 's' is trimmed; 'ellipsis'
    is always placed on the trimmed side.

    >>> from .node import bin
    >>> def bprint(s):
    ...     print(pycompat.sysstr(s))
    >>> ellipsis = b'+++'
    >>> from . import encoding
    >>> encoding.encoding = b'utf-8'
    >>> t = b'1234567890'
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    12345+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++67890
    >>> bprint(trim(t, 8))
    12345678
    >>> bprint(trim(t, 8, leftside=True))
    34567890
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(pycompat.sysstr(encoding.encoding))
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 5))
    \xe3\x81\x82\xe3\x81\x84
    >>> t = bin(b'112233445566778899aa') # invalid byte sequence
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\x66\x77\x88\x99\xaa
    """
    try:
        u = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        # undecodable input: fall back to byte-wise trimming, assuming
        # one column per byte
        if len(s) <= width:  # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0:  # no enough room even for ellipsis
            return ellipsis[: width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width:  # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0:  # no enough room even for ellipsis
        return ellipsis[: width + len(ellipsis)]

    # walk characters from the kept side, accumulating column width,
    # until adding one more would overflow
    chars = list(u)
    if leftside:
        chars.reverse()
    consumed = 0
    for i, c in enumerate(chars):
        consumed += ucolwidth(c)
        if consumed > width:
            break
    chars = chars[:i]
    if leftside:
        chars.reverse()
    trimmed = u''.join(chars).encode(_sysstr(encoding))
    if leftside:
        return ellipsis + trimmed
    return trimmed + ellipsis
508 518
509 519
class normcasespecs:
    """Constants describing what a platform's normcase does to ASCII strings.

    This is specified per platform, and should be consistent with what
    normcase on that platform actually does.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h."""

    lower = -1
    upper = 1
    other = 0
525 535
526 536
def jsonescape(s: bytes, paranoid: bool = False) -> bytes:
    """Return a string suitable for embedding in JSON.

    JSON has no provision for non-Unicode bytes, so:

    - localstr/safelocalstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - JSON-specified string escaping is then applied

    If 'paranoid', non-ascii and common troublesome characters are also
    escaped, which is suitable for web output.

    (escapes are doubled in these tests)

    >>> jsonescape(b'this is a test')
    'this is a test'
    >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
    'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
    >>> jsonescape(b'a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape(b'')
    ''
    >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape(b'<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    """
    u8 = toutf8b(s)
    try:
        # fast path: succeeds while only ASCII needs escaping
        return _jsonescapeu8fast(u8, paranoid)
    except ValueError:
        pass
    # slow pure-python path handles non-ASCII escapes
    return charencodepure.jsonescapeu8fallback(u8, paranoid)
578 588
579 589
580 590 # We need to decode/encode U+DCxx codes transparently since invalid UTF-8
581 591 # bytes are mapped to that range.
582 592 _utf8strict = r'surrogatepass'
583 593
584 594 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
585 595
586 596
def getutf8char(s: bytes, pos: int) -> bytes:
    """Return the full utf-8 character starting at ``pos`` in ``s``.

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    """
    first = s[pos : pos + 1]
    # the high nibble of the lead byte gives the sequence length
    width = _utf8len[ord(first) >> 4]
    if not width:  # single-byte (ascii) character
        return first

    c = s[pos : pos + width]
    # validate with an attempted decode
    c.decode("utf-8", _utf8strict)
    return c
603 613
604 614
def toutf8b(s: bytes) -> bytes:
    """Convert a local, possibly-binary string into UTF-8b.

    This is a generic way to preserve data when working with schemes
    like JSON and XML that have no provision for arbitrary byte strings.
    As Mercurial often doesn't know what encoding data is in, we use
    so-called UTF-8b: a string that is already valid UTF-8 (or ASCII)
    passes unmodified, while unsupported bytes are mapped into the
    UTF-16 surrogate range uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data round-trips and is understood by
      Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can be
      round-tripped or recovered by clueful clients
    - local strings with a cached known UTF-8 encoding (localstr) and
      non-lossy local strings (safelocalstr) are sent as UTF-8
    - because UTF-8 bytestrings must be preserved in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original.  Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    """
    if isinstance(s, localstr):
        # the original UTF-8 sequence is assumed never to contain
        # invalid characters in the U+DCxx range
        return s._utf8
    elif isinstance(s, safelocalstr):
        # already verified non-lossy in the legacy encoding, which
        # shouldn't contain characters in the U+DCxx range
        return fromlocal(s)
    elif isasciistr(s):
        return s
    if b"\xed" not in s:
        try:
            s.decode('utf-8', _utf8strict)
            return s  # valid UTF-8 with no surrogates: pass through
        except UnicodeDecodeError:
            pass

    s = pycompat.bytestr(s)
    out = bytearray()
    pos = 0
    end = len(s)
    while pos < end:
        try:
            c = getutf8char(s, pos)
            if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
                # re-escape an existing U+DCxx character byte by byte
                c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
                pos += 1
            else:
                pos += len(c)
        except UnicodeDecodeError:
            # invalid byte: map it into the U+DCxx surrogate range
            c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
            pos += 1
        out += c
    return bytes(out)
671 681
672 682
def fromutf8b(s: bytes) -> bytes:
    """Given a UTF-8b string, return a local, possibly-binary string.

    Returns the original binary string.  This is a round-trip process
    for strings like filenames, but metadata that was passed through
    tolocal will remain in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = b"\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip(b"\\xc2\\xc2\\x80")
    True
    >>> roundtrip(b"\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
    True
    """
    if isasciistr(s):
        return s
    # fast path - no uDCxx prefixes possible without an 0xed byte
    if b"\xed" not in s:
        return s

    # We could do this with the unicode type, but some Python builds
    # use UTF-16 internally (issue5031), which causes non-BMP code
    # points to be escaped.  Instead, walk the string with the handy
    # getutf8char helper, without "decoding" it.
    s = pycompat.bytestr(s)
    out = bytearray()
    pos = 0
    end = len(s)
    while pos < end:
        c = getutf8char(s, pos)
        pos += len(c)
        if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
            # U+DCxx escape: recover the original byte
            c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF)
        out += c
    return bytes(out)
@@ -1,86 +1,86
1 1 # charencode.py - miscellaneous character encoding
2 2 #
3 3 # Copyright 2005-2009 Olivia Mackall <olivia@selenic.com> and others
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8
9 9 import array
10 10
11 11 from .. import pycompat
12 12
13 13
def isasciistr(s: bytes) -> bool:
    """Report whether ``s`` consists only of ASCII bytes."""
    try:
        s.decode('ascii')
    except UnicodeDecodeError:
        return False
    return True
20 20
21 21
def asciilower(s: bytes) -> bytes:
    """Lower-case ``s``, which must contain only ASCII bytes.

    Raises UnicodeDecodeError if non-ASCII characters are found."""
    s.decode('ascii')  # raises on non-ASCII input
    return s.lower()
28 28
29 29
def asciiupper(s: bytes) -> bytes:
    """Upper-case ``s``, which must contain only ASCII bytes.

    Raises UnicodeDecodeError if non-ASCII characters are found."""
    s.decode('ascii')  # raises on non-ASCII input
    return s.upper()
36 36
37 37
38 38 _jsonmap = []
39 39 _jsonmap.extend(b"\\u%04x" % x for x in range(32))
40 40 _jsonmap.extend(pycompat.bytechr(x) for x in range(32, 127))
41 41 _jsonmap.append(b'\\u007f')
42 42 _jsonmap[0x09] = b'\\t'
43 43 _jsonmap[0x0A] = b'\\n'
44 44 _jsonmap[0x22] = b'\\"'
45 45 _jsonmap[0x5C] = b'\\\\'
46 46 _jsonmap[0x08] = b'\\b'
47 47 _jsonmap[0x0C] = b'\\f'
48 48 _jsonmap[0x0D] = b'\\r'
49 49 _paranoidjsonmap = _jsonmap[:]
50 50 _paranoidjsonmap[0x3C] = b'\\u003c' # '<' (e.g. escape "</script>")
51 51 _paranoidjsonmap[0x3E] = b'\\u003e' # '>'
52 52 _jsonmap.extend(pycompat.bytechr(x) for x in range(128, 256))
53 53
54 54
def jsonescapeu8fast(u8chars: bytes, paranoid: bool) -> bytes:
    """Convert a UTF-8 byte string to JSON-escaped form (fast path)

    Raises ValueError if non-ASCII characters have to be escaped.
    """
    jm = _paranoidjsonmap if paranoid else _jsonmap
    try:
        return b''.join(jm[b] for b in bytearray(u8chars))
    except IndexError:
        # byte beyond the map's range: escaping requires the slow path
        raise ValueError
68 68
69 69
70 70 _utf8strict = r'surrogatepass'
71 71
72 72
def jsonescapeu8fallback(u8chars: bytes, paranoid: bool) -> bytes:
    """Convert a UTF-8 byte string to JSON-escaped form (slow path)

    Escapes all non-ASCII characters no matter if paranoid is False.
    """
    jm = _paranoidjsonmap if paranoid else _jsonmap
    # a non-BMP char is represented as a UTF-16 surrogate pair
    u16b = u8chars.decode('utf-8', _utf8strict).encode('utf-16', _utf8strict)
    codes = array.array('H', u16b)
    codes.pop(0)  # drop BOM
    return b''.join(jm[x] if x < 128 else b'\\u%04x' % x for x in codes)
General Comments 0
You need to be logged in to leave comments. Login now