##// END OF EJS Templates
typing: restore `encoding.encoding` and `encoding.encodingmode` to bytes...
Matt Harbison -
r52566:f70f61a8 default
parent child Browse files
Show More
@@ -1,718 +1,719
1 1 # encoding.py - character transcoding support for Mercurial
2 2 #
3 3 # Copyright 2005-2009 Olivia Mackall <olivia@selenic.com> and others
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8
9 9 import locale
10 10 import os
11 11 import re
12 12 import typing
13 13 import unicodedata
14 14
15 15 from typing import (
16 16 Any,
17 17 Callable,
18 18 Text,
19 19 TypeVar,
20 20 )
21 21
22 22 from . import (
23 23 error,
24 24 policy,
25 25 pycompat,
26 26 )
27 27
28 28 from .pure import charencode as charencodepure
29 29
# TypeVar bound to localstr so subclass-returning helpers keep their type.
_Tlocalstr = TypeVar('_Tlocalstr', bound='localstr')

# C or pure-Python charencode implementation, selected by the policy module.
charencode = policy.importmod('charencode')

isasciistr = charencode.isasciistr
asciilower = charencode.asciilower
asciiupper = charencode.asciiupper
_jsonescapeu8fast = charencode.jsonescapeu8fast

_sysstr = pycompat.sysstr

unichr = chr

# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
_ignore = [
    unichr(int(x, 16)).encode("utf-8")
    for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e "
    b"206a 206b 206c 206d 206e 206f feff".split()
]
# verify the next function will work
assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)
53 53
54 54
def hfsignoreclean(s: bytes) -> bytes:
    """Remove codepoints ignored by HFS+ from s.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # The UTF-8 form of every ignored codepoint begins with 0xe2 or 0xef
    # (asserted at module load), so a cheap containment probe skips the
    # replace loop for the common case.
    if b"\xe2" not in s and b"\xef" not in s:
        return s
    for ignored in _ignore:
        s = s.replace(ignored, b'')
    return s
67 67
68 68
# encoding.environ is provided read-only, which may not be used to modify
# the process environment
_nativeenviron = os.supports_bytes_environ
if _nativeenviron:
    environ = os.environb  # re-exports
    if pycompat.sysplatform == b'OpenVMS':
        # workaround for a bug in VSI 3.10 port
        # os.environb is only populated with a few Predefined symbols
        def newget(self, key, default=None):
            # pytype on linux does not understand OpenVMS special modules
            import _decc  # pytype: disable=import-error

            v = _decc.getenv(key, None)
            if isinstance(key, bytes):
                # bytes key: mirror os.environb by returning bytes values
                return default if v is None else v.encode('latin-1')
            else:
                return default if v is None else v

        environ.__class__.get = newget
else:
    # preferred encoding isn't known yet; use utf-8 to avoid unicode error
    # and recreate it once encoding is settled
    environ = {
        k.encode('utf-8'): v.encode('utf-8')
        for k, v in os.environ.items()  # re-exports
    }

# map locale names reported by the platform to canonical encoding names
_encodingrewrites = {
    b'646': b'ascii',
    b'ANSI_X3.4-1968': b'ascii',
}
# cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
# No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
# https://bugs.python.org/issue13216
if pycompat.iswindows:
    _encodingrewrites[b'cp65001'] = b'utf-8'
105 105
# Detect the local encoding from HGENCODING or the system locale; the
# up-front bytes annotation and b'' default keep pytype from ever seeing
# a None value here.
encoding: bytes = b''  # help pytype avoid seeing None value
try:
    encoding = environ.get(b"HGENCODING", b'')
    if not encoding:
        # fall back to the locale's preferred encoding, defaulting to ascii
        encoding = locale.getpreferredencoding().encode('ascii') or b'ascii'
        encoding = _encodingrewrites.get(encoding, encoding)
except locale.Error:
    encoding = b'ascii'
# how decode errors are handled when converting from the local encoding
encodingmode: bytes = environ.get(b"HGENCODINGMODE", b"strict")
# legacy repos may contain latin1 data; used by tolocal() as a second guess
fallbackencoding = b'ISO-8859-1'
115 116
116 117
class localstr(bytes):
    """This class allows strings that are unmodified to be
    round-tripped to the local encoding and back"""

    def __new__(cls, u, l):
        # u is the UTF-8 form, l is the local-encoding form; the bytes
        # payload is the local form, with the UTF-8 original cached on
        # the instance for lossless recovery by fromlocal().
        s = bytes.__new__(cls, l)
        s._utf8 = u
        return s

    if typing.TYPE_CHECKING:
        # pseudo implementation to help pytype see localstr() constructor
        def __init__(self, u: bytes, l: bytes) -> None:
            super(localstr, self).__init__(l)
            self._utf8 = u

    def __hash__(self):
        return hash(self._utf8)  # avoid collisions in local string space
135 136
class safelocalstr(bytes):
    """Tagged string denoting it was previously an internal UTF-8 string,
    and can be converted back to UTF-8 losslessly

    >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
    >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
    >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
    >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
    """

    # NOTE: a pure marker subclass — no extra state, so it hashes and
    # compares exactly like plain bytes (see doctests above).
145 146
146 147
def tolocal(s: bytes) -> bytes:
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    >>> u = b'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = b'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> b'foo: ?' in d
    False
    >>> l1 = b'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    # pure ASCII needs no conversion in any encoding
    if isasciistr(s):
        return s

    try:
        try:
            # make sure string is actually stored in UTF-8
            u = s.decode('UTF-8')
            if encoding == b'UTF-8':
                # fast path
                return s
            r = u.encode(_sysstr(encoding), "replace")
            if u == r.decode(_sysstr(encoding)):
                # r is a safe, non-lossy encoding of s
                return safelocalstr(r)
            # lossy conversion: cache the UTF-8 original next to the
            # local form so fromlocal() can round-trip it
            return localstr(s, r)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                u = s.decode(_sysstr(fallbackencoding))
                r = u.encode(_sysstr(encoding), "replace")
                if u == r.decode(_sysstr(encoding)):
                    # r is a safe, non-lossy encoding of s
                    return safelocalstr(r)
                return localstr(u.encode('UTF-8'), r)
            except UnicodeDecodeError:
                u = s.decode("utf-8", "replace")  # last ditch
                # can't round-trip
                return u.encode(_sysstr(encoding), "replace")
    except LookupError as k:
        # the configured encoding name is unknown to Python
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
213 214
214 215
def fromlocal(s: bytes) -> bytes:
    """
    Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.
    """

    # can we do a lossless round-trip?
    if isinstance(s, localstr):
        return s._utf8
    if isasciistr(s):
        return s

    try:
        u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
        return u.encode("utf-8")
    except UnicodeDecodeError as inst:
        # show a small window of context around the offending byte
        sub = s[max(0, inst.start - 10) : inst.start + 10]
        raise error.Abort(
            b"decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
        )
    except LookupError as k:
        # the configured encoding name is unknown to Python
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
244 245
245 246
def unitolocal(u: str) -> bytes:
    """Convert a unicode string to a byte string of local encoding"""
    utf8bytes = u.encode('utf-8')
    return tolocal(utf8bytes)
249 250
250 251
def unifromlocal(s: bytes) -> str:
    """Convert a byte string of local encoding to a unicode string"""
    utf8bytes = fromlocal(s)
    return utf8bytes.decode('utf-8')
254 255
255 256
def unimethod(bytesfunc: Callable[[Any], bytes]) -> Callable[[Any], str]:
    """Create a proxy method that forwards __unicode__() and __str__() of
    Python 3 to __bytes__()"""

    def proxy(obj):
        # produce the bytes form, then decode it to a native str
        raw = bytesfunc(obj)
        return unifromlocal(raw)

    return proxy
264 265
265 266
# converter functions between native str and byte string. use these if the
# character encoding is not aware (e.g. exception message) or is known to
# be locale dependent (e.g. date formatting.)
# These are simple aliases: on Python 3, native str IS unicode.
strtolocal = unitolocal
strfromlocal = unifromlocal
strmethod = unimethod
272 273
273 274
def lower(s: bytes) -> bytes:
    """best-effort encoding-aware case-folding of local string s"""
    try:
        # fast path for pure-ASCII input
        return asciilower(s)
    except UnicodeDecodeError:
        pass
    try:
        if isinstance(s, localstr):
            # use the cached lossless UTF-8 form
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        lu = u.lower()
        if u == lu:
            return s  # preserve localstring
        return lu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.lower()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        # the configured encoding name is unknown to Python
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
296 297
297 298
def upper(s: bytes) -> bytes:
    """best-effort encoding-aware case-folding of local string s"""
    try:
        # fast path for pure-ASCII input
        return asciiupper(s)
    except UnicodeDecodeError:
        pass
    return upperfallback(s)
304 305
305 306
def upperfallback(s: Any) -> Any:
    # Encoding-aware uppercase for input that is not pure ASCII; mirrors
    # lower() above.  Returns the input unchanged when folding is a no-op.
    try:
        if isinstance(s, localstr):
            # use the cached lossless UTF-8 form
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        uu = u.upper()
        if u == uu:
            return s  # preserve localstring
        return uu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.upper()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        # the configured encoding name is unknown to Python
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
323 324
324 325
if not _nativeenviron:
    # now encoding and helper functions are available, recreate the environ
    # dict to be exported to other modules
    if pycompat.iswindows:

        class WindowsEnviron(dict):
            """`os.environ` normalizes environment variables to uppercase on windows"""

            def get(self, key, default=None):
                # case-insensitive lookup, matching os.environ on Windows
                return super().get(upper(key), default)

        environ = WindowsEnviron()

    for k, v in os.environ.items():  # re-exports
        environ[tolocal(k.encode('utf-8'))] = tolocal(v.encode('utf-8'))
340 341
341 342
# matches a lowercase drive-letter prefix, e.g. b'c:'
DRIVE_RE = re.compile(b'^[a-z]:')

# os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
# returns bytes.
if pycompat.iswindows:
    # Python 3 on Windows issues a DeprecationWarning about using the bytes
    # API when os.getcwdb() is called.
    #
    # Additionally, py3.8+ uppercases the drive letter when calling
    # os.path.realpath(), which is used on ``repo.root``. Since those
    # strings are compared in various places as simple strings, also call
    # realpath here. See https://bugs.python.org/issue40368
    #
    # However this is not reliable, so lets explicitly make this drive
    # letter upper case.
    #
    # note: we should consider dropping realpath here since it seems to
    # change the semantic of `getcwd`.

    def getcwd():
        cwd = os.getcwd()  # re-exports
        cwd = os.path.realpath(cwd)
        cwd = strtolocal(cwd)
        if DRIVE_RE.match(cwd):
            # force the drive letter to upper case (see comment above)
            cwd = cwd[0:1].upper() + cwd[1:]
        return cwd


else:
    getcwd = os.getcwdb  # re-exports

# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
_wide = _sysstr(
    environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
    and b"WFA"
    or b"WF"
)
379 380
380 381
def colwidth(s: bytes) -> int:
    """Find the column width of a string for display in the local encoding"""
    decoded = s.decode(_sysstr(encoding), 'replace')
    return ucolwidth(decoded)
384 385
385 386
def ucolwidth(d: Text) -> int:
    """Find the column width of a Unicode string for display"""
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        # no east-asian-width data available; count one column per char
        return len(d)
    # characters whose width class is in _wide occupy two columns
    return sum(2 if eaw(ch) in _wide else 1 for ch in d)
392 393
393 394
def getcols(s: bytes, start: int, c: int) -> bytes:
    """Use colwidth to find a c-column substring of s starting at byte
    index start"""
    # widen the candidate one byte at a time until it displays as c columns
    for end in range(start + c, len(s)):
        candidate = s[start:end]
        if colwidth(candidate) == c:
            return candidate
    raise ValueError('substring not found')
402 403
403 404
def trim(
    s: bytes,
    width: int,
    ellipsis: bytes = b'',
    leftside: bool = False,
) -> bytes:
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    >>> from .node import bin
    >>> def bprint(s):
    ...     print(pycompat.sysstr(s))
    >>> ellipsis = b'+++'
    >>> from . import encoding
    >>> encoding.encoding = b'utf-8'
    >>> t = b'1234567890'
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    12345+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++67890
    >>> bprint(trim(t, 8))
    12345678
    >>> bprint(trim(t, 8, leftside=True))
    34567890
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(pycompat.sysstr(encoding.encoding))
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 5))
    \xe3\x81\x82\xe3\x81\x84
    >>> bprint(trim(t, 5, leftside=True))
    \xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 4, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
    +++
    >>> t = bin(b'112233445566778899aa') # invalid byte sequence
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8))
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> bprint(trim(t, 8, leftside=True))
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    """
    try:
        u = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        # not decodable in the local encoding: fall back to trimming by
        # byte count instead of display columns (see invalid-bytes doctests)
        if len(s) <= width:  # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0:  # no enough room even for ellipsis
            return ellipsis[: width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width:  # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0:  # no enough room even for ellipsis
        return ellipsis[: width + len(ellipsis)]

    # accumulate characters (from the kept side) until the column budget
    # would be exceeded
    chars = list(u)
    if leftside:
        chars.reverse()
    width_so_far = 0
    for i, c in enumerate(chars):
        width_so_far += ucolwidth(c)
        if width_so_far > width:
            break
    chars = chars[:i]
    if leftside:
        chars.reverse()
    u = u''.join(chars).encode(_sysstr(encoding))
    if leftside:
        return ellipsis + u
    return u + ellipsis
508 509
509 510
class normcasespecs:
    """what a platform's normcase does to ASCII strings

    This is specified per platform, and should be consistent with what normcase
    on that platform actually does.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h."""

    lower = -1  # normcase lowercases ASCII strings
    upper = 1  # normcase uppercases ASCII strings
    other = 0  # always call the fallback function
525 526
526 527
def jsonescape(s: Any, paranoid: Any = False) -> Any:
    """returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr/safelocalstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape(b'this is a test')
    'this is a test'
    >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
    'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
    >>> jsonescape(b'a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape(b'')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> s = b'escape characters: \\0 \\x0b \\x7f'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape(b'<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    """

    # normalize the input to UTF-8b first, then apply JSON escaping
    u8chars = toutf8b(s)
    try:
        # fast path: native implementation; rejects some inputs with ValueError
        return _jsonescapeu8fast(u8chars, paranoid)
    except ValueError:
        pass
    # slow path: pure-Python fallback handles everything
    return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
578 579
579 580
# We need to decode/encode U+DCxx codes transparently since invalid UTF-8
# bytes are mapped to that range.
_utf8strict = r'surrogatepass'

# Sequence length of a UTF-8 character, indexed by the lead byte's high
# nibble: 0-7 -> ASCII (0), 8-b -> 1 (a lone continuation byte; the later
# decode in getutf8char() then fails), c-d -> 2, e -> 3, f -> 4.
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
585 586
586 587
def getutf8char(s: bytes, pos: int) -> bytes:
    """get the next full utf-8 character in the given string, starting at pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    """
    lead = s[pos : pos + 1]
    # the lead byte's high nibble determines the sequence length
    seqlen = _utf8len[ord(lead) >> 4]
    if not seqlen:  # ascii
        return lead

    char = s[pos : pos + seqlen]
    # validate with attempted decode (surrogates allowed through)
    char.decode("utf-8", _utf8strict)
    return char
603 604
604 605
def toutf8b(s: bytes) -> bytes:
    """convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can have
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    """

    if isinstance(s, localstr):
        # assume that the original UTF-8 sequence would never contain
        # invalid characters in U+DCxx range
        return s._utf8
    elif isinstance(s, safelocalstr):
        # already verified that s is non-lossy in legacy encoding, which
        # shouldn't contain characters in U+DCxx range
        return fromlocal(s)
    elif isasciistr(s):
        return s
    if b"\xed" not in s:
        # no surrogate lead byte present; if the whole string decodes as
        # UTF-8 it can pass through unchanged
        try:
            s.decode('utf-8', _utf8strict)
            return s
        except UnicodeDecodeError:
            pass

    # walk the string character by character, escaping as needed
    s = pycompat.bytestr(s)
    r = bytearray()
    pos = 0
    l = len(s)
    while pos < l:
        try:
            c = getutf8char(s, pos)
            if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
                # have to re-escape existing U+DCxx characters
                c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
                pos += 1
            else:
                pos += len(c)
        except UnicodeDecodeError:
            # invalid byte: map it into the U+DCxx surrogate range
            c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
            pos += 1
        r += c
    return bytes(r)
671 672
672 673
def fromutf8b(s: bytes) -> bytes:
    """Given a UTF-8b string, return a local, possibly-binary string.

    return the original binary string. This
    is a round-trip process for strings like filenames, but metadata
    that was passed through tolocal will remain in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = b"\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip(b"\\xc2\\xc2\\x80")
    True
    >>> roundtrip(b"\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
    True
    """

    if isasciistr(s):
        return s
    # fast path - look for uDxxx prefixes in s
    if b"\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    s = pycompat.bytestr(s)
    r = bytearray()
    pos = 0
    l = len(s)
    while pos < l:
        c = getutf8char(s, pos)
        pos += len(c)
        # unescape U+DCxx characters
        if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
            # the original byte is the low 8 bits of the surrogate codepoint
            c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF)
        r += c
    return bytes(r)
General Comments 0
You need to be logged in to leave comments. Login now