encoding: remove Python 2 support code...
Gregory Szorc
r49747:fa2b1a46 default
@@ -1,745 +1,725 @@
1 1 # encoding.py - character transcoding support for Mercurial
2 2 #
3 3 # Copyright 2005-2009 Olivia Mackall <olivia@selenic.com> and others
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8
9 9 import locale
10 10 import os
11 11 import re
12 12 import unicodedata
13 13
14 14 from .pycompat import getattr
15 15 from . import (
16 16 error,
17 17 policy,
18 18 pycompat,
19 19 )
20 20
21 21 from .pure import charencode as charencodepure
22 22
23 23 if pycompat.TYPE_CHECKING:
24 24 from typing import (
25 25 Any,
26 26 Callable,
27 27 List,
28 28 Text,
29 29 Type,
30 30 TypeVar,
31 31 Union,
32 32 )
33 33
34 34 # keep pyflakes happy
35 35 for t in (Any, Callable, List, Text, Type, Union):
36 36 assert t
37 37
38 38 _Tlocalstr = TypeVar('_Tlocalstr', bound='localstr')
39 39
40 40 charencode = policy.importmod('charencode')
41 41
42 42 isasciistr = charencode.isasciistr
43 43 asciilower = charencode.asciilower
44 44 asciiupper = charencode.asciiupper
45 45 _jsonescapeu8fast = charencode.jsonescapeu8fast
46 46
47 47 _sysstr = pycompat.sysstr
48 48
49 if pycompat.ispy3:
50 49 unichr = chr
51 50
52 51 # These unicode characters are ignored by HFS+ (Apple Technote 1150,
53 52 # "Unicode Subtleties"), so we need to ignore them in some places for
54 53 # sanity.
55 54 _ignore = [
56 55 unichr(int(x, 16)).encode("utf-8")
57 56 for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e "
58 57 b"206a 206b 206c 206d 206e 206f feff".split()
59 58 ]
60 59 # verify the next function will work
61 60 assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)
62 61
63 62
64 63 def hfsignoreclean(s):
65 64 # type: (bytes) -> bytes
66 65 """Remove codepoints ignored by HFS+ from s.
67 66
68 67 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
69 68 '.hg'
70 69 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
71 70 '.hg'
72 71 """
73 72 if b"\xe2" in s or b"\xef" in s:
74 73 for c in _ignore:
75 74 s = s.replace(c, b'')
76 75 return s
77 76
78 77
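
For illustration, a standalone sketch (plain Python, hypothetical helper name, no Mercurial imports) of the byte-level fast path in hfsignoreclean(): every HFS+-ignored codepoint's UTF-8 encoding begins with 0xE2 or 0xEF, so a cheap substring scan can skip the replace loop for most names.

    # All HFS+-ignored codepoints encode to UTF-8 starting with 0xE2 or 0xEF.
    ignored = [
        chr(int(x, 16)).encode("utf-8")
        for x in (
            "200c 200d 200e 200f 202a 202b 202c 202d 202e "
            "206a 206b 206c 206d 206e 206f feff"
        ).split()
    ]
    assert all(seq[:1] in (b"\xe2", b"\xef") for seq in ignored)

    def strip_ignored(name):  # hypothetical helper mirroring hfsignoreclean
        if b"\xe2" in name or b"\xef" in name:  # cheap pre-check
            for seq in ignored:
                name = name.replace(seq, b"")
        return name

    assert strip_ignored(".h\u200cg".encode("utf-8")) == b".hg"
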
79 78 # encoding.environ is provided read-only; it may not be used to modify
80 79 # the process environment
81 _nativeenviron = not pycompat.ispy3 or os.supports_bytes_environ
82 if not pycompat.ispy3:
83 environ = os.environ # re-exports
84 elif _nativeenviron:
80 _nativeenviron = os.supports_bytes_environ
81 if _nativeenviron:
85 82 environ = os.environb # re-exports
86 83 else:
87 84 # preferred encoding isn't known yet; use utf-8 to avoid unicode error
88 85 # and recreate it once encoding is settled
89 86 environ = {
90 87 k.encode('utf-8'): v.encode('utf-8')
91 88 for k, v in os.environ.items() # re-exports
92 89 }
93 90
94 91 _encodingrewrites = {
95 92 b'646': b'ascii',
96 93 b'ANSI_X3.4-1968': b'ascii',
97 94 }
98 95 # cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
99 96 # No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
100 97 # https://bugs.python.org/issue13216
101 if pycompat.iswindows and not pycompat.ispy3:
98 if pycompat.iswindows:
102 99 _encodingrewrites[b'cp65001'] = b'utf-8'
103 100
104 101 try:
105 102 encoding = environ.get(b"HGENCODING")
106 103 if not encoding:
107 104 encoding = locale.getpreferredencoding().encode('ascii') or b'ascii'
108 105 encoding = _encodingrewrites.get(encoding, encoding)
109 106 except locale.Error:
110 107 encoding = b'ascii'
111 108 encodingmode = environ.get(b"HGENCODINGMODE", b"strict")
112 109 fallbackencoding = b'ISO-8859-1'
113 110
114 111
115 112 class localstr(bytes):
116 113 """This class allows strings that are unmodified to be
117 114 round-tripped to the local encoding and back"""
118 115
119 116 def __new__(cls, u, l):
120 117 s = bytes.__new__(cls, l)
121 118 s._utf8 = u
122 119 return s
123 120
124 121 if pycompat.TYPE_CHECKING:
125 122 # pseudo implementation to help pytype see localstr() constructor
126 123 def __init__(self, u, l):
127 124 # type: (bytes, bytes) -> None
128 125 super(localstr, self).__init__(l)
129 126 self._utf8 = u
130 127
131 128 def __hash__(self):
132 129 return hash(self._utf8) # avoid collisions in local string space
133 130
134 131
135 132 class safelocalstr(bytes):
136 133 """Tagged string denoting it was previously an internal UTF-8 string,
137 134 and can be converted back to UTF-8 losslessly
138 135
139 136 >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
140 137 >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
141 138 >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
142 139 >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
143 140 """
144 141
145 142
146 143 def tolocal(s):
147 144 # type: (bytes) -> bytes
148 145 """
149 146 Convert a string from internal UTF-8 to local encoding
150 147
151 148 All internal strings should be UTF-8 but some repos before the
152 149 implementation of locale support may contain latin1 or possibly
153 150 other character sets. We attempt to decode everything strictly
154 151 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
155 152 replace unknown characters.
156 153
157 154 The localstr class is used to cache the known UTF-8 encoding of
158 155 strings next to their local representation to allow lossless
159 156 round-trip conversion back to UTF-8.
160 157
161 158 >>> u = b'foo: \\xc3\\xa4' # utf-8
162 159 >>> l = tolocal(u)
163 160 >>> l
164 161 'foo: ?'
165 162 >>> fromlocal(l)
166 163 'foo: \\xc3\\xa4'
167 164 >>> u2 = b'foo: \\xc3\\xa1'
168 165 >>> d = { l: 1, tolocal(u2): 2 }
169 166 >>> len(d) # no collision
170 167 2
171 168 >>> b'foo: ?' in d
172 169 False
173 170 >>> l1 = b'foo: \\xe4' # historical latin1 fallback
174 171 >>> l = tolocal(l1)
175 172 >>> l
176 173 'foo: ?'
177 174 >>> fromlocal(l) # magically in utf-8
178 175 'foo: \\xc3\\xa4'
179 176 """
180 177
181 178 if isasciistr(s):
182 179 return s
183 180
184 181 try:
185 182 try:
186 183 # make sure string is actually stored in UTF-8
187 184 u = s.decode('UTF-8')
188 185 if encoding == b'UTF-8':
189 186 # fast path
190 187 return s
191 188 r = u.encode(_sysstr(encoding), "replace")
192 189 if u == r.decode(_sysstr(encoding)):
193 190 # r is a safe, non-lossy encoding of s
194 191 return safelocalstr(r)
195 192 return localstr(s, r)
196 193 except UnicodeDecodeError:
197 194 # we should only get here if we're looking at an ancient changeset
198 195 try:
199 196 u = s.decode(_sysstr(fallbackencoding))
200 197 r = u.encode(_sysstr(encoding), "replace")
201 198 if u == r.decode(_sysstr(encoding)):
202 199 # r is a safe, non-lossy encoding of s
203 200 return safelocalstr(r)
204 201 return localstr(u.encode('UTF-8'), r)
205 202 except UnicodeDecodeError:
206 203 u = s.decode("utf-8", "replace") # last ditch
207 204 # can't round-trip
208 205 return u.encode(_sysstr(encoding), "replace")
209 206 except LookupError as k:
210 207 raise error.Abort(
211 208 pycompat.bytestr(k), hint=b"please check your locale settings"
212 209 )
213 210
214 211
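
The caching trick behind localstr is small enough to show on its own; a minimal standalone sketch (hypothetical class name) of how a bytes subclass can behave as the lossy local string while remembering the exact UTF-8 original:

    class _cachedlocal(bytes):  # hypothetical stand-in for localstr
        def __new__(cls, utf8, local):
            s = bytes.__new__(cls, local)
            s._utf8 = utf8
            return s

    u = b'foo: \xc3\xa4'                                  # UTF-8 input
    local = u.decode('utf-8').encode('ascii', 'replace')  # lossy: b'foo: ?'
    cached = _cachedlocal(u, local)
    assert cached == b'foo: ?'    # compares as the local bytes
    assert cached._utf8 == u      # but the exact UTF-8 is recoverable
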
215 212 def fromlocal(s):
216 213 # type: (bytes) -> bytes
217 214 """
218 215 Convert a string from the local character encoding to UTF-8
219 216
220 217 We attempt to decode strings using the encoding mode set by
221 218 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
222 219 characters will cause an error message. Other modes include
223 220 'replace', which replaces unknown characters with a special
224 221 Unicode character, and 'ignore', which drops the character.
225 222 """
226 223
227 224 # can we do a lossless round-trip?
228 225 if isinstance(s, localstr):
229 226 return s._utf8
230 227 if isasciistr(s):
231 228 return s
232 229
233 230 try:
234 231 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
235 232 return u.encode("utf-8")
236 233 except UnicodeDecodeError as inst:
237 234 sub = s[max(0, inst.start - 10) : inst.start + 10]
238 235 raise error.Abort(
239 236 b"decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
240 237 )
241 238 except LookupError as k:
242 239 raise error.Abort(
243 240 pycompat.bytestr(k), hint=b"please check your locale settings"
244 241 )
245 242
246 243
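
The three HGENCODINGMODE values map directly onto Python's codec error handlers; a quick standalone illustration of their behavior:

    raw = b'caf\xe9'  # latin-1 bytes, invalid as UTF-8
    assert raw.decode('utf-8', 'replace') == 'caf\ufffd'  # replacement char
    assert raw.decode('utf-8', 'ignore') == 'caf'         # bad byte dropped
    try:
        raw.decode('utf-8', 'strict')                     # default: raises
    except UnicodeDecodeError:
        pass
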
247 244 def unitolocal(u):
248 245 # type: (Text) -> bytes
249 246 """Convert a unicode string to a byte string of local encoding"""
250 247 return tolocal(u.encode('utf-8'))
251 248
252 249
253 250 def unifromlocal(s):
254 251 # type: (bytes) -> Text
255 252 """Convert a byte string of local encoding to a unicode string"""
256 253 return fromlocal(s).decode('utf-8')
257 254
258 255
259 256 def unimethod(bytesfunc):
260 257 # type: (Callable[[Any], bytes]) -> Callable[[Any], Text]
261 258 """Create a proxy method that forwards __unicode__() and __str__() of
262 259 Python 3 to __bytes__()"""
263 260
264 261 def unifunc(obj):
265 262 return unifromlocal(bytesfunc(obj))
266 263
267 264 return unifunc
268 265
269 266
270 267 # converter functions between native str and byte string. use these if the
271 268 # character encoding is not aware (e.g. exception message) or is known to
272 269 # be locale dependent (e.g. date formatting.)
273 if pycompat.ispy3:
274 270 strtolocal = unitolocal
275 271 strfromlocal = unifromlocal
276 272 strmethod = unimethod
277 else:
278
279 def strtolocal(s):
280 # type: (str) -> bytes
281 return s # pytype: disable=bad-return-type
282
283 def strfromlocal(s):
284 # type: (bytes) -> str
285 return s # pytype: disable=bad-return-type
286
287 strmethod = pycompat.identity
288 273
289 274
290 275 def lower(s):
291 276 # type: (bytes) -> bytes
292 277 """best-effort encoding-aware case-folding of local string s"""
293 278 try:
294 279 return asciilower(s)
295 280 except UnicodeDecodeError:
296 281 pass
297 282 try:
298 283 if isinstance(s, localstr):
299 284 u = s._utf8.decode("utf-8")
300 285 else:
301 286 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
302 287
303 288 lu = u.lower()
304 289 if u == lu:
305 290 return s # preserve localstring
306 291 return lu.encode(_sysstr(encoding))
307 292 except UnicodeError:
308 293 return s.lower() # we don't know how to fold this except in ASCII
309 294 except LookupError as k:
310 295 raise error.Abort(
311 296 pycompat.bytestr(k), hint=b"please check your locale settings"
312 297 )
313 298
314 299
315 300 def upper(s):
316 301 # type: (bytes) -> bytes
317 302 """best-effort encoding-aware case-folding of local string s"""
318 303 try:
319 304 return asciiupper(s)
320 305 except UnicodeDecodeError:
321 306 return upperfallback(s)
322 307
323 308
324 309 def upperfallback(s):
325 310 # type: (Any) -> Any
326 311 try:
327 312 if isinstance(s, localstr):
328 313 u = s._utf8.decode("utf-8")
329 314 else:
330 315 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
331 316
332 317 uu = u.upper()
333 318 if u == uu:
334 319 return s # preserve localstring
335 320 return uu.encode(_sysstr(encoding))
336 321 except UnicodeError:
337 322 return s.upper() # we don't know how to fold this except in ASCII
338 323 except LookupError as k:
339 324 raise error.Abort(
340 325 pycompat.bytestr(k), hint=b"please check your locale settings"
341 326 )
342 327
343 328
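
Why the Unicode path in lower()/upperfallback() matters: bytes.lower() and bytes.upper() only fold ASCII, so non-ASCII text in the local encoding must take the decode-fold-encode route. A standalone illustration with latin-1:

    s = 'É'.encode('latin-1')    # b'\xc9'
    assert s.lower() == b'\xc9'  # bytes folding touches ASCII only
    assert s.decode('latin-1').lower().encode('latin-1') == b'\xe9'  # 'é'
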
344 329 if not _nativeenviron:
345 330 # now encoding and helper functions are available, recreate the environ
346 331 # dict to be exported to other modules
347 if pycompat.iswindows and pycompat.ispy3:
332 if pycompat.iswindows:
348 333
349 334 class WindowsEnviron(dict):
350 335 """`os.environ` normalizes environment variables to uppercase on windows"""
351 336
352 337 def get(self, key, default=None):
353 338 return super().get(upper(key), default)
354 339
355 340 environ = WindowsEnviron()
356 341
357 342 for k, v in os.environ.items(): # re-exports
358 343 environ[tolocal(k.encode('utf-8'))] = tolocal(v.encode('utf-8'))
359 344
360 345
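
A minimal standalone sketch of the WindowsEnviron idea (hypothetical names, plain str keys; the real class holds local-encoding bytes): keys are stored uppercase and lookups normalize, so case-insensitive access matches os.environ's behavior on Windows.

    class UpperKeyEnv(dict):  # hypothetical stand-in for WindowsEnviron
        def get(self, key, default=None):
            return super().get(key.upper(), default)

    env = UpperKeyEnv()
    env['PATH'] = '/usr/bin'              # stored uppercase, as on Windows
    assert env.get('path') == '/usr/bin'  # lookup is case-insensitive
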
361 346 DRIVE_RE = re.compile(b'^[a-z]:')
362 347
363 if pycompat.ispy3:
364 348 # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
365 349 # returns bytes.
366 350 if pycompat.iswindows:
367 351 # Python 3 on Windows issues a DeprecationWarning about using the bytes
368 352 # API when os.getcwdb() is called.
369 353 #
370 354 # Additionally, py3.8+ uppercases the drive letter when calling
371 355 # os.path.realpath(), which is used on ``repo.root``. Since those
372 356 # strings are compared in various places as simple strings, also call
373 357 # realpath here. See https://bugs.python.org/issue40368
374 358 #
375 359 # However this is not reliable, so let's explicitly make the drive
376 360 # letter upper case.
377 361 #
378 362 # note: we should consider dropping realpath here since it seems to
379 363 # change the semantics of `getcwd`.
380 364
381 365 def getcwd():
382 366 cwd = os.getcwd() # re-exports
383 367 cwd = os.path.realpath(cwd)
384 368 cwd = strtolocal(cwd)
385 369 if DRIVE_RE.match(cwd):
386 370 cwd = cwd[0:1].upper() + cwd[1:]
387 371 return cwd
388 372
373
389 374 else:
390 375 getcwd = os.getcwdb # re-exports
391 else:
392 getcwd = os.getcwd # re-exports
393 376
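
The drive-letter fixup is easy to show in isolation; a standalone sketch (hypothetical helper name) of normalizing 'c:' to 'C:' so repo roots compare equal as plain bytes:

    import re

    _drive = re.compile(b'^[a-z]:')

    def normdrive(cwd):  # hypothetical; mirrors the fixup in getcwd()
        if _drive.match(cwd):
            cwd = cwd[0:1].upper() + cwd[1:]
        return cwd

    assert normdrive(b'c:\\repo') == b'C:\\repo'
    assert normdrive(b'C:\\repo') == b'C:\\repo'  # already normalized
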
394 377 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
395 378 _wide = _sysstr(
396 379 environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
397 380 and b"WFA"
398 381 or b"WF"
399 382 )
400 383
401 384
402 385 def colwidth(s):
403 386 # type: (bytes) -> int
404 387 """Find the column width of a string for display in the local encoding"""
405 388 return ucolwidth(s.decode(_sysstr(encoding), 'replace'))
406 389
407 390
408 391 def ucolwidth(d):
409 392 # type: (Text) -> int
410 393 """Find the column width of a Unicode string for display"""
411 394 eaw = getattr(unicodedata, 'east_asian_width', None)
412 395 if eaw is not None:
413 396 return sum([eaw(c) in _wide and 2 or 1 for c in d])
414 397 return len(d)
415 398
416 399
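
A standalone equivalent of ucolwidth() using unicodedata.east_asian_width(): wide ('W') and fullwidth ('F') characters count as two columns, and ambiguous ('A') joins them when HGENCODINGAMBIGUOUS=wide (the 'WFA' case above).

    import unicodedata

    def display_width(text, wide='WF'):  # pass 'WFA' for ambiguous-as-wide
        return sum(
            2 if unicodedata.east_asian_width(c) in wide else 1 for c in text
        )

    assert display_width('abc') == 3
    assert display_width('\u3042\u3044') == 4  # two hiragana, 2 cols each
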
417 400 def getcols(s, start, c):
418 401 # type: (bytes, int, int) -> bytes
419 402 """Use colwidth to find a c-column substring of s starting at byte
420 403 index start"""
421 404 for x in pycompat.xrange(start + c, len(s)):
422 405 t = s[start:x]
423 406 if colwidth(t) == c:
424 407 return t
425 408 raise ValueError('substring not found')
426 409
427 410
428 411 def trim(s, width, ellipsis=b'', leftside=False):
429 412 # type: (bytes, int, bytes, bool) -> bytes
430 413 """Trim string 's' to at most 'width' columns (including 'ellipsis').
431 414
432 415 If 'leftside' is True, left side of string 's' is trimmed.
433 416 'ellipsis' is always placed at trimmed side.
434 417
435 418 >>> from .node import bin
436 419 >>> def bprint(s):
437 420 ... print(pycompat.sysstr(s))
438 421 >>> ellipsis = b'+++'
439 422 >>> from . import encoding
440 423 >>> encoding.encoding = b'utf-8'
441 424 >>> t = b'1234567890'
442 425 >>> bprint(trim(t, 12, ellipsis=ellipsis))
443 426 1234567890
444 427 >>> bprint(trim(t, 10, ellipsis=ellipsis))
445 428 1234567890
446 429 >>> bprint(trim(t, 8, ellipsis=ellipsis))
447 430 12345+++
448 431 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
449 432 +++67890
450 433 >>> bprint(trim(t, 8))
451 434 12345678
452 435 >>> bprint(trim(t, 8, leftside=True))
453 436 34567890
454 437 >>> bprint(trim(t, 3, ellipsis=ellipsis))
455 438 +++
456 439 >>> bprint(trim(t, 1, ellipsis=ellipsis))
457 440 +
458 441 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
459 442 >>> t = u.encode(pycompat.sysstr(encoding.encoding))
460 443 >>> bprint(trim(t, 12, ellipsis=ellipsis))
461 444 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
462 445 >>> bprint(trim(t, 10, ellipsis=ellipsis))
463 446 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
464 447 >>> bprint(trim(t, 8, ellipsis=ellipsis))
465 448 \xe3\x81\x82\xe3\x81\x84+++
466 449 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
467 450 +++\xe3\x81\x88\xe3\x81\x8a
468 451 >>> bprint(trim(t, 5))
469 452 \xe3\x81\x82\xe3\x81\x84
470 453 >>> bprint(trim(t, 5, leftside=True))
471 454 \xe3\x81\x88\xe3\x81\x8a
472 455 >>> bprint(trim(t, 4, ellipsis=ellipsis))
473 456 +++
474 457 >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
475 458 +++
476 459 >>> t = bin(b'112233445566778899aa') # invalid byte sequence
477 460 >>> bprint(trim(t, 12, ellipsis=ellipsis))
478 461 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
479 462 >>> bprint(trim(t, 10, ellipsis=ellipsis))
480 463 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
481 464 >>> bprint(trim(t, 8, ellipsis=ellipsis))
482 465 \x11\x22\x33\x44\x55+++
483 466 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
484 467 +++\x66\x77\x88\x99\xaa
485 468 >>> bprint(trim(t, 8))
486 469 \x11\x22\x33\x44\x55\x66\x77\x88
487 470 >>> bprint(trim(t, 8, leftside=True))
488 471 \x33\x44\x55\x66\x77\x88\x99\xaa
489 472 >>> bprint(trim(t, 3, ellipsis=ellipsis))
490 473 +++
491 474 >>> bprint(trim(t, 1, ellipsis=ellipsis))
492 475 +
493 476 """
494 477 try:
495 478 u = s.decode(_sysstr(encoding))
496 479 except UnicodeDecodeError:
497 480 if len(s) <= width: # trimming is not needed
498 481 return s
499 482 width -= len(ellipsis)
500 483 if width <= 0: # not enough room even for ellipsis
501 484 return ellipsis[: width + len(ellipsis)]
502 485 if leftside:
503 486 return ellipsis + s[-width:]
504 487 return s[:width] + ellipsis
505 488
506 489 if ucolwidth(u) <= width: # trimming is not needed
507 490 return s
508 491
509 492 width -= len(ellipsis)
510 493 if width <= 0: # not enough room even for ellipsis
511 494 return ellipsis[: width + len(ellipsis)]
512 495
513 496 chars = list(u)
514 497 if leftside:
515 498 chars.reverse()
516 499 width_so_far = 0
517 500 for i, c in enumerate(chars):
518 501 width_so_far += ucolwidth(c)
519 502 if width_so_far > width:
520 503 break
521 504 chars = chars[:i]
522 505 if leftside:
523 506 chars.reverse()
524 507 u = u''.join(chars).encode(_sysstr(encoding))
525 508 if leftside:
526 509 return ellipsis + u
527 510 return u + ellipsis
528 511
529 512
530 513 class normcasespecs(object):
531 514 """what a platform's normcase does to ASCII strings
532 515
533 516 This is specified per platform, and should be consistent with what normcase
534 517 on that platform actually does.
535 518
536 519 lower: normcase lowercases ASCII strings
537 520 upper: normcase uppercases ASCII strings
538 521 other: the fallback function should always be called
539 522
540 523 This should be kept in sync with normcase_spec in util.h."""
541 524
542 525 lower = -1
543 526 upper = 1
544 527 other = 0
545 528
546 529
547 530 def jsonescape(s, paranoid=False):
548 531 # type: (Any, Any) -> Any
549 532 """returns a string suitable for JSON
550 533
551 534 JSON is problematic for us because it doesn't support non-Unicode
552 535 bytes. To deal with this, we take the following approach:
553 536
554 537 - localstr/safelocalstr objects are converted back to UTF-8
555 538 - valid UTF-8/ASCII strings are passed as-is
556 539 - other strings are converted to UTF-8b surrogate encoding
557 540 - apply JSON-specified string escaping
558 541
559 542 (escapes are doubled in these tests)
560 543
561 544 >>> jsonescape(b'this is a test')
562 545 'this is a test'
563 546 >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
564 547 'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
565 548 >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
566 549 'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
567 550 >>> jsonescape(b'a weird byte: \\xdd')
568 551 'a weird byte: \\xed\\xb3\\x9d'
569 552 >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
570 553 'utf-8: caf\\xc3\\xa9'
571 554 >>> jsonescape(b'')
572 555 ''
573 556
574 557 If paranoid, non-ascii and common troublesome characters are also escaped.
575 558 This is suitable for web output.
576 559
577 560 >>> s = b'escape characters: \\0 \\x0b \\x7f'
578 561 >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
579 562 >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
580 563 >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
581 564 >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
582 565 'escape boundary: ~ \\\\u007f \\\\u0080'
583 566 >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
584 567 'a weird byte: \\\\udcdd'
585 568 >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
586 569 'utf-8: caf\\\\u00e9'
587 570 >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
588 571 'non-BMP: \\\\ud834\\\\udd1e'
589 572 >>> jsonescape(b'<foo@example.org>', paranoid=True)
590 573 '\\\\u003cfoo@example.org\\\\u003e'
591 574 """
592 575
593 576 u8chars = toutf8b(s)
594 577 try:
595 578 return _jsonescapeu8fast(u8chars, paranoid)
596 579 except ValueError:
597 580 pass
598 581 return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
599 582
600 583
601 584 # We need to decode/encode U+DCxx codes transparently since invalid UTF-8
602 585 # bytes are mapped to that range.
603 if pycompat.ispy3:
604 586 _utf8strict = r'surrogatepass'
605 else:
606 _utf8strict = r'strict'
607 587
608 588 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
609 589
610 590
611 591 def getutf8char(s, pos):
612 592 # type: (bytes, int) -> bytes
613 593 """get the next full utf-8 character in the given string, starting at pos
614 594
615 595 Raises a UnicodeError if the given location does not start a valid
616 596 utf-8 character.
617 597 """
618 598
619 599 # find how many bytes to attempt decoding from first nibble
620 600 l = _utf8len[ord(s[pos : pos + 1]) >> 4]
621 601 if not l: # ascii
622 602 return s[pos : pos + 1]
623 603
624 604 c = s[pos : pos + l]
625 605 # validate with attempted decode
626 606 c.decode("utf-8", _utf8strict)
627 607 return c
628 608
629 609
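
The _utf8len table maps a byte's high nibble to the length of the UTF-8 sequence it introduces: nibbles 0x0-0x7 are ASCII (handled by the early return), 0xC-0xD lead 2-byte, 0xE 3-byte, and 0xF 4-byte sequences. For example:

    assert _utf8len[0x41 >> 4] == 0  # 'A': ASCII fast path
    assert _utf8len[0xC3 >> 4] == 2  # b'\xc3\xa9' -> U+00E9, 2 bytes
    assert _utf8len[0xE3 >> 4] == 3  # b'\xe3\x81\x82' -> U+3042, 3 bytes
    assert _utf8len[0xF0 >> 4] == 4  # non-BMP lead byte, 4 bytes
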
630 610 def toutf8b(s):
631 611 # type: (bytes) -> bytes
632 612 """convert a local, possibly-binary string into UTF-8b
633 613
634 614 This is intended as a generic method to preserve data when working
635 615 with schemes like JSON and XML that have no provision for
636 616 arbitrary byte strings. As Mercurial often doesn't know
637 617 what encoding data is in, we use so-called UTF-8b.
638 618
639 619 If a string is already valid UTF-8 (or ASCII), it passes unmodified.
640 620 Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
641 621 uDC00-uDCFF.
642 622
643 623 Principles of operation:
644 624
645 625 - ASCII and UTF-8 data successfully round-trips and is understood
646 626 by Unicode-oriented clients
647 627 - filenames and file contents in arbitrary other encodings can be
648 628 round-tripped or recovered by clueful clients
649 629 - local strings that have a cached known UTF-8 encoding (aka
650 630 localstr) get sent as UTF-8 so Unicode-oriented clients get the
651 631 Unicode data they want
652 632 - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
653 633 - because we must preserve UTF-8 bytestrings in places such as
654 634 filenames, metadata can't be round-tripped without help
655 635
656 636 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
657 637 arbitrary bytes into an internal Unicode format that can be
658 638 re-encoded back into the original. Here we are exposing the
659 639 internal surrogate encoding as a UTF-8 string.)
660 640 """
661 641
662 642 if isinstance(s, localstr):
663 643 # assume that the original UTF-8 sequence would never contain
664 644 # invalid characters in U+DCxx range
665 645 return s._utf8
666 646 elif isinstance(s, safelocalstr):
667 647 # already verified that s is non-lossy in legacy encoding, which
668 648 # shouldn't contain characters in U+DCxx range
669 649 return fromlocal(s)
670 650 elif isasciistr(s):
671 651 return s
672 652 if b"\xed" not in s:
673 653 try:
674 654 s.decode('utf-8', _utf8strict)
675 655 return s
676 656 except UnicodeDecodeError:
677 657 pass
678 658
679 659 s = pycompat.bytestr(s)
680 660 r = b""
681 661 pos = 0
682 662 l = len(s)
683 663 while pos < l:
684 664 try:
685 665 c = getutf8char(s, pos)
686 666 if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
687 667 # have to re-escape existing U+DCxx characters
688 668 c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
689 669 pos += 1
690 670 else:
691 671 pos += len(c)
692 672 except UnicodeDecodeError:
693 673 c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
694 674 pos += 1
695 675 r += c
696 676 return r
697 677
698 678
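
The U+DC00-U+DCFF mapping is the same trick as Python's built-in 'surrogateescape' error handler, which round-trips arbitrary bytes through str; a standalone illustration:

    raw = b'caf\xe9'                            # not valid UTF-8
    s = raw.decode('utf-8', 'surrogateescape')  # 'caf\udce9'
    assert s.encode('utf-8', 'surrogateescape') == raw  # lossless round-trip
    # toutf8b() differs in that it emits the escaped form as UTF-8 bytes
    # (via the 'surrogatepass' handler) rather than returning a str.
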
699 679 def fromutf8b(s):
700 680 # type: (bytes) -> bytes
701 681 """Given a UTF-8b string, return a local, possibly-binary string.
702 682
703 683 Returns the original binary string. This
704 684 is a round-trip process for strings like filenames, but metadata
705 685 that was passed through tolocal will remain in UTF-8.
706 686
707 687 >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
708 688 >>> m = b"\\xc3\\xa9\\x99abcd"
709 689 >>> toutf8b(m)
710 690 '\\xc3\\xa9\\xed\\xb2\\x99abcd'
711 691 >>> roundtrip(m)
712 692 True
713 693 >>> roundtrip(b"\\xc2\\xc2\\x80")
714 694 True
715 695 >>> roundtrip(b"\\xef\\xbf\\xbd")
716 696 True
717 697 >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
718 698 True
719 699 >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
720 700 True
721 701 """
722 702
723 703 if isasciistr(s):
724 704 return s
725 705 # fast path - look for uDxxx prefixes in s
726 706 if b"\xed" not in s:
727 707 return s
728 708
729 709 # We could do this with the unicode type but some Python builds
730 710 # use UTF-16 internally (issue5031) which causes non-BMP code
731 711 # points to be escaped. Instead, we use our handy getutf8char
732 712 # helper again to walk the string without "decoding" it.
733 713
734 714 s = pycompat.bytestr(s)
735 715 r = b""
736 716 pos = 0
737 717 l = len(s)
738 718 while pos < l:
739 719 c = getutf8char(s, pos)
740 720 pos += len(c)
741 721 # unescape U+DCxx characters
742 722 if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
743 723 c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF)
744 724 r += c
745 725 return r