##// END OF EJS Templates
typing: add pseudo localstr.__init__() to help pytype...
Yuya Nishihara -
r44080:da925257 default
parent child Browse files
Show More
@@ -1,696 +1,705 b''
1 1 # encoding.py - character transcoding support for Mercurial
2 2 #
3 3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 from __future__ import absolute_import, print_function
9 9
10 10 import locale
11 11 import os
12 12 import unicodedata
13 13
14 14 from .pycompat import getattr
15 15 from . import (
16 16 error,
17 17 policy,
18 18 pycompat,
19 19 )
20 20
21 21 from .pure import charencode as charencodepure
22 22
# runtime default; pytype re-binds this to typing.TYPE_CHECKING inside the
# unreachable block below, so type-only code can be guarded on it
_TYPE_CHECKING = False

if not globals():  # hide this from non-pytype users
    from typing import (
        Any,
        Callable,
        List,
        TYPE_CHECKING as _TYPE_CHECKING,
        Text,
        Type,
        TypeVar,
        Union,
    )

    # keep pyflakes happy
    for t in (Any, Callable, List, Text, Type, Union):
        assert t

    _Tlocalstr = TypeVar('_Tlocalstr', bound='localstr')

# C or pure-python implementation, chosen by policy
charencode = policy.importmod('charencode')

isasciistr = charencode.isasciistr
asciilower = charencode.asciilower
asciiupper = charencode.asciiupper
_jsonescapeu8fast = charencode.jsonescapeu8fast

_sysstr = pycompat.sysstr

if pycompat.ispy3:
    unichr = chr

# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
_ignore = [
    unichr(int(x, 16)).encode("utf-8")
    for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e "
    b"206a 206b 206c 206d 206e 206f feff".split()
]
# verify the next function will work
assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)
62 65
63 66
def hfsignoreclean(s):
    # type: (bytes) -> bytes
    """Strip the codepoints that HFS+ ignores out of s.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # every ignored sequence starts with 0xe2 or 0xef, so the common
    # case can skip the replace loop entirely
    if b"\xe2" not in s and b"\xef" not in s:
        return s
    for seq in _ignore:
        s = s.replace(seq, b'')
    return s
77 80
78 81
# encoding.environ is provided read-only, which may not be used to modify
# the process environment
_nativeenviron = not pycompat.ispy3 or os.supports_bytes_environ
if not pycompat.ispy3:
    environ = os.environ  # re-exports
elif _nativeenviron:
    environ = os.environb  # re-exports
else:
    # preferred encoding isn't known yet; use utf-8 to avoid unicode error
    # and recreate it once encoding is settled
    environ = dict(
        (k.encode('utf-8'), v.encode('utf-8'))
        for k, v in os.environ.items()  # re-exports
    )

# map non-canonical locale spellings to the canonical codec name
_encodingrewrites = {
    b'646': b'ascii',
    b'ANSI_X3.4-1968': b'ascii',
}
# cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
# No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
# https://bugs.python.org/issue13216
if pycompat.iswindows and not pycompat.ispy3:
    _encodingrewrites[b'cp65001'] = b'utf-8'

# the user-visible encoding: HGENCODING wins, then the locale's
# preferred encoding, then plain ascii as a last resort
try:
    encoding = environ.get(b"HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding().encode('ascii') or b'ascii'
    encoding = _encodingrewrites.get(encoding, encoding)
except locale.Error:
    encoding = b'ascii'
# how decode errors are handled: 'strict', 'replace' or 'ignore'
encodingmode = environ.get(b"HGENCODINGMODE", b"strict")
fallbackencoding = b'ISO-8859-1'
113 116
114 117
class localstr(bytes):
    '''This class allows strings that are unmodified to be
    round-tripped to the local encoding and back

    The bytes payload is the local-encoding form; the original UTF-8
    bytes are cached on the instance as ``_utf8`` so fromlocal() can
    recover them losslessly.
    '''

    def __new__(cls, u, l):
        # u: UTF-8 bytes; l: the same string re-encoded into the local
        # encoding (possibly lossy), which becomes the bytes value
        s = bytes.__new__(cls, l)
        s._utf8 = u
        return s

    if _TYPE_CHECKING:
        # pseudo implementation to help pytype see localstr() constructor
        def __init__(self, u, l):
            # type: (bytes, bytes) -> None
            super(localstr, self).__init__(l)
            self._utf8 = u

    def __hash__(self):
        return hash(self._utf8)  # avoid collisions in local string space
127 136
128 137
class safelocalstr(bytes):
    """Tagged string denoting it was previously an internal UTF-8 string,
    and can be converted back to UTF-8 losslessly

    Unlike localstr, this carries no cached UTF-8 payload and keeps the
    plain bytes hash/equality semantics, so it interoperates with bytes
    keys in dicts and sets:

    >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
    >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
    >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
    >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
    """
138 147
139 148
def tolocal(s):
    # type: (bytes) -> bytes
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    >>> u = b'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = b'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> b'foo: ?' in d
    False
    >>> l1 = b'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    # pure ASCII is valid in any encoding; nothing to convert
    if isasciistr(s):
        return s

    try:
        try:
            # make sure string is actually stored in UTF-8
            u = s.decode('UTF-8')
            if encoding == b'UTF-8':
                # fast path
                return s
            r = u.encode(_sysstr(encoding), "replace")
            if u == r.decode(_sysstr(encoding)):
                # r is a safe, non-lossy encoding of s
                return safelocalstr(r)
            # lossy conversion: tag the result with the original UTF-8
            # so fromlocal() can restore it
            return localstr(s, r)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                u = s.decode(_sysstr(fallbackencoding))
                r = u.encode(_sysstr(encoding), "replace")
                if u == r.decode(_sysstr(encoding)):
                    # r is a safe, non-lossy encoding of s
                    return safelocalstr(r)
                return localstr(u.encode('UTF-8'), r)
            except UnicodeDecodeError:
                u = s.decode("utf-8", "replace")  # last ditch
                # can't round-trip
                return u.encode(_sysstr(encoding), "replace")
    except LookupError as k:
        # the configured encoding name is unknown to this Python build
        raise error.Abort(k, hint=b"please check your locale settings")
205 214
206 215
def fromlocal(s):
    # type: (bytes) -> bytes
    """
    Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.
    """

    # can we do a lossless round-trip?
    if isinstance(s, localstr):
        return s._utf8
    if isasciistr(s):
        return s

    try:
        u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
        return u.encode("utf-8")
    except UnicodeDecodeError as inst:
        # show up to 10 bytes of context on either side of the failure
        sub = s[max(0, inst.start - 10) : inst.start + 10]
        raise error.Abort(
            b"decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
        )
    except LookupError as k:
        raise error.Abort(k, hint=b"please check your locale settings")
235 244
236 245
def unitolocal(u):
    # type: (Text) -> bytes
    """Encode a unicode string into a byte string in the local encoding."""
    utf8bytes = u.encode('utf-8')
    return tolocal(utf8bytes)
241 250
242 251
def unifromlocal(s):
    # type: (bytes) -> Text
    """Decode a byte string in the local encoding into a unicode string."""
    utf8bytes = fromlocal(s)
    return utf8bytes.decode('utf-8')
247 256
248 257
def unimethod(bytesfunc):
    # type: (Callable[[Any], bytes]) -> Callable[[Any], Text]
    """Wrap a __bytes__() implementation so it can serve as __str__()
    (and __unicode__()) on Python 3, returning text instead of bytes."""

    def textfunc(obj):
        return unifromlocal(bytesfunc(obj))

    return textfunc
258 267
259 268
# converter functions between native str and byte string. use these if the
# character encoding is not aware (e.g. exception message) or is known to
# be locale dependent (e.g. date formatting.)
if pycompat.ispy3:
    strtolocal = unitolocal
    strfromlocal = unifromlocal
    strmethod = unimethod
else:
    # on Python 2 the native str already is a byte string, so these
    # converters are (typed) identity functions

    def strtolocal(s):
        # type: (str) -> bytes
        return s  # pytype: disable=bad-return-type

    def strfromlocal(s):
        # type: (bytes) -> str
        return s  # pytype: disable=bad-return-type

    strmethod = pycompat.identity
278 287
if not _nativeenviron:
    # now encoding and helper functions are available, recreate the environ
    # dict to be exported to other modules
    environ = dict(
        (tolocal(k.encode('utf-8')), tolocal(v.encode('utf-8')))
        for k, v in os.environ.items()  # re-exports
    )

if pycompat.ispy3:
    # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
    # returns bytes.
    if pycompat.iswindows:
        # Python 3 on Windows issues a DeprecationWarning about using the bytes
        # API when os.getcwdb() is called.
        getcwd = lambda: strtolocal(os.getcwd())  # re-exports
    else:
        getcwd = os.getcwdb  # re-exports
else:
    getcwd = os.getcwd  # re-exports

# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
# _wide is the set of east_asian_width() classes that ucolwidth() counts
# as occupying two columns.
_wide = _sysstr(
    environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
    and b"WFA"
    or b"WF"
)
305 314
306 315
def colwidth(s):
    # type: (bytes) -> int
    b"Find the column width of a string for display in the local encoding"
    decoded = s.decode(_sysstr(encoding), 'replace')
    return ucolwidth(decoded)
311 320
312 321
def ucolwidth(d):
    # type: (Text) -> int
    b"Find the column width of a Unicode string for display"
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        # no east-asian width data available; assume one column per char
        return len(d)
    return sum(2 if eaw(c) in _wide else 1 for c in d)
320 329
321 330
def getcols(s, start, c):
    # type: (bytes, int, int) -> bytes
    '''Use colwidth to find a c-column substring of s starting at byte
    index start'''
    # a c-column substring needs at least c bytes, so begin there and
    # grow one byte at a time until the display width matches
    for end in pycompat.xrange(start + c, len(s)):
        candidate = s[start:end]
        if colwidth(candidate) == c:
            return candidate
    raise ValueError('substring not found')
331 340
332 341
def trim(s, width, ellipsis=b'', leftside=False):
    # type: (bytes, int, bytes, bool) -> bytes
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    >>> from .node import bin
    >>> def bprint(s):
    ...     print(pycompat.sysstr(s))
    >>> ellipsis = b'+++'
    >>> from . import encoding
    >>> encoding.encoding = b'utf-8'
    >>> t = b'1234567890'
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    12345+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++67890
    >>> bprint(trim(t, 8))
    12345678
    >>> bprint(trim(t, 8, leftside=True))
    34567890
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(pycompat.sysstr(encoding.encoding))
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 5))
    \xe3\x81\x82\xe3\x81\x84
    >>> bprint(trim(t, 5, leftside=True))
    \xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 4, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
    +++
    >>> t = bin(b'112233445566778899aa') # invalid byte sequence
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8))
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> bprint(trim(t, 8, leftside=True))
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    """
    try:
        u = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        # undecodable input: fall back to trimming by raw byte count
        if len(s) <= width:  # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0:  # not enough room even for ellipsis
            return ellipsis[: width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width:  # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0:  # not enough room even for ellipsis
        return ellipsis[: width + len(ellipsis)]

    # drop characters one at a time from the trimmed side until the
    # remaining display width fits
    if leftside:
        uslice = lambda i: u[i:]
        concat = lambda s: ellipsis + s
    else:
        uslice = lambda i: u[:-i]
        concat = lambda s: s + ellipsis
    for i in pycompat.xrange(1, len(u)):
        usub = uslice(i)
        if ucolwidth(usub) <= width:
            return concat(usub.encode(_sysstr(encoding)))
    return ellipsis  # not enough room for multi-column characters
429 438
430 439
def lower(s):
    # type: (bytes) -> bytes
    b"best-effort encoding-aware case-folding of local string s"
    try:
        # fast path for pure-ASCII strings
        return asciilower(s)
    except UnicodeDecodeError:
        pass
    try:
        if isinstance(s, localstr):
            # use the cached lossless UTF-8 form rather than the
            # (possibly lossy) local bytes
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        lu = u.lower()
        if u == lu:
            return s  # preserve localstring
        return lu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.lower()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint=b"please check your locale settings")
452 461
453 462
def upper(s):
    # type: (bytes) -> bytes
    b"best-effort encoding-aware upper-casing of local string s"
    # fast path for pure-ASCII strings; anything else takes the
    # encoding-aware fallback
    try:
        result = asciiupper(s)
    except UnicodeDecodeError:
        result = upperfallback(s)
    return result
461 470
462 471
def upperfallback(s):
    # type: (Any) -> Any
    # encoding-aware upper-casing for non-ASCII strings; mirrors lower()
    try:
        if isinstance(s, localstr):
            # use the cached lossless UTF-8 form rather than the
            # (possibly lossy) local bytes
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        uu = u.upper()
        if u == uu:
            return s  # preserve localstring
        return uu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.upper()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint=b"please check your locale settings")
479 488
480 489
class normcasespecs(object):
    '''Enumerates the possible effects of a platform's normcase on ASCII.

    This is specified per platform, and should be consistent with what
    normcase on that platform actually does.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h.'''

    lower = -1
    upper = 1
    other = 0
496 505
497 506
def jsonescape(s, paranoid=False):
    # type: (Any, Any) -> Any
    '''returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr/safelocalstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape(b'this is a test')
    'this is a test'
    >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
    'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
    >>> jsonescape(b'a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape(b'')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> s = b'escape characters: \\0 \\x0b \\x7f'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape(b'<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    '''

    u8chars = toutf8b(s)
    try:
        # fast path (C implementation where available)
        return _jsonescapeu8fast(u8chars, paranoid)
    except ValueError:
        pass
    # the fast path raised ValueError; use the pure-python fallback
    return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
550 559
551 560
# We need to decode/encode U+DCxx codes transparently since invalid UTF-8
# bytes are mapped to that range.
if pycompat.ispy3:
    _utf8strict = r'surrogatepass'
else:
    _utf8strict = r'strict'

# number of bytes to attempt decoding for a UTF-8 sequence, indexed by
# the high nibble of its first byte (0 means a plain ASCII byte)
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
560 569
561 570
def getutf8char(s, pos):
    # type: (bytes, int) -> bytes
    '''get the next full utf-8 character in the given string, starting at pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    '''

    # the first byte's high nibble tells us how many bytes to attempt
    # to decode
    width = _utf8len[ord(s[pos : pos + 1]) >> 4]
    if not width:  # plain ascii byte
        return s[pos : pos + 1]

    c = s[pos : pos + width]
    # validate with attempted decode
    c.decode("utf-8", _utf8strict)
    return c
579 588
580 589
def toutf8b(s):
    # type: (bytes) -> bytes
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    if isinstance(s, localstr):
        # assume that the original UTF-8 sequence would never contain
        # invalid characters in U+DCxx range
        return s._utf8
    elif isinstance(s, safelocalstr):
        # already verified that s is non-lossy in legacy encoding, which
        # shouldn't contain characters in U+DCxx range
        return fromlocal(s)
    elif isasciistr(s):
        return s
    if b"\xed" not in s:
        # no 0xed byte means no pre-existing U+DCxx sequences; if the
        # string is also valid UTF-8 it can pass through unchanged
        try:
            s.decode('utf-8', _utf8strict)
            return s
        except UnicodeDecodeError:
            pass

    # slow path: walk character by character, escaping invalid bytes
    # (and re-escaping pre-existing U+DCxx characters) into U+DCxx
    s = pycompat.bytestr(s)
    r = b""
    pos = 0
    l = len(s)
    while pos < l:
        try:
            c = getutf8char(s, pos)
            if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
                # have to re-escape existing U+DCxx characters
                c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
                pos += 1
            else:
                pos += len(c)
        except UnicodeDecodeError:
            # invalid byte: map it into the U+DCxx surrogate range
            c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
            pos += 1
        r += c
    return r
648 657
649 658
def fromutf8b(s):
    # type: (bytes) -> bytes
    '''Given a UTF-8b string, return a local, possibly-binary string.

    return the original binary string. This
    is a round-trip process for strings like filenames, but metadata
    that was passed through tolocal will remain in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = b"\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip(b"\\xc2\\xc2\\x80")
    True
    >>> roundtrip(b"\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
    True
    '''

    if isasciistr(s):
        return s
    # fast path - look for uDxxx prefixes in s
    if b"\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    s = pycompat.bytestr(s)
    r = b""
    pos = 0
    l = len(s)
    while pos < l:
        c = getutf8char(s, pos)
        pos += len(c)
        # unescape U+DCxx characters
        if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
            # recover the original byte from the low 8 bits of U+DCxx
            c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF)
        r += c
    return r
General Comments 0
You need to be logged in to leave comments. Login now