##// END OF EJS Templates
typing: fix argument type of encoding.localstr()...
Yuya Nishihara -
r44079:009c115e default
parent child Browse files
Show More
@@ -1,696 +1,696 b''
1 1 # encoding.py - character transcoding support for Mercurial
2 2 #
3 3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 from __future__ import absolute_import, print_function
9 9
10 10 import locale
11 11 import os
12 12 import unicodedata
13 13
14 14 from .pycompat import getattr
15 15 from . import (
16 16 error,
17 17 policy,
18 18 pycompat,
19 19 )
20 20
21 21 from .pure import charencode as charencodepure
22 22
# `not globals()` is always False at runtime (the module dict is non-empty),
# so this import never executes; pytype still analyzes the body, which lets
# us use typing names in type comments without a runtime typing dependency.
if not globals():  # hide this from non-pytype users
    from typing import (
        Any,
        Callable,
        List,
        Text,
        Type,
        TypeVar,
        Union,
    )

    # keep pyflakes happy
    for t in (Any, Callable, List, Text, Type, Union):
        assert t

    # type variable for localstr.__new__ so subclasses get their own type
    _Tlocalstr = TypeVar('_Tlocalstr', bound='localstr')
39 39
# C-accelerated character-encoding helpers, with a pure-Python fallback
# selected by the policy machinery.
charencode = policy.importmod('charencode')

isasciistr = charencode.isasciistr
asciilower = charencode.asciilower
asciiupper = charencode.asciiupper
_jsonescapeu8fast = charencode.jsonescapeu8fast

_sysstr = pycompat.sysstr

if pycompat.ispy3:
    unichr = chr

# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
_ignore = [
    unichr(int(x, 16)).encode("utf-8")
    for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e "
    b"206a 206b 206c 206d 206e 206f feff".split()
]
# verify the next function will work
assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)
62 62
63 63
def hfsignoreclean(s):
    # type: (bytes) -> bytes
    """Strip the codepoints that HFS+ ignores out of s.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # every ignorable sequence starts with 0xe2 or 0xef (asserted at import
    # time), so a cheap containment check skips the loop in the common case
    if b"\xe2" in s or b"\xef" in s:
        for seq in _ignore:
            s = s.replace(seq, b'')
    return s
77 77
78 78
# encoding.environ is provided read-only, which may not be used to modify
# the process environment
_nativeenviron = not pycompat.ispy3 or os.supports_bytes_environ
if not pycompat.ispy3:
    environ = os.environ  # re-exports
elif _nativeenviron:
    environ = os.environb  # re-exports
else:
    # preferred encoding isn't known yet; use utf-8 to avoid unicode error
    # and recreate it once encoding is settled
    environ = dict(
        (k.encode('utf-8'), v.encode('utf-8'))
        for k, v in os.environ.items()  # re-exports
    )

# canonicalize aliases that locale.getpreferredencoding() may report
_encodingrewrites = {
    b'646': b'ascii',
    b'ANSI_X3.4-1968': b'ascii',
}
# cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
# No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
# https://bugs.python.org/issue13216
if pycompat.iswindows and not pycompat.ispy3:
    _encodingrewrites[b'cp65001'] = b'utf-8'

# the local encoding: HGENCODING wins, then the locale's preferred
# encoding, falling back to ascii if the locale is broken
try:
    encoding = environ.get(b"HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding().encode('ascii') or b'ascii'
    encoding = _encodingrewrites.get(encoding, encoding)
except locale.Error:
    encoding = b'ascii'
# how decode errors are handled ('strict', 'replace', or 'ignore')
encodingmode = environ.get(b"HGENCODINGMODE", b"strict")
# legacy encoding tried when repo data is not valid UTF-8
fallbackencoding = b'ISO-8859-1'
113 113
114 114
class localstr(bytes):
    '''This class allows strings that are unmodified to be
    round-tripped to the local encoding and back'''

    def __new__(cls, u, l):
        # type: (Type[_Tlocalstr], bytes, bytes) -> _Tlocalstr
        # the byte content of the instance is the local-encoding form `l`;
        # the original UTF-8 form `u` is cached so fromlocal() is lossless
        s = bytes.__new__(cls, l)
        s._utf8 = u
        return s

    def __hash__(self):
        return hash(self._utf8)  # avoid collisions in local string space
127 127
128 128
class safelocalstr(bytes):
    """Tagged string denoting it was previously an internal UTF-8 string,
    and can be converted back to UTF-8 losslessly

    (This subclass adds no behavior; it exists only as a marker type.)

    >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
    >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
    >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
    >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
    """
138 138
139 139
def tolocal(s):
    # type: (bytes) -> bytes
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    >>> u = b'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = b'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> b'foo: ?' in d
    False
    >>> l1 = b'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    # pure ASCII is valid in every encoding; return unchanged
    if isasciistr(s):
        return s

    try:
        try:
            # make sure string is actually stored in UTF-8
            u = s.decode('UTF-8')
            if encoding == b'UTF-8':
                # fast path
                return s
            r = u.encode(_sysstr(encoding), "replace")
            if u == r.decode(_sysstr(encoding)):
                # r is a safe, non-lossy encoding of s
                return safelocalstr(r)
            # lossy conversion: tag r with the original UTF-8 bytes
            return localstr(s, r)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                u = s.decode(_sysstr(fallbackencoding))
                r = u.encode(_sysstr(encoding), "replace")
                if u == r.decode(_sysstr(encoding)):
                    # r is a safe, non-lossy encoding of s
                    return safelocalstr(r)
                return localstr(u.encode('UTF-8'), r)
            except UnicodeDecodeError:
                u = s.decode("utf-8", "replace")  # last ditch
                # can't round-trip
                return u.encode(_sysstr(encoding), "replace")
    except LookupError as k:
        # the configured encoding name is unknown to Python
        raise error.Abort(k, hint=b"please check your locale settings")
205 205
206 206
def fromlocal(s):
    # type: (bytes) -> bytes
    """
    Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.
    """

    # can we do a lossless round-trip?
    if isinstance(s, localstr):
        return s._utf8
    if isasciistr(s):
        return s

    try:
        u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
        return u.encode("utf-8")
    except UnicodeDecodeError as inst:
        # show a window of up to 10 bytes on each side of the bad byte
        sub = s[max(0, inst.start - 10) : inst.start + 10]
        raise error.Abort(
            b"decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
        )
    except LookupError as k:
        # the configured encoding name is unknown to Python
        raise error.Abort(k, hint=b"please check your locale settings")
235 235
236 236
def unitolocal(u):
    # type: (Text) -> bytes
    """Convert a unicode string to a byte string of local encoding"""
    utf8bytes = u.encode('utf-8')
    return tolocal(utf8bytes)
241 241
242 242
def unifromlocal(s):
    # type: (bytes) -> Text
    """Convert a byte string of local encoding to a unicode string"""
    utf8bytes = fromlocal(s)
    return utf8bytes.decode('utf-8')
247 247
248 248
def unimethod(bytesfunc):
    # type: (Callable[[Any], bytes]) -> Callable[[Any], Text]
    """Create a proxy method that forwards __unicode__() and __str__() of
    Python 3 to __bytes__()"""
    return lambda obj: unifromlocal(bytesfunc(obj))
258 258
259 259
# converter functions between native str and byte string. use these if the
# character encoding is not aware (e.g. exception message) or is known to
# be locale dependent (e.g. date formatting.)
if pycompat.ispy3:
    strtolocal = unitolocal
    strfromlocal = unifromlocal
    strmethod = unimethod
else:
    # on Python 2 the native str type is already bytes, so these are
    # identity functions (the type comments describe the py3 contract,
    # hence the pytype suppressions)

    def strtolocal(s):
        # type: (str) -> bytes
        return s  # pytype: disable=bad-return-type

    def strfromlocal(s):
        # type: (bytes) -> str
        return s  # pytype: disable=bad-return-type

    strmethod = pycompat.identity

if not _nativeenviron:
    # now encoding and helper functions are available, recreate the environ
    # dict to be exported to other modules
    environ = dict(
        (tolocal(k.encode('utf-8')), tolocal(v.encode('utf-8')))
        for k, v in os.environ.items()  # re-exports
    )
286 286
if pycompat.ispy3:
    # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
    # returns bytes.
    if pycompat.iswindows:
        # Python 3 on Windows issues a DeprecationWarning about using the bytes
        # API when os.getcwdb() is called.
        getcwd = lambda: strtolocal(os.getcwd())  # re-exports
    else:
        getcwd = os.getcwdb  # re-exports
else:
    getcwd = os.getcwd  # re-exports

# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
# The result is the set of east_asian_width() categories counted as wide:
# "W", "F", and optionally "A" (ambiguous).
_wide = _sysstr(
    environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
    and b"WFA"
    or b"WF"
)
305 305
306 306
def colwidth(s):
    # type: (bytes) -> int
    """Find the column width of a string for display in the local encoding"""
    # fix: the original used a b"..." literal here, which on Python 3 is a
    # bytes constant statement, not a docstring (__doc__ stayed None)
    return ucolwidth(s.decode(_sysstr(encoding), 'replace'))
311 311
312 312
def ucolwidth(d):
    # type: (Text) -> int
    """Find the column width of a Unicode string for display"""
    # fix: the original's b"..." leading literal is not a docstring on py3.
    # east_asian_width may be absent on stripped-down unicodedata builds.
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is not None:
        # characters whose width category is in _wide count as 2 columns
        return sum(2 if eaw(c) in _wide else 1 for c in d)
    return len(d)
320 320
321 321
def getcols(s, start, c):
    # type: (bytes, int, int) -> bytes
    '''Use colwidth to find a c-column substring of s starting at byte
    index start'''
    # a c-column substring needs at least c bytes, so begin there and
    # widen one byte at a time until the display width matches
    end = start + c
    n = len(s)
    while end < n:
        candidate = s[start:end]
        if colwidth(candidate) == c:
            return candidate
        end += 1
    raise ValueError('substring not found')
331 331
332 332
def trim(s, width, ellipsis=b'', leftside=False):
    # type: (bytes, int, bytes, bool) -> bytes
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    >>> from .node import bin
    >>> def bprint(s):
    ...     print(pycompat.sysstr(s))
    >>> ellipsis = b'+++'
    >>> from . import encoding
    >>> encoding.encoding = b'utf-8'
    >>> t = b'1234567890'
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    12345+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++67890
    >>> bprint(trim(t, 8))
    12345678
    >>> bprint(trim(t, 8, leftside=True))
    34567890
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(pycompat.sysstr(encoding.encoding))
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 5))
    \xe3\x81\x82\xe3\x81\x84
    >>> bprint(trim(t, 5, leftside=True))
    \xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 4, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
    +++
    >>> t = bin(b'112233445566778899aa') # invalid byte sequence
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8))
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> bprint(trim(t, 8, leftside=True))
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    """
    try:
        u = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        # undecodable input: fall back to trimming by byte count rather
        # than display columns
        if len(s) <= width:  # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0:  # no enough room even for ellipsis
            return ellipsis[: width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width:  # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0:  # no enough room even for ellipsis
        return ellipsis[: width + len(ellipsis)]

    # drop characters one at a time from the trimmed side until the
    # remainder fits in the available columns
    if leftside:
        uslice = lambda i: u[i:]
        concat = lambda s: ellipsis + s
    else:
        uslice = lambda i: u[:-i]
        concat = lambda s: s + ellipsis
    for i in pycompat.xrange(1, len(u)):
        usub = uslice(i)
        if ucolwidth(usub) <= width:
            return concat(usub.encode(_sysstr(encoding)))
    return ellipsis  # no enough room for multi-column characters
429 429
430 430
def lower(s):
    # type: (bytes) -> bytes
    """best-effort encoding-aware case-folding of local string s"""
    # fix: the original used a b"..." literal here, which on Python 3 is a
    # bytes constant statement, not a docstring (__doc__ stayed None)
    try:
        return asciilower(s)
    except UnicodeDecodeError:
        pass
    try:
        if isinstance(s, localstr):
            # use the cached lossless UTF-8 form
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        lu = u.lower()
        if u == lu:
            return s  # preserve localstring
        return lu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.lower()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint=b"please check your locale settings")
452 452
453 453
def upper(s):
    # type: (bytes) -> bytes
    """best-effort encoding-aware case-folding of local string s"""
    # fix: the original used a b"..." literal here, which on Python 3 is a
    # bytes constant statement, not a docstring (__doc__ stayed None)
    try:
        return asciiupper(s)
    except UnicodeDecodeError:
        return upperfallback(s)
461 461
462 462
def upperfallback(s):
    # type: (bytes) -> bytes
    """Encoding-aware uppercasing for non-ASCII local strings.

    Used by upper() when the fast ASCII path raises; mirrors lower().
    """
    try:
        if isinstance(s, localstr):
            # use the cached lossless UTF-8 form
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        uu = u.upper()
        if u == uu:
            return s  # preserve localstring
        return uu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.upper()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint=b"please check your locale settings")
479 479
480 480
class normcasespecs(object):
    '''what a platform's normcase does to ASCII strings

    This is specified per platform, and should be consistent with what normcase
    on that platform actually does.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h.'''

    # integer tags; see the docstring above for what each one means
    lower = -1
    upper = 1
    other = 0
496 496
497 497
def jsonescape(s, paranoid=False):
    # type: (bytes, bool) -> bytes
    '''returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr/safelocalstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape(b'this is a test')
    'this is a test'
    >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
    'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
    >>> jsonescape(b'a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape(b'')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> s = b'escape characters: \\0 \\x0b \\x7f'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape(b'<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    '''

    u8chars = toutf8b(s)
    try:
        # fast path: the C implementation raises ValueError when it cannot
        # handle the input, in which case we fall back to pure Python
        return _jsonescapeu8fast(u8chars, paranoid)
    except ValueError:
        pass
    return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
550 550
551 551
# We need to decode/encode U+DCxx codes transparently since invalid UTF-8
# bytes are mapped to that range.
if pycompat.ispy3:
    _utf8strict = r'surrogatepass'
else:
    _utf8strict = r'strict'

# UTF-8 sequence length indexed by the high nibble of the lead byte.
# 0 makes getutf8char() treat the byte as a single ASCII character;
# continuation-byte nibbles (8-0xb) map to 1 so that a stray continuation
# byte fails getutf8char()'s decode validation.
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
560 560
561 561
def getutf8char(s, pos):
    # type: (bytes, int) -> bytes
    '''get the next full utf-8 character in the given string, starting at pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    '''

    # the high nibble of the lead byte determines the sequence length
    first = s[pos : pos + 1]
    width = _utf8len[ord(first) >> 4]
    if not width:  # single ASCII byte
        return first

    seq = s[pos : pos + width]
    # attempting a decode validates the sequence (raises on bad input)
    seq.decode("utf-8", _utf8strict)
    return seq
579 579
580 580
def toutf8b(s):
    # type: (bytes) -> bytes
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can have
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    if isinstance(s, localstr):
        # assume that the original UTF-8 sequence would never contain
        # invalid characters in U+DCxx range
        return s._utf8
    elif isinstance(s, safelocalstr):
        # already verified that s is non-lossy in legacy encoding, which
        # shouldn't contain characters in U+DCxx range
        return fromlocal(s)
    elif isasciistr(s):
        return s
    # 0xed is the lead byte of U+DCxx surrogates; if absent, any valid
    # UTF-8 input can pass through unchanged
    if b"\xed" not in s:
        try:
            s.decode('utf-8', _utf8strict)
            return s
        except UnicodeDecodeError:
            pass

    # slow path: walk the string character by character, escaping bytes
    # that are not valid UTF-8 into the U+DCxx surrogate range
    s = pycompat.bytestr(s)
    r = b""
    pos = 0
    l = len(s)
    while pos < l:
        try:
            c = getutf8char(s, pos)
            if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
                # have to re-escape existing U+DCxx characters
                c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
                pos += 1
            else:
                pos += len(c)
        except UnicodeDecodeError:
            # invalid byte: map it to U+DC00 + byte value
            c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
            pos += 1
        r += c
    return r
648 648
649 649
def fromutf8b(s):
    # type: (bytes) -> bytes
    '''Given a UTF-8b string, return a local, possibly-binary string.

    return the original binary string. This
    is a round-trip process for strings like filenames, but metadata
    that's was passed through tolocal will remain in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = b"\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip(b"\\xc2\\xc2\\x80")
    True
    >>> roundtrip(b"\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
    True
    '''

    if isasciistr(s):
        return s
    # fast path - look for uDxxx prefixes in s
    if b"\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    s = pycompat.bytestr(s)
    r = b""
    pos = 0
    l = len(s)
    while pos < l:
        c = getutf8char(s, pos)
        pos += len(c)
        # unescape U+DCxx characters back to the raw byte they encode
        if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
            c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF)
        r += c
    return r
General Comments 0
You need to be logged in to leave comments. Login now