##// END OF EJS Templates
windows: wrap `os.getcwd()` in `os.path.realpath()` on py3...
Matt Harbison -
r47037:3dfebba9 default
parent child Browse files
Show More
@@ -1,704 +1,709 b''
1 1 # encoding.py - character transcoding support for Mercurial
2 2 #
3 3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 from __future__ import absolute_import, print_function
9 9
10 10 import locale
11 11 import os
12 12 import unicodedata
13 13
14 14 from .pycompat import getattr
15 15 from . import (
16 16 error,
17 17 policy,
18 18 pycompat,
19 19 )
20 20
21 21 from .pure import charencode as charencodepure
22 22
23 23 if pycompat.TYPE_CHECKING:
24 24 from typing import (
25 25 Any,
26 26 Callable,
27 27 List,
28 28 Text,
29 29 Type,
30 30 TypeVar,
31 31 Union,
32 32 )
33 33
34 34 # keep pyflakes happy
35 35 for t in (Any, Callable, List, Text, Type, Union):
36 36 assert t
37 37
38 38 _Tlocalstr = TypeVar('_Tlocalstr', bound='localstr')
39 39
40 40 charencode = policy.importmod('charencode')
41 41
42 42 isasciistr = charencode.isasciistr
43 43 asciilower = charencode.asciilower
44 44 asciiupper = charencode.asciiupper
45 45 _jsonescapeu8fast = charencode.jsonescapeu8fast
46 46
47 47 _sysstr = pycompat.sysstr
48 48
if pycompat.ispy3:
    # Python 3 has no unichr() builtin; alias chr so the table below
    # builds identically on both major versions.
    unichr = chr

# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
_ignore = [
    unichr(int(x, 16)).encode("utf-8")
    for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e "
    b"206a 206b 206c 206d 206e 206f feff".split()
]
# verify the next function will work: every ignored codepoint's UTF-8
# encoding must begin with 0xe2 or 0xef, so hfsignoreclean() can use a
# cheap containment pre-check.
assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)
62 62
63 63
def hfsignoreclean(s):
    # type: (bytes) -> bytes
    """Strip the codepoints that HFS+ ignores out of ``s``.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # Every ignored codepoint encodes to UTF-8 starting with 0xe2 or 0xef
    # (asserted at module load), so skip the scan in the common case.
    if b"\xe2" not in s and b"\xef" not in s:
        return s
    for junk in _ignore:
        s = s.replace(junk, b'')
    return s
77 77
78 78
# encoding.environ is provided read-only, which may not be used to modify
# the process environment
_nativeenviron = not pycompat.ispy3 or os.supports_bytes_environ
if not pycompat.ispy3:
    environ = os.environ  # re-exports
elif _nativeenviron:
    environ = os.environb  # re-exports
else:
    # preferred encoding isn't known yet; use utf-8 to avoid unicode error
    # and recreate it once encoding is settled (see the `if not
    # _nativeenviron` block further down)
    environ = {
        k.encode('utf-8'): v.encode('utf-8')
        for k, v in os.environ.items()  # re-exports
    }

# locale names reported by Python that we normalize to canonical forms
_encodingrewrites = {
    b'646': b'ascii',
    b'ANSI_X3.4-1968': b'ascii',
}
# cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
# No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
# https://bugs.python.org/issue13216
if pycompat.iswindows and not pycompat.ispy3:
    _encodingrewrites[b'cp65001'] = b'utf-8'

# determine the local encoding: HGENCODING wins, then the locale's
# preferred encoding, then 'ascii' as a last resort
try:
    encoding = environ.get(b"HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding().encode('ascii') or b'ascii'
    encoding = _encodingrewrites.get(encoding, encoding)
except locale.Error:
    encoding = b'ascii'
encodingmode = environ.get(b"HGENCODINGMODE", b"strict")
fallbackencoding = b'ISO-8859-1'
113 113
114 114
class localstr(bytes):
    """This class allows strings that are unmodified to be
    round-tripped to the local encoding and back"""

    def __new__(cls, u, l):
        # ``l`` is the (possibly lossy) local-encoding form and becomes the
        # bytes value; ``u`` is the original UTF-8 form, cached so that
        # fromlocal()/toutf8b() can recover it losslessly.
        s = bytes.__new__(cls, l)
        s._utf8 = u
        return s

    if pycompat.TYPE_CHECKING:
        # pseudo implementation to help pytype see localstr() constructor
        def __init__(self, u, l):
            # type: (bytes, bytes) -> None
            super(localstr, self).__init__(l)
            self._utf8 = u

    def __hash__(self):
        return hash(self._utf8)  # avoid collisions in local string space
133 133
134 134
class safelocalstr(bytes):
    """Marker subclass for a local string known to be non-lossy.

    A ``safelocalstr`` was previously an internal UTF-8 string and can be
    converted back to UTF-8 without loss; otherwise it behaves exactly
    like the plain bytes it wraps.

    >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
    >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
    >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
    >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
    """
144 144
145 145
def tolocal(s):
    # type: (bytes) -> bytes
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    >>> u = b'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = b'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> b'foo: ?' in d
    False
    >>> l1 = b'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    if isasciistr(s):
        # ASCII is valid (and identical) in every supported encoding
        return s

    try:
        try:
            # make sure string is actually stored in UTF-8
            u = s.decode('UTF-8')
            if encoding == b'UTF-8':
                # fast path
                return s
            r = u.encode(_sysstr(encoding), "replace")
            if u == r.decode(_sysstr(encoding)):
                # r is a safe, non-lossy encoding of s
                return safelocalstr(r)
            # lossy: cache the original UTF-8 alongside the local form
            return localstr(s, r)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                u = s.decode(_sysstr(fallbackencoding))
                r = u.encode(_sysstr(encoding), "replace")
                if u == r.decode(_sysstr(encoding)):
                    # r is a safe, non-lossy encoding of s
                    return safelocalstr(r)
                return localstr(u.encode('UTF-8'), r)
            except UnicodeDecodeError:
                u = s.decode("utf-8", "replace")  # last ditch
                # can't round-trip
                return u.encode(_sysstr(encoding), "replace")
    except LookupError as k:
        # the configured encoding name itself is unknown to Python
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
213 213
214 214
def fromlocal(s):
    # type: (bytes) -> bytes
    """
    Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.

    Raises error.Abort when the string cannot be decoded or the
    configured encoding is unknown to Python.
    """

    # can we do a lossless round-trip?
    if isinstance(s, localstr):
        return s._utf8
    if isasciistr(s):
        return s

    try:
        u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
        return u.encode("utf-8")
    except UnicodeDecodeError as inst:
        # show a little context around the offending byte
        sub = s[max(0, inst.start - 10) : inst.start + 10]
        raise error.Abort(
            b"decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
        )
    except LookupError as k:
        # convert the exception to bytes explicitly, as tolocal() does;
        # passing the exception object relied on implicit coercion
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
243 243
244 244
def unitolocal(u):
    # type: (Text) -> bytes
    """Convert a unicode string to a byte string of local encoding"""
    utf8bytes = u.encode('utf-8')
    return tolocal(utf8bytes)
249 249
250 250
def unifromlocal(s):
    # type: (bytes) -> Text
    """Convert a byte string of local encoding to a unicode string"""
    utf8bytes = fromlocal(s)
    return utf8bytes.decode('utf-8')
255 255
256 256
def unimethod(bytesfunc):
    # type: (Callable[[Any], bytes]) -> Callable[[Any], Text]
    """Create a proxy method that forwards __unicode__() and __str__() of
    Python 3 to __bytes__()"""

    def unifunc(obj):
        raw = bytesfunc(obj)
        return unifromlocal(raw)

    return unifunc
266 266
267 267
# converter functions between native str and byte string. use these if the
# character encoding is not aware (e.g. exception message) or is known to
# be locale dependent (e.g. date formatting.)
if pycompat.ispy3:
    strtolocal = unitolocal
    strfromlocal = unifromlocal
    strmethod = unimethod
else:
    # on Python 2 the native str already is a byte string, so these
    # converters are identity functions

    def strtolocal(s):
        # type: (str) -> bytes
        return s  # pytype: disable=bad-return-type

    def strfromlocal(s):
        # type: (bytes) -> str
        return s  # pytype: disable=bad-return-type

    strmethod = pycompat.identity
286 286
if not _nativeenviron:
    # now encoding and helper functions are available, recreate the environ
    # dict to be exported to other modules
    environ = {
        tolocal(k.encode('utf-8')): tolocal(v.encode('utf-8'))
        for k, v in os.environ.items()  # re-exports
    }

if pycompat.ispy3:
    # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
    # returns bytes.
    if pycompat.iswindows:
        # Python 3 on Windows issues a DeprecationWarning about using the bytes
        # API when os.getcwdb() is called.
        #
        # Additionally, py3.8+ uppercases the drive letter when calling
        # os.path.realpath(), which is used on ``repo.root``. Since those
        # strings are compared in various places as simple strings, also call
        # realpath here. See https://bugs.python.org/issue40368
        getcwd = lambda: strtolocal(os.path.realpath(os.getcwd()))  # re-exports
    else:
        getcwd = os.getcwdb  # re-exports
else:
    getcwd = os.getcwd  # re-exports

# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
# The value is the set of unicodedata.east_asian_width() codes counted as
# two columns by ucolwidth(): Wide/Fullwidth, plus Ambiguous when enabled.
_wide = _sysstr(
    environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
    and b"WFA"
    or b"WF"
)
313 318
314 319
def colwidth(s):
    # type: (bytes) -> int
    """Find the column width of a string for display in the local encoding"""
    decoded = s.decode(_sysstr(encoding), 'replace')
    return ucolwidth(decoded)
319 324
320 325
def ucolwidth(d):
    # type: (Text) -> int
    """Find the column width of a Unicode string for display"""
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is not None:
        # characters whose east-asian-width class is in _wide (Wide,
        # Fullwidth, and optionally Ambiguous) take two columns; sum
        # lazily instead of materializing an intermediate list
        return sum(2 if eaw(c) in _wide else 1 for c in d)
    # no width data available: assume one column per character
    return len(d)
328 333
329 334
def getcols(s, start, c):
    # type: (bytes, int, int) -> bytes
    """Use colwidth to find a c-column substring of s starting at byte
    index start"""
    # grow the candidate one byte at a time until it renders at exactly
    # c columns; multi-byte characters mean byte length != column count
    end = start + c
    while end < len(s):
        candidate = s[start:end]
        if colwidth(candidate) == c:
            return candidate
        end += 1
    raise ValueError('substring not found')
339 344
340 345
def trim(s, width, ellipsis=b'', leftside=False):
    # type: (bytes, int, bytes, bool) -> bytes
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    >>> from .node import bin
    >>> def bprint(s):
    ...     print(pycompat.sysstr(s))
    >>> ellipsis = b'+++'
    >>> from . import encoding
    >>> encoding.encoding = b'utf-8'
    >>> t = b'1234567890'
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    12345+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++67890
    >>> bprint(trim(t, 8))
    12345678
    >>> bprint(trim(t, 8, leftside=True))
    34567890
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(pycompat.sysstr(encoding.encoding))
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 5))
    \xe3\x81\x82\xe3\x81\x84
    >>> bprint(trim(t, 5, leftside=True))
    \xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 4, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
    +++
    >>> t = bin(b'112233445566778899aa') # invalid byte sequence
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8))
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> bprint(trim(t, 8, leftside=True))
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    """
    try:
        u = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        # undecodable input: fall back to trimming raw bytes, treating
        # each byte as one column
        if len(s) <= width:  # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0:  # not enough room even for ellipsis
            return ellipsis[: width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width:  # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0:  # not enough room even for ellipsis
        return ellipsis[: width + len(ellipsis)]

    # drop characters one at a time from the trimmed side until the rest
    # fits; a wide character may free two columns at once
    if leftside:
        uslice = lambda i: u[i:]
        concat = lambda s: ellipsis + s
    else:
        uslice = lambda i: u[:-i]
        concat = lambda s: s + ellipsis
    for i in pycompat.xrange(1, len(u)):
        usub = uslice(i)
        if ucolwidth(usub) <= width:
            return concat(usub.encode(_sysstr(encoding)))
    return ellipsis  # not enough room for multi-column characters
437 442
438 443
def lower(s):
    # type: (bytes) -> bytes
    """best-effort encoding-aware case-folding of local string s"""
    try:
        # fast path: pure-ASCII folding in C
        return asciilower(s)
    except UnicodeDecodeError:
        pass
    try:
        if isinstance(s, localstr):
            # use the cached, known-good UTF-8 form
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        lu = u.lower()
        if u == lu:
            return s  # preserve localstring
        return lu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.lower()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        # convert the exception to bytes explicitly, as tolocal() does;
        # passing the exception object relied on implicit coercion
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
460 465
461 466
def upper(s):
    # type: (bytes) -> bytes
    """best-effort encoding-aware case-folding of local string s"""
    # pure-ASCII strings are uppercased by the fast C helper; anything
    # containing non-ASCII bytes takes the decoding-aware fallback
    try:
        result = asciiupper(s)
    except UnicodeDecodeError:
        result = upperfallback(s)
    return result
469 474
470 475
def upperfallback(s):
    # type: (Any) -> Any
    """encoding-aware uppercasing for strings that are not pure ASCII"""
    try:
        if isinstance(s, localstr):
            # use the cached, known-good UTF-8 form
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        uu = u.upper()
        if u == uu:
            return s  # preserve localstring
        return uu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.upper()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        # convert the exception to bytes explicitly, as tolocal() does;
        # passing the exception object relied on implicit coercion
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
487 492
488 493
class normcasespecs(object):
    """Describes what a platform's normcase does to ASCII strings.

    One spec is declared per platform and must agree with what normcase
    on that platform actually does.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h."""

    other = 0
    lower = -1
    upper = 1
504 509
505 510
def jsonescape(s, paranoid=False):
    # type: (Any, Any) -> Any
    """returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr/safelocalstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape(b'this is a test')
    'this is a test'
    >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
    'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
    >>> jsonescape(b'a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape(b'')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> s = b'escape characters: \\0 \\x0b \\x7f'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape(b'<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    """

    u8chars = toutf8b(s)
    try:
        # the C implementation only handles the common, escape-free case
        # and raises ValueError otherwise
        return _jsonescapeu8fast(u8chars, paranoid)
    except ValueError:
        pass
    # slow pure-Python path that performs the actual escaping
    return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
558 563
559 564
# We need to decode/encode U+DCxx codes transparently since invalid UTF-8
# bytes are mapped to that range.
if pycompat.ispy3:
    _utf8strict = r'surrogatepass'
else:
    _utf8strict = r'strict'

# total byte length of a UTF-8 sequence, indexed by the high nibble of its
# first byte: 0 marks a plain ASCII byte (returned directly by
# getutf8char()); entries 8-11 are 1 so that a bare continuation byte
# fails the single-byte decode check there
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
568 573
569 574
def getutf8char(s, pos):
    # type: (bytes, int) -> bytes
    """get the next full utf-8 character in the given string, starting at pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    """

    # the high nibble of the first byte determines the sequence length
    lead = s[pos : pos + 1]
    width = _utf8len[ord(lead) >> 4]
    if not width:  # ascii
        return lead

    seq = s[pos : pos + width]
    # validate with attempted decode
    seq.decode("utf-8", _utf8strict)
    return seq
587 592
588 593
def toutf8b(s):
    # type: (bytes) -> bytes
    """convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can have
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    """

    if isinstance(s, localstr):
        # assume that the original UTF-8 sequence would never contain
        # invalid characters in U+DCxx range
        return s._utf8
    elif isinstance(s, safelocalstr):
        # already verified that s is non-lossy in legacy encoding, which
        # shouldn't contain characters in U+DCxx range
        return fromlocal(s)
    elif isasciistr(s):
        return s
    if b"\xed" not in s:
        # fast path: no bytes that could encode a U+DCxx surrogate, so a
        # successful strict decode means s is already valid UTF-8
        try:
            s.decode('utf-8', _utf8strict)
            return s
        except UnicodeDecodeError:
            pass

    # slow path: walk the string character by character, escaping both
    # invalid bytes and pre-existing U+DCxx sequences into the surrogate
    # range
    s = pycompat.bytestr(s)
    r = b""
    pos = 0
    l = len(s)
    while pos < l:
        try:
            c = getutf8char(s, pos)
            if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
                # have to re-escape existing U+DCxx characters
                c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
                pos += 1
            else:
                pos += len(c)
        except UnicodeDecodeError:
            # invalid byte: map it into the U+DC00-U+DCFF range
            c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
            pos += 1
        r += c
    return r
656 661
657 662
def fromutf8b(s):
    # type: (bytes) -> bytes
    """Given a UTF-8b string, return a local, possibly-binary string.

    return the original binary string. This
    is a round-trip process for strings like filenames, but metadata
    that's was passed through tolocal will remain in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = b"\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip(b"\\xc2\\xc2\\x80")
    True
    >>> roundtrip(b"\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
    True
    """

    if isasciistr(s):
        return s
    # fast path - look for uDxxx prefixes in s
    if b"\xed" not in s:
        # no byte that could start a U+DCxx escape: nothing to unescape
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    s = pycompat.bytestr(s)
    r = b""
    pos = 0
    l = len(s)
    while pos < l:
        c = getutf8char(s, pos)
        pos += len(c)
        # unescape U+DCxx characters back to their original single byte
        if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
            c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF)
        r += c
    return r
General Comments 0
You need to be logged in to leave comments. Login now