##// END OF EJS Templates
windows: replicate the normalizing behavior of os.environ...
Raphaël Gomès -
r48360:af633293 default
parent child Browse files
Show More
@@ -1,710 +1,719 b''
1 1 # encoding.py - character transcoding support for Mercurial
2 2 #
3 3 # Copyright 2005-2009 Olivia Mackall <olivia@selenic.com> and others
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 from __future__ import absolute_import, print_function
9 9
10 10 import locale
11 11 import os
12 12 import unicodedata
13 13
14 14 from .pycompat import getattr
15 15 from . import (
16 16 error,
17 17 policy,
18 18 pycompat,
19 19 )
20 20
21 21 from .pure import charencode as charencodepure
22 22
if pycompat.TYPE_CHECKING:
    from typing import (
        Any,
        Callable,
        List,
        Text,
        Type,
        TypeVar,
        Union,
    )

    # keep pyflakes happy
    for t in (Any, Callable, List, Text, Type, Union):
        assert t

    _Tlocalstr = TypeVar('_Tlocalstr', bound='localstr')

# load the charencode implementation selected by the policy module
charencode = policy.importmod('charencode')

isasciistr = charencode.isasciistr
asciilower = charencode.asciilower
asciiupper = charencode.asciiupper
_jsonescapeu8fast = charencode.jsonescapeu8fast

# shorthand for converting bytes to the native str type
_sysstr = pycompat.sysstr
48 48
if pycompat.ispy3:
    # Python 3 has no unichr(); chr() already produces unicode characters
    unichr = chr

# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
_ignore = [
    unichr(int(x, 16)).encode("utf-8")
    for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e "
    b"206a 206b 206c 206d 206e 206f feff".split()
]
# verify the next function will work
assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)
62 62
63 63
def hfsignoreclean(s):
    # type: (bytes) -> bytes
    """Remove codepoints ignored by HFS+ from s.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # Every ignorable codepoint encodes to UTF-8 starting with 0xe2 or
    # 0xef, so this cheap scan skips the replace loop for most strings.
    if b"\xe2" not in s and b"\xef" not in s:
        return s
    for junk in _ignore:
        s = s.replace(junk, b'')
    return s
77 77
78 78
# encoding.environ is provided read-only, which may not be used to modify
# the process environment
_nativeenviron = not pycompat.ispy3 or os.supports_bytes_environ
if not pycompat.ispy3:
    environ = os.environ  # re-exports
elif _nativeenviron:
    environ = os.environb  # re-exports
else:
    # preferred encoding isn't known yet; use utf-8 to avoid unicode error
    # and recreate it once encoding is settled
    environ = {
        k.encode('utf-8'): v.encode('utf-8')
        for k, v in os.environ.items()  # re-exports
    }

# map locale names reported by the platform to names Python's codec
# machinery understands
_encodingrewrites = {
    b'646': b'ascii',
    b'ANSI_X3.4-1968': b'ascii',
}
# cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
# No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
# https://bugs.python.org/issue13216
if pycompat.iswindows and not pycompat.ispy3:
    _encodingrewrites[b'cp65001'] = b'utf-8'

# the local character encoding: $HGENCODING if set, otherwise the locale's
# preferred encoding, falling back to ascii
try:
    encoding = environ.get(b"HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding().encode('ascii') or b'ascii'
    encoding = _encodingrewrites.get(encoding, encoding)
except locale.Error:
    encoding = b'ascii'
encodingmode = environ.get(b"HGENCODINGMODE", b"strict")
fallbackencoding = b'ISO-8859-1'
113 113
114 114
class localstr(bytes):
    """A bytes subclass that remembers the UTF-8 string it came from.

    Caching the original UTF-8 form next to the locally-encoded bytes
    allows a later conversion back to UTF-8 to be lossless even when the
    local encoding itself was lossy.
    """

    def __new__(cls, u, l):
        obj = bytes.__new__(cls, l)
        obj._utf8 = u
        return obj

    if pycompat.TYPE_CHECKING:
        # pseudo implementation to help pytype see localstr() constructor
        def __init__(self, u, l):
            # type: (bytes, bytes) -> None
            super(localstr, self).__init__(l)
            self._utf8 = u

    def __hash__(self):
        # hash the UTF-8 form to avoid collisions in local string space
        return hash(self._utf8)
133 133
134 134
class safelocalstr(bytes):
    """Marker type for a locally-encoded string known to round-trip to UTF-8.

    It behaves exactly like plain bytes for comparison and hashing:

    >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
    >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
    >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
    >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
    """
144 144
145 145
def tolocal(s):
    # type: (bytes) -> bytes
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    >>> u = b'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = b'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> b'foo: ?' in d
    False
    >>> l1 = b'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    # ASCII is valid in every encoding; nothing to convert
    if isasciistr(s):
        return s

    try:
        try:
            # make sure string is actually stored in UTF-8
            u = s.decode('UTF-8')
            if encoding == b'UTF-8':
                # fast path
                return s
            r = u.encode(_sysstr(encoding), "replace")
            if u == r.decode(_sysstr(encoding)):
                # r is a safe, non-lossy encoding of s
                return safelocalstr(r)
            # lossy conversion: cache the original UTF-8 bytes alongside r
            return localstr(s, r)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                u = s.decode(_sysstr(fallbackencoding))
                r = u.encode(_sysstr(encoding), "replace")
                if u == r.decode(_sysstr(encoding)):
                    # r is a safe, non-lossy encoding of s
                    return safelocalstr(r)
                return localstr(u.encode('UTF-8'), r)
            except UnicodeDecodeError:
                u = s.decode("utf-8", "replace")  # last ditch
                # can't round-trip
                return u.encode(_sysstr(encoding), "replace")
    except LookupError as k:
        # the configured encoding name is unknown to Python's codec registry
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
213 213
214 214
def fromlocal(s):
    # type: (bytes) -> bytes
    """
    Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.
    """

    # can we do a lossless round-trip?
    if isinstance(s, localstr):
        return s._utf8
    if isasciistr(s):
        return s

    try:
        u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
        return u.encode("utf-8")
    except UnicodeDecodeError as inst:
        # show a little context around the offending byte
        sub = s[max(0, inst.start - 10) : inst.start + 10]
        raise error.Abort(
            b"decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
        )
    except LookupError as k:
        # convert the exception to bytes for Abort, matching tolocal()
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
243 243
244 244
def unitolocal(u):
    # type: (Text) -> bytes
    """Convert a unicode string to a byte string of local encoding"""
    utf8bytes = u.encode('utf-8')
    return tolocal(utf8bytes)
249 249
250 250
def unifromlocal(s):
    # type: (bytes) -> Text
    """Convert a byte string of local encoding to a unicode string"""
    utf8bytes = fromlocal(s)
    return utf8bytes.decode('utf-8')
255 255
256 256
def unimethod(bytesfunc):
    # type: (Callable[[Any], bytes]) -> Callable[[Any], Text]
    """Create a proxy method that forwards __unicode__() and __str__() of
    Python 3 to __bytes__()"""

    def unifunc(obj):
        result = bytesfunc(obj)
        return unifromlocal(result)

    return unifunc
266 266
267 267
# converter functions between native str and byte string. use these if the
# character encoding is not aware (e.g. exception message) or is known to
# be locale dependent (e.g. date formatting.)
if pycompat.ispy3:
    strtolocal = unitolocal
    strfromlocal = unifromlocal
    strmethod = unimethod
else:

    # on Python 2 the native str type is already a byte string, so these
    # converters are identity functions
    def strtolocal(s):
        # type: (str) -> bytes
        return s  # pytype: disable=bad-return-type

    def strfromlocal(s):
        # type: (bytes) -> str
        return s  # pytype: disable=bad-return-type

    strmethod = pycompat.identity
286 286
287 287
def lower(s):
    # type: (bytes) -> bytes
    """best-effort encoding-aware case-folding of local string s"""
    # pure-ASCII fast path
    try:
        return asciilower(s)
    except UnicodeDecodeError:
        pass
    try:
        if isinstance(s, localstr):
            # use the cached, known-good UTF-8 representation
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        lu = u.lower()
        if u == lu:
            return s  # preserve localstring
        return lu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.lower()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        # convert the exception to bytes for Abort, matching tolocal()
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
309 309
310 310
def upper(s):
    # type: (bytes) -> bytes
    """best-effort encoding-aware case-folding of local string s"""
    # try the C-level ASCII fast path first; fall back to the slow,
    # encoding-aware implementation for non-ASCII input
    try:
        result = asciiupper(s)
    except UnicodeDecodeError:
        result = upperfallback(s)
    return result
318 318
319 319
def upperfallback(s):
    # type: (bytes) -> bytes
    """encoding-aware upper-casing for strings that are not pure ASCII"""
    try:
        if isinstance(s, localstr):
            # use the cached, known-good UTF-8 representation
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        uu = u.upper()
        if u == uu:
            return s  # preserve localstring
        return uu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.upper()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        # convert the exception to bytes for Abort, matching tolocal()
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
336 336
337 337
if not _nativeenviron:
    # now encoding and helper functions are available, recreate the environ
    # dict to be exported to other modules
    # (the pasted diff contained both the old and new versions of this
    # block; this is the post-change version)
    if pycompat.iswindows and pycompat.ispy3:

        class WindowsEnviron(dict):
            """`os.environ` normalizes environment variables to uppercase on windows"""

            # NOTE(review): only get() normalizes the key; __getitem__ and
            # __contains__ do not -- confirm callers go through get()
            def get(self, key, default=None):
                return super().get(upper(key), default)

        environ = WindowsEnviron()

    for k, v in os.environ.items():  # re-exports
        environ[tolocal(k.encode('utf-8'))] = tolocal(v.encode('utf-8'))
345 354
if pycompat.ispy3:
    # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
    # returns bytes.
    if pycompat.iswindows:
        # Python 3 on Windows issues a DeprecationWarning about using the bytes
        # API when os.getcwdb() is called.
        #
        # Additionally, py3.8+ uppercases the drive letter when calling
        # os.path.realpath(), which is used on ``repo.root``. Since those
        # strings are compared in various places as simple strings, also call
        # realpath here. See https://bugs.python.org/issue40368
        getcwd = lambda: strtolocal(os.path.realpath(os.getcwd()))  # re-exports
    else:
        getcwd = os.getcwdb  # re-exports
else:
    getcwd = os.getcwd  # re-exports

# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
# The value is the set of east-asian-width classes counted as two columns:
# "W"ide and "F"ullwidth always, plus "A"mbiguous when HGENCODINGAMBIGUOUS
# is set to 'wide'.
_wide = _sysstr(
    environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
    and b"WFA"
    or b"WF"
)
369 378
370 379
def colwidth(s):
    # type: (bytes) -> int
    """Find the column width of a string for display in the local encoding"""
    decoded = s.decode(_sysstr(encoding), 'replace')
    return ucolwidth(decoded)
375 384
376 385
def ucolwidth(d):
    # type: (Text) -> int
    """Find the column width of a Unicode string for display"""
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        # no east-asian-width data available; count one column per character
        return len(d)
    return sum(2 if eaw(c) in _wide else 1 for c in d)
384 393
385 394
def getcols(s, start, c):
    # type: (bytes, int, int) -> bytes
    """Use colwidth to find a c-column substring of s starting at byte
    index start"""
    # a c-column substring needs at least c bytes, so begin there
    for end in pycompat.xrange(start + c, len(s)):
        candidate = s[start:end]
        if colwidth(candidate) == c:
            return candidate
    raise ValueError('substring not found')
395 404
396 405
def trim(s, width, ellipsis=b'', leftside=False):
    # type: (bytes, int, bytes, bool) -> bytes
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    >>> from .node import bin
    >>> def bprint(s):
    ...     print(pycompat.sysstr(s))
    >>> ellipsis = b'+++'
    >>> from . import encoding
    >>> encoding.encoding = b'utf-8'
    >>> t = b'1234567890'
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    12345+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++67890
    >>> bprint(trim(t, 8))
    12345678
    >>> bprint(trim(t, 8, leftside=True))
    34567890
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(pycompat.sysstr(encoding.encoding))
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 5))
    \xe3\x81\x82\xe3\x81\x84
    >>> bprint(trim(t, 5, leftside=True))
    \xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 4, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
    +++
    >>> t = bin(b'112233445566778899aa') # invalid byte sequence
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8))
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> bprint(trim(t, 8, leftside=True))
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    """
    try:
        u = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        # undecodable input: fall back to trimming by byte count instead of
        # display columns
        if len(s) <= width:  # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0:  # no enough room even for ellipsis
            return ellipsis[: width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width:  # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0:  # no enough room even for ellipsis
        return ellipsis[: width + len(ellipsis)]

    # drop characters one at a time from the trimmed side until the
    # remainder fits in the available columns
    if leftside:
        uslice = lambda i: u[i:]
        concat = lambda s: ellipsis + s
    else:
        uslice = lambda i: u[:-i]
        concat = lambda s: s + ellipsis
    for i in pycompat.xrange(1, len(u)):
        usub = uslice(i)
        if ucolwidth(usub) <= width:
            return concat(usub.encode(_sysstr(encoding)))
    return ellipsis  # no enough room for multi-column characters
494 503
class normcasespecs(object):
    """Enumerates what a platform's normcase does to ASCII strings.

    Each platform specifies one of these values, which must agree with
    what its normcase implementation actually does:

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h."""

    other = 0
    upper = 1
    lower = -1
510 519
511 520
def jsonescape(s, paranoid=False):
    # type: (bytes, bool) -> bytes
    """returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr/safelocalstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape(b'this is a test')
    'this is a test'
    >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
    'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
    >>> jsonescape(b'a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape(b'')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> s = b'escape characters: \\0 \\x0b \\x7f'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape(b'<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    """

    u8chars = toutf8b(s)
    try:
        # fast path: the C implementation rejects some inputs with ValueError
        return _jsonescapeu8fast(u8chars, paranoid)
    except ValueError:
        pass
    # slow path: pure-Python fallback handles everything
    return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
564 573
565 574
# We need to decode/encode U+DCxx codes transparently since invalid UTF-8
# bytes are mapped to that range.
if pycompat.ispy3:
    _utf8strict = r'surrogatepass'
else:
    _utf8strict = r'strict'

# byte count to attempt decoding, indexed by the high nibble of a UTF-8
# sequence's first byte; 0 marks a plain ASCII byte (fast path)
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
574 583
575 584
def getutf8char(s, pos):
    # type: (bytes, int) -> bytes
    """get the next full utf-8 character in the given string, starting at pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    """

    first = s[pos : pos + 1]
    # the first byte's high nibble determines how many bytes to try
    nbytes = _utf8len[ord(first) >> 4]
    if not nbytes:
        # plain ASCII byte
        return first

    c = s[pos : pos + nbytes]
    # validate with attempted decode; raises on a malformed sequence
    c.decode("utf-8", _utf8strict)
    return c
593 602
594 603
def toutf8b(s):
    # type: (bytes) -> bytes
    """convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can have
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    """

    if isinstance(s, localstr):
        # assume that the original UTF-8 sequence would never contain
        # invalid characters in U+DCxx range
        return s._utf8
    elif isinstance(s, safelocalstr):
        # already verified that s is non-lossy in legacy encoding, which
        # shouldn't contain characters in U+DCxx range
        return fromlocal(s)
    elif isasciistr(s):
        return s
    if b"\xed" not in s:
        # no existing U+DCxx bytes possible; a clean decode means the
        # string is already valid UTF-8 and can pass through unchanged
        try:
            s.decode('utf-8', _utf8strict)
            return s
        except UnicodeDecodeError:
            pass

    # slow path: walk the string character by character, escaping every
    # invalid byte (and re-escaping pre-existing U+DCxx characters)
    s = pycompat.bytestr(s)
    r = b""
    pos = 0
    l = len(s)
    while pos < l:
        try:
            c = getutf8char(s, pos)
            if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
                # have to re-escape existing U+DCxx characters
                c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
                pos += 1
            else:
                pos += len(c)
        except UnicodeDecodeError:
            # invalid byte: map it into the U+DC00-U+DCFF surrogate range
            c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
            pos += 1
        r += c
    return r
662 671
663 672
def fromutf8b(s):
    # type: (bytes) -> bytes
    """Given a UTF-8b string, return a local, possibly-binary string.

    return the original binary string. This
    is a round-trip process for strings like filenames, but metadata
    that's was passed through tolocal will remain in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = b"\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip(b"\\xc2\\xc2\\x80")
    True
    >>> roundtrip(b"\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
    True
    """

    if isasciistr(s):
        return s
    # fast path - look for uDxxx prefixes in s
    if b"\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    s = pycompat.bytestr(s)
    r = b""
    pos = 0
    l = len(s)
    while pos < l:
        c = getutf8char(s, pos)
        pos += len(c)
        # unescape U+DCxx characters back to the original raw byte
        if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
            c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF)
        r += c
    return r
General Comments 0
You need to be logged in to leave comments. Login now