##// END OF EJS Templates
encoding: define local identify functions with explicit type comments...
Augie Fackler -
r43770:5f2a8dab default
parent child Browse files
Show More
@@ -1,653 +1,660
1 1 # encoding.py - character transcoding support for Mercurial
2 2 #
3 3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 from __future__ import absolute_import, print_function
9 9
10 10 import locale
11 11 import os
12 12 import unicodedata
13 13
14 14 from .pycompat import getattr
15 15 from . import (
16 16 error,
17 17 policy,
18 18 pycompat,
19 19 )
20 20
21 21 from .pure import charencode as charencodepure
22 22
23 23 charencode = policy.importmod(r'charencode')
24 24
25 25 isasciistr = charencode.isasciistr
26 26 asciilower = charencode.asciilower
27 27 asciiupper = charencode.asciiupper
28 28 _jsonescapeu8fast = charencode.jsonescapeu8fast
29 29
30 30 _sysstr = pycompat.sysstr
31 31
if pycompat.ispy3:
    # provide a py2-style unichr() on py3 for the code below
    unichr = chr

# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
# Each codepoint is stored UTF-8 encoded so replacement can operate
# directly on byte strings.
_ignore = [
    unichr(int(x, 16)).encode("utf-8")
    for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e "
    b"206a 206b 206c 206d 206e 206f feff".split()
]
# verify the next function will work: every ignorable sequence must start
# with 0xe2 or 0xef, which is what hfsignoreclean's fast path checks for
assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)
45 45
46 46
def hfsignoreclean(s):
    """Remove codepoints ignored by HFS+ from s.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # Every ignorable UTF-8 sequence starts with 0xe2 or 0xef; when
    # neither lead byte appears, skip the replacement scan entirely.
    if b"\xe2" not in s and b"\xef" not in s:
        return s
    for seq in _ignore:
        s = s.replace(seq, b'')
    return s
59 59
60 60
# encoding.environ is provided read-only, which may not be used to modify
# the process environment
_nativeenviron = not pycompat.ispy3 or os.supports_bytes_environ
if not pycompat.ispy3:
    # py2 os.environ is already bytes
    environ = os.environ  # re-exports
elif _nativeenviron:
    # py3 with bytes-environ support: use the bytes view directly
    environ = os.environb  # re-exports
else:
    # preferred encoding isn't known yet; use utf-8 to avoid unicode error
    # and recreate it once encoding is settled
    environ = dict(
        (k.encode(r'utf-8'), v.encode(r'utf-8'))
        for k, v in os.environ.items()  # re-exports
    )

# aliases that locale.getpreferredencoding() may report, mapped to the
# canonical codec name we want to use
_encodingrewrites = {
    b'646': b'ascii',
    b'ANSI_X3.4-1968': b'ascii',
}
# cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
# No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
# https://bugs.python.org/issue13216
if pycompat.iswindows and not pycompat.ispy3:
    _encodingrewrites[b'cp65001'] = b'utf-8'

# determine the local encoding: HGENCODING overrides the locale, and we
# fall back to ascii if the locale cannot be determined at all
try:
    encoding = environ.get(b"HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding().encode('ascii') or b'ascii'
    encoding = _encodingrewrites.get(encoding, encoding)
except locale.Error:
    encoding = b'ascii'
# how to handle undecodable bytes: 'strict', 'replace' or 'ignore'
encodingmode = environ.get(b"HGENCODINGMODE", b"strict")
fallbackencoding = b'ISO-8859-1'
95 95
96 96
class localstr(bytes):
    '''Bytes subclass that pairs a local-encoding value with the UTF-8
    bytes it was transcoded from.

    The byte content of the instance is the local-encoding form ``l``;
    the original UTF-8 form ``u`` is kept on ``_utf8`` so the conversion
    can be reversed without loss.
    '''

    def __new__(cls, u, l):
        inst = bytes.__new__(cls, l)
        inst._utf8 = u
        return inst

    def __hash__(self):
        # hash on the UTF-8 form so different UTF-8 strings that happen
        # to share a lossy local representation don't collide
        return hash(self._utf8)
109 109
class safelocalstr(bytes):
    """Marker type for a local string known to round-trip to UTF-8 losslessly.

    It behaves exactly like plain ``bytes`` (including equality and
    hashing), so it interoperates with ordinary byte strings in dicts
    and sets; only its type carries the extra information.

    >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
    >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
    >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
    >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
    """
120 120
def tolocal(s):
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    >>> u = b'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = b'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> b'foo: ?' in d
    False
    >>> l1 = b'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    # pure-ASCII strings are valid in any encoding; return unchanged
    if isasciistr(s):
        return s

    try:
        try:
            # make sure string is actually stored in UTF-8
            u = s.decode('UTF-8')
            if encoding == b'UTF-8':
                # fast path
                return s
            r = u.encode(_sysstr(encoding), r"replace")
            if u == r.decode(_sysstr(encoding)):
                # r is a safe, non-lossy encoding of s
                return safelocalstr(r)
            # lossy conversion: tag the result with the original UTF-8
            # bytes so fromlocal() can round-trip it
            return localstr(s, r)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                u = s.decode(_sysstr(fallbackencoding))
                r = u.encode(_sysstr(encoding), r"replace")
                if u == r.decode(_sysstr(encoding)):
                    # r is a safe, non-lossy encoding of s
                    return safelocalstr(r)
                return localstr(u.encode('UTF-8'), r)
            except UnicodeDecodeError:
                u = s.decode("utf-8", "replace")  # last ditch
                # can't round-trip
                return u.encode(_sysstr(encoding), r"replace")
    except LookupError as k:
        # the configured encoding name is unknown to Python's codecs
        raise error.Abort(k, hint=b"please check your locale settings")
185 185
186 186
def fromlocal(s):
    """
    Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.
    """

    # can we do a lossless round-trip?
    if isinstance(s, localstr):
        return s._utf8
    # ASCII is a subset of UTF-8; nothing to convert
    if isasciistr(s):
        return s

    try:
        u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
        return u.encode("utf-8")
    except UnicodeDecodeError as inst:
        # include a window of up to 10 bytes around the offending position
        # in the error message
        sub = s[max(0, inst.start - 10) : inst.start + 10]
        raise error.Abort(
            b"decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
        )
    except LookupError as k:
        # the configured encoding name is unknown to Python's codecs
        raise error.Abort(k, hint=b"please check your locale settings")
214 214
215 215
def unitolocal(u):
    """Convert a unicode string to a byte string of local encoding"""
    utf8bytes = u.encode('utf-8')
    return tolocal(utf8bytes)
219 219
220 220
def unifromlocal(s):
    """Convert a byte string of local encoding to a unicode string"""
    utf8bytes = fromlocal(s)
    return utf8bytes.decode('utf-8')
224 224
225 225
def unimethod(bytesfunc):
    """Create a proxy method that forwards __unicode__() and __str__() of
    Python 3 to __bytes__()"""

    def unifunc(obj):
        # call the bytes-producing method, then decode its result from
        # the local encoding to a native unicode string
        return unifromlocal(bytesfunc(obj))

    return unifunc
234 234
235 235
# converter functions between native str and byte string. use these if the
# character encoding is not aware (e.g. exception message) or is known to
# be locale dependent (e.g. date formatting.)
if pycompat.ispy3:
    strtolocal = unitolocal
    strfromlocal = unifromlocal
    strmethod = unimethod
else:

    # On Python 2 the native str type is already bytes, so these
    # conversions are identity functions.  They are defined explicitly
    # (rather than aliased to pycompat.identity) so each can carry its
    # own type comment.

    def strtolocal(s):
        # type: (str) -> bytes
        return s

    def strfromlocal(s):
        # type: (bytes) -> str
        return s

    strmethod = pycompat.identity
247 254
if not _nativeenviron:
    # now encoding and helper functions are available, recreate the environ
    # dict to be exported to other modules
    environ = dict(
        (tolocal(k.encode(r'utf-8')), tolocal(v.encode(r'utf-8')))
        for k, v in os.environ.items()  # re-exports
    )

if pycompat.ispy3:
    # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
    # returns bytes.
    if pycompat.iswindows:
        # Python 3 on Windows issues a DeprecationWarning about using the bytes
        # API when os.getcwdb() is called.
        getcwd = lambda: strtolocal(os.getcwd())  # re-exports
    else:
        getcwd = os.getcwdb  # re-exports
else:
    getcwd = os.getcwd  # re-exports

# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
# The letters are east_asian_width() categories counted as two columns by
# ucolwidth(): W(ide), F(ullwidth) and, optionally, A(mbiguous).
_wide = _sysstr(
    environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
    and b"WFA"
    or b"WF"
)
274 281
275 282
def colwidth(s):
    b"Find the column width of a string for display in the local encoding"
    # decode with 'replace' so undecodable bytes don't raise
    return ucolwidth(s.decode(_sysstr(encoding), r'replace'))
279 286
280 287
def ucolwidth(d):
    b"Find the column width of a Unicode string for display"
    # east_asian_width may be missing on stripped-down unicodedata builds;
    # fall back to one column per character in that case
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        return len(d)
    return sum([2 if eaw(c) in _wide else 1 for c in d])
287 294
288 295
def getcols(s, start, c):
    '''Use colwidth to find a c-column substring of s starting at byte
    index start'''
    # grow the candidate byte slice until it renders at exactly c columns;
    # implicitly returns None when no prefix matches
    for end in pycompat.xrange(start + c, len(s)):
        candidate = s[start:end]
        if colwidth(candidate) == c:
            return candidate
296 303
297 304
def trim(s, width, ellipsis=b'', leftside=False):
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    >>> from .node import bin
    >>> def bprint(s):
    ...     print(pycompat.sysstr(s))
    >>> ellipsis = b'+++'
    >>> from . import encoding
    >>> encoding.encoding = b'utf-8'
    >>> t = b'1234567890'
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    12345+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++67890
    >>> bprint(trim(t, 8))
    12345678
    >>> bprint(trim(t, 8, leftside=True))
    34567890
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(pycompat.sysstr(encoding.encoding))
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 5))
    \xe3\x81\x82\xe3\x81\x84
    >>> bprint(trim(t, 5, leftside=True))
    \xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 4, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
    +++
    >>> t = bin(b'112233445566778899aa') # invalid byte sequence
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8))
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> bprint(trim(t, 8, leftside=True))
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    """
    try:
        u = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        # undecodable input: trim at byte granularity instead of columns
        if len(s) <= width:  # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0:  # no enough room even for ellipsis
            return ellipsis[: width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width:  # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0:  # no enough room even for ellipsis
        return ellipsis[: width + len(ellipsis)]

    # drop one character at a time from the trimmed side until the
    # remainder fits in the available columns
    if leftside:
        uslice = lambda i: u[i:]
        concat = lambda s: ellipsis + s
    else:
        uslice = lambda i: u[:-i]
        concat = lambda s: s + ellipsis
    for i in pycompat.xrange(1, len(u)):
        usub = uslice(i)
        if ucolwidth(usub) <= width:
            return concat(usub.encode(_sysstr(encoding)))
    return ellipsis  # no enough room for multi-column characters
393 400
394 401
def lower(s):
    b"best-effort encoding-aware case-folding of local string s"
    try:
        # fast path for pure-ASCII strings
        return asciilower(s)
    except UnicodeDecodeError:
        pass
    try:
        if isinstance(s, localstr):
            # use the cached UTF-8 form to avoid a lossy decode
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        lu = u.lower()
        if u == lu:
            return s  # preserve localstring
        return lu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.lower()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint=b"please check your locale settings")
415 422
416 423
def upper(s):
    b"best-effort encoding-aware case-folding of local string s"
    try:
        # fast path for pure-ASCII strings
        return asciiupper(s)
    except UnicodeDecodeError:
        # non-ASCII bytes present; do the full encoding-aware fold
        return upperfallback(s)
423 430
424 431
def upperfallback(s):
    # encoding-aware uppercasing for strings that are not pure ASCII;
    # mirrors the slow path of lower()
    try:
        if isinstance(s, localstr):
            # use the cached UTF-8 form to avoid a lossy decode
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        uu = u.upper()
        if u == uu:
            return s  # preserve localstring
        return uu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.upper()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint=b"please check your locale settings")
440 447
441 448
class normcasespecs(object):
    '''what a platform's normcase does to ASCII strings

    This is specified per platform, and should be consistent with what normcase
    on that platform actually does.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h.'''

    lower = -1  # normcase lowercases ASCII strings
    upper = 1  # normcase uppercases ASCII strings
    other = 0  # the fallback function should always be called
457 464
458 465
def jsonescape(s, paranoid=False):
    '''returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr/safelocalstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape(b'this is a test')
    'this is a test'
    >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
    'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
    >>> jsonescape(b'a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape(b'')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> s = b'escape characters: \\0 \\x0b \\x7f'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape(b'<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    '''

    # normalize the input to UTF-8b first, then escape
    u8chars = toutf8b(s)
    try:
        return _jsonescapeu8fast(u8chars, paranoid)
    except ValueError:
        # the accelerated escaper rejected the input; fall back to the
        # pure-Python implementation
        pass
    return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
510 517
511 518
# We need to decode/encode U+DCxx codes transparently since invalid UTF-8
# bytes are mapped to that range.
if pycompat.ispy3:
    # 'surrogatepass' lets lone surrogate codepoints round-trip through
    # utf-8 on Python 3
    _utf8strict = r'surrogatepass'
else:
    _utf8strict = r'strict'

# expected total byte length of a utf-8 sequence, indexed by the high
# nibble of its first byte: 0 marks single-byte ASCII; the entries of 1
# cover continuation bytes, which fail the validation decode in
# getutf8char()
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
520 527
521 528
def getutf8char(s, pos):
    '''get the next full utf-8 character in the given string, starting at pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    '''

    # the high nibble of the first byte tells us how many bytes to take
    first = s[pos : pos + 1]
    nbytes = _utf8len[ord(first) >> 4]
    if not nbytes:
        # single-byte ASCII character
        return first

    seq = s[pos : pos + nbytes]
    # validate by attempting a decode; raises UnicodeError on bad input
    seq.decode("utf-8", _utf8strict)
    return seq
538 545
539 546
def toutf8b(s):
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can have
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    if isinstance(s, localstr):
        # assume that the original UTF-8 sequence would never contain
        # invalid characters in U+DCxx range
        return s._utf8
    elif isinstance(s, safelocalstr):
        # already verified that s is non-lossy in legacy encoding, which
        # shouldn't contain characters in U+DCxx range
        return fromlocal(s)
    elif isasciistr(s):
        return s
    if b"\xed" not in s:
        # fast path: 0xed is the lead byte of any U+DCxx escape; when it
        # is absent and the string already decodes as UTF-8, it can pass
        # through unchanged
        try:
            s.decode('utf-8', _utf8strict)
            return s
        except UnicodeDecodeError:
            pass

    # slow path: walk the string character by character
    s = pycompat.bytestr(s)
    r = b""
    pos = 0
    l = len(s)
    while pos < l:
        try:
            c = getutf8char(s, pos)
            if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
                # have to re-escape existing U+DCxx characters
                c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
                pos += 1
            else:
                pos += len(c)
        except UnicodeDecodeError:
            # invalid byte: map it into the U+DC00-U+DCFF surrogate range
            c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
            pos += 1
        r += c
    return r
606 613
607 614
def fromutf8b(s):
    '''Given a UTF-8b string, return a local, possibly-binary string.

    return the original binary string. This
    is a round-trip process for strings like filenames, but metadata
    that's was passed through tolocal will remain in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = b"\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip(b"\\xc2\\xc2\\x80")
    True
    >>> roundtrip(b"\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
    True
    '''

    if isasciistr(s):
        return s
    # fast path - look for uDxxx prefixes in s
    if b"\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    s = pycompat.bytestr(s)
    r = b""
    pos = 0
    l = len(s)
    while pos < l:
        c = getutf8char(s, pos)
        pos += len(c)
        # unescape U+DCxx characters: recover the original byte from the
        # low 8 bits of the surrogate codepoint
        if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
            c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF)
        r += c
    return r
General Comments 0
You need to be logged in to leave comments. Login now