encoding: alias cp65001 to utf-8 on Windows...
Yuya Nishihara
r38633:44302901 stable
@@ -1,582 +1,587 @@
1 1 # encoding.py - character transcoding support for Mercurial
2 2 #
3 3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 from __future__ import absolute_import, print_function
9 9
10 10 import locale
11 11 import os
12 12 import unicodedata
13 13
14 14 from . import (
15 15 error,
16 16 policy,
17 17 pycompat,
18 18 )
19 19
20 20 from .pure import (
21 21 charencode as charencodepure,
22 22 )
23 23
24 24 charencode = policy.importmod(r'charencode')
25 25
26 26 isasciistr = charencode.isasciistr
27 27 asciilower = charencode.asciilower
28 28 asciiupper = charencode.asciiupper
29 29 _jsonescapeu8fast = charencode.jsonescapeu8fast
30 30
31 31 _sysstr = pycompat.sysstr
32 32
33 33 if pycompat.ispy3:
34 34 unichr = chr
35 35
36 36 # These unicode characters are ignored by HFS+ (Apple Technote 1150,
37 37 # "Unicode Subtleties"), so we need to ignore them in some places for
38 38 # sanity.
39 39 _ignore = [unichr(int(x, 16)).encode("utf-8") for x in
40 40 "200c 200d 200e 200f 202a 202b 202c 202d 202e "
41 41 "206a 206b 206c 206d 206e 206f feff".split()]
42 42 # verify the next function will work
43 43 assert all(i.startswith(("\xe2", "\xef")) for i in _ignore)
44 44
45 45 def hfsignoreclean(s):
46 46 """Remove codepoints ignored by HFS+ from s.
47 47
48 48 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
49 49 '.hg'
50 50 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
51 51 '.hg'
52 52 """
53 53 if "\xe2" in s or "\xef" in s:
54 54 for c in _ignore:
55 55 s = s.replace(c, '')
56 56 return s
57 57
58 58 # encoding.environ is provided read-only and may not be used to modify
59 59 # the process environment
60 60 _nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ)
61 61 if not pycompat.ispy3:
62 62 environ = os.environ # re-exports
63 63 elif _nativeenviron:
64 64 environ = os.environb # re-exports
65 65 else:
66 66 # preferred encoding isn't known yet; use utf-8 to avoid unicode error
67 67 # and recreate it once encoding is settled
68 68 environ = dict((k.encode(u'utf-8'), v.encode(u'utf-8'))
69 69 for k, v in os.environ.items()) # re-exports
70 70
71 71 _encodingfixers = {
72 72 '646': lambda: 'ascii',
73 73 'ANSI_X3.4-1968': lambda: 'ascii',
74 74 }
75 # cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
76 # No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
77 # https://bugs.python.org/issue13216
78 if pycompat.iswindows and not pycompat.ispy3:
79 _encodingfixers['cp65001'] = lambda: 'utf-8'
75 80
76 81 try:
77 82 encoding = environ.get("HGENCODING")
78 83 if not encoding:
79 84 encoding = locale.getpreferredencoding().encode('ascii') or 'ascii'
80 85 encoding = _encodingfixers.get(encoding, lambda: encoding)()
81 86 except locale.Error:
82 87 encoding = 'ascii'
83 88 encodingmode = environ.get("HGENCODINGMODE", "strict")
84 89 fallbackencoding = 'ISO-8859-1'
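
# An illustrative sketch of how the fixer table is consulted, mirroring
# the lookup in the try block above; the 'cp65001' entry is registered
# only on Windows under Python 2:
#
#   enc = 'cp65001'
#   enc = _encodingfixers.get(enc, lambda: enc)()   # -> 'utf-8'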
85 90
86 91 class localstr(bytes):
87 92 '''This class allows strings that are unmodified to be
88 93 round-tripped to the local encoding and back'''
89 94 def __new__(cls, u, l):
90 95 s = bytes.__new__(cls, l)
91 96 s._utf8 = u
92 97 return s
93 98 def __hash__(self):
94 99 return hash(self._utf8) # avoid collisions in local string space
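
# Illustrative sketch with hypothetical values: localstr pairs the known
# UTF-8 bytes with a possibly-lossy local rendering, so the original can
# always be recovered:
#
#   ls = localstr(b'caf\xc3\xa9', b'caf?')   # (utf-8 bytes, local bytes)
#   bytes(ls)    # -> b'caf?'          behaves as the local string
#   ls._utf8     # -> b'caf\xc3\xa9'   lossless round-trip source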
95 100
96 101 def tolocal(s):
97 102 """
98 103 Convert a string from internal UTF-8 to local encoding
99 104
100 105 All internal strings should be UTF-8 but some repos before the
101 106 implementation of locale support may contain latin1 or possibly
102 107 other character sets. We attempt to decode everything strictly
103 108 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
104 109 replace unknown characters.
105 110
106 111 The localstr class is used to cache the known UTF-8 encoding of
107 112 strings next to their local representation to allow lossless
108 113 round-trip conversion back to UTF-8.
109 114
110 115 >>> u = b'foo: \\xc3\\xa4' # utf-8
111 116 >>> l = tolocal(u)
112 117 >>> l
113 118 'foo: ?'
114 119 >>> fromlocal(l)
115 120 'foo: \\xc3\\xa4'
116 121 >>> u2 = b'foo: \\xc3\\xa1'
117 122 >>> d = { l: 1, tolocal(u2): 2 }
118 123 >>> len(d) # no collision
119 124 2
120 125 >>> b'foo: ?' in d
121 126 False
122 127 >>> l1 = b'foo: \\xe4' # historical latin1 fallback
123 128 >>> l = tolocal(l1)
124 129 >>> l
125 130 'foo: ?'
126 131 >>> fromlocal(l) # magically in utf-8
127 132 'foo: \\xc3\\xa4'
128 133 """
129 134
130 135 if isasciistr(s):
131 136 return s
132 137
133 138 try:
134 139 try:
135 140 # make sure string is actually stored in UTF-8
136 141 u = s.decode('UTF-8')
137 142 if encoding == 'UTF-8':
138 143 # fast path
139 144 return s
140 145 r = u.encode(_sysstr(encoding), u"replace")
141 146 if u == r.decode(_sysstr(encoding)):
142 147 # r is a safe, non-lossy encoding of s
143 148 return r
144 149 return localstr(s, r)
145 150 except UnicodeDecodeError:
146 151 # we should only get here if we're looking at an ancient changeset
147 152 try:
148 153 u = s.decode(_sysstr(fallbackencoding))
149 154 r = u.encode(_sysstr(encoding), u"replace")
150 155 if u == r.decode(_sysstr(encoding)):
151 156 # r is a safe, non-lossy encoding of s
152 157 return r
153 158 return localstr(u.encode('UTF-8'), r)
154 159 except UnicodeDecodeError:
155 160 u = s.decode("utf-8", "replace") # last ditch
156 161 # can't round-trip
157 162 return u.encode(_sysstr(encoding), u"replace")
158 163 except LookupError as k:
159 164 raise error.Abort(k, hint="please check your locale settings")
160 165
161 166 def fromlocal(s):
162 167 """
163 168 Convert a string from the local character encoding to UTF-8
164 169
165 170 We attempt to decode strings using the encoding mode set by
166 171 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
167 172 characters will cause an error message. Other modes include
168 173 'replace', which replaces unknown characters with a special
169 174 Unicode character, and 'ignore', which drops the character.
170 175 """
171 176
172 177 # can we do a lossless round-trip?
173 178 if isinstance(s, localstr):
174 179 return s._utf8
175 180 if isasciistr(s):
176 181 return s
177 182
178 183 try:
179 184 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
180 185 return u.encode("utf-8")
181 186 except UnicodeDecodeError as inst:
182 187 sub = s[max(0, inst.start - 10):inst.start + 10]
183 188 raise error.Abort("decoding near '%s': %s!"
184 189 % (sub, pycompat.bytestr(inst)))
185 190 except LookupError as k:
186 191 raise error.Abort(k, hint="please check your locale settings")
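
# Illustrative sketch, assuming encoding = 'latin-1' and the default
# 'strict' encodingmode:
#
#   fromlocal(b'caf\xe9')   # -> b'caf\xc3\xa9' (re-encoded as UTF-8)
#   fromlocal(b'ascii')     # -> b'ascii' (fast path via isasciistr)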
187 192
188 193 def unitolocal(u):
189 194 """Convert a unicode string to a byte string of local encoding"""
190 195 return tolocal(u.encode('utf-8'))
191 196
192 197 def unifromlocal(s):
193 198 """Convert a byte string of local encoding to a unicode string"""
194 199 return fromlocal(s).decode('utf-8')
195 200
196 201 def unimethod(bytesfunc):
197 202 """Create a proxy method that forwards __unicode__() and __str__() of
198 203 Python 3 to __bytes__()"""
199 204 def unifunc(obj):
200 205 return unifromlocal(bytesfunc(obj))
201 206 return unifunc
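
# Illustrative sketch: a hypothetical class can wire __bytes__ through
# unimethod so that str() on Python 3 yields the decoded form:
#
#   class widget(object):
#       def __bytes__(self):
#           return b'widget-id'
#       __str__ = unimethod(__bytes__)   # str(widget()) -> 'widget-id'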
202 207
203 208 # converter functions between native str and byte string. use these if the
204 209 # character encoding is not known (e.g. exception messages) or is known to
205 210 # be locale dependent (e.g. date formatting).
206 211 if pycompat.ispy3:
207 212 strtolocal = unitolocal
208 213 strfromlocal = unifromlocal
209 214 strmethod = unimethod
210 215 else:
211 216 strtolocal = pycompat.identity
212 217 strfromlocal = pycompat.identity
213 218 strmethod = pycompat.identity
214 219
215 220 if not _nativeenviron:
216 221 # now encoding and helper functions are available, recreate the environ
217 222 # dict to be exported to other modules
218 223 environ = dict((tolocal(k.encode(u'utf-8')), tolocal(v.encode(u'utf-8')))
219 224 for k, v in os.environ.items()) # re-exports
220 225
221 226 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
222 227 _wide = _sysstr(environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
223 228 and "WFA" or "WF")
224 229
225 230 def colwidth(s):
226 231 "Find the column width of a string for display in the local encoding"
227 232 return ucolwidth(s.decode(_sysstr(encoding), u'replace'))
228 233
229 234 def ucolwidth(d):
230 235 "Find the column width of a Unicode string for display"
231 236 eaw = getattr(unicodedata, 'east_asian_width', None)
232 237 if eaw is not None:
233 238 return sum([eaw(c) in _wide and 2 or 1 for c in d])
234 239 return len(d)
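
# Illustrative sketch with the default HGENCODINGAMBIGUOUS ('narrow',
# so _wide is "WF"): east-asian wide characters count as two columns:
#
#   ucolwidth(u'abc')        # -> 3
#   ucolwidth(u'\u3042bc')   # -> 4 (U+3042 is east-asian 'W'ide)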
235 240
236 241 def getcols(s, start, c):
237 242 '''Use colwidth to find a c-column substring of s starting at byte
238 243 index start'''
239 244 for x in xrange(start + c, len(s)):
240 245 t = s[start:x]
241 246 if colwidth(t) == c:
242 247 return t
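
# Illustrative sketch, assuming a single-byte local encoding:
#
#   getcols(b'abcdef', 1, 3)   # -> b'bcd' (3 columns starting at byte 1)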
243 248
244 249 def trim(s, width, ellipsis='', leftside=False):
245 250 """Trim string 's' to at most 'width' columns (including 'ellipsis').
246 251
247 252 If 'leftside' is True, left side of string 's' is trimmed.
248 253 'ellipsis' is always placed at trimmed side.
249 254
250 255 >>> from .node import bin
251 256 >>> def bprint(s):
252 257 ... print(pycompat.sysstr(s))
253 258 >>> ellipsis = b'+++'
254 259 >>> from . import encoding
255 260 >>> encoding.encoding = b'utf-8'
256 261 >>> t = b'1234567890'
257 262 >>> bprint(trim(t, 12, ellipsis=ellipsis))
258 263 1234567890
259 264 >>> bprint(trim(t, 10, ellipsis=ellipsis))
260 265 1234567890
261 266 >>> bprint(trim(t, 8, ellipsis=ellipsis))
262 267 12345+++
263 268 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
264 269 +++67890
265 270 >>> bprint(trim(t, 8))
266 271 12345678
267 272 >>> bprint(trim(t, 8, leftside=True))
268 273 34567890
269 274 >>> bprint(trim(t, 3, ellipsis=ellipsis))
270 275 +++
271 276 >>> bprint(trim(t, 1, ellipsis=ellipsis))
272 277 +
273 278 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
274 279 >>> t = u.encode(pycompat.sysstr(encoding.encoding))
275 280 >>> bprint(trim(t, 12, ellipsis=ellipsis))
276 281 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
277 282 >>> bprint(trim(t, 10, ellipsis=ellipsis))
278 283 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
279 284 >>> bprint(trim(t, 8, ellipsis=ellipsis))
280 285 \xe3\x81\x82\xe3\x81\x84+++
281 286 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
282 287 +++\xe3\x81\x88\xe3\x81\x8a
283 288 >>> bprint(trim(t, 5))
284 289 \xe3\x81\x82\xe3\x81\x84
285 290 >>> bprint(trim(t, 5, leftside=True))
286 291 \xe3\x81\x88\xe3\x81\x8a
287 292 >>> bprint(trim(t, 4, ellipsis=ellipsis))
288 293 +++
289 294 >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
290 295 +++
291 296 >>> t = bin(b'112233445566778899aa') # invalid byte sequence
292 297 >>> bprint(trim(t, 12, ellipsis=ellipsis))
293 298 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
294 299 >>> bprint(trim(t, 10, ellipsis=ellipsis))
295 300 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
296 301 >>> bprint(trim(t, 8, ellipsis=ellipsis))
297 302 \x11\x22\x33\x44\x55+++
298 303 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
299 304 +++\x66\x77\x88\x99\xaa
300 305 >>> bprint(trim(t, 8))
301 306 \x11\x22\x33\x44\x55\x66\x77\x88
302 307 >>> bprint(trim(t, 8, leftside=True))
303 308 \x33\x44\x55\x66\x77\x88\x99\xaa
304 309 >>> bprint(trim(t, 3, ellipsis=ellipsis))
305 310 +++
306 311 >>> bprint(trim(t, 1, ellipsis=ellipsis))
307 312 +
308 313 """
309 314 try:
310 315 u = s.decode(_sysstr(encoding))
311 316 except UnicodeDecodeError:
312 317 if len(s) <= width: # trimming is not needed
313 318 return s
314 319 width -= len(ellipsis)
315 320 if width <= 0: # not enough room even for ellipsis
316 321 return ellipsis[:width + len(ellipsis)]
317 322 if leftside:
318 323 return ellipsis + s[-width:]
319 324 return s[:width] + ellipsis
320 325
321 326 if ucolwidth(u) <= width: # trimming is not needed
322 327 return s
323 328
324 329 width -= len(ellipsis)
325 330 if width <= 0: # not enough room even for ellipsis
326 331 return ellipsis[:width + len(ellipsis)]
327 332
328 333 if leftside:
329 334 uslice = lambda i: u[i:]
330 335 concat = lambda s: ellipsis + s
331 336 else:
332 337 uslice = lambda i: u[:-i]
333 338 concat = lambda s: s + ellipsis
334 339 for i in xrange(1, len(u)):
335 340 usub = uslice(i)
336 341 if ucolwidth(usub) <= width:
337 342 return concat(usub.encode(_sysstr(encoding)))
338 343 return ellipsis # not enough room for multi-column characters
339 344
340 345 def lower(s):
341 346 "best-effort encoding-aware case-folding of local string s"
342 347 try:
343 348 return asciilower(s)
344 349 except UnicodeDecodeError:
345 350 pass
346 351 try:
347 352 if isinstance(s, localstr):
348 353 u = s._utf8.decode("utf-8")
349 354 else:
350 355 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
351 356
352 357 lu = u.lower()
353 358 if u == lu:
354 359 return s # preserve localstring
355 360 return lu.encode(_sysstr(encoding))
356 361 except UnicodeError:
357 362 return s.lower() # we don't know how to fold this except in ASCII
358 363 except LookupError as k:
359 364 raise error.Abort(k, hint="please check your locale settings")
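
# Illustrative sketch, assuming encoding = 'utf-8':
#
#   lower(b'ABC')           # -> b'abc' (pure-ASCII fast path)
#   lower(b'CAF\xc3\x89')   # -> b'caf\xc3\xa9' (unicode-aware folding)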
360 365
361 366 def upper(s):
362 367 "best-effort encoding-aware case-folding of local string s"
363 368 try:
364 369 return asciiupper(s)
365 370 except UnicodeDecodeError:
366 371 return upperfallback(s)
367 372
368 373 def upperfallback(s):
369 374 try:
370 375 if isinstance(s, localstr):
371 376 u = s._utf8.decode("utf-8")
372 377 else:
373 378 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
374 379
375 380 uu = u.upper()
376 381 if u == uu:
377 382 return s # preserve localstring
378 383 return uu.encode(_sysstr(encoding))
379 384 except UnicodeError:
380 385 return s.upper() # we don't know how to fold this except in ASCII
381 386 except LookupError as k:
382 387 raise error.Abort(k, hint="please check your locale settings")
383 388
384 389 class normcasespecs(object):
385 390 '''what a platform's normcase does to ASCII strings
386 391
387 392 This is specified per platform, and should be consistent with what normcase
388 393 on that platform actually does.
389 394
390 395 lower: normcase lowercases ASCII strings
391 396 upper: normcase uppercases ASCII strings
392 397 other: the fallback function should always be called
393 398
394 399 This should be kept in sync with normcase_spec in util.h.'''
395 400 lower = -1
396 401 upper = 1
397 402 other = 0
398 403
399 404 def jsonescape(s, paranoid=False):
400 405 '''returns a string suitable for JSON
401 406
402 407 JSON is problematic for us because it doesn't support non-Unicode
403 408 bytes. To deal with this, we take the following approach:
404 409
405 410 - localstr objects are converted back to UTF-8
406 411 - valid UTF-8/ASCII strings are passed as-is
407 412 - other strings are converted to UTF-8b surrogate encoding
408 413 - apply JSON-specified string escaping
409 414
410 415 (escapes are doubled in these tests)
411 416
412 417 >>> jsonescape(b'this is a test')
413 418 'this is a test'
414 419 >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
415 420 'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
416 421 >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
417 422 'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
418 423 >>> jsonescape(b'a weird byte: \\xdd')
419 424 'a weird byte: \\xed\\xb3\\x9d'
420 425 >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
421 426 'utf-8: caf\\xc3\\xa9'
422 427 >>> jsonescape(b'')
423 428 ''
424 429
425 430 If paranoid, non-ascii and common troublesome characters are also escaped.
426 431 This is suitable for web output.
427 432
428 433 >>> s = b'escape characters: \\0 \\x0b \\x7f'
429 434 >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
430 435 >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
431 436 >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
432 437 >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
433 438 'escape boundary: ~ \\\\u007f \\\\u0080'
434 439 >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
435 440 'a weird byte: \\\\udcdd'
436 441 >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
437 442 'utf-8: caf\\\\u00e9'
438 443 >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
439 444 'non-BMP: \\\\ud834\\\\udd1e'
440 445 >>> jsonescape(b'<foo@example.org>', paranoid=True)
441 446 '\\\\u003cfoo@example.org\\\\u003e'
442 447 '''
443 448
444 449 u8chars = toutf8b(s)
445 450 try:
446 451 return _jsonescapeu8fast(u8chars, paranoid)
447 452 except ValueError:
448 453 pass
449 454 return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
450 455
451 456 # We need to decode/encode U+DCxx codes transparently since invalid UTF-8
452 457 # bytes are mapped to that range.
453 458 if pycompat.ispy3:
454 459 _utf8strict = r'surrogatepass'
455 460 else:
456 461 _utf8strict = r'strict'
457 462
458 463 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
459 464
460 465 def getutf8char(s, pos):
461 466 '''get the next full utf-8 character in the given string, starting at pos
462 467
463 468 Raises a UnicodeError if the given location does not start a valid
464 469 utf-8 character.
465 470 '''
466 471
467 472 # find how many bytes to attempt decoding from first nibble
468 473 l = _utf8len[ord(s[pos:pos + 1]) >> 4]
469 474 if not l: # ascii
470 475 return s[pos:pos + 1]
471 476
472 477 c = s[pos:pos + l]
473 478 # validate with attempted decode
474 479 c.decode("utf-8", _utf8strict)
475 480 return c
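
# Illustrative sketch: the high nibble of the lead byte indexes
# _utf8len, e.g. 0xe3 >> 4 == 0xe and _utf8len[0xe] == 3, so three
# bytes are taken and validated:
#
#   getutf8char(b'\xe3\x81\x82x', 0)   # -> b'\xe3\x81\x82' (U+3042)
#   getutf8char(b'\xe3\x81\x82x', 3)   # -> b'x' (ASCII fast path)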
476 481
477 482 def toutf8b(s):
478 483 '''convert a local, possibly-binary string into UTF-8b
479 484
480 485 This is intended as a generic method to preserve data when working
481 486 with schemes like JSON and XML that have no provision for
482 487 arbitrary byte strings. As Mercurial often doesn't know
483 488 what encoding data is in, we use so-called UTF-8b.
484 489
485 490 If a string is already valid UTF-8 (or ASCII), it passes unmodified.
486 491 Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
487 492 uDC00-uDCFF.
488 493
489 494 Principles of operation:
490 495
491 496 - ASCII and UTF-8 data successfully round-trips and is understood
492 497 by Unicode-oriented clients
493 498 - filenames and file contents in arbitrary other encodings can
494 499 be round-tripped or recovered by clueful clients
495 500 - local strings that have a cached known UTF-8 encoding (aka
496 501 localstr) get sent as UTF-8 so Unicode-oriented clients get the
497 502 Unicode data they want
498 503 - because we must preserve UTF-8 bytestring in places such as
499 504 filenames, metadata can't be roundtripped without help
500 505
501 506 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
502 507 arbitrary bytes into an internal Unicode format that can be
503 508 re-encoded back into the original. Here we are exposing the
504 509 internal surrogate encoding as a UTF-8 string.)
505 510 '''
506 511
507 512 if not isinstance(s, localstr) and isasciistr(s):
508 513 return s
509 514 if "\xed" not in s:
510 515 if isinstance(s, localstr):
511 516 return s._utf8
512 517 try:
513 518 s.decode('utf-8', _utf8strict)
514 519 return s
515 520 except UnicodeDecodeError:
516 521 pass
517 522
518 523 s = pycompat.bytestr(s)
519 524 r = ""
520 525 pos = 0
521 526 l = len(s)
522 527 while pos < l:
523 528 try:
524 529 c = getutf8char(s, pos)
525 530 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
526 531 # have to re-escape existing U+DCxx characters
527 532 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict)
528 533 pos += 1
529 534 else:
530 535 pos += len(c)
531 536 except UnicodeDecodeError:
532 537 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict)
533 538 pos += 1
534 539 r += c
535 540 return r
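
# Illustrative sketch: a stray non-UTF-8 byte is remapped into the
# U+DCxx surrogate range (0xdc00 + 0xe9 == U+DCE9, which is
# b'\xed\xb3\xa9' in UTF-8):
#
#   toutf8b(b'caf\xe9')   # -> b'caf\xed\xb3\xa9'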
536 541
537 542 def fromutf8b(s):
538 543 '''Given a UTF-8b string, return a local, possibly-binary string.
539 544
540 545 This returns the original binary string. The conversion
541 546 is a round-trip process for strings like filenames, but metadata
542 547 that was passed through tolocal will remain in UTF-8.
543 548
544 549 >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
545 550 >>> m = b"\\xc3\\xa9\\x99abcd"
546 551 >>> toutf8b(m)
547 552 '\\xc3\\xa9\\xed\\xb2\\x99abcd'
548 553 >>> roundtrip(m)
549 554 True
550 555 >>> roundtrip(b"\\xc2\\xc2\\x80")
551 556 True
552 557 >>> roundtrip(b"\\xef\\xbf\\xbd")
553 558 True
554 559 >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
555 560 True
556 561 >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
557 562 True
558 563 '''
559 564
560 565 if isasciistr(s):
561 566 return s
562 567 # fast path - look for U+Dxxx prefixes in s
563 568 if "\xed" not in s:
564 569 return s
565 570
566 571 # We could do this with the unicode type but some Python builds
567 572 # use UTF-16 internally (issue5031) which causes non-BMP code
568 573 # points to be escaped. Instead, we use our handy getutf8char
569 574 # helper again to walk the string without "decoding" it.
570 575
571 576 s = pycompat.bytestr(s)
572 577 r = ""
573 578 pos = 0
574 579 l = len(s)
575 580 while pos < l:
576 581 c = getutf8char(s, pos)
577 582 pos += len(c)
578 583 # unescape U+DCxx characters
579 584 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
580 585 c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xff)
581 586 r += c
582 587 return r