##// END OF EJS Templates
py3: make sure encoding.encoding is a bytes variable...
Pulkit Goyal -
r30622:ce36fa9b default
parent child Browse files
Show More
@@ -1,602 +1,602 b''
1 1 # encoding.py - character transcoding support for Mercurial
2 2 #
3 3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 from __future__ import absolute_import
9 9
10 10 import array
11 11 import locale
12 12 import os
13 13 import unicodedata
14 14
15 15 from . import (
16 16 error,
17 17 pycompat,
18 18 )
19 19
_sysstr = pycompat.sysstr

if pycompat.ispy3:
    unichr = chr

# Codepoints silently ignored by HFS+ in filenames (Apple Technote 1150,
# "Unicode Subtleties"); they must be stripped in some comparisons.
_ignore = [unichr(int(cp, 16)).encode("utf-8") for cp in
           "200c 200d 200e 200f 202a 202b 202c 202d 202e "
           "206a 206b 206c 206d 206e 206f feff".split()]
# sanity-check the assumption hfsignoreclean() relies on: the UTF-8 form of
# every ignored codepoint starts with byte 0xe2 or 0xef
if pycompat.ispy3:
    assert set(i[0] for i in _ignore) == set([ord(b'\xe2'), ord(b'\xef')])
else:
    assert set(i[0] for i in _ignore) == set(["\xe2", "\xef"])
def hfsignoreclean(s):
    """Remove codepoints ignored by HFS+ from s.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # every ignored codepoint encodes to bytes starting with \xe2 or \xef,
    # so a cheap containment probe skips the replace loop in the common case
    if "\xe2" in s or "\xef" in s:
        for junk in _ignore:
            s = s.replace(junk, '')
    return s
49 49
# encoding.environ is provided read-only, which may not be used to modify
# the process environment
_nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ)
if not pycompat.ispy3:
    # py2 os.environ is already bytes
    environ = os.environ
elif _nativeenviron:
    environ = os.environb
else:
    # preferred encoding isn't known yet; use utf-8 to avoid unicode error
    # and recreate it once encoding is settled (see below, after tolocal)
    environ = dict((key.encode(u'utf-8'), val.encode(u'utf-8'))
                   for key, val in os.environ.items())
62 62
def _getpreferredencoding():
    '''
    On darwin, getpreferredencoding ignores the locale environment and
    always returns mac-roman. http://bugs.python.org/issue6202 fixes this
    for Python 2.7 and up. This is the same corrected code for earlier
    Python versions.

    However, we can't use a version check for this method, as some
    distributions patch Python to fix this. Instead, we use it as a 'fixer'
    for the mac-roman encoding, as it is unlikely that this encoding is the
    actually expected one.
    '''
    try:
        locale.CODESET
    except AttributeError:
        # Fall back to parsing environment variables :-(
        return locale.getdefaultlocale()[1]

    # query nl_langinfo under the user's own LC_CTYPE, then restore it
    saved = locale.setlocale(locale.LC_CTYPE)
    locale.setlocale(locale.LC_CTYPE, "")
    codeset = locale.nl_langinfo(locale.CODESET)
    locale.setlocale(locale.LC_CTYPE, saved)

    return codeset
86 86
# maps known-bogus codeset names to a callable yielding the real one
_encodingfixers = {
    '646': lambda: 'ascii',
    'ANSI_X3.4-1968': lambda: 'ascii',
    'mac-roman': _getpreferredencoding
}

try:
    encoding = environ.get("HGENCODING")
    if not encoding:
        # keep encoding a bytes variable on py3 as well
        encoding = locale.getpreferredencoding().encode('ascii') or 'ascii'
    encoding = _encodingfixers.get(encoding, lambda: encoding)()
except locale.Error:
    encoding = 'ascii'
encodingmode = environ.get("HGENCODINGMODE", "strict")
fallbackencoding = 'ISO-8859-1'
102 102
class localstr(str):
    '''A local-encoding string that remembers its original UTF-8 form.

    Instances behave like the locally-encoded text, while fromlocal()
    can recover the exact UTF-8 bytes via the cached _utf8 attribute.
    '''
    def __new__(cls, u, l):
        inst = str.__new__(cls, l)
        inst._utf8 = u
        return inst

    def __hash__(self):
        # avoid collisions in local string space
        return hash(self._utf8)
112 112
def tolocal(s):
    """Convert a string from internal UTF-8 to the local encoding.

    All internal strings should be UTF-8, but some repos predating locale
    support may contain latin1 or other character sets.  Decoding is
    attempted strictly as UTF-8, then as ISO-8859-1 (fallbackencoding),
    and as a last resort UTF-8 with replacement of unknown characters.

    When the local representation is lossy, a localstr is returned so the
    known UTF-8 form is cached alongside it, allowing a lossless
    round-trip back through fromlocal().
    """
    try:
        try:
            # make sure string is actually stored in UTF-8
            u = s.decode('UTF-8')
            if encoding == 'UTF-8':
                # fast path
                return s
            r = u.encode(_sysstr(encoding), u"replace")
            if u == r.decode(_sysstr(encoding)):
                # r is a safe, non-lossy encoding of s
                return r
            return localstr(s, r)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                u = s.decode(_sysstr(fallbackencoding))
                r = u.encode(_sysstr(encoding), u"replace")
                if u == r.decode(_sysstr(encoding)):
                    # r is a safe, non-lossy encoding of s
                    return r
                return localstr(u.encode('UTF-8'), r)
            except UnicodeDecodeError:
                u = s.decode("utf-8", "replace")  # last ditch
                # can't round-trip
                return u.encode(_sysstr(encoding), u"replace")
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
174 174
def fromlocal(s):
    """Convert a string from the local character encoding to UTF-8.

    Decoding honours the mode set by HGENCODINGMODE, defaulting to
    'strict' (unknown characters abort with an error message).  Other
    modes are 'replace', substituting a special Unicode character for
    unknown bytes, and 'ignore', which drops them.  localstr instances
    short-circuit to their cached UTF-8 form.
    """
    # can we do a lossless round-trip?
    if isinstance(s, localstr):
        return s._utf8

    try:
        u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
        return u.encode("utf-8")
    except UnicodeDecodeError as inst:
        sub = s[max(0, inst.start - 10):inst.start + 10]
        raise error.Abort("decoding near '%s': %s!" % (sub, inst))
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
198 198
if not _nativeenviron:
    # now encoding and helper functions are available, recreate the environ
    # dict to be exported to other modules
    environ = dict((tolocal(key.encode(u'utf-8')), tolocal(val.encode(u'utf-8')))
                   for key, val in os.environ.items())

# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
wide = (environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
        and "WFA" or "WF")
208 208
def colwidth(s):
    "Find the column width of a string for display in the local encoding"
    # decode leniently; width is a display estimate, not a strict parse
    return ucolwidth(s.decode(_sysstr(encoding), u'replace'))
212 212
def ucolwidth(d):
    "Find the column width of a Unicode string for display"
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        # no east-asian data available: assume one column per character
        return len(d)
    return sum([2 if eaw(ch) in wide else 1 for ch in d])
219 219
def getcols(s, start, c):
    '''Use colwidth to find a c-column substring of s starting at byte
    index start'''
    # grow the candidate byte slice until it spans exactly c columns
    for end in xrange(start + c, len(s)):
        candidate = s[start:end]
        if colwidth(candidate) == c:
            return candidate
227 227
def trim(s, width, ellipsis='', leftside=False):
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, the left side of string 's' is trimmed.
    'ellipsis' is always placed at the trimmed side.

    The string is measured in display columns when it decodes in the
    local encoding (multi-column East-Asian characters count as 2);
    otherwise trimming falls back to plain byte counts.
    """
    try:
        u = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        # binary/undecodable data: trim by bytes
        if len(s) <= width:  # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0:  # no enough room even for ellipsis
            return ellipsis[:width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width:  # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0:  # no enough room even for ellipsis
        return ellipsis[:width + len(ellipsis)]

    # drop one more character per iteration from the trimmed side until
    # the remainder fits in the available columns
    if leftside:
        uslice = lambda i: u[i:]
        concat = lambda tail: ellipsis + tail
    else:
        uslice = lambda i: u[:-i]
        concat = lambda head: head + ellipsis
    for cut in xrange(1, len(u)):
        usub = uslice(cut)
        if ucolwidth(usub) <= width:
            return concat(usub.encode(_sysstr(encoding)))
    return ellipsis  # no enough room for multi-column characters
320 320
def _asciilower(s):
    '''convert a string to lowercase if ASCII

    Raises UnicodeDecodeError if non-ASCII characters are found.'''
    # the decode is purely a validity check; its result is discarded
    s.decode('ascii')
    return s.lower()
327 327
def asciilower(s):
    # delay importing avoids cyclic dependency around "parsers" in
    # pure Python build (util => i18n => encoding => parsers => util)
    from . import parsers
    # self-replace with the C implementation when available so later
    # calls skip this lookup entirely
    impl = getattr(parsers, 'asciilower', _asciilower)
    global asciilower
    asciilower = impl
    return impl(s)
336 336
def _asciiupper(s):
    '''convert a string to uppercase if ASCII

    Raises UnicodeDecodeError if non-ASCII characters are found.'''
    # the decode is purely a validity check; its result is discarded
    s.decode('ascii')
    return s.upper()
343 343
def asciiupper(s):
    # delay importing avoids cyclic dependency around "parsers" in
    # pure Python build (util => i18n => encoding => parsers => util)
    from . import parsers
    # self-replace with the C implementation when available so later
    # calls skip this lookup entirely
    impl = getattr(parsers, 'asciiupper', _asciiupper)
    global asciiupper
    asciiupper = impl
    return impl(s)
352 352
def lower(s):
    "best-effort encoding-aware case-folding of local string s"
    try:
        return asciilower(s)
    except UnicodeDecodeError:
        pass  # non-ASCII: fall through to the Unicode-aware path
    try:
        if isinstance(s, localstr):
            uni = s._utf8.decode("utf-8")
        else:
            uni = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        folded = uni.lower()
        if uni == folded:
            return s  # preserve localstring
        return folded.encode(_sysstr(encoding))
    except UnicodeError:
        return s.lower()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
373 373
def upper(s):
    "best-effort encoding-aware case-folding of local string s"
    try:
        return asciiupper(s)
    except UnicodeDecodeError:
        # non-ASCII data: use the slower Unicode-aware fallback
        return upperfallback(s)
380 380
def upperfallback(s):
    # Unicode-aware uppercasing for strings asciiupper() cannot handle
    try:
        if isinstance(s, localstr):
            uni = s._utf8.decode("utf-8")
        else:
            uni = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        folded = uni.upper()
        if uni == folded:
            return s  # preserve localstring
        return folded.encode(_sysstr(encoding))
    except UnicodeError:
        return s.upper()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
396 396
class normcasespecs(object):
    '''what a platform's normcase does to ASCII strings

    This is specified per platform, and should be consistent with what
    normcase on that platform actually does.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h.'''
    lower = -1
    upper = 1
    other = 0
411 411
# byte -> JSON text translation table; index is the byte value
_jsonmap = []
_jsonmap.extend("\\u%04x" % x for x in range(32))      # control chars
_jsonmap.extend(chr(x) for x in range(32, 127))        # printable ASCII
_jsonmap.append('\\u007f')
# short-form escapes override the generic \uXXXX ones
_jsonmap[0x09] = '\\t'
_jsonmap[0x0a] = '\\n'
_jsonmap[0x22] = '\\"'
_jsonmap[0x5c] = '\\\\'
_jsonmap[0x08] = '\\b'
_jsonmap[0x0c] = '\\f'
_jsonmap[0x0d] = '\\r'
# the paranoid table is snapshot BEFORE the high bytes are added, so any
# byte >= 0x80 raises IndexError and forces the \uXXXX slow path
_paranoidjsonmap = _jsonmap[:]
_paranoidjsonmap[0x3c] = '\\u003c'  # '<' (e.g. escape "</script>")
_paranoidjsonmap[0x3e] = '\\u003e'  # '>'
_jsonmap.extend(chr(x) for x in range(128, 256))
427 427
def jsonescape(s, paranoid=False):
    '''returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    If paranoid, non-ascii and common troublesome characters are also
    escaped. This is suitable for web output.
    '''
    charmap = _paranoidjsonmap if paranoid else _jsonmap

    u8chars = toutf8b(s)
    try:
        # fast path: every byte has an entry in the translation table
        return ''.join(charmap[x] for x in bytearray(u8chars))
    except IndexError:
        pass
    # non-BMP char is represented as UTF-16 surrogate pair
    u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
    u16codes.pop(0)  # drop BOM
    return ''.join(charmap[x] if x < 128 else '\\u%04x' % x
                   for x in u16codes)
483 483
# UTF-8 sequence length indexed by the first byte's high nibble;
# 0 means single-byte ASCII, 8-b are continuation/invalid leads
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]

def getutf8char(s, pos):
    '''get the next full utf-8 character in the given string, starting at pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    '''
    # find how many bytes to attempt decoding from first nibble
    width = _utf8len[ord(s[pos]) >> 4]
    if not width:  # ascii
        return s[pos]

    ch = s[pos:pos + width]
    # validate with attempted decode
    ch.decode("utf-8")
    return ch
502 502
def toutf8b(s):
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for arbitrary
    byte strings. As Mercurial often doesn't know what encoding data is
    in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to the UTF-16 surrogate
    range, uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can have
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''
    if "\xed" not in s:
        # no existing surrogates: valid UTF-8 can pass through untouched
        if isinstance(s, localstr):
            return s._utf8
        try:
            s.decode('utf-8')
            return s
        except UnicodeDecodeError:
            pass

    out = ""
    pos = 0
    end = len(s)
    while pos < end:
        try:
            ch = getutf8char(s, pos)
            if "\xed\xb0\x80" <= ch <= "\xed\xb3\xbf":
                # have to re-escape existing U+DCxx characters
                ch = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
                pos += 1
            else:
                pos += len(ch)
        except UnicodeDecodeError:
            # invalid byte: map it into the U+DCxx surrogate range
            ch = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
            pos += 1
        out += ch
    return out
559 559
def fromutf8b(s):
    '''Given a UTF-8b string, return a local, possibly-binary string.

    return the original binary string. This
    is a round-trip process for strings like filenames, but metadata
    that's was passed through tolocal will remain in UTF-8.
    '''
    # fast path - look for uDxxx prefixes in s
    if "\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    out = ""
    pos = 0
    end = len(s)
    while pos < end:
        ch = getutf8char(s, pos)
        pos += len(ch)
        # unescape U+DCxx characters back to the raw byte they encode
        if "\xed\xb0\x80" <= ch <= "\xed\xb3\xbf":
            ch = chr(ord(ch.decode("utf-8")) & 0xff)
        out += ch
    return out
General Comments 0
You need to be logged in to leave comments. Login now