encoding: make sure the "wide" variable is never referenced from other modules...
Yuya Nishihara
r32537:044f3d7e default
@@ -1,593 +1,593 @@
1 1 # encoding.py - character transcoding support for Mercurial
2 2 #
3 3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 from __future__ import absolute_import
9 9
10 10 import array
11 11 import locale
12 12 import os
13 13 import unicodedata
14 14
15 15 from . import (
16 16 error,
17 17 policy,
18 18 pycompat,
19 19 )
20 20
21 21 _sysstr = pycompat.sysstr
22 22
23 23 if pycompat.ispy3:
24 24 unichr = chr
25 25
26 26 # These unicode characters are ignored by HFS+ (Apple Technote 1150,
27 27 # "Unicode Subtleties"), so we need to ignore them in some places for
28 28 # sanity.
29 29 _ignore = [unichr(int(x, 16)).encode("utf-8") for x in
30 30 "200c 200d 200e 200f 202a 202b 202c 202d 202e "
31 31 "206a 206b 206c 206d 206e 206f feff".split()]
32 32 # verify the next function will work
33 33 assert all(i.startswith(("\xe2", "\xef")) for i in _ignore)
34 34
35 35 def hfsignoreclean(s):
36 36 """Remove codepoints ignored by HFS+ from s.
37 37
38 38 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
39 39 '.hg'
40 40 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
41 41 '.hg'
42 42 """
43 43 if "\xe2" in s or "\xef" in s:
44 44 for c in _ignore:
45 45 s = s.replace(c, '')
46 46 return s
47 47
48 48 # encoding.environ is provided read-only and must not be used to modify
49 49 # the process environment
50 50 _nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ)
51 51 if not pycompat.ispy3:
52 52 environ = os.environ # re-exports
53 53 elif _nativeenviron:
54 54 environ = os.environb # re-exports
55 55 else:
56 56 # preferred encoding isn't known yet; use utf-8 to avoid unicode error
57 57 # and recreate it once encoding is settled
58 58 environ = dict((k.encode(u'utf-8'), v.encode(u'utf-8'))
59 59 for k, v in os.environ.items()) # re-exports
60 60
61 61 _encodingfixers = {
62 62 '646': lambda: 'ascii',
63 63 'ANSI_X3.4-1968': lambda: 'ascii',
64 64 }
65 65
66 66 try:
67 67 encoding = environ.get("HGENCODING")
68 68 if not encoding:
69 69 encoding = locale.getpreferredencoding().encode('ascii') or 'ascii'
70 70 encoding = _encodingfixers.get(encoding, lambda: encoding)()
71 71 except locale.Error:
72 72 encoding = 'ascii'
73 73 encodingmode = environ.get("HGENCODINGMODE", "strict")
74 74 fallbackencoding = 'ISO-8859-1'
75 75
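# Editor's note (not part of the original source): with HGENCODING unset
# under a POSIX/C locale, locale.getpreferredencoding() typically reports
# 'ANSI_X3.4-1968', which the _encodingfixers table above normalizes to
# 'ascii'.
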
76 76 class localstr(str):
77 77 '''This class allows strings that are unmodified to be
78 78 round-tripped to the local encoding and back'''
79 79 def __new__(cls, u, l):
80 80 s = str.__new__(cls, l)
81 81 s._utf8 = u
82 82 return s
83 83 def __hash__(self):
84 84 return hash(self._utf8) # avoid collisions in local string space
85 85
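# Minimal sketch (editor's addition; the helper name is illustrative only):
# localstr carries both representations of one string.
def _localstr_sketch():
    ls = localstr('caf\xc3\xa9', 'caf?')  # (known UTF-8 form, lossy local form)
    assert ls == 'caf?'                   # behaves as the local byte string
    assert ls._utf8 == 'caf\xc3\xa9'      # but remembers the exact UTF-8
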
86 86 def tolocal(s):
87 87 """
88 88 Convert a string from internal UTF-8 to local encoding
89 89
90 90 All internal strings should be UTF-8 but some repos before the
91 91 implementation of locale support may contain latin1 or possibly
92 92 other character sets. We attempt to decode everything strictly
93 93 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
94 94 replace unknown characters.
95 95
96 96 The localstr class is used to cache the known UTF-8 encoding of
97 97 strings next to their local representation to allow lossless
98 98 round-trip conversion back to UTF-8.
99 99
100 100 >>> u = 'foo: \\xc3\\xa4' # utf-8
101 101 >>> l = tolocal(u)
102 102 >>> l
103 103 'foo: ?'
104 104 >>> fromlocal(l)
105 105 'foo: \\xc3\\xa4'
106 106 >>> u2 = 'foo: \\xc3\\xa1'
107 107 >>> d = { l: 1, tolocal(u2): 2 }
108 108 >>> len(d) # no collision
109 109 2
110 110 >>> 'foo: ?' in d
111 111 False
112 112 >>> l1 = 'foo: \\xe4' # historical latin1 fallback
113 113 >>> l = tolocal(l1)
114 114 >>> l
115 115 'foo: ?'
116 116 >>> fromlocal(l) # magically in utf-8
117 117 'foo: \\xc3\\xa4'
118 118 """
119 119
120 120 try:
121 121 try:
122 122 # make sure string is actually stored in UTF-8
123 123 u = s.decode('UTF-8')
124 124 if encoding == 'UTF-8':
125 125 # fast path
126 126 return s
127 127 r = u.encode(_sysstr(encoding), u"replace")
128 128 if u == r.decode(_sysstr(encoding)):
129 129 # r is a safe, non-lossy encoding of s
130 130 return r
131 131 return localstr(s, r)
132 132 except UnicodeDecodeError:
133 133 # we should only get here if we're looking at an ancient changeset
134 134 try:
135 135 u = s.decode(_sysstr(fallbackencoding))
136 136 r = u.encode(_sysstr(encoding), u"replace")
137 137 if u == r.decode(_sysstr(encoding)):
138 138 # r is a safe, non-lossy encoding of s
139 139 return r
140 140 return localstr(u.encode('UTF-8'), r)
141 141 except UnicodeDecodeError:
142 142 u = s.decode("utf-8", "replace") # last ditch
143 143 # can't round-trip
144 144 return u.encode(_sysstr(encoding), u"replace")
145 145 except LookupError as k:
146 146 raise error.Abort(k, hint="please check your locale settings")
147 147
148 148 def fromlocal(s):
149 149 """
150 150 Convert a string from the local character encoding to UTF-8
151 151
152 152 We attempt to decode strings using the encoding mode set by
153 153 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
154 154 characters will cause an error message. Other modes include
155 155 'replace', which replaces unknown characters with a special
156 156 Unicode character, and 'ignore', which drops the character.
157 157 """
158 158
159 159 # can we do a lossless round-trip?
160 160 if isinstance(s, localstr):
161 161 return s._utf8
162 162
163 163 try:
164 164 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
165 165 return u.encode("utf-8")
166 166 except UnicodeDecodeError as inst:
167 167 sub = s[max(0, inst.start - 10):inst.start + 10]
168 168 raise error.Abort("decoding near '%s': %s!" % (sub, inst))
169 169 except LookupError as k:
170 170 raise error.Abort(k, hint="please check your locale settings")
171 171
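# Hedged sketch (editor's addition): the lossless round-trip described above,
# assuming encoding is 'ascii' so the accented byte has no local equivalent.
def _roundtrip_sketch():
    l = tolocal('caf\xc3\xa9')            # lossy under ascii -> a localstr
    assert l == 'caf?'                    # unknown character was replaced
    assert fromlocal(l) == 'caf\xc3\xa9'  # cached UTF-8 recovered losslessly
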
172 172 def unitolocal(u):
173 173 """Convert a unicode string to a byte string of local encoding"""
174 174 return tolocal(u.encode('utf-8'))
175 175
176 176 def unifromlocal(s):
177 177 """Convert a byte string of local encoding to a unicode string"""
178 178 return fromlocal(s).decode('utf-8')
179 179
180 180 # converter functions between native str and byte string. use these if the
181 181 # character encoding is not known (e.g. exception messages) or is known to
182 182 # be locale dependent (e.g. date formatting).
183 183 if pycompat.ispy3:
184 184 strtolocal = unitolocal
185 185 strfromlocal = unifromlocal
186 186 else:
187 187 strtolocal = pycompat.identity
188 188 strfromlocal = pycompat.identity
189 189
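# Editor's note: on Python 2 the native str type is already a byte string,
# so these can be the identity; on Python 3 native str is unicode and must
# go through the unitolocal/unifromlocal converters above.
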
190 190 if not _nativeenviron:
191 191 # now encoding and helper functions are available, recreate the environ
192 192 # dict to be exported to other modules
193 193 environ = dict((tolocal(k.encode(u'utf-8')), tolocal(v.encode(u'utf-8')))
194 194 for k, v in os.environ.items()) # re-exports
195 195
196 196 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
197 wide = _sysstr(environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
197 _wide = _sysstr(environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
198 198 and "WFA" or "WF")
199 199
200 200 def colwidth(s):
201 201 "Find the column width of a string for display in the local encoding"
202 202 return ucolwidth(s.decode(_sysstr(encoding), u'replace'))
203 203
204 204 def ucolwidth(d):
205 205 "Find the column width of a Unicode string for display"
206 206 eaw = getattr(unicodedata, 'east_asian_width', None)
207 207 if eaw is not None:
208 return sum([eaw(c) in wide and 2 or 1 for c in d])
208 return sum([eaw(c) in _wide and 2 or 1 for c in d])
209 209 return len(d)
210 210
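# Sketch (editor's addition): column accounting for East Asian Wide
# characters, assuming HGENCODINGAMBIGUOUS is unset so _wide is "WF".
def _ucolwidth_sketch():
    assert ucolwidth(u'abc') == 3     # narrow characters: one column each
    assert ucolwidth(u'\u3042') == 2  # HIRAGANA LETTER A is Wide
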
211 211 def getcols(s, start, c):
212 212 '''Use colwidth to find a c-column substring of s starting at byte
213 213 index start'''
214 214 for x in xrange(start + c, len(s)):
215 215 t = s[start:x]
216 216 if colwidth(t) == c:
217 217 return t
218 218
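# Sketch (editor's addition): getcols widens the candidate substring until
# its display width matches; this assumes an ASCII locale.
def _getcols_sketch():
    assert getcols('abcdef', 1, 3) == 'bcd'  # 3 columns starting at byte 1
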
219 219 def trim(s, width, ellipsis='', leftside=False):
220 220 """Trim string 's' to at most 'width' columns (including 'ellipsis').
221 221
222 222 If 'leftside' is True, left side of string 's' is trimmed.
223 223 'ellipsis' is always placed at trimmed side.
224 224
225 225 >>> ellipsis = '+++'
226 226 >>> from . import encoding
227 227 >>> encoding.encoding = 'utf-8'
228 228 >>> t = '1234567890'
229 229 >>> print trim(t, 12, ellipsis=ellipsis)
230 230 1234567890
231 231 >>> print trim(t, 10, ellipsis=ellipsis)
232 232 1234567890
233 233 >>> print trim(t, 8, ellipsis=ellipsis)
234 234 12345+++
235 235 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
236 236 +++67890
237 237 >>> print trim(t, 8)
238 238 12345678
239 239 >>> print trim(t, 8, leftside=True)
240 240 34567890
241 241 >>> print trim(t, 3, ellipsis=ellipsis)
242 242 +++
243 243 >>> print trim(t, 1, ellipsis=ellipsis)
244 244 +
245 245 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
246 246 >>> t = u.encode(encoding.encoding)
247 247 >>> print trim(t, 12, ellipsis=ellipsis)
248 248 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
249 249 >>> print trim(t, 10, ellipsis=ellipsis)
250 250 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
251 251 >>> print trim(t, 8, ellipsis=ellipsis)
252 252 \xe3\x81\x82\xe3\x81\x84+++
253 253 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
254 254 +++\xe3\x81\x88\xe3\x81\x8a
255 255 >>> print trim(t, 5)
256 256 \xe3\x81\x82\xe3\x81\x84
257 257 >>> print trim(t, 5, leftside=True)
258 258 \xe3\x81\x88\xe3\x81\x8a
259 259 >>> print trim(t, 4, ellipsis=ellipsis)
260 260 +++
261 261 >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
262 262 +++
263 263 >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
264 264 >>> print trim(t, 12, ellipsis=ellipsis)
265 265 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
266 266 >>> print trim(t, 10, ellipsis=ellipsis)
267 267 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
268 268 >>> print trim(t, 8, ellipsis=ellipsis)
269 269 \x11\x22\x33\x44\x55+++
270 270 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
271 271 +++\x66\x77\x88\x99\xaa
272 272 >>> print trim(t, 8)
273 273 \x11\x22\x33\x44\x55\x66\x77\x88
274 274 >>> print trim(t, 8, leftside=True)
275 275 \x33\x44\x55\x66\x77\x88\x99\xaa
276 276 >>> print trim(t, 3, ellipsis=ellipsis)
277 277 +++
278 278 >>> print trim(t, 1, ellipsis=ellipsis)
279 279 +
280 280 """
281 281 try:
282 282 u = s.decode(_sysstr(encoding))
283 283 except UnicodeDecodeError:
284 284 if len(s) <= width: # trimming is not needed
285 285 return s
286 286 width -= len(ellipsis)
287 287 if width <= 0: # not enough room even for ellipsis
288 288 return ellipsis[:width + len(ellipsis)]
289 289 if leftside:
290 290 return ellipsis + s[-width:]
291 291 return s[:width] + ellipsis
292 292
293 293 if ucolwidth(u) <= width: # trimming is not needed
294 294 return s
295 295
296 296 width -= len(ellipsis)
297 297 if width <= 0: # not enough room even for ellipsis
298 298 return ellipsis[:width + len(ellipsis)]
299 299
300 300 if leftside:
301 301 uslice = lambda i: u[i:]
302 302 concat = lambda s: ellipsis + s
303 303 else:
304 304 uslice = lambda i: u[:-i]
305 305 concat = lambda s: s + ellipsis
306 306 for i in xrange(1, len(u)):
307 307 usub = uslice(i)
308 308 if ucolwidth(usub) <= width:
309 309 return concat(usub.encode(_sysstr(encoding)))
310 310 return ellipsis # not enough room for multi-column characters
311 311
312 312 def _asciilower(s):
313 313 '''convert a string to lowercase if ASCII
314 314
315 315 Raises UnicodeDecodeError if non-ASCII characters are found.'''
316 316 s.decode('ascii')
317 317 return s.lower()
318 318
319 319 def asciilower(s):
320 320 # delay importing avoids cyclic dependency around "parsers" in
321 321 # pure Python build (util => i18n => encoding => parsers => util)
322 322 parsers = policy.importmod(r'parsers')
323 323 impl = getattr(parsers, 'asciilower', _asciilower)
324 324 global asciilower
325 325 asciilower = impl
326 326 return impl(s)
327 327
328 328 def _asciiupper(s):
329 329 '''convert a string to uppercase if ASCII
330 330
331 331 Raises UnicodeDecodeError if non-ASCII characters are found.'''
332 332 s.decode('ascii')
333 333 return s.upper()
334 334
335 335 def asciiupper(s):
336 336 # delay importing avoids cyclic dependency around "parsers" in
337 337 # pure Python build (util => i18n => encoding => parsers => util)
338 338 parsers = policy.importmod(r'parsers')
339 339 impl = getattr(parsers, 'asciiupper', _asciiupper)
340 340 global asciiupper
341 341 asciiupper = impl
342 342 return impl(s)
343 343
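# Editor's note: asciilower/asciiupper rebind their own global names to the
# resolved implementation on first use, so policy.importmod() runs only once
# and later calls go straight to the C or pure-Python version.
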
344 344 def lower(s):
345 345 "best-effort encoding-aware case-folding of local string s"
346 346 try:
347 347 return asciilower(s)
348 348 except UnicodeDecodeError:
349 349 pass
350 350 try:
351 351 if isinstance(s, localstr):
352 352 u = s._utf8.decode("utf-8")
353 353 else:
354 354 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
355 355
356 356 lu = u.lower()
357 357 if u == lu:
358 358 return s # preserve localstring
359 359 return lu.encode(_sysstr(encoding))
360 360 except UnicodeError:
361 361 return s.lower() # we don't know how to fold this except in ASCII
362 362 except LookupError as k:
363 363 raise error.Abort(k, hint="please check your locale settings")
364 364
365 365 def upper(s):
366 366 "best-effort encoding-aware case-folding of local string s"
367 367 try:
368 368 return asciiupper(s)
369 369 except UnicodeDecodeError:
370 370 return upperfallback(s)
371 371
372 372 def upperfallback(s):
373 373 try:
374 374 if isinstance(s, localstr):
375 375 u = s._utf8.decode("utf-8")
376 376 else:
377 377 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
378 378
379 379 uu = u.upper()
380 380 if u == uu:
381 381 return s # preserve localstring
382 382 return uu.encode(_sysstr(encoding))
383 383 except UnicodeError:
384 384 return s.upper() # we don't know how to fold this except in ASCII
385 385 except LookupError as k:
386 386 raise error.Abort(k, hint="please check your locale settings")
387 387
388 388 class normcasespecs(object):
389 389 '''what a platform's normcase does to ASCII strings
390 390
391 391 This is specified per platform, and should be consistent with what normcase
392 392 on that platform actually does.
393 393
394 394 lower: normcase lowercases ASCII strings
395 395 upper: normcase uppercases ASCII strings
396 396 other: the fallback function should always be called
397 397
398 398 This should be kept in sync with normcase_spec in util.h.'''
399 399 lower = -1
400 400 upper = 1
401 401 other = 0
402 402
403 403 _jsonmap = []
404 404 _jsonmap.extend("\\u%04x" % x for x in range(32))
405 405 _jsonmap.extend(pycompat.bytechr(x) for x in range(32, 127))
406 406 _jsonmap.append('\\u007f')
407 407 _jsonmap[0x09] = '\\t'
408 408 _jsonmap[0x0a] = '\\n'
409 409 _jsonmap[0x22] = '\\"'
410 410 _jsonmap[0x5c] = '\\\\'
411 411 _jsonmap[0x08] = '\\b'
412 412 _jsonmap[0x0c] = '\\f'
413 413 _jsonmap[0x0d] = '\\r'
414 414 _paranoidjsonmap = _jsonmap[:]
415 415 _paranoidjsonmap[0x3c] = '\\u003c' # '<' (e.g. escape "</script>")
416 416 _paranoidjsonmap[0x3e] = '\\u003e' # '>'
417 417 _jsonmap.extend(pycompat.bytechr(x) for x in range(128, 256))
418 418
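# Quick sketch (editor's addition): each byte value indexes its JSON form.
def _jsonmap_sketch():
    assert _jsonmap[0x0a] == '\\n'              # short escape for newline
    assert _jsonmap[0x41] == 'A'                # printable ASCII unchanged
    assert _paranoidjsonmap[0x3c] == '\\u003c'  # '<' escaped for web output
    assert len(_paranoidjsonmap) == 128         # copied before bytes 128-255
                                                # were appended to _jsonmap
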
419 419 def jsonescape(s, paranoid=False):
420 420 '''returns a string suitable for JSON
421 421
422 422 JSON is problematic for us because it doesn't support non-Unicode
423 423 bytes. To deal with this, we take the following approach:
424 424
425 425 - localstr objects are converted back to UTF-8
426 426 - valid UTF-8/ASCII strings are passed as-is
427 427 - other strings are converted to UTF-8b surrogate encoding
428 428 - apply JSON-specified string escaping
429 429
430 430 (escapes are doubled in these tests)
431 431
432 432 >>> jsonescape('this is a test')
433 433 'this is a test'
434 434 >>> jsonescape('escape characters: \\0 \\x0b \\x7f')
435 435 'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
436 436 >>> jsonescape('escape characters: \\t \\n \\r \\" \\\\')
437 437 'escape characters: \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
438 438 >>> jsonescape('a weird byte: \\xdd')
439 439 'a weird byte: \\xed\\xb3\\x9d'
440 440 >>> jsonescape('utf-8: caf\\xc3\\xa9')
441 441 'utf-8: caf\\xc3\\xa9'
442 442 >>> jsonescape('')
443 443 ''
444 444
445 445 If paranoid, non-ascii and common troublesome characters are also escaped.
446 446 This is suitable for web output.
447 447
448 448 >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
449 449 'escape boundary: ~ \\\\u007f \\\\u0080'
450 450 >>> jsonescape('a weird byte: \\xdd', paranoid=True)
451 451 'a weird byte: \\\\udcdd'
452 452 >>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True)
453 453 'utf-8: caf\\\\u00e9'
454 454 >>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
455 455 'non-BMP: \\\\ud834\\\\udd1e'
456 456 >>> jsonescape('<foo@example.org>', paranoid=True)
457 457 '\\\\u003cfoo@example.org\\\\u003e'
458 458 '''
459 459
460 460 if paranoid:
461 461 jm = _paranoidjsonmap
462 462 else:
463 463 jm = _jsonmap
464 464
465 465 u8chars = toutf8b(s)
466 466 try:
467 467 return ''.join(jm[x] for x in bytearray(u8chars)) # fast path
468 468 except IndexError:
469 469 pass
470 470 # non-BMP char is represented as UTF-16 surrogate pair
471 471 u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
472 472 u16codes.pop(0) # drop BOM
473 473 return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)
474 474
475 475 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
476 476
477 477 def getutf8char(s, pos):
478 478 '''get the next full utf-8 character in the given string, starting at pos
479 479
480 480 Raises a UnicodeError if the given location does not start a valid
481 481 utf-8 character.
482 482 '''
483 483
484 484 # find how many bytes to attempt decoding from first nibble
485 485 l = _utf8len[ord(s[pos]) >> 4]
486 486 if not l: # ascii
487 487 return s[pos]
488 488
489 489 c = s[pos:pos + l]
490 490 # validate with attempted decode
491 491 c.decode("utf-8")
492 492 return c
493 493
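# Sketch (editor's addition): the high nibble of the first byte selects the
# sequence length via _utf8len (0xc_ -> 2 bytes, 0xe_ -> 3, 0xf_ -> 4).
def _getutf8char_sketch():
    s = 'a\xc3\xa9'                         # 'a' followed by UTF-8 U+00E9
    assert getutf8char(s, 0) == 'a'         # ASCII fast path
    assert getutf8char(s, 1) == '\xc3\xa9'  # two-byte sequence
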
494 494 def toutf8b(s):
495 495 '''convert a local, possibly-binary string into UTF-8b
496 496
497 497 This is intended as a generic method to preserve data when working
498 498 with schemes like JSON and XML that have no provision for
499 499 arbitrary byte strings. As Mercurial often doesn't know
500 500 what encoding data is in, we use so-called UTF-8b.
501 501
502 502 If a string is already valid UTF-8 (or ASCII), it passes unmodified.
503 503 Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
504 504 uDC00-uDCFF.
505 505
506 506 Principles of operation:
507 507
508 508 - ASCII and UTF-8 data successfully round-trips and is understood
509 509 by Unicode-oriented clients
510 510 - filenames and file contents in arbitrary other encodings can
511 511 be round-tripped or recovered by clueful clients
512 512 - local strings that have a cached known UTF-8 encoding (aka
513 513 localstr) get sent as UTF-8 so Unicode-oriented clients get the
514 514 Unicode data they want
515 515 - because we must preserve UTF-8 bytestring in places such as
516 516 filenames, metadata can't be roundtripped without help
517 517
518 518 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
519 519 arbitrary bytes into an internal Unicode format that can be
520 520 re-encoded back into the original. Here we are exposing the
521 521 internal surrogate encoding as a UTF-8 string.)
522 522 '''
523 523
524 524 if "\xed" not in s:
525 525 if isinstance(s, localstr):
526 526 return s._utf8
527 527 try:
528 528 s.decode('utf-8')
529 529 return s
530 530 except UnicodeDecodeError:
531 531 pass
532 532
533 533 r = ""
534 534 pos = 0
535 535 l = len(s)
536 536 while pos < l:
537 537 try:
538 538 c = getutf8char(s, pos)
539 539 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
540 540 # have to re-escape existing U+DCxx characters
541 541 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
542 542 pos += 1
543 543 else:
544 544 pos += len(c)
545 545 except UnicodeDecodeError:
546 546 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
547 547 pos += 1
548 548 r += c
549 549 return r
550 550
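# Sketch (editor's addition, Python 2 semantics as used by this module):
# a byte that is invalid UTF-8 lands in the U+DCxx surrogate range.
def _toutf8b_sketch():
    assert toutf8b('\xff') == '\xed\xb3\xbf'    # 0xdc00 + 0xff = U+DCFF
    assert fromutf8b('\xed\xb3\xbf') == '\xff'  # inverse, defined just below
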
551 551 def fromutf8b(s):
552 552 '''Given a UTF-8b string, return a local, possibly-binary string.
553 553
554 554 We return the original binary string. This
555 555 is a round-trip process for strings like filenames, but metadata
556 556 that was passed through tolocal will remain in UTF-8.
557 557
558 558 >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
559 559 >>> m = "\\xc3\\xa9\\x99abcd"
560 560 >>> toutf8b(m)
561 561 '\\xc3\\xa9\\xed\\xb2\\x99abcd'
562 562 >>> roundtrip(m)
563 563 True
564 564 >>> roundtrip("\\xc2\\xc2\\x80")
565 565 True
566 566 >>> roundtrip("\\xef\\xbf\\xbd")
567 567 True
568 568 >>> roundtrip("\\xef\\xef\\xbf\\xbd")
569 569 True
570 570 >>> roundtrip("\\xf1\\x80\\x80\\x80\\x80")
571 571 True
572 572 '''
573 573
574 574 # fast path - look for uDxxx prefixes in s
575 575 if "\xed" not in s:
576 576 return s
577 577
578 578 # We could do this with the unicode type but some Python builds
579 579 # use UTF-16 internally (issue5031) which causes non-BMP code
580 580 # points to be escaped. Instead, we use our handy getutf8char
581 581 # helper again to walk the string without "decoding" it.
582 582
583 583 r = ""
584 584 pos = 0
585 585 l = len(s)
586 586 while pos < l:
587 587 c = getutf8char(s, pos)
588 588 pos += len(c)
589 589 # unescape U+DCxx characters
590 590 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
591 591 c = chr(ord(c.decode("utf-8")) & 0xff)
592 592 r += c
593 593 return r