encoding: use i.startswith() instead of i[0] to eliminate py2/3 divergence
Yuya Nishihara
r32299:7040f513 default
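The change below collapses a version-branched pair of asserts into one. The divergence it removes: indexing a byte string yields a one-character string on Python 2 but an integer on Python 3, so checking i[0] required an "if pycompat.ispy3" branch; startswith() accepts a tuple of prefixes and behaves identically on both versions. A minimal standalone sketch of the difference (not part of the patch; b'' literals used for clarity):

    s = b'\xe2\x80\x8c'  # U+200C (zero-width non-joiner) encoded as UTF-8

    # Indexing diverges between major versions:
    #   Python 2: s[0] == '\xe2'  (a length-1 str)
    #   Python 3: s[0] == 226     (an int)
    print(repr(s[0]))

    # startswith() returns the same result on both versions, so the new
    # assert needs no version branch.
    print(s.startswith((b'\xe2', b'\xef')))  # True on py2 and py3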
@@ -1,595 +1,592 @@
1 1 # encoding.py - character transcoding support for Mercurial
2 2 #
3 3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 from __future__ import absolute_import
9 9
10 10 import array
11 11 import locale
12 12 import os
13 13 import unicodedata
14 14
15 15 from . import (
16 16 error,
17 17 pycompat,
18 18 )
19 19
20 20 _sysstr = pycompat.sysstr
21 21
22 22 if pycompat.ispy3:
23 23 unichr = chr
24 24
25 25 # These unicode characters are ignored by HFS+ (Apple Technote 1150,
26 26 # "Unicode Subtleties"), so we need to ignore them in some places for
27 27 # sanity.
28 28 _ignore = [unichr(int(x, 16)).encode("utf-8") for x in
29 29 "200c 200d 200e 200f 202a 202b 202c 202d 202e "
30 30 "206a 206b 206c 206d 206e 206f feff".split()]
31 31 # verify the next function will work
32 if pycompat.ispy3:
33 assert set(i[0] for i in _ignore) == {ord(b'\xe2'), ord(b'\xef')}
34 else:
35 assert set(i[0] for i in _ignore) == {"\xe2", "\xef"}
32 assert all(i.startswith(("\xe2", "\xef")) for i in _ignore)
36 33
37 34 def hfsignoreclean(s):
38 35 """Remove codepoints ignored by HFS+ from s.
39 36
40 37 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
41 38 '.hg'
42 39 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
43 40 '.hg'
44 41 """
45 42 if "\xe2" in s or "\xef" in s:
46 43 for c in _ignore:
47 44 s = s.replace(c, '')
48 45 return s
49 46
50 47 # encoding.environ is provided read-only and may not be used to modify
51 48 # the process environment
52 49 _nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ)
53 50 if not pycompat.ispy3:
54 51 environ = os.environ # re-exports
55 52 elif _nativeenviron:
56 53 environ = os.environb # re-exports
57 54 else:
58 55 # preferred encoding isn't known yet; use utf-8 to avoid unicode error
59 56 # and recreate it once encoding is settled
60 57 environ = dict((k.encode(u'utf-8'), v.encode(u'utf-8'))
61 58 for k, v in os.environ.items()) # re-exports
62 59
63 60 _encodingfixers = {
64 61 '646': lambda: 'ascii',
65 62 'ANSI_X3.4-1968': lambda: 'ascii',
66 63 }
67 64
68 65 try:
69 66 encoding = environ.get("HGENCODING")
70 67 if not encoding:
71 68 encoding = locale.getpreferredencoding().encode('ascii') or 'ascii'
72 69 encoding = _encodingfixers.get(encoding, lambda: encoding)()
73 70 except locale.Error:
74 71 encoding = 'ascii'
75 72 encodingmode = environ.get("HGENCODINGMODE", "strict")
76 73 fallbackencoding = 'ISO-8859-1'
77 74
78 75 class localstr(str):
79 76 '''This class allows strings that are unmodified to be
80 77 round-tripped to the local encoding and back'''
81 78 def __new__(cls, u, l):
82 79 s = str.__new__(cls, l)
83 80 s._utf8 = u
84 81 return s
85 82 def __hash__(self):
86 83 return hash(self._utf8) # avoid collisions in local string space
87 84
88 85 def tolocal(s):
89 86 """
90 87 Convert a string from internal UTF-8 to local encoding
91 88
92 89 All internal strings should be UTF-8 but some repos before the
93 90 implementation of locale support may contain latin1 or possibly
94 91 other character sets. We attempt to decode everything strictly
95 92 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
96 93 replace unknown characters.
97 94
98 95 The localstr class is used to cache the known UTF-8 encoding of
99 96 strings next to their local representation to allow lossless
100 97 round-trip conversion back to UTF-8.
101 98
102 99 >>> u = 'foo: \\xc3\\xa4' # utf-8
103 100 >>> l = tolocal(u)
104 101 >>> l
105 102 'foo: ?'
106 103 >>> fromlocal(l)
107 104 'foo: \\xc3\\xa4'
108 105 >>> u2 = 'foo: \\xc3\\xa1'
109 106 >>> d = { l: 1, tolocal(u2): 2 }
110 107 >>> len(d) # no collision
111 108 2
112 109 >>> 'foo: ?' in d
113 110 False
114 111 >>> l1 = 'foo: \\xe4' # historical latin1 fallback
115 112 >>> l = tolocal(l1)
116 113 >>> l
117 114 'foo: ?'
118 115 >>> fromlocal(l) # magically in utf-8
119 116 'foo: \\xc3\\xa4'
120 117 """
121 118
122 119 try:
123 120 try:
124 121 # make sure string is actually stored in UTF-8
125 122 u = s.decode('UTF-8')
126 123 if encoding == 'UTF-8':
127 124 # fast path
128 125 return s
129 126 r = u.encode(_sysstr(encoding), u"replace")
130 127 if u == r.decode(_sysstr(encoding)):
131 128 # r is a safe, non-lossy encoding of s
132 129 return r
133 130 return localstr(s, r)
134 131 except UnicodeDecodeError:
135 132 # we should only get here if we're looking at an ancient changeset
136 133 try:
137 134 u = s.decode(_sysstr(fallbackencoding))
138 135 r = u.encode(_sysstr(encoding), u"replace")
139 136 if u == r.decode(_sysstr(encoding)):
140 137 # r is a safe, non-lossy encoding of s
141 138 return r
142 139 return localstr(u.encode('UTF-8'), r)
143 140 except UnicodeDecodeError:
144 141 u = s.decode("utf-8", "replace") # last ditch
145 142 # can't round-trip
146 143 return u.encode(_sysstr(encoding), u"replace")
147 144 except LookupError as k:
148 145 raise error.Abort(k, hint="please check your locale settings")
149 146
150 147 def fromlocal(s):
151 148 """
152 149 Convert a string from the local character encoding to UTF-8
153 150
154 151 We attempt to decode strings using the encoding mode set by
155 152 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
156 153 characters will cause an error message. Other modes include
157 154 'replace', which replaces unknown characters with a special
158 155 Unicode character, and 'ignore', which drops the character.
159 156 """
160 157
161 158 # can we do a lossless round-trip?
162 159 if isinstance(s, localstr):
163 160 return s._utf8
164 161
165 162 try:
166 163 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
167 164 return u.encode("utf-8")
168 165 except UnicodeDecodeError as inst:
169 166 sub = s[max(0, inst.start - 10):inst.start + 10]
170 167 raise error.Abort("decoding near '%s': %s!" % (sub, inst))
171 168 except LookupError as k:
172 169 raise error.Abort(k, hint="please check your locale settings")
173 170
174 171 def unitolocal(u):
175 172 """Convert a unicode string to a byte string of local encoding"""
176 173 return tolocal(u.encode('utf-8'))
177 174
178 175 def unifromlocal(s):
179 176 """Convert a byte string of local encoding to a unicode string"""
180 177 return fromlocal(s).decode('utf-8')
181 178
182 179 # converter functions between native str and byte string. use these if the
183 180 # character encoding is not known (e.g. exception message) or is known to
184 181 # be locale dependent (e.g. date formatting).
185 182 if pycompat.ispy3:
186 183 strtolocal = unitolocal
187 184 strfromlocal = unifromlocal
188 185 else:
189 186 strtolocal = pycompat.identity
190 187 strfromlocal = pycompat.identity
191 188
192 189 if not _nativeenviron:
193 190 # now encoding and helper functions are available, recreate the environ
194 191 # dict to be exported to other modules
195 192 environ = dict((tolocal(k.encode(u'utf-8')), tolocal(v.encode(u'utf-8')))
196 193 for k, v in os.environ.items()) # re-exports
197 194
198 195 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
199 196 wide = (environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
200 197 and "WFA" or "WF")
201 198
202 199 def colwidth(s):
203 200 "Find the column width of a string for display in the local encoding"
204 201 return ucolwidth(s.decode(_sysstr(encoding), u'replace'))
205 202
206 203 def ucolwidth(d):
207 204 "Find the column width of a Unicode string for display"
208 205 eaw = getattr(unicodedata, 'east_asian_width', None)
209 206 if eaw is not None:
210 207 return sum([eaw(c) in wide and 2 or 1 for c in d])
211 208 return len(d)
212 209
213 210 def getcols(s, start, c):
214 211 '''Use colwidth to find a c-column substring of s starting at byte
215 212 index start'''
216 213 for x in xrange(start + c, len(s)):
217 214 t = s[start:x]
218 215 if colwidth(t) == c:
219 216 return t
220 217
221 218 def trim(s, width, ellipsis='', leftside=False):
222 219 """Trim string 's' to at most 'width' columns (including 'ellipsis').
223 220
224 221 If 'leftside' is True, left side of string 's' is trimmed.
226 223 'ellipsis' is always placed at the trimmed side.
226 223
227 224 >>> ellipsis = '+++'
228 225 >>> from . import encoding
229 226 >>> encoding.encoding = 'utf-8'
231 228 >>> t = '1234567890'
231 228 >>> print trim(t, 12, ellipsis=ellipsis)
232 229 1234567890
233 230 >>> print trim(t, 10, ellipsis=ellipsis)
234 231 1234567890
235 232 >>> print trim(t, 8, ellipsis=ellipsis)
236 233 12345+++
237 234 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
238 235 +++67890
239 236 >>> print trim(t, 8)
240 237 12345678
241 238 >>> print trim(t, 8, leftside=True)
242 239 34567890
243 240 >>> print trim(t, 3, ellipsis=ellipsis)
244 241 +++
245 242 >>> print trim(t, 1, ellipsis=ellipsis)
246 243 +
247 244 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
248 245 >>> t = u.encode(encoding.encoding)
249 246 >>> print trim(t, 12, ellipsis=ellipsis)
250 247 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
251 248 >>> print trim(t, 10, ellipsis=ellipsis)
252 249 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
253 250 >>> print trim(t, 8, ellipsis=ellipsis)
254 251 \xe3\x81\x82\xe3\x81\x84+++
255 252 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
256 253 +++\xe3\x81\x88\xe3\x81\x8a
257 254 >>> print trim(t, 5)
258 255 \xe3\x81\x82\xe3\x81\x84
259 256 >>> print trim(t, 5, leftside=True)
260 257 \xe3\x81\x88\xe3\x81\x8a
261 258 >>> print trim(t, 4, ellipsis=ellipsis)
262 259 +++
263 260 >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
264 261 +++
265 262 >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
266 263 >>> print trim(t, 12, ellipsis=ellipsis)
267 264 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
268 265 >>> print trim(t, 10, ellipsis=ellipsis)
269 266 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
270 267 >>> print trim(t, 8, ellipsis=ellipsis)
271 268 \x11\x22\x33\x44\x55+++
272 269 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
273 270 +++\x66\x77\x88\x99\xaa
274 271 >>> print trim(t, 8)
275 272 \x11\x22\x33\x44\x55\x66\x77\x88
276 273 >>> print trim(t, 8, leftside=True)
277 274 \x33\x44\x55\x66\x77\x88\x99\xaa
278 275 >>> print trim(t, 3, ellipsis=ellipsis)
279 276 +++
280 277 >>> print trim(t, 1, ellipsis=ellipsis)
281 278 +
282 279 """
283 280 try:
284 281 u = s.decode(_sysstr(encoding))
285 282 except UnicodeDecodeError:
286 283 if len(s) <= width: # trimming is not needed
287 284 return s
288 285 width -= len(ellipsis)
289 286 if width <= 0: # not enough room even for ellipsis
290 287 return ellipsis[:width + len(ellipsis)]
291 288 if leftside:
292 289 return ellipsis + s[-width:]
293 290 return s[:width] + ellipsis
294 291
295 292 if ucolwidth(u) <= width: # trimming is not needed
296 293 return s
297 294
298 295 width -= len(ellipsis)
299 296 if width <= 0: # not enough room even for ellipsis
300 297 return ellipsis[:width + len(ellipsis)]
301 298
302 299 if leftside:
303 300 uslice = lambda i: u[i:]
304 301 concat = lambda s: ellipsis + s
305 302 else:
306 303 uslice = lambda i: u[:-i]
307 304 concat = lambda s: s + ellipsis
308 305 for i in xrange(1, len(u)):
309 306 usub = uslice(i)
310 307 if ucolwidth(usub) <= width:
311 308 return concat(usub.encode(_sysstr(encoding)))
312 309 return ellipsis # not enough room for multi-column characters
313 310
314 311 def _asciilower(s):
315 312 '''convert a string to lowercase if ASCII
316 313
317 314 Raises UnicodeDecodeError if non-ASCII characters are found.'''
318 315 s.decode('ascii')
319 316 return s.lower()
320 317
321 318 def asciilower(s):
322 319 # delaying the import avoids a cyclic dependency around "parsers" in
323 320 # the pure Python build (util => i18n => encoding => parsers => util)
324 321 from . import parsers
325 322 impl = getattr(parsers, 'asciilower', _asciilower)
326 323 global asciilower
327 324 asciilower = impl
328 325 return impl(s)
329 326
330 327 def _asciiupper(s):
331 328 '''convert a string to uppercase if ASCII
332 329
333 330 Raises UnicodeDecodeError if non-ASCII characters are found.'''
334 331 s.decode('ascii')
335 332 return s.upper()
336 333
337 334 def asciiupper(s):
339 336 # delaying the import avoids a cyclic dependency around "parsers" in
340 337 # the pure Python build (util => i18n => encoding => parsers => util)
340 337 from . import parsers
341 338 impl = getattr(parsers, 'asciiupper', _asciiupper)
342 339 global asciiupper
343 340 asciiupper = impl
344 341 return impl(s)
345 342
346 343 def lower(s):
347 344 "best-effort encoding-aware case-folding of local string s"
348 345 try:
349 346 return asciilower(s)
350 347 except UnicodeDecodeError:
351 348 pass
352 349 try:
353 350 if isinstance(s, localstr):
354 351 u = s._utf8.decode("utf-8")
355 352 else:
356 353 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
357 354
358 355 lu = u.lower()
359 356 if u == lu:
360 357 return s # preserve localstring
361 358 return lu.encode(_sysstr(encoding))
362 359 except UnicodeError:
363 360 return s.lower() # we don't know how to fold this except in ASCII
364 361 except LookupError as k:
365 362 raise error.Abort(k, hint="please check your locale settings")
366 363
367 364 def upper(s):
368 365 "best-effort encoding-aware case-folding of local string s"
369 366 try:
370 367 return asciiupper(s)
371 368 except UnicodeDecodeError:
372 369 return upperfallback(s)
373 370
374 371 def upperfallback(s):
375 372 try:
376 373 if isinstance(s, localstr):
377 374 u = s._utf8.decode("utf-8")
378 375 else:
379 376 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
380 377
381 378 uu = u.upper()
382 379 if u == uu:
383 380 return s # preserve localstring
384 381 return uu.encode(_sysstr(encoding))
385 382 except UnicodeError:
386 383 return s.upper() # we don't know how to fold this except in ASCII
387 384 except LookupError as k:
388 385 raise error.Abort(k, hint="please check your locale settings")
389 386
390 387 class normcasespecs(object):
391 388 '''what a platform's normcase does to ASCII strings
392 389
393 390 This is specified per platform, and should be consistent with what normcase
394 391 on that platform actually does.
395 392
396 393 lower: normcase lowercases ASCII strings
397 394 upper: normcase uppercases ASCII strings
398 395 other: the fallback function should always be called
399 396
400 397 This should be kept in sync with normcase_spec in util.h.'''
401 398 lower = -1
402 399 upper = 1
403 400 other = 0
404 401
405 402 _jsonmap = []
406 403 _jsonmap.extend("\\u%04x" % x for x in range(32))
407 404 _jsonmap.extend(pycompat.bytechr(x) for x in range(32, 127))
408 405 _jsonmap.append('\\u007f')
409 406 _jsonmap[0x09] = '\\t'
410 407 _jsonmap[0x0a] = '\\n'
411 408 _jsonmap[0x22] = '\\"'
412 409 _jsonmap[0x5c] = '\\\\'
413 410 _jsonmap[0x08] = '\\b'
414 411 _jsonmap[0x0c] = '\\f'
415 412 _jsonmap[0x0d] = '\\r'
416 413 _paranoidjsonmap = _jsonmap[:]
417 414 _paranoidjsonmap[0x3c] = '\\u003c' # '<' (e.g. escape "</script>")
418 415 _paranoidjsonmap[0x3e] = '\\u003e' # '>'
419 416 _jsonmap.extend(pycompat.bytechr(x) for x in range(128, 256))
420 417
421 418 def jsonescape(s, paranoid=False):
422 419 '''returns a string suitable for JSON
423 420
424 421 JSON is problematic for us because it doesn't support non-Unicode
425 422 bytes. To deal with this, we take the following approach:
426 423
427 424 - localstr objects are converted back to UTF-8
428 425 - valid UTF-8/ASCII strings are passed as-is
429 426 - other strings are converted to UTF-8b surrogate encoding
430 427 - apply JSON-specified string escaping
431 428
432 429 (escapes are doubled in these tests)
433 430
434 431 >>> jsonescape('this is a test')
435 432 'this is a test'
436 433 >>> jsonescape('escape characters: \\0 \\x0b \\x7f')
437 434 'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
438 435 >>> jsonescape('escape characters: \\t \\n \\r \\" \\\\')
439 436 'escape characters: \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
440 437 >>> jsonescape('a weird byte: \\xdd')
441 438 'a weird byte: \\xed\\xb3\\x9d'
442 439 >>> jsonescape('utf-8: caf\\xc3\\xa9')
443 440 'utf-8: caf\\xc3\\xa9'
444 441 >>> jsonescape('')
445 442 ''
446 443
447 444 If paranoid, non-ascii and common troublesome characters are also escaped.
448 445 This is suitable for web output.
449 446
450 447 >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
451 448 'escape boundary: ~ \\\\u007f \\\\u0080'
452 449 >>> jsonescape('a weird byte: \\xdd', paranoid=True)
453 450 'a weird byte: \\\\udcdd'
454 451 >>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True)
455 452 'utf-8: caf\\\\u00e9'
456 453 >>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
457 454 'non-BMP: \\\\ud834\\\\udd1e'
458 455 >>> jsonescape('<foo@example.org>', paranoid=True)
459 456 '\\\\u003cfoo@example.org\\\\u003e'
460 457 '''
461 458
462 459 if paranoid:
463 460 jm = _paranoidjsonmap
464 461 else:
465 462 jm = _jsonmap
466 463
467 464 u8chars = toutf8b(s)
468 465 try:
469 466 return ''.join(jm[x] for x in bytearray(u8chars)) # fast path
470 467 except IndexError:
471 468 pass
472 469 # non-BMP char is represented as UTF-16 surrogate pair
473 470 u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
474 471 u16codes.pop(0) # drop BOM
475 472 return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)
476 473
477 474 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
478 475
479 476 def getutf8char(s, pos):
480 477 '''get the next full utf-8 character in the given string, starting at pos
481 478
482 479 Raises a UnicodeError if the given location does not start a valid
483 480 utf-8 character.
484 481 '''
485 482
486 483 # find how many bytes to attempt decoding from first nibble
487 484 l = _utf8len[ord(s[pos]) >> 4]
488 485 if not l: # ascii
489 486 return s[pos]
490 487
491 488 c = s[pos:pos + l]
492 489 # validate with attempted decode
493 490 c.decode("utf-8")
494 491 return c
495 492
496 493 def toutf8b(s):
497 494 '''convert a local, possibly-binary string into UTF-8b
498 495
499 496 This is intended as a generic method to preserve data when working
500 497 with schemes like JSON and XML that have no provision for
501 498 arbitrary byte strings. As Mercurial often doesn't know
502 499 what encoding data is in, we use so-called UTF-8b.
503 500
504 501 If a string is already valid UTF-8 (or ASCII), it passes unmodified.
505 502 Otherwise, unsupported bytes are mapped to the UTF-16 surrogate range,
506 503 U+DC00-U+DCFF.
507 504
508 505 Principles of operation:
509 506
510 507 - ASCII and UTF-8 data successfully round-trips and is understood
511 508 by Unicode-oriented clients
512 509 - filenames and file contents in arbitrary other encodings can
513 510 be round-tripped or recovered by clueful clients
514 511 - local strings that have a cached known UTF-8 encoding (aka
515 512 localstr) get sent as UTF-8 so Unicode-oriented clients get the
516 513 Unicode data they want
517 514 - because we must preserve the UTF-8 bytestring in places such as
518 515 filenames, metadata can't be roundtripped without help
519 516
520 517 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
521 518 arbitrary bytes into an internal Unicode format that can be
522 519 re-encoded back into the original. Here we are exposing the
523 520 internal surrogate encoding as a UTF-8 string.)
524 521 '''
525 522
526 523 if "\xed" not in s:
527 524 if isinstance(s, localstr):
528 525 return s._utf8
529 526 try:
530 527 s.decode('utf-8')
531 528 return s
532 529 except UnicodeDecodeError:
533 530 pass
534 531
535 532 r = ""
536 533 pos = 0
537 534 l = len(s)
538 535 while pos < l:
539 536 try:
540 537 c = getutf8char(s, pos)
541 538 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
542 539 # have to re-escape existing U+DCxx characters
543 540 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
544 541 pos += 1
545 542 else:
546 543 pos += len(c)
547 544 except UnicodeDecodeError:
548 545 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
549 546 pos += 1
550 547 r += c
551 548 return r
552 549
553 550 def fromutf8b(s):
554 551 '''Given a UTF-8b string, return a local, possibly-binary string.
555 552
556 553 return the original binary string. This
557 554 is a round-trip process for strings like filenames, but metadata
558 555 that was passed through tolocal will remain in UTF-8.
559 556
560 557 >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
561 558 >>> m = "\\xc3\\xa9\\x99abcd"
562 559 >>> toutf8b(m)
563 560 '\\xc3\\xa9\\xed\\xb2\\x99abcd'
564 561 >>> roundtrip(m)
565 562 True
566 563 >>> roundtrip("\\xc2\\xc2\\x80")
567 564 True
568 565 >>> roundtrip("\\xef\\xbf\\xbd")
569 566 True
570 567 >>> roundtrip("\\xef\\xef\\xbf\\xbd")
571 568 True
572 569 >>> roundtrip("\\xf1\\x80\\x80\\x80\\x80")
573 570 True
574 571 '''
575 572
576 573 # fast path - look for uDxxx prefixes in s
577 574 if "\xed" not in s:
578 575 return s
579 576
580 577 # We could do this with the unicode type but some Python builds
581 578 # use UTF-16 internally (issue5031) which causes non-BMP code
582 579 # points to be escaped. Instead, we use our handy getutf8char
583 580 # helper again to walk the string without "decoding" it.
584 581
585 582 r = ""
586 583 pos = 0
587 584 l = len(s)
588 585 while pos < l:
589 586 c = getutf8char(s, pos)
590 587 pos += len(c)
591 588 # unescape U+DCxx characters
592 589 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
593 590 c = chr(ord(c.decode("utf-8")) & 0xff)
594 591 r += c
595 592 return r
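For background on the toutf8b()/fromutf8b() pair above: UTF-8b maps each byte that cannot be decoded as UTF-8 to a code point in the U+DC00-U+DCFF surrogate range, then re-encodes that code point as UTF-8. A minimal standalone sketch of the escaping step under Python 2 semantics (Python 2's UTF-8 codec encodes lone surrogates; Python 3 would need errors='surrogatepass'). escape_byte is a hypothetical helper, not part of encoding.py:

    def escape_byte(b):
        # map one raw byte (a length-1 str on Python 2) into the
        # U+DC00-U+DCFF surrogate range, then encode it as UTF-8
        return unichr(0xdc00 + ord(b)).encode('utf-8')

    # matches the fromutf8b doctest: the stray \x99 in '\xc3\xa9\x99abcd'
    # escapes to '\xed\xb2\x99', giving '\xc3\xa9\xed\xb2\x99abcd'
    assert escape_byte('\x99') == '\xed\xb2\x99'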