Show More
@@ -516,17 +516,27 b' def fromutf8b(s):' | |||||
516 | True |
|
516 | True | |
517 | >>> roundtrip("\\xef\\xef\\xbf\\xbd") |
|
517 | >>> roundtrip("\\xef\\xef\\xbf\\xbd") | |
518 | True |
|
518 | True | |
|
519 | >>> roundtrip("\\xf1\\x80\\x80\\x80\\x80") | |||
|
520 | True | |||
519 | ''' |
|
521 | ''' | |
520 |
|
522 | |||
521 | # fast path - look for uDxxx prefixes in s |
|
523 | # fast path - look for uDxxx prefixes in s | |
522 | if "\xed" not in s: |
|
524 | if "\xed" not in s: | |
523 | return s |
|
525 | return s | |
524 |
|
526 | |||
525 | u = s.decode("utf-8") |
|
527 | # We could do this with the unicode type but some Python builds | |
|
528 | # use UTF-16 internally (issue5031) which causes non-BMP code | |||
|
529 | # points to be escaped. Instead, we use our handy getutf8char | |||
|
530 | # helper again to walk the string without "decoding" it. | |||
|
531 | ||||
526 | r = "" |
|
532 | r = "" | |
527 | for c in u: |
|
533 | pos = 0 | |
528 | if ord(c) & 0xffff00 == 0xdc00: |
|
534 | l = len(s) | |
529 | r += chr(ord(c) & 0xff) |
|
535 | while pos < l: | |
530 | else: |
|
536 | c = getutf8char(s, pos) | |
531 | r += c.encode("utf-8") |
|
537 | pos += len(c) | |
|
538 | # unescape U+DCxx characters | |||
|
539 | if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf": | |||
|
540 | c = chr(ord(c.decode("utf-8")) & 0xff) | |||
|
541 | r += c | |||
532 | return r |
|
542 | return r |
General Comments 0
You need to be logged in to leave comments.
Login now