##// END OF EJS Templates
encoding: handle UTF-16 internal limit with fromutf8b (issue5031)...
Matt Mackall -
r27699:c8d3392f default
parent child Browse files
Show More
@@ -516,17 +516,27 b' def fromutf8b(s):'
516 True
516 True
517 >>> roundtrip("\\xef\\xef\\xbf\\xbd")
517 >>> roundtrip("\\xef\\xef\\xbf\\xbd")
518 True
518 True
519 >>> roundtrip("\\xf1\\x80\\x80\\x80\\x80")
520 True
519 '''
521 '''
520
522
521 # fast path - look for uDxxx prefixes in s
523 # fast path - look for uDxxx prefixes in s
522 if "\xed" not in s:
524 if "\xed" not in s:
523 return s
525 return s
524
526
525 u = s.decode("utf-8")
527 # We could do this with the unicode type, but some Python builds
528 # use UTF-16 internally (issue5031), which splits non-BMP code points
529 # into surrogate pairs that look like U+DCxx escapes. Instead, we use
530 # our getutf8char helper to walk the string without "decoding" it.
531
526 r = ""
532 r = ""
527 for c in u:
533 pos = 0
528 if ord(c) & 0xffff00 == 0xdc00:
534 l = len(s)
529 r += chr(ord(c) & 0xff)
535 while pos < l:
530 else:
536 c = getutf8char(s, pos)
531 r += c.encode("utf-8")
537 pos += len(c)
538 # unescape U+DCxx characters
539 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
540 c = chr(ord(c.decode("utf-8")) & 0xff)
541 r += c
532 return r
542 return r
General Comments 0
You need to be logged in to leave comments. Login now