##// END OF EJS Templates
encoding: re-escape U+DCxx characters in toutf8b input (issue4927)...
Matt Mackall -
r26879:a24b98f4 default
parent child Browse files
Show More
@@ -463,14 +463,14 @@ def toutf8b(s):
     internal surrogate encoding as a UTF-8 string.)
     '''
 
-    if isinstance(s, localstr):
-        return s._utf8
-
-    try:
-        s.decode('utf-8')
-        return s
-    except UnicodeDecodeError:
-        pass
+    if "\xed" not in s:
+        if isinstance(s, localstr):
+            return s._utf8
+        try:
+            s.decode('utf-8')
+            return s
+        except UnicodeDecodeError:
+            pass
 
     r = ""
     pos = 0
@@ -478,7 +478,12 @@ def toutf8b(s):
     while pos < l:
         try:
             c = getutf8char(s, pos)
-            pos += len(c)
+            if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
+                # have to re-escape existing U+DCxx characters
+                c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
+                pos += 1
+            else:
+                pos += len(c)
         except UnicodeDecodeError:
             c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
             pos += 1
General Comments 0
You need to be logged in to leave comments. Login now