##// END OF EJS Templates
py3: use 'surrogatepass' error handler to process U+DCxx transparently...
Yuya Nishihara -
r34215:aa877860 default
parent child Browse files
Show More
@@ -448,6 +448,13 b' def jsonescape(s, paranoid=False):'
448 pass
448 pass
449 return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
449 return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
450
450
451 # We need to decode/encode U+DCxx code points transparently, since toutf8b()
452 # maps invalid UTF-8 bytes to that range and py3's strict codec rejects them.
453 if pycompat.ispy3:
454 _utf8strict = r'surrogatepass'
455 else:
456 _utf8strict = r'strict'
457
451 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
458 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
452
459
453 def getutf8char(s, pos):
460 def getutf8char(s, pos):
@@ -464,7 +471,7 b' def getutf8char(s, pos):'
464
471
465 c = s[pos:pos + l]
472 c = s[pos:pos + l]
466 # validate with attempted decode
473 # validate with attempted decode
467 c.decode("utf-8")
474 c.decode("utf-8", _utf8strict)
468 return c
475 return c
469
476
470 def toutf8b(s):
477 def toutf8b(s):
@@ -503,7 +510,7 b' def toutf8b(s):'
503 if isinstance(s, localstr):
510 if isinstance(s, localstr):
504 return s._utf8
511 return s._utf8
505 try:
512 try:
506 s.decode('utf-8')
513 s.decode('utf-8', _utf8strict)
507 return s
514 return s
508 except UnicodeDecodeError:
515 except UnicodeDecodeError:
509 pass
516 pass
@@ -517,12 +524,12 b' def toutf8b(s):'
517 c = getutf8char(s, pos)
524 c = getutf8char(s, pos)
518 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
525 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
519 # have to re-escape existing U+DCxx characters
526 # have to re-escape existing U+DCxx characters
520 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
527 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict)
521 pos += 1
528 pos += 1
522 else:
529 else:
523 pos += len(c)
530 pos += len(c)
524 except UnicodeDecodeError:
531 except UnicodeDecodeError:
525 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
532 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict)
526 pos += 1
533 pos += 1
527 r += c
534 r += c
528 return r
535 return r
@@ -570,7 +577,7 b' def fromutf8b(s):'
570 pos += len(c)
577 pos += len(c)
571 # unescape U+DCxx characters
578 # unescape U+DCxx characters
572 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
579 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
573 c = chr(ord(c.decode("utf-8")) & 0xff)
580 c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xff)
574 r += c
581 r += c
575 return r
582 return r
576
583
@@ -64,6 +64,11 b' def jsonescapeu8fast(u8chars, paranoid):'
64 except IndexError:
64 except IndexError:
65 raise ValueError
65 raise ValueError
66
66
67 if pycompat.ispy3:
68 _utf8strict = r'surrogatepass'
69 else:
70 _utf8strict = r'strict'
71
67 def jsonescapeu8fallback(u8chars, paranoid):
72 def jsonescapeu8fallback(u8chars, paranoid):
68 """Convert a UTF-8 byte string to JSON-escaped form (slow path)
73 """Convert a UTF-8 byte string to JSON-escaped form (slow path)
69
74
@@ -74,6 +79,7 b' def jsonescapeu8fallback(u8chars, parano'
74 else:
79 else:
75 jm = _jsonmap
80 jm = _jsonmap
76 # non-BMP char is represented as UTF-16 surrogate pair
81 # non-BMP char is represented as UTF-16 surrogate pair
77 u16codes = array.array(r'H', u8chars.decode('utf-8').encode('utf-16'))
82 u16b = u8chars.decode('utf-8', _utf8strict).encode('utf-16', _utf8strict)
83 u16codes = array.array(r'H', u16b)
78 u16codes.pop(0) # drop BOM
84 u16codes.pop(0) # drop BOM
79 return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)
85 return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)
@@ -50,7 +50,7 b" testmod('mercurial.config')"
50 testmod('mercurial.context')
50 testmod('mercurial.context')
51 testmod('mercurial.dagparser', optionflags=doctest.NORMALIZE_WHITESPACE)
51 testmod('mercurial.dagparser', optionflags=doctest.NORMALIZE_WHITESPACE)
52 testmod('mercurial.dispatch')
52 testmod('mercurial.dispatch')
53 testmod('mercurial.encoding', py3=False) # py3: multiple encoding issues
53 testmod('mercurial.encoding')
54 testmod('mercurial.formatter', py3=False) # py3: write bytes to stdout
54 testmod('mercurial.formatter', py3=False) # py3: write bytes to stdout
55 testmod('mercurial.hg')
55 testmod('mercurial.hg')
56 testmod('mercurial.hgweb.hgwebdir_mod', py3=False) # py3: repr(bytes) ?
56 testmod('mercurial.hgweb.hgwebdir_mod', py3=False) # py3: repr(bytes) ?
General Comments 0
You need to be logged in to leave comments. Login now