diff --git a/mercurial/encoding.py b/mercurial/encoding.py --- a/mercurial/encoding.py +++ b/mercurial/encoding.py @@ -448,6 +448,13 @@ def jsonescape(s, paranoid=False): pass return charencodepure.jsonescapeu8fallback(u8chars, paranoid) +# We need to decode/encode U+DCxx codes transparently since invalid UTF-8 +# bytes are mapped to that range. +if pycompat.ispy3: + _utf8strict = r'surrogatepass' +else: + _utf8strict = r'strict' + _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4] def getutf8char(s, pos): @@ -464,7 +471,7 @@ def getutf8char(s, pos): c = s[pos:pos + l] # validate with attempted decode - c.decode("utf-8") + c.decode("utf-8", _utf8strict) return c def toutf8b(s): @@ -503,7 +510,7 @@ def toutf8b(s): if isinstance(s, localstr): return s._utf8 try: - s.decode('utf-8') + s.decode('utf-8', _utf8strict) return s except UnicodeDecodeError: pass @@ -517,12 +524,12 @@ def toutf8b(s): c = getutf8char(s, pos) if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf": # have to re-escape existing U+DCxx characters - c = unichr(0xdc00 + ord(s[pos])).encode('utf-8') + c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict) pos += 1 else: pos += len(c) except UnicodeDecodeError: - c = unichr(0xdc00 + ord(s[pos])).encode('utf-8') + c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict) pos += 1 r += c return r @@ -570,7 +577,7 @@ def fromutf8b(s): pos += len(c) # unescape U+DCxx characters if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf": - c = chr(ord(c.decode("utf-8")) & 0xff) + c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xff) r += c return r diff --git a/mercurial/pure/charencode.py b/mercurial/pure/charencode.py --- a/mercurial/pure/charencode.py +++ b/mercurial/pure/charencode.py @@ -64,6 +64,11 @@ def jsonescapeu8fast(u8chars, paranoid): except IndexError: raise ValueError +if pycompat.ispy3: + _utf8strict = r'surrogatepass' +else: + _utf8strict = r'strict' + def jsonescapeu8fallback(u8chars, paranoid): """Convert a UTF-8 byte string to JSON-escaped form (slow path) @@ -74,6 +79,7 @@ def jsonescapeu8fallback(u8chars, parano else: jm = _jsonmap # non-BMP char is represented as UTF-16 surrogate pair - u16codes = array.array(r'H', u8chars.decode('utf-8').encode('utf-16')) + u16b = u8chars.decode('utf-8', _utf8strict).encode('utf-16', _utf8strict) + u16codes = array.array(r'H', u16b) u16codes.pop(0) # drop BOM return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes) diff --git a/tests/test-doctest.py b/tests/test-doctest.py --- a/tests/test-doctest.py +++ b/tests/test-doctest.py @@ -50,7 +50,7 @@ testmod('mercurial.config') testmod('mercurial.context') testmod('mercurial.dagparser', optionflags=doctest.NORMALIZE_WHITESPACE) testmod('mercurial.dispatch') -testmod('mercurial.encoding', py3=False) # py3: multiple encoding issues +testmod('mercurial.encoding') testmod('mercurial.formatter', py3=False) # py3: write bytes to stdout testmod('mercurial.hg') testmod('mercurial.hgweb.hgwebdir_mod', py3=False) # py3: repr(bytes) ?