# HG changeset patch
# User Matt Mackall
# Date 2015-11-05 23:21:43
# Node ID d7e83f1064590c1ecb6cde0e42a55311febd73e4
# Parent cb467a9d759321b114a388005491ba9667642025
encoding: use getutf8char in toutf8b

This correctly avoids the ambiguity of U+FFFD already present in the input and
similar confusion by working a character at a time.

diff --git a/mercurial/encoding.py b/mercurial/encoding.py
--- a/mercurial/encoding.py
+++ b/mercurial/encoding.py
@@ -470,17 +470,20 @@ def toutf8b(s):
         s.decode('utf-8')
         return s
     except UnicodeDecodeError:
-        # surrogate-encode any characters that don't round-trip
-        s2 = s.decode('utf-8', 'ignore').encode('utf-8')
-        r = ""
-        pos = 0
-        for c in s:
-            if s2[pos:pos + 1] == c:
-                r += c
-                pos += 1
-            else:
-                r += unichr(0xdc00 + ord(c)).encode('utf-8')
-        return r
+        pass
+
+    r = ""
+    pos = 0
+    l = len(s)
+    while pos < l:
+        try:
+            c = getutf8char(s, pos)
+            pos += len(c)
+        except UnicodeDecodeError:
+            c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
+            pos += 1
+        r += c
+    return r
 
 def fromutf8b(s):
     '''Given a UTF-8b string, return a local, possibly-binary string.