# HG changeset patch
# User Matt Mackall <mpm@selenic.com>
# Date 2015-11-05 23:21:43
# Node ID d7e83f1064590c1ecb6cde0e42a55311febd73e4
# Parent  cb467a9d759321b114a388005491ba9667642025

encoding: use getutf8char in toutf8b

Working a character at a time correctly avoids the ambiguity that arises
when U+FFFD is already present in the input, as well as similar confusion.

diff --git a/mercurial/encoding.py b/mercurial/encoding.py
--- a/mercurial/encoding.py
+++ b/mercurial/encoding.py
@@ -470,17 +470,20 @@ def toutf8b(s):
         s.decode('utf-8')
         return s
     except UnicodeDecodeError:
-        # surrogate-encode any characters that don't round-trip
-        s2 = s.decode('utf-8', 'ignore').encode('utf-8')
-        r = ""
-        pos = 0
-        for c in s:
-            if s2[pos:pos + 1] == c:
-                r += c
-                pos += 1
-            else:
-                r += unichr(0xdc00 + ord(c)).encode('utf-8')
-        return r
+        pass
+
+    r = ""
+    pos = 0
+    l = len(s)
+    while pos < l:
+        try:
+            c = getutf8char(s, pos)
+            pos += len(c)
+        except UnicodeDecodeError:
+            c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
+            pos += 1
+        r += c
+    return r
 
 def fromutf8b(s):
     '''Given a UTF-8b string, return a local, possibly-binary string.