diff --git a/mercurial/encoding.py b/mercurial/encoding.py --- a/mercurial/encoding.py +++ b/mercurial/encoding.py @@ -93,6 +93,16 @@ class localstr(bytes): def __hash__(self): return hash(self._utf8) # avoid collisions in local string space +class safelocalstr(bytes): + """Tagged string denoting it was previously an internal UTF-8 string, + and can be converted back to UTF-8 losslessly + + >>> assert safelocalstr(b'\\xc3') == b'\\xc3' + >>> assert b'\\xc3' == safelocalstr(b'\\xc3') + >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0} + >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0} + """ + def tolocal(s): """ Convert a string from internal UTF-8 to local encoding @@ -140,7 +150,7 @@ def tolocal(s): r = u.encode(_sysstr(encoding), u"replace") if u == r.decode(_sysstr(encoding)): # r is a safe, non-lossy encoding of s - return r + return safelocalstr(r) return localstr(s, r) except UnicodeDecodeError: # we should only get here if we're looking at an ancient changeset @@ -149,7 +159,7 @@ def tolocal(s): r = u.encode(_sysstr(encoding), u"replace") if u == r.decode(_sysstr(encoding)): # r is a safe, non-lossy encoding of s - return r + return safelocalstr(r) return localstr(u.encode('UTF-8'), r) except UnicodeDecodeError: u = s.decode("utf-8", "replace") # last ditch @@ -402,7 +412,7 @@ def jsonescape(s, paranoid=False): JSON is problematic for us because it doesn't support non-Unicode bytes. To deal with this, we take the following approach: - - localstr objects are converted back to UTF-8 + - localstr/safelocalstr objects are converted back to UTF-8 - valid UTF-8/ASCII strings are passed as-is - other strings are converted to UTF-8b surrogate encoding - apply JSON-specified string escaping @@ -495,6 +505,7 @@ def toutf8b(s): - local strings that have a cached known UTF-8 encoding (aka localstr) get sent as UTF-8 so Unicode-oriented clients get the Unicode data they want + - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well - because we must preserve UTF-8 bytestring in places such as filenames, metadata can't be roundtripped without help @@ -508,6 +519,10 @@ def toutf8b(s): # assume that the original UTF-8 sequence would never contain # invalid characters in U+DCxx range return s._utf8 + elif isinstance(s, safelocalstr): + # already verified that s is non-lossy in legacy encoding, which + # shouldn't contain characters in U+DCxx range + return fromlocal(s) elif isasciistr(s): return s if "\xed" not in s: diff --git a/mercurial/templatekw.py b/mercurial/templatekw.py --- a/mercurial/templatekw.py +++ b/mercurial/templatekw.py @@ -278,6 +278,8 @@ def showdescription(context, mapping): if isinstance(s, encoding.localstr): # try hard to preserve utf-8 bytes return encoding.tolocal(encoding.fromlocal(s).strip()) + elif isinstance(s, encoding.safelocalstr): + return encoding.safelocalstr(s.strip()) else: return s.strip() diff --git a/tests/test-command-template.t b/tests/test-command-template.t --- a/tests/test-command-template.t +++ b/tests/test-command-template.t @@ -4691,6 +4691,13 @@ json filter should try round-trip conver $ HGENCODING=ascii hg log -T "{desc|json}\n" -r0 "non-ascii branch: \u00e9" +json filter should take input as utf-8 if it was converted from utf-8: + + $ HGENCODING=latin-1 hg log -T "{branch|json}\n" -r0 + "\u00e9" + $ HGENCODING=latin-1 hg log -T "{desc|json}\n" -r0 + "non-ascii branch: \u00e9" + json filter takes input as utf-8b: $ HGENCODING=ascii hg log -T "{'`cat utf-8`'|json}\n" -l1 diff --git a/tests/test-encoding-func.py b/tests/test-encoding-func.py --- a/tests/test-encoding-func.py +++ b/tests/test-encoding-func.py @@ -53,6 +53,13 @@ class Utf8bEncodingTest(unittest.TestCas self.assertEqual(l, b'?') # lossy self.assertEqual(s, encoding.toutf8b(l)) # utf8 sequence preserved + def testlosslesslatin(self): + encoding.encoding = b'latin-1' + s = u'\xc0'.encode('utf-8') + l = encoding.tolocal(s) + self.assertEqual(l, b'\xc0') # lossless + self.assertEqual(s, encoding.toutf8b(l)) # convert back to utf-8 + def testlossy0xed(self): encoding.encoding = b'euc-kr' # U+Dxxx Hangul s = u'\ud1bc\xc0'.encode('utf-8') @@ -61,6 +68,13 @@ class Utf8bEncodingTest(unittest.TestCas self.assertTrue(l.endswith(b'?')) # lossy self.assertEqual(s, encoding.toutf8b(l)) # utf8 sequence preserved + def testlossless0xed(self): + encoding.encoding = b'euc-kr' # U+Dxxx Hangul + s = u'\ud1bc'.encode('utf-8') + l = encoding.tolocal(s) + self.assertEqual(l, b'\xc5\xed') # lossless + self.assertEqual(s, encoding.toutf8b(l)) # convert back to utf-8 + if __name__ == '__main__': import silenttestrunner silenttestrunner.main(__name__)