encoding: introduce tagging type for non-lossy non-ASCII string...
Yuya Nishihara
r37966:3ea3c96a default
@@ -93,6 +93,16 @@ class localstr(bytes):
     def __hash__(self):
         return hash(self._utf8) # avoid collisions in local string space

+class safelocalstr(bytes):
+    """Tagged string denoting it was previously an internal UTF-8 string,
+    and can be converted back to UTF-8 losslessly
+
+    >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
+    >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
+    >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
+    >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
+    """
+
 def tolocal(s):
     """
     Convert a string from internal UTF-8 to local encoding
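
For readers unfamiliar with the tagging pattern, here is a minimal standalone sketch (not the Mercurial module itself; the describe() helper is made up for illustration) of why a stateless bytes subclass works as a tag: the value compares and hashes exactly like plain bytes, while isinstance() lets later code recognize strings that are known to be non-lossy.

class safelocalstr(bytes):
    """Local-encoding string known to round-trip to UTF-8 losslessly."""

def describe(s):
    # downstream code can branch on the tag without touching the payload
    if isinstance(s, safelocalstr):
        return 'non-lossy local string'
    return 'plain bytes'

assert safelocalstr(b'\xc3') == b'\xc3'               # compares like bytes
assert hash(safelocalstr(b'\xc3')) == hash(b'\xc3')   # hashes like bytes
assert describe(safelocalstr(b'\xc3')) == 'non-lossy local string'
assert describe(b'\xc3') == 'plain bytes'
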
@@ -140,7 +150,7 @@ def tolocal(s):
             r = u.encode(_sysstr(encoding), u"replace")
             if u == r.decode(_sysstr(encoding)):
                 # r is a safe, non-lossy encoding of s
-                return r
+                return safelocalstr(r)
             return localstr(s, r)
         except UnicodeDecodeError:
             # we should only get here if we're looking at an ancient changeset
@@ -149,7 +159,7 @@ def tolocal(s):
                 r = u.encode(_sysstr(encoding), u"replace")
                 if u == r.decode(_sysstr(encoding)):
                     # r is a safe, non-lossy encoding of s
-                    return r
+                    return safelocalstr(r)
                 return localstr(u.encode('UTF-8'), r)
             except UnicodeDecodeError:
                 u = s.decode("utf-8", "replace") # last ditch
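
Both hunks apply the same rule: if encoding to the local charset and decoding back reproduces the Unicode text, nothing was lost and the result can be tagged. A simplified standalone sketch of that check (tolocal_sketch is a made-up name; the real tolocal also tries a fallback encoding and caches the UTF-8 form on localstr):

class safelocalstr(bytes):
    pass

def tolocal_sketch(s, local='latin-1'):
    u = s.decode('utf-8')
    r = u.encode(local, 'replace')
    if u == r.decode(local):
        # the local charset can represent every character: tag as non-lossy
        return safelocalstr(r)
    # lossy: the real code returns localstr(s, r) to keep the UTF-8 original
    return r

assert isinstance(tolocal_sketch(u'\xc0'.encode('utf-8')), safelocalstr)                  # lossless under latin-1
assert not isinstance(tolocal_sketch(u'\u0142'.encode('utf-8'), 'ascii'), safelocalstr)   # lossy under ascii
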
@@ -402,7 +412,7 @@ def jsonescape(s, paranoid=False):
     JSON is problematic for us because it doesn't support non-Unicode
     bytes. To deal with this, we take the following approach:

-    - localstr objects are converted back to UTF-8
+    - localstr/safelocalstr objects are converted back to UTF-8
     - valid UTF-8/ASCII strings are passed as-is
     - other strings are converted to UTF-8b surrogate encoding
     - apply JSON-specified string escaping
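
A self-contained sketch of the pipeline these bullets describe, using Python's surrogateescape handler as a stand-in for the UTF-8b step and json.dumps for the final escaping (the helper name and the latin-1 assumption are illustrative; the real jsonescape operates on bytes and also has a paranoid mode):

import json

class safelocalstr(bytes):
    pass

def jsonescape_sketch(s, local='latin-1'):
    if isinstance(s, safelocalstr):
        s = s.decode(local).encode('utf-8')       # stand-in for fromlocal(): back to UTF-8
    text = s.decode('utf-8', 'surrogateescape')   # invalid bytes become U+DCxx, same idea as UTF-8b
    return json.dumps(text)                       # JSON-specified string escaping

print(jsonescape_sketch(safelocalstr(b'\xe9')))   # "\u00e9", matching the template test further down
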
@@ -495,6 +505,7 @@ def toutf8b(s):
     - local strings that have a cached known UTF-8 encoding (aka
       localstr) get sent as UTF-8 so Unicode-oriented clients get the
       Unicode data they want
+    - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
     - because we must preserve UTF-8 bytestring in places such as
       filenames, metadata can't be roundtripped without help

@@ -508,6 +519,10 @@ def toutf8b(s):
         # assume that the original UTF-8 sequence would never contain
         # invalid characters in U+DCxx range
         return s._utf8
+    elif isinstance(s, safelocalstr):
+        # already verified that s is non-lossy in legacy encoding, which
+        # shouldn't contain characters in U+DCxx range
+        return fromlocal(s)
     elif isasciistr(s):
         return s
     if "\xed" not in s:
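
Assuming a Mercurial source tree (or installed mercurial package) is importable, the effect of the new branch can be exercised directly; this mirrors the testlosslesslatin case added at the bottom of this changeset:

from mercurial import encoding

encoding.encoding = b'latin-1'
s = u'\xc0'.encode('utf-8')                 # UTF-8 input
l = encoding.tolocal(s)                     # non-lossy under latin-1, so tagged
assert isinstance(l, encoding.safelocalstr)
assert l == b'\xc0'
assert encoding.toutf8b(l) == s             # recovered via fromlocal(), no UTF-8b surrogates needed
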
@@ -278,6 +278,8 @@ def showdescription(context, mapping):
     if isinstance(s, encoding.localstr):
         # try hard to preserve utf-8 bytes
         return encoding.tolocal(encoding.fromlocal(s).strip())
+    elif isinstance(s, encoding.safelocalstr):
+        return encoding.safelocalstr(s.strip())
     else:
         return s.strip()

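
The explicit re-wrap is needed because bytes methods return plain bytes, so .strip() alone would silently drop the tag. A standalone demonstration of that Python behavior:

class safelocalstr(bytes):
    pass

s = safelocalstr(b' caf\xc3\xa9 \n')
assert type(s.strip()) is bytes                        # tag lost by bytes.strip()
assert type(safelocalstr(s.strip())) is safelocalstr   # tag restored, as showdescription does above
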
@@ -4691,6 +4691,13 @@ json filter should try round-trip conver
   $ HGENCODING=ascii hg log -T "{desc|json}\n" -r0
   "non-ascii branch: \u00e9"

+json filter should take input as utf-8 if it was converted from utf-8:
+
+  $ HGENCODING=latin-1 hg log -T "{branch|json}\n" -r0
+  "\u00e9"
+  $ HGENCODING=latin-1 hg log -T "{desc|json}\n" -r0
+  "non-ascii branch: \u00e9"
+
 json filter takes input as utf-8b:

   $ HGENCODING=ascii hg log -T "{'`cat utf-8`'|json}\n" -l1
@@ -53,6 +53,13 @@ class Utf8bEncodingTest(unittest.TestCas
         self.assertEqual(l, b'?') # lossy
         self.assertEqual(s, encoding.toutf8b(l)) # utf8 sequence preserved

+    def testlosslesslatin(self):
+        encoding.encoding = b'latin-1'
+        s = u'\xc0'.encode('utf-8')
+        l = encoding.tolocal(s)
+        self.assertEqual(l, b'\xc0') # lossless
+        self.assertEqual(s, encoding.toutf8b(l)) # convert back to utf-8
+
     def testlossy0xed(self):
         encoding.encoding = b'euc-kr' # U+Dxxx Hangul
         s = u'\ud1bc\xc0'.encode('utf-8')
@@ -61,6 +68,13 @@ class Utf8bEncodingTest(unittest.TestCas
         self.assertTrue(l.endswith(b'?')) # lossy
         self.assertEqual(s, encoding.toutf8b(l)) # utf8 sequence preserved

+    def testlossless0xed(self):
+        encoding.encoding = b'euc-kr' # U+Dxxx Hangul
+        s = u'\ud1bc'.encode('utf-8')
+        l = encoding.tolocal(s)
+        self.assertEqual(l, b'\xc5\xed') # lossless
+        self.assertEqual(s, encoding.toutf8b(l)) # convert back to utf-8
+
 if __name__ == '__main__':
     import silenttestrunner
     silenttestrunner.main(__name__)
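
A note on why the 0xed coverage matters: EUC-KR encodes U+D1BC with a trailing 0xed byte, which is exactly the byte the 'if "\xed" not in s' fast path in toutf8b() screens for; the safelocalstr tag is what tells toutf8b() that this value is already known to be non-lossy. A standalone check of the byte value the test pins down:

assert u'\ud1bc'.encode('euc-kr') == b'\xc5\xed'   # matches the expected value in testlossless0xed
assert b'\xed' in u'\ud1bc'.encode('euc-kr')       # contains the byte the 0xed scan looks for
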