encoding: introduce tagging type for non-lossy non-ASCII string...
Yuya Nishihara
r37966:3ea3c96a default
@@ -93,6 +93,16 @@ class localstr(bytes):
     def __hash__(self):
         return hash(self._utf8) # avoid collisions in local string space

+class safelocalstr(bytes):
+    """Tagged string denoting it was previously an internal UTF-8 string,
+    and can be converted back to UTF-8 losslessly
+
+    >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
+    >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
+    >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
+    >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
+    """
+
 def tolocal(s):
     """
     Convert a string from internal UTF-8 to local encoding
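
For readers unfamiliar with the tagging pattern, here is a minimal standalone sketch (not the Mercurial module itself; the describe() helper is made up for illustration) of why a stateless bytes subclass works as a tag: the value compares and hashes exactly like plain bytes, while isinstance() lets later code recognize strings that are known to be non-lossy.

class safelocalstr(bytes):
    """Local-encoding string known to round-trip to UTF-8 losslessly."""

def describe(s):
    # downstream code can branch on the tag without touching the payload
    if isinstance(s, safelocalstr):
        return 'non-lossy local string'
    return 'plain bytes'

assert safelocalstr(b'\xc3') == b'\xc3'               # compares like bytes
assert hash(safelocalstr(b'\xc3')) == hash(b'\xc3')   # hashes like bytes
assert describe(safelocalstr(b'\xc3')) == 'non-lossy local string'
assert describe(b'\xc3') == 'plain bytes'
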
@@ -140,7 +150,7 @@ def tolocal(s):
             r = u.encode(_sysstr(encoding), u"replace")
             if u == r.decode(_sysstr(encoding)):
                 # r is a safe, non-lossy encoding of s
-                return r
+                return safelocalstr(r)
             return localstr(s, r)
         except UnicodeDecodeError:
             # we should only get here if we're looking at an ancient changeset
@@ -149,7 +159,7 @@ def tolocal(s):
                 r = u.encode(_sysstr(encoding), u"replace")
                 if u == r.decode(_sysstr(encoding)):
                     # r is a safe, non-lossy encoding of s
-                    return r
+                    return safelocalstr(r)
                 return localstr(u.encode('UTF-8'), r)
             except UnicodeDecodeError:
                 u = s.decode("utf-8", "replace") # last ditch
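
Both hunks apply the same rule: if encoding to the local charset and decoding back reproduces the Unicode text, nothing was lost and the result can be tagged. A simplified standalone sketch of that check (tolocal_sketch is a made-up name; the real tolocal also tries a fallback encoding and caches the UTF-8 form on localstr):

class safelocalstr(bytes):
    pass

def tolocal_sketch(s, local='latin-1'):
    u = s.decode('utf-8')
    r = u.encode(local, 'replace')
    if u == r.decode(local):
        # the local charset can represent every character: tag as non-lossy
        return safelocalstr(r)
    # lossy: the real code returns localstr(s, r) to keep the UTF-8 original
    return r

assert isinstance(tolocal_sketch(u'\xc0'.encode('utf-8')), safelocalstr)                  # lossless under latin-1
assert not isinstance(tolocal_sketch(u'\u0142'.encode('utf-8'), 'ascii'), safelocalstr)   # lossy under ascii
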
@@ -402,7 +412,7 @@ def jsonescape(s, paranoid=False):
     JSON is problematic for us because it doesn't support non-Unicode
     bytes. To deal with this, we take the following approach:

-    - localstr objects are converted back to UTF-8
+    - localstr/safelocalstr objects are converted back to UTF-8
     - valid UTF-8/ASCII strings are passed as-is
     - other strings are converted to UTF-8b surrogate encoding
     - apply JSON-specified string escaping
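
A self-contained sketch of the pipeline these bullets describe, using Python's surrogateescape handler as a stand-in for the UTF-8b step and json.dumps for the final escaping (the helper name and the latin-1 assumption are illustrative; the real jsonescape operates on bytes and also has a paranoid mode):

import json

class safelocalstr(bytes):
    pass

def jsonescape_sketch(s, local='latin-1'):
    if isinstance(s, safelocalstr):
        s = s.decode(local).encode('utf-8')       # stand-in for fromlocal(): back to UTF-8
    text = s.decode('utf-8', 'surrogateescape')   # invalid bytes become U+DCxx, same idea as UTF-8b
    return json.dumps(text)                       # JSON-specified string escaping

print(jsonescape_sketch(safelocalstr(b'\xe9')))   # "\u00e9", matching the template test further down
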
@@ -495,6 +505,7 @@ def toutf8b(s):
     - local strings that have a cached known UTF-8 encoding (aka
       localstr) get sent as UTF-8 so Unicode-oriented clients get the
       Unicode data they want
+    - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
     - because we must preserve UTF-8 bytestring in places such as
       filenames, metadata can't be roundtripped without help

@@ -508,6 +519,10 @@ def toutf8b(s):
         # assume that the original UTF-8 sequence would never contain
         # invalid characters in U+DCxx range
         return s._utf8
+    elif isinstance(s, safelocalstr):
+        # already verified that s is non-lossy in legacy encoding, which
+        # shouldn't contain characters in U+DCxx range
+        return fromlocal(s)
     elif isasciistr(s):
         return s
     if "\xed" not in s:
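
Assuming a Mercurial source tree (or installed mercurial package) is importable, the effect of the new branch can be exercised directly; this mirrors the testlosslesslatin case added at the bottom of this changeset:

from mercurial import encoding

encoding.encoding = b'latin-1'
s = u'\xc0'.encode('utf-8')                 # UTF-8 input
l = encoding.tolocal(s)                     # non-lossy under latin-1, so tagged
assert isinstance(l, encoding.safelocalstr)
assert l == b'\xc0'
assert encoding.toutf8b(l) == s             # recovered via fromlocal(), no UTF-8b surrogates needed
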
@@ -278,6 +278,8 @@ def showdescription(context, mapping):
     if isinstance(s, encoding.localstr):
         # try hard to preserve utf-8 bytes
         return encoding.tolocal(encoding.fromlocal(s).strip())
+    elif isinstance(s, encoding.safelocalstr):
+        return encoding.safelocalstr(s.strip())
     else:
         return s.strip()

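
The explicit re-wrap is needed because bytes methods return plain bytes, so .strip() alone would silently drop the tag. A standalone demonstration of that Python behavior:

class safelocalstr(bytes):
    pass

s = safelocalstr(b' caf\xc3\xa9 \n')
assert type(s.strip()) is bytes                        # tag lost by bytes.strip()
assert type(safelocalstr(s.strip())) is safelocalstr   # tag restored, as showdescription does above
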
@@ -4691,6 +4691,13 @@ json filter should try round-trip conver
   $ HGENCODING=ascii hg log -T "{desc|json}\n" -r0
   "non-ascii branch: \u00e9"

+json filter should take input as utf-8 if it was converted from utf-8:
+
+  $ HGENCODING=latin-1 hg log -T "{branch|json}\n" -r0
+  "\u00e9"
+  $ HGENCODING=latin-1 hg log -T "{desc|json}\n" -r0
+  "non-ascii branch: \u00e9"
+
 json filter takes input as utf-8b:

   $ HGENCODING=ascii hg log -T "{'`cat utf-8`'|json}\n" -l1
@@ -53,6 +53,13 @@ class Utf8bEncodingTest(unittest.TestCas
         self.assertEqual(l, b'?') # lossy
         self.assertEqual(s, encoding.toutf8b(l)) # utf8 sequence preserved

+    def testlosslesslatin(self):
+        encoding.encoding = b'latin-1'
+        s = u'\xc0'.encode('utf-8')
+        l = encoding.tolocal(s)
+        self.assertEqual(l, b'\xc0') # lossless
+        self.assertEqual(s, encoding.toutf8b(l)) # convert back to utf-8
+
     def testlossy0xed(self):
         encoding.encoding = b'euc-kr' # U+Dxxx Hangul
         s = u'\ud1bc\xc0'.encode('utf-8')
@@ -61,6 +68,13 @@ class Utf8bEncodingTest(unittest.TestCas
         self.assertTrue(l.endswith(b'?')) # lossy
         self.assertEqual(s, encoding.toutf8b(l)) # utf8 sequence preserved

+    def testlossless0xed(self):
+        encoding.encoding = b'euc-kr' # U+Dxxx Hangul
+        s = u'\ud1bc'.encode('utf-8')
+        l = encoding.tolocal(s)
+        self.assertEqual(l, b'\xc5\xed') # lossless
+        self.assertEqual(s, encoding.toutf8b(l)) # convert back to utf-8
+
 if __name__ == '__main__':
     import silenttestrunner
     silenttestrunner.main(__name__)
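
A note on why the 0xed coverage matters: EUC-KR encodes U+D1BC with a trailing 0xed byte, which is exactly the byte the 'if "\xed" not in s' fast path in toutf8b() screens for; the safelocalstr tag is what tells toutf8b() that this value is already known to be non-lossy. A standalone check of the byte value the test pins down:

assert u'\ud1bc'.encode('euc-kr') == b'\xc5\xed'   # matches the expected value in testlossless0xed
assert b'\xed' in u'\ud1bc'.encode('euc-kr')       # contains the byte the 0xed scan looks for
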