Show More
@@ -93,6 +93,16 b' class localstr(bytes):' | |||||
93 | def __hash__(self): |
|
93 | def __hash__(self): | |
94 | return hash(self._utf8) # avoid collisions in local string space |
|
94 | return hash(self._utf8) # avoid collisions in local string space | |
95 |
|
95 | |||
|
96 | class safelocalstr(bytes): | |||
|
97 | """Tagged string denoting it was previously an internal UTF-8 string, | |||
|
98 | and can be converted back to UTF-8 losslessly | |||
|
99 | ||||
|
100 | >>> assert safelocalstr(b'\\xc3') == b'\\xc3' | |||
|
101 | >>> assert b'\\xc3' == safelocalstr(b'\\xc3') | |||
|
102 | >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0} | |||
|
103 | >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0} | |||
|
104 | """ | |||
|
105 | ||||
96 | def tolocal(s): |
|
106 | def tolocal(s): | |
97 | """ |
|
107 | """ | |
98 | Convert a string from internal UTF-8 to local encoding |
|
108 | Convert a string from internal UTF-8 to local encoding | |
@@ -140,7 +150,7 b' def tolocal(s):' | |||||
140 | r = u.encode(_sysstr(encoding), u"replace") |
|
150 | r = u.encode(_sysstr(encoding), u"replace") | |
141 | if u == r.decode(_sysstr(encoding)): |
|
151 | if u == r.decode(_sysstr(encoding)): | |
142 | # r is a safe, non-lossy encoding of s |
|
152 | # r is a safe, non-lossy encoding of s | |
143 | return r |
|
153 | return safelocalstr(r) | |
144 | return localstr(s, r) |
|
154 | return localstr(s, r) | |
145 | except UnicodeDecodeError: |
|
155 | except UnicodeDecodeError: | |
146 | # we should only get here if we're looking at an ancient changeset |
|
156 | # we should only get here if we're looking at an ancient changeset | |
@@ -149,7 +159,7 b' def tolocal(s):' | |||||
149 | r = u.encode(_sysstr(encoding), u"replace") |
|
159 | r = u.encode(_sysstr(encoding), u"replace") | |
150 | if u == r.decode(_sysstr(encoding)): |
|
160 | if u == r.decode(_sysstr(encoding)): | |
151 | # r is a safe, non-lossy encoding of s |
|
161 | # r is a safe, non-lossy encoding of s | |
152 | return r |
|
162 | return safelocalstr(r) | |
153 | return localstr(u.encode('UTF-8'), r) |
|
163 | return localstr(u.encode('UTF-8'), r) | |
154 | except UnicodeDecodeError: |
|
164 | except UnicodeDecodeError: | |
155 | u = s.decode("utf-8", "replace") # last ditch |
|
165 | u = s.decode("utf-8", "replace") # last ditch | |
@@ -402,7 +412,7 b' def jsonescape(s, paranoid=False):' | |||||
402 | JSON is problematic for us because it doesn't support non-Unicode |
|
412 | JSON is problematic for us because it doesn't support non-Unicode | |
403 | bytes. To deal with this, we take the following approach: |
|
413 | bytes. To deal with this, we take the following approach: | |
404 |
|
414 | |||
405 | - localstr objects are converted back to UTF-8 |
|
415 | - localstr/safelocalstr objects are converted back to UTF-8 | |
406 | - valid UTF-8/ASCII strings are passed as-is |
|
416 | - valid UTF-8/ASCII strings are passed as-is | |
407 | - other strings are converted to UTF-8b surrogate encoding |
|
417 | - other strings are converted to UTF-8b surrogate encoding | |
408 | - apply JSON-specified string escaping |
|
418 | - apply JSON-specified string escaping | |
@@ -495,6 +505,7 b' def toutf8b(s):' | |||||
495 | - local strings that have a cached known UTF-8 encoding (aka |
|
505 | - local strings that have a cached known UTF-8 encoding (aka | |
496 | localstr) get sent as UTF-8 so Unicode-oriented clients get the |
|
506 | localstr) get sent as UTF-8 so Unicode-oriented clients get the | |
497 | Unicode data they want |
|
507 | Unicode data they want | |
|
508 | - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well | |||
498 | - because we must preserve UTF-8 bytestring in places such as |
|
509 | - because we must preserve UTF-8 bytestring in places such as | |
499 | filenames, metadata can't be roundtripped without help |
|
510 | filenames, metadata can't be roundtripped without help | |
500 |
|
511 | |||
@@ -508,6 +519,10 b' def toutf8b(s):' | |||||
508 | # assume that the original UTF-8 sequence would never contain |
|
519 | # assume that the original UTF-8 sequence would never contain | |
509 | # invalid characters in U+DCxx range |
|
520 | # invalid characters in U+DCxx range | |
510 | return s._utf8 |
|
521 | return s._utf8 | |
|
522 | elif isinstance(s, safelocalstr): | |||
|
523 | # already verified that s is non-lossy in legacy encoding, which | |||
|
524 | # shouldn't contain characters in U+DCxx range | |||
|
525 | return fromlocal(s) | |||
511 | elif isasciistr(s): |
|
526 | elif isasciistr(s): | |
512 | return s |
|
527 | return s | |
513 | if "\xed" not in s: |
|
528 | if "\xed" not in s: |
@@ -278,6 +278,8 b' def showdescription(context, mapping):' | |||||
278 | if isinstance(s, encoding.localstr): |
|
278 | if isinstance(s, encoding.localstr): | |
279 | # try hard to preserve utf-8 bytes |
|
279 | # try hard to preserve utf-8 bytes | |
280 | return encoding.tolocal(encoding.fromlocal(s).strip()) |
|
280 | return encoding.tolocal(encoding.fromlocal(s).strip()) | |
|
281 | elif isinstance(s, encoding.safelocalstr): | |||
|
282 | return encoding.safelocalstr(s.strip()) | |||
281 | else: |
|
283 | else: | |
282 | return s.strip() |
|
284 | return s.strip() | |
283 |
|
285 |
@@ -4691,6 +4691,13 b' json filter should try round-trip conver' | |||||
4691 | $ HGENCODING=ascii hg log -T "{desc|json}\n" -r0 |
|
4691 | $ HGENCODING=ascii hg log -T "{desc|json}\n" -r0 | |
4692 | "non-ascii branch: \u00e9" |
|
4692 | "non-ascii branch: \u00e9" | |
4693 |
|
4693 | |||
|
4694 | json filter should take input as utf-8 if it was converted from utf-8: | |||
|
4695 | ||||
|
4696 | $ HGENCODING=latin-1 hg log -T "{branch|json}\n" -r0 | |||
|
4697 | "\u00e9" | |||
|
4698 | $ HGENCODING=latin-1 hg log -T "{desc|json}\n" -r0 | |||
|
4699 | "non-ascii branch: \u00e9" | |||
|
4700 | ||||
4694 | json filter takes input as utf-8b: |
|
4701 | json filter takes input as utf-8b: | |
4695 |
|
4702 | |||
4696 | $ HGENCODING=ascii hg log -T "{'`cat utf-8`'|json}\n" -l1 |
|
4703 | $ HGENCODING=ascii hg log -T "{'`cat utf-8`'|json}\n" -l1 |
@@ -53,6 +53,13 b' class Utf8bEncodingTest(unittest.TestCas' | |||||
53 | self.assertEqual(l, b'?') # lossy |
|
53 | self.assertEqual(l, b'?') # lossy | |
54 | self.assertEqual(s, encoding.toutf8b(l)) # utf8 sequence preserved |
|
54 | self.assertEqual(s, encoding.toutf8b(l)) # utf8 sequence preserved | |
55 |
|
55 | |||
|
56 | def testlosslesslatin(self): | |||
|
57 | encoding.encoding = b'latin-1' | |||
|
58 | s = u'\xc0'.encode('utf-8') | |||
|
59 | l = encoding.tolocal(s) | |||
|
60 | self.assertEqual(l, b'\xc0') # lossless | |||
|
61 | self.assertEqual(s, encoding.toutf8b(l)) # convert back to utf-8 | |||
|
62 | ||||
56 | def testlossy0xed(self): |
|
63 | def testlossy0xed(self): | |
57 | encoding.encoding = b'euc-kr' # U+Dxxx Hangul |
|
64 | encoding.encoding = b'euc-kr' # U+Dxxx Hangul | |
58 | s = u'\ud1bc\xc0'.encode('utf-8') |
|
65 | s = u'\ud1bc\xc0'.encode('utf-8') | |
@@ -61,6 +68,13 b' class Utf8bEncodingTest(unittest.TestCas' | |||||
61 | self.assertTrue(l.endswith(b'?')) # lossy |
|
68 | self.assertTrue(l.endswith(b'?')) # lossy | |
62 | self.assertEqual(s, encoding.toutf8b(l)) # utf8 sequence preserved |
|
69 | self.assertEqual(s, encoding.toutf8b(l)) # utf8 sequence preserved | |
63 |
|
70 | |||
|
71 | def testlossless0xed(self): | |||
|
72 | encoding.encoding = b'euc-kr' # U+Dxxx Hangul | |||
|
73 | s = u'\ud1bc'.encode('utf-8') | |||
|
74 | l = encoding.tolocal(s) | |||
|
75 | self.assertEqual(l, b'\xc5\xed') # lossless | |||
|
76 | self.assertEqual(s, encoding.toutf8b(l)) # convert back to utf-8 | |||
|
77 | ||||
64 | if __name__ == '__main__': |
|
78 | if __name__ == '__main__': | |
65 | import silenttestrunner |
|
79 | import silenttestrunner | |
66 | silenttestrunner.main(__name__) |
|
80 | silenttestrunner.main(__name__) |
General Comments 0
You need to be logged in to leave comments.
Login now