##// END OF EJS Templates
encoding: add localstr class to track UTF-8 version of transcoded strings...
Matt Mackall -
r13046:7cc4263e default
parent child Browse files
Show More
@@ -48,6 +48,16 except locale.Error:
48 48 encodingmode = os.environ.get("HGENCODINGMODE", "strict")
49 49 fallbackencoding = 'ISO-8859-1'
50 50
class localstr(str):
    '''Local-encoding string that remembers its UTF-8 original.

    The value of the instance is the local-encoding text; the UTF-8
    text it was transcoded from is kept in the _utf8 attribute so an
    unmodified string can be round-tripped to the local encoding and
    back.
    '''
    def __new__(cls, u, l):
        # u is the UTF-8 form, l is the local form (the str value)
        obj = super(localstr, cls).__new__(cls, l)
        obj._utf8 = u
        return obj

    def __hash__(self):
        # hash on the UTF-8 form to avoid collisions in local string
        # space: two different originals may share one local rendering
        return hash(self._utf8)
60
51 61 def tolocal(s):
52 62 """
53 63 Convert a string from internal UTF-8 to local encoding
@@ -57,17 +67,45 def tolocal(s):
57 67 other character sets. We attempt to decode everything strictly
58 68 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
59 69 replace unknown characters.
70
71 The localstr class is used to cache the known UTF-8 encoding of
72 strings next to their local representation to allow lossless
73 round-trip conversion back to UTF-8.
74
75 >>> u = 'foo: \\xc3\\xa4' # utf-8
76 >>> l = tolocal(u)
77 >>> l
78 'foo: ?'
79 >>> fromlocal(l)
80 'foo: \\xc3\\xa4'
81 >>> u2 = 'foo: \\xc3\\xa1'
82 >>> d = { l: 1, tolocal(u2): 2 }
83 >>> d # no collision
84 {'foo: ?': 1, 'foo: ?': 2}
85 >>> 'foo: ?' in d
86 False
87 >>> l1 = 'foo: \\xe4' # historical latin1 fallback
88 >>> l = tolocal(l1)
89 >>> l
90 'foo: ?'
91 >>> fromlocal(l) # magically in utf-8
92 'foo: \\xc3\\xa4'
60 93 """
94
61 95 for e in ('UTF-8', fallbackencoding):
62 96 try:
63 97 u = s.decode(e) # attempt strict decoding
64 return u.encode(encoding, "replace")
98 if u == 'UTF-8':
99 return localstr(s, u.encode(encoding, "replace"))
100 else:
101 return localstr(u.encode('UTF-8'),
102 u.encode(encoding, "replace"))
65 103 except LookupError, k:
66 104 raise error.Abort("%s, please check your locale settings" % k)
67 105 except UnicodeDecodeError:
68 106 pass
69 107 u = s.decode("utf-8", "replace") # last ditch
70 return u.encode(encoding, "replace")
108 return u.encode(encoding, "replace") # can't round-trip
71 109
72 110 def fromlocal(s):
73 111 """
@@ -79,6 +117,11 def fromlocal(s):
79 117 'replace', which replaces unknown characters with a special
80 118 Unicode character, and 'ignore', which drops the character.
81 119 """
120
121 # can we do a lossless round-trip?
122 if isinstance(s, localstr):
123 return s._utf8
124
82 125 try:
83 126 return s.decode(encoding, encodingmode).encode("utf-8")
84 127 except UnicodeDecodeError, inst:
@@ -13,8 +13,8 doctest.testmod(mercurial.dagparser, opt
13 13 import mercurial.match
14 14 doctest.testmod(mercurial.match)
15 15
16 import mercurial.url
17 doctest.testmod(mercurial.url)
16 import mercurial.encoding
17 doctest.testmod(mercurial.encoding)
18 18
19 19 import hgext.convert.cvsps
20 20 doctest.testmod(hgext.convert.cvsps)
General Comments 0
You need to be logged in to leave comments. Login now