##// END OF EJS Templates
encoding: add localstr class to track UTF-8 version of transcoded strings...
Matt Mackall -
r13046:7cc4263e default
parent child Browse files
Show More
@@ -48,6 +48,16 b' except locale.Error:'
48 48 encodingmode = os.environ.get("HGENCODINGMODE", "strict")
49 49 fallbackencoding = 'ISO-8859-1'
50 50
class localstr(str):
    '''A local-encoding string that remembers its UTF-8 original.

    The instance compares and behaves like the local-encoding string
    it was built from, while the known UTF-8 form is kept in the
    ``_utf8`` attribute so that an unmodified string can be
    round-tripped to the local encoding and back.

    u is the UTF-8 form, l is the local-encoding form.
    '''

    def __new__(cls, u, l):
        # the str value itself is the local representation
        inst = super(localstr, cls).__new__(cls, l)
        inst._utf8 = u
        return inst

    def __hash__(self):
        # hash on the UTF-8 form to avoid collisions in local string space
        return hash(self._utf8)
60
51 61 def tolocal(s):
52 62 """
53 63 Convert a string from internal UTF-8 to local encoding
@@ -57,17 +67,45 b' def tolocal(s):'
57 67 other character sets. We attempt to decode everything strictly
58 68 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
59 69 replace unknown characters.
70
71 The localstr class is used to cache the known UTF-8 encoding of
72 strings next to their local representation to allow lossless
73 round-trip conversion back to UTF-8.
74
75 >>> u = 'foo: \\xc3\\xa4' # utf-8
76 >>> l = tolocal(u)
77 >>> l
78 'foo: ?'
79 >>> fromlocal(l)
80 'foo: \\xc3\\xa4'
81 >>> u2 = 'foo: \\xc3\\xa1'
82 >>> d = { l: 1, tolocal(u2): 2 }
83 >>> d # no collision
84 {'foo: ?': 1, 'foo: ?': 2}
85 >>> 'foo: ?' in d
86 False
87 >>> l1 = 'foo: \\xe4' # historical latin1 fallback
88 >>> l = tolocal(l1)
89 >>> l
90 'foo: ?'
91 >>> fromlocal(l) # magically in utf-8
92 'foo: \\xc3\\xa4'
60 93 """
94
61 95 for e in ('UTF-8', fallbackencoding):
62 96 try:
63 97 u = s.decode(e) # attempt strict decoding
64 return u.encode(encoding, "replace")
98 if u == 'UTF-8':
99 return localstr(s, u.encode(encoding, "replace"))
100 else:
101 return localstr(u.encode('UTF-8'),
102 u.encode(encoding, "replace"))
65 103 except LookupError, k:
66 104 raise error.Abort("%s, please check your locale settings" % k)
67 105 except UnicodeDecodeError:
68 106 pass
69 107 u = s.decode("utf-8", "replace") # last ditch
70 return u.encode(encoding, "replace")
108 return u.encode(encoding, "replace") # can't round-trip
71 109
72 110 def fromlocal(s):
73 111 """
@@ -79,6 +117,11 b' def fromlocal(s):'
79 117 'replace', which replaces unknown characters with a special
80 118 Unicode character, and 'ignore', which drops the character.
81 119 """
120
121 # can we do a lossless round-trip?
122 if isinstance(s, localstr):
123 return s._utf8
124
82 125 try:
83 126 return s.decode(encoding, encodingmode).encode("utf-8")
84 127 except UnicodeDecodeError, inst:
@@ -13,8 +13,8 b' doctest.testmod(mercurial.dagparser, opt'
13 13 import mercurial.match
14 14 doctest.testmod(mercurial.match)
15 15
16 import mercurial.url
17 doctest.testmod(mercurial.url)
16 import mercurial.encoding
17 doctest.testmod(mercurial.encoding)
18 18
19 19 import hgext.convert.cvsps
20 20 doctest.testmod(hgext.convert.cvsps)
General Comments 0
You need to be logged in to leave comments. Login now