##// END OF EJS Templates
encoding: avoid localstr when a string can be encoded losslessly (issue2763)...
Matt Mackall -
r13940:b7b26e54 stable
parent child Browse files
Show More
@@ -1,146 +1,150 b''
1 1 # encoding.py - character transcoding support for Mercurial
2 2 #
3 3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 import error
9 9 import unicodedata, locale, os
10 10
11 11 def _getpreferredencoding():
12 12 '''
13 13 On darwin, getpreferredencoding ignores the locale environment and
14 14 always returns mac-roman. http://bugs.python.org/issue6202 fixes this
15 15 for Python 2.7 and up. This is the same corrected code for earlier
16 16 Python versions.
17 17
18 18 However, we can't use a version check for this method, as some distributions
19 19 patch Python to fix this. Instead, we use it as a 'fixer' for the mac-roman
20 20 encoding, as it is unlikely that this encoding is actually the expected one.
21 21 '''
22 22 try:
23 23 locale.CODESET
24 24 except AttributeError:
25 25 # Fall back to parsing environment variables :-(
26 26 return locale.getdefaultlocale()[1]
27 27
28 28 oldloc = locale.setlocale(locale.LC_CTYPE)
29 29 locale.setlocale(locale.LC_CTYPE, "")
30 30 result = locale.nl_langinfo(locale.CODESET)
31 31 locale.setlocale(locale.LC_CTYPE, oldloc)
32 32
33 33 return result
34 34
35 35 _encodingfixers = {
36 36 '646': lambda: 'ascii',
37 37 'ANSI_X3.4-1968': lambda: 'ascii',
38 38 'mac-roman': _getpreferredencoding
39 39 }
40 40
41 41 try:
42 42 encoding = os.environ.get("HGENCODING")
43 43 if not encoding:
44 44 encoding = locale.getpreferredencoding() or 'ascii'
45 45 encoding = _encodingfixers.get(encoding, lambda: encoding)()
46 46 except locale.Error:
47 47 encoding = 'ascii'
48 48 encodingmode = os.environ.get("HGENCODINGMODE", "strict")
49 49 fallbackencoding = 'ISO-8859-1'
50 50
51 51 class localstr(str):
52 52 '''This class allows strings that are unmodified to be
53 53 round-tripped to the local encoding and back'''
54 54 def __new__(cls, u, l):
55 55 s = str.__new__(cls, l)
56 56 s._utf8 = u
57 57 return s
58 58 def __hash__(self):
59 59 return hash(self._utf8) # avoid collisions in local string space
60 60
61 61 def tolocal(s):
62 62 """
63 63 Convert a string from internal UTF-8 to local encoding
64 64
65 65 All internal strings should be UTF-8 but some repos before the
66 66 implementation of locale support may contain latin1 or possibly
67 67 other character sets. We attempt to decode everything strictly
68 68 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
69 69 replace unknown characters.
70 70
71 71 The localstr class is used to cache the known UTF-8 encoding of
72 72 strings next to their local representation to allow lossless
73 73 round-trip conversion back to UTF-8.
74 74
75 75 >>> u = 'foo: \\xc3\\xa4' # utf-8
76 76 >>> l = tolocal(u)
77 77 >>> l
78 78 'foo: ?'
79 79 >>> fromlocal(l)
80 80 'foo: \\xc3\\xa4'
81 81 >>> u2 = 'foo: \\xc3\\xa1'
82 82 >>> d = { l: 1, tolocal(u2): 2 }
83 83 >>> d # no collision
84 84 {'foo: ?': 1, 'foo: ?': 2}
85 85 >>> 'foo: ?' in d
86 86 False
87 87 >>> l1 = 'foo: \\xe4' # historical latin1 fallback
88 88 >>> l = tolocal(l1)
89 89 >>> l
90 90 'foo: ?'
91 91 >>> fromlocal(l) # magically in utf-8
92 92 'foo: \\xc3\\xa4'
93 93 """
94 94
95 95 for e in ('UTF-8', fallbackencoding):
96 96 try:
97 97 u = s.decode(e) # attempt strict decoding
98 if e == 'UTF-8':
99 return localstr(s, u.encode(encoding, "replace"))
98 r = u.encode(encoding, "replace")
99 if u == r.decode(encoding):
100 # r is a safe, non-lossy encoding of s
101 return r
102 elif e == 'UTF-8':
103 return localstr(s, r)
100 104 else:
101 return localstr(u.encode('UTF-8'),
102 u.encode(encoding, "replace"))
105 return localstr(u.encode('UTF-8'), r)
106
103 107 except LookupError, k:
104 108 raise error.Abort("%s, please check your locale settings" % k)
105 109 except UnicodeDecodeError:
106 110 pass
107 111 u = s.decode("utf-8", "replace") # last ditch
108 112 return u.encode(encoding, "replace") # can't round-trip
109 113
110 114 def fromlocal(s):
111 115 """
112 116 Convert a string from the local character encoding to UTF-8
113 117
114 118 We attempt to decode strings using the encoding mode set by
115 119 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
116 120 characters will cause an error message. Other modes include
117 121 'replace', which replaces unknown characters with a special
118 122 Unicode character, and 'ignore', which drops the character.
119 123 """
120 124
121 125 # can we do a lossless round-trip?
122 126 if isinstance(s, localstr):
123 127 return s._utf8
124 128
125 129 try:
126 130 return s.decode(encoding, encodingmode).encode("utf-8")
127 131 except UnicodeDecodeError, inst:
128 132 sub = s[max(0, inst.start - 10):inst.start + 10]
129 133 raise error.Abort("decoding near '%s': %s!" % (sub, inst))
130 134 except LookupError, k:
131 135 raise error.Abort("%s, please check your locale settings" % k)
132 136
133 137 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
134 138 ambiguous = os.environ.get("HGENCODINGAMBIGUOUS", "narrow")
135 139
136 140 def colwidth(s):
137 141 "Find the column width of a local-encoding string for display"
138 142 d = s.decode(encoding, 'replace')
139 143 if hasattr(unicodedata, 'east_asian_width'):
140 144 wide = "WF"
141 145 if ambiguous == "wide":
142 146 wide = "WFA"
143 147 w = unicodedata.east_asian_width
144 148 return sum([w(c) in wide and 2 or 1 for c in d])
145 149 return len(d)
146 150
@@ -1,243 +1,249 b''
1 1 Test character encoding
2 2
3 3 $ hg init t
4 4 $ cd t
5 5
6 6 we need a repo with some legacy latin-1 changesets
7 7
8 8 $ hg unbundle $TESTDIR/legacy-encoding.hg
9 9 adding changesets
10 10 adding manifests
11 11 adding file changes
12 12 added 2 changesets with 2 changes to 1 files
13 13 (run 'hg update' to get a working copy)
14 14 $ hg co
15 15 1 files updated, 0 files merged, 0 files removed, 0 files unresolved
16 16 $ python << EOF
17 17 > f = file('latin-1', 'w'); f.write("latin-1 e' encoded: \xe9"); f.close()
18 18 > f = file('utf-8', 'w'); f.write("utf-8 e' encoded: \xc3\xa9"); f.close()
19 19 > f = file('latin-1-tag', 'w'); f.write("\xe9"); f.close()
20 20 > EOF
21 21
22 22 should fail with encoding error
23 23
24 24 $ echo "plain old ascii" > a
25 25 $ hg st
26 26 M a
27 27 ? latin-1
28 28 ? latin-1-tag
29 29 ? utf-8
30 30 $ HGENCODING=ascii hg ci -l latin-1
31 31 transaction abort!
32 32 rollback completed
33 33 abort: decoding near ' encoded: \xe9': 'ascii' codec can't decode byte 0xe9 in position 20: ordinal not in range(128)! (esc)
34 34 [255]
35 35
36 36 these should work
37 37
38 38 $ echo "latin-1" > a
39 39 $ HGENCODING=latin-1 hg ci -l latin-1
40 40 $ echo "utf-8" > a
41 41 $ HGENCODING=utf-8 hg ci -l utf-8
42 42 $ HGENCODING=latin-1 hg tag `cat latin-1-tag`
43 43 $ HGENCODING=latin-1 hg branch `cat latin-1-tag`
44 44 marked working directory as branch \xe9 (esc)
45 45 $ HGENCODING=latin-1 hg ci -m 'latin1 branch'
46 46 $ rm .hg/branch
47 47
48 48 hg log (ascii)
49 49
50 50 $ hg --encoding ascii log
51 51 changeset: 5:093c6077d1c8
52 52 branch: ?
53 53 tag: tip
54 54 user: test
55 55 date: Thu Jan 01 00:00:00 1970 +0000
56 56 summary: latin1 branch
57 57
58 58 changeset: 4:94db611b4196
59 59 user: test
60 60 date: Thu Jan 01 00:00:00 1970 +0000
61 61 summary: Added tag ? for changeset ca661e7520de
62 62
63 63 changeset: 3:ca661e7520de
64 64 tag: ?
65 65 user: test
66 66 date: Thu Jan 01 00:00:00 1970 +0000
67 67 summary: utf-8 e' encoded: ?
68 68
69 69 changeset: 2:650c6f3d55dd
70 70 user: test
71 71 date: Thu Jan 01 00:00:00 1970 +0000
72 72 summary: latin-1 e' encoded: ?
73 73
74 74 changeset: 1:0e5b7e3f9c4a
75 75 user: test
76 76 date: Mon Jan 12 13:46:40 1970 +0000
77 77 summary: koi8-r: ????? = u'\u0440\u0442\u0443\u0442\u044c'
78 78
79 79 changeset: 0:1e78a93102a3
80 80 user: test
81 81 date: Mon Jan 12 13:46:40 1970 +0000
82 82 summary: latin-1 e': ? = u'\xe9'
83 83
84 84
85 85 hg log (latin-1)
86 86
87 87 $ hg --encoding latin-1 log
88 88 changeset: 5:093c6077d1c8
89 89 branch: \xe9 (esc)
90 90 tag: tip
91 91 user: test
92 92 date: Thu Jan 01 00:00:00 1970 +0000
93 93 summary: latin1 branch
94 94
95 95 changeset: 4:94db611b4196
96 96 user: test
97 97 date: Thu Jan 01 00:00:00 1970 +0000
98 98 summary: Added tag \xe9 for changeset ca661e7520de (esc)
99 99
100 100 changeset: 3:ca661e7520de
101 101 tag: \xe9 (esc)
102 102 user: test
103 103 date: Thu Jan 01 00:00:00 1970 +0000
104 104 summary: utf-8 e' encoded: \xe9 (esc)
105 105
106 106 changeset: 2:650c6f3d55dd
107 107 user: test
108 108 date: Thu Jan 01 00:00:00 1970 +0000
109 109 summary: latin-1 e' encoded: \xe9 (esc)
110 110
111 111 changeset: 1:0e5b7e3f9c4a
112 112 user: test
113 113 date: Mon Jan 12 13:46:40 1970 +0000
114 114 summary: koi8-r: \xd2\xd4\xd5\xd4\xd8 = u'\\u0440\\u0442\\u0443\\u0442\\u044c' (esc)
115 115
116 116 changeset: 0:1e78a93102a3
117 117 user: test
118 118 date: Mon Jan 12 13:46:40 1970 +0000
119 119 summary: latin-1 e': \xe9 = u'\\xe9' (esc)
120 120
121 121
122 122 hg log (utf-8)
123 123
124 124 $ hg --encoding utf-8 log
125 125 changeset: 5:093c6077d1c8
126 126 branch: \xc3\xa9 (esc)
127 127 tag: tip
128 128 user: test
129 129 date: Thu Jan 01 00:00:00 1970 +0000
130 130 summary: latin1 branch
131 131
132 132 changeset: 4:94db611b4196
133 133 user: test
134 134 date: Thu Jan 01 00:00:00 1970 +0000
135 135 summary: Added tag \xc3\xa9 for changeset ca661e7520de (esc)
136 136
137 137 changeset: 3:ca661e7520de
138 138 tag: \xc3\xa9 (esc)
139 139 user: test
140 140 date: Thu Jan 01 00:00:00 1970 +0000
141 141 summary: utf-8 e' encoded: \xc3\xa9 (esc)
142 142
143 143 changeset: 2:650c6f3d55dd
144 144 user: test
145 145 date: Thu Jan 01 00:00:00 1970 +0000
146 146 summary: latin-1 e' encoded: \xc3\xa9 (esc)
147 147
148 148 changeset: 1:0e5b7e3f9c4a
149 149 user: test
150 150 date: Mon Jan 12 13:46:40 1970 +0000
151 151 summary: koi8-r: \xc3\x92\xc3\x94\xc3\x95\xc3\x94\xc3\x98 = u'\\u0440\\u0442\\u0443\\u0442\\u044c' (esc)
152 152
153 153 changeset: 0:1e78a93102a3
154 154 user: test
155 155 date: Mon Jan 12 13:46:40 1970 +0000
156 156 summary: latin-1 e': \xc3\xa9 = u'\\xe9' (esc)
157 157
158 158
159 159 hg tags (ascii)
160 160
161 161 $ HGENCODING=ascii hg tags
162 162 tip 5:093c6077d1c8
163 163 ? 3:ca661e7520de
164 164
165 165 hg tags (latin-1)
166 166
167 167 $ HGENCODING=latin-1 hg tags
168 168 tip 5:093c6077d1c8
169 169 \xe9 3:ca661e7520de (esc)
170 170
171 171 hg tags (utf-8)
172 172
173 173 $ HGENCODING=utf-8 hg tags
174 174 tip 5:093c6077d1c8
175 175 \xc3\xa9 3:ca661e7520de (esc)
176 176
177 177 hg branches (ascii)
178 178
179 179 $ HGENCODING=ascii hg branches
180 180 ? 5:093c6077d1c8
181 181 default 4:94db611b4196 (inactive)
182 182
183 183 hg branches (latin-1)
184 184
185 185 $ HGENCODING=latin-1 hg branches
186 186 \xe9 5:093c6077d1c8 (esc)
187 187 default 4:94db611b4196 (inactive)
188 188
189 189 hg branches (utf-8)
190 190
191 191 $ HGENCODING=utf-8 hg branches
192 192 \xc3\xa9 5:093c6077d1c8 (esc)
193 193 default 4:94db611b4196 (inactive)
194 194 $ echo '[ui]' >> .hg/hgrc
195 195 $ echo 'fallbackencoding = koi8-r' >> .hg/hgrc
196 196
197 197 hg log (utf-8)
198 198
199 199 $ HGENCODING=utf-8 hg log
200 200 changeset: 5:093c6077d1c8
201 201 branch: \xc3\xa9 (esc)
202 202 tag: tip
203 203 user: test
204 204 date: Thu Jan 01 00:00:00 1970 +0000
205 205 summary: latin1 branch
206 206
207 207 changeset: 4:94db611b4196
208 208 user: test
209 209 date: Thu Jan 01 00:00:00 1970 +0000
210 210 summary: Added tag \xc3\xa9 for changeset ca661e7520de (esc)
211 211
212 212 changeset: 3:ca661e7520de
213 213 tag: \xc3\xa9 (esc)
214 214 user: test
215 215 date: Thu Jan 01 00:00:00 1970 +0000
216 216 summary: utf-8 e' encoded: \xc3\xa9 (esc)
217 217
218 218 changeset: 2:650c6f3d55dd
219 219 user: test
220 220 date: Thu Jan 01 00:00:00 1970 +0000
221 221 summary: latin-1 e' encoded: \xc3\xa9 (esc)
222 222
223 223 changeset: 1:0e5b7e3f9c4a
224 224 user: test
225 225 date: Mon Jan 12 13:46:40 1970 +0000
226 226 summary: koi8-r: \xd1\x80\xd1\x82\xd1\x83\xd1\x82\xd1\x8c = u'\\u0440\\u0442\\u0443\\u0442\\u044c' (esc)
227 227
228 228 changeset: 0:1e78a93102a3
229 229 user: test
230 230 date: Mon Jan 12 13:46:40 1970 +0000
231 231 summary: latin-1 e': \xd0\x98 = u'\\xe9' (esc)
232 232
233 233
234 234 hg log (dolphin)
235 235
236 236 $ HGENCODING=dolphin hg log
237 237 abort: unknown encoding: dolphin, please check your locale settings
238 238 [255]
239 239 $ HGENCODING=ascii hg branch `cat latin-1-tag`
240 240 abort: decoding near '\xe9': 'ascii' codec can't decode byte 0xe9 in position 0: ordinal not in range(128)! (esc)
241 241 [255]
242 242 $ cp latin-1-tag .hg/branch
243 243 $ HGENCODING=latin-1 hg ci -m 'auto-promote legacy name'
244
245 Test roundtrip encoding of lookup tables when not using UTF-8 (issue2763)
246
247 $ HGENCODING=latin-1 hg up `cat latin-1-tag`
248 0 files updated, 0 files merged, 1 files removed, 0 files unresolved
249
General Comments 0
You need to be logged in to leave comments. Login now