##// END OF EJS Templates
encoding.lower: use fast ASCII lower...
Siddharth Agarwal -
r22779:d9585dda default
parent child Browse files
Show More
@@ -1,432 +1,431 b''
1 1 # encoding.py - character transcoding support for Mercurial
2 2 #
3 3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 import error, parsers
9 9 import unicodedata, locale, os
10 10
11 11 def _getpreferredencoding():
12 12 '''
13 13 On darwin, getpreferredencoding ignores the locale environment and
14 14 always returns mac-roman. http://bugs.python.org/issue6202 fixes this
15 15 for Python 2.7 and up. This is the same corrected code for earlier
16 16 Python versions.
17 17
18 18 However, we can't use a version check for this method, as some distributions
19 19 patch Python to fix this. Instead, we use it as a 'fixer' for the mac-roman
20 20 encoding, as it is unlikely that this encoding is the actually expected.
21 21 '''
22 22 try:
23 23 locale.CODESET
24 24 except AttributeError:
25 25 # Fall back to parsing environment variables :-(
26 26 return locale.getdefaultlocale()[1]
27 27
28 28 oldloc = locale.setlocale(locale.LC_CTYPE)
29 29 locale.setlocale(locale.LC_CTYPE, "")
30 30 result = locale.nl_langinfo(locale.CODESET)
31 31 locale.setlocale(locale.LC_CTYPE, oldloc)
32 32
33 33 return result
34 34
# Map locale names that misreport the real encoding to a callable
# producing the encoding Mercurial should actually use.
_encodingfixers = {
    '646': lambda: 'ascii',
    'ANSI_X3.4-1968': lambda: 'ascii',
    'mac-roman': _getpreferredencoding
}

try:
    # HGENCODING, when set, overrides the locale-derived encoding.
    encoding = os.environ.get("HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding() or 'ascii'
    # apply a fixer if this locale name is known to be wrong
    encoding = _encodingfixers.get(encoding, lambda: encoding)()
except locale.Error:
    encoding = 'ascii'
# how decode errors are handled ('strict', 'replace' or 'ignore')
encodingmode = os.environ.get("HGENCODINGMODE", "strict")
# encoding assumed for pre-locale-support repository data
fallbackencoding = 'ISO-8859-1'
50 50
class localstr(str):
    '''A str subclass that caches the UTF-8 form of a string alongside
    its local-encoding form, so unmodified strings can be round-tripped
    to the local encoding and back losslessly.'''

    def __new__(cls, u, l):
        # the instance's own value is the local-encoding form; the
        # original UTF-8 bytes ride along in the _utf8 attribute
        obj = str.__new__(cls, l)
        obj._utf8 = u
        return obj

    def __hash__(self):
        # hash on the cached UTF-8 value so distinct UTF-8 strings that
        # collapse to the same local bytes do not collide
        return hash(self._utf8)
60 60
def tolocal(s):
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    >>> u = 'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = 'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> 'foo: ?' in d
    False
    >>> l1 = 'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    try:
        try:
            # make sure string is actually stored in UTF-8
            u = s.decode('UTF-8')
            if encoding == 'UTF-8':
                # fast path
                return s
            r = u.encode(encoding, "replace")
            if u == r.decode(encoding):
                # r is a safe, non-lossy encoding of s
                return r
            # lossy conversion: keep the true UTF-8 bytes cached on the
            # localstr so fromlocal() can recover them exactly
            return localstr(s, r)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                # retry with the historical latin1 fallback encoding
                u = s.decode(fallbackencoding)
                r = u.encode(encoding, "replace")
                if u == r.decode(encoding):
                    # r is a safe, non-lossy encoding of s
                    return r
                # cache the re-encoded UTF-8 form, not the raw input
                return localstr(u.encode('UTF-8'), r)
            except UnicodeDecodeError:
                u = s.decode("utf-8", "replace") # last ditch
                return u.encode(encoding, "replace") # can't round-trip
    except LookupError, k:
        # the configured encoding name is unknown to Python
        raise error.Abort(k, hint="please check your locale settings")
121 121
122 122 def fromlocal(s):
123 123 """
124 124 Convert a string from the local character encoding to UTF-8
125 125
126 126 We attempt to decode strings using the encoding mode set by
127 127 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
128 128 characters will cause an error message. Other modes include
129 129 'replace', which replaces unknown characters with a special
130 130 Unicode character, and 'ignore', which drops the character.
131 131 """
132 132
133 133 # can we do a lossless round-trip?
134 134 if isinstance(s, localstr):
135 135 return s._utf8
136 136
137 137 try:
138 138 return s.decode(encoding, encodingmode).encode("utf-8")
139 139 except UnicodeDecodeError, inst:
140 140 sub = s[max(0, inst.start - 10):inst.start + 10]
141 141 raise error.Abort("decoding near '%s': %s!" % (sub, inst))
142 142 except LookupError, k:
143 143 raise error.Abort(k, hint="please check your locale settings")
144 144
# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
# 'WFA' makes ucolwidth count the Ambiguous east-asian class as wide too.
if os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide":
    wide = "WFA"
else:
    wide = "WF"
148 148
def colwidth(s):
    "Find the display column width of a byte string in the local encoding"
    # undecodable bytes are replaced, so this never raises
    decoded = s.decode(encoding, 'replace')
    return ucolwidth(decoded)
152 152
def ucolwidth(d):
    "Find the column width of a Unicode string for display"
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        # very old Pythons lack east_asian_width; count every char as 1
        return len(d)
    # characters whose east-asian width class is in 'wide' take 2 columns
    return sum(2 if eaw(c) in wide else 1 for c in d)
159 159
def getcols(s, start, c):
    '''Use colwidth to find a c-column substring of s starting at byte
    index start'''
    # grow the candidate one byte at a time until it spans c columns;
    # returns None implicitly if no prefix reaches exactly c columns
    end = start + c
    while end < len(s):
        candidate = s[start:end]
        if colwidth(candidate) == c:
            return candidate
        end += 1
167 167
def trim(s, width, ellipsis='', leftside=False):
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    >>> ellipsis = '+++'
    >>> from mercurial import encoding
    >>> encoding.encoding = 'utf-8'
    >>> t= '1234567890'
    >>> print trim(t, 12, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 10, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 8, ellipsis=ellipsis)
    12345+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++67890
    >>> print trim(t, 8)
    12345678
    >>> print trim(t, 8, leftside=True)
    34567890
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(encoding.encoding)
    >>> print trim(t, 12, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 10, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 8, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 5)
    \xe3\x81\x82\xe3\x81\x84
    >>> print trim(t, 5, leftside=True)
    \xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 4, ellipsis=ellipsis)
    +++
    >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
    +++
    >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
    >>> print trim(t, 12, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 10, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 8, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\x66\x77\x88\x99\xaa
    >>> print trim(t, 8)
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> print trim(t, 8, leftside=True)
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    """
    try:
        u = s.decode(encoding)
    except UnicodeDecodeError:
        # undecodable input: fall back to trimming by raw byte count
        # instead of by display column
        if len(s) <= width: # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0: # no enough room even for ellipsis
            return ellipsis[:width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width: # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0: # no enough room even for ellipsis
        return ellipsis[:width + len(ellipsis)]

    # drop one character at a time from the trimmed side until the
    # remainder fits in the available display columns
    if leftside:
        uslice = lambda i: u[i:]
        concat = lambda s: ellipsis + s
    else:
        uslice = lambda i: u[:-i]
        concat = lambda s: s + ellipsis
    for i in xrange(1, len(u)):
        usub = uslice(i)
        if ucolwidth(usub) <= width:
            return concat(usub.encode(encoding))
    return ellipsis # no enough room for multi-column characters
260 260
def asciilower(s):
    '''Lowercase a string, insisting that it be pure ASCII.

    Raises UnicodeDecodeError if any non-ASCII character is found.'''
    # decode purely for validation; the result is discarded
    s.decode('ascii')
    return s.lower()
267 267
268 268 asciilower = getattr(parsers, 'asciilower', asciilower)
269 269
270 270 def lower(s):
271 271 "best-effort encoding-aware case-folding of local string s"
272 272 try:
273 s.decode('ascii') # throw exception for non-ASCII character
274 return s.lower()
273 return asciilower(s)
275 274 except UnicodeDecodeError:
276 275 pass
277 276 try:
278 277 if isinstance(s, localstr):
279 278 u = s._utf8.decode("utf-8")
280 279 else:
281 280 u = s.decode(encoding, encodingmode)
282 281
283 282 lu = u.lower()
284 283 if u == lu:
285 284 return s # preserve localstring
286 285 return lu.encode(encoding)
287 286 except UnicodeError:
288 287 return s.lower() # we don't know how to fold this except in ASCII
289 288 except LookupError, k:
290 289 raise error.Abort(k, hint="please check your locale settings")
291 290
292 291 def upper(s):
293 292 "best-effort encoding-aware case-folding of local string s"
294 293 try:
295 294 s.decode('ascii') # throw exception for non-ASCII character
296 295 return s.upper()
297 296 except UnicodeDecodeError:
298 297 pass
299 298 try:
300 299 if isinstance(s, localstr):
301 300 u = s._utf8.decode("utf-8")
302 301 else:
303 302 u = s.decode(encoding, encodingmode)
304 303
305 304 uu = u.upper()
306 305 if u == uu:
307 306 return s # preserve localstring
308 307 return uu.encode(encoding)
309 308 except UnicodeError:
310 309 return s.upper() # we don't know how to fold this except in ASCII
311 310 except LookupError, k:
312 311 raise error.Abort(k, hint="please check your locale settings")
313 312
314 313 _jsonmap = {}
315 314
def jsonescape(s):
    '''returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape('this is a test')
    'this is a test'
    >>> jsonescape('escape characters: \\0 \\x0b \\t \\n \\r \\" \\\\')
    'escape characters: \\\\u0000 \\\\u000b \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
    >>> jsonescape('a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape('utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape('')
    ''
    '''

    if not _jsonmap:
        # populate the escape table on first use
        for x in xrange(32):
            # control characters use the \uXXXX form; the backslash is
            # written explicitly ('\\u') rather than relying on '\u' not
            # being an escape in byte strings -- that is a Python 2-only
            # accident and a syntax error under Python 3/unicode_literals
            _jsonmap[chr(x)] = '\\u%04x' % x
        for x in xrange(32, 256):
            # all other bytes pass through unchanged
            c = chr(x)
            _jsonmap[c] = c
        # JSON's short escapes override the generic entries above
        _jsonmap['\t'] = '\\t'
        _jsonmap['\n'] = '\\n'
        _jsonmap['\"'] = '\\"'
        _jsonmap['\\'] = '\\\\'
        _jsonmap['\b'] = '\\b'
        _jsonmap['\f'] = '\\f'
        _jsonmap['\r'] = '\\r'

    # toutf8b makes arbitrary byte strings representable before escaping
    return ''.join(_jsonmap[c] for c in toutf8b(s))
356 355
def toutf8b(s):
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can have
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    if isinstance(s, localstr):
        # the true UTF-8 form was cached when the localstr was created
        return s._utf8

    try:
        s.decode('utf-8')
        # already valid UTF-8 (or ASCII): pass through unmodified
        return s
    except UnicodeDecodeError:
        # surrogate-encode any characters that don't round-trip
        # s2 holds only the bytes of s that survive a lossy utf-8 pass;
        # walking s and s2 in lockstep identifies the bad bytes
        s2 = s.decode('utf-8', 'ignore').encode('utf-8')
        r = ""
        pos = 0
        for c in s:
            if s2[pos:pos + 1] == c:
                # byte survived the round-trip: keep it as-is
                r += c
                pos += 1
            else:
                # bad byte: map it into the U+DC00-U+DCFF surrogate range
                r += unichr(0xdc00 + ord(c)).encode('utf-8')
        return r
405 404
def fromutf8b(s):
    '''Given a UTF-8b string, return a local, possibly-binary string.

    This reverses toutf8b and returns the original binary string. This
    is a round-trip process for strings like filenames, but metadata
    that's was passed through tolocal will remain in UTF-8.

    >>> m = "\\xc3\\xa9\\x99abcd"
    >>> n = toutf8b(m)
    >>> n
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> fromutf8b(n) == m
    True
    '''

    # fast path: every UTF-8 encoding of a U+DCxx surrogate starts with
    # the byte 0xed, so its absence means there is nothing to unescape
    if "\xed" not in s:
        return s

    parts = []
    for c in s.decode("utf-8"):
        point = ord(c)
        if point & 0xff00 == 0xdc00:
            # surrogate-escaped byte: recover the raw byte value
            parts.append(chr(point & 0xff))
        else:
            # ordinary character: re-encode as plain UTF-8
            parts.append(c.encode("utf-8"))
    return "".join(parts)
General Comments 0
You need to be logged in to leave comments. Login now