##// END OF EJS Templates
encoding.upper: factor out fallback code...
Siddharth Agarwal -
r24597:b4258d5a default
parent child Browse files
Show More
@@ -1,488 +1,490
1 1 # encoding.py - character transcoding support for Mercurial
2 2 #
3 3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 import error
9 9 import unicodedata, locale, os
10 10
11 11 # These unicode characters are ignored by HFS+ (Apple Technote 1150,
12 12 # "Unicode Subtleties"), so we need to ignore them in some places for
13 13 # sanity.
14 14 _ignore = [unichr(int(x, 16)).encode("utf-8") for x in
15 15 "200c 200d 200e 200f 202a 202b 202c 202d 202e "
16 16 "206a 206b 206c 206d 206e 206f feff".split()]
17 17 # verify the next function will work
18 18 assert set([i[0] for i in _ignore]) == set(["\xe2", "\xef"])
19 19
def hfsignoreclean(s):
    """Strip the HFS+-ignored codepoints (as UTF-8 byte sequences) from s.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # Every ignored codepoint encodes to UTF-8 beginning with \xe2 or \xef
    # (verified by the assert above), so this cheap scan lets most strings
    # skip the replace loop entirely.
    if "\xe2" not in s and "\xef" not in s:
        return s
    for junk in _ignore:
        s = s.replace(junk, '')
    return s
32 32
def _getpreferredencoding():
    '''
    On darwin, getpreferredencoding ignores the locale environment and
    always returns mac-roman. http://bugs.python.org/issue6202 fixes this
    for Python 2.7 and up. This is the same corrected code for earlier
    Python versions.

    However, we can't use a version check for this method, as some
    distributions patch Python to fix this. Instead, we use it as a 'fixer'
    for the mac-roman encoding, as it is unlikely that this encoding is the
    one actually expected by the user.
    '''
    try:
        locale.CODESET
    except AttributeError:
        # POSIX CODESET is unavailable (e.g. on Windows); fall back to
        # parsing environment variables :-(
        return locale.getdefaultlocale()[1]

    # query the codeset of the user's own LC_CTYPE locale, then restore
    # whatever locale was active before (setlocale state is process-global)
    oldloc = locale.setlocale(locale.LC_CTYPE)
    locale.setlocale(locale.LC_CTYPE, "")
    result = locale.nl_langinfo(locale.CODESET)
    locale.setlocale(locale.LC_CTYPE, oldloc)

    return result
56 56
# Map known-wrong codeset names to a callable producing the real encoding.
# Callables (rather than plain strings) let the mac-roman entry defer to
# _getpreferredencoding(), which only needs to run when that entry hits.
_encodingfixers = {
    '646': lambda: 'ascii',
    'ANSI_X3.4-1968': lambda: 'ascii',
    'mac-roman': _getpreferredencoding
}

try:
    # HGENCODING overrides the locale; otherwise ask the locale module
    encoding = os.environ.get("HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding() or 'ascii'
    encoding = _encodingfixers.get(encoding, lambda: encoding)()
except locale.Error:
    encoding = 'ascii'
# How decode errors are handled when converting from the local encoding:
# 'strict' (abort), 'replace' or 'ignore' -- see fromlocal()
encodingmode = os.environ.get("HGENCODINGMODE", "strict")
# repos predating locale support may contain latin1 data -- see tolocal()
fallbackencoding = 'ISO-8859-1'
72 72
class localstr(str):
    '''A local-encoding string that remembers its original UTF-8 form.

    Instances compare and behave like the local-encoding bytes they wrap,
    while the UTF-8 source is kept in _utf8 so a tolocal()/fromlocal()
    round trip is lossless.'''
    def __new__(cls, u, l):
        inst = str.__new__(cls, l)
        inst._utf8 = u
        return inst
    def __hash__(self):
        # hash the UTF-8 form: distinct UTF-8 inputs that collapse to the
        # same lossy local bytes must not collide in local string space
        return hash(self._utf8)
82 82
83 83 def tolocal(s):
84 84 """
85 85 Convert a string from internal UTF-8 to local encoding
86 86
87 87 All internal strings should be UTF-8 but some repos before the
88 88 implementation of locale support may contain latin1 or possibly
89 89 other character sets. We attempt to decode everything strictly
90 90 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
91 91 replace unknown characters.
92 92
93 93 The localstr class is used to cache the known UTF-8 encoding of
94 94 strings next to their local representation to allow lossless
95 95 round-trip conversion back to UTF-8.
96 96
97 97 >>> u = 'foo: \\xc3\\xa4' # utf-8
98 98 >>> l = tolocal(u)
99 99 >>> l
100 100 'foo: ?'
101 101 >>> fromlocal(l)
102 102 'foo: \\xc3\\xa4'
103 103 >>> u2 = 'foo: \\xc3\\xa1'
104 104 >>> d = { l: 1, tolocal(u2): 2 }
105 105 >>> len(d) # no collision
106 106 2
107 107 >>> 'foo: ?' in d
108 108 False
109 109 >>> l1 = 'foo: \\xe4' # historical latin1 fallback
110 110 >>> l = tolocal(l1)
111 111 >>> l
112 112 'foo: ?'
113 113 >>> fromlocal(l) # magically in utf-8
114 114 'foo: \\xc3\\xa4'
115 115 """
116 116
117 117 try:
118 118 try:
119 119 # make sure string is actually stored in UTF-8
120 120 u = s.decode('UTF-8')
121 121 if encoding == 'UTF-8':
122 122 # fast path
123 123 return s
124 124 r = u.encode(encoding, "replace")
125 125 if u == r.decode(encoding):
126 126 # r is a safe, non-lossy encoding of s
127 127 return r
128 128 return localstr(s, r)
129 129 except UnicodeDecodeError:
130 130 # we should only get here if we're looking at an ancient changeset
131 131 try:
132 132 u = s.decode(fallbackencoding)
133 133 r = u.encode(encoding, "replace")
134 134 if u == r.decode(encoding):
135 135 # r is a safe, non-lossy encoding of s
136 136 return r
137 137 return localstr(u.encode('UTF-8'), r)
138 138 except UnicodeDecodeError:
139 139 u = s.decode("utf-8", "replace") # last ditch
140 140 return u.encode(encoding, "replace") # can't round-trip
141 141 except LookupError, k:
142 142 raise error.Abort(k, hint="please check your locale settings")
143 143
144 144 def fromlocal(s):
145 145 """
146 146 Convert a string from the local character encoding to UTF-8
147 147
148 148 We attempt to decode strings using the encoding mode set by
149 149 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
150 150 characters will cause an error message. Other modes include
151 151 'replace', which replaces unknown characters with a special
152 152 Unicode character, and 'ignore', which drops the character.
153 153 """
154 154
155 155 # can we do a lossless round-trip?
156 156 if isinstance(s, localstr):
157 157 return s._utf8
158 158
159 159 try:
160 160 return s.decode(encoding, encodingmode).encode("utf-8")
161 161 except UnicodeDecodeError, inst:
162 162 sub = s[max(0, inst.start - 10):inst.start + 10]
163 163 raise error.Abort("decoding near '%s': %s!" % (sub, inst))
164 164 except LookupError, k:
165 165 raise error.Abort(k, hint="please check your locale settings")
166 166
# How to treat ambiguous-width characters. Set HGENCODINGAMBIGUOUS=wide
# to treat them as wide: the 'A' (ambiguous) east_asian_width class is
# then counted as two columns by ucolwidth().
if os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide":
    wide = "WFA"
else:
    wide = "WF"
170 170
def colwidth(s):
    "Find the column width of a string for display in the local encoding"
    # decode leniently: width estimation should never abort on bad bytes
    decoded = s.decode(encoding, 'replace')
    return ucolwidth(decoded)
174 174
def ucolwidth(d):
    "Find the column width of a Unicode string for display"
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        # very old Pythons lack east_asian_width; assume one column each
        return len(d)
    # characters whose width class is in 'wide' occupy two columns
    width = 0
    for c in d:
        if eaw(c) in wide:
            width += 2
        else:
            width += 1
    return width
181 181
def getcols(s, start, c):
    '''Use colwidth to find a c-column substring of s starting at byte
    index start'''
    # candidates are at least c bytes long (a column needs >= 1 byte);
    # grow the slice until its display width matches, returning None
    # implicitly when nothing fits.
    # NOTE(review): end never reaches len(s), so a match that needs the
    # full tail of s is not found -- confirm this is intended.
    end = start + c
    while end < len(s):
        candidate = s[start:end]
        if colwidth(candidate) == c:
            return candidate
        end += 1
189 189
def trim(s, width, ellipsis='', leftside=False):
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    >>> ellipsis = '+++'
    >>> from mercurial import encoding
    >>> encoding.encoding = 'utf-8'
    >>> t= '1234567890'
    >>> print trim(t, 12, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 10, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 8, ellipsis=ellipsis)
    12345+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++67890
    >>> print trim(t, 8)
    12345678
    >>> print trim(t, 8, leftside=True)
    34567890
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(encoding.encoding)
    >>> print trim(t, 12, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 10, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 8, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 5)
    \xe3\x81\x82\xe3\x81\x84
    >>> print trim(t, 5, leftside=True)
    \xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 4, ellipsis=ellipsis)
    +++
    >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
    +++
    >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
    >>> print trim(t, 12, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 10, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 8, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\x66\x77\x88\x99\xaa
    >>> print trim(t, 8)
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> print trim(t, 8, leftside=True)
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    """
    try:
        u = s.decode(encoding)
    except UnicodeDecodeError:
        # undecodable bytes: fall back to trimming byte-wise, treating
        # each byte as one column
        if len(s) <= width: # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0: # not enough room even for ellipsis
            return ellipsis[:width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width: # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0: # not enough room even for ellipsis
        return ellipsis[:width + len(ellipsis)]

    if leftside:
        uslice = lambda i: u[i:]
        concat = lambda s: ellipsis + s
    else:
        uslice = lambda i: u[:-i]
        concat = lambda s: s + ellipsis
    # drop characters one at a time from the trimmed side until the
    # remainder (plus ellipsis) fits within 'width' columns
    for i in xrange(1, len(u)):
        usub = uslice(i)
        if ucolwidth(usub) <= width:
            return concat(usub.encode(encoding))
    return ellipsis # not enough room for multi-column characters
282 282
283 283 def _asciilower(s):
284 284 '''convert a string to lowercase if ASCII
285 285
286 286 Raises UnicodeDecodeError if non-ASCII characters are found.'''
287 287 s.decode('ascii')
288 288 return s.lower()
289 289
def asciilower(s):
    # delay importing avoids cyclic dependency around "parsers" in
    # pure Python build (util => i18n => encoding => parsers => util)
    import parsers
    impl = getattr(parsers, 'asciilower', _asciilower)
    # self-replacing function: rebind the module-level name to the chosen
    # implementation so later calls skip the import and getattr entirely
    global asciilower
    asciilower = impl
    return impl(s)
298 298
299 299 def _asciiupper(s):
300 300 '''convert a string to uppercase if ASCII
301 301
302 302 Raises UnicodeDecodeError if non-ASCII characters are found.'''
303 303 s.decode('ascii')
304 304 return s.upper()
305 305
def asciiupper(s):
    # delay importing avoids cyclic dependency around "parsers" in
    # pure Python build (util => i18n => encoding => parsers => util)
    import parsers
    impl = getattr(parsers, 'asciiupper', _asciiupper)
    # self-replacing function: rebind the module-level name to the chosen
    # implementation so later calls skip the import and getattr entirely
    global asciiupper
    asciiupper = impl
    return impl(s)
314 314
315 315 def lower(s):
316 316 "best-effort encoding-aware case-folding of local string s"
317 317 try:
318 318 return asciilower(s)
319 319 except UnicodeDecodeError:
320 320 pass
321 321 try:
322 322 if isinstance(s, localstr):
323 323 u = s._utf8.decode("utf-8")
324 324 else:
325 325 u = s.decode(encoding, encodingmode)
326 326
327 327 lu = u.lower()
328 328 if u == lu:
329 329 return s # preserve localstring
330 330 return lu.encode(encoding)
331 331 except UnicodeError:
332 332 return s.lower() # we don't know how to fold this except in ASCII
333 333 except LookupError, k:
334 334 raise error.Abort(k, hint="please check your locale settings")
335 335
def upper(s):
    "best-effort encoding-aware case-folding of local string s"
    # fast path: asciiupper folds pure-ASCII strings without decoding;
    # anything containing a non-ASCII byte raises UnicodeDecodeError and
    # is handled by the slower, encoding-aware upperfallback()
    try:
        return asciiupper(s)
    except UnicodeDecodeError:
        return upperfallback(s)
342
343 def upperfallback(s):
342 344 try:
343 345 if isinstance(s, localstr):
344 346 u = s._utf8.decode("utf-8")
345 347 else:
346 348 u = s.decode(encoding, encodingmode)
347 349
348 350 uu = u.upper()
349 351 if u == uu:
350 352 return s # preserve localstring
351 353 return uu.encode(encoding)
352 354 except UnicodeError:
353 355 return s.upper() # we don't know how to fold this except in ASCII
354 356 except LookupError, k:
355 357 raise error.Abort(k, hint="please check your locale settings")
356 358
class normcasespecs(object):
    '''what a platform's normcase does to ASCII strings

    Each platform declares exactly one of these constants, and it must
    agree with what that platform's normcase actually does:

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called'''
    lower = -1
    upper = 1
    other = 0
369 371
370 372 _jsonmap = {}
371 373
372 374 def jsonescape(s):
373 375 '''returns a string suitable for JSON
374 376
375 377 JSON is problematic for us because it doesn't support non-Unicode
376 378 bytes. To deal with this, we take the following approach:
377 379
378 380 - localstr objects are converted back to UTF-8
379 381 - valid UTF-8/ASCII strings are passed as-is
380 382 - other strings are converted to UTF-8b surrogate encoding
381 383 - apply JSON-specified string escaping
382 384
383 385 (escapes are doubled in these tests)
384 386
385 387 >>> jsonescape('this is a test')
386 388 'this is a test'
387 389 >>> jsonescape('escape characters: \\0 \\x0b \\t \\n \\r \\" \\\\')
388 390 'escape characters: \\\\u0000 \\\\u000b \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
389 391 >>> jsonescape('a weird byte: \\xdd')
390 392 'a weird byte: \\xed\\xb3\\x9d'
391 393 >>> jsonescape('utf-8: caf\\xc3\\xa9')
392 394 'utf-8: caf\\xc3\\xa9'
393 395 >>> jsonescape('')
394 396 ''
395 397 '''
396 398
397 399 if not _jsonmap:
398 400 for x in xrange(32):
399 401 _jsonmap[chr(x)] = "\u%04x" %x
400 402 for x in xrange(32, 256):
401 403 c = chr(x)
402 404 _jsonmap[c] = c
403 405 _jsonmap['\t'] = '\\t'
404 406 _jsonmap['\n'] = '\\n'
405 407 _jsonmap['\"'] = '\\"'
406 408 _jsonmap['\\'] = '\\\\'
407 409 _jsonmap['\b'] = '\\b'
408 410 _jsonmap['\f'] = '\\f'
409 411 _jsonmap['\r'] = '\\r'
410 412
411 413 return ''.join(_jsonmap[c] for c in toutf8b(s))
412 414
def toutf8b(s):
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can have
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    if isinstance(s, localstr):
        # the exact UTF-8 form is already cached
        return s._utf8

    try:
        s.decode('utf-8')
        return s # already valid UTF-8: pass through untouched
    except UnicodeDecodeError:
        # 'valid' is s with every undecodable byte dropped; walking the
        # two strings in step identifies exactly which input bytes
        # survived the strict decode
        valid = s.decode('utf-8', 'ignore').encode('utf-8')
        out = ""
        idx = 0
        for byte in s:
            if valid[idx:idx + 1] == byte:
                # part of a well-formed UTF-8 sequence: keep as-is
                out += byte
                idx += 1
            else:
                # surrogate-encode the byte that didn't round-trip
                out += unichr(0xdc00 + ord(byte)).encode('utf-8')
        return out
461 463
def fromutf8b(s):
    '''Given a UTF-8b string, return a local, possibly-binary string.

    This undoes the surrogate escapes applied by toutf8b and returns
    the original binary string. It is a round-trip process for strings
    like filenames, but metadata that was passed through tolocal will
    remain in UTF-8.

    >>> m = "\\xc3\\xa9\\x99abcd"
    >>> n = toutf8b(m)
    >>> n
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> fromutf8b(n) == m
    True
    '''

    # fast path: \xed is the leading byte of every UTF-8-encoded
    # uDC00-uDCFF surrogate, so its absence means no escapes to undo
    if "\xed" not in s:
        return s

    u = s.decode("utf-8")
    out = ""
    for ch in u:
        if ord(ch) & 0xff00 == 0xdc00:
            # surrogate escape: recover the original raw byte
            out += chr(ord(ch) & 0xff)
        else:
            out += ch.encode("utf-8")
    return out
General Comments 0
You need to be logged in to leave comments. Login now