##// END OF EJS Templates
encoding: change jsonmap to a list indexed by code point...
Yuya Nishihara -
r28066:d1cc0712 default
parent child Browse files
Show More
@@ -1,545 +1,542
1 1 # encoding.py - character transcoding support for Mercurial
2 2 #
3 3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 from __future__ import absolute_import
9 9
10 10 import locale
11 11 import os
12 12 import unicodedata
13 13
14 14 from . import (
15 15 error,
16 16 )
17 17
# Code points that HFS+ ignores in file names (Apple Technote 1150,
# "Unicode Subtleties"), stored pre-encoded as UTF-8 byte strings so they
# can be stripped with plain str.replace().
_ignore = [unichr(int(codepoint, 16)).encode("utf-8")
           for codepoint in ("200c 200d 200e 200f 202a 202b 202c 202d 202e "
                             "206a 206b 206c 206d 206e 206f feff").split()]
# hfsignoreclean() pre-screens its input on these two lead bytes, so every
# entry above must start with one of them
assert set([seq[0] for seq in _ignore]) == set(["\xe2", "\xef"])
26 26
def hfsignoreclean(s):
    """Return s with every HFS+-ignored code point stripped out.

    s is a UTF-8 encoded byte string; it is returned unchanged when it
    contains none of the sequences listed in _ignore.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # all ignored sequences start with \xe2 or \xef, so strings without
    # either byte can skip the replace loop entirely
    if "\xe2" not in s and "\xef" not in s:
        return s
    for seq in _ignore:
        s = s.replace(seq, '')
    return s
39 39
def _getpreferredencoding():
    '''
    On darwin, getpreferredencoding ignores the locale environment and
    always returns mac-roman. http://bugs.python.org/issue6202 fixes this
    for Python 2.7 and up. This is the same corrected code for earlier
    Python versions.

    However, we can't use a version check for this method, as some distributions
    patch Python to fix this. Instead, we use it as a 'fixer' for the mac-roman
    encoding, as it is unlikely that this encoding is the actually expected.

    Returns the codeset name reported for the user's LC_CTYPE locale, or,
    when the platform has no locale.CODESET, whatever
    locale.getdefaultlocale() derives from the environment (may be None).

    May raise locale.Error if the user's locale cannot be activated.
    '''
    if not hasattr(locale, 'CODESET'):
        # Fall back to parsing environment variables :-(
        return locale.getdefaultlocale()[1]

    oldloc = locale.setlocale(locale.LC_CTYPE)
    # if this raises, the locale has not been changed yet, so there is
    # nothing to restore
    locale.setlocale(locale.LC_CTYPE, "")
    try:
        return locale.nl_langinfo(locale.CODESET)
    finally:
        # always put the process-wide locale back, even if the query failed
        locale.setlocale(locale.LC_CTYPE, oldloc)
63 63
# Encoding names reported by the platform that must be replaced: the first
# two are mapped to 'ascii', and mac-roman is re-resolved through
# _getpreferredencoding().
_encodingfixers = {
    '646': lambda: 'ascii',
    'ANSI_X3.4-1968': lambda: 'ascii',
    'mac-roman': _getpreferredencoding
}

# Local encoding: $HGENCODING wins; otherwise ask the locale machinery and
# fall back to ASCII if it reports nothing or fails.
try:
    encoding = os.environ.get("HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding() or 'ascii'
        if encoding in _encodingfixers:
            encoding = _encodingfixers[encoding]()
except locale.Error:
    encoding = 'ascii'
# error handler used when decoding local strings (see fromlocal)
encodingmode = os.environ.get("HGENCODINGMODE", "strict")
# tried by tolocal() when stored data is not valid UTF-8
fallbackencoding = 'ISO-8859-1'
79 79
class localstr(str):
    '''A local-encoding string that remembers its original UTF-8 form.

    The string value is the local representation ``l``; the UTF-8 bytes
    ``u`` it was derived from are kept in ``_utf8`` so an unmodified string
    can be converted back without loss.'''
    def __new__(cls, u, l):
        inst = str.__new__(cls, l)
        inst._utf8 = u
        return inst
    def __hash__(self):
        # hash on the UTF-8 form to avoid collisions in local string space
        return hash(self._utf8)
89 89
def tolocal(s):
    """
    Convert a string from internal UTF-8 to local encoding

    Internal strings are expected to be UTF-8, but repositories that
    predate locale support may hold latin1 or other character sets. The
    input is therefore decoded strictly as UTF-8 first, then with the
    fallback encoding (Latin-1), and as a last resort as UTF-8 with
    unknown characters replaced.

    Whenever the local form cannot represent the text exactly, a localstr
    is returned so the known UTF-8 bytes travel with the local
    representation and can be recovered by fromlocal().

    Raises error.Abort if the configured encoding is unknown.

    >>> u = 'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = 'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> 'foo: ?' in d
    False
    >>> l1 = 'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    try:
        try:
            # strict decode doubles as a check that s really is UTF-8
            text = s.decode('UTF-8')
            if encoding == 'UTF-8':
                # fast path: local encoding is the internal encoding
                return s
            local = text.encode(encoding, "replace")
            if local.decode(encoding) == text:
                # nothing was lost, a plain string is enough
                return local
            return localstr(s, local)
        except UnicodeDecodeError:
            # not UTF-8: we should only get here for an ancient changeset
            try:
                text = s.decode(fallbackencoding)
                local = text.encode(encoding, "replace")
                if local.decode(encoding) == text:
                    # nothing was lost, a plain string is enough
                    return local
                return localstr(text.encode('UTF-8'), local)
            except UnicodeDecodeError:
                # last ditch: lossy decode, result can't round-trip
                text = s.decode("utf-8", "replace")
                return text.encode(encoding, "replace")
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
150 150
def fromlocal(s):
    """
    Convert a string from the local character encoding to UTF-8

    A localstr is answered from its cached UTF-8 form. Anything else is
    decoded with the local encoding using the error mode from
    HGENCODINGMODE, which defaults to 'strict'. In strict mode an
    undecodable character aborts with an error message; 'replace'
    substitutes a special Unicode character and 'ignore' drops the
    character.

    Raises error.Abort on a decoding failure or an unknown encoding.
    """

    # lossless round-trip: the original UTF-8 bytes were kept around
    if isinstance(s, localstr):
        return s._utf8

    try:
        decoded = s.decode(encoding, encodingmode)
        return decoded.encode("utf-8")
    except UnicodeDecodeError as inst:
        # quote up to 10 bytes either side of the failure for context
        lo = max(0, inst.start - 10)
        sub = s[lo:inst.start + 10]
        raise error.Abort("decoding near '%s': %s!" % (sub, inst))
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
173 173
# East Asian Width classes that ucolwidth() counts as two columns.
# Ambiguous-width characters (class "A") are included only when
# HGENCODINGAMBIGUOUS is set to 'wide'.
if os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide":
    wide = "WFA"
else:
    wide = "WF"
177 177
def colwidth(s):
    """Return the display width, in columns, of local-encoding string s.

    Bytes that cannot be decoded are replaced rather than raising."""
    decoded = s.decode(encoding, 'replace')
    return ucolwidth(decoded)
181 181
def ucolwidth(d):
    """Return the display width, in columns, of Unicode string d.

    Characters whose East Asian Width class is listed in ``wide`` count
    as two columns, all others as one."""
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        # no width data available: assume one column per character
        return len(d)
    columns = 0
    for ch in d:
        if eaw(ch) in wide:
            columns += 2
        else:
            columns += 1
    return columns
188 188
def getcols(s, start, c):
    '''Use colwidth to find a c-column substring of s starting at byte
    index start

    Candidate substrings s[start:end] are tried with growing end, and the
    first one that is exactly c columns wide is returned. Returns None
    when no candidate has that width.'''
    # the upper bound is len(s) + 1 so that the substring running to the
    # very end of s is considered too
    for end in xrange(start + c, len(s) + 1):
        candidate = s[start:end]
        if colwidth(candidate) == c:
            return candidate
    return None
196 196
def trim(s, width, ellipsis='', leftside=False):
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    By default the right end of 's' is cut off; with 'leftside' set the
    left end is cut instead. 'ellipsis' is attached at the trimmed side.
    If 's' cannot be decoded in the local encoding, it is trimmed by
    bytes instead of columns.

    >>> ellipsis = '+++'
    >>> from . import encoding
    >>> encoding.encoding = 'utf-8'
    >>> t= '1234567890'
    >>> print trim(t, 12, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 10, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 8, ellipsis=ellipsis)
    12345+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++67890
    >>> print trim(t, 8)
    12345678
    >>> print trim(t, 8, leftside=True)
    34567890
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(encoding.encoding)
    >>> print trim(t, 12, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 10, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 8, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 5)
    \xe3\x81\x82\xe3\x81\x84
    >>> print trim(t, 5, leftside=True)
    \xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 4, ellipsis=ellipsis)
    +++
    >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
    +++
    >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
    >>> print trim(t, 12, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 10, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 8, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\x66\x77\x88\x99\xaa
    >>> print trim(t, 8)
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> print trim(t, 8, leftside=True)
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    """
    # columns left for text once the ellipsis has been accounted for
    room = width - len(ellipsis)

    try:
        u = s.decode(encoding)
    except UnicodeDecodeError:
        # undecodable input: treat every byte as one column
        if len(s) <= width: # trimming is not needed
            return s
        if room <= 0: # no enough room even for ellipsis
            return ellipsis[:width]
        if leftside:
            return ellipsis + s[-room:]
        return s[:room] + ellipsis

    if ucolwidth(u) <= width: # trimming is not needed
        return s

    if room <= 0: # no enough room even for ellipsis
        return ellipsis[:width]

    # drop one more character from the trimmed side until the rest fits
    for dropped in xrange(1, len(u)):
        if leftside:
            kept = u[dropped:]
        else:
            kept = u[:-dropped]
        if ucolwidth(kept) <= room:
            encoded = kept.encode(encoding)
            if leftside:
                return ellipsis + encoded
            return encoded + ellipsis
    return ellipsis # no enough room for multi-column characters
289 289
def _asciilower(s):
    '''Return s lowercased, provided it is pure ASCII.

    Raises UnicodeDecodeError if s contains a non-ASCII byte.'''
    # decode only to validate; the decoded value itself is not needed
    s.decode('ascii')
    lowered = s.lower()
    return lowered
296 296
def asciilower(s):
    '''Lowercase ASCII string s using parsers.asciilower when available.

    The first call picks the implementation (falling back to the pure
    Python _asciilower) and rebinds the module-level name to it, so later
    calls reach the chosen function directly.'''
    # delay importing avoids cyclic dependency around "parsers" in
    # pure Python build (util => i18n => encoding => parsers => util)
    from . import parsers
    global asciilower
    asciilower = getattr(parsers, 'asciilower', _asciilower)
    return asciilower(s)
305 305
def _asciiupper(s):
    '''Return s uppercased, provided it is pure ASCII.

    Raises UnicodeDecodeError if s contains a non-ASCII byte.'''
    # decode only to validate; the decoded value itself is not needed
    s.decode('ascii')
    uppered = s.upper()
    return uppered
312 312
def asciiupper(s):
    '''Uppercase ASCII string s using parsers.asciiupper when available.

    The first call picks the implementation (falling back to the pure
    Python _asciiupper) and rebinds the module-level name to it, so later
    calls reach the chosen function directly.'''
    # delay importing avoids cyclic dependency around "parsers" in
    # pure Python build (util => i18n => encoding => parsers => util)
    from . import parsers
    global asciiupper
    asciiupper = getattr(parsers, 'asciiupper', _asciiupper)
    return asciiupper(s)
321 321
def lower(s):
    """best-effort encoding-aware lowercasing of local string s

    Pure-ASCII input takes the asciilower() fast path. Other input is
    decoded, lowercased as Unicode and re-encoded in the local encoding;
    if that fails with a Unicode error, a plain byte-wise lower() is used.
    Raises error.Abort if the configured encoding is unknown.
    """
    try:
        return asciilower(s)
    except UnicodeDecodeError:
        pass # not pure ASCII, take the slow path below
    try:
        if isinstance(s, localstr):
            # work from the cached UTF-8 form
            original = s._utf8.decode("utf-8")
        else:
            original = s.decode(encoding, encodingmode)

        folded = original.lower()
        if folded != original:
            return folded.encode(encoding)
        return s # preserve localstring
    except UnicodeError:
        return s.lower() # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
342 342
def upper(s):
    """best-effort encoding-aware uppercasing of local string s

    Pure-ASCII input is handled by asciiupper(); anything else is passed
    on to upperfallback().
    """
    try:
        result = asciiupper(s)
    except UnicodeDecodeError:
        result = upperfallback(s)
    return result
349 349
def upperfallback(s):
    """Uppercase local string s by way of Unicode.

    s is decoded (from its cached UTF-8 form when it is a localstr),
    uppercased and re-encoded in the local encoding. If that fails with a
    Unicode error, a plain byte-wise upper() is used instead. Raises
    error.Abort if the configured encoding is unknown.
    """
    try:
        if isinstance(s, localstr):
            # work from the cached UTF-8 form
            original = s._utf8.decode("utf-8")
        else:
            original = s.decode(encoding, encodingmode)

        folded = original.upper()
        if folded != original:
            return folded.encode(encoding)
        return s # preserve localstring
    except UnicodeError:
        return s.upper() # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
365 365
class normcasespecs(object):
    '''Constants describing how a platform's normcase treats ASCII strings

    Each platform specifies one of these, and it should be consistent with
    what normcase on that platform actually does.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h.'''
    other = 0
    lower = -1
    upper = 1
380 380
# JSON escape table indexed by byte value (0-255); filled in lazily by the
# first jsonescape() call.
_jsonmap = []

def jsonescape(s):
    '''returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape('this is a test')
    'this is a test'
    >>> jsonescape('escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape('escape characters: \\t \\n \\r \\" \\\\')
    'escape characters: \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
    >>> jsonescape('a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape('utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape('')
    ''
    '''

    if not _jsonmap:
        # control characters default to \\uXXXX escapes, everything else
        # maps to itself
        _jsonmap.extend("\\u%04x" % x for x in xrange(32))
        _jsonmap.extend(chr(x) for x in xrange(32, 256))
        # DEL plus the characters that have a short JSON escape
        _jsonmap[0x7f] = '\\u007f'
        _jsonmap[0x09] = '\\t'
        _jsonmap[0x0a] = '\\n'
        _jsonmap[0x22] = '\\"'
        _jsonmap[0x5c] = '\\\\'
        _jsonmap[0x08] = '\\b'
        _jsonmap[0x0c] = '\\f'
        _jsonmap[0x0d] = '\\r'

    # bytearray yields integer byte values, which index the table directly
    return ''.join(_jsonmap[x] for x in bytearray(toutf8b(s)))
426 423
# Number of bytes to try decoding, indexed by the high nibble of the first
# byte of a sequence. 0 marks a plain ASCII byte.
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]

def getutf8char(s, pos):
    '''Return the complete UTF-8 character of s that begins at index pos

    An ASCII byte is returned as-is; otherwise the slice covering the
    expected sequence length is returned once it has decoded cleanly.

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    '''

    first = s[pos]
    # the first nibble tells how many bytes to attempt decoding
    size = _utf8len[ord(first) >> 4]
    if not size: # ascii
        return first

    char = s[pos:pos + size]
    # the decode is done purely for validation; its result is discarded
    char.decode("utf-8")
    return char
445 442
def toutf8b(s):
    '''convert a local, possibly-binary string into UTF-8b

    This is a generic way to preserve data when talking to schemes such
    as JSON and XML, which have no provision for arbitrary byte strings.
    Since Mercurial often doesn't know what encoding data is in, it uses
    so-called UTF-8b.

    A string that is already valid UTF-8 (or ASCII) passes unmodified.
    Otherwise, unsupported bytes are mapped to the UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can be
      round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    # without \xed there can be no pre-existing U+DCxx sequence to
    # re-escape, so clean input can be returned directly
    if "\xed" not in s:
        if isinstance(s, localstr):
            return s._utf8
        try:
            s.decode('utf-8')
            return s
        except UnicodeDecodeError:
            pass

    pieces = []
    pos = 0
    end = len(s)
    while pos < end:
        try:
            c = getutf8char(s, pos)
            # have to re-escape existing U+DCxx characters
            escape = "\xed\xb0\x80" <= c <= "\xed\xb3\xbf"
        except UnicodeDecodeError:
            # invalid byte: escape it
            escape = True
        if escape:
            # map this single byte into U+DC00..U+DCFF and move on by one
            c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
            pos += 1
        else:
            pos += len(c)
        pieces.append(c)
    return "".join(pieces)
502 499
def fromutf8b(s):
    '''Given a UTF-8b string, return a local, possibly-binary string.

    This undoes toutf8b() and returns the original binary string. It is
    a round-trip process for strings like filenames, but metadata that
    was passed through tolocal will remain in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = "\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip("\\xc2\\xc2\\x80")
    True
    >>> roundtrip("\\xef\\xbf\\xbd")
    True
    >>> roundtrip("\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip("\\xf1\\x80\\x80\\x80\\x80")
    True
    '''

    # fast path - every uDxxx sequence starts with \xed
    if "\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    pieces = []
    pos = 0
    end = len(s)
    while pos < end:
        c = getutf8char(s, pos)
        pos += len(c)
        if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
            # unescape U+DCxx: its low 8 bits are the original byte
            c = chr(ord(c.decode("utf-8")) & 0xff)
        pieces.append(c)
    return "".join(pieces)
General Comments 0
You need to be logged in to leave comments. Login now