##// END OF EJS Templates
encoding: add option to escape non-ascii characters in JSON...
Yuya Nishihara -
r28068:9ece901f default
parent child Browse files
Show More
@@ -1,540 +1,568 b''
1 1 # encoding.py - character transcoding support for Mercurial
2 2 #
3 3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 from __future__ import absolute_import
9 9
10 import array
10 11 import locale
11 12 import os
12 13 import unicodedata
13 14
14 15 from . import (
15 16 error,
16 17 )
17 18
# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
_ignore = [unichr(int(x, 16)).encode("utf-8") for x in
           "200c 200d 200e 200f 202a 202b 202c 202d 202e "
           "206a 206b 206c 206d 206e 206f feff".split()]
# verify the next function will work: every ignored codepoint's utf-8
# encoding starts with \xe2 or \xef, which is the fast-path test used by
# hfsignoreclean() below
assert set([i[0] for i in _ignore]) == set(["\xe2", "\xef"])
26 27
def hfsignoreclean(s):
    """Remove codepoints ignored by HFS+ from s.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # fast path: all ignored codepoints encode to utf-8 starting with one
    # of these two lead bytes (asserted at module load), so most strings
    # are returned untouched
    if "\xe2" not in s and "\xef" not in s:
        return s
    for ignored in _ignore:
        s = s.replace(ignored, '')
    return s
39 40
def _getpreferredencoding():
    '''
    On darwin, getpreferredencoding ignores the locale environment and
    always returns mac-roman. http://bugs.python.org/issue6202 fixes this
    for Python 2.7 and up. This is the same corrected code for earlier
    Python versions.

    However, we can't use a version check for this method, as some distributions
    patch Python to fix this. Instead, we use it as a 'fixer' for the mac-roman
    encoding, as it is unlikely that this encoding is the actually expected.
    '''
    try:
        # locale.CODESET only exists where nl_langinfo is available
        locale.CODESET
    except AttributeError:
        # Fall back to parsing environment variables :-(
        return locale.getdefaultlocale()[1]

    # query the codeset of the user's preferred locale, then restore the
    # previously active LC_CTYPE so module import has no lasting effect
    oldloc = locale.setlocale(locale.LC_CTYPE)
    locale.setlocale(locale.LC_CTYPE, "")
    result = locale.nl_langinfo(locale.CODESET)
    locale.setlocale(locale.LC_CTYPE, oldloc)

    return result
63 64
# Fixups for misreported locale encodings, keyed by the reported name and
# mapping to a callable that yields the corrected name: '646' and
# 'ANSI_X3.4-1968' are aliases for plain ascii, while 'mac-roman' is
# usually the darwin bug worked around by _getpreferredencoding above.
_encodingfixers = {
    '646': lambda: 'ascii',
    'ANSI_X3.4-1968': lambda: 'ascii',
    'mac-roman': _getpreferredencoding
}
69 70
try:
    # HGENCODING overrides the locale-derived encoding entirely
    encoding = os.environ.get("HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding() or 'ascii'
    # apply any known fixup for misreported encoding names
    encoding = _encodingfixers.get(encoding, lambda: encoding)()
except locale.Error:
    encoding = 'ascii'
# codec error handler used when decoding local strings ('strict',
# 'replace' or 'ignore'; see fromlocal's docstring)
encodingmode = os.environ.get("HGENCODINGMODE", "strict")
# encoding assumed for repository data that predates locale support
fallbackencoding = 'ISO-8859-1'
79 80
class localstr(str):
    '''This class allows strings that are unmodified to be
    round-tripped to the local encoding and back'''
    def __new__(cls, u, l):
        # the string value itself is the local-encoding form; the known
        # utf-8 original rides along for lossless recovery by fromlocal()
        inst = str.__new__(cls, l)
        inst._utf8 = u
        return inst
    def __hash__(self):
        # hash the utf-8 form to avoid collisions in local string space
        return hash(self._utf8)
89 90
def tolocal(s):
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    >>> u = 'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = 'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> 'foo: ?' in d
    False
    >>> l1 = 'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    try:
        try:
            # make sure string is actually stored in UTF-8
            u = s.decode('UTF-8')
            if encoding == 'UTF-8':
                # fast path
                return s
            r = u.encode(encoding, "replace")
            if u == r.decode(encoding):
                # r is a safe, non-lossy encoding of s
                return r
            # lossy conversion: cache the utf-8 original alongside it
            return localstr(s, r)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                u = s.decode(fallbackencoding)
                r = u.encode(encoding, "replace")
                if u == r.decode(encoding):
                    # r is a safe, non-lossy encoding of s
                    return r
                # cache the recovered utf-8 form for lossless round-trips
                return localstr(u.encode('UTF-8'), r)
            except UnicodeDecodeError:
                u = s.decode("utf-8", "replace") # last ditch
                return u.encode(encoding, "replace") # can't round-trip
    except LookupError as k:
        # the configured encoding name is unknown to Python's codec registry
        raise error.Abort(k, hint="please check your locale settings")
150 151
def fromlocal(s):
    """
    Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.
    """

    # localstr carries its utf-8 original, so round-tripping is free
    if isinstance(s, localstr):
        return s._utf8

    try:
        u = s.decode(encoding, encodingmode)
    except UnicodeDecodeError as inst:
        sub = s[max(0, inst.start - 10):inst.start + 10]
        raise error.Abort("decoding near '%s': %s!" % (sub, inst))
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
    return u.encode("utf-8")
173 174
# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
# The value is the set of unicodedata.east_asian_width() codes counted as
# two columns by ucolwidth(): 'W'ide and 'F'ullwidth always, plus
# 'A'mbiguous when HGENCODINGAMBIGUOUS=wide.
wide = (os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
        and "WFA" or "WF")
177 178
def colwidth(s):
    "Find the column width of a string for display in the local encoding"
    # decode with 'replace' so undecodable bytes still count as one column
    decoded = s.decode(encoding, 'replace')
    return ucolwidth(decoded)
181 182
def ucolwidth(d):
    "Find the column width of a Unicode string for display"
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        # no east-asian width data available; assume one column per char
        return len(d)
    # wide/fullwidth (and optionally ambiguous) characters take two columns
    return sum([2 if eaw(c) in wide else 1 for c in d])
188 189
def getcols(s, start, c):
    '''Use colwidth to find a c-column substring of s starting at byte
    index start'''
    # grow the candidate byte range until it renders at exactly c columns;
    # implicitly returns None if no such substring exists
    for end in xrange(start + c, len(s)):
        candidate = s[start:end]
        if colwidth(candidate) == c:
            return candidate
196 197
def trim(s, width, ellipsis='', leftside=False):
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    >>> ellipsis = '+++'
    >>> from . import encoding
    >>> encoding.encoding = 'utf-8'
    >>> t= '1234567890'
    >>> print trim(t, 12, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 10, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 8, ellipsis=ellipsis)
    12345+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++67890
    >>> print trim(t, 8)
    12345678
    >>> print trim(t, 8, leftside=True)
    34567890
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(encoding.encoding)
    >>> print trim(t, 12, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 10, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 8, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 5)
    \xe3\x81\x82\xe3\x81\x84
    >>> print trim(t, 5, leftside=True)
    \xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 4, ellipsis=ellipsis)
    +++
    >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
    +++
    >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
    >>> print trim(t, 12, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 10, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 8, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\x66\x77\x88\x99\xaa
    >>> print trim(t, 8)
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> print trim(t, 8, leftside=True)
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    """
    try:
        u = s.decode(encoding)
    except UnicodeDecodeError:
        # undecodable input: fall back to trimming by raw byte count
        if len(s) <= width: # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0: # no enough room even for ellipsis
            return ellipsis[:width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width: # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0: # no enough room even for ellipsis
        return ellipsis[:width + len(ellipsis)]

    # pick the slicing direction and where the ellipsis is attached
    if leftside:
        uslice = lambda i: u[i:]
        concat = lambda s: ellipsis + s
    else:
        uslice = lambda i: u[:-i]
        concat = lambda s: s + ellipsis
    # drop characters one at a time from the trimmed side until it fits
    for i in xrange(1, len(u)):
        usub = uslice(i)
        if ucolwidth(usub) <= width:
            return concat(usub.encode(encoding))
    return ellipsis # no enough room for multi-column characters
289 290
def _asciilower(s):
    '''convert a string to lowercase if ASCII

    Raises UnicodeDecodeError if non-ASCII characters are found.'''
    # the decode result is discarded; the call serves purely as an
    # ASCII-only validity check
    s.decode('ascii')
    lowered = s.lower()
    return lowered
296 297
def asciilower(s):
    # delay importing avoids cyclic dependency around "parsers" in
    # pure Python build (util => i18n => encoding => parsers => util)
    from . import parsers
    # prefer the C implementation when parsers provides one, otherwise
    # fall back to the pure-Python version above
    impl = getattr(parsers, 'asciilower', _asciilower)
    # rebind the module-level name so the lookup only happens on first call
    global asciilower
    asciilower = impl
    return impl(s)
305 306
def _asciiupper(s):
    '''convert a string to uppercase if ASCII

    Raises UnicodeDecodeError if non-ASCII characters are found.'''
    # the decode result is discarded; the call serves purely as an
    # ASCII-only validity check
    s.decode('ascii')
    uppered = s.upper()
    return uppered
312 313
def asciiupper(s):
    # delay importing avoids cyclic dependency around "parsers" in
    # pure Python build (util => i18n => encoding => parsers => util)
    from . import parsers
    # prefer the C implementation when parsers provides one, otherwise
    # fall back to the pure-Python version above
    impl = getattr(parsers, 'asciiupper', _asciiupper)
    # rebind the module-level name so the lookup only happens on first call
    global asciiupper
    asciiupper = impl
    return impl(s)
321 322
def lower(s):
    "best-effort encoding-aware case-folding of local string s"
    try:
        # fast path for pure-ASCII strings
        return asciilower(s)
    except UnicodeDecodeError:
        pass
    try:
        if isinstance(s, localstr):
            # use the cached utf-8 form for an exact decode
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(encoding, encodingmode)

        lu = u.lower()
        if u == lu:
            return s # preserve localstring
        return lu.encode(encoding)
    except UnicodeError:
        return s.lower() # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
342 343
def upper(s):
    "best-effort encoding-aware case-folding of local string s"
    try:
        # fast path for pure-ASCII strings
        result = asciiupper(s)
    except UnicodeDecodeError:
        result = upperfallback(s)
    return result
349 350
def upperfallback(s):
    "encoding-aware uppercasing for strings that are not pure ASCII"
    try:
        if isinstance(s, localstr):
            # use the cached utf-8 form for an exact decode
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(encoding, encodingmode)

        uu = u.upper()
        if u == uu:
            return s # preserve localstring
        return uu.encode(encoding)
    except UnicodeError:
        return s.upper() # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
365 366
class normcasespecs(object):
    '''what a platform's normcase does to ASCII strings

    This is specified per platform, and should be consistent with what normcase
    on that platform actually does.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h.'''
    lower = -1  # normcase lowercases ASCII
    upper = 1   # normcase uppercases ASCII
    other = 0   # always delegate to the fallback function
380 381
# Defect fixed: this region of the extracted text interleaved the
# pre-change construction (extend over xrange(32, 256) plus a 0x7f
# assignment) with the post-change one; executing both would corrupt the
# table. Only the post-change construction is kept below.
#
# _jsonmap maps each byte value (used as an index) to its JSON string
# form: control characters get \uXXXX escapes, printable ASCII passes
# through, and a handful of characters get their short JSON escapes.
_jsonmap = []
_jsonmap.extend("\\u%04x" % x for x in range(32))
_jsonmap.extend(chr(x) for x in range(32, 127))
_jsonmap.append('\\u007f')
_jsonmap[0x09] = '\\t'
_jsonmap[0x0a] = '\\n'
_jsonmap[0x22] = '\\"'
_jsonmap[0x5c] = '\\\\'
_jsonmap[0x08] = '\\b'
_jsonmap[0x0c] = '\\f'
_jsonmap[0x0d] = '\\r'
# the paranoid table stops at ASCII (128 entries), which makes jsonescape's
# fast path raise IndexError for any non-ASCII byte and fall back to
# explicit \uXXXX escaping; the normal table passes bytes 128-255 through
_paranoidjsonmap = _jsonmap[:]
_jsonmap.extend(chr(x) for x in range(128, 256))
392 395
def jsonescape(s, paranoid=False):
    '''returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape('this is a test')
    'this is a test'
    >>> jsonescape('escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape('escape characters: \\t \\n \\r \\" \\\\')
    'escape characters: \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
    >>> jsonescape('a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape('utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape('')
    ''

    If paranoid, non-ascii characters are also escaped. This is suitable for
    web output.

    >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape('a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    '''
    # Defect fixed: the extracted text also contained the stale pre-change
    # 'def jsonescape(s):' line and its one-line body, yielding two merged
    # definitions; only the post-change version is kept.

    if paranoid:
        jm = _paranoidjsonmap
    else:
        jm = _jsonmap

    u8chars = toutf8b(s)
    try:
        return ''.join(jm[x] for x in bytearray(u8chars)) # fast path
    except IndexError:
        # only the 128-entry paranoid table can raise here: a byte was
        # >= 0x80, so escape via UTF-16 code units instead
        pass
    # non-BMP char is represented as UTF-16 surrogate pair
    u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
    u16codes.pop(0) # drop BOM
    return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)
421 449
# utf-8 sequence length, indexed by the first byte's high nibble: 0 marks
# plain ASCII (nibbles 0-7); continuation bytes (nibbles 8-0xb) map to 1 so
# the validating decode in getutf8char rejects them; 0xc-0xf give the
# multi-byte sequence lengths 2, 2, 3 and 4
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
423 451
def getutf8char(s, pos):
    '''get the next full utf-8 character in the given string, starting at pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    '''
    # expected sequence length, taken from the first byte's high nibble
    nbytes = _utf8len[ord(s[pos]) >> 4]
    if not nbytes:
        # plain ASCII byte: return it directly
        return s[pos]

    seq = s[pos:pos + nbytes]
    # the attempted decode doubles as validation of the sequence
    seq.decode("utf-8")
    return seq
440 468
def toutf8b(s):
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can have
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    # \xed is the utf-8 lead byte of encoded surrogates; its absence means
    # no already-escaped U+DCxx characters can be present, so valid utf-8
    # may be returned as-is
    if "\xed" not in s:
        if isinstance(s, localstr):
            return s._utf8
        try:
            s.decode('utf-8')
            return s
        except UnicodeDecodeError:
            pass

    # slow path: walk the string one utf-8 character at a time
    r = ""
    pos = 0
    l = len(s)
    while pos < l:
        try:
            c = getutf8char(s, pos)
            if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
                # have to re-escape existing U+DCxx characters
                c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
                pos += 1
            else:
                pos += len(c)
        except UnicodeDecodeError:
            # invalid byte: map it into the U+DCxx surrogate range
            c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
            pos += 1
        r += c
    return r
497 525
def fromutf8b(s):
    '''Given a UTF-8b string, return a local, possibly-binary string.

    return the original binary string. This
    is a round-trip process for strings like filenames, but metadata
    that's was passed through tolocal will remain in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = "\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip("\\xc2\\xc2\\x80")
    True
    >>> roundtrip("\\xef\\xbf\\xbd")
    True
    >>> roundtrip("\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip("\\xf1\\x80\\x80\\x80\\x80")
    True
    '''

    # fast path - look for uDxxx prefixes in s
    # (\xed is the utf-8 lead byte of all encoded surrogates)
    if "\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    r = ""
    pos = 0
    l = len(s)
    while pos < l:
        c = getutf8char(s, pos)
        pos += len(c)
        # unescape U+DCxx characters
        if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
            # recover the original byte from the code point's low 8 bits
            c = chr(ord(c.decode("utf-8")) & 0xff)
        r += c
    return r
General Comments 0
You need to be logged in to leave comments. Login now