##// END OF EJS Templates
encoding: escape U+007F (DEL) character in JSON...
Yuya Nishihara -
r27881:ffa599f3 default
parent child Browse files
Show More
@@ -1,542 +1,545 b''
1 1 # encoding.py - character transcoding support for Mercurial
2 2 #
3 3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 from __future__ import absolute_import
9 9
10 10 import locale
11 11 import os
12 12 import unicodedata
13 13
14 14 from . import (
15 15 error,
16 16 )
17 17
# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity. Each entry is the UTF-8 byte sequence for one ignorable
# codepoint (zero-width joiners/marks, bidi controls, and U+FEFF BOM).
_ignore = [unichr(int(x, 16)).encode("utf-8") for x in
    "200c 200d 200e 200f 202a 202b 202c 202d 202e "
    "206a 206b 206c 206d 206e 206f feff".split()]
# verify the next function will work: every ignorable sequence must begin
# with byte \xe2 or \xef, so hfsignoreclean's cheap lead-byte check is sound
assert set([i[0] for i in _ignore]) == set(["\xe2", "\xef"])
26 26
def hfsignoreclean(s):
    """Remove codepoints ignored by HFS+ from s.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # every ignorable sequence starts with \xe2 or \xef (asserted above),
    # so the common all-ASCII case skips the replace loop entirely
    if "\xe2" not in s and "\xef" not in s:
        return s
    for seq in _ignore:
        s = s.replace(seq, '')
    return s
39 39
def _getpreferredencoding():
    '''
    On darwin, getpreferredencoding ignores the locale environment and
    always returns mac-roman. http://bugs.python.org/issue6202 fixes this
    for Python 2.7 and up. This is the same corrected code for earlier
    Python versions.

    However, we can't use a version check for this method, as some
    distributions patch Python to fix this. Instead, we use it as a 'fixer'
    for the mac-roman encoding, as it is unlikely that this encoding is
    the one actually expected.
    '''
    try:
        locale.CODESET
    except AttributeError:
        # Fall back to parsing environment variables :-(
        return locale.getdefaultlocale()[1]

    # query the codeset of the user's real locale, restoring the previous
    # LC_CTYPE afterwards so this probe leaves no lasting side effect
    oldloc = locale.setlocale(locale.LC_CTYPE)
    locale.setlocale(locale.LC_CTYPE, "")
    result = locale.nl_langinfo(locale.CODESET)
    locale.setlocale(locale.LC_CTYPE, oldloc)

    return result
63 63
# Map from misreported/legacy codec names to a callable producing the name
# we should actually use: '646' and 'ANSI_X3.4-1968' are historical aliases
# for ASCII; 'mac-roman' is darwin's misreport, corrected by
# _getpreferredencoding() above.
_encodingfixers = {
    '646': lambda: 'ascii',
    'ANSI_X3.4-1968': lambda: 'ascii',
    'mac-roman': _getpreferredencoding
}

try:
    # HGENCODING overrides locale detection entirely
    encoding = os.environ.get("HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding() or 'ascii'
    encoding = _encodingfixers.get(encoding, lambda: encoding)()
except locale.Error:
    encoding = 'ascii'
# how to handle bytes undecodable in `encoding`: 'strict' (error),
# 'replace', or 'ignore' (see fromlocal)
encodingmode = os.environ.get("HGENCODINGMODE", "strict")
# encoding assumed for repository data predating locale support (tolocal)
fallbackencoding = 'ISO-8859-1'
79 79
class localstr(str):
    '''A str subclass that remembers the UTF-8 bytes it was derived from,
    allowing a lossless round-trip back to UTF-8 later (see fromlocal).'''
    def __new__(cls, u, l):
        # the string payload is the local-encoding form; the original
        # UTF-8 bytes ride along in _utf8
        inst = str.__new__(cls, l)
        inst._utf8 = u
        return inst
    def __hash__(self):
        # hash the UTF-8 form so distinct UTF-8 originals that collapse to
        # the same lossy local form don't collide in dicts/sets
        return hash(self._utf8)
89 89
def tolocal(s):
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    >>> u = 'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = 'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> 'foo: ?' in d
    False
    >>> l1 = 'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    try:
        try:
            # make sure string is actually stored in UTF-8
            u = s.decode('UTF-8')
            if encoding == 'UTF-8':
                # fast path
                return s
            r = u.encode(encoding, "replace")
            if u == r.decode(encoding):
                # r is a safe, non-lossy encoding of s
                return r
            # lossy: remember the true UTF-8 bytes alongside the local
            # form so fromlocal() can recover them
            return localstr(s, r)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                u = s.decode(fallbackencoding)
                r = u.encode(encoding, "replace")
                if u == r.decode(encoding):
                    # r is a safe, non-lossy encoding of s
                    return r
                # cache the re-encoded UTF-8 form for lossless round-trip
                return localstr(u.encode('UTF-8'), r)
            except UnicodeDecodeError:
                u = s.decode("utf-8", "replace") # last ditch
                return u.encode(encoding, "replace") # can't round-trip
    except LookupError as k:
        # the configured encoding name is unknown to the codec registry
        raise error.Abort(k, hint="please check your locale settings")
150 150
def fromlocal(s):
    """
    Convert a string from the local character encoding to UTF-8

    Decoding honors the mode set by HGENCODINGMODE (default 'strict',
    which errors out on unknown characters; 'replace' substitutes a
    replacement character and 'ignore' drops the offending bytes).
    """
    # localstr caches the original UTF-8 bytes: lossless fast path
    if isinstance(s, localstr):
        return s._utf8

    try:
        u = s.decode(encoding, encodingmode)
        return u.encode("utf-8")
    except UnicodeDecodeError as inst:
        # show a little context on either side of the offending byte
        sub = s[max(0, inst.start - 10):inst.start + 10]
        raise error.Abort("decoding near '%s': %s!" % (sub, inst))
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
173 173
# How to treat ambiguous-width characters. Set HGENCODINGAMBIGUOUS=wide to
# treat them as wide. The value is the set of east-asian-width codes that
# count as two columns: 'W'ide and 'F'ullwidth always, 'A'mbiguous only
# when requested.
wide = ("WFA" if os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
        else "WF")
177 177
def colwidth(s):
    "Find the column width of a string for display in the local encoding"
    # undecodable bytes are replaced rather than raising; width is
    # computed on the resulting unicode string
    u = s.decode(encoding, 'replace')
    return ucolwidth(u)
181 181
def ucolwidth(d):
    "Find the column width of a Unicode string for display"
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        # no east-asian-width data in this Python: assume one column each
        return len(d)
    return sum([2 if eaw(c) in wide else 1 for c in d])
188 188
def getcols(s, start, c):
    '''Use colwidth to find a c-column substring of s starting at byte
    index start

    Returns None implicitly when no prefix of s[start:] is exactly c
    columns wide (e.g. a double-width character straddles the boundary).'''
    # probe successively longer byte slices until one renders at exactly
    # c columns
    for x in xrange(start + c, len(s)):
        t = s[start:x]
        if colwidth(t) == c:
            return t
196 196
def trim(s, width, ellipsis='', leftside=False):
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    >>> ellipsis = '+++'
    >>> from . import encoding
    >>> encoding.encoding = 'utf-8'
    >>> t= '1234567890'
    >>> print trim(t, 12, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 10, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 8, ellipsis=ellipsis)
    12345+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++67890
    >>> print trim(t, 8)
    12345678
    >>> print trim(t, 8, leftside=True)
    34567890
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(encoding.encoding)
    >>> print trim(t, 12, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 10, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 8, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 5)
    \xe3\x81\x82\xe3\x81\x84
    >>> print trim(t, 5, leftside=True)
    \xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 4, ellipsis=ellipsis)
    +++
    >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
    +++
    >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
    >>> print trim(t, 12, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 10, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 8, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\x66\x77\x88\x99\xaa
    >>> print trim(t, 8)
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> print trim(t, 8, leftside=True)
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    """
    try:
        u = s.decode(encoding)
    except UnicodeDecodeError:
        # not decodable in the local encoding: fall back to plain byte
        # trimming (column widths are unknowable anyway)
        if len(s) <= width: # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0: # not enough room even for ellipsis
            return ellipsis[:width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width: # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0: # not enough room even for ellipsis
        return ellipsis[:width + len(ellipsis)]

    if leftside:
        uslice = lambda i: u[i:]
        concat = lambda s: ellipsis + s
    else:
        uslice = lambda i: u[:-i]
        concat = lambda s: s + ellipsis
    # drop characters one at a time from the trimmed side until the
    # remainder (plus ellipsis) fits within the requested width
    for i in xrange(1, len(u)):
        usub = uslice(i)
        if ucolwidth(usub) <= width:
            return concat(usub.encode(encoding))
    return ellipsis # not enough room for multi-column characters
289 289
290 290 def _asciilower(s):
291 291 '''convert a string to lowercase if ASCII
292 292
293 293 Raises UnicodeDecodeError if non-ASCII characters are found.'''
294 294 s.decode('ascii')
295 295 return s.lower()
296 296
def asciilower(s):
    # delay importing avoids cyclic dependency around "parsers" in
    # pure Python build (util => i18n => encoding => parsers => util)
    from . import parsers
    # prefer the C implementation when parsers provides one, otherwise
    # fall back to the pure-Python _asciilower
    impl = getattr(parsers, 'asciilower', _asciilower)
    # rebind the module-level name so the import/lookup cost is paid only
    # on the first call; later calls go straight to the chosen impl
    global asciilower
    asciilower = impl
    return impl(s)
305 305
306 306 def _asciiupper(s):
307 307 '''convert a string to uppercase if ASCII
308 308
309 309 Raises UnicodeDecodeError if non-ASCII characters are found.'''
310 310 s.decode('ascii')
311 311 return s.upper()
312 312
def asciiupper(s):
    # delay importing avoids cyclic dependency around "parsers" in
    # pure Python build (util => i18n => encoding => parsers => util)
    from . import parsers
    # prefer the C implementation when parsers provides one, otherwise
    # fall back to the pure-Python _asciiupper
    impl = getattr(parsers, 'asciiupper', _asciiupper)
    # rebind the module-level name so the import/lookup cost is paid only
    # on the first call; later calls go straight to the chosen impl
    global asciiupper
    asciiupper = impl
    return impl(s)
321 321
def lower(s):
    "best-effort encoding-aware case-folding of local string s"
    try:
        # fast path: pure-ASCII strings fold without decoding
        return asciilower(s)
    except UnicodeDecodeError:
        pass
    try:
        if isinstance(s, localstr):
            # use the cached UTF-8 form for a faithful decode
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(encoding, encodingmode)

        lu = u.lower()
        if u == lu:
            return s # preserve localstring
        return lu.encode(encoding)
    except UnicodeError:
        return s.lower() # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
342 342
def upper(s):
    "best-effort encoding-aware case-folding of local string s"
    # fast ASCII path first; anything non-ASCII goes through the
    # encoding-aware fallback
    try:
        result = asciiupper(s)
    except UnicodeDecodeError:
        result = upperfallback(s)
    return result
349 349
def upperfallback(s):
    # encoding-aware uppercasing for strings that are not pure ASCII
    try:
        if isinstance(s, localstr):
            # use the cached UTF-8 form for a faithful decode
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(encoding, encodingmode)

        uu = u.upper()
        if u == uu:
            return s # preserve localstring
        return uu.encode(encoding)
    except UnicodeError:
        return s.upper() # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
365 365
class normcasespecs(object):
    '''Constants describing what a platform's normcase does to ASCII.

    Each platform declares one of these, and the declaration should be
    consistent with what its normcase implementation actually does:

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h.'''
    lower = -1
    upper = 1
    other = 0
380 380
# Translation table from each possible byte to its JSON string form,
# built once at import time (previously it was built lazily inside
# jsonescape, costing an emptiness check on every call). Control
# characters U+0000-U+001F and DEL (U+007F) must be \u-escaped; the
# common controls and JSON metacharacters get their short escapes;
# every other byte passes through unchanged.
_jsonmap = {}
for _x in xrange(256):
    _jsonmap[chr(_x)] = "\\u%04x" % _x if _x < 32 else chr(_x)
_jsonmap['\x7f'] = '\\u007f' # DEL is a control character in JSON terms
_jsonmap['\t'] = '\\t'
_jsonmap['\n'] = '\\n'
_jsonmap['\"'] = '\\"'
_jsonmap['\\'] = '\\\\'
_jsonmap['\b'] = '\\b'
_jsonmap['\f'] = '\\f'
_jsonmap['\r'] = '\\r'
del _x

def jsonescape(s):
    '''returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape('this is a test')
    'this is a test'
    >>> jsonescape('escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape('escape characters: \\t \\n \\r \\" \\\\')
    'escape characters: \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
    >>> jsonescape('a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape('utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape('')
    ''
    '''
    # convert through UTF-8b first so arbitrary/non-UTF-8 bytes survive
    # the escaping round-trip (see toutf8b)
    return ''.join(_jsonmap[c] for c in toutf8b(s))
423 426
# Number of bytes in a UTF-8 sequence, indexed by the high nibble of the
# lead byte; 0 marks a plain single-byte (ASCII) character. Continuation
# bytes (nibbles 8-11) map to 1 so the validating decode below rejects them.
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]

def getutf8char(s, pos):
    '''get the next full utf-8 character in the given string, starting at pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    '''
    # the lead byte's high nibble determines the sequence length
    nbytes = _utf8len[ord(s[pos]) >> 4]
    if not nbytes:
        # single-byte ASCII character
        return s[pos]

    c = s[pos:pos + nbytes]
    # attempted decode validates the sequence (raises on malformed input)
    c.decode("utf-8")
    return c
442 445
def toutf8b(s):
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    # fast path: \xed is the lead byte of any UTF-8-encoded surrogate, so
    # a string without it can't already contain escaped characters that
    # would need re-escaping
    if "\xed" not in s:
        if isinstance(s, localstr):
            return s._utf8
        try:
            s.decode('utf-8')
            return s
        except UnicodeDecodeError:
            pass

    # slow path: walk character by character, escaping invalid bytes and
    # pre-existing U+DCxx characters into the surrogate range
    r = ""
    pos = 0
    l = len(s)
    while pos < l:
        try:
            c = getutf8char(s, pos)
            if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
                # have to re-escape existing U+DCxx characters
                c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
                pos += 1
            else:
                pos += len(c)
        except UnicodeDecodeError:
            # not valid UTF-8 here: map this raw byte into U+DC00-U+DCFF
            c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
            pos += 1
        r += c
    return r
499 502
def fromutf8b(s):
    '''Given a UTF-8b string, return a local, possibly-binary string.

    Bytes that toutf8b escaped into the U+DCxx surrogate range are
    restored, so this returns the original binary string. This
    is a round-trip process for strings like filenames, but metadata
    that was passed through tolocal will remain in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = "\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip("\\xc2\\xc2\\x80")
    True
    >>> roundtrip("\\xef\\xbf\\xbd")
    True
    >>> roundtrip("\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip("\\xf1\\x80\\x80\\x80\\x80")
    True
    '''

    # fast path - look for uDxxx prefixes in s
    if "\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    r = ""
    pos = 0
    l = len(s)
    while pos < l:
        c = getutf8char(s, pos)
        pos += len(c)
        # unescape U+DCxx characters
        if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
            # the low byte of the surrogate code point is the original byte
            c = chr(ord(c.decode("utf-8")) & 0xff)
        r += c
    return r
General Comments 0
You need to be logged in to leave comments. Login now