encoding: extract stub for fast JSON escape...
Yuya Nishihara
r33925:b9101467 default
@@ -1,591 +1,571 b''
1 1 # encoding.py - character transcoding support for Mercurial
2 2 #
3 3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 from __future__ import absolute_import
9 9
10 import array
11 10 import io
12 11 import locale
13 12 import os
14 13 import unicodedata
15 14
16 15 from . import (
17 16 error,
18 17 policy,
19 18 pycompat,
20 19 )
21 20
21 from .pure import (
22 charencode as charencodepure,
23 )
24
22 25 charencode = policy.importmod(r'charencode')
23 26
24 27 asciilower = charencode.asciilower
25 28 asciiupper = charencode.asciiupper
29 _jsonescapeu8fast = charencodepure.jsonescapeu8fast # TODO: no "pure"
26 30
27 31 _sysstr = pycompat.sysstr
28 32
29 33 if pycompat.ispy3:
30 34 unichr = chr
31 35
32 36 # These unicode characters are ignored by HFS+ (Apple Technote 1150,
33 37 # "Unicode Subtleties"), so we need to ignore them in some places for
34 38 # sanity.
35 39 _ignore = [unichr(int(x, 16)).encode("utf-8") for x in
36 40 "200c 200d 200e 200f 202a 202b 202c 202d 202e "
37 41 "206a 206b 206c 206d 206e 206f feff".split()]
38 42 # verify the next function will work
39 43 assert all(i.startswith(("\xe2", "\xef")) for i in _ignore)
40 44
41 45 def hfsignoreclean(s):
42 46 """Remove codepoints ignored by HFS+ from s.
43 47
44 48 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
45 49 '.hg'
46 50 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
47 51 '.hg'
48 52 """
49 53 if "\xe2" in s or "\xef" in s:
50 54 for c in _ignore:
51 55 s = s.replace(c, '')
52 56 return s
53 57
54 58 # encoding.environ is provided read-only and may not be used to modify
55 59 # the process environment
56 60 _nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ)
57 61 if not pycompat.ispy3:
58 62 environ = os.environ # re-exports
59 63 elif _nativeenviron:
60 64 environ = os.environb # re-exports
61 65 else:
62 66 # preferred encoding isn't known yet; use utf-8 to avoid unicode error
63 67 # and recreate it once encoding is settled
64 68 environ = dict((k.encode(u'utf-8'), v.encode(u'utf-8'))
65 69 for k, v in os.environ.items()) # re-exports
66 70
67 71 _encodingfixers = {
68 72 '646': lambda: 'ascii',
69 73 'ANSI_X3.4-1968': lambda: 'ascii',
70 74 }
71 75
72 76 try:
73 77 encoding = environ.get("HGENCODING")
74 78 if not encoding:
75 79 encoding = locale.getpreferredencoding().encode('ascii') or 'ascii'
76 80 encoding = _encodingfixers.get(encoding, lambda: encoding)()
77 81 except locale.Error:
78 82 encoding = 'ascii'
79 83 encodingmode = environ.get("HGENCODINGMODE", "strict")
80 84 fallbackencoding = 'ISO-8859-1'
81 85
82 86 class localstr(bytes):
83 87 '''This class allows strings that are unmodified to be
84 88 round-tripped to the local encoding and back'''
85 89 def __new__(cls, u, l):
86 90 s = bytes.__new__(cls, l)
87 91 s._utf8 = u
88 92 return s
89 93 def __hash__(self):
90 94 return hash(self._utf8) # avoid collisions in local string space
91 95
92 96 def tolocal(s):
93 97 """
94 98 Convert a string from internal UTF-8 to local encoding
95 99
96 100 All internal strings should be UTF-8 but some repos before the
97 101 implementation of locale support may contain latin1 or possibly
98 102 other character sets. We attempt to decode everything strictly
99 103 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
100 104 replace unknown characters.
101 105
102 106 The localstr class is used to cache the known UTF-8 encoding of
103 107 strings next to their local representation to allow lossless
104 108 round-trip conversion back to UTF-8.
105 109
106 110 >>> u = 'foo: \\xc3\\xa4' # utf-8
107 111 >>> l = tolocal(u)
108 112 >>> l
109 113 'foo: ?'
110 114 >>> fromlocal(l)
111 115 'foo: \\xc3\\xa4'
112 116 >>> u2 = 'foo: \\xc3\\xa1'
113 117 >>> d = { l: 1, tolocal(u2): 2 }
114 118 >>> len(d) # no collision
115 119 2
116 120 >>> 'foo: ?' in d
117 121 False
118 122 >>> l1 = 'foo: \\xe4' # historical latin1 fallback
119 123 >>> l = tolocal(l1)
120 124 >>> l
121 125 'foo: ?'
122 126 >>> fromlocal(l) # magically in utf-8
123 127 'foo: \\xc3\\xa4'
124 128 """
125 129
126 130 try:
127 131 try:
128 132 # make sure string is actually stored in UTF-8
129 133 u = s.decode('UTF-8')
130 134 if encoding == 'UTF-8':
131 135 # fast path
132 136 return s
133 137 r = u.encode(_sysstr(encoding), u"replace")
134 138 if u == r.decode(_sysstr(encoding)):
135 139 # r is a safe, non-lossy encoding of s
136 140 return r
137 141 return localstr(s, r)
138 142 except UnicodeDecodeError:
139 143 # we should only get here if we're looking at an ancient changeset
140 144 try:
141 145 u = s.decode(_sysstr(fallbackencoding))
142 146 r = u.encode(_sysstr(encoding), u"replace")
143 147 if u == r.decode(_sysstr(encoding)):
144 148 # r is a safe, non-lossy encoding of s
145 149 return r
146 150 return localstr(u.encode('UTF-8'), r)
147 151 except UnicodeDecodeError:
148 152 u = s.decode("utf-8", "replace") # last ditch
149 153 # can't round-trip
150 154 return u.encode(_sysstr(encoding), u"replace")
151 155 except LookupError as k:
152 156 raise error.Abort(k, hint="please check your locale settings")
153 157
154 158 def fromlocal(s):
155 159 """
156 160 Convert a string from the local character encoding to UTF-8
157 161
158 162 We attempt to decode strings using the encoding mode set by
159 163 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
160 164 characters will cause an error message. Other modes include
161 165 'replace', which replaces unknown characters with a special
162 166 Unicode character, and 'ignore', which drops the character.
163 167 """
164 168
165 169 # can we do a lossless round-trip?
166 170 if isinstance(s, localstr):
167 171 return s._utf8
168 172
169 173 try:
170 174 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
171 175 return u.encode("utf-8")
172 176 except UnicodeDecodeError as inst:
173 177 sub = s[max(0, inst.start - 10):inst.start + 10]
174 178 raise error.Abort("decoding near '%s': %s!" % (sub, inst))
175 179 except LookupError as k:
176 180 raise error.Abort(k, hint="please check your locale settings")
177 181
178 182 def unitolocal(u):
179 183 """Convert a unicode string to a byte string of local encoding"""
180 184 return tolocal(u.encode('utf-8'))
181 185
182 186 def unifromlocal(s):
183 187 """Convert a byte string of local encoding to a unicode string"""
184 188 return fromlocal(s).decode('utf-8')
185 189
186 190 def unimethod(bytesfunc):
187 191 """Create a proxy method that forwards __unicode__() and __str__() of
188 192 Python 3 to __bytes__()"""
189 193 def unifunc(obj):
190 194 return unifromlocal(bytesfunc(obj))
191 195 return unifunc
192 196
193 197 # converter functions between native str and byte string. use these if the
194 198 # character encoding is not known (e.g. exception messages) or is known to
195 199 # be locale dependent (e.g. date formatting).
196 200 if pycompat.ispy3:
197 201 strtolocal = unitolocal
198 202 strfromlocal = unifromlocal
199 203 strmethod = unimethod
200 204 else:
201 205 strtolocal = pycompat.identity
202 206 strfromlocal = pycompat.identity
203 207 strmethod = pycompat.identity
204 208
205 209 if not _nativeenviron:
206 210 # now encoding and helper functions are available, recreate the environ
207 211 # dict to be exported to other modules
208 212 environ = dict((tolocal(k.encode(u'utf-8')), tolocal(v.encode(u'utf-8')))
209 213 for k, v in os.environ.items()) # re-exports
210 214
211 215 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
212 216 _wide = _sysstr(environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
213 217 and "WFA" or "WF")
214 218
215 219 def colwidth(s):
216 220 "Find the column width of a string for display in the local encoding"
217 221 return ucolwidth(s.decode(_sysstr(encoding), u'replace'))
218 222
219 223 def ucolwidth(d):
220 224 "Find the column width of a Unicode string for display"
221 225 eaw = getattr(unicodedata, 'east_asian_width', None)
222 226 if eaw is not None:
223 227 return sum([eaw(c) in _wide and 2 or 1 for c in d])
224 228 return len(d)
225 229
226 230 def getcols(s, start, c):
227 231 '''Use colwidth to find a c-column substring of s starting at byte
228 232 index start'''
229 233 for x in xrange(start + c, len(s)):
230 234 t = s[start:x]
231 235 if colwidth(t) == c:
232 236 return t
233 237
234 238 def trim(s, width, ellipsis='', leftside=False):
235 239 """Trim string 's' to at most 'width' columns (including 'ellipsis').
236 240
237 241 If 'leftside' is True, left side of string 's' is trimmed.
238 242 'ellipsis' is always placed at the trimmed side.
239 243
240 244 >>> ellipsis = '+++'
241 245 >>> from . import encoding
242 246 >>> encoding.encoding = 'utf-8'
243 247 >>> t = '1234567890'
244 248 >>> print trim(t, 12, ellipsis=ellipsis)
245 249 1234567890
246 250 >>> print trim(t, 10, ellipsis=ellipsis)
247 251 1234567890
248 252 >>> print trim(t, 8, ellipsis=ellipsis)
249 253 12345+++
250 254 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
251 255 +++67890
252 256 >>> print trim(t, 8)
253 257 12345678
254 258 >>> print trim(t, 8, leftside=True)
255 259 34567890
256 260 >>> print trim(t, 3, ellipsis=ellipsis)
257 261 +++
258 262 >>> print trim(t, 1, ellipsis=ellipsis)
259 263 +
260 264 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
261 265 >>> t = u.encode(encoding.encoding)
262 266 >>> print trim(t, 12, ellipsis=ellipsis)
263 267 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
264 268 >>> print trim(t, 10, ellipsis=ellipsis)
265 269 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
266 270 >>> print trim(t, 8, ellipsis=ellipsis)
267 271 \xe3\x81\x82\xe3\x81\x84+++
268 272 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
269 273 +++\xe3\x81\x88\xe3\x81\x8a
270 274 >>> print trim(t, 5)
271 275 \xe3\x81\x82\xe3\x81\x84
272 276 >>> print trim(t, 5, leftside=True)
273 277 \xe3\x81\x88\xe3\x81\x8a
274 278 >>> print trim(t, 4, ellipsis=ellipsis)
275 279 +++
276 280 >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
277 281 +++
278 282 >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
279 283 >>> print trim(t, 12, ellipsis=ellipsis)
280 284 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
281 285 >>> print trim(t, 10, ellipsis=ellipsis)
282 286 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
283 287 >>> print trim(t, 8, ellipsis=ellipsis)
284 288 \x11\x22\x33\x44\x55+++
285 289 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
286 290 +++\x66\x77\x88\x99\xaa
287 291 >>> print trim(t, 8)
288 292 \x11\x22\x33\x44\x55\x66\x77\x88
289 293 >>> print trim(t, 8, leftside=True)
290 294 \x33\x44\x55\x66\x77\x88\x99\xaa
291 295 >>> print trim(t, 3, ellipsis=ellipsis)
292 296 +++
293 297 >>> print trim(t, 1, ellipsis=ellipsis)
294 298 +
295 299 """
296 300 try:
297 301 u = s.decode(_sysstr(encoding))
298 302 except UnicodeDecodeError:
299 303 if len(s) <= width: # trimming is not needed
300 304 return s
301 305 width -= len(ellipsis)
302 306 if width <= 0: # not enough room even for ellipsis
303 307 return ellipsis[:width + len(ellipsis)]
304 308 if leftside:
305 309 return ellipsis + s[-width:]
306 310 return s[:width] + ellipsis
307 311
308 312 if ucolwidth(u) <= width: # trimming is not needed
309 313 return s
310 314
311 315 width -= len(ellipsis)
312 316 if width <= 0: # not enough room even for ellipsis
313 317 return ellipsis[:width + len(ellipsis)]
314 318
315 319 if leftside:
316 320 uslice = lambda i: u[i:]
317 321 concat = lambda s: ellipsis + s
318 322 else:
319 323 uslice = lambda i: u[:-i]
320 324 concat = lambda s: s + ellipsis
321 325 for i in xrange(1, len(u)):
322 326 usub = uslice(i)
323 327 if ucolwidth(usub) <= width:
324 328 return concat(usub.encode(_sysstr(encoding)))
325 329 return ellipsis # not enough room for multi-column characters
326 330
327 331 def lower(s):
328 332 "best-effort encoding-aware case-folding of local string s"
329 333 try:
330 334 return asciilower(s)
331 335 except UnicodeDecodeError:
332 336 pass
333 337 try:
334 338 if isinstance(s, localstr):
335 339 u = s._utf8.decode("utf-8")
336 340 else:
337 341 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
338 342
339 343 lu = u.lower()
340 344 if u == lu:
341 345 return s # preserve localstring
342 346 return lu.encode(_sysstr(encoding))
343 347 except UnicodeError:
344 348 return s.lower() # we don't know how to fold this except in ASCII
345 349 except LookupError as k:
346 350 raise error.Abort(k, hint="please check your locale settings")
347 351
348 352 def upper(s):
349 353 "best-effort encoding-aware case-folding of local string s"
350 354 try:
351 355 return asciiupper(s)
352 356 except UnicodeDecodeError:
353 357 return upperfallback(s)
354 358
355 359 def upperfallback(s):
356 360 try:
357 361 if isinstance(s, localstr):
358 362 u = s._utf8.decode("utf-8")
359 363 else:
360 364 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
361 365
362 366 uu = u.upper()
363 367 if u == uu:
364 368 return s # preserve localstring
365 369 return uu.encode(_sysstr(encoding))
366 370 except UnicodeError:
367 371 return s.upper() # we don't know how to fold this except in ASCII
368 372 except LookupError as k:
369 373 raise error.Abort(k, hint="please check your locale settings")
370 374
371 375 class normcasespecs(object):
372 376 '''what a platform's normcase does to ASCII strings
373 377
374 378 This is specified per platform, and should be consistent with what normcase
375 379 on that platform actually does.
376 380
377 381 lower: normcase lowercases ASCII strings
378 382 upper: normcase uppercases ASCII strings
379 383 other: the fallback function should always be called
380 384
381 385 This should be kept in sync with normcase_spec in util.h.'''
382 386 lower = -1
383 387 upper = 1
384 388 other = 0
385 389
386 _jsonmap = []
387 _jsonmap.extend("\\u%04x" % x for x in range(32))
388 _jsonmap.extend(pycompat.bytechr(x) for x in range(32, 127))
389 _jsonmap.append('\\u007f')
390 _jsonmap[0x09] = '\\t'
391 _jsonmap[0x0a] = '\\n'
392 _jsonmap[0x22] = '\\"'
393 _jsonmap[0x5c] = '\\\\'
394 _jsonmap[0x08] = '\\b'
395 _jsonmap[0x0c] = '\\f'
396 _jsonmap[0x0d] = '\\r'
397 _paranoidjsonmap = _jsonmap[:]
398 _paranoidjsonmap[0x3c] = '\\u003c' # '<' (e.g. escape "</script>")
399 _paranoidjsonmap[0x3e] = '\\u003e' # '>'
400 _jsonmap.extend(pycompat.bytechr(x) for x in range(128, 256))
401
402 390 def jsonescape(s, paranoid=False):
403 391 '''returns a string suitable for JSON
404 392
405 393 JSON is problematic for us because it doesn't support non-Unicode
406 394 bytes. To deal with this, we take the following approach:
407 395
408 396 - localstr objects are converted back to UTF-8
409 397 - valid UTF-8/ASCII strings are passed as-is
410 398 - other strings are converted to UTF-8b surrogate encoding
411 399 - apply JSON-specified string escaping
412 400
413 401 (escapes are doubled in these tests)
414 402
415 403 >>> jsonescape('this is a test')
416 404 'this is a test'
417 405 >>> jsonescape('escape characters: \\0 \\x0b \\x7f')
418 406 'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
419 407 >>> jsonescape('escape characters: \\t \\n \\r \\" \\\\')
420 408 'escape characters: \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
421 409 >>> jsonescape('a weird byte: \\xdd')
422 410 'a weird byte: \\xed\\xb3\\x9d'
423 411 >>> jsonescape('utf-8: caf\\xc3\\xa9')
424 412 'utf-8: caf\\xc3\\xa9'
425 413 >>> jsonescape('')
426 414 ''
427 415
428 416 If paranoid, non-ascii and common troublesome characters are also escaped.
429 417 This is suitable for web output.
430 418
431 419 >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
432 420 'escape boundary: ~ \\\\u007f \\\\u0080'
433 421 >>> jsonescape('a weird byte: \\xdd', paranoid=True)
434 422 'a weird byte: \\\\udcdd'
435 423 >>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True)
436 424 'utf-8: caf\\\\u00e9'
437 425 >>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
438 426 'non-BMP: \\\\ud834\\\\udd1e'
439 427 >>> jsonescape('<foo@example.org>', paranoid=True)
440 428 '\\\\u003cfoo@example.org\\\\u003e'
441 429 '''
442 430
443 if paranoid:
444 jm = _paranoidjsonmap
445 else:
446 jm = _jsonmap
447
448 431 u8chars = toutf8b(s)
449 432 try:
450 return ''.join(jm[x] for x in bytearray(u8chars)) # fast path
451 except IndexError:
433 return _jsonescapeu8fast(u8chars, paranoid)
434 except ValueError:
452 435 pass
453 # non-BMP char is represented as UTF-16 surrogate pair
454 u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
455 u16codes.pop(0) # drop BOM
456 return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)
436 return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
457 437
458 438 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
459 439
460 440 def getutf8char(s, pos):
461 441 '''get the next full utf-8 character in the given string, starting at pos
462 442
463 443 Raises a UnicodeError if the given location does not start a valid
464 444 utf-8 character.
465 445 '''
466 446
467 447 # find how many bytes to attempt decoding from first nibble
468 448 l = _utf8len[ord(s[pos]) >> 4]
469 449 if not l: # ascii
470 450 return s[pos]
471 451
472 452 c = s[pos:pos + l]
473 453 # validate with attempted decode
474 454 c.decode("utf-8")
475 455 return c
476 456
477 457 def toutf8b(s):
478 458 '''convert a local, possibly-binary string into UTF-8b
479 459
480 460 This is intended as a generic method to preserve data when working
481 461 with schemes like JSON and XML that have no provision for
482 462 arbitrary byte strings. As Mercurial often doesn't know
483 463 what encoding data is in, we use so-called UTF-8b.
484 464
485 465 If a string is already valid UTF-8 (or ASCII), it passes unmodified.
486 466 Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
487 467 uDC00-uDCFF.
488 468
489 469 Principles of operation:
490 470
491 471 - ASCII and UTF-8 data successfully round-trips and is understood
492 472 by Unicode-oriented clients
493 473 - filenames and file contents in arbitrary other encodings can
494 474 be round-tripped or recovered by clueful clients
495 475 - local strings that have a cached known UTF-8 encoding (aka
496 476 localstr) get sent as UTF-8 so Unicode-oriented clients get the
497 477 Unicode data they want
498 478 - because we must preserve UTF-8 bytestring in places such as
499 479 filenames, metadata can't be roundtripped without help
500 480
501 481 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
502 482 arbitrary bytes into an internal Unicode format that can be
503 483 re-encoded back into the original. Here we are exposing the
504 484 internal surrogate encoding as a UTF-8 string.)
505 485 '''
506 486
507 487 if "\xed" not in s:
508 488 if isinstance(s, localstr):
509 489 return s._utf8
510 490 try:
511 491 s.decode('utf-8')
512 492 return s
513 493 except UnicodeDecodeError:
514 494 pass
515 495
516 496 r = ""
517 497 pos = 0
518 498 l = len(s)
519 499 while pos < l:
520 500 try:
521 501 c = getutf8char(s, pos)
522 502 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
523 503 # have to re-escape existing U+DCxx characters
524 504 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
525 505 pos += 1
526 506 else:
527 507 pos += len(c)
528 508 except UnicodeDecodeError:
529 509 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
530 510 pos += 1
531 511 r += c
532 512 return r
533 513
534 514 def fromutf8b(s):
535 515 '''Given a UTF-8b string, return a local, possibly-binary string.
536 516
537 517 Return the original binary string. This
538 518 is a round-trip process for strings like filenames, but metadata
539 519 that was passed through tolocal will remain in UTF-8.
540 520
541 521 >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
542 522 >>> m = "\\xc3\\xa9\\x99abcd"
543 523 >>> toutf8b(m)
544 524 '\\xc3\\xa9\\xed\\xb2\\x99abcd'
545 525 >>> roundtrip(m)
546 526 True
547 527 >>> roundtrip("\\xc2\\xc2\\x80")
548 528 True
549 529 >>> roundtrip("\\xef\\xbf\\xbd")
550 530 True
551 531 >>> roundtrip("\\xef\\xef\\xbf\\xbd")
552 532 True
553 533 >>> roundtrip("\\xf1\\x80\\x80\\x80\\x80")
554 534 True
555 535 '''
556 536
557 537 # fast path - look for uDxxx prefixes in s
558 538 if "\xed" not in s:
559 539 return s
560 540
561 541 # We could do this with the unicode type but some Python builds
562 542 # use UTF-16 internally (issue5031) which causes non-BMP code
563 543 # points to be escaped. Instead, we use our handy getutf8char
564 544 # helper again to walk the string without "decoding" it.
565 545
566 546 r = ""
567 547 pos = 0
568 548 l = len(s)
569 549 while pos < l:
570 550 c = getutf8char(s, pos)
571 551 pos += len(c)
572 552 # unescape U+DCxx characters
573 553 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
574 554 c = chr(ord(c.decode("utf-8")) & 0xff)
575 555 r += c
576 556 return r
577 557
578 558 if pycompat.ispy3:
579 559 class strio(io.TextIOWrapper):
580 560 """Wrapper around TextIOWrapper that respects hg's encoding assumptions.
581 561
582 562 Also works around Python closing streams.
583 563 """
584 564
585 565 def __init__(self, buffer):
586 566 super(strio, self).__init__(buffer, encoding=_sysstr(encoding))
587 567
588 568 def __del__(self):
589 569 """Override __del__ so it doesn't close the underlying stream."""
590 570 else:
591 571 strio = pycompat.identity
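
The hunk above splits jsonescape() into a fast path (_jsonescapeu8fast, a stub that per the TODO should eventually come from the policy-selected charencode module) and a pure-Python fallback (jsonescapeu8fallback, added in the next hunk). Below is a minimal standalone sketch of the same try-fast/fall-back dispatch; the names (escape, _escape_fast, _escape_fallback) are hypothetical, not Mercurial's API, and the sketch returns str and ignores the paranoid flag and UTF-8b handling:

    import array

    def _escape_fast(u8chars):
        # ASCII-only path; raises ValueError on the first non-ASCII byte,
        # mirroring the IndexError -> ValueError translation in the stub.
        out = []
        for x in bytearray(u8chars):
            if x >= 0x80:
                raise ValueError
            if x in (0x22, 0x5c):          # '"' and '\'
                out.append('\\' + chr(x))
            elif x < 0x20 or x == 0x7f:    # control characters and DEL
                out.append('\\u%04x' % x)
            else:
                out.append(chr(x))
        return ''.join(out)

    def _escape_fallback(u8chars):
        # Slow path: go through UTF-16 code units so non-BMP characters
        # become surrogate pairs, then escape each non-printable unit.
        u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
        u16codes.pop(0)                    # drop the BOM encode() prepends
        out = []
        for x in u16codes:
            if x in (0x22, 0x5c):
                out.append('\\' + chr(x))
            elif 0x20 <= x < 0x7f:
                out.append(chr(x))
            else:
                out.append('\\u%04x' % x)
        return ''.join(out)

    def escape(u8chars):
        # Same shape as the new jsonescape(): cheap path first, then fall back.
        try:
            return _escape_fast(u8chars)
        except ValueError:
            return _escape_fallback(u8chars)

The point of the split is that the common all-ASCII case stays cheap (and can later be reimplemented in C behind policy.importmod), while the rarely taken UTF-16 walk stays in pure Python.
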
@@ -1,22 +1,72 b''
1 1 # charencode.py - miscellaneous character encoding
2 2 #
3 3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 from __future__ import absolute_import
9 9
10 import array
11
12 from .. import (
13 pycompat,
14 )
15
10 16 def asciilower(s):
11 17 '''convert a string to lowercase if ASCII
12 18
13 19 Raises UnicodeDecodeError if non-ASCII characters are found.'''
14 20 s.decode('ascii')
15 21 return s.lower()
16 22
17 23 def asciiupper(s):
18 24 '''convert a string to uppercase if ASCII
19 25
20 26 Raises UnicodeDecodeError if non-ASCII characters are found.'''
21 27 s.decode('ascii')
22 28 return s.upper()
29
30 _jsonmap = []
31 _jsonmap.extend("\\u%04x" % x for x in range(32))
32 _jsonmap.extend(pycompat.bytechr(x) for x in range(32, 127))
33 _jsonmap.append('\\u007f')
34 _jsonmap[0x09] = '\\t'
35 _jsonmap[0x0a] = '\\n'
36 _jsonmap[0x22] = '\\"'
37 _jsonmap[0x5c] = '\\\\'
38 _jsonmap[0x08] = '\\b'
39 _jsonmap[0x0c] = '\\f'
40 _jsonmap[0x0d] = '\\r'
41 _paranoidjsonmap = _jsonmap[:]
42 _paranoidjsonmap[0x3c] = '\\u003c' # '<' (e.g. escape "</script>")
43 _paranoidjsonmap[0x3e] = '\\u003e' # '>'
44 _jsonmap.extend(pycompat.bytechr(x) for x in range(128, 256))
45
46 def jsonescapeu8fast(u8chars, paranoid):
47 """Convert a UTF-8 byte string to JSON-escaped form (fast path)
48
49 Raises ValueError if non-ASCII characters have to be escaped.
50 """
51 if paranoid:
52 jm = _paranoidjsonmap
53 else:
54 jm = _jsonmap
55 try:
56 return ''.join(jm[x] for x in bytearray(u8chars))
57 except IndexError:
58 raise ValueError
59
60 def jsonescapeu8fallback(u8chars, paranoid):
61 """Convert a UTF-8 byte string to JSON-escaped form (slow path)
62
63 Escapes all non-ASCII characters, even if paranoid is False.
64 """
65 if paranoid:
66 jm = _paranoidjsonmap
67 else:
68 jm = _jsonmap
69 # non-BMP char is represented as UTF-16 surrogate pair
70 u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
71 u16codes.pop(0) # drop BOM
72 return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)
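
The "non-BMP: \ud834\udd1e" doctest in encoding.py works because jsonescapeu8fallback gets surrogate pairs for free from encode('utf-16'): the code units come out in native byte order (matching array's 'H' typecode) with a BOM in front, which is why the BOM is popped. A quick standalone check, standard library only, with illustrative variable names:

    import array

    u8chars = u'non-BMP: \U0001d11e'.encode('utf-8')  # U+1D11E MUSICAL SYMBOL G CLEF
    u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
    u16codes.pop(0)                                    # drop the BOM
    print(['\\u%04x' % x for x in u16codes if x > 0x7f])
    # prints ['\ud834', '\udd1e'] -- the surrogate pair the fallback escapes
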