py3: use 'surrogatepass' error handler to process U+DCxx transparently...
Yuya Nishihara
r34215:aa877860 default
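For context, a minimal Python 3 sketch (not part of the patch) of the behavior this change relies on: the strict UTF-8 codec rejects the lone surrogates U+DC00-U+DCFF that Mercurial's UTF-8b scheme uses to smuggle invalid bytes, while the 'surrogatepass' error handler lets them round-trip:

    >>> b'\xed\xb2\x99'.decode('utf-8')  # UTF-8b form of U+DC99, strict mode
    Traceback (most recent call last):
      ...
    UnicodeDecodeError: 'utf-8' codec can't decode byte 0xed in position 0: invalid continuation byte
    >>> b'\xed\xb2\x99'.decode('utf-8', 'surrogatepass')
    '\udc99'
    >>> '\udc99'.encode('utf-8', 'surrogatepass')
    b'\xed\xb2\x99'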
@@ -1,590 +1,597 @@
1 1 # encoding.py - character transcoding support for Mercurial
2 2 #
3 3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 from __future__ import absolute_import, print_function
9 9
10 10 import io
11 11 import locale
12 12 import os
13 13 import unicodedata
14 14
15 15 from . import (
16 16 error,
17 17 policy,
18 18 pycompat,
19 19 )
20 20
21 21 from .pure import (
22 22 charencode as charencodepure,
23 23 )
24 24
25 25 charencode = policy.importmod(r'charencode')
26 26
27 27 isasciistr = charencode.isasciistr
28 28 asciilower = charencode.asciilower
29 29 asciiupper = charencode.asciiupper
30 30 _jsonescapeu8fast = charencode.jsonescapeu8fast
31 31
32 32 _sysstr = pycompat.sysstr
33 33
34 34 if pycompat.ispy3:
35 35 unichr = chr
36 36
37 37 # These unicode characters are ignored by HFS+ (Apple Technote 1150,
38 38 # "Unicode Subtleties"), so we need to ignore them in some places for
39 39 # sanity.
40 40 _ignore = [unichr(int(x, 16)).encode("utf-8") for x in
41 41 "200c 200d 200e 200f 202a 202b 202c 202d 202e "
42 42 "206a 206b 206c 206d 206e 206f feff".split()]
43 43 # verify the next function will work
44 44 assert all(i.startswith(("\xe2", "\xef")) for i in _ignore)
45 45
46 46 def hfsignoreclean(s):
47 47 """Remove codepoints ignored by HFS+ from s.
48 48
49 49 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
50 50 '.hg'
51 51 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
52 52 '.hg'
53 53 """
54 54 if "\xe2" in s or "\xef" in s:
55 55 for c in _ignore:
56 56 s = s.replace(c, '')
57 57 return s
58 58
59 59 # encoding.environ is provided read-only; it must not be used to modify
60 60 # the process environment
61 61 _nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ)
62 62 if not pycompat.ispy3:
63 63 environ = os.environ # re-exports
64 64 elif _nativeenviron:
65 65 environ = os.environb # re-exports
66 66 else:
67 67 # preferred encoding isn't known yet; use utf-8 to avoid unicode error
68 68 # and recreate it once encoding is settled
69 69 environ = dict((k.encode(u'utf-8'), v.encode(u'utf-8'))
70 70 for k, v in os.environ.items()) # re-exports
71 71
72 72 _encodingfixers = {
73 73 '646': lambda: 'ascii',
74 74 'ANSI_X3.4-1968': lambda: 'ascii',
75 75 }
76 76
77 77 try:
78 78 encoding = environ.get("HGENCODING")
79 79 if not encoding:
80 80 encoding = locale.getpreferredencoding().encode('ascii') or 'ascii'
81 81 encoding = _encodingfixers.get(encoding, lambda: encoding)()
82 82 except locale.Error:
83 83 encoding = 'ascii'
84 84 encodingmode = environ.get("HGENCODINGMODE", "strict")
85 85 fallbackencoding = 'ISO-8859-1'
86 86
87 87 class localstr(bytes):
88 88 '''This class allows strings to be round-tripped losslessly to
89 89 the local encoding and back'''
90 90 def __new__(cls, u, l):
91 91 s = bytes.__new__(cls, l)
92 92 s._utf8 = u
93 93 return s
94 94 def __hash__(self):
95 95 return hash(self._utf8) # avoid collisions in local string space
96 96
97 97 def tolocal(s):
98 98 """
99 99 Convert a string from internal UTF-8 to local encoding
100 100
101 101 All internal strings should be UTF-8 but some repos before the
102 102 implementation of locale support may contain latin1 or possibly
103 103 other character sets. We attempt to decode everything strictly
104 104 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
105 105 replace unknown characters.
106 106
107 107 The localstr class is used to cache the known UTF-8 encoding of
108 108 strings next to their local representation to allow lossless
109 109 round-trip conversion back to UTF-8.
110 110
111 111 >>> u = b'foo: \\xc3\\xa4' # utf-8
112 112 >>> l = tolocal(u)
113 113 >>> l
114 114 'foo: ?'
115 115 >>> fromlocal(l)
116 116 'foo: \\xc3\\xa4'
117 117 >>> u2 = b'foo: \\xc3\\xa1'
118 118 >>> d = { l: 1, tolocal(u2): 2 }
119 119 >>> len(d) # no collision
120 120 2
121 121 >>> b'foo: ?' in d
122 122 False
123 123 >>> l1 = b'foo: \\xe4' # historical latin1 fallback
124 124 >>> l = tolocal(l1)
125 125 >>> l
126 126 'foo: ?'
127 127 >>> fromlocal(l) # magically in utf-8
128 128 'foo: \\xc3\\xa4'
129 129 """
130 130
131 131 if isasciistr(s):
132 132 return s
133 133
134 134 try:
135 135 try:
136 136 # make sure string is actually stored in UTF-8
137 137 u = s.decode('UTF-8')
138 138 if encoding == 'UTF-8':
139 139 # fast path
140 140 return s
141 141 r = u.encode(_sysstr(encoding), u"replace")
142 142 if u == r.decode(_sysstr(encoding)):
143 143 # r is a safe, non-lossy encoding of s
144 144 return r
145 145 return localstr(s, r)
146 146 except UnicodeDecodeError:
147 147 # we should only get here if we're looking at an ancient changeset
148 148 try:
149 149 u = s.decode(_sysstr(fallbackencoding))
150 150 r = u.encode(_sysstr(encoding), u"replace")
151 151 if u == r.decode(_sysstr(encoding)):
152 152 # r is a safe, non-lossy encoding of s
153 153 return r
154 154 return localstr(u.encode('UTF-8'), r)
155 155 except UnicodeDecodeError:
156 156 u = s.decode("utf-8", "replace") # last ditch
157 157 # can't round-trip
158 158 return u.encode(_sysstr(encoding), u"replace")
159 159 except LookupError as k:
160 160 raise error.Abort(k, hint="please check your locale settings")
161 161
162 162 def fromlocal(s):
163 163 """
164 164 Convert a string from the local character encoding to UTF-8
165 165
166 166 We attempt to decode strings using the encoding mode set by
167 167 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
168 168 characters will cause an error message. Other modes include
169 169 'replace', which replaces unknown characters with a special
170 170 Unicode character, and 'ignore', which drops the character.
171 171 """
172 172
173 173 # can we do a lossless round-trip?
174 174 if isinstance(s, localstr):
175 175 return s._utf8
176 176 if isasciistr(s):
177 177 return s
178 178
179 179 try:
180 180 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
181 181 return u.encode("utf-8")
182 182 except UnicodeDecodeError as inst:
183 183 sub = s[max(0, inst.start - 10):inst.start + 10]
184 184 raise error.Abort("decoding near '%s': %s!" % (sub, inst))
185 185 except LookupError as k:
186 186 raise error.Abort(k, hint="please check your locale settings")
187 187
188 188 def unitolocal(u):
189 189 """Convert a unicode string to a byte string of local encoding"""
190 190 return tolocal(u.encode('utf-8'))
191 191
192 192 def unifromlocal(s):
193 193 """Convert a byte string of local encoding to a unicode string"""
194 194 return fromlocal(s).decode('utf-8')
195 195
196 196 def unimethod(bytesfunc):
197 197 """Create a proxy method that forwards __unicode__() and __str__() of
198 198 Python 3 to __bytes__()"""
199 199 def unifunc(obj):
200 200 return unifromlocal(bytesfunc(obj))
201 201 return unifunc
202 202
203 203 # converter functions between native str and byte string. use these if the
204 204 # character encoding is unknown (e.g. exception messages) or is known to
205 205 # be locale dependent (e.g. date formatting).
206 206 if pycompat.ispy3:
207 207 strtolocal = unitolocal
208 208 strfromlocal = unifromlocal
209 209 strmethod = unimethod
210 210 else:
211 211 strtolocal = pycompat.identity
212 212 strfromlocal = pycompat.identity
213 213 strmethod = pycompat.identity
214 214
215 215 if not _nativeenviron:
216 216 # now encoding and helper functions are available, recreate the environ
217 217 # dict to be exported to other modules
218 218 environ = dict((tolocal(k.encode(u'utf-8')), tolocal(v.encode(u'utf-8')))
219 219 for k, v in os.environ.items()) # re-exports
220 220
221 221 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
222 222 _wide = _sysstr(environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
223 223 and "WFA" or "WF")
224 224
225 225 def colwidth(s):
226 226 "Find the column width of a string for display in the local encoding"
227 227 return ucolwidth(s.decode(_sysstr(encoding), u'replace'))
228 228
229 229 def ucolwidth(d):
230 230 "Find the column width of a Unicode string for display"
231 231 eaw = getattr(unicodedata, 'east_asian_width', None)
232 232 if eaw is not None:
233 233 return sum([eaw(c) in _wide and 2 or 1 for c in d])
234 234 return len(d)
235 235
236 236 def getcols(s, start, c):
237 237 '''Use colwidth to find a c-column substring of s starting at byte
238 238 index start'''
239 239 for x in xrange(start + c, len(s)):
240 240 t = s[start:x]
241 241 if colwidth(t) == c:
242 242 return t
243 243
244 244 def trim(s, width, ellipsis='', leftside=False):
245 245 """Trim string 's' to at most 'width' columns (including 'ellipsis').
246 246
247 247 If 'leftside' is True, left side of string 's' is trimmed.
248 248 'ellipsis' is always placed at trimmed side.
249 249
250 250 >>> from .node import bin
251 251 >>> def bprint(s):
252 252 ... print(pycompat.sysstr(s))
253 253 >>> ellipsis = b'+++'
254 254 >>> from . import encoding
255 255 >>> encoding.encoding = b'utf-8'
256 256 >>> t = b'1234567890'
257 257 >>> bprint(trim(t, 12, ellipsis=ellipsis))
258 258 1234567890
259 259 >>> bprint(trim(t, 10, ellipsis=ellipsis))
260 260 1234567890
261 261 >>> bprint(trim(t, 8, ellipsis=ellipsis))
262 262 12345+++
263 263 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
264 264 +++67890
265 265 >>> bprint(trim(t, 8))
266 266 12345678
267 267 >>> bprint(trim(t, 8, leftside=True))
268 268 34567890
269 269 >>> bprint(trim(t, 3, ellipsis=ellipsis))
270 270 +++
271 271 >>> bprint(trim(t, 1, ellipsis=ellipsis))
272 272 +
273 273 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
274 274 >>> t = u.encode(pycompat.sysstr(encoding.encoding))
275 275 >>> bprint(trim(t, 12, ellipsis=ellipsis))
276 276 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
277 277 >>> bprint(trim(t, 10, ellipsis=ellipsis))
278 278 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
279 279 >>> bprint(trim(t, 8, ellipsis=ellipsis))
280 280 \xe3\x81\x82\xe3\x81\x84+++
281 281 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
282 282 +++\xe3\x81\x88\xe3\x81\x8a
283 283 >>> bprint(trim(t, 5))
284 284 \xe3\x81\x82\xe3\x81\x84
285 285 >>> bprint(trim(t, 5, leftside=True))
286 286 \xe3\x81\x88\xe3\x81\x8a
287 287 >>> bprint(trim(t, 4, ellipsis=ellipsis))
288 288 +++
289 289 >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
290 290 +++
291 291 >>> t = bin(b'112233445566778899aa') # invalid byte sequence
292 292 >>> bprint(trim(t, 12, ellipsis=ellipsis))
293 293 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
294 294 >>> bprint(trim(t, 10, ellipsis=ellipsis))
295 295 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
296 296 >>> bprint(trim(t, 8, ellipsis=ellipsis))
297 297 \x11\x22\x33\x44\x55+++
298 298 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
299 299 +++\x66\x77\x88\x99\xaa
300 300 >>> bprint(trim(t, 8))
301 301 \x11\x22\x33\x44\x55\x66\x77\x88
302 302 >>> bprint(trim(t, 8, leftside=True))
303 303 \x33\x44\x55\x66\x77\x88\x99\xaa
304 304 >>> bprint(trim(t, 3, ellipsis=ellipsis))
305 305 +++
306 306 >>> bprint(trim(t, 1, ellipsis=ellipsis))
307 307 +
308 308 """
309 309 try:
310 310 u = s.decode(_sysstr(encoding))
311 311 except UnicodeDecodeError:
312 312 if len(s) <= width: # trimming is not needed
313 313 return s
314 314 width -= len(ellipsis)
315 315 if width <= 0: # not enough room even for ellipsis
316 316 return ellipsis[:width + len(ellipsis)]
317 317 if leftside:
318 318 return ellipsis + s[-width:]
319 319 return s[:width] + ellipsis
320 320
321 321 if ucolwidth(u) <= width: # trimming is not needed
322 322 return s
323 323
324 324 width -= len(ellipsis)
325 325 if width <= 0: # not enough room even for ellipsis
326 326 return ellipsis[:width + len(ellipsis)]
327 327
328 328 if leftside:
329 329 uslice = lambda i: u[i:]
330 330 concat = lambda s: ellipsis + s
331 331 else:
332 332 uslice = lambda i: u[:-i]
333 333 concat = lambda s: s + ellipsis
334 334 for i in xrange(1, len(u)):
335 335 usub = uslice(i)
336 336 if ucolwidth(usub) <= width:
337 337 return concat(usub.encode(_sysstr(encoding)))
338 338 return ellipsis # not enough room for multi-column characters
339 339
340 340 def lower(s):
341 341 "best-effort encoding-aware case-folding of local string s"
342 342 try:
343 343 return asciilower(s)
344 344 except UnicodeDecodeError:
345 345 pass
346 346 try:
347 347 if isinstance(s, localstr):
348 348 u = s._utf8.decode("utf-8")
349 349 else:
350 350 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
351 351
352 352 lu = u.lower()
353 353 if u == lu:
354 354 return s # preserve localstring
355 355 return lu.encode(_sysstr(encoding))
356 356 except UnicodeError:
357 357 return s.lower() # we don't know how to fold this except in ASCII
358 358 except LookupError as k:
359 359 raise error.Abort(k, hint="please check your locale settings")
360 360
361 361 def upper(s):
362 362 "best-effort encoding-aware case-folding of local string s"
363 363 try:
364 364 return asciiupper(s)
365 365 except UnicodeDecodeError:
366 366 return upperfallback(s)
367 367
368 368 def upperfallback(s):
369 369 try:
370 370 if isinstance(s, localstr):
371 371 u = s._utf8.decode("utf-8")
372 372 else:
373 373 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
374 374
375 375 uu = u.upper()
376 376 if u == uu:
377 377 return s # preserve localstring
378 378 return uu.encode(_sysstr(encoding))
379 379 except UnicodeError:
380 380 return s.upper() # we don't know how to fold this except in ASCII
381 381 except LookupError as k:
382 382 raise error.Abort(k, hint="please check your locale settings")
383 383
384 384 class normcasespecs(object):
385 385 '''what a platform's normcase does to ASCII strings
386 386
387 387 This is specified per platform, and should be consistent with what normcase
388 388 on that platform actually does.
389 389
390 390 lower: normcase lowercases ASCII strings
391 391 upper: normcase uppercases ASCII strings
392 392 other: the fallback function should always be called
393 393
394 394 This should be kept in sync with normcase_spec in util.h.'''
395 395 lower = -1
396 396 upper = 1
397 397 other = 0
398 398
399 399 def jsonescape(s, paranoid=False):
400 400 '''returns a string suitable for JSON
401 401
402 402 JSON is problematic for us because it doesn't support non-Unicode
403 403 bytes. To deal with this, we take the following approach:
404 404
405 405 - localstr objects are converted back to UTF-8
406 406 - valid UTF-8/ASCII strings are passed as-is
407 407 - other strings are converted to UTF-8b surrogate encoding
408 408 - apply JSON-specified string escaping
409 409
410 410 (escapes are doubled in these tests)
411 411
412 412 >>> jsonescape(b'this is a test')
413 413 'this is a test'
414 414 >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
415 415 'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
416 416 >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
417 417 'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
418 418 >>> jsonescape(b'a weird byte: \\xdd')
419 419 'a weird byte: \\xed\\xb3\\x9d'
420 420 >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
421 421 'utf-8: caf\\xc3\\xa9'
422 422 >>> jsonescape(b'')
423 423 ''
424 424
425 425 If paranoid, non-ascii and common troublesome characters are also escaped.
426 426 This is suitable for web output.
427 427
428 428 >>> s = b'escape characters: \\0 \\x0b \\x7f'
429 429 >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
430 430 >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
431 431 >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
432 432 >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
433 433 'escape boundary: ~ \\\\u007f \\\\u0080'
434 434 >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
435 435 'a weird byte: \\\\udcdd'
436 436 >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
437 437 'utf-8: caf\\\\u00e9'
438 438 >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
439 439 'non-BMP: \\\\ud834\\\\udd1e'
440 440 >>> jsonescape(b'<foo@example.org>', paranoid=True)
441 441 '\\\\u003cfoo@example.org\\\\u003e'
442 442 '''
443 443
444 444 u8chars = toutf8b(s)
445 445 try:
446 446 return _jsonescapeu8fast(u8chars, paranoid)
447 447 except ValueError:
448 448 pass
449 449 return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
450 450
451 # We need to decode/encode U+DCxx codes transparently since invalid UTF-8
452 # bytes are mapped to that range.
453 if pycompat.ispy3:
454 _utf8strict = r'surrogatepass'
455 else:
456 _utf8strict = r'strict'
457
451 458 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
452 459
453 460 def getutf8char(s, pos):
454 461 '''get the next full utf-8 character in the given string, starting at pos
455 462
456 463 Raises a UnicodeError if the given location does not start a valid
457 464 utf-8 character.
458 465 '''
459 466
460 467 # find how many bytes to attempt decoding from first nibble
461 468 l = _utf8len[ord(s[pos:pos + 1]) >> 4]
462 469 if not l: # ascii
463 470 return s[pos:pos + 1]
464 471
465 472 c = s[pos:pos + l]
466 473 # validate with attempted decode
467 c.decode("utf-8")
474 c.decode("utf-8", _utf8strict)
468 475 return c
469 476
470 477 def toutf8b(s):
471 478 '''convert a local, possibly-binary string into UTF-8b
472 479
473 480 This is intended as a generic method to preserve data when working
474 481 with schemes like JSON and XML that have no provision for
475 482 arbitrary byte strings. As Mercurial often doesn't know
476 483 what encoding data is in, we use so-called UTF-8b.
477 484
478 485 If a string is already valid UTF-8 (or ASCII), it passes unmodified.
479 486 Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
480 487 uDC00-uDCFF.
481 488
482 489 Principles of operation:
483 490
484 491 - ASCII and UTF-8 data successfully round-trips and is understood
485 492 by Unicode-oriented clients
486 493 - filenames and file contents in arbitrary other encodings can
487 494 be round-tripped or recovered by clueful clients
488 495 - local strings that have a cached known UTF-8 encoding (aka
489 496 localstr) get sent as UTF-8 so Unicode-oriented clients get the
490 497 Unicode data they want
491 498 - because we must preserve UTF-8 bytestring in places such as
492 499 filenames, metadata can't be roundtripped without help
493 500
494 501 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
495 502 arbitrary bytes into an internal Unicode format that can be
496 503 re-encoded back into the original. Here we are exposing the
497 504 internal surrogate encoding as a UTF-8 string.)
498 505 '''
499 506
500 507 if not isinstance(s, localstr) and isasciistr(s):
501 508 return s
502 509 if "\xed" not in s:
503 510 if isinstance(s, localstr):
504 511 return s._utf8
505 512 try:
506 s.decode('utf-8')
513 s.decode('utf-8', _utf8strict)
507 514 return s
508 515 except UnicodeDecodeError:
509 516 pass
510 517
511 518 s = pycompat.bytestr(s)
512 519 r = ""
513 520 pos = 0
514 521 l = len(s)
515 522 while pos < l:
516 523 try:
517 524 c = getutf8char(s, pos)
518 525 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
519 526 # have to re-escape existing U+DCxx characters
520 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
527 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict)
521 528 pos += 1
522 529 else:
523 530 pos += len(c)
524 531 except UnicodeDecodeError:
525 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
532 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict)
526 533 pos += 1
527 534 r += c
528 535 return r
529 536
530 537 def fromutf8b(s):
531 538 '''Given a UTF-8b string, return a local, possibly-binary string.
532 539
533 540 return the original binary string. This
534 541 is a round-trip process for strings like filenames, but metadata
535 542 that was passed through tolocal will remain in UTF-8.
536 543
537 544 >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
538 545 >>> m = b"\\xc3\\xa9\\x99abcd"
539 546 >>> toutf8b(m)
540 547 '\\xc3\\xa9\\xed\\xb2\\x99abcd'
541 548 >>> roundtrip(m)
542 549 True
543 550 >>> roundtrip(b"\\xc2\\xc2\\x80")
544 551 True
545 552 >>> roundtrip(b"\\xef\\xbf\\xbd")
546 553 True
547 554 >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
548 555 True
549 556 >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
550 557 True
551 558 '''
552 559
553 560 if isasciistr(s):
554 561 return s
555 562 # fast path - look for uDxxx prefixes in s
556 563 if "\xed" not in s:
557 564 return s
558 565
559 566 # We could do this with the unicode type but some Python builds
560 567 # use UTF-16 internally (issue5031) which causes non-BMP code
561 568 # points to be escaped. Instead, we use our handy getutf8char
562 569 # helper again to walk the string without "decoding" it.
563 570
564 571 s = pycompat.bytestr(s)
565 572 r = ""
566 573 pos = 0
567 574 l = len(s)
568 575 while pos < l:
569 576 c = getutf8char(s, pos)
570 577 pos += len(c)
571 578 # unescape U+DCxx characters
572 579 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
573 c = chr(ord(c.decode("utf-8")) & 0xff)
580 c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xff)
574 581 r += c
575 582 return r
576 583
577 584 if pycompat.ispy3:
578 585 class strio(io.TextIOWrapper):
579 586 """Wrapper around TextIOWrapper that respects hg's encoding assumptions.
580 587
581 588 Also works around Python closing streams.
582 589 """
583 590
584 591 def __init__(self, buffer):
585 592 super(strio, self).__init__(buffer, encoding=_sysstr(encoding))
586 593
587 594 def __del__(self):
588 595 """Override __del__ so it doesn't close the underlying stream."""
589 596 else:
590 597 strio = pycompat.identity
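The effect on toutf8b()/fromutf8b() can be approximated with Python 3's built-in error handlers alone (a simplified sketch, not Mercurial's API; 'surrogateescape' here plays the role of the manual U+DCxx mapping in the code above):

    >>> raw = b'caf\xe9'  # latin-1 bytes, not valid UTF-8
    >>> u = raw.decode('utf-8', 'surrogateescape')  # maps 0xe9 to U+DCE9
    >>> u
    'caf\udce9'
    >>> utf8b = u.encode('utf-8', 'surrogatepass')  # roughly what toutf8b() emits
    >>> utf8b
    b'caf\xed\xb3\xa9'
    >>> utf8b.decode('utf-8', 'surrogatepass').encode('utf-8', 'surrogateescape')
    b'caf\xe9'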
@@ -1,79 +1,85 @@
1 1 # charencode.py - miscellaneous character encoding
2 2 #
3 3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 from __future__ import absolute_import
9 9
10 10 import array
11 11
12 12 from .. import (
13 13 pycompat,
14 14 )
15 15
16 16 def isasciistr(s):
17 17 try:
18 18 s.decode('ascii')
19 19 return True
20 20 except UnicodeDecodeError:
21 21 return False
22 22
23 23 def asciilower(s):
24 24 '''convert a string to lowercase if ASCII
25 25
26 26 Raises UnicodeDecodeError if non-ASCII characters are found.'''
27 27 s.decode('ascii')
28 28 return s.lower()
29 29
30 30 def asciiupper(s):
31 31 '''convert a string to uppercase if ASCII
32 32
33 33 Raises UnicodeDecodeError if non-ASCII characters are found.'''
34 34 s.decode('ascii')
35 35 return s.upper()
36 36
37 37 _jsonmap = []
38 38 _jsonmap.extend("\\u%04x" % x for x in range(32))
39 39 _jsonmap.extend(pycompat.bytechr(x) for x in range(32, 127))
40 40 _jsonmap.append('\\u007f')
41 41 _jsonmap[0x09] = '\\t'
42 42 _jsonmap[0x0a] = '\\n'
43 43 _jsonmap[0x22] = '\\"'
44 44 _jsonmap[0x5c] = '\\\\'
45 45 _jsonmap[0x08] = '\\b'
46 46 _jsonmap[0x0c] = '\\f'
47 47 _jsonmap[0x0d] = '\\r'
48 48 _paranoidjsonmap = _jsonmap[:]
49 49 _paranoidjsonmap[0x3c] = '\\u003c' # '<' (e.g. escape "</script>")
50 50 _paranoidjsonmap[0x3e] = '\\u003e' # '>'
51 51 _jsonmap.extend(pycompat.bytechr(x) for x in range(128, 256))
52 52
53 53 def jsonescapeu8fast(u8chars, paranoid):
54 54 """Convert a UTF-8 byte string to JSON-escaped form (fast path)
55 55
56 56 Raises ValueError if non-ASCII characters have to be escaped.
57 57 """
58 58 if paranoid:
59 59 jm = _paranoidjsonmap
60 60 else:
61 61 jm = _jsonmap
62 62 try:
63 63 return ''.join(jm[x] for x in bytearray(u8chars))
64 64 except IndexError:
65 65 raise ValueError
66 66
67 if pycompat.ispy3:
68 _utf8strict = r'surrogatepass'
69 else:
70 _utf8strict = r'strict'
71
67 72 def jsonescapeu8fallback(u8chars, paranoid):
68 73 """Convert a UTF-8 byte string to JSON-escaped form (slow path)
69 74
70 75 Escapes all non-ASCII characters even if paranoid is False.
71 76 """
72 77 if paranoid:
73 78 jm = _paranoidjsonmap
74 79 else:
75 80 jm = _jsonmap
76 81 # non-BMP char is represented as UTF-16 surrogate pair
77 u16codes = array.array(r'H', u8chars.decode('utf-8').encode('utf-16'))
82 u16b = u8chars.decode('utf-8', _utf8strict).encode('utf-16', _utf8strict)
83 u16codes = array.array(r'H', u16b)
78 84 u16codes.pop(0) # drop BOM
79 85 return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)
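The fallback's UTF-16 trick, decoding to code points and re-encoding as UTF-16 so that each 16-bit code unit becomes one \uXXXX escape, can be sketched in isolation; 'surrogatepass' is what lets a smuggled U+DCxx code point survive both conversions instead of raising:

    >>> import array
    >>> u16 = '\udcdd'.encode('utf-16', 'surrogatepass')  # 'strict' would raise
    >>> codes = array.array('H', u16)
    >>> hex(codes.pop(0))  # drop the BOM, as the code above does
    '0xfeff'
    >>> ''.join('\\u%04x' % x for x in codes)
    '\\udcdd'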
@@ -1,81 +1,81 @@
1 1 # this is a hack to make sure no escape characters are inserted into the output
2 2
3 3 from __future__ import absolute_import
4 4
5 5 import doctest
6 6 import os
7 7 import re
8 8 import sys
9 9
10 10 ispy3 = (sys.version_info[0] >= 3)
11 11
12 12 if 'TERM' in os.environ:
13 13 del os.environ['TERM']
14 14
15 15 class py3docchecker(doctest.OutputChecker):
16 16 def check_output(self, want, got, optionflags):
17 17 want2 = re.sub(r'''\bu(['"])(.*?)\1''', r'\1\2\1', want) # py2: u''
18 18 got2 = re.sub(r'''\bb(['"])(.*?)\1''', r'\1\2\1', got) # py3: b''
19 19 # py3: <exc.name>: b'<msg>' -> <name>: <msg>
20 20 # <exc.name>: <others> -> <name>: <others>
21 21 got2 = re.sub(r'''^mercurial\.\w+\.(\w+): (['"])(.*?)\2''', r'\1: \3',
22 22 got2, re.MULTILINE)
23 23 got2 = re.sub(r'^mercurial\.\w+\.(\w+): ', r'\1: ', got2, re.MULTILINE)
24 24 return any(doctest.OutputChecker.check_output(self, w, g, optionflags)
25 25 for w, g in [(want, got), (want2, got2)])
26 26
27 27 # TODO: migrate doctests to py3 and enable them on both versions
28 28 def testmod(name, optionflags=0, testtarget=None, py2=True, py3=True):
29 29 if not (not ispy3 and py2 or ispy3 and py3):
30 30 return
31 31 __import__(name)
32 32 mod = sys.modules[name]
33 33 if testtarget is not None:
34 34 mod = getattr(mod, testtarget)
35 35
36 36 # minimal copy of doctest.testmod()
37 37 finder = doctest.DocTestFinder()
38 38 checker = None
39 39 if ispy3:
40 40 checker = py3docchecker()
41 41 runner = doctest.DocTestRunner(checker=checker, optionflags=optionflags)
42 42 for test in finder.find(mod, name):
43 43 runner.run(test)
44 44 runner.summarize()
45 45
46 46 testmod('mercurial.changegroup')
47 47 testmod('mercurial.changelog')
48 48 testmod('mercurial.color')
49 49 testmod('mercurial.config')
50 50 testmod('mercurial.context')
51 51 testmod('mercurial.dagparser', optionflags=doctest.NORMALIZE_WHITESPACE)
52 52 testmod('mercurial.dispatch')
53 testmod('mercurial.encoding', py3=False) # py3: multiple encoding issues
53 testmod('mercurial.encoding')
54 54 testmod('mercurial.formatter', py3=False) # py3: write bytes to stdout
55 55 testmod('mercurial.hg')
56 56 testmod('mercurial.hgweb.hgwebdir_mod', py3=False) # py3: repr(bytes) ?
57 57 testmod('mercurial.match')
58 58 testmod('mercurial.mdiff')
59 59 testmod('mercurial.minirst')
60 60 testmod('mercurial.patch', py3=False) # py3: bytes[n], etc. ?
61 61 testmod('mercurial.pathutil', py3=False) # py3: os.sep
62 62 testmod('mercurial.parser')
63 63 testmod('mercurial.pycompat')
64 64 testmod('mercurial.revsetlang')
65 65 testmod('mercurial.smartset')
66 66 testmod('mercurial.store')
67 67 testmod('mercurial.subrepo')
68 68 testmod('mercurial.templatefilters')
69 69 testmod('mercurial.templater')
70 70 testmod('mercurial.ui')
71 71 testmod('mercurial.url')
72 72 testmod('mercurial.util', py3=False) # py3: multiple bytes/unicode issues
73 73 testmod('mercurial.util', testtarget='platform')
74 74 testmod('hgext.convert.convcmd', py3=False) # py3: use of str() ?
75 75 testmod('hgext.convert.cvsps')
76 76 testmod('hgext.convert.filemap')
77 77 testmod('hgext.convert.p4')
78 78 testmod('hgext.convert.subversion')
79 79 testmod('hgext.mq')
80 80 # Helper scripts in tests/ that have doctests:
81 81 testmod('drawdag')