Show More
@@ -448,6 +448,13 b' def jsonescape(s, paranoid=False):' | |||||
448 | pass |
|
448 | pass | |
449 | return charencodepure.jsonescapeu8fallback(u8chars, paranoid) |
|
449 | return charencodepure.jsonescapeu8fallback(u8chars, paranoid) | |
450 |
|
450 | |||
|
451 | # We need to decode/encode U+DCxx codes transparently since invalid UTF-8 | |||
|
452 | # bytes are mapped to that range. | |||
|
453 | if pycompat.ispy3: | |||
|
454 | _utf8strict = r'surrogatepass' | |||
|
455 | else: | |||
|
456 | _utf8strict = r'strict' | |||
|
457 | ||||
451 | _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4] |
|
458 | _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4] | |
452 |
|
459 | |||
453 | def getutf8char(s, pos): |
|
460 | def getutf8char(s, pos): | |
@@ -464,7 +471,7 b' def getutf8char(s, pos):' | |||||
464 |
|
471 | |||
465 | c = s[pos:pos + l] |
|
472 | c = s[pos:pos + l] | |
466 | # validate with attempted decode |
|
473 | # validate with attempted decode | |
467 | c.decode("utf-8") |
|
474 | c.decode("utf-8", _utf8strict) | |
468 | return c |
|
475 | return c | |
469 |
|
476 | |||
470 | def toutf8b(s): |
|
477 | def toutf8b(s): | |
@@ -503,7 +510,7 b' def toutf8b(s):' | |||||
503 | if isinstance(s, localstr): |
|
510 | if isinstance(s, localstr): | |
504 | return s._utf8 |
|
511 | return s._utf8 | |
505 | try: |
|
512 | try: | |
506 | s.decode('utf-8') |
|
513 | s.decode('utf-8', _utf8strict) | |
507 | return s |
|
514 | return s | |
508 | except UnicodeDecodeError: |
|
515 | except UnicodeDecodeError: | |
509 | pass |
|
516 | pass | |
@@ -517,12 +524,12 b' def toutf8b(s):' | |||||
517 | c = getutf8char(s, pos) |
|
524 | c = getutf8char(s, pos) | |
518 | if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf": |
|
525 | if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf": | |
519 | # have to re-escape existing U+DCxx characters |
|
526 | # have to re-escape existing U+DCxx characters | |
520 | c = unichr(0xdc00 + ord(s[pos])).encode('utf-8') |
|
527 | c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict) | |
521 | pos += 1 |
|
528 | pos += 1 | |
522 | else: |
|
529 | else: | |
523 | pos += len(c) |
|
530 | pos += len(c) | |
524 | except UnicodeDecodeError: |
|
531 | except UnicodeDecodeError: | |
525 | c = unichr(0xdc00 + ord(s[pos])).encode('utf-8') |
|
532 | c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict) | |
526 | pos += 1 |
|
533 | pos += 1 | |
527 | r += c |
|
534 | r += c | |
528 | return r |
|
535 | return r | |
@@ -570,7 +577,7 b' def fromutf8b(s):' | |||||
570 | pos += len(c) |
|
577 | pos += len(c) | |
571 | # unescape U+DCxx characters |
|
578 | # unescape U+DCxx characters | |
572 | if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf": |
|
579 | if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf": | |
573 | c = chr(ord(c.decode("utf-8")) & 0xff) |
|
580 | c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xff) | |
574 | r += c |
|
581 | r += c | |
575 | return r |
|
582 | return r | |
576 |
|
583 |
@@ -64,6 +64,11 b' def jsonescapeu8fast(u8chars, paranoid):' | |||||
64 | except IndexError: |
|
64 | except IndexError: | |
65 | raise ValueError |
|
65 | raise ValueError | |
66 |
|
66 | |||
|
67 | if pycompat.ispy3: | |||
|
68 | _utf8strict = r'surrogatepass' | |||
|
69 | else: | |||
|
70 | _utf8strict = r'strict' | |||
|
71 | ||||
67 | def jsonescapeu8fallback(u8chars, paranoid): |
|
72 | def jsonescapeu8fallback(u8chars, paranoid): | |
68 | """Convert a UTF-8 byte string to JSON-escaped form (slow path) |
|
73 | """Convert a UTF-8 byte string to JSON-escaped form (slow path) | |
69 |
|
74 | |||
@@ -74,6 +79,7 b' def jsonescapeu8fallback(u8chars, parano' | |||||
74 | else: |
|
79 | else: | |
75 | jm = _jsonmap |
|
80 | jm = _jsonmap | |
76 | # non-BMP char is represented as UTF-16 surrogate pair |
|
81 | # non-BMP char is represented as UTF-16 surrogate pair | |
77 | u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16')) |
|
82 | u16b = u8chars.decode('utf-8', _utf8strict).encode('utf-16', _utf8strict) | |
|
83 | u16codes = array.array(r'H', u16b) | |||
78 | u16codes.pop(0) # drop BOM |
|
84 | u16codes.pop(0) # drop BOM | |
79 | return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes) |
|
85 | return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes) |
@@ -50,7 +50,7 b" testmod('mercurial.config')" | |||||
50 | testmod('mercurial.context') |
|
50 | testmod('mercurial.context') | |
51 | testmod('mercurial.dagparser', optionflags=doctest.NORMALIZE_WHITESPACE) |
|
51 | testmod('mercurial.dagparser', optionflags=doctest.NORMALIZE_WHITESPACE) | |
52 | testmod('mercurial.dispatch') |
|
52 | testmod('mercurial.dispatch') | |
53 | testmod('mercurial.encoding', py3=False) # py3: multiple encoding issues |
|
53 | testmod('mercurial.encoding') | |
54 | testmod('mercurial.formatter', py3=False) # py3: write bytes to stdout |
|
54 | testmod('mercurial.formatter', py3=False) # py3: write bytes to stdout | |
55 | testmod('mercurial.hg') |
|
55 | testmod('mercurial.hg') | |
56 | testmod('mercurial.hgweb.hgwebdir_mod', py3=False) # py3: repr(bytes) ? |
|
56 | testmod('mercurial.hgweb.hgwebdir_mod', py3=False) # py3: repr(bytes) ? |
General Comments 0
You need to be logged in to leave comments.
Login now