upstream/mercurial-mirror Commit - r34215:aa877860

py3: use 'surrogatepass' error handler to process U+DCxx transparently...

Yuya Nishihara -

r34215:aa877860 default

parent child

mercurial/encoding.py

0 +12 -5

                     pass
                 return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
+            # We need to decode/encode U+DCxx codes transparently since invalid UTF-8
+            # bytes are mapped to that range.
+            if pycompat.ispy3:
+                _utf8strict = r'surrogatepass'
+            else:
+                _utf8strict = r'strict'
             _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
             def getutf8char(s, pos):
                 c = s[pos:pos + l]
                 # validate with attempted decode
-                c.decode("utf-8")
+                c.decode("utf-8", _utf8strict)
                 return c
             def toutf8b(s):
                     if isinstance(s, localstr):
                         return s._utf8
                     try:
-                        s.decode('utf-8')
+                        s.decode('utf-8', _utf8strict)
                         return s
                     except UnicodeDecodeError:
                         pass
                         c = getutf8char(s, pos)
                         if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
                             # have to re-escape existing U+DCxx characters
-                            c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
+                            c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict)
                             pos += 1
                         else:
                             pos += len(c)
                     except UnicodeDecodeError:
-                        c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
+                        c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict)
                         pos += 1
                     r += c
                 return r
                     pos += len(c)
                     # unescape U+DCxx characters
                     if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
-                        c = chr(ord(c.decode("utf-8")) & 0xff)
+                        c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xff)
                     r += c
                 return r

mercurial/pure/charencode.py

0 +7 -1

                 except IndexError:
                     raise ValueError
+            if pycompat.ispy3:
+                _utf8strict = r'surrogatepass'
+            else:
+                _utf8strict = r'strict'
             def jsonescapeu8fallback(u8chars, paranoid):
                 """Convert a UTF-8 byte string to JSON-escaped form (slow path)
                 else:
                     jm = _jsonmap
                 # non-BMP char is represented as UTF-16 surrogate pair
-                u16codes = array.array(r'H', u8chars.decode('utf-8').encode('utf-16'))
+                u16b = u8chars.decode('utf-8', _utf8strict).encode('utf-16', _utf8strict)
+                u16codes = array.array(r'H', u16b)
                 u16codes.pop(0)  # drop BOM
                 return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)

tests/test-doctest.py

0 +1 -1

             testmod('mercurial.context')
             testmod('mercurial.dagparser', optionflags=doctest.NORMALIZE_WHITESPACE)
             testmod('mercurial.dispatch')
-            testmod('mercurial.encoding', py3=False)  # py3: multiple encoding issues
+            testmod('mercurial.encoding')
             testmod('mercurial.formatter', py3=False)  # py3: write bytes to stdout
             testmod('mercurial.hg')
             testmod('mercurial.hgweb.hgwebdir_mod', py3=False)  # py3: repr(bytes) ?

General Comments 0

Write
Preview

You need to be logged in to leave comments. Log in now

No TODOs yet

	Site-wide shortcuts
/	Use quick search box
g h	Go to home page
g g	Go to my private gists page
g G	Go to my public gists page
g 0-9	Go to bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Go to summary page
g c	Go to changelog page
g f	Go to files page
g F	Go to files page with file search activated
g p	Go to pull requests page
g o	Go to repository settings
g O	Go to repository access permissions settings
t s	Toggle sidebar on some pages