upstream/mercurial-mirror Commit - r34215:aa877860

py3: use 'surrogatepass' error handler to process U+DCxx transparently...

Yuya Nishihara -

r34215:aa877860 default

parent child

mercurial/encoding.py

0 +12 -5

                      pass
                  return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
+             # We need to decode/encode U+DCxx codes transparently since invalid UTF-8
+             # bytes are mapped to that range.
+             if pycompat.ispy3:
+                 _utf8strict = r'surrogatepass'
+             else:
+                 _utf8strict = r'strict'
              _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
              def getutf8char(s, pos):
                  c = s[pos:pos + l]
                  # validate with attempted decode
-                 c.decode("utf-8")
+                 c.decode("utf-8", _utf8strict)
                  return c
              def toutf8b(s):
                      if isinstance(s, localstr):
                          return s._utf8
                      try:
-                         s.decode('utf-8')
+                         s.decode('utf-8', _utf8strict)
                          return s
                      except UnicodeDecodeError:
                          pass
                          c = getutf8char(s, pos)
                          if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
                              # have to re-escape existing U+DCxx characters
-                             c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
+                             c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict)
                              pos += 1
                          else:
                              pos += len(c)
                      except UnicodeDecodeError:
-                         c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
+                         c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict)
                          pos += 1
                      r += c
                  return r
                      pos += len(c)
                      # unescape U+DCxx characters
                      if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
-                         c = chr(ord(c.decode("utf-8")) & 0xff)
+                         c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xff)
                      r += c
                  return r

mercurial/pure/charencode.py

0 +7 -1

                  except IndexError:
                      raise ValueError
+             if pycompat.ispy3:
+                 _utf8strict = r'surrogatepass'
+             else:
+                 _utf8strict = r'strict'
              def jsonescapeu8fallback(u8chars, paranoid):
                  """Convert a UTF-8 byte string to JSON-escaped form (slow path)
                  else:
                      jm = _jsonmap
                  # non-BMP char is represented as UTF-16 surrogate pair
-                 u16codes = array.array(r'H', u8chars.decode('utf-8').encode('utf-16'))
+                 u16b = u8chars.decode('utf-8', _utf8strict).encode('utf-16', _utf8strict)
+                 u16codes = array.array(r'H', u16b)
                  u16codes.pop(0)  # drop BOM
                  return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)

tests/test-doctest.py

0 +1 -1

              testmod('mercurial.context')
              testmod('mercurial.dagparser', optionflags=doctest.NORMALIZE_WHITESPACE)
              testmod('mercurial.dispatch')
-             testmod('mercurial.encoding', py3=False)  # py3: multiple encoding issues
+             testmod('mercurial.encoding')
              testmod('mercurial.formatter', py3=False)  # py3: write bytes to stdout
              testmod('mercurial.hg')
              testmod('mercurial.hgweb.hgwebdir_mod', py3=False)  # py3: repr(bytes) ?

General Comments 0

Write
Preview

You need to be logged in to leave comments. Log in now

No TODOs yet

	Site-wide shortcuts
/	Use quick search box
g h	Go to home page
g g	Go to my private gists page
g G	Go to my public gists page
g 0-9	Go to bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Go to summary page
g c	Go to changelog page
g f	Go to files page
g F	Go to files page with file search activated
g p	Go to pull requests page
g o	Go to repository settings
g O	Go to repository access permissions settings
t s	Toggle sidebar on some pages