upstream/mercurial-mirror Commit - r28068:9ece901f

encoding: add option to escape non-ascii characters in JSON...

Yuya Nishihara -

r28068:9ece901f default

parent child

mercurial/encoding.py

0 +32 -4

             from __future__ import absolute_import
+            import array
             import locale
             import os
             import unicodedata
             _jsonmap = []
             _jsonmap.extend("\\u%04x" % x for x in xrange(32))
-            _jsonmap.extend(chr(x) for x in xrange(32, 256))
+            _jsonmap.extend(chr(x) for x in xrange(32, 127))
-            _jsonmap[0x7f] = '\\u007f'
+            _jsonmap.append('\\u007f')
             _jsonmap[0x09] = '\\t'
             _jsonmap[0x0a] = '\\n'
             _jsonmap[0x22] = '\\"'
             _jsonmap[0x08] = '\\b'
             _jsonmap[0x0c] = '\\f'
             _jsonmap[0x0d] = '\\r'
+            _paranoidjsonmap = _jsonmap[:]
+            _jsonmap.extend(chr(x) for x in xrange(128, 256))
-            def jsonescape(s):
+            def jsonescape(s, paranoid=False):
                 '''returns a string suitable for JSON
                 JSON is problematic for us because it doesn't support non-Unicode
                 'utf-8: caf\\xc3\\xa9'
                 >>> jsonescape('')
                 ''
+                If paranoid, non-ascii characters are also escaped. This is suitable for
+                web output.
+                >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
+                'escape boundary: ~ \\\\u007f \\\\u0080'
+                >>> jsonescape('a weird byte: \\xdd', paranoid=True)
+                'a weird byte: \\\\udcdd'
+                >>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True)
+                'utf-8: caf\\\\u00e9'
+                >>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
+                'non-BMP: \\\\ud834\\\\udd1e'
                 '''
-                return ''.join(_jsonmap[x] for x in bytearray(toutf8b(s)))
+                if paranoid:
+                    jm = _paranoidjsonmap
+                else:
+                    jm = _jsonmap
+                u8chars = toutf8b(s)
+                try:
+                    return ''.join(jm[x] for x in bytearray(u8chars))  # fast path
+                except IndexError:
+                    pass
+                # non-BMP char is represented as UTF-16 surrogate pair
+                u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
+                u16codes.pop(0)  # drop BOM
+                return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)
             _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]

General Comments 0

Write
Preview

You need to be logged in to leave comments. Log in now

No TODOs yet

	Site-wide shortcuts
/	Use quick search box
g h	Go to home page
g g	Go to my private gists page
g G	Go to my public gists page
g 0-9	Go to bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Go to summary page
g c	Go to changelog page
g f	Go to files page
g F	Go to files page with file search activated
g p	Go to pull requests page
g o	Go to repository settings
g O	Go to repository access permissions settings
t s	Toggle sidebar on some pages