diff --git a/mercurial/encoding.py b/mercurial/encoding.py --- a/mercurial/encoding.py +++ b/mercurial/encoding.py @@ -7,6 +7,7 @@ from __future__ import absolute_import +import array import locale import os import unicodedata @@ -380,8 +381,8 @@ class normcasespecs(object): _jsonmap = [] _jsonmap.extend("\\u%04x" % x for x in xrange(32)) -_jsonmap.extend(chr(x) for x in xrange(32, 256)) -_jsonmap[0x7f] = '\\u007f' +_jsonmap.extend(chr(x) for x in xrange(32, 127)) +_jsonmap.append('\\u007f') _jsonmap[0x09] = '\\t' _jsonmap[0x0a] = '\\n' _jsonmap[0x22] = '\\"' @@ -389,8 +390,10 @@ class normcasespecs(object): _jsonmap[0x08] = '\\b' _jsonmap[0x0c] = '\\f' _jsonmap[0x0d] = '\\r' +_paranoidjsonmap = _jsonmap[:] +_jsonmap.extend(chr(x) for x in xrange(128, 256)) -def jsonescape(s): +def jsonescape(s, paranoid=False): '''returns a string suitable for JSON JSON is problematic for us because it doesn't support non-Unicode @@ -415,9 +418,34 @@ def jsonescape(s): 'utf-8: caf\\xc3\\xa9' >>> jsonescape('') '' + + If paranoid, non-ascii characters are also escaped. This is suitable for + web output. + + >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True) + 'escape boundary: ~ \\\\u007f \\\\u0080' + >>> jsonescape('a weird byte: \\xdd', paranoid=True) + 'a weird byte: \\\\udcdd' + >>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True) + 'utf-8: caf\\\\u00e9' + >>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True) + 'non-BMP: \\\\ud834\\\\udd1e' ''' - return ''.join(_jsonmap[x] for x in bytearray(toutf8b(s))) + if paranoid: + jm = _paranoidjsonmap + else: + jm = _jsonmap + + u8chars = toutf8b(s) + try: + return ''.join(jm[x] for x in bytearray(u8chars)) # fast path + except IndexError: + pass + # non-BMP char is represented as UTF-16 surrogate pair + u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16')) + u16codes.pop(0) # drop BOM + return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes) _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]