encoding.py
620 lines
| 20.0 KiB
| text/x-python
|
PythonLexer
/ mercurial / encoding.py
Martin Geisler
|
r8226 | # encoding.py - character transcoding support for Mercurial | ||
# | ||||
# Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others | ||||
# | ||||
# This software may be used and distributed according to the terms of the | ||||
Matt Mackall
|
r10263 | # GNU General Public License version 2 or any later version. | ||
Matt Mackall
|
r7948 | |||
Gregory Szorc
|
r27355 | from __future__ import absolute_import | ||
Yuya Nishihara
|
r28068 | import array | ||
Gregory Szorc
|
r27355 | import locale | ||
import os | ||||
import unicodedata | ||||
from . import ( | ||||
error, | ||||
Yuya Nishihara
|
r30030 | pycompat, | ||
Gregory Szorc
|
r27355 | ) | ||
Matt Mackall
|
r7948 | |||
Yuya Nishihara
|
r30033 | _sysstr = pycompat.sysstr | ||
Yuya Nishihara
|
# Python 3 removed unichr(); chr() already produces a unicode character
# there, so alias it for the code below that builds unicode escapes.
if pycompat.ispy3:
    unichr = chr
Augie Fackler
|
# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
_ignore = [unichr(int(x, 16)).encode("utf-8") for x in
           "200c 200d 200e 200f 202a 202b 202c 202d 202e "
           "206a 206b 206c 206d 206e 206f feff".split()]
# verify the next function will work
if pycompat.ispy3:
    # indexing bytes yields ints on Python 3
    assert set(i[0] for i in _ignore) == set([ord(b'\xe2'), ord(b'\xef')])
else:
    # indexing str yields one-character strings on Python 2
    assert set(i[0] for i in _ignore) == set(["\xe2", "\xef"])
Augie Fackler
|
r23596 | |||
def hfsignoreclean(s):
    """Remove codepoints ignored by HFS+ from s.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # Every ignorable codepoint encodes to a UTF-8 sequence starting with
    # one of these two lead bytes; bail out early when neither is present.
    if "\xe2" not in s and "\xef" not in s:
        return s
    for seq in _ignore:
        s = s.replace(seq, '')
    return s
Yuya Nishihara
|
# encoding.environ is provided read-only, which may not be used to modify
# the process environment
_nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ)
if not pycompat.ispy3:
    # Python 2: os.environ already holds byte strings
    environ = os.environ
elif _nativeenviron:
    # Python 3 with native bytes environ support
    environ = os.environb
else:
    # preferred encoding isn't known yet; use utf-8 to avoid unicode error
    # and recreate it once encoding is settled
    environ = dict((k.encode(u'utf-8'), v.encode(u'utf-8'))
                   for k, v in os.environ.items())
Dan Villiom Podlaski Christiansen
|
def _getpreferredencoding():
    '''
    On darwin, getpreferredencoding ignores the locale environment and
    always returns mac-roman. http://bugs.python.org/issue6202 fixes this
    for Python 2.7 and up. This is the same corrected code for earlier
    Python versions.

    However, we can't use a version check for this method, as some distributions
    patch Python to fix this. Instead, we use it as a 'fixer' for the mac-roman
    encoding, as it is unlikely that this encoding is the actually expected.
    '''
    try:
        locale.CODESET
    except AttributeError:
        # Fall back to parsing environment variables :-(
        return locale.getdefaultlocale()[1]

    # temporarily switch LC_CTYPE to the user's environment so nl_langinfo
    # reports the real codeset, then restore the previous locale
    oldloc = locale.setlocale(locale.LC_CTYPE)
    locale.setlocale(locale.LC_CTYPE, "")
    result = locale.nl_langinfo(locale.CODESET)
    locale.setlocale(locale.LC_CTYPE, oldloc)

    return result
# map of misreported codec names to callables producing the correct name;
# values are callables so the mac-roman fix is only computed when needed
_encodingfixers = {
    '646': lambda: 'ascii',
    'ANSI_X3.4-1968': lambda: 'ascii',
    'mac-roman': _getpreferredencoding
}
Matt Mackall
|
r7948 | |||
# determine the local character encoding: HGENCODING wins, then the
# locale's preferred encoding, corrected by _encodingfixers
try:
    encoding = environ.get("HGENCODING")
    if not encoding:
        # NOTE(review): .encode('ascii') yields bytes on Python 3 --
        # presumably to keep the codec name a byte string; confirm.
        encoding = locale.getpreferredencoding().encode('ascii') or 'ascii'
        encoding = _encodingfixers.get(encoding, lambda: encoding)()
except locale.Error:
    encoding = 'ascii'

# how decode errors are handled ('strict', 'replace', or 'ignore')
encodingmode = environ.get("HGENCODINGMODE", "strict")
# encoding tried for pre-locale-support repository data (see tolocal)
fallbackencoding = 'ISO-8859-1'
Matt Mackall
|
class localstr(str):
    '''This class allows strings that are unmodified to be
    round-tripped to the local encoding and back'''

    def __new__(cls, u, l):
        # the str value itself is the local-encoding form; the known UTF-8
        # form rides along so fromlocal() can recover it losslessly
        obj = str.__new__(cls, l)
        obj._utf8 = u
        return obj

    def __hash__(self):
        # hash the UTF-8 form to avoid collisions in local string space
        return hash(self._utf8)
Matt Mackall
|
def tolocal(s):
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    >>> u = 'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = 'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> 'foo: ?' in d
    False
    >>> l1 = 'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    try:
        try:
            # make sure string is actually stored in UTF-8
            u = s.decode('UTF-8')
            if encoding == 'UTF-8':
                # fast path
                return s
            r = u.encode(_sysstr(encoding), u"replace")
            if u == r.decode(_sysstr(encoding)):
                # r is a safe, non-lossy encoding of s
                return r
            # lossy conversion: keep the original UTF-8 alongside so
            # fromlocal() can round-trip it
            return localstr(s, r)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                u = s.decode(_sysstr(fallbackencoding))
                r = u.encode(_sysstr(encoding), u"replace")
                if u == r.decode(_sysstr(encoding)):
                    # r is a safe, non-lossy encoding of s
                    return r
                return localstr(u.encode('UTF-8'), r)
            except UnicodeDecodeError:
                u = s.decode("utf-8", "replace") # last ditch
                # can't round-trip
                return u.encode(_sysstr(encoding), u"replace")
    except LookupError as k:
        # the configured encoding name is unknown to Python
        raise error.Abort(k, hint="please check your locale settings")
Matt Mackall
|
r7948 | |||
def fromlocal(s):
    """
    Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.
    """

    # can we do a lossless round-trip?
    if isinstance(s, localstr):
        return s._utf8

    try:
        u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
        return u.encode("utf-8")
    except UnicodeDecodeError as inst:
        # show ~10 bytes of context around the offending position
        sub = s[max(0, inst.start - 10):inst.start + 10]
        raise error.Abort("decoding near '%s': %s!" % (sub, inst))
    except LookupError as k:
        # the configured encoding name is unknown to Python
        raise error.Abort(k, hint="please check your locale settings")
Matt Mackall
|
r7948 | |||
Yuya Nishihara
|
def unitolocal(u):
    """Convert a unicode string to a byte string of local encoding"""
    # UTF-8 is the internal representation, so route through tolocal()
    return tolocal(u.encode('utf-8'))
def unifromlocal(s):
    """Convert a byte string of local encoding to a unicode string"""
    # fromlocal() yields UTF-8 bytes, which always decode cleanly here
    return fromlocal(s).decode('utf-8')
Yuya Nishihara
|
# converter functions between native str and byte string. use these if the
# character encoding is not aware (e.g. exception message) or is known to
# be locale dependent (e.g. date formatting.)
if pycompat.ispy3:
    strtolocal = unitolocal
    strfromlocal = unifromlocal
else:
    # on Python 2 the native str already is the local byte string
    strtolocal = pycompat.identity
    strfromlocal = pycompat.identity
Yuya Nishihara
|
r31448 | |||
Yuya Nishihara
|
if not _nativeenviron:
    # now encoding and helper functions are available, recreate the environ
    # dict to be exported to other modules
    environ = dict((tolocal(k.encode(u'utf-8')), tolocal(v.encode(u'utf-8')))
                   for k, v in os.environ.items())
Matt Mackall
|
# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
# The value is the set of east_asian_width() classes counted as 2 columns.
if environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide":
    wide = "WFA"
else:
    wide = "WF"
Matt Mackall
|
r12866 | |||
Matt Mackall
|
def colwidth(s):
    "Find the column width of a string for display in the local encoding"
    # decode with 'replace' so broken byte sequences still get a width
    return ucolwidth(s.decode(_sysstr(encoding), u'replace'))
FUJIWARA Katsunori
|
r15066 | |||
def ucolwidth(d):
    "Find the column width of a Unicode string for display"
    # east_asian_width may be missing on minimal unicodedata builds;
    # fall back to one column per character in that case
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        return len(d)
    return sum([2 if eaw(c) in wide else 1 for c in d])
Matt Mackall
|
def getcols(s, start, c):
    '''Use colwidth to find a c-column substring of s starting at byte
    index start'''
    # NOTE(review): falls through and returns None implicitly when no
    # prefix reaches exactly c columns -- callers appear to assume a
    # match exists; confirm.
    for x in xrange(start + c, len(s)):
        t = s[start:x]
        if colwidth(t) == c:
            return t
FUJIWARA Katsunori
|
def trim(s, width, ellipsis='', leftside=False):
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    >>> ellipsis = '+++'
    >>> from . import encoding
    >>> encoding.encoding = 'utf-8'
    >>> t= '1234567890'
    >>> print trim(t, 12, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 10, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 8, ellipsis=ellipsis)
    12345+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++67890
    >>> print trim(t, 8)
    12345678
    >>> print trim(t, 8, leftside=True)
    34567890
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(encoding.encoding)
    >>> print trim(t, 12, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 10, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 8, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 5)
    \xe3\x81\x82\xe3\x81\x84
    >>> print trim(t, 5, leftside=True)
    \xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 4, ellipsis=ellipsis)
    +++
    >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
    +++
    >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
    >>> print trim(t, 12, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 10, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 8, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\x66\x77\x88\x99\xaa
    >>> print trim(t, 8)
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> print trim(t, 8, leftside=True)
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    """
    try:
        u = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        # undecodable input: fall back to byte-based trimming, treating
        # each byte as one column
        if len(s) <= width: # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0: # no enough room even for ellipsis
            return ellipsis[:width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width: # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0: # no enough room even for ellipsis
        return ellipsis[:width + len(ellipsis)]

    # trim one character at a time from the chosen side until it fits
    if leftside:
        uslice = lambda i: u[i:]
        concat = lambda s: ellipsis + s
    else:
        uslice = lambda i: u[:-i]
        concat = lambda s: s + ellipsis
    for i in xrange(1, len(u)):
        usub = uslice(i)
        if ucolwidth(usub) <= width:
            return concat(usub.encode(_sysstr(encoding)))
    return ellipsis # no enough room for multi-column characters
FUJIWARA Katsunori
|
r22973 | def _asciilower(s): | ||
Siddharth Agarwal
|
r22778 | '''convert a string to lowercase if ASCII | ||
Raises UnicodeDecodeError if non-ASCII characters are found.''' | ||||
s.decode('ascii') | ||||
return s.lower() | ||||
FUJIWARA Katsunori
|
def asciilower(s):
    # delay importing avoids cyclic dependency around "parsers" in
    # pure Python build (util => i18n => encoding => parsers => util)
    from . import parsers
    # prefer the C implementation when parsers provides one, then rebind
    # the module-level name so the lookup happens only on the first call
    impl = getattr(parsers, 'asciilower', _asciilower)
    global asciilower
    asciilower = impl
    return impl(s)
Siddharth Agarwal
|
r22778 | |||
Siddharth Agarwal
|
r24578 | def _asciiupper(s): | ||
'''convert a string to uppercase if ASCII | ||||
Raises UnicodeDecodeError if non-ASCII characters are found.''' | ||||
s.decode('ascii') | ||||
return s.upper() | ||||
def asciiupper(s):
    # delay importing avoids cyclic dependency around "parsers" in
    # pure Python build (util => i18n => encoding => parsers => util)
    from . import parsers
    # prefer the C implementation when parsers provides one, then rebind
    # the module-level name so the lookup happens only on the first call
    impl = getattr(parsers, 'asciiupper', _asciiupper)
    global asciiupper
    asciiupper = impl
    return impl(s)
Matt Mackall
|
def lower(s):
    "best-effort encoding-aware case-folding of local string s"
    try:
        # fast path for pure-ASCII strings
        return asciilower(s)
    except UnicodeDecodeError:
        pass
    try:
        if isinstance(s, localstr):
            # use the cached, known-good UTF-8 form
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        lu = u.lower()
        if u == lu:
            return s # preserve localstring
        return lu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.lower() # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
def upper(s):
    "best-effort encoding-aware case-folding of local string s"
    try:
        # fast path for pure-ASCII strings
        return asciiupper(s)
    except UnicodeDecodeError:
        return upperfallback(s)
def upperfallback(s):
    "encoding-aware uppercasing used when the ASCII fast path fails"
    try:
        if isinstance(s, localstr):
            # use the cached, known-good UTF-8 form
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        uu = u.upper()
        if u == uu:
            return s # preserve localstring
        return uu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.upper() # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
Matt Mackall
|
r16133 | |||
Siddharth Agarwal
|
class normcasespecs(object):
    '''what a platform's normcase does to ASCII strings

    This is specified per platform, and should be consistent with what normcase
    on that platform actually does.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h.'''
    # sentinel constants; the numeric values are part of the C interface
    other = 0
    upper = 1
    lower = -1
Yuya Nishihara
|
# byte -> JSON-escaped-string table, indexed by byte value
_jsonmap = []
# control characters get \uXXXX escapes; printable ASCII maps to itself
_jsonmap.extend("\\u%04x" % x for x in range(32))
_jsonmap.extend(chr(x) for x in range(32, 127))
_jsonmap.append('\\u007f')
# short-form escapes defined by JSON
_jsonmap[0x09] = '\\t'
_jsonmap[0x0a] = '\\n'
_jsonmap[0x22] = '\\"'
_jsonmap[0x5c] = '\\\\'
_jsonmap[0x08] = '\\b'
_jsonmap[0x0c] = '\\f'
_jsonmap[0x0d] = '\\r'
# the paranoid map is copied *before* the high bytes are appended below:
# looking up a byte >= 0x80 in it raises IndexError, which pushes
# jsonescape() onto its slow path that \uXXXX-escapes all non-ASCII
_paranoidjsonmap = _jsonmap[:]
_paranoidjsonmap[0x3c] = '\\u003c' # '<' (e.g. escape "</script>")
_paranoidjsonmap[0x3e] = '\\u003e' # '>'
_jsonmap.extend(chr(x) for x in range(128, 256))
Matt Mackall
|
r22426 | |||
Yuya Nishihara
|
def jsonescape(s, paranoid=False):
    '''returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape('this is a test')
    'this is a test'
    >>> jsonescape('escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape('escape characters: \\t \\n \\r \\" \\\\')
    'escape characters: \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
    >>> jsonescape('a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape('utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape('')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape('a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape('<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    '''

    if paranoid:
        jm = _paranoidjsonmap
    else:
        jm = _jsonmap

    u8chars = toutf8b(s)
    try:
        # the paranoid map only covers bytes < 0x80, so any non-ASCII byte
        # raises IndexError here and falls through to the slow path
        return ''.join(jm[x] for x in bytearray(u8chars)) # fast path
    except IndexError:
        pass
    # non-BMP char is represented as UTF-16 surrogate pair
    u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
    u16codes.pop(0) # drop BOM
    return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)
Matt Mackall
|
r22426 | |||
Matt Mackall
|
# sequence length of a UTF-8 character, indexed by the lead byte's high
# nibble (0 marks a plain single-byte ASCII character)
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]

def getutf8char(s, pos):
    '''get the next full utf-8 character in the given string, starting at pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    '''
    nbytes = _utf8len[ord(s[pos]) >> 4]
    if nbytes == 0:
        # ASCII: the byte is its own one-byte character
        return s[pos]
    char = s[pos:pos + nbytes]
    char.decode("utf-8")  # validate; raises on malformed sequences
    return char
Matt Mackall
|
def toutf8b(s):
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can have
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    # fast path: 0xed is the lead byte of UTF-8-encoded surrogates, so a
    # string without it needs no re-escaping of existing U+DCxx sequences
    if "\xed" not in s:
        if isinstance(s, localstr):
            return s._utf8
        try:
            s.decode('utf-8')
            return s
        except UnicodeDecodeError:
            pass

    # slow path: walk the string character by character
    r = ""
    pos = 0
    l = len(s)
    while pos < l:
        try:
            c = getutf8char(s, pos)
            if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
                # have to re-escape existing U+DCxx characters
                c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
                pos += 1
            else:
                pos += len(c)
        except UnicodeDecodeError:
            # invalid byte: map it into the U+DCxx surrogate range
            c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
            pos += 1
        r += c
    return r
Matt Mackall
|
r16133 | |||
def fromutf8b(s):
    '''Given a UTF-8b string, return a local, possibly-binary string.

    return the original binary string. This
    is a round-trip process for strings like filenames, but metadata
    that was passed through tolocal will remain in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = "\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip("\\xc2\\xc2\\x80")
    True
    >>> roundtrip("\\xef\\xbf\\xbd")
    True
    >>> roundtrip("\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip("\\xf1\\x80\\x80\\x80\\x80")
    True
    '''

    # fast path - look for uDxxx prefixes in s
    if "\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    r = ""
    pos = 0
    l = len(s)
    while pos < l:
        c = getutf8char(s, pos)
        pos += len(c)
        # unescape U+DCxx characters
        if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
            # recover the original byte from the low 8 bits of the surrogate
            c = chr(ord(c.decode("utf-8")) & 0xff)
        r += c
    return r