encoding.py
269 lines
| 8.7 KiB
| text/x-python
|
PythonLexer
/ mercurial / encoding.py
Martin Geisler
|
r8226 | # encoding.py - character transcoding support for Mercurial | ||
# | ||||
# Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others | ||||
# | ||||
# This software may be used and distributed according to the terms of the | ||||
Matt Mackall
|
r10263 | # GNU General Public License version 2 or any later version. | ||
Matt Mackall
|
r7948 | |||
Simon Heimberg
|
r8312 | import error | ||
Brodie Rao
|
r12062 | import unicodedata, locale, os | ||
Matt Mackall
|
r7948 | |||
Dan Villiom Podlaski Christiansen
|
r11892 | def _getpreferredencoding(): | ||
''' | ||||
On darwin, getpreferredencoding ignores the locale environment and | ||||
always returns mac-roman. http://bugs.python.org/issue6202 fixes this | ||||
for Python 2.7 and up. This is the same corrected code for earlier | ||||
Python versions. | ||||
Martin Geisler
|
r12770 | However, we can't use a version check for this method, as some distributions | ||
Dan Villiom Podlaski Christiansen
|
r11892 | patch Python to fix this. Instead, we use it as a 'fixer' for the mac-roman | ||
encoding, as it is unlikely that this encoding is the actually expected. | ||||
''' | ||||
try: | ||||
locale.CODESET | ||||
except AttributeError: | ||||
# Fall back to parsing environment variables :-( | ||||
return locale.getdefaultlocale()[1] | ||||
oldloc = locale.setlocale(locale.LC_CTYPE) | ||||
locale.setlocale(locale.LC_CTYPE, "") | ||||
result = locale.nl_langinfo(locale.CODESET) | ||||
locale.setlocale(locale.LC_CTYPE, oldloc) | ||||
return result | ||||
_encodingfixers = { | ||||
'646': lambda: 'ascii', | ||||
'ANSI_X3.4-1968': lambda: 'ascii', | ||||
'mac-roman': _getpreferredencoding | ||||
} | ||||
Matt Mackall
|
r7948 | |||
try: | ||||
encoding = os.environ.get("HGENCODING") | ||||
if not encoding: | ||||
encoding = locale.getpreferredencoding() or 'ascii' | ||||
Dan Villiom Podlaski Christiansen
|
r11892 | encoding = _encodingfixers.get(encoding, lambda: encoding)() | ||
Matt Mackall
|
r7948 | except locale.Error: | ||
encoding = 'ascii' | ||||
encodingmode = os.environ.get("HGENCODINGMODE", "strict") | ||||
fallbackencoding = 'ISO-8859-1' | ||||
Matt Mackall
|
r13046 | class localstr(str): | ||
'''This class allows strings that are unmodified to be | ||||
round-tripped to the local encoding and back''' | ||||
def __new__(cls, u, l): | ||||
s = str.__new__(cls, l) | ||||
s._utf8 = u | ||||
return s | ||||
def __hash__(self): | ||||
return hash(self._utf8) # avoid collisions in local string space | ||||
Matt Mackall
|
r7948 | def tolocal(s): | ||
""" | ||||
Convert a string from internal UTF-8 to local encoding | ||||
All internal strings should be UTF-8 but some repos before the | ||||
implementation of locale support may contain latin1 or possibly | ||||
other character sets. We attempt to decode everything strictly | ||||
using UTF-8, then Latin-1, and failing that, we use UTF-8 and | ||||
replace unknown characters. | ||||
Matt Mackall
|
r13046 | |||
The localstr class is used to cache the known UTF-8 encoding of | ||||
strings next to their local representation to allow lossless | ||||
round-trip conversion back to UTF-8. | ||||
>>> u = 'foo: \\xc3\\xa4' # utf-8 | ||||
>>> l = tolocal(u) | ||||
>>> l | ||||
'foo: ?' | ||||
>>> fromlocal(l) | ||||
'foo: \\xc3\\xa4' | ||||
>>> u2 = 'foo: \\xc3\\xa1' | ||||
>>> d = { l: 1, tolocal(u2): 2 } | ||||
>>> d # no collision | ||||
{'foo: ?': 1, 'foo: ?': 2} | ||||
>>> 'foo: ?' in d | ||||
False | ||||
>>> l1 = 'foo: \\xe4' # historical latin1 fallback | ||||
>>> l = tolocal(l1) | ||||
>>> l | ||||
'foo: ?' | ||||
>>> fromlocal(l) # magically in utf-8 | ||||
'foo: \\xc3\\xa4' | ||||
Matt Mackall
|
r7948 | """ | ||
Matt Mackall
|
r13046 | |||
Matt Mackall
|
r7948 | for e in ('UTF-8', fallbackencoding): | ||
try: | ||||
u = s.decode(e) # attempt strict decoding | ||||
Matt Mackall
|
r13940 | r = u.encode(encoding, "replace") | ||
if u == r.decode(encoding): | ||||
# r is a safe, non-lossy encoding of s | ||||
return r | ||||
elif e == 'UTF-8': | ||||
return localstr(s, r) | ||||
Matt Mackall
|
r13046 | else: | ||
Matt Mackall
|
r13940 | return localstr(u.encode('UTF-8'), r) | ||
Matt Mackall
|
r7948 | except LookupError, k: | ||
Mads Kiilerich
|
r15769 | raise error.Abort(k, hint="please check your locale settings") | ||
Matt Mackall
|
r7948 | except UnicodeDecodeError: | ||
pass | ||||
u = s.decode("utf-8", "replace") # last ditch | ||||
Matt Mackall
|
r13046 | return u.encode(encoding, "replace") # can't round-trip | ||
Matt Mackall
|
r7948 | |||
def fromlocal(s): | ||||
""" | ||||
Convert a string from the local character encoding to UTF-8 | ||||
We attempt to decode strings using the encoding mode set by | ||||
HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown | ||||
characters will cause an error message. Other modes include | ||||
'replace', which replaces unknown characters with a special | ||||
Unicode character, and 'ignore', which drops the character. | ||||
""" | ||||
Matt Mackall
|
r13046 | |||
# can we do a lossless round-trip? | ||||
if isinstance(s, localstr): | ||||
return s._utf8 | ||||
Matt Mackall
|
r7948 | try: | ||
return s.decode(encoding, encodingmode).encode("utf-8") | ||||
except UnicodeDecodeError, inst: | ||||
Matt Mackall
|
r10282 | sub = s[max(0, inst.start - 10):inst.start + 10] | ||
Matt Mackall
|
r7948 | raise error.Abort("decoding near '%s': %s!" % (sub, inst)) | ||
except LookupError, k: | ||||
Mads Kiilerich
|
r15769 | raise error.Abort(k, hint="please check your locale settings") | ||
Matt Mackall
|
r7948 | |||
Matt Mackall
|
r12866 | # How to treat ambiguous-width characters. Set to 'wide' to treat as wide. | ||
FUJIWARA Katsunori
|
r15066 | wide = (os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide" | ||
and "WFA" or "WF") | ||||
Matt Mackall
|
r12866 | |||
Matt Mackall
|
r7948 | def colwidth(s): | ||
Matt Mackall
|
r15142 | "Find the column width of a string for display in the local encoding" | ||
FUJIWARA Katsunori
|
r15066 | return ucolwidth(s.decode(encoding, 'replace')) | ||
def ucolwidth(d): | ||||
"Find the column width of a Unicode string for display" | ||||
Augie Fackler
|
r14951 | eaw = getattr(unicodedata, 'east_asian_width', None) | ||
if eaw is not None: | ||||
return sum([eaw(c) in wide and 2 or 1 for c in d]) | ||||
Matt Mackall
|
r7948 | return len(d) | ||
Matt Mackall
|
r15143 | def getcols(s, start, c): | ||
'''Use colwidth to find a c-column substring of s starting at byte | ||||
index start''' | ||||
for x in xrange(start + c, len(s)): | ||||
t = s[start:x] | ||||
if colwidth(t) == c: | ||||
return t | ||||
Matt Mackall
|
r14069 | def lower(s): | ||
"best-effort encoding-aware case-folding of local string s" | ||||
try: | ||||
if isinstance(s, localstr): | ||||
u = s._utf8.decode("utf-8") | ||||
else: | ||||
u = s.decode(encoding, encodingmode) | ||||
lu = u.lower() | ||||
if u == lu: | ||||
return s # preserve localstring | ||||
return lu.encode(encoding) | ||||
except UnicodeError: | ||||
return s.lower() # we don't know how to fold this except in ASCII | ||||
FUJIWARA Katsunori
|
r15672 | except LookupError, k: | ||
raise error.Abort(k, hint="please check your locale settings") | ||||
def upper(s): | ||||
"best-effort encoding-aware case-folding of local string s" | ||||
try: | ||||
if isinstance(s, localstr): | ||||
u = s._utf8.decode("utf-8") | ||||
else: | ||||
u = s.decode(encoding, encodingmode) | ||||
uu = u.upper() | ||||
if u == uu: | ||||
return s # preserve localstring | ||||
return uu.encode(encoding) | ||||
except UnicodeError: | ||||
return s.upper() # we don't know how to fold this except in ASCII | ||||
except LookupError, k: | ||||
raise error.Abort(k, hint="please check your locale settings") | ||||
Matt Mackall
|
r16133 | |||
def toutf8b(s): | ||||
'''convert a local, possibly-binary string into UTF-8b | ||||
This is intended as a generic method to preserve data when working | ||||
with schemes like JSON and XML that have no provision for | ||||
arbitrary byte strings. As Mercurial often doesn't know | ||||
what encoding data is in, we use so-called UTF-8b. | ||||
If a string is already valid UTF-8 (or ASCII), it passes unmodified. | ||||
Otherwise, unsupported bytes are mapped to UTF-16 surrogate range, | ||||
uDC00-uDCFF. | ||||
Principles of operation: | ||||
- ASCII and UTF-8 data sucessfully round-trips and is understood | ||||
by Unicode-oriented clients | ||||
- filenames and file contents in arbitrary other encodings can have | ||||
be round-tripped or recovered by clueful clients | ||||
- local strings that have a cached known UTF-8 encoding (aka | ||||
localstr) get sent as UTF-8 so Unicode-oriented clients get the | ||||
Unicode data they want | ||||
- because we must preserve UTF-8 bytestring in places such as | ||||
filenames, metadata can't be roundtripped without help | ||||
(Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and | ||||
arbitrary bytes into an internal Unicode format that can be | ||||
re-encoded back into the original. Here we are exposing the | ||||
internal surrogate encoding as a UTF-8 string.) | ||||
''' | ||||
if isinstance(s, localstr): | ||||
return s._utf8 | ||||
try: | ||||
if s.decode('utf-8'): | ||||
return s | ||||
except UnicodeDecodeError: | ||||
# surrogate-encode any characters that don't round-trip | ||||
s2 = s.decode('utf-8', 'ignore').encode('utf-8') | ||||
r = "" | ||||
pos = 0 | ||||
for c in s: | ||||
if s2[pos:pos + 1] == c: | ||||
r += c | ||||
pos += 1 | ||||
else: | ||||
r += unichr(0xdc00 + ord(c)).encode('utf-8') | ||||
return r | ||||
def fromutf8b(s): | ||||
'''Given a UTF-8b string, return a local, possibly-binary string. | ||||
return the original binary string. This | ||||
is a round-trip process for strings like filenames, but metadata | ||||
that's was passed through tolocal will remain in UTF-8. | ||||
>>> m = "\\xc3\\xa9\\x99abcd" | ||||
>>> n = toutf8b(m) | ||||
>>> n | ||||
'\\xc3\\xa9\\xed\\xb2\\x99abcd' | ||||
>>> fromutf8b(n) == m | ||||
True | ||||
''' | ||||
# fast path - look for uDxxx prefixes in s | ||||
if "\xed" not in s: | ||||
return s | ||||
u = s.decode("utf-8") | ||||
r = "" | ||||
for c in u: | ||||
if ord(c) & 0xff00 == 0xdc00: | ||||
r += chr(ord(c) & 0xff) | ||||
else: | ||||
r += c.encode("utf-8") | ||||
return r | ||||