diff --git a/mercurial/encoding.py b/mercurial/encoding.py --- a/mercurial/encoding.py +++ b/mercurial/encoding.py @@ -48,6 +48,16 @@ except locale.Error: encodingmode = os.environ.get("HGENCODINGMODE", "strict") fallbackencoding = 'ISO-8859-1' +class localstr(str): + '''This class allows strings that are unmodified to be + round-tripped to the local encoding and back''' + def __new__(cls, u, l): + s = str.__new__(cls, l) + s._utf8 = u + return s + def __hash__(self): + return hash(self._utf8) # avoid collisions in local string space + def tolocal(s): """ Convert a string from internal UTF-8 to local encoding @@ -57,17 +67,45 @@ def tolocal(s): other character sets. We attempt to decode everything strictly using UTF-8, then Latin-1, and failing that, we use UTF-8 and replace unknown characters. + + The localstr class is used to cache the known UTF-8 encoding of + strings next to their local representation to allow lossless + round-trip conversion back to UTF-8. + + >>> u = 'foo: \\xc3\\xa4' # utf-8 + >>> l = tolocal(u) + >>> l + 'foo: ?' + >>> fromlocal(l) + 'foo: \\xc3\\xa4' + >>> u2 = 'foo: \\xc3\\xa1' + >>> d = { l: 1, tolocal(u2): 2 } + >>> d # no collision + {'foo: ?': 1, 'foo: ?': 2} + >>> 'foo: ?' in d + False + >>> l1 = 'foo: \\xe4' # historical latin1 fallback + >>> l = tolocal(l1) + >>> l + 'foo: ?' + >>> fromlocal(l) # magically in utf-8 + 'foo: \\xc3\\xa4' """ + for e in ('UTF-8', fallbackencoding): try: u = s.decode(e) # attempt strict decoding - return u.encode(encoding, "replace") + if u == 'UTF-8': + return localstr(s, u.encode(encoding, "replace")) + else: + return localstr(u.encode('UTF-8'), + u.encode(encoding, "replace")) except LookupError, k: raise error.Abort("%s, please check your locale settings" % k) except UnicodeDecodeError: pass u = s.decode("utf-8", "replace") # last ditch - return u.encode(encoding, "replace") + return u.encode(encoding, "replace") # can't round-trip def fromlocal(s): """ @@ -79,6 +117,11 @@ def fromlocal(s): 'replace', which replaces unknown characters with a special Unicode character, and 'ignore', which drops the character. """ + + # can we do a lossless round-trip? + if isinstance(s, localstr): + return s._utf8 + try: return s.decode(encoding, encodingmode).encode("utf-8") except UnicodeDecodeError, inst: diff --git a/tests/test-doctest.py b/tests/test-doctest.py --- a/tests/test-doctest.py +++ b/tests/test-doctest.py @@ -13,8 +13,8 @@ doctest.testmod(mercurial.dagparser, opt import mercurial.match doctest.testmod(mercurial.match) -import mercurial.url -doctest.testmod(mercurial.url) +import mercurial.encoding +doctest.testmod(mercurial.encoding) import hgext.convert.cvsps doctest.testmod(hgext.convert.cvsps)