##// END OF EJS Templates
encoding: fix typo in variable name...
Martin Geisler -
r13051:120eccaa default
parent child Browse files
Show More
@@ -1,146 +1,146 b''
1 1 # encoding.py - character transcoding support for Mercurial
2 2 #
3 3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 import error
9 9 import unicodedata, locale, os
10 10
def _getpreferredencoding():
    '''
    Return the codeset of the user's locale, working around a darwin bug.

    On darwin, getpreferredencoding ignores the locale environment and
    always returns mac-roman. http://bugs.python.org/issue6202 fixes this
    for Python 2.7 and up. This is the same corrected code for earlier
    Python versions.

    However, we can't use a version check for this method, as some
    distributions patch Python to fix this. Instead, we use it as a 'fixer'
    for the mac-roman encoding, as it is unlikely that this encoding is
    the one actually expected.

    Returns the name reported by nl_langinfo(CODESET) under the
    environment's LC_CTYPE setting or, when the locale module has no
    CODESET attribute, the encoding element of locale.getdefaultlocale().
    The LC_CTYPE locale in effect on entry is restored before returning.
    '''
    try:
        locale.CODESET
    except AttributeError:
        # Fall back to parsing environment variables :-(
        return locale.getdefaultlocale()[1]

    # Temporarily adopt the environment's LC_CTYPE so that nl_langinfo
    # reports the user's codeset rather than that of the current locale.
    oldloc = locale.setlocale(locale.LC_CTYPE)
    try:
        locale.setlocale(locale.LC_CTYPE, "")
        return locale.nl_langinfo(locale.CODESET)
    finally:
        # The locale is process-wide state: put it back even if the
        # lookup above raised.
        locale.setlocale(locale.LC_CTYPE, oldloc)
34 34
# Encoding names that should not be used as reported, mapped to a callable
# returning the name to use instead: two aliases are normalised to 'ascii',
# and mac-roman is re-detected through _getpreferredencoding.
_encodingfixers = {
    '646': lambda: 'ascii',
    'ANSI_X3.4-1968': lambda: 'ascii',
    'mac-roman': _getpreferredencoding,
}

# Pick the local encoding: a non-empty HGENCODING wins, then the locale's
# preferred encoding, then plain ascii.
try:
    encoding = (os.environ.get("HGENCODING")
                or locale.getpreferredencoding()
                or 'ascii')
    if encoding in _encodingfixers:
        encoding = _encodingfixers[encoding]()
except locale.Error:
    # the locale machinery could not tell us anything useful
    encoding = 'ascii'

# Error handling mode handed to decode() by fromlocal.
encodingmode = os.environ.get("HGENCODINGMODE", "strict")

# Encoding tried by tolocal when a string is not valid UTF-8.
fallbackencoding = 'ISO-8859-1'
50 50
class localstr(str):
    '''A local-encoding string that remembers its original UTF-8 form.

    The str value is the local representation (l); the UTF-8 text it was
    derived from (u) is kept in the _utf8 attribute so that unmodified
    strings can be round-tripped to the local encoding and back.'''

    def __new__(cls, u, l):
        obj = str.__new__(cls, l)
        obj._utf8 = u
        return obj

    def __hash__(self):
        # Hash the UTF-8 form rather than the local one, to avoid
        # collisions in local string space.
        return hash(self._utf8)
60 60
def tolocal(s):
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    Raises error.Abort if a codec lookup fails (LookupError), asking the
    user to check their locale settings.

    >>> u = 'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = 'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> d # no collision
    {'foo: ?': 1, 'foo: ?': 2}
    >>> 'foo: ?' in d
    False
    >>> l1 = 'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    for e in ('UTF-8', fallbackencoding):
        try:
            u = s.decode(e) # attempt strict decoding
            # Branch on the codec that succeeded (e), not on the decoded
            # text (u).
            if e == 'UTF-8':
                # s is already valid UTF-8: cache it unchanged
                return localstr(s, u.encode(encoding, "replace"))
            else:
                # s was in the fallback encoding: cache its UTF-8 form
                return localstr(u.encode('UTF-8'),
                                u.encode(encoding, "replace"))
        except LookupError as k:
            raise error.Abort("%s, please check your locale settings" % k)
        except UnicodeDecodeError:
            # not decodable with this codec, try the next candidate
            pass
    # Neither codec decoded s strictly.
    u = s.decode("utf-8", "replace") # last ditch
    return u.encode(encoding, "replace") # can't round-trip
109 109
def fromlocal(s):
    """
    Convert a string from the local character encoding to UTF-8

    Strings are decoded with the error mode taken from HGENCODINGMODE
    (default 'strict'). In strict mode, unknown characters cause an
    error message. Other modes include 'replace', which replaces
    unknown characters with a special Unicode character, and 'ignore',
    which drops the character.

    A localstr is returned as its cached UTF-8 form without any
    decoding. Decoding errors and unknown codecs are reported by
    raising error.Abort.
    """

    if isinstance(s, localstr):
        # lossless round-trip: the UTF-8 original was kept alongside
        return s._utf8

    try:
        decoded = s.decode(encoding, encodingmode)
        return decoded.encode("utf-8")
    except UnicodeDecodeError as inst:
        # quote a few bytes either side of the offending position
        first = max(0, inst.start - 10)
        last = inst.start + 10
        raise error.Abort("decoding near '%s': %s!" % (s[first:last], inst))
    except LookupError as k:
        raise error.Abort("%s, please check your locale settings" % k)
132 132
# Width policy for characters whose East Asian Width is ambiguous ('A'):
# they count as one column unless HGENCODINGAMBIGUOUS is set to 'wide'.
ambiguous = os.environ.get("HGENCODINGAMBIGUOUS", "narrow")

def colwidth(s):
    """Return the number of display columns occupied by s.

    s is decoded using the local encoding (undecodable input is
    replaced). Characters whose East Asian Width class is W or F, and
    also A when ambiguous is "wide", count as two columns; every other
    character counts as one. If unicodedata has no east_asian_width,
    the number of decoded characters is returned instead.
    """
    text = s.decode(encoding, 'replace')
    if not hasattr(unicodedata, 'east_asian_width'):
        return len(text)

    if ambiguous == "wide":
        doublewidth = "WFA"
    else:
        doublewidth = "WF"

    widthclass = unicodedata.east_asian_width
    columns = 0
    for ch in text:
        if widthclass(ch) in doublewidth:
            columns += 2
        else:
            columns += 1
    return columns
146 146
General Comments 0
You need to be logged in to leave comments. Login now