##// END OF EJS Templates
encoding: fix typo in variable name...
Martin Geisler -
r13051:120eccaa default
parent child Browse files
Show More
@@ -1,146 +1,146 b''
1 # encoding.py - character transcoding support for Mercurial
1 # encoding.py - character transcoding support for Mercurial
2 #
2 #
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8 import error
8 import error
9 import unicodedata, locale, os
9 import unicodedata, locale, os
10
10
def _getpreferredencoding():
    '''
    On darwin, getpreferredencoding ignores the locale environment and
    always returns mac-roman. http://bugs.python.org/issue6202 fixes this
    for Python 2.7 and up. This is the same corrected code for earlier
    Python versions.

    However, we can't use a version check for this method, as some distributions
    patch Python to fix this. Instead, we use it as a 'fixer' for the mac-roman
    encoding, as it is unlikely that this encoding is the one actually expected.

    Returns the codeset name reported by nl_langinfo for the user's
    LC_CTYPE locale, or the encoding part of locale.getdefaultlocale()
    when the platform has no locale.CODESET. A locale.Error raised by
    setlocale is propagated to the caller.
    '''
    try:
        locale.CODESET
    except AttributeError:
        # Fall back to parsing environment variables :-(
        return locale.getdefaultlocale()[1]

    # Temporarily switch LC_CTYPE to the user's environment locale so
    # that nl_langinfo reports the codeset the user actually asked for.
    oldloc = locale.setlocale(locale.LC_CTYPE)
    try:
        locale.setlocale(locale.LC_CTYPE, "")
        result = locale.nl_langinfo(locale.CODESET)
    finally:
        # always put the previous locale back, even if the query failed
        locale.setlocale(locale.LC_CTYPE, oldloc)

    return result
34
34
# Maps encoding names reported by the platform to a callable returning
# the name that should be used instead.
_encodingfixers = {
    '646': lambda: 'ascii',
    'ANSI_X3.4-1968': lambda: 'ascii',
    'mac-roman': _getpreferredencoding
}

# The local encoding: HGENCODING wins when set; otherwise ask the locale
# module and run the answer through the fixers above. Any locale.Error
# along the way falls back to plain ascii.
try:
    encoding = os.environ.get("HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding() or 'ascii'
        # the default lambda is called immediately, so it simply hands
        # back the name just detected when no fixer is registered for it
        encoding = _encodingfixers.get(encoding, lambda: encoding)()
except locale.Error:
    encoding = 'ascii'
# error handling mode passed to decode() in fromlocal()
encodingmode = os.environ.get("HGENCODINGMODE", "strict")
# second encoding tried by tolocal() when strict UTF-8 decoding fails
fallbackencoding = 'ISO-8859-1'
50
50
class localstr(str):
    '''This class allows strings that are unmodified to be
    round-tripped to the local encoding and back

    The string value itself is the local form (l); the UTF-8 form (u)
    it was derived from is kept in the _utf8 attribute. Hashing uses
    the UTF-8 form, while equality is still the inherited str
    comparison on the local form.
    '''
    def __new__(cls, u, l):
        # the str payload is the local form; the UTF-8 original rides along
        s = str.__new__(cls, l)
        s._utf8 = u
        return s
    def __hash__(self):
        return hash(self._utf8) # avoid collisions in local string space
60
60
61 def tolocal(s):
61 def tolocal(s):
62 """
62 """
63 Convert a string from internal UTF-8 to local encoding
63 Convert a string from internal UTF-8 to local encoding
64
64
65 All internal strings should be UTF-8 but some repos before the
65 All internal strings should be UTF-8 but some repos before the
66 implementation of locale support may contain latin1 or possibly
66 implementation of locale support may contain latin1 or possibly
67 other character sets. We attempt to decode everything strictly
67 other character sets. We attempt to decode everything strictly
68 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
68 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
69 replace unknown characters.
69 replace unknown characters.
70
70
71 The localstr class is used to cache the known UTF-8 encoding of
71 The localstr class is used to cache the known UTF-8 encoding of
72 strings next to their local representation to allow lossless
72 strings next to their local representation to allow lossless
73 round-trip conversion back to UTF-8.
73 round-trip conversion back to UTF-8.
74
74
75 >>> u = 'foo: \\xc3\\xa4' # utf-8
75 >>> u = 'foo: \\xc3\\xa4' # utf-8
76 >>> l = tolocal(u)
76 >>> l = tolocal(u)
77 >>> l
77 >>> l
78 'foo: ?'
78 'foo: ?'
79 >>> fromlocal(l)
79 >>> fromlocal(l)
80 'foo: \\xc3\\xa4'
80 'foo: \\xc3\\xa4'
81 >>> u2 = 'foo: \\xc3\\xa1'
81 >>> u2 = 'foo: \\xc3\\xa1'
82 >>> d = { l: 1, tolocal(u2): 2 }
82 >>> d = { l: 1, tolocal(u2): 2 }
83 >>> d # no collision
83 >>> d # no collision
84 {'foo: ?': 1, 'foo: ?': 2}
84 {'foo: ?': 1, 'foo: ?': 2}
85 >>> 'foo: ?' in d
85 >>> 'foo: ?' in d
86 False
86 False
87 >>> l1 = 'foo: \\xe4' # historical latin1 fallback
87 >>> l1 = 'foo: \\xe4' # historical latin1 fallback
88 >>> l = tolocal(l1)
88 >>> l = tolocal(l1)
89 >>> l
89 >>> l
90 'foo: ?'
90 'foo: ?'
91 >>> fromlocal(l) # magically in utf-8
91 >>> fromlocal(l) # magically in utf-8
92 'foo: \\xc3\\xa4'
92 'foo: \\xc3\\xa4'
93 """
93 """
94
94
95 for e in ('UTF-8', fallbackencoding):
95 for e in ('UTF-8', fallbackencoding):
96 try:
96 try:
97 u = s.decode(e) # attempt strict decoding
97 u = s.decode(e) # attempt strict decoding
98 if u == 'UTF-8':
98 if e == 'UTF-8':
99 return localstr(s, u.encode(encoding, "replace"))
99 return localstr(s, u.encode(encoding, "replace"))
100 else:
100 else:
101 return localstr(u.encode('UTF-8'),
101 return localstr(u.encode('UTF-8'),
102 u.encode(encoding, "replace"))
102 u.encode(encoding, "replace"))
103 except LookupError, k:
103 except LookupError, k:
104 raise error.Abort("%s, please check your locale settings" % k)
104 raise error.Abort("%s, please check your locale settings" % k)
105 except UnicodeDecodeError:
105 except UnicodeDecodeError:
106 pass
106 pass
107 u = s.decode("utf-8", "replace") # last ditch
107 u = s.decode("utf-8", "replace") # last ditch
108 return u.encode(encoding, "replace") # can't round-trip
108 return u.encode(encoding, "replace") # can't round-trip
109
109
110 def fromlocal(s):
110 def fromlocal(s):
111 """
111 """
112 Convert a string from the local character encoding to UTF-8
112 Convert a string from the local character encoding to UTF-8
113
113
114 We attempt to decode strings using the encoding mode set by
114 We attempt to decode strings using the encoding mode set by
115 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
115 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
116 characters will cause an error message. Other modes include
116 characters will cause an error message. Other modes include
117 'replace', which replaces unknown characters with a special
117 'replace', which replaces unknown characters with a special
118 Unicode character, and 'ignore', which drops the character.
118 Unicode character, and 'ignore', which drops the character.
119 """
119 """
120
120
121 # can we do a lossless round-trip?
121 # can we do a lossless round-trip?
122 if isinstance(s, localstr):
122 if isinstance(s, localstr):
123 return s._utf8
123 return s._utf8
124
124
125 try:
125 try:
126 return s.decode(encoding, encodingmode).encode("utf-8")
126 return s.decode(encoding, encodingmode).encode("utf-8")
127 except UnicodeDecodeError, inst:
127 except UnicodeDecodeError, inst:
128 sub = s[max(0, inst.start - 10):inst.start + 10]
128 sub = s[max(0, inst.start - 10):inst.start + 10]
129 raise error.Abort("decoding near '%s': %s!" % (sub, inst))
129 raise error.Abort("decoding near '%s': %s!" % (sub, inst))
130 except LookupError, k:
130 except LookupError, k:
131 raise error.Abort("%s, please check your locale settings" % k)
131 raise error.Abort("%s, please check your locale settings" % k)
132
132
# How to treat ambiguous-width characters. Set HGENCODINGAMBIGUOUS to
# 'wide' to treat them as wide; the default is 'narrow'.
ambiguous = os.environ.get("HGENCODINGAMBIGUOUS", "narrow")
135
135
def colwidth(s):
    """Find the column width of a string for display

    The string is decoded using the local encoding (undecodable bytes
    are replaced). East Asian wide and fullwidth characters count as
    two columns, as do ambiguous-width characters when the module-level
    'ambiguous' setting is 'wide'; everything else counts as one.
    """
    d = s.decode(encoding, 'replace')
    # without east_asian_width every character counts as one column
    if hasattr(unicodedata, 'east_asian_width'):
        wide = "WF"
        if ambiguous == "wide":
            wide = "WFA"
        w = unicodedata.east_asian_width
        return sum([w(c) in wide and 2 or 1 for c in d])
    return len(d)
146
146
General Comments 0
You need to be logged in to leave comments. Login now