##// END OF EJS Templates
encoding: introduce utf8-b helpers
Matt Mackall -
r16133:84c58da3 default
parent child Browse files
Show More
@@ -1,192 +1,269 b''
1 # encoding.py - character transcoding support for Mercurial
1 # encoding.py - character transcoding support for Mercurial
2 #
2 #
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8 import error
8 import error
9 import unicodedata, locale, os
9 import unicodedata, locale, os
10
10
def _getpreferredencoding():
    '''
    Return the codeset of the user's default locale.

    On darwin, getpreferredencoding ignores the locale environment and
    always returns mac-roman. http://bugs.python.org/issue6202 fixes this
    for Python 2.7 and up. This is the same corrected code for earlier
    Python versions.

    However, we can't use a version check for this method, as some distributions
    patch Python to fix this. Instead, we use it as a 'fixer' for the mac-roman
    encoding, as it is unlikely that this encoding is the actually expected.

    LC_CTYPE is switched to the user's default locale only while the
    codeset is queried; the previous setting is restored even when the
    query raises (locale.Error propagates to the caller).
    '''
    try:
        locale.CODESET
    except AttributeError:
        # Fall back to parsing environment variables :-(
        return locale.getdefaultlocale()[1]

    oldloc = locale.setlocale(locale.LC_CTYPE)
    try:
        locale.setlocale(locale.LC_CTYPE, "")
        return locale.nl_langinfo(locale.CODESET)
    finally:
        # never leave the process in a different LC_CTYPE than we found it
        locale.setlocale(locale.LC_CTYPE, oldloc)
34
34
# Answers from locale.getpreferredencoding() that need correcting, mapped
# to a callable that produces the encoding to use instead.
_encodingfixers = {
    '646': lambda: 'ascii',
    'ANSI_X3.4-1968': lambda: 'ascii',
    'mac-roman': _getpreferredencoding
}

# HGENCODING takes precedence; only when it is unset or empty is the locale
# consulted and its answer run through the fixer table.
encoding = os.environ.get("HGENCODING")
if not encoding:
    try:
        encoding = locale.getpreferredencoding() or 'ascii'
        encoding = _encodingfixers.get(encoding, lambda: encoding)()
    except locale.Error:
        encoding = 'ascii'
# error-handling mode passed to decode() when converting local strings
encodingmode = os.environ.get("HGENCODINGMODE", "strict")
# second encoding tolocal() tries when data is not valid UTF-8
fallbackencoding = 'ISO-8859-1'
50
50
class localstr(str):
    '''A local-encoding string that remembers the UTF-8 bytes it came from.

    The str value itself is the local representation; the UTF-8 form is
    kept in the _utf8 attribute so that unmodified strings can be
    round-tripped to the local encoding and back.'''
    def __new__(cls, u, l):
        # u is the UTF-8 form, l the local form that becomes the str value
        inst = str.__new__(cls, l)
        inst._utf8 = u
        return inst

    def __hash__(self):
        # hash the UTF-8 form to avoid collisions in local string space
        return hash(self._utf8)
60
60
61 def tolocal(s):
61 def tolocal(s):
62 """
62 """
63 Convert a string from internal UTF-8 to local encoding
63 Convert a string from internal UTF-8 to local encoding
64
64
65 All internal strings should be UTF-8 but some repos before the
65 All internal strings should be UTF-8 but some repos before the
66 implementation of locale support may contain latin1 or possibly
66 implementation of locale support may contain latin1 or possibly
67 other character sets. We attempt to decode everything strictly
67 other character sets. We attempt to decode everything strictly
68 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
68 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
69 replace unknown characters.
69 replace unknown characters.
70
70
71 The localstr class is used to cache the known UTF-8 encoding of
71 The localstr class is used to cache the known UTF-8 encoding of
72 strings next to their local representation to allow lossless
72 strings next to their local representation to allow lossless
73 round-trip conversion back to UTF-8.
73 round-trip conversion back to UTF-8.
74
74
75 >>> u = 'foo: \\xc3\\xa4' # utf-8
75 >>> u = 'foo: \\xc3\\xa4' # utf-8
76 >>> l = tolocal(u)
76 >>> l = tolocal(u)
77 >>> l
77 >>> l
78 'foo: ?'
78 'foo: ?'
79 >>> fromlocal(l)
79 >>> fromlocal(l)
80 'foo: \\xc3\\xa4'
80 'foo: \\xc3\\xa4'
81 >>> u2 = 'foo: \\xc3\\xa1'
81 >>> u2 = 'foo: \\xc3\\xa1'
82 >>> d = { l: 1, tolocal(u2): 2 }
82 >>> d = { l: 1, tolocal(u2): 2 }
83 >>> d # no collision
83 >>> d # no collision
84 {'foo: ?': 1, 'foo: ?': 2}
84 {'foo: ?': 1, 'foo: ?': 2}
85 >>> 'foo: ?' in d
85 >>> 'foo: ?' in d
86 False
86 False
87 >>> l1 = 'foo: \\xe4' # historical latin1 fallback
87 >>> l1 = 'foo: \\xe4' # historical latin1 fallback
88 >>> l = tolocal(l1)
88 >>> l = tolocal(l1)
89 >>> l
89 >>> l
90 'foo: ?'
90 'foo: ?'
91 >>> fromlocal(l) # magically in utf-8
91 >>> fromlocal(l) # magically in utf-8
92 'foo: \\xc3\\xa4'
92 'foo: \\xc3\\xa4'
93 """
93 """
94
94
95 for e in ('UTF-8', fallbackencoding):
95 for e in ('UTF-8', fallbackencoding):
96 try:
96 try:
97 u = s.decode(e) # attempt strict decoding
97 u = s.decode(e) # attempt strict decoding
98 r = u.encode(encoding, "replace")
98 r = u.encode(encoding, "replace")
99 if u == r.decode(encoding):
99 if u == r.decode(encoding):
100 # r is a safe, non-lossy encoding of s
100 # r is a safe, non-lossy encoding of s
101 return r
101 return r
102 elif e == 'UTF-8':
102 elif e == 'UTF-8':
103 return localstr(s, r)
103 return localstr(s, r)
104 else:
104 else:
105 return localstr(u.encode('UTF-8'), r)
105 return localstr(u.encode('UTF-8'), r)
106
106
107 except LookupError, k:
107 except LookupError, k:
108 raise error.Abort(k, hint="please check your locale settings")
108 raise error.Abort(k, hint="please check your locale settings")
109 except UnicodeDecodeError:
109 except UnicodeDecodeError:
110 pass
110 pass
111 u = s.decode("utf-8", "replace") # last ditch
111 u = s.decode("utf-8", "replace") # last ditch
112 return u.encode(encoding, "replace") # can't round-trip
112 return u.encode(encoding, "replace") # can't round-trip
113
113
114 def fromlocal(s):
114 def fromlocal(s):
115 """
115 """
116 Convert a string from the local character encoding to UTF-8
116 Convert a string from the local character encoding to UTF-8
117
117
118 We attempt to decode strings using the encoding mode set by
118 We attempt to decode strings using the encoding mode set by
119 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
119 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
120 characters will cause an error message. Other modes include
120 characters will cause an error message. Other modes include
121 'replace', which replaces unknown characters with a special
121 'replace', which replaces unknown characters with a special
122 Unicode character, and 'ignore', which drops the character.
122 Unicode character, and 'ignore', which drops the character.
123 """
123 """
124
124
125 # can we do a lossless round-trip?
125 # can we do a lossless round-trip?
126 if isinstance(s, localstr):
126 if isinstance(s, localstr):
127 return s._utf8
127 return s._utf8
128
128
129 try:
129 try:
130 return s.decode(encoding, encodingmode).encode("utf-8")
130 return s.decode(encoding, encodingmode).encode("utf-8")
131 except UnicodeDecodeError, inst:
131 except UnicodeDecodeError, inst:
132 sub = s[max(0, inst.start - 10):inst.start + 10]
132 sub = s[max(0, inst.start - 10):inst.start + 10]
133 raise error.Abort("decoding near '%s': %s!" % (sub, inst))
133 raise error.Abort("decoding near '%s': %s!" % (sub, inst))
134 except LookupError, k:
134 except LookupError, k:
135 raise error.Abort(k, hint="please check your locale settings")
135 raise error.Abort(k, hint="please check your locale settings")
136
136
# East Asian Width classes that ucolwidth() counts as two columns.
# Setting HGENCODINGAMBIGUOUS to 'wide' adds the ambiguous class 'A'.
if os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide":
    wide = "WFA"
else:
    wide = "WF"
140
140
def colwidth(s):
    "Return the display width, in columns, of local-encoding string s"
    # undecodable bytes are replaced rather than raising
    decoded = s.decode(encoding, 'replace')
    return ucolwidth(decoded)
144
144
def ucolwidth(d):
    "Return the display width, in columns, of Unicode string d"
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        # no width data in this Python: count one column per character
        return len(d)
    cols = 0
    for ch in d:
        if eaw(ch) in wide:
            cols += 2
        else:
            cols += 1
    return cols
151
151
def getcols(s, start, c):
    '''Use colwidth to find a c-column substring of s starting at byte
    index start

    Candidate slices s[start:x] are tried from x = start + c upwards and
    the first one whose display width is exactly c columns is returned.
    Returns None when no slice fits, e.g. when s is too short or a wide
    character straddles the requested boundary.'''
    # len(s) + 1 so that the slice reaching the very end of s is tried too;
    # stopping at len(s) made an exact-fit tail return None
    for x in xrange(start + c, len(s) + 1):
        t = s[start:x]
        if colwidth(t) == c:
            return t
159
159
160 def lower(s):
160 def lower(s):
161 "best-effort encoding-aware case-folding of local string s"
161 "best-effort encoding-aware case-folding of local string s"
162 try:
162 try:
163 if isinstance(s, localstr):
163 if isinstance(s, localstr):
164 u = s._utf8.decode("utf-8")
164 u = s._utf8.decode("utf-8")
165 else:
165 else:
166 u = s.decode(encoding, encodingmode)
166 u = s.decode(encoding, encodingmode)
167
167
168 lu = u.lower()
168 lu = u.lower()
169 if u == lu:
169 if u == lu:
170 return s # preserve localstring
170 return s # preserve localstring
171 return lu.encode(encoding)
171 return lu.encode(encoding)
172 except UnicodeError:
172 except UnicodeError:
173 return s.lower() # we don't know how to fold this except in ASCII
173 return s.lower() # we don't know how to fold this except in ASCII
174 except LookupError, k:
174 except LookupError, k:
175 raise error.Abort(k, hint="please check your locale settings")
175 raise error.Abort(k, hint="please check your locale settings")
176
176
177 def upper(s):
177 def upper(s):
178 "best-effort encoding-aware case-folding of local string s"
178 "best-effort encoding-aware case-folding of local string s"
179 try:
179 try:
180 if isinstance(s, localstr):
180 if isinstance(s, localstr):
181 u = s._utf8.decode("utf-8")
181 u = s._utf8.decode("utf-8")
182 else:
182 else:
183 u = s.decode(encoding, encodingmode)
183 u = s.decode(encoding, encodingmode)
184
184
185 uu = u.upper()
185 uu = u.upper()
186 if u == uu:
186 if u == uu:
187 return s # preserve localstring
187 return s # preserve localstring
188 return uu.encode(encoding)
188 return uu.encode(encoding)
189 except UnicodeError:
189 except UnicodeError:
190 return s.upper() # we don't know how to fold this except in ASCII
190 return s.upper() # we don't know how to fold this except in ASCII
191 except LookupError, k:
191 except LookupError, k:
192 raise error.Abort(k, hint="please check your locale settings")
192 raise error.Abort(k, hint="please check your locale settings")
193
194 def toutf8b(s):
195 '''convert a local, possibly-binary string into UTF-8b
196
197 This is intended as a generic method to preserve data when working
198 with schemes like JSON and XML that have no provision for
199 arbitrary byte strings. As Mercurial often doesn't know
200 what encoding data is in, we use so-called UTF-8b.
201
202 If a string is already valid UTF-8 (or ASCII), it passes unmodified.
203 Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
204 uDC00-uDCFF.
205
206 Principles of operation:
207
208 - ASCII and UTF-8 data sucessfully round-trips and is understood
209 by Unicode-oriented clients
210 - filenames and file contents in arbitrary other encodings can have
211 be round-tripped or recovered by clueful clients
212 - local strings that have a cached known UTF-8 encoding (aka
213 localstr) get sent as UTF-8 so Unicode-oriented clients get the
214 Unicode data they want
215 - because we must preserve UTF-8 bytestring in places such as
216 filenames, metadata can't be roundtripped without help
217
218 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
219 arbitrary bytes into an internal Unicode format that can be
220 re-encoded back into the original. Here we are exposing the
221 internal surrogate encoding as a UTF-8 string.)
222 '''
223
224 if isinstance(s, localstr):
225 return s._utf8
226
227 try:
228 if s.decode('utf-8'):
229 return s
230 except UnicodeDecodeError:
231 # surrogate-encode any characters that don't round-trip
232 s2 = s.decode('utf-8', 'ignore').encode('utf-8')
233 r = ""
234 pos = 0
235 for c in s:
236 if s2[pos:pos + 1] == c:
237 r += c
238 pos += 1
239 else:
240 r += unichr(0xdc00 + ord(c)).encode('utf-8')
241 return r
242
243 def fromutf8b(s):
244 '''Given a UTF-8b string, return a local, possibly-binary string.
245
246 return the original binary string. This
247 is a round-trip process for strings like filenames, but metadata
248 that's was passed through tolocal will remain in UTF-8.
249
250 >>> m = "\\xc3\\xa9\\x99abcd"
251 >>> n = toutf8b(m)
252 >>> n
253 '\\xc3\\xa9\\xed\\xb2\\x99abcd'
254 >>> fromutf8b(n) == m
255 True
256 '''
257
258 # fast path - look for uDxxx prefixes in s
259 if "\xed" not in s:
260 return s
261
262 u = s.decode("utf-8")
263 r = ""
264 for c in u:
265 if ord(c) & 0xff00 == 0xdc00:
266 r += chr(ord(c) & 0xff)
267 else:
268 r += c.encode("utf-8")
269 return r
General Comments 0
You need to be logged in to leave comments. Login now