##// END OF EJS Templates
encoding: use range() instead of xrange()...
Gregory Szorc -
r28508:3c6e94d0 default
parent child Browse files
Show More
@@ -1,579 +1,579
1 # encoding.py - character transcoding support for Mercurial
1 # encoding.py - character transcoding support for Mercurial
2 #
2 #
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8 from __future__ import absolute_import
8 from __future__ import absolute_import
9
9
10 import array
10 import array
11 import locale
11 import locale
12 import os
12 import os
13 import sys
13 import sys
14 import unicodedata
14 import unicodedata
15
15
16 from . import (
16 from . import (
17 error,
17 error,
18 )
18 )
19
19
# Python 3 has no unichr(); chr() covers the full codepoint range there.
if sys.version_info[0] >= 3:
    unichr = chr

# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
_ignore = [unichr(int(x, 16)).encode("utf-8") for x in
           "200c 200d 200e 200f 202a 202b 202c 202d 202e "
           "206a 206b 206c 206d 206e 206f feff".split()]
# verify the next function will work
if sys.version_info[0] >= 3:
    # indexing a bytes object yields ints on Python 3
    assert set(i[0] for i in _ignore) == set([ord(b'\xe2'), ord(b'\xef')])
else:
    assert set(i[0] for i in _ignore) == set(["\xe2", "\xef"])
34
34
def hfsignoreclean(s):
    """Remove codepoints ignored by HFS+ from s.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # every ignorable sequence in _ignore is UTF-8 starting with \xe2 or
    # \xef, so a cheap containment check lets most strings pass untouched
    if "\xe2" not in s and "\xef" not in s:
        return s
    for ignorable in _ignore:
        s = s.replace(ignorable, '')
    return s
47
47
def _getpreferredencoding():
    '''
    On darwin, getpreferredencoding ignores the locale environment and
    always returns mac-roman. http://bugs.python.org/issue6202 fixes this
    for Python 2.7 and up. This is the same corrected code for earlier
    Python versions.

    However, we can't use a version check for this method, as some distributions
    patch Python to fix this. Instead, we use it as a 'fixer' for the mac-roman
    encoding, as it is unlikely that this encoding is the actually expected.
    '''
    try:
        locale.CODESET
    except AttributeError:
        # Fall back to parsing environment variables :-(
        return locale.getdefaultlocale()[1]

    # temporarily switch LC_CTYPE to the user's locale to query its
    # codeset, then restore the previous setting
    oldloc = locale.setlocale(locale.LC_CTYPE)
    locale.setlocale(locale.LC_CTYPE, "")
    result = locale.nl_langinfo(locale.CODESET)
    locale.setlocale(locale.LC_CTYPE, oldloc)

    return result
71
71
# Replacements for encoding names the platform reports in unhelpful
# forms: '646' and 'ANSI_X3.4-1968' are C-library aliases for ASCII;
# mac-roman is corrected via _getpreferredencoding() (darwin bug).
_encodingfixers = {
    '646': lambda: 'ascii',
    'ANSI_X3.4-1968': lambda: 'ascii',
    'mac-roman': _getpreferredencoding
}

try:
    # HGENCODING overrides the locale-derived default
    encoding = os.environ.get("HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding() or 'ascii'
    encoding = _encodingfixers.get(encoding, lambda: encoding)()
except locale.Error:
    encoding = 'ascii'
# codec error handler used when decoding: 'strict', 'replace' or 'ignore'
encodingmode = os.environ.get("HGENCODINGMODE", "strict")
# legacy repos may contain latin-1 metadata; see tolocal()
fallbackencoding = 'ISO-8859-1'
87
87
class localstr(str):
    '''This class allows strings that are unmodified to be
    round-tripped to the local encoding and back'''
    def __new__(cls, u, l):
        # u: the known UTF-8 form, l: the local-encoding form; the str
        # value is the local form, the UTF-8 form rides along as _utf8
        s = str.__new__(cls, l)
        s._utf8 = u
        return s
    def __hash__(self):
        return hash(self._utf8) # avoid collisions in local string space
97
97
def tolocal(s):
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    >>> u = 'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = 'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> 'foo: ?' in d
    False
    >>> l1 = 'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    try:
        try:
            # make sure string is actually stored in UTF-8
            u = s.decode('UTF-8')
            if encoding == 'UTF-8':
                # fast path
                return s
            r = u.encode(encoding, "replace")
            if u == r.decode(encoding):
                # r is a safe, non-lossy encoding of s
                return r
            # lossy: keep the original UTF-8 cached on a localstr
            return localstr(s, r)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                u = s.decode(fallbackencoding)
                r = u.encode(encoding, "replace")
                if u == r.decode(encoding):
                    # r is a safe, non-lossy encoding of s
                    return r
                return localstr(u.encode('UTF-8'), r)
            except UnicodeDecodeError:
                u = s.decode("utf-8", "replace") # last ditch
                return u.encode(encoding, "replace") # can't round-trip
    except LookupError as k:
        # the configured encoding name is unknown to Python
        raise error.Abort(k, hint="please check your locale settings")
158
158
def fromlocal(s):
    """
    Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.
    """

    # can we do a lossless round-trip?
    if isinstance(s, localstr):
        return s._utf8

    try:
        return s.decode(encoding, encodingmode).encode("utf-8")
    except UnicodeDecodeError as inst:
        # include a little context around the offending byte in the error
        sub = s[max(0, inst.start - 10):inst.start + 10]
        raise error.Abort("decoding near '%s': %s!" % (sub, inst))
    except LookupError as k:
        # the configured encoding name is unknown to Python
        raise error.Abort(k, hint="please check your locale settings")
181
181
# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
# The value is the set of east_asian_width() categories that ucolwidth()
# counts as two columns: W(ide), F(ullwidth), and optionally A(mbiguous).
wide = (os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
        and "WFA" or "WF")
185
185
def colwidth(s):
    "Find the column width of a string for display in the local encoding"
    # decode with 'replace' so undecodable bytes still count one column each
    return ucolwidth(s.decode(encoding, 'replace'))
189
189
def ucolwidth(d):
    "Find the column width of a Unicode string for display"
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        # no east-asian-width data available; assume one column per char
        return len(d)
    # characters in a 'wide' category occupy two columns, all others one
    return sum(2 if eaw(ch) in wide else 1 for ch in d)
196
196
def getcols(s, start, c):
    '''Use colwidth to find a c-column substring of s starting at byte
    index start

    Returns None when no prefix of s[start:] occupies exactly c columns.
    '''
    # range() instead of the Python-2-only xrange(), matching the rest of
    # this module's range() migration; the list built on py2 is small
    for x in range(start + c, len(s)):
        t = s[start:x]
        if colwidth(t) == c:
            return t
204
204
def trim(s, width, ellipsis='', leftside=False):
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    >>> ellipsis = '+++'
    >>> from . import encoding
    >>> encoding.encoding = 'utf-8'
    >>> t= '1234567890'
    >>> print trim(t, 12, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 10, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 8, ellipsis=ellipsis)
    12345+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++67890
    >>> print trim(t, 8)
    12345678
    >>> print trim(t, 8, leftside=True)
    34567890
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(encoding.encoding)
    >>> print trim(t, 12, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 10, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 8, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 5)
    \xe3\x81\x82\xe3\x81\x84
    >>> print trim(t, 5, leftside=True)
    \xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 4, ellipsis=ellipsis)
    +++
    >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
    +++
    >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
    >>> print trim(t, 12, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 10, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 8, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\x66\x77\x88\x99\xaa
    >>> print trim(t, 8)
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> print trim(t, 8, leftside=True)
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    """
    try:
        u = s.decode(encoding)
    except UnicodeDecodeError:
        # undecodable input: fall back to trimming by byte count
        if len(s) <= width: # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0: # no enough room even for ellipsis
            return ellipsis[:width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width: # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0: # no enough room even for ellipsis
        return ellipsis[:width + len(ellipsis)]

    # drop characters from the trimmed side one at a time until the
    # remainder fits in the available columns
    if leftside:
        uslice = lambda i: u[i:]
        concat = lambda t: ellipsis + t
    else:
        uslice = lambda i: u[:-i]
        concat = lambda t: t + ellipsis
    # range() instead of the Python-2-only xrange(), matching the rest of
    # this module's range() migration
    for i in range(1, len(u)):
        usub = uslice(i)
        if ucolwidth(usub) <= width:
            return concat(usub.encode(encoding))
    return ellipsis # no enough room for multi-column characters
297
297
def _asciilower(s):
    '''convert a string to lowercase if ASCII

    Raises UnicodeDecodeError if non-ASCII characters are found.'''
    # the decode serves only as a validity check; its result is discarded
    s.decode('ascii')
    lowered = s.lower()
    return lowered
304
304
def asciilower(s):
    # delay importing avoids cyclic dependency around "parsers" in
    # pure Python build (util => i18n => encoding => parsers => util)
    from . import parsers
    # prefer the C implementation from parsers when available
    impl = getattr(parsers, 'asciilower', _asciilower)
    # rebind the module-level name so subsequent calls skip this lookup
    global asciilower
    asciilower = impl
    return impl(s)
313
313
def _asciiupper(s):
    '''convert a string to uppercase if ASCII

    Raises UnicodeDecodeError if non-ASCII characters are found.'''
    # the decode serves only as a validity check; its result is discarded
    s.decode('ascii')
    uppered = s.upper()
    return uppered
320
320
def asciiupper(s):
    # delay importing avoids cyclic dependency around "parsers" in
    # pure Python build (util => i18n => encoding => parsers => util)
    from . import parsers
    # prefer the C implementation from parsers when available
    impl = getattr(parsers, 'asciiupper', _asciiupper)
    # rebind the module-level name so subsequent calls skip this lookup
    global asciiupper
    asciiupper = impl
    return impl(s)
329
329
def lower(s):
    "best-effort encoding-aware case-folding of local string s"
    try:
        # fast path: pure-ASCII strings fold without decoding
        return asciilower(s)
    except UnicodeDecodeError:
        pass
    try:
        if isinstance(s, localstr):
            # use the cached lossless UTF-8 form
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(encoding, encodingmode)

        lu = u.lower()
        if u == lu:
            return s # preserve localstring
        return lu.encode(encoding)
    except UnicodeError:
        return s.lower() # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
350
350
def upper(s):
    "best-effort encoding-aware case-folding of local string s"
    try:
        # fast path: pure-ASCII strings fold without decoding
        return asciiupper(s)
    except UnicodeDecodeError:
        return upperfallback(s)
357
357
def upperfallback(s):
    # encoding-aware uppercasing for non-ASCII input; mirrors lower()
    try:
        if isinstance(s, localstr):
            # use the cached lossless UTF-8 form
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(encoding, encodingmode)

        uu = u.upper()
        if u == uu:
            return s # preserve localstring
        return uu.encode(encoding)
    except UnicodeError:
        return s.upper() # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
373
373
class normcasespecs(object):
    '''what a platform's normcase does to ASCII strings

    This is specified per platform, and should be consistent with what normcase
    on that platform actually does.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h.'''
    # integer values must match normcase_spec in util.h
    lower = -1
    upper = 1
    other = 0
388
388
# Table mapping each byte value to its JSON string form, indexed by byte.
_jsonmap = []
_jsonmap.extend("\\u%04x" % b for b in range(0x20))      # C0 controls
_jsonmap.extend(chr(b) for b in range(0x20, 0x7f))       # printable ASCII
_jsonmap.append('\\u007f')                               # DEL
# shorthand escapes override the generic \uXXXX forms above
_jsonmap[0x08] = '\\b'
_jsonmap[0x09] = '\\t'
_jsonmap[0x0a] = '\\n'
_jsonmap[0x0c] = '\\f'
_jsonmap[0x0d] = '\\r'
_jsonmap[0x22] = '\\"'
_jsonmap[0x5c] = '\\\\'
# the paranoid variant (128 entries only) additionally escapes characters
# that are troublesome in web output
_paranoidjsonmap = _jsonmap[:]
_paranoidjsonmap[0x3c] = '\\u003c' # '<' (e.g. escape "</script>")
_paranoidjsonmap[0x3e] = '\\u003e' # '>'
# high bytes pass through unescaped in the non-paranoid table
_jsonmap.extend(chr(b) for b in range(0x80, 0x100))
404
404
def jsonescape(s, paranoid=False):
    '''returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape('this is a test')
    'this is a test'
    >>> jsonescape('escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape('escape characters: \\t \\n \\r \\" \\\\')
    'escape characters: \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
    >>> jsonescape('a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape('utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape('')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape('a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape('<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    '''

    if paranoid:
        jm = _paranoidjsonmap
    else:
        jm = _jsonmap

    u8chars = toutf8b(s)
    try:
        return ''.join(jm[x] for x in bytearray(u8chars)) # fast path
    except IndexError:
        # bytes >= 128 with the (128-entry) paranoid map land here and
        # fall through to the \uXXXX-escaping path below
        pass
    # non-BMP char is represented as UTF-16 surrogate pair
    u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
    u16codes.pop(0) # drop BOM
    return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)
460
460
# utf-8 sequence length indexed by the high nibble of the lead byte:
# nibbles 0-7 are single-byte ASCII (0), 0x8-0xb are continuation bytes
# (the 1-byte "sequence" fails the validating decode in getutf8char),
# 0xc-0xd start 2-byte, 0xe 3-byte and 0xf 4-byte sequences
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]

def getutf8char(s, pos):
    '''get the next full utf-8 character in the given string, starting at pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    '''

    # find how many bytes to attempt decoding from first nibble
    l = _utf8len[ord(s[pos]) >> 4]
    if not l: # ascii
        return s[pos]

    c = s[pos:pos + l]
    # validate with attempted decode
    c.decode("utf-8")
    return c
479
479
def toutf8b(s):
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    if "\xed" not in s:
        # fast path: no surrogate-range lead byte present, so a string
        # that is already valid UTF-8 can be returned unmodified
        if isinstance(s, localstr):
            return s._utf8
        try:
            s.decode('utf-8')
            return s
        except UnicodeDecodeError:
            pass

    # collect chunks and join once at the end; repeated string
    # concatenation is worst-case quadratic on long inputs
    chunks = []
    pos = 0
    l = len(s)
    while pos < l:
        try:
            c = getutf8char(s, pos)
            if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
                # have to re-escape existing U+DCxx characters
                c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
                pos += 1
            else:
                pos += len(c)
        except UnicodeDecodeError:
            # invalid byte: map it into the U+DCxx surrogate range
            c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
            pos += 1
        chunks.append(c)
    return "".join(chunks)
536
536
def fromutf8b(s):
    '''Given a UTF-8b string, return a local, possibly-binary string.

    return the original binary string. This
    is a round-trip process for strings like filenames, but metadata
    that was passed through tolocal will remain in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = "\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip("\\xc2\\xc2\\x80")
    True
    >>> roundtrip("\\xef\\xbf\\xbd")
    True
    >>> roundtrip("\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip("\\xf1\\x80\\x80\\x80\\x80")
    True
    '''

    # fast path - look for uDxxx prefixes in s
    if "\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    # collect chunks and join once at the end; repeated string
    # concatenation is worst-case quadratic on long inputs
    chunks = []
    pos = 0
    l = len(s)
    while pos < l:
        c = getutf8char(s, pos)
        pos += len(c)
        # unescape U+DCxx characters back to the original byte
        if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
            c = chr(ord(c.decode("utf-8")) & 0xff)
        chunks.append(c)
    return "".join(chunks)
General Comments 0
You need to be logged in to leave comments. Login now