##// END OF EJS Templates
encoding: backport paranoid escaping from templatefilters.jsonescape()...
Yuya Nishihara -
r28069:b2d24c28 default
parent child Browse files
Show More
@@ -1,568 +1,572 b''
1 # encoding.py - character transcoding support for Mercurial
1 # encoding.py - character transcoding support for Mercurial
2 #
2 #
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8 from __future__ import absolute_import
8 from __future__ import absolute_import
9
9
10 import array
10 import array
11 import locale
11 import locale
12 import os
12 import os
13 import unicodedata
13 import unicodedata
14
14
15 from . import (
15 from . import (
16 error,
16 error,
17 )
17 )
18
18
# Codepoints that HFS+ silently ignores in filenames (Apple Technote
# 1150, "Unicode Subtleties"), so we must ignore them too in some
# comparisons for sanity.
_ignore = [unichr(int(cp, 16)).encode("utf-8")
           for cp in ("200c 200d 200e 200f 202a 202b 202c 202d 202e "
                      "206a 206b 206c 206d 206e 206f feff").split()]
# verify the next function will work: every ignorable codepoint must
# encode to UTF-8 starting with one of exactly these two lead bytes
assert set(i[0] for i in _ignore) == set(["\xe2", "\xef"])
27
27
def hfsignoreclean(s):
    """Remove codepoints ignored by HFS+ from s.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # every ignorable codepoint's UTF-8 form starts with \xe2 or \xef
    # (asserted at module load), so anything lacking both bytes is clean
    if "\xe2" not in s and "\xef" not in s:
        return s
    for junk in _ignore:
        s = s.replace(junk, '')
    return s
40
40
def _getpreferredencoding():
    '''Return the encoding of the current locale, darwin-safe.

    On darwin, getpreferredencoding ignores the locale environment and
    always returns mac-roman. http://bugs.python.org/issue6202 fixes this
    for Python 2.7 and up. This is the same corrected code for earlier
    Python versions.

    However, we can't use a version check for this method, as some
    distributions patch Python to fix this. Instead, we use it as a
    'fixer' for the mac-roman encoding, as it is unlikely that this
    encoding is the actually expected one.
    '''
    if not hasattr(locale, 'CODESET'):
        # no nl_langinfo support; fall back to parsing environment
        # variables :-(
        return locale.getdefaultlocale()[1]

    saved = locale.setlocale(locale.LC_CTYPE)
    locale.setlocale(locale.LC_CTYPE, "")
    codeset = locale.nl_langinfo(locale.CODESET)
    locale.setlocale(locale.LC_CTYPE, saved)
    return codeset
64
64
# Map of locale names that are known to be misreported to a callable
# producing the encoding that should be used instead.
_encodingfixers = {
    '646': lambda: 'ascii',
    'ANSI_X3.4-1968': lambda: 'ascii',
    # darwin misreports mac-roman; resolve lazily (see _getpreferredencoding)
    'mac-roman': _getpreferredencoding
}

# Determine the local encoding: HGENCODING wins, then the locale's
# preferred encoding, then ascii as the last resort.
try:
    encoding = os.environ.get("HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding() or 'ascii'
    # apply a fixer if this encoding name is known to be unreliable
    encoding = _encodingfixers.get(encoding, lambda: encoding)()
except locale.Error:
    encoding = 'ascii'
# error-handling mode for decoding local strings: 'strict' (default),
# 'replace' or 'ignore' -- see fromlocal()
encodingmode = os.environ.get("HGENCODINGMODE", "strict")
# encoding tried for pre-locale-support repo data when UTF-8 fails
fallbackencoding = 'ISO-8859-1'
80
80
class localstr(str):
    '''A str subclass that remembers the UTF-8 spelling it was derived
    from, so that unmodified strings can be round-tripped between the
    local encoding and UTF-8 without loss.'''

    def __new__(cls, u, l):
        self = str.__new__(cls, l)
        # cache the canonical UTF-8 form alongside the local bytes
        self._utf8 = u
        return self

    def __hash__(self):
        # hash the UTF-8 form to avoid collisions in local string space
        return hash(self._utf8)
90
90
def tolocal(s):
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    >>> u = 'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = 'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> 'foo: ?' in d
    False
    >>> l1 = 'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    try:
        try:
            # make sure string is actually stored in UTF-8
            u = s.decode('UTF-8')
            if encoding == 'UTF-8':
                # fast path
                return s
            r = u.encode(encoding, "replace")
            if u == r.decode(encoding):
                # r is a safe, non-lossy encoding of s
                return r
            # lossy: wrap in localstr so the UTF-8 original is kept
            return localstr(s, r)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                u = s.decode(fallbackencoding)
                r = u.encode(encoding, "replace")
                if u == r.decode(encoding):
                    # r is a safe, non-lossy encoding of s
                    return r
                # cache the re-encoded UTF-8 form, not the raw input
                return localstr(u.encode('UTF-8'), r)
            except UnicodeDecodeError:
                u = s.decode("utf-8", "replace") # last ditch
                return u.encode(encoding, "replace") # can't round-trip
    except LookupError as k:
        # unknown encoding name anywhere above
        raise error.Abort(k, hint="please check your locale settings")
151
151
def fromlocal(s):
    """
    Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.
    """
    # localstr carries its original UTF-8 spelling: lossless fast path
    if isinstance(s, localstr):
        return s._utf8

    try:
        return s.decode(encoding, encodingmode).encode("utf-8")
    except UnicodeDecodeError as err:
        # show a small window of context around the offending byte
        snippet = s[max(0, err.start - 10):err.start + 10]
        raise error.Abort("decoding near '%s': %s!" % (snippet, err))
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
174
174
# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
# The value is the set of east_asian_width() classes counted as 2 columns.
wide = ("WFA" if os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
        else "WF")
178
178
def colwidth(s):
    """Return the number of display columns local-encoded string s occupies."""
    # undecodable bytes are 'replace'd, each counting as one column
    return ucolwidth(s.decode(encoding, 'replace'))
182
182
def ucolwidth(d):
    """Return the number of display columns unicode string d occupies."""
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        # no east-asian-width data available: assume one column per char
        return len(d)
    return sum(2 if eaw(c) in wide else 1 for c in d)
189
189
def getcols(s, start, c):
    '''Use colwidth to find a c-column substring of s starting at byte
    index start (implicitly None when no such substring exists)'''
    # a c-column substring needs at least c bytes, so begin there and
    # widen one byte at a time
    for end in xrange(start + c, len(s)):
        candidate = s[start:end]
        if colwidth(candidate) == c:
            return candidate
197
197
def trim(s, width, ellipsis='', leftside=False):
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    >>> ellipsis = '+++'
    >>> from . import encoding
    >>> encoding.encoding = 'utf-8'
    >>> t= '1234567890'
    >>> print trim(t, 12, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 10, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 8, ellipsis=ellipsis)
    12345+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++67890
    >>> print trim(t, 8)
    12345678
    >>> print trim(t, 8, leftside=True)
    34567890
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(encoding.encoding)
    >>> print trim(t, 12, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 10, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 8, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 5)
    \xe3\x81\x82\xe3\x81\x84
    >>> print trim(t, 5, leftside=True)
    \xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 4, ellipsis=ellipsis)
    +++
    >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
    +++
    >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
    >>> print trim(t, 12, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 10, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 8, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\x66\x77\x88\x99\xaa
    >>> print trim(t, 8)
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> print trim(t, 8, leftside=True)
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    """
    try:
        u = s.decode(encoding)
    except UnicodeDecodeError:
        # undecodable input: fall back to trimming by raw byte count,
        # treating each byte as one column
        if len(s) <= width: # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0: # no enough room even for ellipsis
            return ellipsis[:width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width: # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0: # no enough room even for ellipsis
        return ellipsis[:width + len(ellipsis)]

    # chop characters off the trimmed side until the rest fits
    if leftside:
        uslice = lambda i: u[i:]
        concat = lambda s: ellipsis + s
    else:
        uslice = lambda i: u[:-i]
        concat = lambda s: s + ellipsis
    for i in xrange(1, len(u)):
        usub = uslice(i)
        if ucolwidth(usub) <= width:
            return concat(usub.encode(encoding))
    return ellipsis # no enough room for multi-column characters
290
290
291 def _asciilower(s):
291 def _asciilower(s):
292 '''convert a string to lowercase if ASCII
292 '''convert a string to lowercase if ASCII
293
293
294 Raises UnicodeDecodeError if non-ASCII characters are found.'''
294 Raises UnicodeDecodeError if non-ASCII characters are found.'''
295 s.decode('ascii')
295 s.decode('ascii')
296 return s.lower()
296 return s.lower()
297
297
def asciilower(s):
    """Lowercase ASCII string s, self-selecting the fastest backend.

    The first call picks the C implementation from parsers when present
    and rebinds this module-level name to it, so later calls skip the
    lookup entirely.
    """
    # delay importing avoids cyclic dependency around "parsers" in
    # pure Python build (util => i18n => encoding => parsers => util)
    from . import parsers
    global asciilower
    asciilower = getattr(parsers, 'asciilower', _asciilower)
    return asciilower(s)
306
306
307 def _asciiupper(s):
307 def _asciiupper(s):
308 '''convert a string to uppercase if ASCII
308 '''convert a string to uppercase if ASCII
309
309
310 Raises UnicodeDecodeError if non-ASCII characters are found.'''
310 Raises UnicodeDecodeError if non-ASCII characters are found.'''
311 s.decode('ascii')
311 s.decode('ascii')
312 return s.upper()
312 return s.upper()
313
313
def asciiupper(s):
    """Uppercase ASCII string s, self-selecting the fastest backend.

    The first call picks the C implementation from parsers when present
    and rebinds this module-level name to it, so later calls skip the
    lookup entirely.
    """
    # delay importing avoids cyclic dependency around "parsers" in
    # pure Python build (util => i18n => encoding => parsers => util)
    from . import parsers
    global asciiupper
    asciiupper = getattr(parsers, 'asciiupper', _asciiupper)
    return asciiupper(s)
322
322
def lower(s):
    """Best-effort encoding-aware lower-casing of local string s."""
    try:
        return asciilower(s) # fast path for pure-ASCII strings
    except UnicodeDecodeError:
        pass
    try:
        if isinstance(s, localstr):
            uni = s._utf8.decode("utf-8")
        else:
            uni = s.decode(encoding, encodingmode)

        folded = uni.lower()
        if folded == uni:
            return s # preserve localstring
        return folded.encode(encoding)
    except UnicodeError:
        return s.lower() # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
343
343
def upper(s):
    """Best-effort encoding-aware upper-casing of local string s."""
    try:
        return asciiupper(s) # fast path for pure-ASCII strings
    except UnicodeDecodeError:
        # non-ASCII bytes present: take the unicode-aware path
        return upperfallback(s)
350
350
def upperfallback(s):
    """Unicode-aware upper-casing used when s is not pure ASCII."""
    try:
        if isinstance(s, localstr):
            uni = s._utf8.decode("utf-8")
        else:
            uni = s.decode(encoding, encodingmode)

        folded = uni.upper()
        if folded == uni:
            return s # preserve localstring
        return folded.encode(encoding)
    except UnicodeError:
        return s.upper() # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
366
366
class normcasespecs(object):
    '''Constants describing what a platform's normcase does to ASCII strings.

    This is specified per platform, and should be consistent with what
    normcase on that platform actually does.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h.'''
    other = 0
    lower = -1
    upper = 1
381
381
# Byte -> JSON-escaped-text tables, indexed by byte value.
_jsonmap = ['\\u%04x' % x for x in xrange(32)]   # control chars escaped
_jsonmap.extend(chr(x) for x in xrange(32, 127)) # printable ASCII as-is
_jsonmap.append('\\u007f')
# short escapes defined by the JSON grammar
_jsonmap[0x08] = '\\b'
_jsonmap[0x09] = '\\t'
_jsonmap[0x0a] = '\\n'
_jsonmap[0x0c] = '\\f'
_jsonmap[0x0d] = '\\r'
_jsonmap[0x22] = '\\"'
_jsonmap[0x5c] = '\\\\'
# The paranoid table also escapes characters troublesome in HTML. It is
# copied BEFORE the non-ASCII entries are appended below, so in paranoid
# mode any byte >= 0x80 misses the table and takes the \uXXXX slow path.
_paranoidjsonmap = _jsonmap[:]
_paranoidjsonmap[0x3c] = '\\u003c' # '<' (e.g. escape "</script>")
_paranoidjsonmap[0x3e] = '\\u003e' # '>'
_jsonmap.extend(chr(x) for x in xrange(128, 256))
395
397
def jsonescape(s, paranoid=False):
    '''returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape('this is a test')
    'this is a test'
    >>> jsonescape('escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape('escape characters: \\t \\n \\r \\" \\\\')
    'escape characters: \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
    >>> jsonescape('a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape('utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape('')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape('a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape('<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    '''

    if paranoid:
        jm = _paranoidjsonmap
    else:
        jm = _jsonmap

    u8chars = toutf8b(s)
    try:
        # the paranoid table stops at 0x7f, so any non-ASCII byte raises
        # IndexError and falls through to the \uXXXX path below
        return ''.join(jm[x] for x in bytearray(u8chars)) # fast path
    except IndexError:
        pass
    # non-BMP char is represented as UTF-16 surrogate pair
    u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
    u16codes.pop(0) # drop BOM
    return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)
449
453
# Length of a UTF-8 sequence, indexed by the lead byte's high nibble.
# 0 means plain ASCII; nibbles 0x8-0xb (continuation bytes) map to 1 so
# the one-byte decode in getutf8char() rejects them.
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]

def getutf8char(s, pos):
    '''get the next full utf-8 character in the given string, starting at pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    '''
    # how many bytes to attempt decoding, from the first nibble
    nbytes = _utf8len[ord(s[pos]) >> 4]
    if not nbytes:
        # ASCII byte: it is a complete character by itself
        return s[pos]

    char = s[pos:pos + nbytes]
    char.decode("utf-8") # validate; raises when malformed
    return char
468
472
def toutf8b(s):
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can have
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    # fast path: 0xed is the lead byte of every UTF-8-encoded U+Dxxx
    # surrogate, so if it is absent the string cannot already contain
    # escapes that would need re-escaping below
    if "\xed" not in s:
        if isinstance(s, localstr):
            # localstr carries a cached, known-good UTF-8 form
            return s._utf8
        try:
            # validate; if the whole string is valid UTF-8 it passes
            # through unmodified
            s.decode('utf-8')
            return s
        except UnicodeDecodeError:
            pass

    # slow path: walk the string one UTF-8 character at a time,
    # escaping each invalid byte into the U+DC00-U+DCFF range
    r = ""
    pos = 0
    l = len(s)
    while pos < l:
        try:
            c = getutf8char(s, pos)
            # "\xed\xb0\x80".."\xed\xb3\xbf" is the UTF-8 encoding of
            # U+DC00..U+DCFF, i.e. an escape produced by a previous pass
            if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
                # have to re-escape existing U+DCxx characters
                c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
                pos += 1
            else:
                pos += len(c)
        except UnicodeDecodeError:
            # invalid byte: map it to U+DC00 + byte value
            c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
            pos += 1
        r += c
    return r
525
529
def fromutf8b(s):
    '''Given a UTF-8b string, return a local, possibly-binary string.

    return the original binary string. This
    is a round-trip process for strings like filenames, but metadata
    that's was passed through tolocal will remain in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = "\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip("\\xc2\\xc2\\x80")
    True
    >>> roundtrip("\\xef\\xbf\\xbd")
    True
    >>> roundtrip("\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip("\\xf1\\x80\\x80\\x80\\x80")
    True
    '''

    # fast path - look for uDxxx prefixes in s
    if "\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    r = ""
    pos = 0
    l = len(s)
    while pos < l:
        c = getutf8char(s, pos)
        pos += len(c)
        # unescape U+DCxx characters
        # ("\xed\xb0\x80".."\xed\xb3\xbf" is UTF-8 for U+DC00..U+DCFF)
        if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
            # the low byte of the code point is the original byte:
            # U+DC00 + b was produced by toutf8b, so mask with 0xff
            c = chr(ord(c.decode("utf-8")) & 0xff)
        r += c
    return r
General Comments 0
You need to be logged in to leave comments. Login now