##// END OF EJS Templates
encoding: add option to escape non-ascii characters in JSON...
Yuya Nishihara -
r28068:9ece901f default
parent child Browse files
Show More
@@ -1,540 +1,568 b''
1 # encoding.py - character transcoding support for Mercurial
1 # encoding.py - character transcoding support for Mercurial
2 #
2 #
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8 from __future__ import absolute_import
8 from __future__ import absolute_import
9
9
10 import array
10 import locale
11 import locale
11 import os
12 import os
12 import unicodedata
13 import unicodedata
13
14
14 from . import (
15 from . import (
15 error,
16 error,
16 )
17 )
17
18
18 # These unicode characters are ignored by HFS+ (Apple Technote 1150,
19 # These unicode characters are ignored by HFS+ (Apple Technote 1150,
19 # "Unicode Subtleties"), so we need to ignore them in some places for
20 # "Unicode Subtleties"), so we need to ignore them in some places for
20 # sanity.
21 # sanity.
21 _ignore = [unichr(int(x, 16)).encode("utf-8") for x in
22 _ignore = [unichr(int(x, 16)).encode("utf-8") for x in
22 "200c 200d 200e 200f 202a 202b 202c 202d 202e "
23 "200c 200d 200e 200f 202a 202b 202c 202d 202e "
23 "206a 206b 206c 206d 206e 206f feff".split()]
24 "206a 206b 206c 206d 206e 206f feff".split()]
24 # verify the next function will work
25 # verify the next function will work
25 assert set([i[0] for i in _ignore]) == set(["\xe2", "\xef"])
26 assert set([i[0] for i in _ignore]) == set(["\xe2", "\xef"])
26
27
27 def hfsignoreclean(s):
28 def hfsignoreclean(s):
28 """Remove codepoints ignored by HFS+ from s.
29 """Remove codepoints ignored by HFS+ from s.
29
30
30 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
31 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
31 '.hg'
32 '.hg'
32 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
33 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
33 '.hg'
34 '.hg'
34 """
35 """
35 if "\xe2" in s or "\xef" in s:
36 if "\xe2" in s or "\xef" in s:
36 for c in _ignore:
37 for c in _ignore:
37 s = s.replace(c, '')
38 s = s.replace(c, '')
38 return s
39 return s
39
40
40 def _getpreferredencoding():
41 def _getpreferredencoding():
41 '''
42 '''
42 On darwin, getpreferredencoding ignores the locale environment and
43 On darwin, getpreferredencoding ignores the locale environment and
43 always returns mac-roman. http://bugs.python.org/issue6202 fixes this
44 always returns mac-roman. http://bugs.python.org/issue6202 fixes this
44 for Python 2.7 and up. This is the same corrected code for earlier
45 for Python 2.7 and up. This is the same corrected code for earlier
45 Python versions.
46 Python versions.
46
47
47 However, we can't use a version check for this method, as some distributions
48 However, we can't use a version check for this method, as some distributions
48 patch Python to fix this. Instead, we use it as a 'fixer' for the mac-roman
49 patch Python to fix this. Instead, we use it as a 'fixer' for the mac-roman
49 encoding, as it is unlikely that this encoding is the one actually expected.
50 encoding, as it is unlikely that this encoding is the one actually expected.
50 '''
51 '''
51 try:
52 try:
52 locale.CODESET
53 locale.CODESET
53 except AttributeError:
54 except AttributeError:
54 # Fall back to parsing environment variables :-(
55 # Fall back to parsing environment variables :-(
55 return locale.getdefaultlocale()[1]
56 return locale.getdefaultlocale()[1]
56
57
57 oldloc = locale.setlocale(locale.LC_CTYPE)
58 oldloc = locale.setlocale(locale.LC_CTYPE)
58 locale.setlocale(locale.LC_CTYPE, "")
59 locale.setlocale(locale.LC_CTYPE, "")
59 result = locale.nl_langinfo(locale.CODESET)
60 result = locale.nl_langinfo(locale.CODESET)
60 locale.setlocale(locale.LC_CTYPE, oldloc)
61 locale.setlocale(locale.LC_CTYPE, oldloc)
61
62
62 return result
63 return result
63
64
64 _encodingfixers = {
65 _encodingfixers = {
65 '646': lambda: 'ascii',
66 '646': lambda: 'ascii',
66 'ANSI_X3.4-1968': lambda: 'ascii',
67 'ANSI_X3.4-1968': lambda: 'ascii',
67 'mac-roman': _getpreferredencoding
68 'mac-roman': _getpreferredencoding
68 }
69 }
69
70
70 try:
71 try:
71 encoding = os.environ.get("HGENCODING")
72 encoding = os.environ.get("HGENCODING")
72 if not encoding:
73 if not encoding:
73 encoding = locale.getpreferredencoding() or 'ascii'
74 encoding = locale.getpreferredencoding() or 'ascii'
74 encoding = _encodingfixers.get(encoding, lambda: encoding)()
75 encoding = _encodingfixers.get(encoding, lambda: encoding)()
75 except locale.Error:
76 except locale.Error:
76 encoding = 'ascii'
77 encoding = 'ascii'
77 encodingmode = os.environ.get("HGENCODINGMODE", "strict")
78 encodingmode = os.environ.get("HGENCODINGMODE", "strict")
78 fallbackencoding = 'ISO-8859-1'
79 fallbackencoding = 'ISO-8859-1'
79
80
80 class localstr(str):
81 class localstr(str):
81 '''This class allows strings that are unmodified to be
82 '''This class allows strings that are unmodified to be
82 round-tripped to the local encoding and back'''
83 round-tripped to the local encoding and back'''
83 def __new__(cls, u, l):
84 def __new__(cls, u, l):
84 s = str.__new__(cls, l)
85 s = str.__new__(cls, l)
85 s._utf8 = u
86 s._utf8 = u
86 return s
87 return s
87 def __hash__(self):
88 def __hash__(self):
88 return hash(self._utf8) # avoid collisions in local string space
89 return hash(self._utf8) # avoid collisions in local string space
89
90
90 def tolocal(s):
91 def tolocal(s):
91 """
92 """
92 Convert a string from internal UTF-8 to local encoding
93 Convert a string from internal UTF-8 to local encoding
93
94
94 All internal strings should be UTF-8 but some repos before the
95 All internal strings should be UTF-8 but some repos before the
95 implementation of locale support may contain latin1 or possibly
96 implementation of locale support may contain latin1 or possibly
96 other character sets. We attempt to decode everything strictly
97 other character sets. We attempt to decode everything strictly
97 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
98 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
98 replace unknown characters.
99 replace unknown characters.
99
100
100 The localstr class is used to cache the known UTF-8 encoding of
101 The localstr class is used to cache the known UTF-8 encoding of
101 strings next to their local representation to allow lossless
102 strings next to their local representation to allow lossless
102 round-trip conversion back to UTF-8.
103 round-trip conversion back to UTF-8.
103
104
104 >>> u = 'foo: \\xc3\\xa4' # utf-8
105 >>> u = 'foo: \\xc3\\xa4' # utf-8
105 >>> l = tolocal(u)
106 >>> l = tolocal(u)
106 >>> l
107 >>> l
107 'foo: ?'
108 'foo: ?'
108 >>> fromlocal(l)
109 >>> fromlocal(l)
109 'foo: \\xc3\\xa4'
110 'foo: \\xc3\\xa4'
110 >>> u2 = 'foo: \\xc3\\xa1'
111 >>> u2 = 'foo: \\xc3\\xa1'
111 >>> d = { l: 1, tolocal(u2): 2 }
112 >>> d = { l: 1, tolocal(u2): 2 }
112 >>> len(d) # no collision
113 >>> len(d) # no collision
113 2
114 2
114 >>> 'foo: ?' in d
115 >>> 'foo: ?' in d
115 False
116 False
116 >>> l1 = 'foo: \\xe4' # historical latin1 fallback
117 >>> l1 = 'foo: \\xe4' # historical latin1 fallback
117 >>> l = tolocal(l1)
118 >>> l = tolocal(l1)
118 >>> l
119 >>> l
119 'foo: ?'
120 'foo: ?'
120 >>> fromlocal(l) # magically in utf-8
121 >>> fromlocal(l) # magically in utf-8
121 'foo: \\xc3\\xa4'
122 'foo: \\xc3\\xa4'
122 """
123 """
123
124
124 try:
125 try:
125 try:
126 try:
126 # make sure string is actually stored in UTF-8
127 # make sure string is actually stored in UTF-8
127 u = s.decode('UTF-8')
128 u = s.decode('UTF-8')
128 if encoding == 'UTF-8':
129 if encoding == 'UTF-8':
129 # fast path
130 # fast path
130 return s
131 return s
131 r = u.encode(encoding, "replace")
132 r = u.encode(encoding, "replace")
132 if u == r.decode(encoding):
133 if u == r.decode(encoding):
133 # r is a safe, non-lossy encoding of s
134 # r is a safe, non-lossy encoding of s
134 return r
135 return r
135 return localstr(s, r)
136 return localstr(s, r)
136 except UnicodeDecodeError:
137 except UnicodeDecodeError:
137 # we should only get here if we're looking at an ancient changeset
138 # we should only get here if we're looking at an ancient changeset
138 try:
139 try:
139 u = s.decode(fallbackencoding)
140 u = s.decode(fallbackencoding)
140 r = u.encode(encoding, "replace")
141 r = u.encode(encoding, "replace")
141 if u == r.decode(encoding):
142 if u == r.decode(encoding):
142 # r is a safe, non-lossy encoding of s
143 # r is a safe, non-lossy encoding of s
143 return r
144 return r
144 return localstr(u.encode('UTF-8'), r)
145 return localstr(u.encode('UTF-8'), r)
145 except UnicodeDecodeError:
146 except UnicodeDecodeError:
146 u = s.decode("utf-8", "replace") # last ditch
147 u = s.decode("utf-8", "replace") # last ditch
147 return u.encode(encoding, "replace") # can't round-trip
148 return u.encode(encoding, "replace") # can't round-trip
148 except LookupError as k:
149 except LookupError as k:
149 raise error.Abort(k, hint="please check your locale settings")
150 raise error.Abort(k, hint="please check your locale settings")
150
151
151 def fromlocal(s):
152 def fromlocal(s):
152 """
153 """
153 Convert a string from the local character encoding to UTF-8
154 Convert a string from the local character encoding to UTF-8
154
155
155 We attempt to decode strings using the encoding mode set by
156 We attempt to decode strings using the encoding mode set by
156 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
157 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
157 characters will cause an error message. Other modes include
158 characters will cause an error message. Other modes include
158 'replace', which replaces unknown characters with a special
159 'replace', which replaces unknown characters with a special
159 Unicode character, and 'ignore', which drops the character.
160 Unicode character, and 'ignore', which drops the character.
160 """
161 """
161
162
162 # can we do a lossless round-trip?
163 # can we do a lossless round-trip?
163 if isinstance(s, localstr):
164 if isinstance(s, localstr):
164 return s._utf8
165 return s._utf8
165
166
166 try:
167 try:
167 return s.decode(encoding, encodingmode).encode("utf-8")
168 return s.decode(encoding, encodingmode).encode("utf-8")
168 except UnicodeDecodeError as inst:
169 except UnicodeDecodeError as inst:
169 sub = s[max(0, inst.start - 10):inst.start + 10]
170 sub = s[max(0, inst.start - 10):inst.start + 10]
170 raise error.Abort("decoding near '%s': %s!" % (sub, inst))
171 raise error.Abort("decoding near '%s': %s!" % (sub, inst))
171 except LookupError as k:
172 except LookupError as k:
172 raise error.Abort(k, hint="please check your locale settings")
173 raise error.Abort(k, hint="please check your locale settings")
173
174
174 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
175 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
175 wide = (os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
176 wide = (os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
176 and "WFA" or "WF")
177 and "WFA" or "WF")
177
178
178 def colwidth(s):
179 def colwidth(s):
179 "Find the column width of a string for display in the local encoding"
180 "Find the column width of a string for display in the local encoding"
180 return ucolwidth(s.decode(encoding, 'replace'))
181 return ucolwidth(s.decode(encoding, 'replace'))
181
182
182 def ucolwidth(d):
183 def ucolwidth(d):
183 "Find the column width of a Unicode string for display"
184 "Find the column width of a Unicode string for display"
184 eaw = getattr(unicodedata, 'east_asian_width', None)
185 eaw = getattr(unicodedata, 'east_asian_width', None)
185 if eaw is not None:
186 if eaw is not None:
186 return sum([eaw(c) in wide and 2 or 1 for c in d])
187 return sum([eaw(c) in wide and 2 or 1 for c in d])
187 return len(d)
188 return len(d)
188
189
189 def getcols(s, start, c):
190 def getcols(s, start, c):
190 '''Use colwidth to find a c-column substring of s starting at byte
191 '''Use colwidth to find a c-column substring of s starting at byte
191 index start'''
192 index start'''
192 for x in xrange(start + c, len(s)):
193 for x in xrange(start + c, len(s)):
193 t = s[start:x]
194 t = s[start:x]
194 if colwidth(t) == c:
195 if colwidth(t) == c:
195 return t
196 return t
196
197
197 def trim(s, width, ellipsis='', leftside=False):
198 def trim(s, width, ellipsis='', leftside=False):
198 """Trim string 's' to at most 'width' columns (including 'ellipsis').
199 """Trim string 's' to at most 'width' columns (including 'ellipsis').
199
200
200 If 'leftside' is True, left side of string 's' is trimmed.
201 If 'leftside' is True, left side of string 's' is trimmed.
201 'ellipsis' is always placed at trimmed side.
202 'ellipsis' is always placed at trimmed side.
202
203
203 >>> ellipsis = '+++'
204 >>> ellipsis = '+++'
204 >>> from . import encoding
205 >>> from . import encoding
205 >>> encoding.encoding = 'utf-8'
206 >>> encoding.encoding = 'utf-8'
206 >>> t= '1234567890'
207 >>> t= '1234567890'
207 >>> print trim(t, 12, ellipsis=ellipsis)
208 >>> print trim(t, 12, ellipsis=ellipsis)
208 1234567890
209 1234567890
209 >>> print trim(t, 10, ellipsis=ellipsis)
210 >>> print trim(t, 10, ellipsis=ellipsis)
210 1234567890
211 1234567890
211 >>> print trim(t, 8, ellipsis=ellipsis)
212 >>> print trim(t, 8, ellipsis=ellipsis)
212 12345+++
213 12345+++
213 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
214 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
214 +++67890
215 +++67890
215 >>> print trim(t, 8)
216 >>> print trim(t, 8)
216 12345678
217 12345678
217 >>> print trim(t, 8, leftside=True)
218 >>> print trim(t, 8, leftside=True)
218 34567890
219 34567890
219 >>> print trim(t, 3, ellipsis=ellipsis)
220 >>> print trim(t, 3, ellipsis=ellipsis)
220 +++
221 +++
221 >>> print trim(t, 1, ellipsis=ellipsis)
222 >>> print trim(t, 1, ellipsis=ellipsis)
222 +
223 +
223 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
224 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
224 >>> t = u.encode(encoding.encoding)
225 >>> t = u.encode(encoding.encoding)
225 >>> print trim(t, 12, ellipsis=ellipsis)
226 >>> print trim(t, 12, ellipsis=ellipsis)
226 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
227 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
227 >>> print trim(t, 10, ellipsis=ellipsis)
228 >>> print trim(t, 10, ellipsis=ellipsis)
228 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
229 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
229 >>> print trim(t, 8, ellipsis=ellipsis)
230 >>> print trim(t, 8, ellipsis=ellipsis)
230 \xe3\x81\x82\xe3\x81\x84+++
231 \xe3\x81\x82\xe3\x81\x84+++
231 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
232 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
232 +++\xe3\x81\x88\xe3\x81\x8a
233 +++\xe3\x81\x88\xe3\x81\x8a
233 >>> print trim(t, 5)
234 >>> print trim(t, 5)
234 \xe3\x81\x82\xe3\x81\x84
235 \xe3\x81\x82\xe3\x81\x84
235 >>> print trim(t, 5, leftside=True)
236 >>> print trim(t, 5, leftside=True)
236 \xe3\x81\x88\xe3\x81\x8a
237 \xe3\x81\x88\xe3\x81\x8a
237 >>> print trim(t, 4, ellipsis=ellipsis)
238 >>> print trim(t, 4, ellipsis=ellipsis)
238 +++
239 +++
239 >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
240 >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
240 +++
241 +++
241 >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
242 >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
242 >>> print trim(t, 12, ellipsis=ellipsis)
243 >>> print trim(t, 12, ellipsis=ellipsis)
243 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
244 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
244 >>> print trim(t, 10, ellipsis=ellipsis)
245 >>> print trim(t, 10, ellipsis=ellipsis)
245 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
246 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
246 >>> print trim(t, 8, ellipsis=ellipsis)
247 >>> print trim(t, 8, ellipsis=ellipsis)
247 \x11\x22\x33\x44\x55+++
248 \x11\x22\x33\x44\x55+++
248 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
249 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
249 +++\x66\x77\x88\x99\xaa
250 +++\x66\x77\x88\x99\xaa
250 >>> print trim(t, 8)
251 >>> print trim(t, 8)
251 \x11\x22\x33\x44\x55\x66\x77\x88
252 \x11\x22\x33\x44\x55\x66\x77\x88
252 >>> print trim(t, 8, leftside=True)
253 >>> print trim(t, 8, leftside=True)
253 \x33\x44\x55\x66\x77\x88\x99\xaa
254 \x33\x44\x55\x66\x77\x88\x99\xaa
254 >>> print trim(t, 3, ellipsis=ellipsis)
255 >>> print trim(t, 3, ellipsis=ellipsis)
255 +++
256 +++
256 >>> print trim(t, 1, ellipsis=ellipsis)
257 >>> print trim(t, 1, ellipsis=ellipsis)
257 +
258 +
258 """
259 """
259 try:
260 try:
260 u = s.decode(encoding)
261 u = s.decode(encoding)
261 except UnicodeDecodeError:
262 except UnicodeDecodeError:
262 if len(s) <= width: # trimming is not needed
263 if len(s) <= width: # trimming is not needed
263 return s
264 return s
264 width -= len(ellipsis)
265 width -= len(ellipsis)
265 if width <= 0: # not enough room even for ellipsis
266 if width <= 0: # not enough room even for ellipsis
266 return ellipsis[:width + len(ellipsis)]
267 return ellipsis[:width + len(ellipsis)]
267 if leftside:
268 if leftside:
268 return ellipsis + s[-width:]
269 return ellipsis + s[-width:]
269 return s[:width] + ellipsis
270 return s[:width] + ellipsis
270
271
271 if ucolwidth(u) <= width: # trimming is not needed
272 if ucolwidth(u) <= width: # trimming is not needed
272 return s
273 return s
273
274
274 width -= len(ellipsis)
275 width -= len(ellipsis)
275 if width <= 0: # not enough room even for ellipsis
276 if width <= 0: # not enough room even for ellipsis
276 return ellipsis[:width + len(ellipsis)]
277 return ellipsis[:width + len(ellipsis)]
277
278
278 if leftside:
279 if leftside:
279 uslice = lambda i: u[i:]
280 uslice = lambda i: u[i:]
280 concat = lambda s: ellipsis + s
281 concat = lambda s: ellipsis + s
281 else:
282 else:
282 uslice = lambda i: u[:-i]
283 uslice = lambda i: u[:-i]
283 concat = lambda s: s + ellipsis
284 concat = lambda s: s + ellipsis
284 for i in xrange(1, len(u)):
285 for i in xrange(1, len(u)):
285 usub = uslice(i)
286 usub = uslice(i)
286 if ucolwidth(usub) <= width:
287 if ucolwidth(usub) <= width:
287 return concat(usub.encode(encoding))
288 return concat(usub.encode(encoding))
288 return ellipsis # not enough room for multi-column characters
289 return ellipsis # not enough room for multi-column characters
289
290
290 def _asciilower(s):
291 def _asciilower(s):
291 '''convert a string to lowercase if ASCII
292 '''convert a string to lowercase if ASCII
292
293
293 Raises UnicodeDecodeError if non-ASCII characters are found.'''
294 Raises UnicodeDecodeError if non-ASCII characters are found.'''
294 s.decode('ascii')
295 s.decode('ascii')
295 return s.lower()
296 return s.lower()
296
297
297 def asciilower(s):
298 def asciilower(s):
298 # delaying the import avoids a cyclic dependency around "parsers" in
299 # delaying the import avoids a cyclic dependency around "parsers" in
299 # pure Python build (util => i18n => encoding => parsers => util)
300 # pure Python build (util => i18n => encoding => parsers => util)
300 from . import parsers
301 from . import parsers
301 impl = getattr(parsers, 'asciilower', _asciilower)
302 impl = getattr(parsers, 'asciilower', _asciilower)
302 global asciilower
303 global asciilower
303 asciilower = impl
304 asciilower = impl
304 return impl(s)
305 return impl(s)
305
306
306 def _asciiupper(s):
307 def _asciiupper(s):
307 '''convert a string to uppercase if ASCII
308 '''convert a string to uppercase if ASCII
308
309
309 Raises UnicodeDecodeError if non-ASCII characters are found.'''
310 Raises UnicodeDecodeError if non-ASCII characters are found.'''
310 s.decode('ascii')
311 s.decode('ascii')
311 return s.upper()
312 return s.upper()
312
313
313 def asciiupper(s):
314 def asciiupper(s):
314 # delaying the import avoids a cyclic dependency around "parsers" in
315 # delaying the import avoids a cyclic dependency around "parsers" in
315 # pure Python build (util => i18n => encoding => parsers => util)
316 # pure Python build (util => i18n => encoding => parsers => util)
316 from . import parsers
317 from . import parsers
317 impl = getattr(parsers, 'asciiupper', _asciiupper)
318 impl = getattr(parsers, 'asciiupper', _asciiupper)
318 global asciiupper
319 global asciiupper
319 asciiupper = impl
320 asciiupper = impl
320 return impl(s)
321 return impl(s)
321
322
322 def lower(s):
323 def lower(s):
323 "best-effort encoding-aware case-folding of local string s"
324 "best-effort encoding-aware case-folding of local string s"
324 try:
325 try:
325 return asciilower(s)
326 return asciilower(s)
326 except UnicodeDecodeError:
327 except UnicodeDecodeError:
327 pass
328 pass
328 try:
329 try:
329 if isinstance(s, localstr):
330 if isinstance(s, localstr):
330 u = s._utf8.decode("utf-8")
331 u = s._utf8.decode("utf-8")
331 else:
332 else:
332 u = s.decode(encoding, encodingmode)
333 u = s.decode(encoding, encodingmode)
333
334
334 lu = u.lower()
335 lu = u.lower()
335 if u == lu:
336 if u == lu:
336 return s # preserve localstring
337 return s # preserve localstring
337 return lu.encode(encoding)
338 return lu.encode(encoding)
338 except UnicodeError:
339 except UnicodeError:
339 return s.lower() # we don't know how to fold this except in ASCII
340 return s.lower() # we don't know how to fold this except in ASCII
340 except LookupError as k:
341 except LookupError as k:
341 raise error.Abort(k, hint="please check your locale settings")
342 raise error.Abort(k, hint="please check your locale settings")
342
343
343 def upper(s):
344 def upper(s):
344 "best-effort encoding-aware case-folding of local string s"
345 "best-effort encoding-aware case-folding of local string s"
345 try:
346 try:
346 return asciiupper(s)
347 return asciiupper(s)
347 except UnicodeDecodeError:
348 except UnicodeDecodeError:
348 return upperfallback(s)
349 return upperfallback(s)
349
350
350 def upperfallback(s):
351 def upperfallback(s):
351 try:
352 try:
352 if isinstance(s, localstr):
353 if isinstance(s, localstr):
353 u = s._utf8.decode("utf-8")
354 u = s._utf8.decode("utf-8")
354 else:
355 else:
355 u = s.decode(encoding, encodingmode)
356 u = s.decode(encoding, encodingmode)
356
357
357 uu = u.upper()
358 uu = u.upper()
358 if u == uu:
359 if u == uu:
359 return s # preserve localstring
360 return s # preserve localstring
360 return uu.encode(encoding)
361 return uu.encode(encoding)
361 except UnicodeError:
362 except UnicodeError:
362 return s.upper() # we don't know how to fold this except in ASCII
363 return s.upper() # we don't know how to fold this except in ASCII
363 except LookupError as k:
364 except LookupError as k:
364 raise error.Abort(k, hint="please check your locale settings")
365 raise error.Abort(k, hint="please check your locale settings")
365
366
366 class normcasespecs(object):
367 class normcasespecs(object):
367 '''what a platform's normcase does to ASCII strings
368 '''what a platform's normcase does to ASCII strings
368
369
369 This is specified per platform, and should be consistent with what normcase
370 This is specified per platform, and should be consistent with what normcase
370 on that platform actually does.
371 on that platform actually does.
371
372
372 lower: normcase lowercases ASCII strings
373 lower: normcase lowercases ASCII strings
373 upper: normcase uppercases ASCII strings
374 upper: normcase uppercases ASCII strings
374 other: the fallback function should always be called
375 other: the fallback function should always be called
375
376
376 This should be kept in sync with normcase_spec in util.h.'''
377 This should be kept in sync with normcase_spec in util.h.'''
377 lower = -1
378 lower = -1
378 upper = 1
379 upper = 1
379 other = 0
380 other = 0
380
381
381 _jsonmap = []
382 _jsonmap = []
382 _jsonmap.extend("\\u%04x" % x for x in xrange(32))
383 _jsonmap.extend("\\u%04x" % x for x in xrange(32))
383 _jsonmap.extend(chr(x) for x in xrange(32, 256))
384 _jsonmap.extend(chr(x) for x in xrange(32, 127))
384 _jsonmap[0x7f] = '\\u007f'
385 _jsonmap.append('\\u007f')
385 _jsonmap[0x09] = '\\t'
386 _jsonmap[0x09] = '\\t'
386 _jsonmap[0x0a] = '\\n'
387 _jsonmap[0x0a] = '\\n'
387 _jsonmap[0x22] = '\\"'
388 _jsonmap[0x22] = '\\"'
388 _jsonmap[0x5c] = '\\\\'
389 _jsonmap[0x5c] = '\\\\'
389 _jsonmap[0x08] = '\\b'
390 _jsonmap[0x08] = '\\b'
390 _jsonmap[0x0c] = '\\f'
391 _jsonmap[0x0c] = '\\f'
391 _jsonmap[0x0d] = '\\r'
392 _jsonmap[0x0d] = '\\r'
393 _paranoidjsonmap = _jsonmap[:]
394 _jsonmap.extend(chr(x) for x in xrange(128, 256))
392
395
393 def jsonescape(s):
396 def jsonescape(s, paranoid=False):
394 '''returns a string suitable for JSON
397 '''returns a string suitable for JSON
395
398
396 JSON is problematic for us because it doesn't support non-Unicode
399 JSON is problematic for us because it doesn't support non-Unicode
397 bytes. To deal with this, we take the following approach:
400 bytes. To deal with this, we take the following approach:
398
401
399 - localstr objects are converted back to UTF-8
402 - localstr objects are converted back to UTF-8
400 - valid UTF-8/ASCII strings are passed as-is
403 - valid UTF-8/ASCII strings are passed as-is
401 - other strings are converted to UTF-8b surrogate encoding
404 - other strings are converted to UTF-8b surrogate encoding
402 - apply JSON-specified string escaping
405 - apply JSON-specified string escaping
403
406
404 (escapes are doubled in these tests)
407 (escapes are doubled in these tests)
405
408
406 >>> jsonescape('this is a test')
409 >>> jsonescape('this is a test')
407 'this is a test'
410 'this is a test'
408 >>> jsonescape('escape characters: \\0 \\x0b \\x7f')
411 >>> jsonescape('escape characters: \\0 \\x0b \\x7f')
409 'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
412 'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
410 >>> jsonescape('escape characters: \\t \\n \\r \\" \\\\')
413 >>> jsonescape('escape characters: \\t \\n \\r \\" \\\\')
411 'escape characters: \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
414 'escape characters: \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
412 >>> jsonescape('a weird byte: \\xdd')
415 >>> jsonescape('a weird byte: \\xdd')
413 'a weird byte: \\xed\\xb3\\x9d'
416 'a weird byte: \\xed\\xb3\\x9d'
414 >>> jsonescape('utf-8: caf\\xc3\\xa9')
417 >>> jsonescape('utf-8: caf\\xc3\\xa9')
415 'utf-8: caf\\xc3\\xa9'
418 'utf-8: caf\\xc3\\xa9'
416 >>> jsonescape('')
419 >>> jsonescape('')
417 ''
420 ''
421
422 If paranoid, non-ascii characters are also escaped. This is suitable for
423 web output.
424
425 >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
426 'escape boundary: ~ \\\\u007f \\\\u0080'
427 >>> jsonescape('a weird byte: \\xdd', paranoid=True)
428 'a weird byte: \\\\udcdd'
429 >>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True)
430 'utf-8: caf\\\\u00e9'
431 >>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
432 'non-BMP: \\\\ud834\\\\udd1e'
418 '''
433 '''
419
434
420 return ''.join(_jsonmap[x] for x in bytearray(toutf8b(s)))
435 if paranoid:
436 jm = _paranoidjsonmap
437 else:
438 jm = _jsonmap
439
440 u8chars = toutf8b(s)
441 try:
442 return ''.join(jm[x] for x in bytearray(u8chars)) # fast path
443 except IndexError:
444 pass
445 # non-BMP char is represented as UTF-16 surrogate pair
446 u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
447 u16codes.pop(0) # drop BOM
448 return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)
421
449
422 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
450 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
423
451
424 def getutf8char(s, pos):
452 def getutf8char(s, pos):
425 '''get the next full utf-8 character in the given string, starting at pos
453 '''get the next full utf-8 character in the given string, starting at pos
426
454
427 Raises a UnicodeError if the given location does not start a valid
455 Raises a UnicodeError if the given location does not start a valid
428 utf-8 character.
456 utf-8 character.
429 '''
457 '''
430
458
431 # find how many bytes to attempt decoding from first nibble
459 # find how many bytes to attempt decoding from first nibble
432 l = _utf8len[ord(s[pos]) >> 4]
460 l = _utf8len[ord(s[pos]) >> 4]
433 if not l: # ascii
461 if not l: # ascii
434 return s[pos]
462 return s[pos]
435
463
436 c = s[pos:pos + l]
464 c = s[pos:pos + l]
437 # validate with attempted decode
465 # validate with attempted decode
438 c.decode("utf-8")
466 c.decode("utf-8")
439 return c
467 return c
440
468
441 def toutf8b(s):
469 def toutf8b(s):
442 '''convert a local, possibly-binary string into UTF-8b
470 '''convert a local, possibly-binary string into UTF-8b
443
471
444 This is intended as a generic method to preserve data when working
472 This is intended as a generic method to preserve data when working
445 with schemes like JSON and XML that have no provision for
473 with schemes like JSON and XML that have no provision for
446 arbitrary byte strings. As Mercurial often doesn't know
474 arbitrary byte strings. As Mercurial often doesn't know
447 what encoding data is in, we use so-called UTF-8b.
475 what encoding data is in, we use so-called UTF-8b.
448
476
449 If a string is already valid UTF-8 (or ASCII), it passes unmodified.
477 If a string is already valid UTF-8 (or ASCII), it passes unmodified.
450 Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
478 Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
451 uDC00-uDCFF.
479 uDC00-uDCFF.
452
480
453 Principles of operation:
481 Principles of operation:
454
482
455 - ASCII and UTF-8 data successfully round-trips and is understood
483 - ASCII and UTF-8 data successfully round-trips and is understood
456 by Unicode-oriented clients
484 by Unicode-oriented clients
457 - filenames and file contents in arbitrary other encodings can have
457 - filenames and file contents in arbitrary other encodings can be
485 - filenames and file contents in arbitrary other encodings can be
458 round-tripped or recovered by clueful clients
486 round-tripped or recovered by clueful clients
487 - local strings that have a cached known UTF-8 encoding (aka
460 localstr) get sent as UTF-8 so Unicode-oriented clients get the
488 localstr) get sent as UTF-8 so Unicode-oriented clients get the
461 Unicode data they want
489 Unicode data they want
462 - because we must preserve UTF-8 bytestring in places such as
490 - because we must preserve UTF-8 bytestring in places such as
463 filenames, metadata can't be roundtripped without help
491 filenames, metadata can't be roundtripped without help
464
492
465 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
493 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
466 arbitrary bytes into an internal Unicode format that can be
494 arbitrary bytes into an internal Unicode format that can be
467 re-encoded back into the original. Here we are exposing the
495 re-encoded back into the original. Here we are exposing the
468 internal surrogate encoding as a UTF-8 string.)
496 internal surrogate encoding as a UTF-8 string.)
469 '''
497 '''
470
498
471 if "\xed" not in s:
499 if "\xed" not in s:
472 if isinstance(s, localstr):
500 if isinstance(s, localstr):
473 return s._utf8
501 return s._utf8
474 try:
502 try:
475 s.decode('utf-8')
503 s.decode('utf-8')
476 return s
504 return s
477 except UnicodeDecodeError:
505 except UnicodeDecodeError:
478 pass
506 pass
479
507
480 r = ""
508 r = ""
481 pos = 0
509 pos = 0
482 l = len(s)
510 l = len(s)
483 while pos < l:
511 while pos < l:
484 try:
512 try:
485 c = getutf8char(s, pos)
513 c = getutf8char(s, pos)
486 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
514 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
487 # have to re-escape existing U+DCxx characters
515 # have to re-escape existing U+DCxx characters
488 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
516 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
489 pos += 1
517 pos += 1
490 else:
518 else:
491 pos += len(c)
519 pos += len(c)
492 except UnicodeDecodeError:
520 except UnicodeDecodeError:
493 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
521 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
494 pos += 1
522 pos += 1
495 r += c
523 r += c
496 return r
524 return r
497
525
498 def fromutf8b(s):
526 def fromutf8b(s):
499 '''Given a UTF-8b string, return a local, possibly-binary string.
527 '''Given a UTF-8b string, return a local, possibly-binary string.
500
528
501 Returns the original binary string. This
529 Returns the original binary string. This
502 is a round-trip process for strings like filenames, but metadata
530 is a round-trip process for strings like filenames, but metadata
503 that was passed through tolocal will remain in UTF-8.
531 that was passed through tolocal will remain in UTF-8.
504
532
505 >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
533 >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
506 >>> m = "\\xc3\\xa9\\x99abcd"
534 >>> m = "\\xc3\\xa9\\x99abcd"
507 >>> toutf8b(m)
535 >>> toutf8b(m)
508 '\\xc3\\xa9\\xed\\xb2\\x99abcd'
536 '\\xc3\\xa9\\xed\\xb2\\x99abcd'
509 >>> roundtrip(m)
537 >>> roundtrip(m)
510 True
538 True
511 >>> roundtrip("\\xc2\\xc2\\x80")
539 >>> roundtrip("\\xc2\\xc2\\x80")
512 True
540 True
513 >>> roundtrip("\\xef\\xbf\\xbd")
541 >>> roundtrip("\\xef\\xbf\\xbd")
514 True
542 True
515 >>> roundtrip("\\xef\\xef\\xbf\\xbd")
543 >>> roundtrip("\\xef\\xef\\xbf\\xbd")
516 True
544 True
517 >>> roundtrip("\\xf1\\x80\\x80\\x80\\x80")
545 >>> roundtrip("\\xf1\\x80\\x80\\x80\\x80")
518 True
546 True
519 '''
547 '''
520
548
521 # fast path - look for uDxxx prefixes in s
549 # fast path - look for uDxxx prefixes in s
522 if "\xed" not in s:
550 if "\xed" not in s:
523 return s
551 return s
524
552
525 # We could do this with the unicode type but some Python builds
553 # We could do this with the unicode type but some Python builds
526 # use UTF-16 internally (issue5031) which causes non-BMP code
554 # use UTF-16 internally (issue5031) which causes non-BMP code
527 # points to be escaped. Instead, we use our handy getutf8char
555 # points to be escaped. Instead, we use our handy getutf8char
528 # helper again to walk the string without "decoding" it.
556 # helper again to walk the string without "decoding" it.
529
557
530 r = ""
558 r = ""
531 pos = 0
559 pos = 0
532 l = len(s)
560 l = len(s)
533 while pos < l:
561 while pos < l:
534 c = getutf8char(s, pos)
562 c = getutf8char(s, pos)
535 pos += len(c)
563 pos += len(c)
536 # unescape U+DCxx characters
564 # unescape U+DCxx characters
537 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
565 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
538 c = chr(ord(c.decode("utf-8")) & 0xff)
566 c = chr(ord(c.decode("utf-8")) & 0xff)
539 r += c
567 r += c
540 return r
568 return r
General Comments 0
You need to be logged in to leave comments. Login now