##// END OF EJS Templates
encoding.lower: use fast ASCII lower...
Siddharth Agarwal -
r22779:d9585dda default
parent child Browse files
Show More
@@ -1,432 +1,431 b''
1 # encoding.py - character transcoding support for Mercurial
1 # encoding.py - character transcoding support for Mercurial
2 #
2 #
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8 import error, parsers
8 import error, parsers
9 import unicodedata, locale, os
9 import unicodedata, locale, os
10
10
11 def _getpreferredencoding():
11 def _getpreferredencoding():
12 '''
12 '''
13 On darwin, getpreferredencoding ignores the locale environment and
13 On darwin, getpreferredencoding ignores the locale environment and
14 always returns mac-roman. http://bugs.python.org/issue6202 fixes this
14 always returns mac-roman. http://bugs.python.org/issue6202 fixes this
15 for Python 2.7 and up. This is the same corrected code for earlier
15 for Python 2.7 and up. This is the same corrected code for earlier
16 Python versions.
16 Python versions.
17
17
18 However, we can't use a version check for this method, as some distributions
18 However, we can't use a version check for this method, as some distributions
19 patch Python to fix this. Instead, we use it as a 'fixer' for the mac-roman
19 patch Python to fix this. Instead, we use it as a 'fixer' for the mac-roman
20 encoding, as it is unlikely that this encoding is the one actually expected.
20 encoding, as it is unlikely that this encoding is the one actually expected.
21 '''
21 '''
22 try:
22 try:
23 locale.CODESET
23 locale.CODESET
24 except AttributeError:
24 except AttributeError:
25 # Fall back to parsing environment variables :-(
25 # Fall back to parsing environment variables :-(
26 return locale.getdefaultlocale()[1]
26 return locale.getdefaultlocale()[1]
27
27
28 oldloc = locale.setlocale(locale.LC_CTYPE)
28 oldloc = locale.setlocale(locale.LC_CTYPE)
29 locale.setlocale(locale.LC_CTYPE, "")
29 locale.setlocale(locale.LC_CTYPE, "")
30 result = locale.nl_langinfo(locale.CODESET)
30 result = locale.nl_langinfo(locale.CODESET)
31 locale.setlocale(locale.LC_CTYPE, oldloc)
31 locale.setlocale(locale.LC_CTYPE, oldloc)
32
32
33 return result
33 return result
34
34
35 _encodingfixers = {
35 _encodingfixers = {
36 '646': lambda: 'ascii',
36 '646': lambda: 'ascii',
37 'ANSI_X3.4-1968': lambda: 'ascii',
37 'ANSI_X3.4-1968': lambda: 'ascii',
38 'mac-roman': _getpreferredencoding
38 'mac-roman': _getpreferredencoding
39 }
39 }
40
40
41 try:
41 try:
42 encoding = os.environ.get("HGENCODING")
42 encoding = os.environ.get("HGENCODING")
43 if not encoding:
43 if not encoding:
44 encoding = locale.getpreferredencoding() or 'ascii'
44 encoding = locale.getpreferredencoding() or 'ascii'
45 encoding = _encodingfixers.get(encoding, lambda: encoding)()
45 encoding = _encodingfixers.get(encoding, lambda: encoding)()
46 except locale.Error:
46 except locale.Error:
47 encoding = 'ascii'
47 encoding = 'ascii'
48 encodingmode = os.environ.get("HGENCODINGMODE", "strict")
48 encodingmode = os.environ.get("HGENCODINGMODE", "strict")
49 fallbackencoding = 'ISO-8859-1'
49 fallbackencoding = 'ISO-8859-1'
50
50
51 class localstr(str):
51 class localstr(str):
52 '''This class allows strings that are unmodified to be
52 '''This class allows strings that are unmodified to be
53 round-tripped to the local encoding and back'''
53 round-tripped to the local encoding and back'''
54 def __new__(cls, u, l):
54 def __new__(cls, u, l):
55 s = str.__new__(cls, l)
55 s = str.__new__(cls, l)
56 s._utf8 = u
56 s._utf8 = u
57 return s
57 return s
58 def __hash__(self):
58 def __hash__(self):
59 return hash(self._utf8) # avoid collisions in local string space
59 return hash(self._utf8) # avoid collisions in local string space
60
60
61 def tolocal(s):
61 def tolocal(s):
62 """
62 """
63 Convert a string from internal UTF-8 to local encoding
63 Convert a string from internal UTF-8 to local encoding
64
64
65 All internal strings should be UTF-8 but some repos before the
65 All internal strings should be UTF-8 but some repos before the
66 implementation of locale support may contain latin1 or possibly
66 implementation of locale support may contain latin1 or possibly
67 other character sets. We attempt to decode everything strictly
67 other character sets. We attempt to decode everything strictly
68 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
68 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
69 replace unknown characters.
69 replace unknown characters.
70
70
71 The localstr class is used to cache the known UTF-8 encoding of
71 The localstr class is used to cache the known UTF-8 encoding of
72 strings next to their local representation to allow lossless
72 strings next to their local representation to allow lossless
73 round-trip conversion back to UTF-8.
73 round-trip conversion back to UTF-8.
74
74
75 >>> u = 'foo: \\xc3\\xa4' # utf-8
75 >>> u = 'foo: \\xc3\\xa4' # utf-8
76 >>> l = tolocal(u)
76 >>> l = tolocal(u)
77 >>> l
77 >>> l
78 'foo: ?'
78 'foo: ?'
79 >>> fromlocal(l)
79 >>> fromlocal(l)
80 'foo: \\xc3\\xa4'
80 'foo: \\xc3\\xa4'
81 >>> u2 = 'foo: \\xc3\\xa1'
81 >>> u2 = 'foo: \\xc3\\xa1'
82 >>> d = { l: 1, tolocal(u2): 2 }
82 >>> d = { l: 1, tolocal(u2): 2 }
83 >>> len(d) # no collision
83 >>> len(d) # no collision
84 2
84 2
85 >>> 'foo: ?' in d
85 >>> 'foo: ?' in d
86 False
86 False
87 >>> l1 = 'foo: \\xe4' # historical latin1 fallback
87 >>> l1 = 'foo: \\xe4' # historical latin1 fallback
88 >>> l = tolocal(l1)
88 >>> l = tolocal(l1)
89 >>> l
89 >>> l
90 'foo: ?'
90 'foo: ?'
91 >>> fromlocal(l) # magically in utf-8
91 >>> fromlocal(l) # magically in utf-8
92 'foo: \\xc3\\xa4'
92 'foo: \\xc3\\xa4'
93 """
93 """
94
94
95 try:
95 try:
96 try:
96 try:
97 # make sure string is actually stored in UTF-8
97 # make sure string is actually stored in UTF-8
98 u = s.decode('UTF-8')
98 u = s.decode('UTF-8')
99 if encoding == 'UTF-8':
99 if encoding == 'UTF-8':
100 # fast path
100 # fast path
101 return s
101 return s
102 r = u.encode(encoding, "replace")
102 r = u.encode(encoding, "replace")
103 if u == r.decode(encoding):
103 if u == r.decode(encoding):
104 # r is a safe, non-lossy encoding of s
104 # r is a safe, non-lossy encoding of s
105 return r
105 return r
106 return localstr(s, r)
106 return localstr(s, r)
107 except UnicodeDecodeError:
107 except UnicodeDecodeError:
108 # we should only get here if we're looking at an ancient changeset
108 # we should only get here if we're looking at an ancient changeset
109 try:
109 try:
110 u = s.decode(fallbackencoding)
110 u = s.decode(fallbackencoding)
111 r = u.encode(encoding, "replace")
111 r = u.encode(encoding, "replace")
112 if u == r.decode(encoding):
112 if u == r.decode(encoding):
113 # r is a safe, non-lossy encoding of s
113 # r is a safe, non-lossy encoding of s
114 return r
114 return r
115 return localstr(u.encode('UTF-8'), r)
115 return localstr(u.encode('UTF-8'), r)
116 except UnicodeDecodeError:
116 except UnicodeDecodeError:
117 u = s.decode("utf-8", "replace") # last ditch
117 u = s.decode("utf-8", "replace") # last ditch
118 return u.encode(encoding, "replace") # can't round-trip
118 return u.encode(encoding, "replace") # can't round-trip
119 except LookupError, k:
119 except LookupError, k:
120 raise error.Abort(k, hint="please check your locale settings")
120 raise error.Abort(k, hint="please check your locale settings")
121
121
122 def fromlocal(s):
122 def fromlocal(s):
123 """
123 """
124 Convert a string from the local character encoding to UTF-8
124 Convert a string from the local character encoding to UTF-8
125
125
126 We attempt to decode strings using the encoding mode set by
126 We attempt to decode strings using the encoding mode set by
127 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
127 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
128 characters will cause an error message. Other modes include
128 characters will cause an error message. Other modes include
129 'replace', which replaces unknown characters with a special
129 'replace', which replaces unknown characters with a special
130 Unicode character, and 'ignore', which drops the character.
130 Unicode character, and 'ignore', which drops the character.
131 """
131 """
132
132
133 # can we do a lossless round-trip?
133 # can we do a lossless round-trip?
134 if isinstance(s, localstr):
134 if isinstance(s, localstr):
135 return s._utf8
135 return s._utf8
136
136
137 try:
137 try:
138 return s.decode(encoding, encodingmode).encode("utf-8")
138 return s.decode(encoding, encodingmode).encode("utf-8")
139 except UnicodeDecodeError, inst:
139 except UnicodeDecodeError, inst:
140 sub = s[max(0, inst.start - 10):inst.start + 10]
140 sub = s[max(0, inst.start - 10):inst.start + 10]
141 raise error.Abort("decoding near '%s': %s!" % (sub, inst))
141 raise error.Abort("decoding near '%s': %s!" % (sub, inst))
142 except LookupError, k:
142 except LookupError, k:
143 raise error.Abort(k, hint="please check your locale settings")
143 raise error.Abort(k, hint="please check your locale settings")
144
144
145 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
145 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
146 wide = (os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
146 wide = (os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
147 and "WFA" or "WF")
147 and "WFA" or "WF")
148
148
149 def colwidth(s):
149 def colwidth(s):
150 "Find the column width of a string for display in the local encoding"
150 "Find the column width of a string for display in the local encoding"
151 return ucolwidth(s.decode(encoding, 'replace'))
151 return ucolwidth(s.decode(encoding, 'replace'))
152
152
153 def ucolwidth(d):
153 def ucolwidth(d):
154 "Find the column width of a Unicode string for display"
154 "Find the column width of a Unicode string for display"
155 eaw = getattr(unicodedata, 'east_asian_width', None)
155 eaw = getattr(unicodedata, 'east_asian_width', None)
156 if eaw is not None:
156 if eaw is not None:
157 return sum([eaw(c) in wide and 2 or 1 for c in d])
157 return sum([eaw(c) in wide and 2 or 1 for c in d])
158 return len(d)
158 return len(d)
159
159
160 def getcols(s, start, c):
160 def getcols(s, start, c):
161 '''Use colwidth to find a c-column substring of s starting at byte
161 '''Use colwidth to find a c-column substring of s starting at byte
162 index start'''
162 index start'''
163 for x in xrange(start + c, len(s)):
163 for x in xrange(start + c, len(s)):
164 t = s[start:x]
164 t = s[start:x]
165 if colwidth(t) == c:
165 if colwidth(t) == c:
166 return t
166 return t
167
167
168 def trim(s, width, ellipsis='', leftside=False):
168 def trim(s, width, ellipsis='', leftside=False):
169 """Trim string 's' to at most 'width' columns (including 'ellipsis').
169 """Trim string 's' to at most 'width' columns (including 'ellipsis').
170
170
171 If 'leftside' is True, left side of string 's' is trimmed.
171 If 'leftside' is True, left side of string 's' is trimmed.
172 'ellipsis' is always placed at trimmed side.
172 'ellipsis' is always placed at trimmed side.
173
173
174 >>> ellipsis = '+++'
174 >>> ellipsis = '+++'
175 >>> from mercurial import encoding
175 >>> from mercurial import encoding
176 >>> encoding.encoding = 'utf-8'
176 >>> encoding.encoding = 'utf-8'
177 >>> t= '1234567890'
177 >>> t= '1234567890'
178 >>> print trim(t, 12, ellipsis=ellipsis)
178 >>> print trim(t, 12, ellipsis=ellipsis)
179 1234567890
179 1234567890
180 >>> print trim(t, 10, ellipsis=ellipsis)
180 >>> print trim(t, 10, ellipsis=ellipsis)
181 1234567890
181 1234567890
182 >>> print trim(t, 8, ellipsis=ellipsis)
182 >>> print trim(t, 8, ellipsis=ellipsis)
183 12345+++
183 12345+++
184 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
184 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
185 +++67890
185 +++67890
186 >>> print trim(t, 8)
186 >>> print trim(t, 8)
187 12345678
187 12345678
188 >>> print trim(t, 8, leftside=True)
188 >>> print trim(t, 8, leftside=True)
189 34567890
189 34567890
190 >>> print trim(t, 3, ellipsis=ellipsis)
190 >>> print trim(t, 3, ellipsis=ellipsis)
191 +++
191 +++
192 >>> print trim(t, 1, ellipsis=ellipsis)
192 >>> print trim(t, 1, ellipsis=ellipsis)
193 +
193 +
194 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
194 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
195 >>> t = u.encode(encoding.encoding)
195 >>> t = u.encode(encoding.encoding)
196 >>> print trim(t, 12, ellipsis=ellipsis)
196 >>> print trim(t, 12, ellipsis=ellipsis)
197 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
197 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
198 >>> print trim(t, 10, ellipsis=ellipsis)
198 >>> print trim(t, 10, ellipsis=ellipsis)
199 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
199 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
200 >>> print trim(t, 8, ellipsis=ellipsis)
200 >>> print trim(t, 8, ellipsis=ellipsis)
201 \xe3\x81\x82\xe3\x81\x84+++
201 \xe3\x81\x82\xe3\x81\x84+++
202 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
202 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
203 +++\xe3\x81\x88\xe3\x81\x8a
203 +++\xe3\x81\x88\xe3\x81\x8a
204 >>> print trim(t, 5)
204 >>> print trim(t, 5)
205 \xe3\x81\x82\xe3\x81\x84
205 \xe3\x81\x82\xe3\x81\x84
206 >>> print trim(t, 5, leftside=True)
206 >>> print trim(t, 5, leftside=True)
207 \xe3\x81\x88\xe3\x81\x8a
207 \xe3\x81\x88\xe3\x81\x8a
208 >>> print trim(t, 4, ellipsis=ellipsis)
208 >>> print trim(t, 4, ellipsis=ellipsis)
209 +++
209 +++
210 >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
210 >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
211 +++
211 +++
212 >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
212 >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
213 >>> print trim(t, 12, ellipsis=ellipsis)
213 >>> print trim(t, 12, ellipsis=ellipsis)
214 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
214 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
215 >>> print trim(t, 10, ellipsis=ellipsis)
215 >>> print trim(t, 10, ellipsis=ellipsis)
216 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
216 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
217 >>> print trim(t, 8, ellipsis=ellipsis)
217 >>> print trim(t, 8, ellipsis=ellipsis)
218 \x11\x22\x33\x44\x55+++
218 \x11\x22\x33\x44\x55+++
219 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
219 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
220 +++\x66\x77\x88\x99\xaa
220 +++\x66\x77\x88\x99\xaa
221 >>> print trim(t, 8)
221 >>> print trim(t, 8)
222 \x11\x22\x33\x44\x55\x66\x77\x88
222 \x11\x22\x33\x44\x55\x66\x77\x88
223 >>> print trim(t, 8, leftside=True)
223 >>> print trim(t, 8, leftside=True)
224 \x33\x44\x55\x66\x77\x88\x99\xaa
224 \x33\x44\x55\x66\x77\x88\x99\xaa
225 >>> print trim(t, 3, ellipsis=ellipsis)
225 >>> print trim(t, 3, ellipsis=ellipsis)
226 +++
226 +++
227 >>> print trim(t, 1, ellipsis=ellipsis)
227 >>> print trim(t, 1, ellipsis=ellipsis)
228 +
228 +
229 """
229 """
230 try:
230 try:
231 u = s.decode(encoding)
231 u = s.decode(encoding)
232 except UnicodeDecodeError:
232 except UnicodeDecodeError:
233 if len(s) <= width: # trimming is not needed
233 if len(s) <= width: # trimming is not needed
234 return s
234 return s
235 width -= len(ellipsis)
235 width -= len(ellipsis)
236 if width <= 0: # not enough room even for ellipsis
236 if width <= 0: # not enough room even for ellipsis
237 return ellipsis[:width + len(ellipsis)]
237 return ellipsis[:width + len(ellipsis)]
238 if leftside:
238 if leftside:
239 return ellipsis + s[-width:]
239 return ellipsis + s[-width:]
240 return s[:width] + ellipsis
240 return s[:width] + ellipsis
241
241
242 if ucolwidth(u) <= width: # trimming is not needed
242 if ucolwidth(u) <= width: # trimming is not needed
243 return s
243 return s
244
244
245 width -= len(ellipsis)
245 width -= len(ellipsis)
246 if width <= 0: # not enough room even for ellipsis
246 if width <= 0: # not enough room even for ellipsis
247 return ellipsis[:width + len(ellipsis)]
247 return ellipsis[:width + len(ellipsis)]
248
248
249 if leftside:
249 if leftside:
250 uslice = lambda i: u[i:]
250 uslice = lambda i: u[i:]
251 concat = lambda s: ellipsis + s
251 concat = lambda s: ellipsis + s
252 else:
252 else:
253 uslice = lambda i: u[:-i]
253 uslice = lambda i: u[:-i]
254 concat = lambda s: s + ellipsis
254 concat = lambda s: s + ellipsis
255 for i in xrange(1, len(u)):
255 for i in xrange(1, len(u)):
256 usub = uslice(i)
256 usub = uslice(i)
257 if ucolwidth(usub) <= width:
257 if ucolwidth(usub) <= width:
258 return concat(usub.encode(encoding))
258 return concat(usub.encode(encoding))
259 return ellipsis # not enough room for multi-column characters
259 return ellipsis # not enough room for multi-column characters
260
260
261 def asciilower(s):
261 def asciilower(s):
262 '''convert a string to lowercase if ASCII
262 '''convert a string to lowercase if ASCII
263
263
264 Raises UnicodeDecodeError if non-ASCII characters are found.'''
264 Raises UnicodeDecodeError if non-ASCII characters are found.'''
265 s.decode('ascii')
265 s.decode('ascii')
266 return s.lower()
266 return s.lower()
267
267
268 asciilower = getattr(parsers, 'asciilower', asciilower)
268 asciilower = getattr(parsers, 'asciilower', asciilower)
269
269
270 def lower(s):
270 def lower(s):
271 "best-effort encoding-aware case-folding of local string s"
271 "best-effort encoding-aware case-folding of local string s"
272 try:
272 try:
273 s.decode('ascii') # throw exception for non-ASCII character
273 return asciilower(s)
274 return s.lower()
275 except UnicodeDecodeError:
274 except UnicodeDecodeError:
276 pass
275 pass
277 try:
276 try:
278 if isinstance(s, localstr):
277 if isinstance(s, localstr):
279 u = s._utf8.decode("utf-8")
278 u = s._utf8.decode("utf-8")
280 else:
279 else:
281 u = s.decode(encoding, encodingmode)
280 u = s.decode(encoding, encodingmode)
282
281
283 lu = u.lower()
282 lu = u.lower()
284 if u == lu:
283 if u == lu:
285 return s # preserve localstring
284 return s # preserve localstring
286 return lu.encode(encoding)
285 return lu.encode(encoding)
287 except UnicodeError:
286 except UnicodeError:
288 return s.lower() # we don't know how to fold this except in ASCII
287 return s.lower() # we don't know how to fold this except in ASCII
289 except LookupError, k:
288 except LookupError, k:
290 raise error.Abort(k, hint="please check your locale settings")
289 raise error.Abort(k, hint="please check your locale settings")
291
290
292 def upper(s):
291 def upper(s):
293 "best-effort encoding-aware case-folding of local string s"
292 "best-effort encoding-aware case-folding of local string s"
294 try:
293 try:
295 s.decode('ascii') # throw exception for non-ASCII character
294 s.decode('ascii') # throw exception for non-ASCII character
296 return s.upper()
295 return s.upper()
297 except UnicodeDecodeError:
296 except UnicodeDecodeError:
298 pass
297 pass
299 try:
298 try:
300 if isinstance(s, localstr):
299 if isinstance(s, localstr):
301 u = s._utf8.decode("utf-8")
300 u = s._utf8.decode("utf-8")
302 else:
301 else:
303 u = s.decode(encoding, encodingmode)
302 u = s.decode(encoding, encodingmode)
304
303
305 uu = u.upper()
304 uu = u.upper()
306 if u == uu:
305 if u == uu:
307 return s # preserve localstring
306 return s # preserve localstring
308 return uu.encode(encoding)
307 return uu.encode(encoding)
309 except UnicodeError:
308 except UnicodeError:
310 return s.upper() # we don't know how to fold this except in ASCII
309 return s.upper() # we don't know how to fold this except in ASCII
311 except LookupError, k:
310 except LookupError, k:
312 raise error.Abort(k, hint="please check your locale settings")
311 raise error.Abort(k, hint="please check your locale settings")
313
312
314 _jsonmap = {}
313 _jsonmap = {}
315
314
316 def jsonescape(s):
315 def jsonescape(s):
317 '''returns a string suitable for JSON
316 '''returns a string suitable for JSON
318
317
319 JSON is problematic for us because it doesn't support non-Unicode
318 JSON is problematic for us because it doesn't support non-Unicode
320 bytes. To deal with this, we take the following approach:
319 bytes. To deal with this, we take the following approach:
321
320
322 - localstr objects are converted back to UTF-8
321 - localstr objects are converted back to UTF-8
323 - valid UTF-8/ASCII strings are passed as-is
322 - valid UTF-8/ASCII strings are passed as-is
324 - other strings are converted to UTF-8b surrogate encoding
323 - other strings are converted to UTF-8b surrogate encoding
325 - apply JSON-specified string escaping
324 - apply JSON-specified string escaping
326
325
327 (escapes are doubled in these tests)
326 (escapes are doubled in these tests)
328
327
329 >>> jsonescape('this is a test')
328 >>> jsonescape('this is a test')
330 'this is a test'
329 'this is a test'
331 >>> jsonescape('escape characters: \\0 \\x0b \\t \\n \\r \\" \\\\')
330 >>> jsonescape('escape characters: \\0 \\x0b \\t \\n \\r \\" \\\\')
332 'escape characters: \\\\u0000 \\\\u000b \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
331 'escape characters: \\\\u0000 \\\\u000b \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
333 >>> jsonescape('a weird byte: \\xdd')
332 >>> jsonescape('a weird byte: \\xdd')
334 'a weird byte: \\xed\\xb3\\x9d'
333 'a weird byte: \\xed\\xb3\\x9d'
335 >>> jsonescape('utf-8: caf\\xc3\\xa9')
334 >>> jsonescape('utf-8: caf\\xc3\\xa9')
336 'utf-8: caf\\xc3\\xa9'
335 'utf-8: caf\\xc3\\xa9'
337 >>> jsonescape('')
336 >>> jsonescape('')
338 ''
337 ''
339 '''
338 '''
340
339
341 if not _jsonmap:
340 if not _jsonmap:
342 for x in xrange(32):
341 for x in xrange(32):
343 _jsonmap[chr(x)] = "\u%04x" %x
342 _jsonmap[chr(x)] = "\u%04x" %x
344 for x in xrange(32, 256):
343 for x in xrange(32, 256):
345 c = chr(x)
344 c = chr(x)
346 _jsonmap[c] = c
345 _jsonmap[c] = c
347 _jsonmap['\t'] = '\\t'
346 _jsonmap['\t'] = '\\t'
348 _jsonmap['\n'] = '\\n'
347 _jsonmap['\n'] = '\\n'
349 _jsonmap['\"'] = '\\"'
348 _jsonmap['\"'] = '\\"'
350 _jsonmap['\\'] = '\\\\'
349 _jsonmap['\\'] = '\\\\'
351 _jsonmap['\b'] = '\\b'
350 _jsonmap['\b'] = '\\b'
352 _jsonmap['\f'] = '\\f'
351 _jsonmap['\f'] = '\\f'
353 _jsonmap['\r'] = '\\r'
352 _jsonmap['\r'] = '\\r'
354
353
355 return ''.join(_jsonmap[c] for c in toutf8b(s))
354 return ''.join(_jsonmap[c] for c in toutf8b(s))
356
355
357 def toutf8b(s):
356 def toutf8b(s):
358 '''convert a local, possibly-binary string into UTF-8b
357 '''convert a local, possibly-binary string into UTF-8b
359
358
360 This is intended as a generic method to preserve data when working
359 This is intended as a generic method to preserve data when working
361 with schemes like JSON and XML that have no provision for
360 with schemes like JSON and XML that have no provision for
362 arbitrary byte strings. As Mercurial often doesn't know
361 arbitrary byte strings. As Mercurial often doesn't know
363 what encoding data is in, we use so-called UTF-8b.
362 what encoding data is in, we use so-called UTF-8b.
364
363
365 If a string is already valid UTF-8 (or ASCII), it passes unmodified.
364 If a string is already valid UTF-8 (or ASCII), it passes unmodified.
366 Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
365 Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
367 uDC00-uDCFF.
366 uDC00-uDCFF.
368
367
369 Principles of operation:
368 Principles of operation:
370
369
371 - ASCII and UTF-8 data successfully round-trips and is understood
370 - ASCII and UTF-8 data successfully round-trips and is understood
372 by Unicode-oriented clients
371 by Unicode-oriented clients
373 - filenames and file contents in arbitrary other encodings can
372 - filenames and file contents in arbitrary other encodings can
374 be round-tripped or recovered by clueful clients
373 be round-tripped or recovered by clueful clients
375 - local strings that have a cached known UTF-8 encoding (aka
374 - local strings that have a cached known UTF-8 encoding (aka
376 localstr) get sent as UTF-8 so Unicode-oriented clients get the
375 localstr) get sent as UTF-8 so Unicode-oriented clients get the
377 Unicode data they want
376 Unicode data they want
378 - because we must preserve UTF-8 bytestring in places such as
377 - because we must preserve UTF-8 bytestring in places such as
379 filenames, metadata can't be roundtripped without help
378 filenames, metadata can't be roundtripped without help
380
379
381 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
380 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
382 arbitrary bytes into an internal Unicode format that can be
381 arbitrary bytes into an internal Unicode format that can be
383 re-encoded back into the original. Here we are exposing the
382 re-encoded back into the original. Here we are exposing the
384 internal surrogate encoding as a UTF-8 string.)
383 internal surrogate encoding as a UTF-8 string.)
385 '''
384 '''
386
385
387 if isinstance(s, localstr):
386 if isinstance(s, localstr):
388 return s._utf8
387 return s._utf8
389
388
390 try:
389 try:
391 s.decode('utf-8')
390 s.decode('utf-8')
392 return s
391 return s
393 except UnicodeDecodeError:
392 except UnicodeDecodeError:
394 # surrogate-encode any characters that don't round-trip
393 # surrogate-encode any characters that don't round-trip
395 s2 = s.decode('utf-8', 'ignore').encode('utf-8')
394 s2 = s.decode('utf-8', 'ignore').encode('utf-8')
396 r = ""
395 r = ""
397 pos = 0
396 pos = 0
398 for c in s:
397 for c in s:
399 if s2[pos:pos + 1] == c:
398 if s2[pos:pos + 1] == c:
400 r += c
399 r += c
401 pos += 1
400 pos += 1
402 else:
401 else:
403 r += unichr(0xdc00 + ord(c)).encode('utf-8')
402 r += unichr(0xdc00 + ord(c)).encode('utf-8')
404 return r
403 return r
405
404
406 def fromutf8b(s):
405 def fromutf8b(s):
407 '''Given a UTF-8b string, return a local, possibly-binary string.
406 '''Given a UTF-8b string, return a local, possibly-binary string.
408
407
409 Return the original binary string. This
408 Return the original binary string. This
410 is a round-trip process for strings like filenames, but metadata
409 is a round-trip process for strings like filenames, but metadata
411 that was passed through tolocal will remain in UTF-8.
410 that was passed through tolocal will remain in UTF-8.
412
411
413 >>> m = "\\xc3\\xa9\\x99abcd"
412 >>> m = "\\xc3\\xa9\\x99abcd"
414 >>> n = toutf8b(m)
413 >>> n = toutf8b(m)
415 >>> n
414 >>> n
416 '\\xc3\\xa9\\xed\\xb2\\x99abcd'
415 '\\xc3\\xa9\\xed\\xb2\\x99abcd'
417 >>> fromutf8b(n) == m
416 >>> fromutf8b(n) == m
418 True
417 True
419 '''
418 '''
420
419
421 # fast path - look for uDxxx prefixes in s
420 # fast path - look for uDxxx prefixes in s
422 if "\xed" not in s:
421 if "\xed" not in s:
423 return s
422 return s
424
423
425 u = s.decode("utf-8")
424 u = s.decode("utf-8")
426 r = ""
425 r = ""
427 for c in u:
426 for c in u:
428 if ord(c) & 0xff00 == 0xdc00:
427 if ord(c) & 0xff00 == 0xdc00:
429 r += chr(ord(c) & 0xff)
428 r += chr(ord(c) & 0xff)
430 else:
429 else:
431 r += c.encode("utf-8")
430 r += c.encode("utf-8")
432 return r
431 return r
General Comments 0
You need to be logged in to leave comments. Login now