##// END OF EJS Templates
encoding: add hfsignoreclean to clean out HFS-ignored characters...
Augie Fackler -
r23596:885bd7c5 stable
parent child Browse files
Show More
@@ -1,438 +1,460 b''
1 # encoding.py - character transcoding support for Mercurial
1 # encoding.py - character transcoding support for Mercurial
2 #
2 #
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8 import error
8 import error
9 import unicodedata, locale, os
9 import unicodedata, locale, os
10
10
# Codepoints that HFS+ silently strips from filenames (Apple Technote
# 1150, "Unicode Subtleties"), so we need to ignore them in some places
# for sanity.
_ignore = [unichr(int(h, 16)).encode("utf-8")
           for h in ("200c 200d 200e 200f 202a 202b 202c 202d 202e "
                     "206a 206b 206c 206d 206e 206f feff").split()]
# sanity check: hfsignoreclean's fast path keys on these lead bytes
assert set([c[0] for c in _ignore]) == set(["\xe2", "\xef"])
19
def hfsignoreclean(s):
    """Remove codepoints ignored by HFS+ from s.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # fast path: every ignored UTF-8 sequence starts with one of these bytes
    if "\xe2" not in s and "\xef" not in s:
        return s
    for seq in _ignore:
        s = s.replace(seq, '')
    return s
32
def _getpreferredencoding():
    '''Return the encoding the user's locale environment actually requests.

    On darwin, getpreferredencoding ignores the locale environment and
    always returns mac-roman. http://bugs.python.org/issue6202 fixes this
    for Python 2.7 and up. This is the same corrected code for earlier
    Python versions.

    However, we can't use a version check for this method, as some
    distributions patch Python to fix this. Instead, we use it as a
    'fixer' for the mac-roman encoding, as it is unlikely that this
    encoding is the actually expected.
    '''
    if not hasattr(locale, 'CODESET'):
        # Fall back to parsing environment variables :-(
        return locale.getdefaultlocale()[1]

    # temporarily switch LC_CTYPE to the user's environment setting so
    # nl_langinfo reports the real codeset, then restore it
    saved = locale.setlocale(locale.LC_CTYPE)
    locale.setlocale(locale.LC_CTYPE, "")
    codeset = locale.nl_langinfo(locale.CODESET)
    locale.setlocale(locale.LC_CTYPE, saved)

    return codeset
34
56
# Map misreported or legacy encoding names to a callable that produces
# the name we should actually use.
_encodingfixers = {
    '646': lambda: 'ascii',
    'ANSI_X3.4-1968': lambda: 'ascii',
    'mac-roman': _getpreferredencoding,
}

try:
    encoding = os.environ.get("HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding() or 'ascii'
    _fixer = _encodingfixers.get(encoding)
    if _fixer is not None:
        encoding = _fixer()
except locale.Error:
    encoding = 'ascii'
# how to handle characters the local encoding cannot represent
# ('strict', 'replace' or 'ignore')
encodingmode = os.environ.get("HGENCODINGMODE", "strict")
# pre-locale-support repositories frequently contain latin-1 metadata
fallbackencoding = 'ISO-8859-1'
50
72
class localstr(str):
    '''A str subclass that remembers the UTF-8 text it was derived from,
    so unmodified strings can round-trip between the local encoding and
    UTF-8 without loss.'''
    def __new__(cls, u, l):
        obj = str.__new__(cls, l)
        # cache the canonical UTF-8 form alongside the local bytes
        obj._utf8 = u
        return obj
    def __hash__(self):
        # hash the UTF-8 form so distinct UTF-8 strings that share one
        # lossy local representation do not collide
        return hash(self._utf8)
60
82
61 def tolocal(s):
83 def tolocal(s):
62 """
84 """
63 Convert a string from internal UTF-8 to local encoding
85 Convert a string from internal UTF-8 to local encoding
64
86
65 All internal strings should be UTF-8 but some repos before the
87 All internal strings should be UTF-8 but some repos before the
66 implementation of locale support may contain latin1 or possibly
88 implementation of locale support may contain latin1 or possibly
67 other character sets. We attempt to decode everything strictly
89 other character sets. We attempt to decode everything strictly
68 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
90 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
69 replace unknown characters.
91 replace unknown characters.
70
92
71 The localstr class is used to cache the known UTF-8 encoding of
93 The localstr class is used to cache the known UTF-8 encoding of
72 strings next to their local representation to allow lossless
94 strings next to their local representation to allow lossless
73 round-trip conversion back to UTF-8.
95 round-trip conversion back to UTF-8.
74
96
75 >>> u = 'foo: \\xc3\\xa4' # utf-8
97 >>> u = 'foo: \\xc3\\xa4' # utf-8
76 >>> l = tolocal(u)
98 >>> l = tolocal(u)
77 >>> l
99 >>> l
78 'foo: ?'
100 'foo: ?'
79 >>> fromlocal(l)
101 >>> fromlocal(l)
80 'foo: \\xc3\\xa4'
102 'foo: \\xc3\\xa4'
81 >>> u2 = 'foo: \\xc3\\xa1'
103 >>> u2 = 'foo: \\xc3\\xa1'
82 >>> d = { l: 1, tolocal(u2): 2 }
104 >>> d = { l: 1, tolocal(u2): 2 }
83 >>> len(d) # no collision
105 >>> len(d) # no collision
84 2
106 2
85 >>> 'foo: ?' in d
107 >>> 'foo: ?' in d
86 False
108 False
87 >>> l1 = 'foo: \\xe4' # historical latin1 fallback
109 >>> l1 = 'foo: \\xe4' # historical latin1 fallback
88 >>> l = tolocal(l1)
110 >>> l = tolocal(l1)
89 >>> l
111 >>> l
90 'foo: ?'
112 'foo: ?'
91 >>> fromlocal(l) # magically in utf-8
113 >>> fromlocal(l) # magically in utf-8
92 'foo: \\xc3\\xa4'
114 'foo: \\xc3\\xa4'
93 """
115 """
94
116
95 try:
117 try:
96 try:
118 try:
97 # make sure string is actually stored in UTF-8
119 # make sure string is actually stored in UTF-8
98 u = s.decode('UTF-8')
120 u = s.decode('UTF-8')
99 if encoding == 'UTF-8':
121 if encoding == 'UTF-8':
100 # fast path
122 # fast path
101 return s
123 return s
102 r = u.encode(encoding, "replace")
124 r = u.encode(encoding, "replace")
103 if u == r.decode(encoding):
125 if u == r.decode(encoding):
104 # r is a safe, non-lossy encoding of s
126 # r is a safe, non-lossy encoding of s
105 return r
127 return r
106 return localstr(s, r)
128 return localstr(s, r)
107 except UnicodeDecodeError:
129 except UnicodeDecodeError:
108 # we should only get here if we're looking at an ancient changeset
130 # we should only get here if we're looking at an ancient changeset
109 try:
131 try:
110 u = s.decode(fallbackencoding)
132 u = s.decode(fallbackencoding)
111 r = u.encode(encoding, "replace")
133 r = u.encode(encoding, "replace")
112 if u == r.decode(encoding):
134 if u == r.decode(encoding):
113 # r is a safe, non-lossy encoding of s
135 # r is a safe, non-lossy encoding of s
114 return r
136 return r
115 return localstr(u.encode('UTF-8'), r)
137 return localstr(u.encode('UTF-8'), r)
116 except UnicodeDecodeError:
138 except UnicodeDecodeError:
117 u = s.decode("utf-8", "replace") # last ditch
139 u = s.decode("utf-8", "replace") # last ditch
118 return u.encode(encoding, "replace") # can't round-trip
140 return u.encode(encoding, "replace") # can't round-trip
119 except LookupError, k:
141 except LookupError, k:
120 raise error.Abort(k, hint="please check your locale settings")
142 raise error.Abort(k, hint="please check your locale settings")
121
143
122 def fromlocal(s):
144 def fromlocal(s):
123 """
145 """
124 Convert a string from the local character encoding to UTF-8
146 Convert a string from the local character encoding to UTF-8
125
147
126 We attempt to decode strings using the encoding mode set by
148 We attempt to decode strings using the encoding mode set by
127 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
149 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
128 characters will cause an error message. Other modes include
150 characters will cause an error message. Other modes include
129 'replace', which replaces unknown characters with a special
151 'replace', which replaces unknown characters with a special
130 Unicode character, and 'ignore', which drops the character.
152 Unicode character, and 'ignore', which drops the character.
131 """
153 """
132
154
133 # can we do a lossless round-trip?
155 # can we do a lossless round-trip?
134 if isinstance(s, localstr):
156 if isinstance(s, localstr):
135 return s._utf8
157 return s._utf8
136
158
137 try:
159 try:
138 return s.decode(encoding, encodingmode).encode("utf-8")
160 return s.decode(encoding, encodingmode).encode("utf-8")
139 except UnicodeDecodeError, inst:
161 except UnicodeDecodeError, inst:
140 sub = s[max(0, inst.start - 10):inst.start + 10]
162 sub = s[max(0, inst.start - 10):inst.start + 10]
141 raise error.Abort("decoding near '%s': %s!" % (sub, inst))
163 raise error.Abort("decoding near '%s': %s!" % (sub, inst))
142 except LookupError, k:
164 except LookupError, k:
143 raise error.Abort(k, hint="please check your locale settings")
165 raise error.Abort(k, hint="please check your locale settings")
144
166
# East-asian width classes counted as two display columns; setting
# HGENCODINGAMBIGUOUS to 'wide' additionally treats the Ambiguous ('A')
# class as wide.
if os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide":
    wide = "WFA"
else:
    wide = "WF"
148
170
def colwidth(s):
    "Find the column width of a string for display in the local encoding"
    # decode leniently; undecodable bytes still occupy one cell each
    d = s.decode(encoding, 'replace')
    return ucolwidth(d)
152
174
def ucolwidth(d):
    "Find the column width of a Unicode string for display"
    # east_asian_width is missing on some minimal Python builds
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        return len(d)
    # characters in a 'wide' class take two terminal cells, others one
    return sum(eaw(c) in wide and 2 or 1 for c in d)
159
181
def getcols(s, start, c):
    '''Use colwidth to find a c-column substring of s starting at byte
    index start'''
    # grow the candidate one byte at a time until it spans c columns
    for end in xrange(start + c, len(s)):
        candidate = s[start:end]
        if colwidth(candidate) == c:
            return candidate
167
189
def trim(s, width, ellipsis='', leftside=False):
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    >>> ellipsis = '+++'
    >>> from mercurial import encoding
    >>> encoding.encoding = 'utf-8'
    >>> t= '1234567890'
    >>> print trim(t, 12, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 10, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 8, ellipsis=ellipsis)
    12345+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++67890
    >>> print trim(t, 8)
    12345678
    >>> print trim(t, 8, leftside=True)
    34567890
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(encoding.encoding)
    >>> print trim(t, 12, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 10, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 8, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 5)
    \xe3\x81\x82\xe3\x81\x84
    >>> print trim(t, 5, leftside=True)
    \xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 4, ellipsis=ellipsis)
    +++
    >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
    +++
    >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
    >>> print trim(t, 12, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 10, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 8, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\x66\x77\x88\x99\xaa
    >>> print trim(t, 8)
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> print trim(t, 8, leftside=True)
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    """
    try:
        u = s.decode(encoding)
    except UnicodeDecodeError:
        # undecodable: fall back to trimming by byte count
        if len(s) <= width: # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0: # no enough room even for ellipsis
            return ellipsis[:width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width: # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0: # no enough room even for ellipsis
        return ellipsis[:width + len(ellipsis)]

    # drop characters from the trimmed side one at a time until the
    # remainder fits in the available columns
    for ndropped in xrange(1, len(u)):
        if leftside:
            usub = u[ndropped:]
        else:
            usub = u[:-ndropped]
        if ucolwidth(usub) <= width:
            if leftside:
                return ellipsis + usub.encode(encoding)
            return usub.encode(encoding) + ellipsis
    return ellipsis # no enough room for multi-column characters
260
282
261 def _asciilower(s):
283 def _asciilower(s):
262 '''convert a string to lowercase if ASCII
284 '''convert a string to lowercase if ASCII
263
285
264 Raises UnicodeDecodeError if non-ASCII characters are found.'''
286 Raises UnicodeDecodeError if non-ASCII characters are found.'''
265 s.decode('ascii')
287 s.decode('ascii')
266 return s.lower()
288 return s.lower()
267
289
def asciilower(s):
    # deferred import: a top-level import of "parsers" would create a
    # cycle in the pure Python build
    # (util => i18n => encoding => parsers => util)
    import parsers
    # prefer the C implementation when available, then replace this
    # function with it so later calls skip the lookup entirely
    fn = getattr(parsers, 'asciilower', _asciilower)
    global asciilower
    asciilower = fn
    return fn(s)
276
298
277 def lower(s):
299 def lower(s):
278 "best-effort encoding-aware case-folding of local string s"
300 "best-effort encoding-aware case-folding of local string s"
279 try:
301 try:
280 return asciilower(s)
302 return asciilower(s)
281 except UnicodeDecodeError:
303 except UnicodeDecodeError:
282 pass
304 pass
283 try:
305 try:
284 if isinstance(s, localstr):
306 if isinstance(s, localstr):
285 u = s._utf8.decode("utf-8")
307 u = s._utf8.decode("utf-8")
286 else:
308 else:
287 u = s.decode(encoding, encodingmode)
309 u = s.decode(encoding, encodingmode)
288
310
289 lu = u.lower()
311 lu = u.lower()
290 if u == lu:
312 if u == lu:
291 return s # preserve localstring
313 return s # preserve localstring
292 return lu.encode(encoding)
314 return lu.encode(encoding)
293 except UnicodeError:
315 except UnicodeError:
294 return s.lower() # we don't know how to fold this except in ASCII
316 return s.lower() # we don't know how to fold this except in ASCII
295 except LookupError, k:
317 except LookupError, k:
296 raise error.Abort(k, hint="please check your locale settings")
318 raise error.Abort(k, hint="please check your locale settings")
297
319
298 def upper(s):
320 def upper(s):
299 "best-effort encoding-aware case-folding of local string s"
321 "best-effort encoding-aware case-folding of local string s"
300 try:
322 try:
301 s.decode('ascii') # throw exception for non-ASCII character
323 s.decode('ascii') # throw exception for non-ASCII character
302 return s.upper()
324 return s.upper()
303 except UnicodeDecodeError:
325 except UnicodeDecodeError:
304 pass
326 pass
305 try:
327 try:
306 if isinstance(s, localstr):
328 if isinstance(s, localstr):
307 u = s._utf8.decode("utf-8")
329 u = s._utf8.decode("utf-8")
308 else:
330 else:
309 u = s.decode(encoding, encodingmode)
331 u = s.decode(encoding, encodingmode)
310
332
311 uu = u.upper()
333 uu = u.upper()
312 if u == uu:
334 if u == uu:
313 return s # preserve localstring
335 return s # preserve localstring
314 return uu.encode(encoding)
336 return uu.encode(encoding)
315 except UnicodeError:
337 except UnicodeError:
316 return s.upper() # we don't know how to fold this except in ASCII
338 return s.upper() # we don't know how to fold this except in ASCII
317 except LookupError, k:
339 except LookupError, k:
318 raise error.Abort(k, hint="please check your locale settings")
340 raise error.Abort(k, hint="please check your locale settings")
319
341
# char -> JSON-escaped representation, built lazily by jsonescape()
_jsonmap = {}

def jsonescape(s):
    '''returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape('this is a test')
    'this is a test'
    >>> jsonescape('escape characters: \\0 \\x0b \\t \\n \\r \\" \\\\')
    'escape characters: \\\\u0000 \\\\u000b \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
    >>> jsonescape('a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape('utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape('')
    ''
    '''

    if not _jsonmap:
        # control characters must be \uXXXX-escaped; write the backslash
        # explicitly ('\\u') instead of relying on Python 2 silently
        # leaving the invalid '\u' escape literal in a byte string (the
        # latter is a syntax error under Python 3 and warned by -3)
        for x in xrange(32):
            _jsonmap[chr(x)] = '\\u%04x' % x
        # all other bytes pass through unchanged
        for x in xrange(32, 256):
            c = chr(x)
            _jsonmap[c] = c
        # characters with dedicated short JSON escapes
        _jsonmap['\t'] = '\\t'
        _jsonmap['\n'] = '\\n'
        _jsonmap['\"'] = '\\"'
        _jsonmap['\\'] = '\\\\'
        _jsonmap['\b'] = '\\b'
        _jsonmap['\f'] = '\\f'
        _jsonmap['\r'] = '\\r'

    return ''.join(_jsonmap[c] for c in toutf8b(s))
362
384
def toutf8b(s):
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    if isinstance(s, localstr):
        # cached lossless UTF-8 form
        return s._utf8

    try:
        s.decode('utf-8')
        return s # already valid UTF-8 (or ASCII)
    except UnicodeDecodeError:
        # surrogate-encode any characters that don't round-trip
        kept = s.decode('utf-8', 'ignore').encode('utf-8')
        out = []
        pos = 0
        for c in s:
            if kept[pos:pos + 1] == c:
                # this byte survived the decode/encode round-trip
                out.append(c)
                pos += 1
            else:
                # non-UTF-8 byte: map it into the U+DCxx surrogate range
                out.append(unichr(0xdc00 + ord(c)).encode('utf-8'))
        return ''.join(out)
411
433
def fromutf8b(s):
    '''Given a UTF-8b string, return a local, possibly-binary string.

    Bytes that were surrogate-escaped by toutf8b come back out as the
    original binary string. This is a round-trip process for strings
    like filenames, but metadata that was passed through tolocal will
    remain in UTF-8.

    >>> m = "\\xc3\\xa9\\x99abcd"
    >>> n = toutf8b(m)
    >>> n
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> fromutf8b(n) == m
    True
    '''

    # fast path - look for uDxxx prefixes in s: every escaped byte's
    # UTF-8 encoding begins with 0xed, so its absence means no work
    if "\xed" not in s:
        return s

    u = s.decode("utf-8")
    out = []
    for c in u:
        if ord(c) & 0xff00 == 0xdc00:
            # surrogate-escaped: recover the original byte value
            out.append(chr(ord(c) & 0xff))
        else:
            out.append(c.encode("utf-8"))
    return ''.join(out)
General Comments 0
You need to be logged in to leave comments. Login now