##// END OF EJS Templates
encoding.upper: factor out fallback code...
Siddharth Agarwal -
r24597:b4258d5a default
parent child Browse files
Show More
@@ -1,488 +1,490
1 # encoding.py - character transcoding support for Mercurial
1 # encoding.py - character transcoding support for Mercurial
2 #
2 #
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8 import error
8 import error
9 import unicodedata, locale, os
9 import unicodedata, locale, os
10
10
# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
# Each entry is the UTF-8 byte sequence for one ignorable codepoint.
_ignore = [unichr(int(x, 16)).encode("utf-8") for x in
           "200c 200d 200e 200f 202a 202b 202c 202d 202e "
           "206a 206b 206c 206d 206e 206f feff".split()]
# verify the next function will work
# (hfsignoreclean relies on every ignorable sequence starting with
# either 0xe2 or 0xef for its fast path)
assert set([i[0] for i in _ignore]) == set(["\xe2", "\xef"])
19
19
def hfsignoreclean(s):
    """Remove codepoints ignored by HFS+ from s.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # fast path: every ignorable UTF-8 sequence begins with 0xe2 or 0xef
    # (asserted next to _ignore above), so most strings skip the loop
    if "\xe2" not in s and "\xef" not in s:
        return s
    for junk in _ignore:
        s = s.replace(junk, '')
    return s
32
32
def _getpreferredencoding():
    '''
    On darwin, getpreferredencoding ignores the locale environment and
    always returns mac-roman. http://bugs.python.org/issue6202 fixes this
    for Python 2.7 and up. This is the same corrected code for earlier
    Python versions.

    However, we can't use a version check for this method, as some distributions
    patch Python to fix this. Instead, we use it as a 'fixer' for the mac-roman
    encoding, as it is unlikely that this encoding is the actually expected.
    '''
    try:
        locale.CODESET
    except AttributeError:
        # no nl_langinfo/CODESET support on this platform: fall back to
        # parsing environment variables :-(
        return locale.getdefaultlocale()[1]

    # temporarily switch LC_CTYPE to the user's configured locale, read
    # the codeset from it, then restore the previous setting
    saved = locale.setlocale(locale.LC_CTYPE)
    locale.setlocale(locale.LC_CTYPE, "")
    codeset = locale.nl_langinfo(locale.CODESET)
    locale.setlocale(locale.LC_CTYPE, saved)
    return codeset
56
56
# Map of encoding names that need correcting to a callable returning the
# encoding to use instead.  '646' and 'ANSI_X3.4-1968' are aliases for
# ASCII; 'mac-roman' is handled by _getpreferredencoding (see its
# docstring for why darwin reports it incorrectly).
_encodingfixers = {
    '646': lambda: 'ascii',
    'ANSI_X3.4-1968': lambda: 'ascii',
    'mac-roman': _getpreferredencoding
}

try:
    # HGENCODING overrides the locale-derived encoding entirely
    encoding = os.environ.get("HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding() or 'ascii'
        encoding = _encodingfixers.get(encoding, lambda: encoding)()
except locale.Error:
    encoding = 'ascii'
# error-handler name for decoding: 'strict', 'replace' or 'ignore'
encodingmode = os.environ.get("HGENCODINGMODE", "strict")
# encoding tried for pre-locale-support repo data (see tolocal)
fallbackencoding = 'ISO-8859-1'
72
72
class localstr(str):
    '''This class allows strings that are unmodified to be
    round-tripped to the local encoding and back'''
    def __new__(cls, u, l):
        # the string value itself is the locally-encoded form; the
        # known-good UTF-8 form rides along as an attribute
        inst = str.__new__(cls, l)
        inst._utf8 = u
        return inst
    def __hash__(self):
        # hash on the UTF-8 form to avoid collisions in local string space
        return hash(self._utf8)
82
82
def tolocal(s):
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    >>> u = 'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = 'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> 'foo: ?' in d
    False
    >>> l1 = 'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    try:
        try:
            # make sure string is actually stored in UTF-8
            u = s.decode('UTF-8')
            if encoding == 'UTF-8':
                # fast path
                return s
            r = u.encode(encoding, "replace")
            if u == r.decode(encoding):
                # r is a safe, non-lossy encoding of s
                return r
            # lossy: keep the UTF-8 original cached on the result so
            # fromlocal can round-trip it
            return localstr(s, r)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                u = s.decode(fallbackencoding)
                r = u.encode(encoding, "replace")
                if u == r.decode(encoding):
                    # r is a safe, non-lossy encoding of s
                    return r
                return localstr(u.encode('UTF-8'), r)
            except UnicodeDecodeError:
                u = s.decode("utf-8", "replace") # last ditch
                return u.encode(encoding, "replace") # can't round-trip
    except LookupError, k:
        # the configured encoding name is unknown to the codecs machinery
        raise error.Abort(k, hint="please check your locale settings")
143
143
def fromlocal(s):
    """
    Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.
    """

    # can we do a lossless round-trip?
    # (localstr carries the original UTF-8 produced by tolocal)
    if isinstance(s, localstr):
        return s._utf8

    try:
        return s.decode(encoding, encodingmode).encode("utf-8")
    except UnicodeDecodeError, inst:
        # show a little context around the offending byte in the error
        sub = s[max(0, inst.start - 10):inst.start + 10]
        raise error.Abort("decoding near '%s': %s!" % (sub, inst))
    except LookupError, k:
        # the configured encoding name is unknown to the codecs machinery
        raise error.Abort(k, hint="please check your locale settings")
166
166
# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
# ucolwidth counts a character as two columns when its East Asian width
# class (one-letter code from unicodedata.east_asian_width) appears in
# this string.
wide = (os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
        and "WFA" or "WF")
170
170
def colwidth(s):
    "Find the column width of a string for display in the local encoding"
    # undecodable bytes become U+FFFD, which still has a defined width
    decoded = s.decode(encoding, 'replace')
    return ucolwidth(decoded)
174
174
def ucolwidth(d):
    "Find the column width of a Unicode string for display"
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        # no East Asian width data available: assume one column each
        return len(d)
    return sum([2 if eaw(c) in wide else 1 for c in d])
181
181
def getcols(s, start, c):
    '''Use colwidth to find a c-column substring of s starting at byte
    index start'''
    # a c-column substring needs at least c bytes, so begin there and
    # grow one byte at a time; returns None if no such substring exists
    end = start + c
    while end < len(s):
        candidate = s[start:end]
        if colwidth(candidate) == c:
            return candidate
        end += 1
189
189
def trim(s, width, ellipsis='', leftside=False):
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    >>> ellipsis = '+++'
    >>> from mercurial import encoding
    >>> encoding.encoding = 'utf-8'
    >>> t= '1234567890'
    >>> print trim(t, 12, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 10, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 8, ellipsis=ellipsis)
    12345+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++67890
    >>> print trim(t, 8)
    12345678
    >>> print trim(t, 8, leftside=True)
    34567890
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(encoding.encoding)
    >>> print trim(t, 12, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 10, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 8, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 5)
    \xe3\x81\x82\xe3\x81\x84
    >>> print trim(t, 5, leftside=True)
    \xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 4, ellipsis=ellipsis)
    +++
    >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
    +++
    >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
    >>> print trim(t, 12, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 10, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 8, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\x66\x77\x88\x99\xaa
    >>> print trim(t, 8)
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> print trim(t, 8, leftside=True)
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    """
    try:
        u = s.decode(encoding)
    except UnicodeDecodeError:
        # undecodable input: fall back to byte-based trimming, treating
        # each byte as one column
        if len(s) <= width: # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0: # no enough room even for ellipsis
            return ellipsis[:width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width: # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0: # no enough room even for ellipsis
        return ellipsis[:width + len(ellipsis)]

    # drop characters from the trimmed side one at a time until the
    # remainder fits in the available columns
    if leftside:
        uslice = lambda i: u[i:]
        concat = lambda s: ellipsis + s
    else:
        uslice = lambda i: u[:-i]
        concat = lambda s: s + ellipsis
    for i in xrange(1, len(u)):
        usub = uslice(i)
        if ucolwidth(usub) <= width:
            return concat(usub.encode(encoding))
    return ellipsis # no enough room for multi-column characters
282
282
283 def _asciilower(s):
283 def _asciilower(s):
284 '''convert a string to lowercase if ASCII
284 '''convert a string to lowercase if ASCII
285
285
286 Raises UnicodeDecodeError if non-ASCII characters are found.'''
286 Raises UnicodeDecodeError if non-ASCII characters are found.'''
287 s.decode('ascii')
287 s.decode('ascii')
288 return s.lower()
288 return s.lower()
289
289
def asciilower(s):
    # importing here rather than at module level avoids a cyclic
    # dependency around "parsers" in the pure Python build
    # (util => i18n => encoding => parsers => util)
    import parsers
    fn = getattr(parsers, 'asciilower', _asciilower)
    # memoize: rebind the module-level name to the resolved
    # implementation so later calls skip the import and lookup
    global asciilower
    asciilower = fn
    return fn(s)
298
298
299 def _asciiupper(s):
299 def _asciiupper(s):
300 '''convert a string to uppercase if ASCII
300 '''convert a string to uppercase if ASCII
301
301
302 Raises UnicodeDecodeError if non-ASCII characters are found.'''
302 Raises UnicodeDecodeError if non-ASCII characters are found.'''
303 s.decode('ascii')
303 s.decode('ascii')
304 return s.upper()
304 return s.upper()
305
305
def asciiupper(s):
    # importing here rather than at module level avoids a cyclic
    # dependency around "parsers" in the pure Python build
    # (util => i18n => encoding => parsers => util)
    import parsers
    fn = getattr(parsers, 'asciiupper', _asciiupper)
    # memoize: rebind the module-level name to the resolved
    # implementation so later calls skip the import and lookup
    global asciiupper
    asciiupper = fn
    return fn(s)
314
314
def lower(s):
    "best-effort encoding-aware case-folding of local string s"
    try:
        # common case: pure ASCII, no decoding needed
        return asciilower(s)
    except UnicodeDecodeError:
        pass
    try:
        if isinstance(s, localstr):
            # use the cached UTF-8 form from tolocal
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(encoding, encodingmode)

        lu = u.lower()
        if u == lu:
            return s # preserve localstring
        return lu.encode(encoding)
    except UnicodeError:
        return s.lower() # we don't know how to fold this except in ASCII
    except LookupError, k:
        # the configured encoding name is unknown to the codecs machinery
        raise error.Abort(k, hint="please check your locale settings")
335
335
def upper(s):
    "best-effort encoding-aware case-folding of local string s"
    try:
        # common case: pure ASCII, no decoding needed
        return asciiupper(s)
    except UnicodeDecodeError:
        return upperfallback(s)

def upperfallback(s):
    # encoding-aware uppercasing for non-ASCII input; factored out of
    # upper() so platform code can call it directly
    try:
        if isinstance(s, localstr):
            # use the cached UTF-8 form from tolocal
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(encoding, encodingmode)

        uu = u.upper()
        if u == uu:
            return s # preserve localstring
        return uu.encode(encoding)
    except UnicodeError:
        return s.upper() # we don't know how to fold this except in ASCII
    except LookupError, k:
        # the configured encoding name is unknown to the codecs machinery
        raise error.Abort(k, hint="please check your locale settings")
356
358
class normcasespecs(object):
    '''what a platform's normcase does to ASCII strings

    This is specified per platform, and should be consistent with what normcase
    on that platform actually does.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called'''
    # integer sentinels consumed by callers; distinct values only, the
    # magnitudes carry no meaning
    lower = -1
    upper = 1
    other = 0
369
371
# byte -> JSON-escaped-text table, populated lazily on first call to
# jsonescape()
_jsonmap = {}
371
373
def jsonescape(s):
    '''returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape('this is a test')
    'this is a test'
    >>> jsonescape('escape characters: \\0 \\x0b \\t \\n \\r \\" \\\\')
    'escape characters: \\\\u0000 \\\\u000b \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
    >>> jsonescape('a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape('utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape('')
    ''
    '''

    if not _jsonmap:
        # first call: build the byte -> escaped-text table.
        # control characters become \\uXXXX escapes; note that in a
        # Python 2 byte string "\u%04x" is a literal backslash-u, not
        # an escape sequence, which is exactly what JSON needs here.
        for x in xrange(32):
            _jsonmap[chr(x)] = "\u%04x" %x
        for x in xrange(32, 256):
            c = chr(x)
            _jsonmap[c] = c
        # short-form escapes mandated/allowed by JSON
        _jsonmap['\t'] = '\\t'
        _jsonmap['\n'] = '\\n'
        _jsonmap['\"'] = '\\"'
        _jsonmap['\\'] = '\\\\'
        _jsonmap['\b'] = '\\b'
        _jsonmap['\f'] = '\\f'
        _jsonmap['\r'] = '\\r'

    return ''.join(_jsonmap[c] for c in toutf8b(s))
412
414
def toutf8b(s):
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can have
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    if isinstance(s, localstr):
        # cached UTF-8 form from tolocal is already safe to emit
        return s._utf8

    try:
        s.decode('utf-8')
        return s
    except UnicodeDecodeError:
        # surrogate-encode any characters that don't round-trip.
        # s2 is s with all invalid byte sequences dropped; walking s
        # and s2 in lockstep identifies exactly the bytes that were
        # dropped, and each such byte is replaced by its U+DCxx
        # surrogate encoded as UTF-8.
        s2 = s.decode('utf-8', 'ignore').encode('utf-8')
        r = ""
        pos = 0
        for c in s:
            if s2[pos:pos + 1] == c:
                r += c
                pos += 1
            else:
                r += unichr(0xdc00 + ord(c)).encode('utf-8')
        return r
461
463
def fromutf8b(s):
    '''Given a UTF-8b string, return a local, possibly-binary string.

    return the original binary string. This
    is a round-trip process for strings like filenames, but metadata
    that's was passed through tolocal will remain in UTF-8.

    >>> m = "\\xc3\\xa9\\x99abcd"
    >>> n = toutf8b(m)
    >>> n
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> fromutf8b(n) == m
    True
    '''

    # fast path - look for uDxxx prefixes in s
    # (0xed is the first byte of the UTF-8 encoding of any U+Dxxx
    # surrogate, so its absence means no escapes are present)
    if "\xed" not in s:
        return s

    u = s.decode("utf-8")
    r = ""
    for c in u:
        if ord(c) & 0xff00 == 0xdc00:
            # U+DCxx surrogate: recover the original raw byte
            r += chr(ord(c) & 0xff)
        else:
            r += c.encode("utf-8")
    return r
General Comments 0
You need to be logged in to leave comments. Login now