##// END OF EJS Templates
encoding: define an enum that specifies what normcase does to ASCII strings...
Siddharth Agarwal -
r24593:f473a1fe default
parent child Browse files
Show More
@@ -1,475 +1,488
1 # encoding.py - character transcoding support for Mercurial
1 # encoding.py - character transcoding support for Mercurial
2 #
2 #
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8 import error
8 import error
9 import unicodedata, locale, os
9 import unicodedata, locale, os
10
10
# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity. Each list entry is the UTF-8 encoding of one such codepoint
# (zero-width joiners/marks, directional controls, and U+FEFF).
_ignore = [unichr(int(x, 16)).encode("utf-8") for x in
           "200c 200d 200e 200f 202a 202b 202c 202d 202e "
           "206a 206b 206c 206d 206e 206f feff".split()]
# verify the next function will work: hfsignoreclean's fast path only
# scans for the lead bytes "\xe2" and "\xef", so every ignored UTF-8
# sequence must begin with one of them
assert set([i[0] for i in _ignore]) == set(["\xe2", "\xef"])
19
19
def hfsignoreclean(s):
    """Remove codepoints ignored by HFS+ from s.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # fast path: every ignored UTF-8 sequence starts with "\xe2" or "\xef"
    # (asserted above), so strings without those bytes need no work
    if "\xe2" not in s and "\xef" not in s:
        return s
    for seq in _ignore:
        s = s.replace(seq, '')
    return s
32
32
def _getpreferredencoding():
    '''
    On darwin, getpreferredencoding ignores the locale environment and
    always returns mac-roman. http://bugs.python.org/issue6202 fixes this
    for Python 2.7 and up. This is the same corrected code for earlier
    Python versions.

    However, we can't use a version check for this method, as some distributions
    patch Python to fix this. Instead, we use it as a 'fixer' for the mac-roman
    encoding, as it is unlikely that this encoding is the actually expected.
    '''
    try:
        locale.CODESET
    except AttributeError:
        # Fall back to parsing environment variables :-(
        return locale.getdefaultlocale()[1]

    oldloc = locale.setlocale(locale.LC_CTYPE)
    try:
        # the codeset can only be queried after switching LC_CTYPE to the
        # user's environment locale
        locale.setlocale(locale.LC_CTYPE, "")
        result = locale.nl_langinfo(locale.CODESET)
    finally:
        # always restore the previous LC_CTYPE, even if the probe above
        # raised (e.g. locale.Error from an unsupported locale setting);
        # otherwise the process would keep the wrong locale
        locale.setlocale(locale.LC_CTYPE, oldloc)

    return result
56
56
# Map of problematic encoding names to a zero-argument callable producing
# the encoding to use instead. '646' and 'ANSI_X3.4-1968' are aliases for
# plain ASCII; 'mac-roman' is usually a symptom of the darwin
# getpreferredencoding bug handled by _getpreferredencoding above.
_encodingfixers = {
    '646': lambda: 'ascii',
    'ANSI_X3.4-1968': lambda: 'ascii',
    'mac-roman': _getpreferredencoding
}
62
62
try:
    # HGENCODING overrides any locale-derived encoding; the fixer table is
    # only consulted when the encoding came from the locale
    encoding = os.environ.get("HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding() or 'ascii'
        encoding = _encodingfixers.get(encoding, lambda: encoding)()
except locale.Error:
    encoding = 'ascii'
# error-handler name passed to encode/decode: 'strict', 'replace' or 'ignore'
encodingmode = os.environ.get("HGENCODINGMODE", "strict")
# secondary encoding tried by tolocal() for repos predating locale support
fallbackencoding = 'ISO-8859-1'
72
72
class localstr(str):
    '''This class allows strings that are unmodified to be
    round-tripped to the local encoding and back'''
    def __new__(cls, u, l):
        # the string value itself is the local form; the UTF-8 original
        # is stashed on the instance for lossless recovery
        instance = str.__new__(cls, l)
        instance._utf8 = u
        return instance
    def __hash__(self):
        # hash the UTF-8 form so two distinct UTF-8 strings that map to
        # the same lossy local form don't collide in dicts/sets
        return hash(self._utf8)
82
82
def tolocal(s):
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    >>> u = 'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = 'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> 'foo: ?' in d
    False
    >>> l1 = 'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    try:
        try:
            # make sure string is actually stored in UTF-8
            u = s.decode('UTF-8')
            if encoding == 'UTF-8':
                # fast path
                return s
            r = u.encode(encoding, "replace")
            if u == r.decode(encoding):
                # r is a safe, non-lossy encoding of s
                return r
            # lossy conversion: keep the UTF-8 original cached on the
            # localstr so fromlocal() can recover it exactly
            return localstr(s, r)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                u = s.decode(fallbackencoding)
                r = u.encode(encoding, "replace")
                if u == r.decode(encoding):
                    # r is a safe, non-lossy encoding of s
                    return r
                # cache the UTF-8 re-encoding of the latin-1 input
                return localstr(u.encode('UTF-8'), r)
            except UnicodeDecodeError:
                u = s.decode("utf-8", "replace") # last ditch
                return u.encode(encoding, "replace") # can't round-trip
    except LookupError, k:
        # the configured encoding name is unknown to Python's codec registry
        raise error.Abort(k, hint="please check your locale settings")
143
143
def fromlocal(s):
    """
    Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.
    """

    # can we do a lossless round-trip?
    if isinstance(s, localstr):
        return s._utf8

    try:
        return s.decode(encoding, encodingmode).encode("utf-8")
    except UnicodeDecodeError, inst:
        # show ~20 bytes of context around the first undecodable byte
        sub = s[max(0, inst.start - 10):inst.start + 10]
        raise error.Abort("decoding near '%s': %s!" % (sub, inst))
    except LookupError, k:
        # the configured encoding name is unknown to Python's codec registry
        raise error.Abort(k, hint="please check your locale settings")
166
166
# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
# The value is the set of unicodedata.east_asian_width() categories that
# ucolwidth() counts as two columns: W(ide), F(ullwidth), and optionally
# A(mbiguous) when HGENCODINGAMBIGUOUS=wide.
wide = (os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
        and "WFA" or "WF")
170
170
def colwidth(s):
    "Find the column width of a string for display in the local encoding"
    # undecodable bytes are replaced, each counting as one narrow column
    decoded = s.decode(encoding, 'replace')
    return ucolwidth(decoded)
174
174
def ucolwidth(d):
    "Find the column width of a Unicode string for display"
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        # very old Pythons lack east_asian_width; assume one column each
        return len(d)
    return sum([2 if eaw(ch) in wide else 1 for ch in d])
181
181
def getcols(s, start, c):
    '''Use colwidth to find a c-column substring of s starting at byte
    index start

    Returns None if no prefix of s[start:] is exactly c columns wide.'''
    # a c-column substring is at least c bytes long, so start scanning there
    for x in xrange(start + c, len(s)):
        t = s[start:x]
        if colwidth(t) == c:
            return t
189
189
def trim(s, width, ellipsis='', leftside=False):
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    >>> ellipsis = '+++'
    >>> from mercurial import encoding
    >>> encoding.encoding = 'utf-8'
    >>> t= '1234567890'
    >>> print trim(t, 12, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 10, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 8, ellipsis=ellipsis)
    12345+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++67890
    >>> print trim(t, 8)
    12345678
    >>> print trim(t, 8, leftside=True)
    34567890
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(encoding.encoding)
    >>> print trim(t, 12, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 10, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 8, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 5)
    \xe3\x81\x82\xe3\x81\x84
    >>> print trim(t, 5, leftside=True)
    \xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 4, ellipsis=ellipsis)
    +++
    >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
    +++
    >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
    >>> print trim(t, 12, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 10, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 8, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\x66\x77\x88\x99\xaa
    >>> print trim(t, 8)
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> print trim(t, 8, leftside=True)
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    """
    try:
        u = s.decode(encoding)
    except UnicodeDecodeError:
        # undecodable in the local encoding: fall back to trimming by
        # bytes, treating each byte as one column
        if len(s) <= width: # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0: # no enough room even for ellipsis
            return ellipsis[:width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width: # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0: # no enough room even for ellipsis
        return ellipsis[:width + len(ellipsis)]

    # drop characters one at a time from the trimmed side until the
    # remaining slice fits; uslice/concat encode which side that is
    if leftside:
        uslice = lambda i: u[i:]
        concat = lambda s: ellipsis + s
    else:
        uslice = lambda i: u[:-i]
        concat = lambda s: s + ellipsis
    for i in xrange(1, len(u)):
        usub = uslice(i)
        if ucolwidth(usub) <= width:
            return concat(usub.encode(encoding))
    return ellipsis # no enough room for multi-column characters
282
282
def _asciilower(s):
    '''convert a string to lowercase if ASCII

    Raises UnicodeDecodeError if non-ASCII characters are found.'''
    # validation only -- the decoded result is thrown away
    s.decode('ascii')
    return s.lower()
289
289
def asciilower(s):
    # delay importing avoids cyclic dependency around "parsers" in
    # pure Python build (util => i18n => encoding => parsers => util)
    import parsers
    # prefer the C implementation when available
    impl = getattr(parsers, 'asciilower', _asciilower)
    # self-replacing function: rebind the module-level name so the import
    # and getattr only happen on the first call
    global asciilower
    asciilower = impl
    return impl(s)
298
298
def _asciiupper(s):
    '''convert a string to uppercase if ASCII

    Raises UnicodeDecodeError if non-ASCII characters are found.'''
    # validation only -- the decoded result is thrown away
    s.decode('ascii')
    return s.upper()
305
305
def asciiupper(s):
    # delay importing avoids cyclic dependency around "parsers" in
    # pure Python build (util => i18n => encoding => parsers => util)
    import parsers
    # prefer the C implementation when available
    impl = getattr(parsers, 'asciiupper', _asciiupper)
    # self-replacing function: rebind the module-level name so the import
    # and getattr only happen on the first call
    global asciiupper
    asciiupper = impl
    return impl(s)
314
314
def lower(s):
    "best-effort encoding-aware case-folding of local string s"
    # ASCII fast path; falls through on any non-ASCII byte
    try:
        return asciilower(s)
    except UnicodeDecodeError:
        pass
    try:
        # decode (preferring the cached UTF-8 form of a localstr), then
        # fold in Unicode space
        if isinstance(s, localstr):
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(encoding, encodingmode)

        lu = u.lower()
        if u == lu:
            return s # preserve localstring
        return lu.encode(encoding)
    except UnicodeError:
        return s.lower() # we don't know how to fold this except in ASCII
    except LookupError, k:
        raise error.Abort(k, hint="please check your locale settings")
335
335
def upper(s):
    "best-effort encoding-aware case-folding of local string s"
    # ASCII fast path; falls through on any non-ASCII byte
    try:
        return asciiupper(s)
    except UnicodeDecodeError:
        pass
    try:
        # decode (preferring the cached UTF-8 form of a localstr), then
        # fold in Unicode space
        if isinstance(s, localstr):
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(encoding, encodingmode)

        uu = u.upper()
        if u == uu:
            return s # preserve localstring
        return uu.encode(encoding)
    except UnicodeError:
        return s.upper() # we don't know how to fold this except in ASCII
    except LookupError, k:
        raise error.Abort(k, hint="please check your locale settings")
356
356
class normcasespecs(object):
    '''what a platform's normcase does to ASCII strings

    This is specified per platform, and should be consistent with what normcase
    on that platform actually does.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called'''
    # enum-style class attributes; the values themselves are arbitrary
    # but must stay distinct
    other = 0
    lower = -1
    upper = 1
369
# byte -> JSON-escaped-text table, built lazily by jsonescape()
_jsonmap = {}

def jsonescape(s):
    '''returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape('this is a test')
    'this is a test'
    >>> jsonescape('escape characters: \\0 \\x0b \\t \\n \\r \\" \\\\')
    'escape characters: \\\\u0000 \\\\u000b \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
    >>> jsonescape('a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape('utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape('')
    ''
    '''

    if not _jsonmap:
        # control characters become \uXXXX escapes; note this is a byte
        # string, so "\u" is a literal backslash-u, not a unicode escape
        for x in xrange(32):
            _jsonmap[chr(x)] = "\u%04x" %x
        # all other bytes pass through unchanged...
        for x in xrange(32, 256):
            c = chr(x)
            _jsonmap[c] = c
        # ...except the characters with dedicated JSON short escapes
        _jsonmap['\t'] = '\\t'
        _jsonmap['\n'] = '\\n'
        _jsonmap['\"'] = '\\"'
        _jsonmap['\\'] = '\\\\'
        _jsonmap['\b'] = '\\b'
        _jsonmap['\f'] = '\\f'
        _jsonmap['\r'] = '\\r'

    return ''.join(_jsonmap[c] for c in toutf8b(s))
399
412
def toutf8b(s):
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    if isinstance(s, localstr):
        return s._utf8

    try:
        s.decode('utf-8')
        return s
    except UnicodeDecodeError:
        # surrogate-encode any characters that don't round-trip
        # s2 contains only the bytes of s that decode cleanly as UTF-8
        s2 = s.decode('utf-8', 'ignore').encode('utf-8')
        r = ""
        pos = 0
        for c in s:
            if s2[pos:pos + 1] == c:
                # this byte survived the lossy decode: copy it verbatim
                r += c
                pos += 1
            else:
                # offending byte 0xXX becomes codepoint U+DCXX, emitted
                # here as its 3-byte UTF-8 sequence
                r += unichr(0xdc00 + ord(c)).encode('utf-8')
        return r
448
461
def fromutf8b(s):
    '''Given a UTF-8b string, return a local, possibly-binary string.

    Surrogate-encoded bytes are mapped back to their original values, so
    this is a round-trip process for strings like filenames, but metadata
    that was passed through tolocal will remain in UTF-8.

    >>> m = "\\xc3\\xa9\\x99abcd"
    >>> n = toutf8b(m)
    >>> n
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> fromutf8b(n) == m
    True
    '''

    # fast path - look for uDxxx prefixes in s
    # ("\xed" is the first byte of the UTF-8 encoding of U+D000-U+DFFF)
    if "\xed" not in s:
        return s

    u = s.decode("utf-8")
    r = ""
    for c in u:
        if ord(c) & 0xff00 == 0xdc00:
            # surrogate U+DCXX: recover the original byte 0xXX
            r += chr(ord(c) & 0xff)
        else:
            r += c.encode("utf-8")
    return r
General Comments 0
You need to be logged in to leave comments. Login now