##// END OF EJS Templates
encoding: extend test cases for utf8b...
Matt Mackall -
r26963:de5ae97c default
parent child Browse files
Show More
@@ -1,519 +1,525 b''
1 # encoding.py - character transcoding support for Mercurial
1 # encoding.py - character transcoding support for Mercurial
2 #
2 #
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8 import error
8 import error
9 import unicodedata, locale, os
9 import unicodedata, locale, os
10
10
# Codepoints silently discarded by HFS+ (Apple Technote 1150, "Unicode
# Subtleties"); we must strip them ourselves in a few places for sanity.
_ignoredcodepoints = ("200c 200d 200e 200f 202a 202b 202c 202d 202e "
                      "206a 206b 206c 206d 206e 206f feff").split()
_ignore = [unichr(int(cp, 16)).encode("utf-8") for cp in _ignoredcodepoints]
# sanity check: hfsignoreclean's fast-path prefix test relies on every
# ignorable sequence starting with one of exactly these two bytes
assert set(seq[0] for seq in _ignore) == set(["\xe2", "\xef"])
19
19
def hfsignoreclean(s):
    """Remove codepoints ignored by HFS+ from s.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # every ignorable UTF-8 sequence starts with \xe2 or \xef; when
    # neither byte appears we can skip the replace loop entirely
    if "\xe2" not in s and "\xef" not in s:
        return s
    for junk in _ignore:
        s = s.replace(junk, '')
    return s
32
32
def _getpreferredencoding():
    '''Return the locale's preferred encoding, working around a darwin bug.

    On darwin, getpreferredencoding ignores the locale environment and
    always returns mac-roman. http://bugs.python.org/issue6202 fixes this
    for Python 2.7 and up. This is the same corrected code for earlier
    Python versions.

    However, we can't use a version check for this method, as some
    distributions patch Python to fix this. Instead, we use it as a 'fixer'
    for the mac-roman encoding, as it is unlikely that this encoding is
    the one actually expected.
    '''
    if not hasattr(locale, 'CODESET'):
        # no nl_langinfo support; fall back to parsing environment
        # variables :-(
        return locale.getdefaultlocale()[1]

    saved = locale.setlocale(locale.LC_CTYPE)
    locale.setlocale(locale.LC_CTYPE, "")
    codeset = locale.nl_langinfo(locale.CODESET)
    locale.setlocale(locale.LC_CTYPE, saved)

    return codeset
56
56
# Known-bad names reported for the locale encoding, mapped to callables
# producing the value to use instead.
_encodingfixers = {
    '646': lambda: 'ascii',
    'ANSI_X3.4-1968': lambda: 'ascii',
    'mac-roman': _getpreferredencoding,
}

try:
    encoding = os.environ.get("HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding() or 'ascii'
    _fixer = _encodingfixers.get(encoding)
    if _fixer is not None:
        encoding = _fixer()
except locale.Error:
    encoding = 'ascii'
encodingmode = os.environ.get("HGENCODINGMODE", "strict")
fallbackencoding = 'ISO-8859-1'
72
72
class localstr(str):
    '''A str subclass that remembers the UTF-8 bytes it was derived from.

    The string value is the (possibly lossy) local-encoding form; the
    original UTF-8 bytes ride along in _utf8 so the conversion can be
    round-tripped losslessly.
    '''
    def __new__(cls, u, l):
        inst = str.__new__(cls, l)
        inst._utf8 = u
        return inst
    def __hash__(self):
        # hash the UTF-8 form so distinct UTF-8 inputs that collapse to
        # the same lossy local form do not collide in dicts/sets
        return hash(self._utf8)
82
82
def tolocal(s):
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    >>> u = 'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = 'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> 'foo: ?' in d
    False
    >>> l1 = 'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    try:
        try:
            # prove the input really is UTF-8
            uni = s.decode('UTF-8')
            if encoding == 'UTF-8':
                # fast path: no transcoding required
                return s
            loc = uni.encode(encoding, "replace")
            if uni == loc.decode(encoding):
                # loc re-decodes cleanly: the conversion lost nothing
                return loc
            # lossy conversion: cache the UTF-8 original alongside it
            return localstr(s, loc)
        except UnicodeDecodeError:
            # only reachable when looking at an ancient, pre-locale-support
            # changeset
            try:
                uni = s.decode(fallbackencoding)
                loc = uni.encode(encoding, "replace")
                if uni == loc.decode(encoding):
                    # loc is a safe, non-lossy encoding of s
                    return loc
                return localstr(uni.encode('UTF-8'), loc)
            except UnicodeDecodeError:
                uni = s.decode("utf-8", "replace") # last ditch
                return uni.encode(encoding, "replace") # can't round-trip
    except LookupError as k:
        # the configured encoding name itself is unknown to the codecs
        raise error.Abort(k, hint="please check your locale settings")
143
143
def fromlocal(s):
    """
    Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.
    """

    # localstr carries its original UTF-8 around: lossless shortcut
    if isinstance(s, localstr):
        return s._utf8

    try:
        return s.decode(encoding, encodingmode).encode("utf-8")
    except UnicodeDecodeError as err:
        # show a little context on each side of the offending byte
        context = s[max(0, err.start - 10):err.start + 10]
        raise error.Abort("decoding near '%s': %s!" % (context, err))
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
166
166
# How to treat ambiguous-width characters. Set HGENCODINGAMBIGUOUS to
# 'wide' to treat them as occupying two columns.
_ambiguous = os.environ.get("HGENCODINGAMBIGUOUS", "narrow")
wide = "WFA" if _ambiguous == "wide" else "WF"
170
170
def colwidth(s):
    """Return the display column width of local-encoding string s."""
    decoded = s.decode(encoding, 'replace')
    return ucolwidth(decoded)
174
174
def ucolwidth(d):
    """Return the display column width of Unicode string d."""
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        # no east-asian-width data available; assume one column per char
        return len(d)
    # characters whose width class is in 'wide' occupy two columns
    return sum([2 if eaw(c) in wide else 1 for c in d])
181
181
def getcols(s, start, c):
    '''Use colwidth to find a c-column substring of s starting at byte
    index start'''
    # grow the candidate byte range until it renders as exactly c columns
    for end in xrange(start + c, len(s)):
        candidate = s[start:end]
        if colwidth(candidate) == c:
            return candidate
189
189
def trim(s, width, ellipsis='', leftside=False):
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    >>> ellipsis = '+++'
    >>> from mercurial import encoding
    >>> encoding.encoding = 'utf-8'
    >>> t= '1234567890'
    >>> print trim(t, 12, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 10, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 8, ellipsis=ellipsis)
    12345+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++67890
    >>> print trim(t, 8)
    12345678
    >>> print trim(t, 8, leftside=True)
    34567890
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(encoding.encoding)
    >>> print trim(t, 12, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 10, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 8, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 5)
    \xe3\x81\x82\xe3\x81\x84
    >>> print trim(t, 5, leftside=True)
    \xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 4, ellipsis=ellipsis)
    +++
    >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
    +++
    >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
    >>> print trim(t, 12, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 10, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 8, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\x66\x77\x88\x99\xaa
    >>> print trim(t, 8)
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> print trim(t, 8, leftside=True)
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    """
    try:
        uni = s.decode(encoding)
    except UnicodeDecodeError:
        # undecodable input: fall back to trimming raw bytes, one column
        # per byte
        if len(s) <= width: # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0: # not even enough room for the ellipsis
            return ellipsis[:width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(uni) <= width: # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0: # not even enough room for the ellipsis
        return ellipsis[:width + len(ellipsis)]

    if leftside:
        def chop(i):
            return uni[i:]
        def attach(body):
            return ellipsis + body
    else:
        def chop(i):
            return uni[:-i]
        def attach(body):
            return body + ellipsis
    # drop characters one at a time from the trimmed side until the
    # remainder fits in the available columns
    for i in xrange(1, len(uni)):
        candidate = chop(i)
        if ucolwidth(candidate) <= width:
            return attach(candidate.encode(encoding))
    return ellipsis # not enough room even for one multi-column character
282
282
283 def _asciilower(s):
283 def _asciilower(s):
284 '''convert a string to lowercase if ASCII
284 '''convert a string to lowercase if ASCII
285
285
286 Raises UnicodeDecodeError if non-ASCII characters are found.'''
286 Raises UnicodeDecodeError if non-ASCII characters are found.'''
287 s.decode('ascii')
287 s.decode('ascii')
288 return s.lower()
288 return s.lower()
289
289
def asciilower(s):
    # delay importing avoids cyclic dependency around "parsers" in
    # pure Python build (util => i18n => encoding => parsers => util)
    import parsers
    # prefer the C implementation when available, then cache the chosen
    # implementation by rebinding this module-level name so subsequent
    # calls skip the lookup entirely
    fn = getattr(parsers, 'asciilower', _asciilower)
    global asciilower
    asciilower = fn
    return fn(s)
298
298
299 def _asciiupper(s):
299 def _asciiupper(s):
300 '''convert a string to uppercase if ASCII
300 '''convert a string to uppercase if ASCII
301
301
302 Raises UnicodeDecodeError if non-ASCII characters are found.'''
302 Raises UnicodeDecodeError if non-ASCII characters are found.'''
303 s.decode('ascii')
303 s.decode('ascii')
304 return s.upper()
304 return s.upper()
305
305
def asciiupper(s):
    # delay importing avoids cyclic dependency around "parsers" in
    # pure Python build (util => i18n => encoding => parsers => util)
    import parsers
    # prefer the C implementation when available, then cache the chosen
    # implementation by rebinding this module-level name so subsequent
    # calls skip the lookup entirely
    fn = getattr(parsers, 'asciiupper', _asciiupper)
    global asciiupper
    asciiupper = fn
    return fn(s)
314
314
def lower(s):
    """Best-effort encoding-aware case-folding of local string s."""
    try:
        # fast path for pure-ASCII input
        return asciilower(s)
    except UnicodeDecodeError:
        pass
    try:
        if isinstance(s, localstr):
            uni = s._utf8.decode("utf-8")
        else:
            uni = s.decode(encoding, encodingmode)

        folded = uni.lower()
        if folded == uni:
            # nothing changed: hand back the original (keeps localstr intact)
            return s
        return folded.encode(encoding)
    except UnicodeError:
        return s.lower() # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
335
335
def upper(s):
    """Best-effort encoding-aware upper-casing of local string s."""
    try:
        # fast path for pure-ASCII input
        return asciiupper(s)
    except UnicodeDecodeError:
        return upperfallback(s)
342
342
def upperfallback(s):
    """Upper-case non-ASCII s by round-tripping through Unicode."""
    try:
        if isinstance(s, localstr):
            uni = s._utf8.decode("utf-8")
        else:
            uni = s.decode(encoding, encodingmode)

        folded = uni.upper()
        if folded == uni:
            # nothing changed: hand back the original (keeps localstr intact)
            return s
        return folded.encode(encoding)
    except UnicodeError:
        return s.upper() # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
358
358
class normcasespecs(object):
    '''what a platform's normcase does to ASCII strings

    This is specified per platform, and should be consistent with what
    normcase on that platform actually does.

    This should be kept in sync with normcase_spec in util.h.'''
    # normcase lowercases ASCII strings
    lower = -1
    # normcase uppercases ASCII strings
    upper = 1
    # the fallback function should always be called
    other = 0
373
373
# lazily-populated map from each byte to its JSON representation
_jsonmap = {}

def jsonescape(s):
    '''returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape('this is a test')
    'this is a test'
    >>> jsonescape('escape characters: \\0 \\x0b \\t \\n \\r \\" \\\\')
    'escape characters: \\\\u0000 \\\\u000b \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
    >>> jsonescape('a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape('utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape('')
    ''
    '''

    if not _jsonmap:
        # control characters become \uXXXX escapes...
        for code in xrange(32):
            _jsonmap[chr(code)] = "\\u%04x" % code
        # ...all other bytes pass through unchanged...
        for code in xrange(32, 256):
            _jsonmap[chr(code)] = chr(code)
        # ...except the characters JSON gives short escapes to
        _jsonmap.update({
            '\t': '\\t',
            '\n': '\\n',
            '\"': '\\"',
            '\\': '\\\\',
            '\b': '\\b',
            '\f': '\\f',
            '\r': '\\r',
        })

    return ''.join(_jsonmap[c] for c in toutf8b(s))
416
416
# high nibble of a lead byte -> byte length of the UTF-8 sequence it
# starts (0 means plain ASCII; lone continuation bytes map to 1 so the
# decode attempt in getutf8char fails on them)
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]

def getutf8char(s, pos):
    '''get the next full utf-8 character in the given string, starting at pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    '''

    # the first nibble tells us how many bytes to attempt decoding
    width = _utf8len[ord(s[pos]) >> 4]
    if not width: # ascii
        return s[pos]

    char = s[pos:pos + width]
    # the attempted decode doubles as validation
    char.decode("utf-8")
    return char
435
435
def toutf8b(s):
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    # \xed is the lead byte of every UTF-8-encoded surrogate; when it is
    # absent we may be able to avoid the byte-by-byte scan below
    if "\xed" not in s:
        if isinstance(s, localstr):
            # the cached UTF-8 form is authoritative
            return s._utf8
        try:
            s.decode('utf-8')
            return s # already valid UTF-8 (or ASCII)
        except UnicodeDecodeError:
            pass

    out = ""
    i = 0
    end = len(s)
    while i < end:
        try:
            ch = getutf8char(s, i)
            if "\xed\xb0\x80" <= ch <= "\xed\xb3\xbf":
                # pre-existing U+DC00..U+DCFF characters must themselves
                # be re-escaped byte by byte
                ch = unichr(0xdc00 + ord(s[i])).encode('utf-8')
                i += 1
            else:
                i += len(ch)
        except UnicodeDecodeError:
            # invalid byte: smuggle it through as U+DC00 + byte value
            ch = unichr(0xdc00 + ord(s[i])).encode('utf-8')
            i += 1
        out += ch
    return out
492
492
def fromutf8b(s):
    '''Given a UTF-8b string, return a local, possibly-binary string.

    Returns the original binary string. This is a round-trip process
    for strings like filenames, but metadata that was passed through
    tolocal will remain in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = "\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip("\\xc2\\xc2\\x80")
    True
    >>> roundtrip("\\xef\\xbf\\xbd")
    True
    >>> roundtrip("\\xef\\xef\\xbf\\xbd")
    True
    '''

    # fast path: \xed leads every encoded U+DCxx escape, so its absence
    # means there is nothing to unescape
    if "\xed" not in s:
        return s

    chunks = []
    for ch in s.decode("utf-8"):
        code = ord(ch)
        if code & 0xffff00 == 0xdc00:
            # surrogate escape: the low byte is the original raw byte
            chunks.append(chr(code & 0xff))
        else:
            chunks.append(ch.encode("utf-8"))
    return "".join(chunks)
General Comments 0
You need to be logged in to leave comments. Login now