##// END OF EJS Templates
encoding: handle UTF-16 internal limit with fromutf8b (issue5031)...
Matt Mackall -
r27699:c8d3392f default
parent child Browse files
Show More
@@ -1,532 +1,542 b''
1 # encoding.py - character transcoding support for Mercurial
1 # encoding.py - character transcoding support for Mercurial
2 #
2 #
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8 from __future__ import absolute_import
8 from __future__ import absolute_import
9
9
10 import locale
10 import locale
11 import os
11 import os
12 import unicodedata
12 import unicodedata
13
13
14 from . import (
14 from . import (
15 error,
15 error,
16 )
16 )
17
17
# HFS+ silently drops these code points from filenames (Apple Technote
# 1150, "Unicode Subtleties"), so some callers have to strip them as
# well to stay sane.
_ignore = [
    unichr(int(x, 16)).encode("utf-8")
    for x in ("200c 200d 200e 200f 202a 202b 202c 202d 202e "
              "206a 206b 206c 206d 206e 206f feff").split()
]
# hfsignoreclean() only looks for these two lead bytes before doing any
# work; fail at import time if the list above ever outgrows that shortcut
assert set(["\xe2", "\xef"]) == set([i[0] for i in _ignore])
26
26
def hfsignoreclean(s):
    """Return s with every codepoint that HFS+ ignores stripped out.

    s is expected to be a UTF-8 encoded byte string.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # every ignored sequence starts with \xe2 or \xef, so a string with
    # neither byte cannot contain one
    if "\xe2" not in s and "\xef" not in s:
        return s
    for seq in _ignore:
        s = s.replace(seq, '')
    return s
39
39
def _getpreferredencoding():
    '''Return the codeset named by the user's locale settings.

    On darwin, getpreferredencoding ignores the locale environment and
    always returns mac-roman. http://bugs.python.org/issue6202 fixes this
    for Python 2.7 and up. This is the same corrected code for earlier
    Python versions.

    However, we can't use a version check for this method, as some
    distributions patch Python to fix this. Instead, we use it as a 'fixer'
    for the mac-roman encoding, as it is unlikely that this encoding is the
    one actually expected.

    The process-wide LC_CTYPE setting is switched to the user's default
    only for the duration of the query and is restored afterwards, even
    if the query itself raises.
    '''
    try:
        locale.CODESET
    except AttributeError:
        # Fall back to parsing environment variables :-(
        return locale.getdefaultlocale()[1]

    oldloc = locale.setlocale(locale.LC_CTYPE)
    try:
        locale.setlocale(locale.LC_CTYPE, "")
        return locale.nl_langinfo(locale.CODESET)
    finally:
        # never leave the caller with a changed LC_CTYPE
        locale.setlocale(locale.LC_CTYPE, oldloc)
63
63
# Codeset names that must not be used as reported: each one maps to a
# callable returning the encoding to use in its place.
_encodingfixers = {
    '646': lambda: 'ascii',
    'ANSI_X3.4-1968': lambda: 'ascii',
    'mac-roman': _getpreferredencoding
}

# A non-empty HGENCODING wins; otherwise ask the locale and run the
# answer through _encodingfixers. Any locale error means plain ASCII.
try:
    encoding = os.environ.get("HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding() or 'ascii'
        if encoding in _encodingfixers:
            encoding = _encodingfixers[encoding]()
except locale.Error:
    encoding = 'ascii'
# error-handling mode handed to decode() when reading local strings
encodingmode = os.environ.get("HGENCODINGMODE", "strict")
# tried by tolocal() when stored data turns out not to be UTF-8
fallbackencoding = 'ISO-8859-1'
79
79
class localstr(str):
    '''A local-encoding string that remembers the UTF-8 it came from.

    The str value is the local representation l; the UTF-8 source u is
    kept in the _utf8 attribute so that an unmodified string can be
    round-tripped to the local encoding and back.'''
    def __new__(cls, u, l):
        self = str.__new__(cls, l)
        self._utf8 = u
        return self
    def __hash__(self):
        # hash on the UTF-8 form to avoid collisions in local string space
        return hash(self._utf8)
89
89
def tolocal(s):
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    Raises error.Abort if the configured encoding is unknown.

    >>> u = 'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = 'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> 'foo: ?' in d
    False
    >>> l1 = 'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    try:
        try:
            # a strict decode doubles as the check that s really is UTF-8
            ustr = s.decode('UTF-8')
            if encoding == 'UTF-8':
                # fast path: nothing to convert
                return s
            local = ustr.encode(encoding, "replace")
            if ustr != local.decode(encoding):
                # something got replaced; carry the UTF-8 original along
                return localstr(s, local)
            # local is a safe, non-lossy encoding of s
            return local
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient
            # changeset
            try:
                ustr = s.decode(fallbackencoding)
                local = ustr.encode(encoding, "replace")
                if ustr != local.decode(encoding):
                    return localstr(ustr.encode('UTF-8'), local)
                # local is a safe, non-lossy encoding of s
                return local
            except UnicodeDecodeError:
                # last ditch: the result can't be round-tripped
                ustr = s.decode("utf-8", "replace")
                return ustr.encode(encoding, "replace")
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
150
150
def fromlocal(s):
    """
    Convert a string from the local character encoding to UTF-8

    A localstr is answered from its cached UTF-8 form. Anything else is
    decoded using the encoding mode set by HGENCODINGMODE, which defaults
    to 'strict'. In this mode, unknown characters will cause an error
    message. Other modes include 'replace', which replaces unknown
    characters with a special Unicode character, and 'ignore', which
    drops the character.

    Raises error.Abort on undecodable input or an unknown encoding.
    """

    if isinstance(s, localstr):
        # lossless round-trip: the original UTF-8 is still attached
        return s._utf8

    try:
        decoded = s.decode(encoding, encodingmode)
        return decoded.encode("utf-8")
    except UnicodeDecodeError as inst:
        # quote up to ten bytes either side of the offending position
        first = max(0, inst.start - 10)
        sub = s[first:inst.start + 10]
        raise error.Abort("decoding near '%s': %s!" % (sub, inst))
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
173
173
# East Asian Width classes that occupy two columns. Ambiguous-width
# characters ("A") count as wide only when HGENCODINGAMBIGUOUS is set
# to 'wide'.
if os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide":
    wide = "WFA"
else:
    wide = "WF"
177
177
def colwidth(s):
    """Find the column width of a string for display in the local encoding

    Bytes that cannot be decoded are replaced rather than reported.
    """
    decoded = s.decode(encoding, 'replace')
    return ucolwidth(decoded)
181
181
def ucolwidth(d):
    """Find the column width of a Unicode string for display

    Characters whose East Asian Width class is listed in 'wide' count as
    two columns, everything else as one. If unicodedata offers no
    east_asian_width, every character counts as one column.
    """
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        return len(d)
    columns = 0
    for ch in d:
        if eaw(ch) in wide:
            columns += 2
        else:
            columns += 1
    return columns
188
188
def getcols(s, start, c):
    '''Use colwidth to find a c-column substring of s starting at byte
    index start

    Candidate end offsets run from start + c up to, but not including,
    len(s). None is returned when no candidate is exactly c columns wide.
    '''
    for end in xrange(start + c, len(s)):
        piece = s[start:end]
        if colwidth(piece) == c:
            return piece
    return None
196
196
def trim(s, width, ellipsis='', leftside=False):
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    Strings that cannot be decoded in the local encoding are trimmed by
    byte count instead of by display columns.

    >>> ellipsis = '+++'
    >>> from . import encoding
    >>> encoding.encoding = 'utf-8'
    >>> t= '1234567890'
    >>> print trim(t, 12, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 10, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 8, ellipsis=ellipsis)
    12345+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++67890
    >>> print trim(t, 8)
    12345678
    >>> print trim(t, 8, leftside=True)
    34567890
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(encoding.encoding)
    >>> print trim(t, 12, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 10, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 8, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 5)
    \xe3\x81\x82\xe3\x81\x84
    >>> print trim(t, 5, leftside=True)
    \xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 4, ellipsis=ellipsis)
    +++
    >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
    +++
    >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
    >>> print trim(t, 12, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 10, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 8, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\x66\x77\x88\x99\xaa
    >>> print trim(t, 8)
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> print trim(t, 8, leftside=True)
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    """
    # columns left for text once the ellipsis has been placed
    room = width - len(ellipsis)

    try:
        text = s.decode(encoding)
    except UnicodeDecodeError:
        # not valid in the local encoding: fall back to counting bytes
        if len(s) <= width: # trimming is not needed
            return s
        if room <= 0: # not enough room even for the ellipsis
            return ellipsis[:width]
        if leftside:
            return ellipsis + s[-room:]
        return s[:room] + ellipsis

    if ucolwidth(text) <= width: # trimming is not needed
        return s

    if room <= 0: # not enough room even for the ellipsis
        return ellipsis[:width]

    # drop one more character from the trimmed side on every pass until
    # what is left fits
    for dropped in xrange(1, len(text)):
        if leftside:
            kept = text[dropped:]
        else:
            kept = text[:-dropped]
        if ucolwidth(kept) <= room:
            encoded = kept.encode(encoding)
            if leftside:
                return ellipsis + encoded
            return encoded + ellipsis
    return ellipsis # not enough room for multi-column characters
289
289
def _asciilower(s):
    '''Return s lowercased, provided it is pure ASCII.

    Raises UnicodeDecodeError if non-ASCII characters are found.'''
    # decode purely for validation; the result is discarded
    s.decode('ascii')
    lowered = s.lower()
    return lowered
296
296
def asciilower(s):
    '''Lowercase the ASCII string s, binding the real implementation
    on first use.

    The first call picks parsers.asciilower when the parsers module has
    one and _asciilower otherwise, rebinds this module-level name to the
    chosen function, and then delegates to it.'''
    global asciilower
    # delay importing avoids cyclic dependency around "parsers" in
    # pure Python build (util => i18n => encoding => parsers => util)
    from . import parsers
    chosen = getattr(parsers, 'asciilower', _asciilower)
    asciilower = chosen
    return chosen(s)
305
305
def _asciiupper(s):
    '''Return s uppercased, provided it is pure ASCII.

    Raises UnicodeDecodeError if non-ASCII characters are found.'''
    # decode purely for validation; the result is discarded
    s.decode('ascii')
    uppered = s.upper()
    return uppered
312
312
def asciiupper(s):
    '''Uppercase the ASCII string s, binding the real implementation
    on first use.

    The first call picks parsers.asciiupper when the parsers module has
    one and _asciiupper otherwise, rebinds this module-level name to the
    chosen function, and then delegates to it.'''
    global asciiupper
    # delay importing avoids cyclic dependency around "parsers" in
    # pure Python build (util => i18n => encoding => parsers => util)
    from . import parsers
    chosen = getattr(parsers, 'asciiupper', _asciiupper)
    asciiupper = chosen
    return chosen(s)
321
321
def lower(s):
    """best-effort encoding-aware case-folding of local string s

    Pure ASCII input takes the asciilower shortcut. Otherwise s is
    decoded (from its cached UTF-8 when it is a localstr), lowercased as
    Unicode and re-encoded in the local encoding. s itself is returned
    when lowercasing changes nothing.

    Raises error.Abort if the configured encoding is unknown.
    """
    try:
        return asciilower(s)
    except UnicodeDecodeError:
        pass
    try:
        if isinstance(s, localstr):
            ustr = s._utf8.decode("utf-8")
        else:
            ustr = s.decode(encoding, encodingmode)

        folded = ustr.lower()
        if folded != ustr:
            return folded.encode(encoding)
        return s # preserve localstring
    except UnicodeError:
        return s.lower() # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
342
342
def upper(s):
    """best-effort encoding-aware case-folding of local string s

    Pure ASCII input is handled by asciiupper; anything else is passed
    to upperfallback.
    """
    try:
        folded = asciiupper(s)
    except UnicodeDecodeError:
        folded = upperfallback(s)
    return folded
349
349
def upperfallback(s):
    """Uppercase the local string s by way of Unicode.

    s is decoded (from its cached UTF-8 when it is a localstr),
    uppercased and re-encoded in the local encoding. s itself is returned
    when uppercasing changes nothing. If decoding or encoding fails,
    s.upper() is returned instead.

    Raises error.Abort if the configured encoding is unknown.
    """
    try:
        if isinstance(s, localstr):
            ustr = s._utf8.decode("utf-8")
        else:
            ustr = s.decode(encoding, encodingmode)

        folded = ustr.upper()
        if folded != ustr:
            return folded.encode(encoding)
        return s # preserve localstring
    except UnicodeError:
        return s.upper() # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
365
365
class normcasespecs(object):
    '''Constants naming what a platform's normcase does to ASCII strings

    The value is chosen per platform and should be consistent with what
    normcase on that platform actually does.

    This should be kept in sync with normcase_spec in util.h.'''
    # normcase lowercases ASCII strings
    lower = -1
    # normcase uppercases ASCII strings
    upper = 1
    # the fallback function should always be called
    other = 0
380
380
# byte -> escaped replacement; filled in by the first jsonescape() call
_jsonmap = {}

def jsonescape(s):
    '''returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape('this is a test')
    'this is a test'
    >>> jsonescape('escape characters: \\0 \\x0b \\t \\n \\r \\" \\\\')
    'escape characters: \\\\u0000 \\\\u000b \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
    >>> jsonescape('a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape('utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape('')
    ''
    '''

    if not _jsonmap:
        # control characters default to \uXXXX escapes ...
        table = dict((chr(x), "\\u%04x" % x) for x in xrange(32))
        # ... every other byte stands for itself ...
        table.update((chr(x), chr(x)) for x in xrange(32, 256))
        # ... and the characters with a short JSON escape use it
        table.update({
            '\t': '\\t',
            '\n': '\\n',
            '\"': '\\"',
            '\\': '\\\\',
            '\b': '\\b',
            '\f': '\\f',
            '\r': '\\r',
        })
        _jsonmap.update(table)

    escaped = [_jsonmap[c] for c in toutf8b(s)]
    return ''.join(escaped)
423
423
# how many bytes to try decoding, indexed by the high nibble of the
# first byte; 0 means the byte is plain ASCII
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]

def getutf8char(s, pos):
    '''get the next full utf-8 character in the given string, starting at pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    '''

    lead = s[pos]
    # find how many bytes to attempt decoding from first nibble
    size = _utf8len[ord(lead) >> 4]
    if size == 0: # ascii
        return lead

    seq = s[pos:pos + size]
    # validate with attempted decode; the decoded value is not needed
    seq.decode("utf-8")
    return seq
442
442
def toutf8b(s):
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    # without an \xed byte there can be no existing U+DCxx sequence to
    # re-escape, so the cheap checks are enough
    if "\xed" not in s:
        if isinstance(s, localstr):
            return s._utf8
        try:
            s.decode('utf-8')
            return s
        except UnicodeDecodeError:
            pass

    pieces = []
    pos = 0
    end = len(s)
    while pos < end:
        try:
            c = getutf8char(s, pos)
            # existing U+DCxx characters have to be re-escaped as well
            passthrough = not ("\xed\xb0\x80" <= c <= "\xed\xb3\xbf")
        except UnicodeDecodeError:
            passthrough = False
        if passthrough:
            pos += len(c)
        else:
            # escape just this one byte as U+DCxx and rescan from the next
            c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
            pos += 1
        pieces.append(c)
    return "".join(pieces)
499
499
def fromutf8b(s):
    '''Given a UTF-8b string, return a local, possibly-binary string.

    This returns the original binary string. It is a round-trip process
    for strings like filenames, but metadata that was passed through
    tolocal will remain in UTF-8.

    Raises a UnicodeError if s is not well-formed UTF-8b.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = "\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip("\\xc2\\xc2\\x80")
    True
    >>> roundtrip("\\xef\\xbf\\xbd")
    True
    >>> roundtrip("\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip("\\xf1\\x80\\x80\\x80\\x80")
    True
    '''

    # fast path - look for uDxxx prefixes in s
    if "\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    pieces = []
    pos = 0
    end = len(s)
    while pos < end:
        c = getutf8char(s, pos)
        pos += len(c)
        # unescape U+DCxx characters back into the single byte they hide
        if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
            c = chr(ord(c.decode("utf-8")) & 0xff)
        pieces.append(c)
    return "".join(pieces)
General Comments 0
You need to be logged in to leave comments. Login now