##// END OF EJS Templates
py3: make sure encoding.encoding is a bytes variable...
Pulkit Goyal -
r30622:ce36fa9b default
parent child Browse files
Show More
@@ -1,602 +1,602
# encoding.py - character transcoding support for Mercurial
#
# Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.
7
7
from __future__ import absolute_import

import array
import locale
import os
import unicodedata

from . import (
    error,
    pycompat,
)

# converts an encoding/error-handler name to the type expected by
# .encode()/.decode() on the running Python
_sysstr = pycompat.sysstr

if pycompat.ispy3:
    # Python 3 has no unichr(); chr() is used in its place
    unichr = chr
24
24
# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
_ignore = [unichr(int(x, 16)).encode("utf-8") for x in
           "200c 200d 200e 200f 202a 202b 202c 202d 202e "
           "206a 206b 206c 206d 206e 206f feff".split()]
# verify the next function will work: every ignored sequence must start
# with one of the two lead bytes that hfsignoreclean() looks for
if pycompat.ispy3:
    assert set(i[0] for i in _ignore) == set([ord(b'\xe2'), ord(b'\xef')])
else:
    assert set(i[0] for i in _ignore) == set(["\xe2", "\xef"])
36
36
def hfsignoreclean(s):
    """Remove codepoints ignored by HFS+ from s.

    Returns s with every sequence listed in _ignore removed; s is returned
    unchanged when it contains neither lead byte.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # cheap pre-check: all entries of _ignore start with \xe2 or \xef, so
    # the replace loop is only worth running when one of them is present
    if "\xe2" in s or "\xef" in s:
        for c in _ignore:
            s = s.replace(c, '')
    return s
49
49
# encoding.environ is provided read-only, which may not be used to modify
# the process environment
_nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ)
if not pycompat.ispy3:
    environ = os.environ
elif _nativeenviron:
    environ = os.environb
else:
    # preferred encoding isn't known yet; use utf-8 to avoid unicode error
    # and recreate it once encoding is settled
    environ = dict((k.encode(u'utf-8'), v.encode(u'utf-8'))
                   for k, v in os.environ.items())
62
62
def _getpreferredencoding():
    '''
    On darwin, getpreferredencoding ignores the locale environment and
    always returns mac-roman. http://bugs.python.org/issue6202 fixes this
    for Python 2.7 and up. This is the same corrected code for earlier
    Python versions.

    However, we can't use a version check for this method, as some distributions
    patch Python to fix this. Instead, we use it as a 'fixer' for the mac-roman
    encoding, as it is unlikely that this encoding is the one actually expected.

    Returns the codeset name reported for the environment's LC_CTYPE. The
    process-wide LC_CTYPE setting is switched temporarily and always put
    back, even when the lookup raises.
    '''
    try:
        locale.CODESET
    except AttributeError:
        # Fall back to parsing environment variables :-(
        return locale.getdefaultlocale()[1]

    # calling setlocale() without a value only queries the current setting
    oldloc = locale.setlocale(locale.LC_CTYPE)
    try:
        # "" selects the locale configured in the environment
        locale.setlocale(locale.LC_CTYPE, "")
        return locale.nl_langinfo(locale.CODESET)
    finally:
        locale.setlocale(locale.LC_CTYPE, oldloc)
86
86
# maps an encoding name as reported by the platform to a callable returning
# the encoding name to use instead
_encodingfixers = {
    '646': lambda: 'ascii',
    'ANSI_X3.4-1968': lambda: 'ascii',
    'mac-roman': _getpreferredencoding
}
92
92
try:
    # an explicit HGENCODING wins over whatever the locale reports
    encoding = environ.get("HGENCODING")
    if not encoding:
        # the locale module reports a native string; encode it so that
        # encoding.encoding is a bytes value on Python 3 as well
        encoding = locale.getpreferredencoding().encode('ascii') or 'ascii'
        encoding = _encodingfixers.get(encoding, lambda: encoding)()
except locale.Error:
    encoding = 'ascii'
encodingmode = environ.get("HGENCODINGMODE", "strict")
fallbackencoding = 'ISO-8859-1'
102
102
class localstr(str):
    '''This class allows strings that are unmodified to be
    round-tripped to the local encoding and back

    The instance's string value is the local representation 'l'; the
    original UTF-8 form 'u' is kept in the _utf8 attribute.'''
    def __new__(cls, u, l):
        # u: UTF-8 form kept for the round trip; l: local-encoding form
        s = str.__new__(cls, l)
        s._utf8 = u
        return s
    def __hash__(self):
        return hash(self._utf8) # avoid collisions in local string space
112
112
def tolocal(s):
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    Raises error.Abort when the configured encoding name is unknown to
    the codec machinery (LookupError).

    >>> u = 'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = 'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> 'foo: ?' in d
    False
    >>> l1 = 'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    try:
        try:
            # make sure string is actually stored in UTF-8
            u = s.decode('UTF-8')
            if encoding == 'UTF-8':
                # fast path
                return s
            r = u.encode(_sysstr(encoding), u"replace")
            if u == r.decode(_sysstr(encoding)):
                # r is a safe, non-lossy encoding of s
                return r
            # lossy: keep the UTF-8 original alongside the local form
            return localstr(s, r)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                u = s.decode(_sysstr(fallbackencoding))
                r = u.encode(_sysstr(encoding), u"replace")
                if u == r.decode(_sysstr(encoding)):
                    # r is a safe, non-lossy encoding of s
                    return r
                return localstr(u.encode('UTF-8'), r)
            except UnicodeDecodeError:
                u = s.decode("utf-8", "replace") # last ditch
                # can't round-trip
                return u.encode(_sysstr(encoding), u"replace")
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
174
174
def fromlocal(s):
    """
    Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.

    Raises error.Abort when s cannot be decoded or when the configured
    encoding name is unknown.
    """

    # can we do a lossless round-trip?
    if isinstance(s, localstr):
        return s._utf8

    try:
        u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
        return u.encode("utf-8")
    except UnicodeDecodeError as inst:
        # show up to 10 bytes on either side of the offending position
        sub = s[max(0, inst.start - 10):inst.start + 10]
        raise error.Abort("decoding near '%s': %s!" % (sub, inst))
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
198
198
if not _nativeenviron:
    # now encoding and helper functions are available, recreate the environ
    # dict to be exported to other modules
    environ = dict((tolocal(k.encode(u'utf-8')), tolocal(v.encode(u'utf-8')))
                   for k, v in os.environ.items())
204
204
# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
# The value is the set of east-asian-width classes counted as two columns
# by ucolwidth(): "W" and "F" always, plus "A" when requested.
wide = ("WFA" if environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
        else "WF")
208
208
def colwidth(s):
    "Find the column width of a string for display in the local encoding"
    # undecodable bytes are replaced rather than raising
    return ucolwidth(s.decode(_sysstr(encoding), u'replace'))
212
212
def ucolwidth(d):
    "Find the column width of a Unicode string for display"
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is not None:
        # characters whose width class is listed in 'wide' take two columns
        return sum(2 if eaw(c) in wide else 1 for c in d)
    # no width data available: count one column per character
    return len(d)
219
219
def getcols(s, start, c):
    '''Use colwidth to find a c-column substring of s starting at byte
    index start

    Returns None implicitly when no candidate slice measures exactly c
    columns.'''
    # a c-column substring needs at least c bytes, so start the search there
    for x in xrange(start + c, len(s)):
        t = s[start:x]
        if colwidth(t) == c:
            return t
227
227
def trim(s, width, ellipsis='', leftside=False):
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    >>> ellipsis = '+++'
    >>> from . import encoding
    >>> encoding.encoding = 'utf-8'
    >>> t= '1234567890'
    >>> print trim(t, 12, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 10, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 8, ellipsis=ellipsis)
    12345+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++67890
    >>> print trim(t, 8)
    12345678
    >>> print trim(t, 8, leftside=True)
    34567890
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(encoding.encoding)
    >>> print trim(t, 12, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 10, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 8, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 5)
    \xe3\x81\x82\xe3\x81\x84
    >>> print trim(t, 5, leftside=True)
    \xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 4, ellipsis=ellipsis)
    +++
    >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
    +++
    >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
    >>> print trim(t, 12, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 10, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 8, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\x66\x77\x88\x99\xaa
    >>> print trim(t, 8)
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> print trim(t, 8, leftside=True)
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    """
    try:
        u = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        # s is not valid in the local encoding: fall back to trimming by
        # byte count instead of by display column
        if len(s) <= width: # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0: # not enough room even for ellipsis
            return ellipsis[:width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width: # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0: # not enough room even for ellipsis
        return ellipsis[:width + len(ellipsis)]

    if leftside:
        uslice = lambda i: u[i:]
        concat = lambda s: ellipsis + s
    else:
        uslice = lambda i: u[:-i]
        concat = lambda s: s + ellipsis
    # drop one more character from the trimmed side each round until the
    # remainder fits in the columns left after the ellipsis
    for i in xrange(1, len(u)):
        usub = uslice(i)
        if ucolwidth(usub) <= width:
            return concat(usub.encode(_sysstr(encoding)))
    return ellipsis # not enough room for multi-column characters
320
320
def _asciilower(s):
    '''convert a string to lowercase if ASCII

    Raises UnicodeDecodeError if non-ASCII characters are found.'''
    # the decode result is discarded; the call only validates that s is ASCII
    s.decode('ascii')
    return s.lower()
327
327
def asciilower(s):
    '''lowercase the ASCII string s using parsers.asciilower when that
    attribute exists, else the pure-Python _asciilower

    The first call rebinds the module-level name to the chosen
    implementation, so later calls skip the lookup.'''
    # delay importing avoids cyclic dependency around "parsers" in
    # pure Python build (util => i18n => encoding => parsers => util)
    from . import parsers
    impl = getattr(parsers, 'asciilower', _asciilower)
    global asciilower
    asciilower = impl
    return impl(s)
336
336
def _asciiupper(s):
    '''convert a string to uppercase if ASCII

    Raises UnicodeDecodeError if non-ASCII characters are found.'''
    # the decode result is discarded; the call only validates that s is ASCII
    s.decode('ascii')
    return s.upper()
343
343
def asciiupper(s):
    '''uppercase the ASCII string s using parsers.asciiupper when that
    attribute exists, else the pure-Python _asciiupper

    The first call rebinds the module-level name to the chosen
    implementation, so later calls skip the lookup.'''
    # delay importing avoids cyclic dependency around "parsers" in
    # pure Python build (util => i18n => encoding => parsers => util)
    from . import parsers
    impl = getattr(parsers, 'asciiupper', _asciiupper)
    global asciiupper
    asciiupper = impl
    return impl(s)
352
352
def lower(s):
    "best-effort encoding-aware case-folding of local string s"
    # pure-ASCII input is handled by the fast path
    try:
        return asciilower(s)
    except UnicodeDecodeError:
        pass
    try:
        if isinstance(s, localstr):
            # use the cached UTF-8 form rather than the local one
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        lu = u.lower()
        if u == lu:
            return s # preserve localstring
        return lu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.lower() # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
373
373
def upper(s):
    "best-effort encoding-aware case-folding of local string s"
    # pure-ASCII input is handled by the fast path; anything else goes
    # through the encoding-aware fallback
    try:
        return asciiupper(s)
    except UnicodeDecodeError:
        return upperfallback(s)
380
380
def upperfallback(s):
    "encoding-aware uppercasing of local string s, used for non-ASCII input"
    try:
        if isinstance(s, localstr):
            # use the cached UTF-8 form rather than the local one
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        uu = u.upper()
        if u == uu:
            return s # preserve localstring
        return uu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.upper() # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
396
396
class normcasespecs(object):
    '''what a platform's normcase does to ASCII strings

    This is specified per platform, and should be consistent with what normcase
    on that platform actually does.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h.'''
    # plain integer constants; the class is only used as a namespace
    lower = -1
    upper = 1
    other = 0
411
411
# Lookup tables indexed by byte value, giving the JSON text for that byte.
_jsonmap = []
# control characters are written as \uXXXX unless overridden below
_jsonmap.extend("\\u%04x" % x for x in range(32))
_jsonmap.extend(chr(x) for x in range(32, 127))
_jsonmap.append('\\u007f')
_jsonmap[0x09] = '\\t'
_jsonmap[0x0a] = '\\n'
_jsonmap[0x22] = '\\"'
_jsonmap[0x5c] = '\\\\'
_jsonmap[0x08] = '\\b'
_jsonmap[0x0c] = '\\f'
_jsonmap[0x0d] = '\\r'
# the paranoid table is copied before the high bytes are added, so it has
# entries for 0-127 only
_paranoidjsonmap = _jsonmap[:]
_paranoidjsonmap[0x3c] = '\\u003c'  # '<' (e.g. escape "</script>")
_paranoidjsonmap[0x3e] = '\\u003e'  # '>'
# bytes 128-255 map to themselves in the non-paranoid table
_jsonmap.extend(chr(x) for x in range(128, 256))
427
427
def jsonescape(s, paranoid=False):
    '''returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape('this is a test')
    'this is a test'
    >>> jsonescape('escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape('escape characters: \\t \\n \\r \\" \\\\')
    'escape characters: \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
    >>> jsonescape('a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape('utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape('')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape('a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape('<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    '''

    if paranoid:
        jm = _paranoidjsonmap
    else:
        jm = _jsonmap

    u8chars = toutf8b(s)
    try:
        # _paranoidjsonmap only covers bytes 0-127, so any non-ASCII byte
        # raises IndexError here and is handled by the slow path below
        return ''.join(jm[x] for x in bytearray(u8chars))  # fast path
    except IndexError:
        pass
    # non-BMP char is represented as UTF-16 surrogate pair
    u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
    u16codes.pop(0)  # drop BOM
    return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)
483
483
# number of bytes to try decoding, indexed by the high nibble of the first
# byte; 0 marks a single-byte (ASCII) character
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]

def getutf8char(s, pos):
    '''get the next full utf-8 character in the given string, starting at pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    '''

    # slicing (not indexing) keeps the lead byte a one-byte string on both
    # Python 2 and Python 3, where indexing bytes yields an int
    first = s[pos:pos + 1]

    # find how many bytes to attempt decoding from first nibble
    l = _utf8len[ord(first) >> 4]
    if not l: # ascii
        return first

    c = s[pos:pos + l]
    # validate with attempted decode
    c.decode("utf-8")
    return c
502
502
def toutf8b(s):
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    # A localstr carries its cached UTF-8 form, so always send that.
    # This test used to sit inside the "\xed" fast path below, which made
    # a localstr whose local-encoding bytes contain "\xed" fall through
    # and get escaped byte by byte instead.
    if isinstance(s, localstr):
        return s._utf8

    # fast path: without "\xed" there is no existing U+DCxx character to
    # re-escape, so valid UTF-8 (ASCII included) passes through as-is
    if "\xed" not in s:
        try:
            s.decode('utf-8')
            return s
        except UnicodeDecodeError:
            pass

    # collect the output in a bytearray: appending is linear, whereas
    # repeated string concatenation is quadratic
    out = bytearray()
    pos = 0
    l = len(s)
    while pos < l:
        try:
            c = getutf8char(s, pos)
        except UnicodeDecodeError:
            c = None # no valid UTF-8 character starts here
        if c is None or "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
            # Escape a single byte b as U+DC00 + b. That covers stray
            # bytes and the first byte of an existing U+DCxx character;
            # its remaining bytes are continuation bytes and get escaped
            # on the following iterations.
            # The three UTF-8 bytes are written out by hand because
            # Python 3's codec refuses to encode a lone surrogate.
            b = ord(s[pos:pos + 1])
            out.extend((0xed, 0xb0 | (b >> 6), 0x80 | (b & 0x3f)))
            pos += 1
        else:
            out += c
            pos += len(c)
    return bytes(out)
559
559
def fromutf8b(s):
    '''Given a UTF-8b string, return a local, possibly-binary string.

    Escaped U+DCxx characters are turned back into the single bytes
    they stand for, which restores the original binary string. This
    is a round-trip process for strings like filenames, but metadata
    that was passed through tolocal will remain in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = "\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip("\\xc2\\xc2\\x80")
    True
    >>> roundtrip("\\xef\\xbf\\xbd")
    True
    >>> roundtrip("\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip("\\xf1\\x80\\x80\\x80\\x80")
    True
    '''

    # fast path - look for uDxxx prefixes in s
    if "\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.
    # (Errors raised by getutf8char for malformed input propagate.)

    # collect the output in a bytearray: appending is linear, whereas
    # repeated string concatenation is quadratic
    out = bytearray()
    pos = 0
    l = len(s)
    while pos < l:
        c = getutf8char(s, pos)
        pos += len(c)
        if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
            # unescape U+DCxx characters: the original byte is the low
            # 8 bits of the code point, i.e. the low 2 payload bits of
            # the second UTF-8 byte followed by the 6 of the third
            _lead, mid, last = bytearray(c)
            out.append(((mid & 0x03) << 6) | (last & 0x3f))
        else:
            out += c
    return bytes(out)
General Comments 0
You need to be logged in to leave comments. Login now