##// END OF EJS Templates
encoding: remove workaround for locale.getpreferredencoding()...
Gregory Szorc -
r32276:1a3a08b5 default
parent child Browse files
Show More
@@ -1,620 +1,595 b''
1 # encoding.py - character transcoding support for Mercurial
1 # encoding.py - character transcoding support for Mercurial
2 #
2 #
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8 from __future__ import absolute_import
8 from __future__ import absolute_import
9
9
10 import array
10 import array
11 import locale
11 import locale
12 import os
12 import os
13 import unicodedata
13 import unicodedata
14
14
15 from . import (
15 from . import (
16 error,
16 error,
17 pycompat,
17 pycompat,
18 )
18 )
19
19
20 _sysstr = pycompat.sysstr
20 _sysstr = pycompat.sysstr
21
21
22 if pycompat.ispy3:
22 if pycompat.ispy3:
23 unichr = chr
23 unichr = chr
24
24
25 # These unicode characters are ignored by HFS+ (Apple Technote 1150,
25 # These unicode characters are ignored by HFS+ (Apple Technote 1150,
26 # "Unicode Subtleties"), so we need to ignore them in some places for
26 # "Unicode Subtleties"), so we need to ignore them in some places for
27 # sanity.
27 # sanity.
28 _ignore = [unichr(int(x, 16)).encode("utf-8") for x in
28 _ignore = [unichr(int(x, 16)).encode("utf-8") for x in
29 "200c 200d 200e 200f 202a 202b 202c 202d 202e "
29 "200c 200d 200e 200f 202a 202b 202c 202d 202e "
30 "206a 206b 206c 206d 206e 206f feff".split()]
30 "206a 206b 206c 206d 206e 206f feff".split()]
31 # verify the next function will work
31 # verify the next function will work
32 if pycompat.ispy3:
32 if pycompat.ispy3:
33 assert set(i[0] for i in _ignore) == set([ord(b'\xe2'), ord(b'\xef')])
33 assert set(i[0] for i in _ignore) == set([ord(b'\xe2'), ord(b'\xef')])
34 else:
34 else:
35 assert set(i[0] for i in _ignore) == set(["\xe2", "\xef"])
35 assert set(i[0] for i in _ignore) == set(["\xe2", "\xef"])
36
36
37 def hfsignoreclean(s):
37 def hfsignoreclean(s):
38 """Remove codepoints ignored by HFS+ from s.
38 """Remove codepoints ignored by HFS+ from s.
39
39
40 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
40 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
41 '.hg'
41 '.hg'
42 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
42 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
43 '.hg'
43 '.hg'
44 """
44 """
45 if "\xe2" in s or "\xef" in s:
45 if "\xe2" in s or "\xef" in s:
46 for c in _ignore:
46 for c in _ignore:
47 s = s.replace(c, '')
47 s = s.replace(c, '')
48 return s
48 return s
49
49
50 # encoding.environ is provided read-only, which may not be used to modify
50 # encoding.environ is provided read-only, which may not be used to modify
51 # the process environment
51 # the process environment
52 _nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ)
52 _nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ)
53 if not pycompat.ispy3:
53 if not pycompat.ispy3:
54 environ = os.environ # re-exports
54 environ = os.environ # re-exports
55 elif _nativeenviron:
55 elif _nativeenviron:
56 environ = os.environb # re-exports
56 environ = os.environb # re-exports
57 else:
57 else:
58 # preferred encoding isn't known yet; use utf-8 to avoid unicode error
58 # preferred encoding isn't known yet; use utf-8 to avoid unicode error
59 # and recreate it once encoding is settled
59 # and recreate it once encoding is settled
60 environ = dict((k.encode(u'utf-8'), v.encode(u'utf-8'))
60 environ = dict((k.encode(u'utf-8'), v.encode(u'utf-8'))
61 for k, v in os.environ.items()) # re-exports
61 for k, v in os.environ.items()) # re-exports
62
62
63 def _getpreferredencoding():
64 '''
65 On darwin, getpreferredencoding ignores the locale environment and
66 always returns mac-roman. http://bugs.python.org/issue6202 fixes this
67 for Python 2.7 and up. This is the same corrected code for earlier
68 Python versions.
69
70 However, we can't use a version check for this method, as some distributions
71 patch Python to fix this. Instead, we use it as a 'fixer' for the mac-roman
72 encoding, as it is unlikely that this encoding is the actually expected.
73 '''
74 try:
75 locale.CODESET
76 except AttributeError:
77 # Fall back to parsing environment variables :-(
78 return locale.getdefaultlocale()[1]
79
80 oldloc = locale.setlocale(locale.LC_CTYPE)
81 locale.setlocale(locale.LC_CTYPE, "")
82 result = locale.nl_langinfo(locale.CODESET)
83 locale.setlocale(locale.LC_CTYPE, oldloc)
84
85 return result
86
87 _encodingfixers = {
63 _encodingfixers = {
88 '646': lambda: 'ascii',
64 '646': lambda: 'ascii',
89 'ANSI_X3.4-1968': lambda: 'ascii',
65 'ANSI_X3.4-1968': lambda: 'ascii',
90 'mac-roman': _getpreferredencoding
91 }
66 }
92
67
93 try:
68 try:
94 encoding = environ.get("HGENCODING")
69 encoding = environ.get("HGENCODING")
95 if not encoding:
70 if not encoding:
96 encoding = locale.getpreferredencoding().encode('ascii') or 'ascii'
71 encoding = locale.getpreferredencoding().encode('ascii') or 'ascii'
97 encoding = _encodingfixers.get(encoding, lambda: encoding)()
72 encoding = _encodingfixers.get(encoding, lambda: encoding)()
98 except locale.Error:
73 except locale.Error:
99 encoding = 'ascii'
74 encoding = 'ascii'
100 encodingmode = environ.get("HGENCODINGMODE", "strict")
75 encodingmode = environ.get("HGENCODINGMODE", "strict")
101 fallbackencoding = 'ISO-8859-1'
76 fallbackencoding = 'ISO-8859-1'
102
77
103 class localstr(str):
78 class localstr(str):
104 '''This class allows strings that are unmodified to be
79 '''This class allows strings that are unmodified to be
105 round-tripped to the local encoding and back'''
80 round-tripped to the local encoding and back'''
106 def __new__(cls, u, l):
81 def __new__(cls, u, l):
107 s = str.__new__(cls, l)
82 s = str.__new__(cls, l)
108 s._utf8 = u
83 s._utf8 = u
109 return s
84 return s
110 def __hash__(self):
85 def __hash__(self):
111 return hash(self._utf8) # avoid collisions in local string space
86 return hash(self._utf8) # avoid collisions in local string space
112
87
113 def tolocal(s):
88 def tolocal(s):
114 """
89 """
115 Convert a string from internal UTF-8 to local encoding
90 Convert a string from internal UTF-8 to local encoding
116
91
117 All internal strings should be UTF-8 but some repos before the
92 All internal strings should be UTF-8 but some repos before the
118 implementation of locale support may contain latin1 or possibly
93 implementation of locale support may contain latin1 or possibly
119 other character sets. We attempt to decode everything strictly
94 other character sets. We attempt to decode everything strictly
120 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
95 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
121 replace unknown characters.
96 replace unknown characters.
122
97
123 The localstr class is used to cache the known UTF-8 encoding of
98 The localstr class is used to cache the known UTF-8 encoding of
124 strings next to their local representation to allow lossless
99 strings next to their local representation to allow lossless
125 round-trip conversion back to UTF-8.
100 round-trip conversion back to UTF-8.
126
101
127 >>> u = 'foo: \\xc3\\xa4' # utf-8
102 >>> u = 'foo: \\xc3\\xa4' # utf-8
128 >>> l = tolocal(u)
103 >>> l = tolocal(u)
129 >>> l
104 >>> l
130 'foo: ?'
105 'foo: ?'
131 >>> fromlocal(l)
106 >>> fromlocal(l)
132 'foo: \\xc3\\xa4'
107 'foo: \\xc3\\xa4'
133 >>> u2 = 'foo: \\xc3\\xa1'
108 >>> u2 = 'foo: \\xc3\\xa1'
134 >>> d = { l: 1, tolocal(u2): 2 }
109 >>> d = { l: 1, tolocal(u2): 2 }
135 >>> len(d) # no collision
110 >>> len(d) # no collision
136 2
111 2
137 >>> 'foo: ?' in d
112 >>> 'foo: ?' in d
138 False
113 False
139 >>> l1 = 'foo: \\xe4' # historical latin1 fallback
114 >>> l1 = 'foo: \\xe4' # historical latin1 fallback
140 >>> l = tolocal(l1)
115 >>> l = tolocal(l1)
141 >>> l
116 >>> l
142 'foo: ?'
117 'foo: ?'
143 >>> fromlocal(l) # magically in utf-8
118 >>> fromlocal(l) # magically in utf-8
144 'foo: \\xc3\\xa4'
119 'foo: \\xc3\\xa4'
145 """
120 """
146
121
147 try:
122 try:
148 try:
123 try:
149 # make sure string is actually stored in UTF-8
124 # make sure string is actually stored in UTF-8
150 u = s.decode('UTF-8')
125 u = s.decode('UTF-8')
151 if encoding == 'UTF-8':
126 if encoding == 'UTF-8':
152 # fast path
127 # fast path
153 return s
128 return s
154 r = u.encode(_sysstr(encoding), u"replace")
129 r = u.encode(_sysstr(encoding), u"replace")
155 if u == r.decode(_sysstr(encoding)):
130 if u == r.decode(_sysstr(encoding)):
156 # r is a safe, non-lossy encoding of s
131 # r is a safe, non-lossy encoding of s
157 return r
132 return r
158 return localstr(s, r)
133 return localstr(s, r)
159 except UnicodeDecodeError:
134 except UnicodeDecodeError:
160 # we should only get here if we're looking at an ancient changeset
135 # we should only get here if we're looking at an ancient changeset
161 try:
136 try:
162 u = s.decode(_sysstr(fallbackencoding))
137 u = s.decode(_sysstr(fallbackencoding))
163 r = u.encode(_sysstr(encoding), u"replace")
138 r = u.encode(_sysstr(encoding), u"replace")
164 if u == r.decode(_sysstr(encoding)):
139 if u == r.decode(_sysstr(encoding)):
165 # r is a safe, non-lossy encoding of s
140 # r is a safe, non-lossy encoding of s
166 return r
141 return r
167 return localstr(u.encode('UTF-8'), r)
142 return localstr(u.encode('UTF-8'), r)
168 except UnicodeDecodeError:
143 except UnicodeDecodeError:
169 u = s.decode("utf-8", "replace") # last ditch
144 u = s.decode("utf-8", "replace") # last ditch
170 # can't round-trip
145 # can't round-trip
171 return u.encode(_sysstr(encoding), u"replace")
146 return u.encode(_sysstr(encoding), u"replace")
172 except LookupError as k:
147 except LookupError as k:
173 raise error.Abort(k, hint="please check your locale settings")
148 raise error.Abort(k, hint="please check your locale settings")
174
149
175 def fromlocal(s):
150 def fromlocal(s):
176 """
151 """
177 Convert a string from the local character encoding to UTF-8
152 Convert a string from the local character encoding to UTF-8
178
153
179 We attempt to decode strings using the encoding mode set by
154 We attempt to decode strings using the encoding mode set by
180 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
155 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
181 characters will cause an error message. Other modes include
156 characters will cause an error message. Other modes include
182 'replace', which replaces unknown characters with a special
157 'replace', which replaces unknown characters with a special
183 Unicode character, and 'ignore', which drops the character.
158 Unicode character, and 'ignore', which drops the character.
184 """
159 """
185
160
186 # can we do a lossless round-trip?
161 # can we do a lossless round-trip?
187 if isinstance(s, localstr):
162 if isinstance(s, localstr):
188 return s._utf8
163 return s._utf8
189
164
190 try:
165 try:
191 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
166 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
192 return u.encode("utf-8")
167 return u.encode("utf-8")
193 except UnicodeDecodeError as inst:
168 except UnicodeDecodeError as inst:
194 sub = s[max(0, inst.start - 10):inst.start + 10]
169 sub = s[max(0, inst.start - 10):inst.start + 10]
195 raise error.Abort("decoding near '%s': %s!" % (sub, inst))
170 raise error.Abort("decoding near '%s': %s!" % (sub, inst))
196 except LookupError as k:
171 except LookupError as k:
197 raise error.Abort(k, hint="please check your locale settings")
172 raise error.Abort(k, hint="please check your locale settings")
198
173
199 def unitolocal(u):
174 def unitolocal(u):
200 """Convert a unicode string to a byte string of local encoding"""
175 """Convert a unicode string to a byte string of local encoding"""
201 return tolocal(u.encode('utf-8'))
176 return tolocal(u.encode('utf-8'))
202
177
203 def unifromlocal(s):
178 def unifromlocal(s):
204 """Convert a byte string of local encoding to a unicode string"""
179 """Convert a byte string of local encoding to a unicode string"""
205 return fromlocal(s).decode('utf-8')
180 return fromlocal(s).decode('utf-8')
206
181
207 # converter functions between native str and byte string. use these if the
182 # converter functions between native str and byte string. use these if the
208 # character encoding is not aware (e.g. exception message) or is known to
183 # character encoding is not aware (e.g. exception message) or is known to
209 # be locale dependent (e.g. date formatting.)
184 # be locale dependent (e.g. date formatting.)
210 if pycompat.ispy3:
185 if pycompat.ispy3:
211 strtolocal = unitolocal
186 strtolocal = unitolocal
212 strfromlocal = unifromlocal
187 strfromlocal = unifromlocal
213 else:
188 else:
214 strtolocal = pycompat.identity
189 strtolocal = pycompat.identity
215 strfromlocal = pycompat.identity
190 strfromlocal = pycompat.identity
216
191
217 if not _nativeenviron:
192 if not _nativeenviron:
218 # now encoding and helper functions are available, recreate the environ
193 # now encoding and helper functions are available, recreate the environ
219 # dict to be exported to other modules
194 # dict to be exported to other modules
220 environ = dict((tolocal(k.encode(u'utf-8')), tolocal(v.encode(u'utf-8')))
195 environ = dict((tolocal(k.encode(u'utf-8')), tolocal(v.encode(u'utf-8')))
221 for k, v in os.environ.items()) # re-exports
196 for k, v in os.environ.items()) # re-exports
222
197
223 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
198 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
224 wide = (environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
199 wide = (environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
225 and "WFA" or "WF")
200 and "WFA" or "WF")
226
201
227 def colwidth(s):
202 def colwidth(s):
228 "Find the column width of a string for display in the local encoding"
203 "Find the column width of a string for display in the local encoding"
229 return ucolwidth(s.decode(_sysstr(encoding), u'replace'))
204 return ucolwidth(s.decode(_sysstr(encoding), u'replace'))
230
205
231 def ucolwidth(d):
206 def ucolwidth(d):
232 "Find the column width of a Unicode string for display"
207 "Find the column width of a Unicode string for display"
233 eaw = getattr(unicodedata, 'east_asian_width', None)
208 eaw = getattr(unicodedata, 'east_asian_width', None)
234 if eaw is not None:
209 if eaw is not None:
235 return sum([eaw(c) in wide and 2 or 1 for c in d])
210 return sum([eaw(c) in wide and 2 or 1 for c in d])
236 return len(d)
211 return len(d)
237
212
238 def getcols(s, start, c):
213 def getcols(s, start, c):
239 '''Use colwidth to find a c-column substring of s starting at byte
214 '''Use colwidth to find a c-column substring of s starting at byte
240 index start'''
215 index start'''
241 for x in xrange(start + c, len(s)):
216 for x in xrange(start + c, len(s)):
242 t = s[start:x]
217 t = s[start:x]
243 if colwidth(t) == c:
218 if colwidth(t) == c:
244 return t
219 return t
245
220
246 def trim(s, width, ellipsis='', leftside=False):
221 def trim(s, width, ellipsis='', leftside=False):
247 """Trim string 's' to at most 'width' columns (including 'ellipsis').
222 """Trim string 's' to at most 'width' columns (including 'ellipsis').
248
223
249 If 'leftside' is True, left side of string 's' is trimmed.
224 If 'leftside' is True, left side of string 's' is trimmed.
250 'ellipsis' is always placed at trimmed side.
225 'ellipsis' is always placed at trimmed side.
251
226
252 >>> ellipsis = '+++'
227 >>> ellipsis = '+++'
253 >>> from . import encoding
228 >>> from . import encoding
254 >>> encoding.encoding = 'utf-8'
229 >>> encoding.encoding = 'utf-8'
255 >>> t= '1234567890'
230 >>> t= '1234567890'
256 >>> print trim(t, 12, ellipsis=ellipsis)
231 >>> print trim(t, 12, ellipsis=ellipsis)
257 1234567890
232 1234567890
258 >>> print trim(t, 10, ellipsis=ellipsis)
233 >>> print trim(t, 10, ellipsis=ellipsis)
259 1234567890
234 1234567890
260 >>> print trim(t, 8, ellipsis=ellipsis)
235 >>> print trim(t, 8, ellipsis=ellipsis)
261 12345+++
236 12345+++
262 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
237 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
263 +++67890
238 +++67890
264 >>> print trim(t, 8)
239 >>> print trim(t, 8)
265 12345678
240 12345678
266 >>> print trim(t, 8, leftside=True)
241 >>> print trim(t, 8, leftside=True)
267 34567890
242 34567890
268 >>> print trim(t, 3, ellipsis=ellipsis)
243 >>> print trim(t, 3, ellipsis=ellipsis)
269 +++
244 +++
270 >>> print trim(t, 1, ellipsis=ellipsis)
245 >>> print trim(t, 1, ellipsis=ellipsis)
271 +
246 +
272 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
247 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
273 >>> t = u.encode(encoding.encoding)
248 >>> t = u.encode(encoding.encoding)
274 >>> print trim(t, 12, ellipsis=ellipsis)
249 >>> print trim(t, 12, ellipsis=ellipsis)
275 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
250 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
276 >>> print trim(t, 10, ellipsis=ellipsis)
251 >>> print trim(t, 10, ellipsis=ellipsis)
277 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
252 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
278 >>> print trim(t, 8, ellipsis=ellipsis)
253 >>> print trim(t, 8, ellipsis=ellipsis)
279 \xe3\x81\x82\xe3\x81\x84+++
254 \xe3\x81\x82\xe3\x81\x84+++
280 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
255 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
281 +++\xe3\x81\x88\xe3\x81\x8a
256 +++\xe3\x81\x88\xe3\x81\x8a
282 >>> print trim(t, 5)
257 >>> print trim(t, 5)
283 \xe3\x81\x82\xe3\x81\x84
258 \xe3\x81\x82\xe3\x81\x84
284 >>> print trim(t, 5, leftside=True)
259 >>> print trim(t, 5, leftside=True)
285 \xe3\x81\x88\xe3\x81\x8a
260 \xe3\x81\x88\xe3\x81\x8a
286 >>> print trim(t, 4, ellipsis=ellipsis)
261 >>> print trim(t, 4, ellipsis=ellipsis)
287 +++
262 +++
288 >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
263 >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
289 +++
264 +++
290 >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
265 >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
291 >>> print trim(t, 12, ellipsis=ellipsis)
266 >>> print trim(t, 12, ellipsis=ellipsis)
292 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
267 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
293 >>> print trim(t, 10, ellipsis=ellipsis)
268 >>> print trim(t, 10, ellipsis=ellipsis)
294 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
269 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
295 >>> print trim(t, 8, ellipsis=ellipsis)
270 >>> print trim(t, 8, ellipsis=ellipsis)
296 \x11\x22\x33\x44\x55+++
271 \x11\x22\x33\x44\x55+++
297 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
272 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
298 +++\x66\x77\x88\x99\xaa
273 +++\x66\x77\x88\x99\xaa
299 >>> print trim(t, 8)
274 >>> print trim(t, 8)
300 \x11\x22\x33\x44\x55\x66\x77\x88
275 \x11\x22\x33\x44\x55\x66\x77\x88
301 >>> print trim(t, 8, leftside=True)
276 >>> print trim(t, 8, leftside=True)
302 \x33\x44\x55\x66\x77\x88\x99\xaa
277 \x33\x44\x55\x66\x77\x88\x99\xaa
303 >>> print trim(t, 3, ellipsis=ellipsis)
278 >>> print trim(t, 3, ellipsis=ellipsis)
304 +++
279 +++
305 >>> print trim(t, 1, ellipsis=ellipsis)
280 >>> print trim(t, 1, ellipsis=ellipsis)
306 +
281 +
307 """
282 """
308 try:
283 try:
309 u = s.decode(_sysstr(encoding))
284 u = s.decode(_sysstr(encoding))
310 except UnicodeDecodeError:
285 except UnicodeDecodeError:
311 if len(s) <= width: # trimming is not needed
286 if len(s) <= width: # trimming is not needed
312 return s
287 return s
313 width -= len(ellipsis)
288 width -= len(ellipsis)
314 if width <= 0: # no enough room even for ellipsis
289 if width <= 0: # no enough room even for ellipsis
315 return ellipsis[:width + len(ellipsis)]
290 return ellipsis[:width + len(ellipsis)]
316 if leftside:
291 if leftside:
317 return ellipsis + s[-width:]
292 return ellipsis + s[-width:]
318 return s[:width] + ellipsis
293 return s[:width] + ellipsis
319
294
320 if ucolwidth(u) <= width: # trimming is not needed
295 if ucolwidth(u) <= width: # trimming is not needed
321 return s
296 return s
322
297
323 width -= len(ellipsis)
298 width -= len(ellipsis)
324 if width <= 0: # no enough room even for ellipsis
299 if width <= 0: # no enough room even for ellipsis
325 return ellipsis[:width + len(ellipsis)]
300 return ellipsis[:width + len(ellipsis)]
326
301
327 if leftside:
302 if leftside:
328 uslice = lambda i: u[i:]
303 uslice = lambda i: u[i:]
329 concat = lambda s: ellipsis + s
304 concat = lambda s: ellipsis + s
330 else:
305 else:
331 uslice = lambda i: u[:-i]
306 uslice = lambda i: u[:-i]
332 concat = lambda s: s + ellipsis
307 concat = lambda s: s + ellipsis
333 for i in xrange(1, len(u)):
308 for i in xrange(1, len(u)):
334 usub = uslice(i)
309 usub = uslice(i)
335 if ucolwidth(usub) <= width:
310 if ucolwidth(usub) <= width:
336 return concat(usub.encode(_sysstr(encoding)))
311 return concat(usub.encode(_sysstr(encoding)))
337 return ellipsis # no enough room for multi-column characters
312 return ellipsis # no enough room for multi-column characters
338
313
339 def _asciilower(s):
314 def _asciilower(s):
340 '''convert a string to lowercase if ASCII
315 '''convert a string to lowercase if ASCII
341
316
342 Raises UnicodeDecodeError if non-ASCII characters are found.'''
317 Raises UnicodeDecodeError if non-ASCII characters are found.'''
343 s.decode('ascii')
318 s.decode('ascii')
344 return s.lower()
319 return s.lower()
345
320
346 def asciilower(s):
321 def asciilower(s):
347 # delay importing avoids cyclic dependency around "parsers" in
322 # delay importing avoids cyclic dependency around "parsers" in
348 # pure Python build (util => i18n => encoding => parsers => util)
323 # pure Python build (util => i18n => encoding => parsers => util)
349 from . import parsers
324 from . import parsers
350 impl = getattr(parsers, 'asciilower', _asciilower)
325 impl = getattr(parsers, 'asciilower', _asciilower)
351 global asciilower
326 global asciilower
352 asciilower = impl
327 asciilower = impl
353 return impl(s)
328 return impl(s)
354
329
355 def _asciiupper(s):
330 def _asciiupper(s):
356 '''convert a string to uppercase if ASCII
331 '''convert a string to uppercase if ASCII
357
332
358 Raises UnicodeDecodeError if non-ASCII characters are found.'''
333 Raises UnicodeDecodeError if non-ASCII characters are found.'''
359 s.decode('ascii')
334 s.decode('ascii')
360 return s.upper()
335 return s.upper()
361
336
362 def asciiupper(s):
337 def asciiupper(s):
363 # delay importing avoids cyclic dependency around "parsers" in
338 # delay importing avoids cyclic dependency around "parsers" in
364 # pure Python build (util => i18n => encoding => parsers => util)
339 # pure Python build (util => i18n => encoding => parsers => util)
365 from . import parsers
340 from . import parsers
366 impl = getattr(parsers, 'asciiupper', _asciiupper)
341 impl = getattr(parsers, 'asciiupper', _asciiupper)
367 global asciiupper
342 global asciiupper
368 asciiupper = impl
343 asciiupper = impl
369 return impl(s)
344 return impl(s)
370
345
371 def lower(s):
346 def lower(s):
372 "best-effort encoding-aware case-folding of local string s"
347 "best-effort encoding-aware case-folding of local string s"
373 try:
348 try:
374 return asciilower(s)
349 return asciilower(s)
375 except UnicodeDecodeError:
350 except UnicodeDecodeError:
376 pass
351 pass
377 try:
352 try:
378 if isinstance(s, localstr):
353 if isinstance(s, localstr):
379 u = s._utf8.decode("utf-8")
354 u = s._utf8.decode("utf-8")
380 else:
355 else:
381 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
356 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
382
357
383 lu = u.lower()
358 lu = u.lower()
384 if u == lu:
359 if u == lu:
385 return s # preserve localstring
360 return s # preserve localstring
386 return lu.encode(_sysstr(encoding))
361 return lu.encode(_sysstr(encoding))
387 except UnicodeError:
362 except UnicodeError:
388 return s.lower() # we don't know how to fold this except in ASCII
363 return s.lower() # we don't know how to fold this except in ASCII
389 except LookupError as k:
364 except LookupError as k:
390 raise error.Abort(k, hint="please check your locale settings")
365 raise error.Abort(k, hint="please check your locale settings")
391
366
392 def upper(s):
367 def upper(s):
393 "best-effort encoding-aware case-folding of local string s"
368 "best-effort encoding-aware case-folding of local string s"
394 try:
369 try:
395 return asciiupper(s)
370 return asciiupper(s)
396 except UnicodeDecodeError:
371 except UnicodeDecodeError:
397 return upperfallback(s)
372 return upperfallback(s)
398
373
399 def upperfallback(s):
374 def upperfallback(s):
400 try:
375 try:
401 if isinstance(s, localstr):
376 if isinstance(s, localstr):
402 u = s._utf8.decode("utf-8")
377 u = s._utf8.decode("utf-8")
403 else:
378 else:
404 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
379 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
405
380
406 uu = u.upper()
381 uu = u.upper()
407 if u == uu:
382 if u == uu:
408 return s # preserve localstring
383 return s # preserve localstring
409 return uu.encode(_sysstr(encoding))
384 return uu.encode(_sysstr(encoding))
410 except UnicodeError:
385 except UnicodeError:
411 return s.upper() # we don't know how to fold this except in ASCII
386 return s.upper() # we don't know how to fold this except in ASCII
412 except LookupError as k:
387 except LookupError as k:
413 raise error.Abort(k, hint="please check your locale settings")
388 raise error.Abort(k, hint="please check your locale settings")
414
389
415 class normcasespecs(object):
390 class normcasespecs(object):
416 '''what a platform's normcase does to ASCII strings
391 '''what a platform's normcase does to ASCII strings
417
392
418 This is specified per platform, and should be consistent with what normcase
393 This is specified per platform, and should be consistent with what normcase
419 on that platform actually does.
394 on that platform actually does.
420
395
421 lower: normcase lowercases ASCII strings
396 lower: normcase lowercases ASCII strings
422 upper: normcase uppercases ASCII strings
397 upper: normcase uppercases ASCII strings
423 other: the fallback function should always be called
398 other: the fallback function should always be called
424
399
425 This should be kept in sync with normcase_spec in util.h.'''
400 This should be kept in sync with normcase_spec in util.h.'''
426 lower = -1
401 lower = -1
427 upper = 1
402 upper = 1
428 other = 0
403 other = 0
429
404
430 _jsonmap = []
405 _jsonmap = []
431 _jsonmap.extend("\\u%04x" % x for x in range(32))
406 _jsonmap.extend("\\u%04x" % x for x in range(32))
432 _jsonmap.extend(pycompat.bytechr(x) for x in range(32, 127))
407 _jsonmap.extend(pycompat.bytechr(x) for x in range(32, 127))
433 _jsonmap.append('\\u007f')
408 _jsonmap.append('\\u007f')
434 _jsonmap[0x09] = '\\t'
409 _jsonmap[0x09] = '\\t'
435 _jsonmap[0x0a] = '\\n'
410 _jsonmap[0x0a] = '\\n'
436 _jsonmap[0x22] = '\\"'
411 _jsonmap[0x22] = '\\"'
437 _jsonmap[0x5c] = '\\\\'
412 _jsonmap[0x5c] = '\\\\'
438 _jsonmap[0x08] = '\\b'
413 _jsonmap[0x08] = '\\b'
439 _jsonmap[0x0c] = '\\f'
414 _jsonmap[0x0c] = '\\f'
440 _jsonmap[0x0d] = '\\r'
415 _jsonmap[0x0d] = '\\r'
441 _paranoidjsonmap = _jsonmap[:]
416 _paranoidjsonmap = _jsonmap[:]
442 _paranoidjsonmap[0x3c] = '\\u003c' # '<' (e.g. escape "</script>")
417 _paranoidjsonmap[0x3c] = '\\u003c' # '<' (e.g. escape "</script>")
443 _paranoidjsonmap[0x3e] = '\\u003e' # '>'
418 _paranoidjsonmap[0x3e] = '\\u003e' # '>'
444 _jsonmap.extend(pycompat.bytechr(x) for x in range(128, 256))
419 _jsonmap.extend(pycompat.bytechr(x) for x in range(128, 256))
445
420
446 def jsonescape(s, paranoid=False):
421 def jsonescape(s, paranoid=False):
447 '''returns a string suitable for JSON
422 '''returns a string suitable for JSON
448
423
449 JSON is problematic for us because it doesn't support non-Unicode
424 JSON is problematic for us because it doesn't support non-Unicode
450 bytes. To deal with this, we take the following approach:
425 bytes. To deal with this, we take the following approach:
451
426
452 - localstr objects are converted back to UTF-8
427 - localstr objects are converted back to UTF-8
453 - valid UTF-8/ASCII strings are passed as-is
428 - valid UTF-8/ASCII strings are passed as-is
454 - other strings are converted to UTF-8b surrogate encoding
429 - other strings are converted to UTF-8b surrogate encoding
455 - apply JSON-specified string escaping
430 - apply JSON-specified string escaping
456
431
457 (escapes are doubled in these tests)
432 (escapes are doubled in these tests)
458
433
459 >>> jsonescape('this is a test')
434 >>> jsonescape('this is a test')
460 'this is a test'
435 'this is a test'
461 >>> jsonescape('escape characters: \\0 \\x0b \\x7f')
436 >>> jsonescape('escape characters: \\0 \\x0b \\x7f')
462 'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
437 'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
463 >>> jsonescape('escape characters: \\t \\n \\r \\" \\\\')
438 >>> jsonescape('escape characters: \\t \\n \\r \\" \\\\')
464 'escape characters: \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
439 'escape characters: \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
465 >>> jsonescape('a weird byte: \\xdd')
440 >>> jsonescape('a weird byte: \\xdd')
466 'a weird byte: \\xed\\xb3\\x9d'
441 'a weird byte: \\xed\\xb3\\x9d'
467 >>> jsonescape('utf-8: caf\\xc3\\xa9')
442 >>> jsonescape('utf-8: caf\\xc3\\xa9')
468 'utf-8: caf\\xc3\\xa9'
443 'utf-8: caf\\xc3\\xa9'
469 >>> jsonescape('')
444 >>> jsonescape('')
470 ''
445 ''
471
446
472 If paranoid, non-ascii and common troublesome characters are also escaped.
447 If paranoid, non-ascii and common troublesome characters are also escaped.
473 This is suitable for web output.
448 This is suitable for web output.
474
449
475 >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
450 >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
476 'escape boundary: ~ \\\\u007f \\\\u0080'
451 'escape boundary: ~ \\\\u007f \\\\u0080'
477 >>> jsonescape('a weird byte: \\xdd', paranoid=True)
452 >>> jsonescape('a weird byte: \\xdd', paranoid=True)
478 'a weird byte: \\\\udcdd'
453 'a weird byte: \\\\udcdd'
479 >>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True)
454 >>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True)
480 'utf-8: caf\\\\u00e9'
455 'utf-8: caf\\\\u00e9'
481 >>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
456 >>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
482 'non-BMP: \\\\ud834\\\\udd1e'
457 'non-BMP: \\\\ud834\\\\udd1e'
483 >>> jsonescape('<foo@example.org>', paranoid=True)
458 >>> jsonescape('<foo@example.org>', paranoid=True)
484 '\\\\u003cfoo@example.org\\\\u003e'
459 '\\\\u003cfoo@example.org\\\\u003e'
485 '''
460 '''
486
461
487 if paranoid:
462 if paranoid:
488 jm = _paranoidjsonmap
463 jm = _paranoidjsonmap
489 else:
464 else:
490 jm = _jsonmap
465 jm = _jsonmap
491
466
492 u8chars = toutf8b(s)
467 u8chars = toutf8b(s)
493 try:
468 try:
494 return ''.join(jm[x] for x in bytearray(u8chars)) # fast path
469 return ''.join(jm[x] for x in bytearray(u8chars)) # fast path
495 except IndexError:
470 except IndexError:
496 pass
471 pass
497 # non-BMP char is represented as UTF-16 surrogate pair
472 # non-BMP char is represented as UTF-16 surrogate pair
498 u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
473 u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
499 u16codes.pop(0) # drop BOM
474 u16codes.pop(0) # drop BOM
500 return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)
475 return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)
501
476
# Byte length of a UTF-8 sequence, indexed by the high nibble of its lead
# byte. A zero entry marks a plain ASCII byte (continuation bytes 0x80-0xbf
# map to 1 and are rejected by the validating decode in getutf8char).
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]

def getutf8char(s, pos):
    '''get the next full utf-8 character in the given string, starting at pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    '''

    # the high nibble of the lead byte tells us how many bytes to take
    width = _utf8len[ord(s[pos]) >> 4]
    if not width:
        # single ASCII byte, nothing to validate
        return s[pos]

    char = s[pos:pos + width]
    # validate the sequence by attempting to decode it
    char.decode("utf-8")
    return char
520
495
def toutf8b(s):
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can have
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    # fast path: with no \xed byte the input cannot contain surrogate
    # escapes, so clean UTF-8 (or a localstr with cached UTF-8) can be
    # returned as-is
    if "\xed" not in s:
        if isinstance(s, localstr):
            return s._utf8
        try:
            s.decode('utf-8')
            return s
        except UnicodeDecodeError:
            pass

    out = ""
    pos = 0
    end = len(s)
    while pos < end:
        try:
            c = getutf8char(s, pos)
        except UnicodeDecodeError:
            # invalid byte: escape it into the U+DCxx surrogate range
            c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
            pos += 1
        else:
            if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
                # a pre-existing encoded U+DCxx character must itself be
                # re-escaped so decoding remains unambiguous
                c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
                pos += 1
            else:
                pos += len(c)
        out += c
    return out
577
552
def fromutf8b(s):
    '''Given a UTF-8b string, return a local, possibly-binary string.

    return the original binary string. This
    is a round-trip process for strings like filenames, but metadata
    that's was passed through tolocal will remain in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = "\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip("\\xc2\\xc2\\x80")
    True
    >>> roundtrip("\\xef\\xbf\\xbd")
    True
    >>> roundtrip("\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip("\\xf1\\x80\\x80\\x80\\x80")
    True
    '''

    # fast path: no \xed byte means no uDCxx escapes, nothing to undo
    if "\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    chunks = []
    pos = 0
    end = len(s)
    while pos < end:
        c = getutf8char(s, pos)
        pos += len(c)
        if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
            # escaped low surrogate U+DCxx: recover the original byte
            c = chr(ord(c.decode("utf-8")) & 0xff)
        chunks.append(c)
    return "".join(chunks)
General Comments 0
You need to be logged in to leave comments. Login now