##// END OF EJS Templates
doctest: pass encoding name as system string
Yuya Nishihara -
r34137:e9e225f1 default
parent child Browse files
Show More
@@ -1,585 +1,585 b''
1 # encoding.py - character transcoding support for Mercurial
1 # encoding.py - character transcoding support for Mercurial
2 #
2 #
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8 from __future__ import absolute_import
8 from __future__ import absolute_import
9
9
10 import io
10 import io
11 import locale
11 import locale
12 import os
12 import os
13 import unicodedata
13 import unicodedata
14
14
15 from . import (
15 from . import (
16 error,
16 error,
17 policy,
17 policy,
18 pycompat,
18 pycompat,
19 )
19 )
20
20
21 from .pure import (
21 from .pure import (
22 charencode as charencodepure,
22 charencode as charencodepure,
23 )
23 )
24
24
25 charencode = policy.importmod(r'charencode')
25 charencode = policy.importmod(r'charencode')
26
26
27 isasciistr = charencode.isasciistr
27 isasciistr = charencode.isasciistr
28 asciilower = charencode.asciilower
28 asciilower = charencode.asciilower
29 asciiupper = charencode.asciiupper
29 asciiupper = charencode.asciiupper
30 _jsonescapeu8fast = charencode.jsonescapeu8fast
30 _jsonescapeu8fast = charencode.jsonescapeu8fast
31
31
32 _sysstr = pycompat.sysstr
32 _sysstr = pycompat.sysstr
33
33
34 if pycompat.ispy3:
34 if pycompat.ispy3:
35 unichr = chr
35 unichr = chr
36
36
37 # These unicode characters are ignored by HFS+ (Apple Technote 1150,
37 # These unicode characters are ignored by HFS+ (Apple Technote 1150,
38 # "Unicode Subtleties"), so we need to ignore them in some places for
38 # "Unicode Subtleties"), so we need to ignore them in some places for
39 # sanity.
39 # sanity.
40 _ignore = [unichr(int(x, 16)).encode("utf-8") for x in
40 _ignore = [unichr(int(x, 16)).encode("utf-8") for x in
41 "200c 200d 200e 200f 202a 202b 202c 202d 202e "
41 "200c 200d 200e 200f 202a 202b 202c 202d 202e "
42 "206a 206b 206c 206d 206e 206f feff".split()]
42 "206a 206b 206c 206d 206e 206f feff".split()]
43 # verify the next function will work
43 # verify the next function will work
44 assert all(i.startswith(("\xe2", "\xef")) for i in _ignore)
44 assert all(i.startswith(("\xe2", "\xef")) for i in _ignore)
45
45
46 def hfsignoreclean(s):
46 def hfsignoreclean(s):
47 """Remove codepoints ignored by HFS+ from s.
47 """Remove codepoints ignored by HFS+ from s.
48
48
49 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
49 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
50 '.hg'
50 '.hg'
51 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
51 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
52 '.hg'
52 '.hg'
53 """
53 """
54 if "\xe2" in s or "\xef" in s:
54 if "\xe2" in s or "\xef" in s:
55 for c in _ignore:
55 for c in _ignore:
56 s = s.replace(c, '')
56 s = s.replace(c, '')
57 return s
57 return s
58
58
59 # encoding.environ is provided read-only, which may not be used to modify
59 # encoding.environ is provided read-only, which may not be used to modify
60 # the process environment
60 # the process environment
61 _nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ)
61 _nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ)
62 if not pycompat.ispy3:
62 if not pycompat.ispy3:
63 environ = os.environ # re-exports
63 environ = os.environ # re-exports
64 elif _nativeenviron:
64 elif _nativeenviron:
65 environ = os.environb # re-exports
65 environ = os.environb # re-exports
66 else:
66 else:
67 # preferred encoding isn't known yet; use utf-8 to avoid unicode error
67 # preferred encoding isn't known yet; use utf-8 to avoid unicode error
68 # and recreate it once encoding is settled
68 # and recreate it once encoding is settled
69 environ = dict((k.encode(u'utf-8'), v.encode(u'utf-8'))
69 environ = dict((k.encode(u'utf-8'), v.encode(u'utf-8'))
70 for k, v in os.environ.items()) # re-exports
70 for k, v in os.environ.items()) # re-exports
71
71
72 _encodingfixers = {
72 _encodingfixers = {
73 '646': lambda: 'ascii',
73 '646': lambda: 'ascii',
74 'ANSI_X3.4-1968': lambda: 'ascii',
74 'ANSI_X3.4-1968': lambda: 'ascii',
75 }
75 }
76
76
77 try:
77 try:
78 encoding = environ.get("HGENCODING")
78 encoding = environ.get("HGENCODING")
79 if not encoding:
79 if not encoding:
80 encoding = locale.getpreferredencoding().encode('ascii') or 'ascii'
80 encoding = locale.getpreferredencoding().encode('ascii') or 'ascii'
81 encoding = _encodingfixers.get(encoding, lambda: encoding)()
81 encoding = _encodingfixers.get(encoding, lambda: encoding)()
82 except locale.Error:
82 except locale.Error:
83 encoding = 'ascii'
83 encoding = 'ascii'
84 encodingmode = environ.get("HGENCODINGMODE", "strict")
84 encodingmode = environ.get("HGENCODINGMODE", "strict")
85 fallbackencoding = 'ISO-8859-1'
85 fallbackencoding = 'ISO-8859-1'
86
86
87 class localstr(bytes):
87 class localstr(bytes):
88 '''This class allows strings that are unmodified to be
88 '''This class allows strings that are unmodified to be
89 round-tripped to the local encoding and back'''
89 round-tripped to the local encoding and back'''
90 def __new__(cls, u, l):
90 def __new__(cls, u, l):
91 s = bytes.__new__(cls, l)
91 s = bytes.__new__(cls, l)
92 s._utf8 = u
92 s._utf8 = u
93 return s
93 return s
94 def __hash__(self):
94 def __hash__(self):
95 return hash(self._utf8) # avoid collisions in local string space
95 return hash(self._utf8) # avoid collisions in local string space
96
96
97 def tolocal(s):
97 def tolocal(s):
98 """
98 """
99 Convert a string from internal UTF-8 to local encoding
99 Convert a string from internal UTF-8 to local encoding
100
100
101 All internal strings should be UTF-8 but some repos before the
101 All internal strings should be UTF-8 but some repos before the
102 implementation of locale support may contain latin1 or possibly
102 implementation of locale support may contain latin1 or possibly
103 other character sets. We attempt to decode everything strictly
103 other character sets. We attempt to decode everything strictly
104 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
104 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
105 replace unknown characters.
105 replace unknown characters.
106
106
107 The localstr class is used to cache the known UTF-8 encoding of
107 The localstr class is used to cache the known UTF-8 encoding of
108 strings next to their local representation to allow lossless
108 strings next to their local representation to allow lossless
109 round-trip conversion back to UTF-8.
109 round-trip conversion back to UTF-8.
110
110
111 >>> u = b'foo: \\xc3\\xa4' # utf-8
111 >>> u = b'foo: \\xc3\\xa4' # utf-8
112 >>> l = tolocal(u)
112 >>> l = tolocal(u)
113 >>> l
113 >>> l
114 'foo: ?'
114 'foo: ?'
115 >>> fromlocal(l)
115 >>> fromlocal(l)
116 'foo: \\xc3\\xa4'
116 'foo: \\xc3\\xa4'
117 >>> u2 = b'foo: \\xc3\\xa1'
117 >>> u2 = b'foo: \\xc3\\xa1'
118 >>> d = { l: 1, tolocal(u2): 2 }
118 >>> d = { l: 1, tolocal(u2): 2 }
119 >>> len(d) # no collision
119 >>> len(d) # no collision
120 2
120 2
121 >>> b'foo: ?' in d
121 >>> b'foo: ?' in d
122 False
122 False
123 >>> l1 = b'foo: \\xe4' # historical latin1 fallback
123 >>> l1 = b'foo: \\xe4' # historical latin1 fallback
124 >>> l = tolocal(l1)
124 >>> l = tolocal(l1)
125 >>> l
125 >>> l
126 'foo: ?'
126 'foo: ?'
127 >>> fromlocal(l) # magically in utf-8
127 >>> fromlocal(l) # magically in utf-8
128 'foo: \\xc3\\xa4'
128 'foo: \\xc3\\xa4'
129 """
129 """
130
130
131 if isasciistr(s):
131 if isasciistr(s):
132 return s
132 return s
133
133
134 try:
134 try:
135 try:
135 try:
136 # make sure string is actually stored in UTF-8
136 # make sure string is actually stored in UTF-8
137 u = s.decode('UTF-8')
137 u = s.decode('UTF-8')
138 if encoding == 'UTF-8':
138 if encoding == 'UTF-8':
139 # fast path
139 # fast path
140 return s
140 return s
141 r = u.encode(_sysstr(encoding), u"replace")
141 r = u.encode(_sysstr(encoding), u"replace")
142 if u == r.decode(_sysstr(encoding)):
142 if u == r.decode(_sysstr(encoding)):
143 # r is a safe, non-lossy encoding of s
143 # r is a safe, non-lossy encoding of s
144 return r
144 return r
145 return localstr(s, r)
145 return localstr(s, r)
146 except UnicodeDecodeError:
146 except UnicodeDecodeError:
147 # we should only get here if we're looking at an ancient changeset
147 # we should only get here if we're looking at an ancient changeset
148 try:
148 try:
149 u = s.decode(_sysstr(fallbackencoding))
149 u = s.decode(_sysstr(fallbackencoding))
150 r = u.encode(_sysstr(encoding), u"replace")
150 r = u.encode(_sysstr(encoding), u"replace")
151 if u == r.decode(_sysstr(encoding)):
151 if u == r.decode(_sysstr(encoding)):
152 # r is a safe, non-lossy encoding of s
152 # r is a safe, non-lossy encoding of s
153 return r
153 return r
154 return localstr(u.encode('UTF-8'), r)
154 return localstr(u.encode('UTF-8'), r)
155 except UnicodeDecodeError:
155 except UnicodeDecodeError:
156 u = s.decode("utf-8", "replace") # last ditch
156 u = s.decode("utf-8", "replace") # last ditch
157 # can't round-trip
157 # can't round-trip
158 return u.encode(_sysstr(encoding), u"replace")
158 return u.encode(_sysstr(encoding), u"replace")
159 except LookupError as k:
159 except LookupError as k:
160 raise error.Abort(k, hint="please check your locale settings")
160 raise error.Abort(k, hint="please check your locale settings")
161
161
162 def fromlocal(s):
162 def fromlocal(s):
163 """
163 """
164 Convert a string from the local character encoding to UTF-8
164 Convert a string from the local character encoding to UTF-8
165
165
166 We attempt to decode strings using the encoding mode set by
166 We attempt to decode strings using the encoding mode set by
167 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
167 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
168 characters will cause an error message. Other modes include
168 characters will cause an error message. Other modes include
169 'replace', which replaces unknown characters with a special
169 'replace', which replaces unknown characters with a special
170 Unicode character, and 'ignore', which drops the character.
170 Unicode character, and 'ignore', which drops the character.
171 """
171 """
172
172
173 # can we do a lossless round-trip?
173 # can we do a lossless round-trip?
174 if isinstance(s, localstr):
174 if isinstance(s, localstr):
175 return s._utf8
175 return s._utf8
176 if isasciistr(s):
176 if isasciistr(s):
177 return s
177 return s
178
178
179 try:
179 try:
180 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
180 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
181 return u.encode("utf-8")
181 return u.encode("utf-8")
182 except UnicodeDecodeError as inst:
182 except UnicodeDecodeError as inst:
183 sub = s[max(0, inst.start - 10):inst.start + 10]
183 sub = s[max(0, inst.start - 10):inst.start + 10]
184 raise error.Abort("decoding near '%s': %s!" % (sub, inst))
184 raise error.Abort("decoding near '%s': %s!" % (sub, inst))
185 except LookupError as k:
185 except LookupError as k:
186 raise error.Abort(k, hint="please check your locale settings")
186 raise error.Abort(k, hint="please check your locale settings")
187
187
188 def unitolocal(u):
188 def unitolocal(u):
189 """Convert a unicode string to a byte string of local encoding"""
189 """Convert a unicode string to a byte string of local encoding"""
190 return tolocal(u.encode('utf-8'))
190 return tolocal(u.encode('utf-8'))
191
191
192 def unifromlocal(s):
192 def unifromlocal(s):
193 """Convert a byte string of local encoding to a unicode string"""
193 """Convert a byte string of local encoding to a unicode string"""
194 return fromlocal(s).decode('utf-8')
194 return fromlocal(s).decode('utf-8')
195
195
196 def unimethod(bytesfunc):
196 def unimethod(bytesfunc):
197 """Create a proxy method that forwards __unicode__() and __str__() of
197 """Create a proxy method that forwards __unicode__() and __str__() of
198 Python 3 to __bytes__()"""
198 Python 3 to __bytes__()"""
199 def unifunc(obj):
199 def unifunc(obj):
200 return unifromlocal(bytesfunc(obj))
200 return unifromlocal(bytesfunc(obj))
201 return unifunc
201 return unifunc
202
202
203 # converter functions between native str and byte string. use these if the
203 # converter functions between native str and byte string. use these if the
204 # character encoding is not aware (e.g. exception message) or is known to
204 # character encoding is not aware (e.g. exception message) or is known to
205 # be locale dependent (e.g. date formatting.)
205 # be locale dependent (e.g. date formatting.)
206 if pycompat.ispy3:
206 if pycompat.ispy3:
207 strtolocal = unitolocal
207 strtolocal = unitolocal
208 strfromlocal = unifromlocal
208 strfromlocal = unifromlocal
209 strmethod = unimethod
209 strmethod = unimethod
210 else:
210 else:
211 strtolocal = pycompat.identity
211 strtolocal = pycompat.identity
212 strfromlocal = pycompat.identity
212 strfromlocal = pycompat.identity
213 strmethod = pycompat.identity
213 strmethod = pycompat.identity
214
214
215 if not _nativeenviron:
215 if not _nativeenviron:
216 # now encoding and helper functions are available, recreate the environ
216 # now encoding and helper functions are available, recreate the environ
217 # dict to be exported to other modules
217 # dict to be exported to other modules
218 environ = dict((tolocal(k.encode(u'utf-8')), tolocal(v.encode(u'utf-8')))
218 environ = dict((tolocal(k.encode(u'utf-8')), tolocal(v.encode(u'utf-8')))
219 for k, v in os.environ.items()) # re-exports
219 for k, v in os.environ.items()) # re-exports
220
220
221 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
221 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
222 _wide = _sysstr(environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
222 _wide = _sysstr(environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
223 and "WFA" or "WF")
223 and "WFA" or "WF")
224
224
225 def colwidth(s):
225 def colwidth(s):
226 "Find the column width of a string for display in the local encoding"
226 "Find the column width of a string for display in the local encoding"
227 return ucolwidth(s.decode(_sysstr(encoding), u'replace'))
227 return ucolwidth(s.decode(_sysstr(encoding), u'replace'))
228
228
229 def ucolwidth(d):
229 def ucolwidth(d):
230 "Find the column width of a Unicode string for display"
230 "Find the column width of a Unicode string for display"
231 eaw = getattr(unicodedata, 'east_asian_width', None)
231 eaw = getattr(unicodedata, 'east_asian_width', None)
232 if eaw is not None:
232 if eaw is not None:
233 return sum([eaw(c) in _wide and 2 or 1 for c in d])
233 return sum([eaw(c) in _wide and 2 or 1 for c in d])
234 return len(d)
234 return len(d)
235
235
236 def getcols(s, start, c):
236 def getcols(s, start, c):
237 '''Use colwidth to find a c-column substring of s starting at byte
237 '''Use colwidth to find a c-column substring of s starting at byte
238 index start'''
238 index start'''
239 for x in xrange(start + c, len(s)):
239 for x in xrange(start + c, len(s)):
240 t = s[start:x]
240 t = s[start:x]
241 if colwidth(t) == c:
241 if colwidth(t) == c:
242 return t
242 return t
243
243
244 def trim(s, width, ellipsis='', leftside=False):
244 def trim(s, width, ellipsis='', leftside=False):
245 """Trim string 's' to at most 'width' columns (including 'ellipsis').
245 """Trim string 's' to at most 'width' columns (including 'ellipsis').
246
246
247 If 'leftside' is True, left side of string 's' is trimmed.
247 If 'leftside' is True, left side of string 's' is trimmed.
248 'ellipsis' is always placed at trimmed side.
248 'ellipsis' is always placed at trimmed side.
249
249
250 >>> ellipsis = b'+++'
250 >>> ellipsis = b'+++'
251 >>> from . import encoding
251 >>> from . import encoding
252 >>> encoding.encoding = b'utf-8'
252 >>> encoding.encoding = b'utf-8'
253 >>> t = b'1234567890'
253 >>> t = b'1234567890'
254 >>> print trim(t, 12, ellipsis=ellipsis)
254 >>> print trim(t, 12, ellipsis=ellipsis)
255 1234567890
255 1234567890
256 >>> print trim(t, 10, ellipsis=ellipsis)
256 >>> print trim(t, 10, ellipsis=ellipsis)
257 1234567890
257 1234567890
258 >>> print trim(t, 8, ellipsis=ellipsis)
258 >>> print trim(t, 8, ellipsis=ellipsis)
259 12345+++
259 12345+++
260 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
260 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
261 +++67890
261 +++67890
262 >>> print trim(t, 8)
262 >>> print trim(t, 8)
263 12345678
263 12345678
264 >>> print trim(t, 8, leftside=True)
264 >>> print trim(t, 8, leftside=True)
265 34567890
265 34567890
266 >>> print trim(t, 3, ellipsis=ellipsis)
266 >>> print trim(t, 3, ellipsis=ellipsis)
267 +++
267 +++
268 >>> print trim(t, 1, ellipsis=ellipsis)
268 >>> print trim(t, 1, ellipsis=ellipsis)
269 +
269 +
270 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
270 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
271 >>> t = u.encode(encoding.encoding)
271 >>> t = u.encode(pycompat.sysstr(encoding.encoding))
272 >>> print trim(t, 12, ellipsis=ellipsis)
272 >>> print trim(t, 12, ellipsis=ellipsis)
273 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
273 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
274 >>> print trim(t, 10, ellipsis=ellipsis)
274 >>> print trim(t, 10, ellipsis=ellipsis)
275 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
275 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
276 >>> print trim(t, 8, ellipsis=ellipsis)
276 >>> print trim(t, 8, ellipsis=ellipsis)
277 \xe3\x81\x82\xe3\x81\x84+++
277 \xe3\x81\x82\xe3\x81\x84+++
278 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
278 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
279 +++\xe3\x81\x88\xe3\x81\x8a
279 +++\xe3\x81\x88\xe3\x81\x8a
280 >>> print trim(t, 5)
280 >>> print trim(t, 5)
281 \xe3\x81\x82\xe3\x81\x84
281 \xe3\x81\x82\xe3\x81\x84
282 >>> print trim(t, 5, leftside=True)
282 >>> print trim(t, 5, leftside=True)
283 \xe3\x81\x88\xe3\x81\x8a
283 \xe3\x81\x88\xe3\x81\x8a
284 >>> print trim(t, 4, ellipsis=ellipsis)
284 >>> print trim(t, 4, ellipsis=ellipsis)
285 +++
285 +++
286 >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
286 >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
287 +++
287 +++
288 >>> t = b'\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
288 >>> t = b'\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
289 >>> print trim(t, 12, ellipsis=ellipsis)
289 >>> print trim(t, 12, ellipsis=ellipsis)
290 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
290 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
291 >>> print trim(t, 10, ellipsis=ellipsis)
291 >>> print trim(t, 10, ellipsis=ellipsis)
292 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
292 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
293 >>> print trim(t, 8, ellipsis=ellipsis)
293 >>> print trim(t, 8, ellipsis=ellipsis)
294 \x11\x22\x33\x44\x55+++
294 \x11\x22\x33\x44\x55+++
295 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
295 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
296 +++\x66\x77\x88\x99\xaa
296 +++\x66\x77\x88\x99\xaa
297 >>> print trim(t, 8)
297 >>> print trim(t, 8)
298 \x11\x22\x33\x44\x55\x66\x77\x88
298 \x11\x22\x33\x44\x55\x66\x77\x88
299 >>> print trim(t, 8, leftside=True)
299 >>> print trim(t, 8, leftside=True)
300 \x33\x44\x55\x66\x77\x88\x99\xaa
300 \x33\x44\x55\x66\x77\x88\x99\xaa
301 >>> print trim(t, 3, ellipsis=ellipsis)
301 >>> print trim(t, 3, ellipsis=ellipsis)
302 +++
302 +++
303 >>> print trim(t, 1, ellipsis=ellipsis)
303 >>> print trim(t, 1, ellipsis=ellipsis)
304 +
304 +
305 """
305 """
306 try:
306 try:
307 u = s.decode(_sysstr(encoding))
307 u = s.decode(_sysstr(encoding))
308 except UnicodeDecodeError:
308 except UnicodeDecodeError:
309 if len(s) <= width: # trimming is not needed
309 if len(s) <= width: # trimming is not needed
310 return s
310 return s
311 width -= len(ellipsis)
311 width -= len(ellipsis)
312 if width <= 0: # no enough room even for ellipsis
312 if width <= 0: # no enough room even for ellipsis
313 return ellipsis[:width + len(ellipsis)]
313 return ellipsis[:width + len(ellipsis)]
314 if leftside:
314 if leftside:
315 return ellipsis + s[-width:]
315 return ellipsis + s[-width:]
316 return s[:width] + ellipsis
316 return s[:width] + ellipsis
317
317
318 if ucolwidth(u) <= width: # trimming is not needed
318 if ucolwidth(u) <= width: # trimming is not needed
319 return s
319 return s
320
320
321 width -= len(ellipsis)
321 width -= len(ellipsis)
322 if width <= 0: # no enough room even for ellipsis
322 if width <= 0: # no enough room even for ellipsis
323 return ellipsis[:width + len(ellipsis)]
323 return ellipsis[:width + len(ellipsis)]
324
324
325 if leftside:
325 if leftside:
326 uslice = lambda i: u[i:]
326 uslice = lambda i: u[i:]
327 concat = lambda s: ellipsis + s
327 concat = lambda s: ellipsis + s
328 else:
328 else:
329 uslice = lambda i: u[:-i]
329 uslice = lambda i: u[:-i]
330 concat = lambda s: s + ellipsis
330 concat = lambda s: s + ellipsis
331 for i in xrange(1, len(u)):
331 for i in xrange(1, len(u)):
332 usub = uslice(i)
332 usub = uslice(i)
333 if ucolwidth(usub) <= width:
333 if ucolwidth(usub) <= width:
334 return concat(usub.encode(_sysstr(encoding)))
334 return concat(usub.encode(_sysstr(encoding)))
335 return ellipsis # no enough room for multi-column characters
335 return ellipsis # no enough room for multi-column characters
336
336
337 def lower(s):
337 def lower(s):
338 "best-effort encoding-aware case-folding of local string s"
338 "best-effort encoding-aware case-folding of local string s"
339 try:
339 try:
340 return asciilower(s)
340 return asciilower(s)
341 except UnicodeDecodeError:
341 except UnicodeDecodeError:
342 pass
342 pass
343 try:
343 try:
344 if isinstance(s, localstr):
344 if isinstance(s, localstr):
345 u = s._utf8.decode("utf-8")
345 u = s._utf8.decode("utf-8")
346 else:
346 else:
347 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
347 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
348
348
349 lu = u.lower()
349 lu = u.lower()
350 if u == lu:
350 if u == lu:
351 return s # preserve localstring
351 return s # preserve localstring
352 return lu.encode(_sysstr(encoding))
352 return lu.encode(_sysstr(encoding))
353 except UnicodeError:
353 except UnicodeError:
354 return s.lower() # we don't know how to fold this except in ASCII
354 return s.lower() # we don't know how to fold this except in ASCII
355 except LookupError as k:
355 except LookupError as k:
356 raise error.Abort(k, hint="please check your locale settings")
356 raise error.Abort(k, hint="please check your locale settings")
357
357
358 def upper(s):
358 def upper(s):
359 "best-effort encoding-aware case-folding of local string s"
359 "best-effort encoding-aware case-folding of local string s"
360 try:
360 try:
361 return asciiupper(s)
361 return asciiupper(s)
362 except UnicodeDecodeError:
362 except UnicodeDecodeError:
363 return upperfallback(s)
363 return upperfallback(s)
364
364
365 def upperfallback(s):
365 def upperfallback(s):
366 try:
366 try:
367 if isinstance(s, localstr):
367 if isinstance(s, localstr):
368 u = s._utf8.decode("utf-8")
368 u = s._utf8.decode("utf-8")
369 else:
369 else:
370 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
370 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
371
371
372 uu = u.upper()
372 uu = u.upper()
373 if u == uu:
373 if u == uu:
374 return s # preserve localstring
374 return s # preserve localstring
375 return uu.encode(_sysstr(encoding))
375 return uu.encode(_sysstr(encoding))
376 except UnicodeError:
376 except UnicodeError:
377 return s.upper() # we don't know how to fold this except in ASCII
377 return s.upper() # we don't know how to fold this except in ASCII
378 except LookupError as k:
378 except LookupError as k:
379 raise error.Abort(k, hint="please check your locale settings")
379 raise error.Abort(k, hint="please check your locale settings")
380
380
381 class normcasespecs(object):
381 class normcasespecs(object):
382 '''what a platform's normcase does to ASCII strings
382 '''what a platform's normcase does to ASCII strings
383
383
384 This is specified per platform, and should be consistent with what normcase
384 This is specified per platform, and should be consistent with what normcase
385 on that platform actually does.
385 on that platform actually does.
386
386
387 lower: normcase lowercases ASCII strings
387 lower: normcase lowercases ASCII strings
388 upper: normcase uppercases ASCII strings
388 upper: normcase uppercases ASCII strings
389 other: the fallback function should always be called
389 other: the fallback function should always be called
390
390
391 This should be kept in sync with normcase_spec in util.h.'''
391 This should be kept in sync with normcase_spec in util.h.'''
392 lower = -1
392 lower = -1
393 upper = 1
393 upper = 1
394 other = 0
394 other = 0
395
395
396 def jsonescape(s, paranoid=False):
396 def jsonescape(s, paranoid=False):
397 '''returns a string suitable for JSON
397 '''returns a string suitable for JSON
398
398
399 JSON is problematic for us because it doesn't support non-Unicode
399 JSON is problematic for us because it doesn't support non-Unicode
400 bytes. To deal with this, we take the following approach:
400 bytes. To deal with this, we take the following approach:
401
401
402 - localstr objects are converted back to UTF-8
402 - localstr objects are converted back to UTF-8
403 - valid UTF-8/ASCII strings are passed as-is
403 - valid UTF-8/ASCII strings are passed as-is
404 - other strings are converted to UTF-8b surrogate encoding
404 - other strings are converted to UTF-8b surrogate encoding
405 - apply JSON-specified string escaping
405 - apply JSON-specified string escaping
406
406
407 (escapes are doubled in these tests)
407 (escapes are doubled in these tests)
408
408
409 >>> jsonescape(b'this is a test')
409 >>> jsonescape(b'this is a test')
410 'this is a test'
410 'this is a test'
411 >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
411 >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
412 'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
412 'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
413 >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
413 >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
414 'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
414 'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
415 >>> jsonescape(b'a weird byte: \\xdd')
415 >>> jsonescape(b'a weird byte: \\xdd')
416 'a weird byte: \\xed\\xb3\\x9d'
416 'a weird byte: \\xed\\xb3\\x9d'
417 >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
417 >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
418 'utf-8: caf\\xc3\\xa9'
418 'utf-8: caf\\xc3\\xa9'
419 >>> jsonescape(b'')
419 >>> jsonescape(b'')
420 ''
420 ''
421
421
422 If paranoid, non-ascii and common troublesome characters are also escaped.
422 If paranoid, non-ascii and common troublesome characters are also escaped.
423 This is suitable for web output.
423 This is suitable for web output.
424
424
425 >>> s = b'escape characters: \\0 \\x0b \\x7f'
425 >>> s = b'escape characters: \\0 \\x0b \\x7f'
426 >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
426 >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
427 >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
427 >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
428 >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
428 >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
429 >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
429 >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
430 'escape boundary: ~ \\\\u007f \\\\u0080'
430 'escape boundary: ~ \\\\u007f \\\\u0080'
431 >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
431 >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
432 'a weird byte: \\\\udcdd'
432 'a weird byte: \\\\udcdd'
433 >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
433 >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
434 'utf-8: caf\\\\u00e9'
434 'utf-8: caf\\\\u00e9'
435 >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
435 >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
436 'non-BMP: \\\\ud834\\\\udd1e'
436 'non-BMP: \\\\ud834\\\\udd1e'
437 >>> jsonescape(b'<foo@example.org>', paranoid=True)
437 >>> jsonescape(b'<foo@example.org>', paranoid=True)
438 '\\\\u003cfoo@example.org\\\\u003e'
438 '\\\\u003cfoo@example.org\\\\u003e'
439 '''
439 '''
440
440
441 u8chars = toutf8b(s)
441 u8chars = toutf8b(s)
442 try:
442 try:
443 return _jsonescapeu8fast(u8chars, paranoid)
443 return _jsonescapeu8fast(u8chars, paranoid)
444 except ValueError:
444 except ValueError:
445 pass
445 pass
446 return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
446 return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
447
447
448 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
448 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
449
449
450 def getutf8char(s, pos):
450 def getutf8char(s, pos):
451 '''get the next full utf-8 character in the given string, starting at pos
451 '''get the next full utf-8 character in the given string, starting at pos
452
452
453 Raises a UnicodeError if the given location does not start a valid
453 Raises a UnicodeError if the given location does not start a valid
454 utf-8 character.
454 utf-8 character.
455 '''
455 '''
456
456
457 # find how many bytes to attempt decoding from first nibble
457 # find how many bytes to attempt decoding from first nibble
458 l = _utf8len[ord(s[pos]) >> 4]
458 l = _utf8len[ord(s[pos]) >> 4]
459 if not l: # ascii
459 if not l: # ascii
460 return s[pos]
460 return s[pos]
461
461
462 c = s[pos:pos + l]
462 c = s[pos:pos + l]
463 # validate with attempted decode
463 # validate with attempted decode
464 c.decode("utf-8")
464 c.decode("utf-8")
465 return c
465 return c
466
466
def toutf8b(s):
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can have
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    # Fast path: pure-ASCII input (that isn't a localstr carrying a
    # cached UTF-8 form) is trivially valid UTF-8b already.
    if not isinstance(s, localstr) and isasciistr(s):
        return s
    # \xed is the lead byte of the U+DC00-U+DCFF surrogate encodings;
    # if it is absent, no re-escaping can be needed below.
    if "\xed" not in s:
        if isinstance(s, localstr):
            # localstr caches the known-good UTF-8 form of the string;
            # send that so Unicode clients get real Unicode data.
            return s._utf8
        try:
            # probe: if the whole string already decodes as UTF-8,
            # it passes through unmodified
            s.decode('utf-8')
            return s
        except UnicodeDecodeError:
            # fall through to the byte-by-byte escaping loop
            pass

    r = ""
    pos = 0
    l = len(s)
    # Walk the string one UTF-8 character at a time, escaping every
    # byte that is not part of a valid sequence into U+DC00 + byte.
    while pos < l:
        try:
            c = getutf8char(s, pos)
            if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
                # have to re-escape existing U+DCxx characters
                # (the three bytes above are the UTF-8 encodings of
                # U+DC00 and U+DCFF); escape only the first byte and
                # re-examine the rest on the next iteration
                c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
                pos += 1
            else:
                # valid multi-byte (or ASCII) character: copy verbatim
                pos += len(c)
        except UnicodeDecodeError:
            # invalid byte: map it into the U+DC00-U+DCFF range
            c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
            pos += 1
        r += c
    return r
525
525
def fromutf8b(s):
    '''Given a UTF-8b string, return a local, possibly-binary string.

    return the original binary string. This
    is a round-trip process for strings like filenames, but metadata
    that's was passed through tolocal will remain in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = b"\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip(b"\\xc2\\xc2\\x80")
    True
    >>> roundtrip(b"\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
    True
    '''

    # Fast path: ASCII contains no escapes by construction.
    if isasciistr(s):
        return s
    # fast path - look for uDxxx prefixes in s
    # (\xed is the UTF-8 lead byte of every U+DCxx escape; if it is
    # absent there is nothing to unescape)
    if "\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    r = ""
    pos = 0
    l = len(s)
    # Walk the string character by character, turning each escaped
    # U+DCxx character back into its original single byte.
    while pos < l:
        c = getutf8char(s, pos)
        pos += len(c)
        # unescape U+DCxx characters
        # (the bounds are the UTF-8 encodings of U+DC00 and U+DCFF)
        if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
            # low byte of the code point is the original raw byte
            c = chr(ord(c.decode("utf-8")) & 0xff)
        r += c
    return r
571
571
if pycompat.ispy3:
    class strio(io.TextIOWrapper):
        """Wrapper around TextIOWrapper that respects hg's encoding assumptions.

        Also works around Python closing streams.
        """

        def __init__(self, buffer):
            # ``encoding`` is this module's detected local encoding (a
            # bytes name under hg's conventions); TextIOWrapper wants a
            # native str, hence the _sysstr() conversion.
            super(strio, self).__init__(buffer, encoding=_sysstr(encoding))

        def __del__(self):
            """Override __del__ so it doesn't close the underlying stream."""
            # Intentionally empty: TextIOWrapper.__del__ would otherwise
            # close ``buffer`` (e.g. sys.stdout's binary buffer) when the
            # wrapper is garbage-collected.
else:
    # On Python 2 str is already bytes, so no wrapping is needed.
    strio = pycompat.identity
General Comments 0
You need to be logged in to leave comments. Login now