##// END OF EJS Templates
encoding: ensure getutf8char always returns a bytestr, never an int
Augie Fackler -
r34197:112f118e default
parent child Browse files
Show More
@@ -1,588 +1,588 b''
1 # encoding.py - character transcoding support for Mercurial
1 # encoding.py - character transcoding support for Mercurial
2 #
2 #
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8 from __future__ import absolute_import, print_function
8 from __future__ import absolute_import, print_function
9
9
10 import io
10 import io
11 import locale
11 import locale
12 import os
12 import os
13 import unicodedata
13 import unicodedata
14
14
15 from . import (
15 from . import (
16 error,
16 error,
17 policy,
17 policy,
18 pycompat,
18 pycompat,
19 )
19 )
20
20
21 from .pure import (
21 from .pure import (
22 charencode as charencodepure,
22 charencode as charencodepure,
23 )
23 )
24
24
# charencode is loaded through policy.importmod; its helpers are re-exported
# below under module-level names.
charencode = policy.importmod(r'charencode')

isasciistr = charencode.isasciistr
asciilower = charencode.asciilower
asciiupper = charencode.asciiupper
_jsonescapeu8fast = charencode.jsonescapeu8fast

_sysstr = pycompat.sysstr

if pycompat.ispy3:
    # provide the unichr name used below when running on Python 3
    unichr = chr

# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity. Each entry is the UTF-8 encoding of one such codepoint.
_ignore = [unichr(int(x, 16)).encode("utf-8") for x in
           "200c 200d 200e 200f 202a 202b 202c 202d 202e "
           "206a 206b 206c 206d 206e 206f feff".split()]
# hfsignoreclean() pre-screens its input on these two lead bytes, so every
# ignored sequence has to begin with one of them
assert all(seq.startswith(("\xe2", "\xef")) for seq in _ignore)
45
45
def hfsignoreclean(s):
    """Remove codepoints ignored by HFS+ from s.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # Cheap pre-screen: every entry of _ignore starts with one of these two
    # bytes, so a string containing neither cannot hold an ignored sequence.
    if "\xe2" not in s and "\xef" not in s:
        return s
    cleaned = s
    for ignored in _ignore:
        cleaned = cleaned.replace(ignored, '')
    return cleaned
58
58
# encoding.environ is provided read-only, which may not be used to modify
# the process environment
_nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ)
if not pycompat.ispy3:
    environ = os.environ # re-exports
elif _nativeenviron:
    environ = os.environb # re-exports
else:
    # preferred encoding isn't known yet; use utf-8 to avoid unicode error
    # and recreate it once encoding is settled
    environ = {k.encode(u'utf-8'): v.encode(u'utf-8')
               for k, v in os.environ.items()} # re-exports

# Maps encoding names reported by the platform to the name used instead;
# both entries below resolve to 'ascii'.
_encodingfixers = {
    '646': lambda: 'ascii',
    'ANSI_X3.4-1968': lambda: 'ascii',
}

try:
    # HGENCODING wins when set and non-empty; otherwise ask the locale
    encoding = environ.get("HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding().encode('ascii') or 'ascii'
        # the default lambda returns the name unchanged when no fixer exists
        encoding = _encodingfixers.get(encoding, lambda: encoding)()
except locale.Error:
    encoding = 'ascii'
encodingmode = environ.get("HGENCODINGMODE", "strict")
fallbackencoding = 'ISO-8859-1'
86
86
class localstr(bytes):
    '''Local-encoding byte string that also carries its UTF-8 form

    The instance value is the local representation (l); the UTF-8 bytes (u)
    are kept in the _utf8 attribute so an unmodified string can be
    round-tripped to the local encoding and back.'''
    def __new__(cls, u, l):
        inst = bytes.__new__(cls, l)
        inst._utf8 = u
        return inst

    def __hash__(self):
        # hash the UTF-8 form to avoid collisions in local string space
        return hash(self._utf8)
96
96
def tolocal(s):
    """
    Convert a string from internal UTF-8 to local encoding

    Internal strings are expected to be UTF-8, but repos that predate
    locale support may contain latin1 or possibly other character sets.
    Decoding is attempted strictly as UTF-8, then with the fallback
    encoding (Latin-1), and as a last resort as UTF-8 with unknown
    characters replaced.

    When the local encoding cannot represent the text losslessly, a
    localstr is returned: it caches the known UTF-8 bytes next to the
    local representation so fromlocal() can recover them exactly.

    >>> u = b'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = b'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> b'foo: ?' in d
    False
    >>> l1 = b'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    if isasciistr(s):
        return s

    try:
        try:
            # make sure string is actually stored in UTF-8
            text = s.decode('UTF-8')
            if encoding == 'UTF-8':
                # fast path
                return s
            local = text.encode(_sysstr(encoding), u"replace")
            if text != local.decode(_sysstr(encoding)):
                # lossy: keep the original UTF-8 alongside the local bytes
                return localstr(s, local)
            # local is a safe, non-lossy encoding of s
            return local
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                text = s.decode(_sysstr(fallbackencoding))
                local = text.encode(_sysstr(encoding), u"replace")
                if text != local.decode(_sysstr(encoding)):
                    # lossy: cache the re-encoded UTF-8 form
                    return localstr(text.encode('UTF-8'), local)
                # local is a safe, non-lossy encoding of s
                return local
            except UnicodeDecodeError:
                text = s.decode("utf-8", "replace") # last ditch
                # can't round-trip
                return text.encode(_sysstr(encoding), u"replace")
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
161
161
def fromlocal(s):
    """
    Convert a string from the local character encoding to UTF-8

    Decoding uses the error mode set by HGENCODINGMODE, which defaults to
    'strict'. In this mode, unknown characters will cause an error
    message. Other modes include 'replace', which replaces unknown
    characters with a special Unicode character, and 'ignore', which
    drops the character.
    """

    # can we do a lossless round-trip?
    if isinstance(s, localstr):
        return s._utf8
    if isasciistr(s):
        return s

    try:
        decoded = s.decode(_sysstr(encoding), _sysstr(encodingmode))
        return decoded.encode("utf-8")
    except UnicodeDecodeError as inst:
        # quote up to ten bytes either side of the failure position
        lo = max(0, inst.start - 10)
        hi = inst.start + 10
        raise error.Abort("decoding near '%s': %s!" % (s[lo:hi], inst))
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
187
187
def unitolocal(u):
    """Convert a unicode string to a byte string of local encoding"""
    # go through UTF-8 bytes, the input format tolocal() expects
    utf8 = u.encode('utf-8')
    return tolocal(utf8)
191
191
def unifromlocal(s):
    """Convert a byte string of local encoding to a unicode string"""
    # fromlocal() yields UTF-8 bytes, which are then decoded
    utf8 = fromlocal(s)
    return utf8.decode('utf-8')
195
195
def unimethod(bytesfunc):
    """Create a proxy method that forwards __unicode__() and __str__() of
    Python 3 to __bytes__()

    The returned function calls bytesfunc on the object and converts the
    resulting local-encoding bytes with unifromlocal()."""
    def unifunc(obj):
        localbytes = bytesfunc(obj)
        return unifromlocal(localbytes)
    return unifunc
202
202
# converter functions between native str and byte string. use these if the
# character encoding is not aware (e.g. exception message) or is known to
# be locale dependent (e.g. date formatting.)
if pycompat.ispy3:
    strtolocal = unitolocal
    strfromlocal = unifromlocal
    strmethod = unimethod
else:
    # no conversion is applied on this branch
    strtolocal = pycompat.identity
    strfromlocal = pycompat.identity
    strmethod = pycompat.identity

if not _nativeenviron:
    # now encoding and helper functions are available, recreate the environ
    # dict to be exported to other modules
    environ = {tolocal(k.encode(u'utf-8')): tolocal(v.encode(u'utf-8'))
               for k, v in os.environ.items()} # re-exports

# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
# The value is the set of east_asian_width() classes counted as two columns.
_wide = _sysstr(
    "WFA" if environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide" else "WF")
224
224
def colwidth(s):
    "Find the column width of a string for display in the local encoding"
    # undecodable bytes are replaced rather than raising
    text = s.decode(_sysstr(encoding), u'replace')
    return ucolwidth(text)
228
228
def ucolwidth(d):
    """Find the column width of a Unicode string for display

    Characters whose east_asian_width() class is listed in _wide count as
    two columns, all others as one. If unicodedata does not provide
    east_asian_width, the width is simply the number of characters.
    """
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        return len(d)
    # conditional expression instead of the fragile "x and 2 or 1" idiom,
    # and a generator so no intermediate list is built
    return sum(2 if eaw(ch) in _wide else 1 for ch in d)
235
235
def getcols(s, start, c):
    '''Use colwidth to find a c-column substring of s starting at byte
    index start

    Returns the first slice s[start:end] whose column width equals c, or
    None when no candidate end position matches.'''
    # NOTE(review): candidate ends stop at len(s) - 1, so the slice that
    # runs to the very end of s is never tested -- confirm this is intended.
    for end in xrange(start + c, len(s)):
        piece = s[start:end]
        if colwidth(piece) == c:
            return piece
    return None
243
243
def trim(s, width, ellipsis='', leftside=False):
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    If 's' cannot be decoded in the local encoding, it is trimmed
    byte-wise instead of column-wise.

    >>> from .node import bin
    >>> def bprint(s):
    ...     print(pycompat.sysstr(s))
    >>> ellipsis = b'+++'
    >>> from . import encoding
    >>> encoding.encoding = b'utf-8'
    >>> t = b'1234567890'
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    12345+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++67890
    >>> bprint(trim(t, 8))
    12345678
    >>> bprint(trim(t, 8, leftside=True))
    34567890
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(pycompat.sysstr(encoding.encoding))
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 5))
    \xe3\x81\x82\xe3\x81\x84
    >>> bprint(trim(t, 5, leftside=True))
    \xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 4, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
    +++
    >>> t = bin(b'112233445566778899aa') # invalid byte sequence
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8))
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> bprint(trim(t, 8, leftside=True))
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    """
    # columns left for the text itself once the ellipsis is accounted for
    room = width - len(ellipsis)

    try:
        u = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        # undecodable input: fall back to counting one column per byte
        if len(s) <= width: # trimming is not needed
            return s
        if room <= 0: # no enough room even for ellipsis
            return ellipsis[:width]
        if leftside:
            return ellipsis + s[-room:]
        return s[:room] + ellipsis

    if ucolwidth(u) <= width: # trimming is not needed
        return s

    if room <= 0: # no enough room even for ellipsis
        return ellipsis[:width]

    # drop one more character from the trimmed side until the rest fits
    for drop in xrange(1, len(u)):
        kept = u[drop:] if leftside else u[:-drop]
        if ucolwidth(kept) <= room:
            local = kept.encode(_sysstr(encoding))
            if leftside:
                return ellipsis + local
            return local + ellipsis
    return ellipsis # no enough room for multi-column characters
339
339
def lower(s):
    "best-effort encoding-aware case-folding of local string s"
    try:
        return asciilower(s)
    except UnicodeDecodeError:
        # asciilower could not handle s; use the encoding-aware path below
        pass
    try:
        if isinstance(s, localstr):
            # prefer the cached UTF-8 form over the local bytes
            text = s._utf8.decode("utf-8")
        else:
            text = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        folded = text.lower()
        if folded != text:
            return folded.encode(_sysstr(encoding))
        return s # preserve localstring
    except UnicodeError:
        return s.lower() # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
360
360
def upper(s):
    "best-effort encoding-aware case-folding of local string s"
    try:
        result = asciiupper(s)
    except UnicodeDecodeError:
        # asciiupper could not handle s; use the encoding-aware fallback
        result = upperfallback(s)
    return result
367
367
def upperfallback(s):
    "encoding-aware uppercasing of local string s, used when asciiupper fails"
    try:
        if isinstance(s, localstr):
            # prefer the cached UTF-8 form over the local bytes
            text = s._utf8.decode("utf-8")
        else:
            text = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        folded = text.upper()
        if folded != text:
            return folded.encode(_sysstr(encoding))
        return s # preserve localstring
    except UnicodeError:
        return s.upper() # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
383
383
class normcasespecs(object):
    '''what a platform's normcase does to ASCII strings

    The value is chosen per platform and should agree with what normcase
    actually does on that platform.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h.'''
    other = 0
    upper = 1
    lower = -1
398
398
def jsonescape(s, paranoid=False):
    '''returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape(b'this is a test')
    'this is a test'
    >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
    'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
    >>> jsonescape(b'a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape(b'')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> s = b'escape characters: \\0 \\x0b \\x7f'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape(b'<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    '''

    utf8b = toutf8b(s)
    try:
        # try the escaper from the selected charencode module first
        return _jsonescapeu8fast(utf8b, paranoid)
    except ValueError:
        # it rejected the input; the pure-Python fallback handles it below
        pass
    return charencodepure.jsonescapeu8fallback(utf8b, paranoid)
450
450
# Indexed by the high nibble of a byte: how many bytes to try decoding as one
# UTF-8 character. 0 marks a plain ASCII byte (high nibble 0x0-0x7).
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]

def getutf8char(s, pos):
    '''get the next full utf-8 character in the given string, starting at pos

    The result is always a bytestring, including for a single ASCII byte.

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    '''

    # find how many bytes to attempt decoding from first nibble
    # (slice instead of index: indexing bytes yields an int on Python 3,
    # which ord() would reject)
    l = _utf8len[ord(s[pos:pos + 1]) >> 4]
    if not l: # ascii
        # a one-byte slice, never s[pos], so the caller gets bytes not an int
        return s[pos:pos + 1]

    c = s[pos:pos + l]
    # validate with attempted decode
    c.decode("utf-8")
    return c
469
469
def toutf8b(s):
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)

    Returns the cached UTF-8 form for a localstr, s itself when it is
    already ASCII or valid UTF-8 without any 0xed byte, and a newly
    built escaped string otherwise.
    '''

    if isinstance(s, localstr):
        # Always prefer the cached UTF-8 form. Checking this before the
        # 0xed test keeps the "localstr gets sent as UTF-8" guarantee
        # even when the local-encoding bytes happen to contain 0xed,
        # which would otherwise be escaped byte-by-byte below.
        return s._utf8
    if isasciistr(s):
        return s
    if "\xed" not in s:
        # no possible U+DCxx sequence: valid UTF-8 passes through as-is
        try:
            s.decode('utf-8')
            return s
        except UnicodeDecodeError:
            pass

    # Slow path: walk s one UTF-8 character at a time, escaping every
    # byte that does not start a valid character. Pieces are collected
    # in a list and joined once instead of growing a string in the loop.
    parts = []
    pos = 0
    end = len(s)
    while pos < end:
        try:
            c = getutf8char(s, pos)
        except UnicodeDecodeError:
            c = None
        if c is None or "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
            # Invalid byte, or an existing U+DCxx character that has to
            # be re-escaped: map the single byte at pos into the
            # U+DC00-U+DCFF range and advance by exactly one byte.
            c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
            pos += 1
        else:
            pos += len(c)
        parts.append(c)
    return "".join(parts)
528
528
def fromutf8b(s):
    '''Given a UTF-8b string, return a local, possibly-binary string.

    Recover the original binary string from its UTF-8b form. This
    is a round-trip process for strings like filenames, but metadata
    that was passed through tolocal will remain in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = b"\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip(b"\\xc2\\xc2\\x80")
    True
    >>> roundtrip(b"\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
    True
    '''

    # fast paths - pure ASCII, or no uDxxx prefix byte anywhere in s
    if isasciistr(s) or "\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    pieces = []
    offset = 0
    end = len(s)
    while offset < end:
        char = getutf8char(s, offset)
        offset += len(char)
        if "\xed\xb0\x80" <= char <= "\xed\xb3\xbf":
            # unescape U+DCxx characters: the low eight bits of the code
            # point are the original byte value
            char = chr(ord(char.decode("utf-8")) & 0xff)
        pieces.append(char)
    return "".join(pieces)
574
574
if not pycompat.ispy3:
    # on Python 2 the stream is handed back as-is
    strio = pycompat.identity
else:
    class strio(io.TextIOWrapper):
        """TextIOWrapper subclass that uses hg's configured encoding.

        Also works around Python closing streams.
        """

        def __init__(self, buffer):
            # the module-level encoding is converted with _sysstr before
            # being given to TextIOWrapper
            sysencoding = _sysstr(encoding)
            super(strio, self).__init__(buffer, encoding=sysencoding)

        def __del__(self):
            """Override __del__ so it doesn't close the underlying stream."""
General Comments 0
You need to be logged in to leave comments. Login now