##// END OF EJS Templates
encoding: alias cp65001 to utf-8 on Windows...
Yuya Nishihara -
r38633:44302901 stable
parent child Browse files
Show More
@@ -1,582 +1,587 b''
1 # encoding.py - character transcoding support for Mercurial
1 # encoding.py - character transcoding support for Mercurial
2 #
2 #
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8 from __future__ import absolute_import, print_function
8 from __future__ import absolute_import, print_function
9
9
10 import locale
10 import locale
11 import os
11 import os
12 import unicodedata
12 import unicodedata
13
13
14 from . import (
14 from . import (
15 error,
15 error,
16 policy,
16 policy,
17 pycompat,
17 pycompat,
18 )
18 )
19
19
20 from .pure import (
20 from .pure import (
21 charencode as charencodepure,
21 charencode as charencodepure,
22 )
22 )
23
23
# Character-encoding primitives: policy.importmod() transparently selects
# the C implementation when available, else the pure-Python fallback.
charencode = policy.importmod(r'charencode')

# Re-export the fast-path helpers under their public module-level names.
asciilower = charencode.asciilower
asciiupper = charencode.asciiupper
isasciistr = charencode.isasciistr
_jsonescapeu8fast = charencode.jsonescapeu8fast
30
30
_sysstr = pycompat.sysstr

if pycompat.ispy3:
    # Python 3 has no unichr(); chr() covers the full codepoint range there.
    unichr = chr

# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
_ignorecodepoints = ("200c 200d 200e 200f 202a 202b 202c 202d 202e "
                     "206a 206b 206c 206d 206e 206f feff".split())
_ignore = [unichr(int(cp, 16)).encode("utf-8") for cp in _ignorecodepoints]
# hfsignoreclean() relies on every entry starting with one of these lead bytes
assert all(i.startswith(("\xe2", "\xef")) for i in _ignore)
44
44
def hfsignoreclean(s):
    """Remove codepoints ignored by HFS+ from s.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # Every ignorable sequence starts with one of these two lead bytes, so
    # a cheap containment check skips the replace loop in the common case.
    if "\xe2" not in s and "\xef" not in s:
        return s
    for ignored in _ignore:
        s = s.replace(ignored, '')
    return s
57
57
# encoding.environ is provided read-only, which may not be used to modify
# the process environment
_nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ)
if not pycompat.ispy3:
    environ = os.environ  # re-exports
elif _nativeenviron:
    environ = os.environb  # re-exports
else:
    # The preferred encoding isn't known this early, so encode with utf-8 to
    # avoid unicode errors; the dict is rebuilt once encoding is settled.
    environ = {k.encode(u'utf-8'): v.encode(u'utf-8')
               for k, v in os.environ.items()}  # re-exports
70
70
# Map locale names reported by Python to names the codec machinery accepts.
_encodingfixers = {
    '646': lambda: 'ascii',
    'ANSI_X3.4-1968': lambda: 'ascii',
}
# cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
# No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
# https://bugs.python.org/issue13216
if pycompat.iswindows and not pycompat.ispy3:
    _encodingfixers['cp65001'] = lambda: 'utf-8'

try:
    # $HGENCODING overrides the locale; otherwise ask the locale module.
    encoding = environ.get("HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding().encode('ascii') or 'ascii'
    fixer = _encodingfixers.get(encoding)
    if fixer:
        encoding = fixer()
except locale.Error:
    encoding = 'ascii'
encodingmode = environ.get("HGENCODINGMODE", "strict")
fallbackencoding = 'ISO-8859-1'
85
90
class localstr(bytes):
    '''A local-encoding byte string that remembers its exact UTF-8 form.

    Strings that round-trip unmodified between the local encoding and
    UTF-8 carry the original UTF-8 bytes along, so the conversion back
    is lossless.
    '''
    def __new__(cls, u, l):
        # the byte content is the local-encoding form; the UTF-8 original
        # rides along as an attribute
        obj = bytes.__new__(cls, l)
        obj._utf8 = u
        return obj

    def __hash__(self):
        # hash on the UTF-8 form to avoid collisions in local string space
        return hash(self._utf8)
95
100
def tolocal(s):
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    >>> u = b'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = b'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> b'foo: ?' in d
    False
    >>> l1 = b'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    # pure ASCII is valid in every encoding: nothing to convert
    if isasciistr(s):
        return s

    try:
        try:
            # make sure the string is actually stored in UTF-8
            uni = s.decode('UTF-8')
            if encoding == 'UTF-8':
                # fast path
                return s
            local = uni.encode(_sysstr(encoding), u"replace")
            if uni == local.decode(_sysstr(encoding)):
                # local is a safe, non-lossy encoding of s
                return local
            # lossy: cache the UTF-8 original on the result
            return localstr(s, local)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                uni = s.decode(_sysstr(fallbackencoding))
                local = uni.encode(_sysstr(encoding), u"replace")
                if uni == local.decode(_sysstr(encoding)):
                    # local is a safe, non-lossy encoding of s
                    return local
                return localstr(uni.encode('UTF-8'), local)
            except UnicodeDecodeError:
                uni = s.decode("utf-8", "replace") # last ditch
                # can't round-trip
                return uni.encode(_sysstr(encoding), u"replace")
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
160
165
def fromlocal(s):
    """
    Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.
    """

    # localstr caches its UTF-8 form: lossless round-trip available
    if isinstance(s, localstr):
        return s._utf8
    # pure ASCII is already valid UTF-8
    if isasciistr(s):
        return s

    try:
        uni = s.decode(_sysstr(encoding), _sysstr(encodingmode))
        return uni.encode("utf-8")
    except UnicodeDecodeError as inst:
        # show a little context around the offending byte
        context = s[max(0, inst.start - 10):inst.start + 10]
        raise error.Abort("decoding near '%s': %s!"
                          % (context, pycompat.bytestr(inst)))
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
187
192
def unitolocal(u):
    """Convert a unicode string to a byte string of local encoding"""
    utf8bytes = u.encode('utf-8')
    return tolocal(utf8bytes)
191
196
def unifromlocal(s):
    """Convert a byte string of local encoding to a unicode string"""
    utf8bytes = fromlocal(s)
    return utf8bytes.decode('utf-8')
195
200
def unimethod(bytesfunc):
    """Create a proxy method that forwards __unicode__() and __str__() of
    Python 3 to __bytes__()"""
    def unifunc(obj):
        # render via the byte-string method, then decode to unicode
        return unifromlocal(bytesfunc(obj))
    return unifunc
202
207
# Converters between native str and byte strings. Use these when the data's
# character encoding is not known (e.g. exception messages) or is locale
# dependent (e.g. date formatting).
if pycompat.ispy3:
    # native str is unicode: go through the unicode converters
    strtolocal = unitolocal
    strfromlocal = unifromlocal
    strmethod = unimethod
else:
    # native str is already a byte string: nothing to convert
    strtolocal = pycompat.identity
    strfromlocal = pycompat.identity
    strmethod = pycompat.identity
214
219
if not _nativeenviron:
    # now that encoding and the helper functions exist, rebuild the environ
    # dict exported to other modules using real local-encoding conversion
    environ = {tolocal(k.encode(u'utf-8')): tolocal(v.encode(u'utf-8'))
               for k, v in os.environ.items()}  # re-exports
220
225
# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
# "WFA" counts Wide, Fullwidth and Ambiguous east-asian classes as two
# columns; "WF" counts only Wide and Fullwidth.
if environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide":
    _wide = _sysstr("WFA")
else:
    _wide = _sysstr("WF")
224
229
def colwidth(s):
    "Find the column width of a string for display in the local encoding"
    decoded = s.decode(_sysstr(encoding), u'replace')
    return ucolwidth(decoded)
228
233
def ucolwidth(d):
    "Find the column width of a Unicode string for display"
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        # no east-asian-width data available: assume one column per char
        return len(d)
    # characters in the configured wide classes occupy two columns
    return sum(2 if eaw(c) in _wide else 1 for c in d)
235
240
def getcols(s, start, c):
    '''Use colwidth to find a c-column substring of s starting at byte
    index start'''
    # grow the candidate byte-slice until it renders at exactly c columns
    for end in xrange(start + c, len(s)):
        candidate = s[start:end]
        if colwidth(candidate) == c:
            return candidate
243
248
def trim(s, width, ellipsis='', leftside=False):
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    >>> from .node import bin
    >>> def bprint(s):
    ...     print(pycompat.sysstr(s))
    >>> ellipsis = b'+++'
    >>> from . import encoding
    >>> encoding.encoding = b'utf-8'
    >>> t = b'1234567890'
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    12345+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++67890
    >>> bprint(trim(t, 8))
    12345678
    >>> bprint(trim(t, 8, leftside=True))
    34567890
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(pycompat.sysstr(encoding.encoding))
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 5))
    \xe3\x81\x82\xe3\x81\x84
    >>> bprint(trim(t, 5, leftside=True))
    \xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 4, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
    +++
    >>> t = bin(b'112233445566778899aa') # invalid byte sequence
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8))
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> bprint(trim(t, 8, leftside=True))
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    """
    try:
        uni = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        # undecodable input: fall back to trimming byte-wise, where every
        # byte counts as one column
        if len(s) <= width: # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0: # no enough room even for ellipsis
            return ellipsis[:width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(uni) <= width: # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0: # no enough room even for ellipsis
        return ellipsis[:width + len(ellipsis)]

    # drop characters from the trimmed side until the remainder fits
    for i in xrange(1, len(uni)):
        if leftside:
            cand = uni[i:]
        else:
            cand = uni[:-i]
        if ucolwidth(cand) <= width:
            encoded = cand.encode(_sysstr(encoding))
            if leftside:
                return ellipsis + encoded
            return encoded + ellipsis
    return ellipsis # no enough room for multi-column characters
339
344
def lower(s):
    "best-effort encoding-aware case-folding of local string s"
    try:
        # ASCII-only fast path implemented in C
        return asciilower(s)
    except UnicodeDecodeError:
        pass
    try:
        if isinstance(s, localstr):
            uni = s._utf8.decode("utf-8")
        else:
            uni = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        folded = uni.lower()
        if uni == folded:
            return s # preserve localstring
        return folded.encode(_sysstr(encoding))
    except UnicodeError:
        return s.lower() # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
360
365
def upper(s):
    "best-effort encoding-aware case-folding of local string s"
    try:
        # ASCII-only fast path implemented in C
        return asciiupper(s)
    except UnicodeDecodeError:
        # non-ASCII bytes present: take the encoding-aware slow path
        return upperfallback(s)
367
372
def upperfallback(s):
    "encoding-aware uppercasing for strings asciiupper() cannot handle"
    try:
        if isinstance(s, localstr):
            uni = s._utf8.decode("utf-8")
        else:
            uni = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        folded = uni.upper()
        if uni == folded:
            return s # preserve localstring
        return folded.encode(_sysstr(encoding))
    except UnicodeError:
        return s.upper() # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
383
388
class normcasespecs(object):
    '''what a platform's normcase does to ASCII strings

    This is specified per platform, and should be consistent with what normcase
    on that platform actually does.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h.'''
    # enum-like constants; the numeric values mirror util.h
    lower = -1
    upper = 1
    other = 0
398
403
def jsonescape(s, paranoid=False):
    '''returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape(b'this is a test')
    'this is a test'
    >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
    'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
    >>> jsonescape(b'a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape(b'')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> s = b'escape characters: \\0 \\x0b \\x7f'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape(b'<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    '''

    u8chars = toutf8b(s)
    try:
        # fast path: the C implementation raises ValueError on input it
        # cannot handle
        return _jsonescapeu8fast(u8chars, paranoid)
    except ValueError:
        pass
    # slow path: pure-Python fallback handles everything
    return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
450
455
# Invalid UTF-8 bytes are mapped into the U+DCxx surrogate range, so those
# codes must be decoded/encoded transparently (Python 3 only; Python 2's
# codecs pass surrogates through under 'strict').
if pycompat.ispy3:
    _utf8strict = r'surrogatepass'
else:
    _utf8strict = r'strict'

# Length in bytes of a UTF-8 sequence, indexed by the high nibble of its
# lead byte; 0 marks a plain ASCII byte.
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
459
464
def getutf8char(s, pos):
    '''get the next full utf-8 character in the given string, starting at pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    '''

    # the high nibble of the lead byte determines the sequence length
    length = _utf8len[ord(s[pos:pos + 1]) >> 4]
    if not length: # plain ascii byte
        return s[pos:pos + 1]

    char = s[pos:pos + length]
    # validate by attempting to decode the candidate sequence
    char.decode("utf-8", _utf8strict)
    return char
476
481
def toutf8b(s):
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can have
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    # pure-ASCII input (that isn't a localstr carrying a cached UTF-8
    # form) needs no escaping at all
    if not isinstance(s, localstr) and isasciistr(s):
        return s
    if "\xed" not in s:
        # no surrogate lead byte present, so nothing to re-escape;
        # the cached or directly-valid UTF-8 fast paths apply
        if isinstance(s, localstr):
            return s._utf8
        try:
            s.decode('utf-8', _utf8strict)
            return s
        except UnicodeDecodeError:
            pass

    # slow path: walk the bytes, copying valid characters and escaping
    # everything else into the U+DCxx surrogate range
    s = pycompat.bytestr(s)
    out = ""
    idx = 0
    end = len(s)
    while idx < end:
        try:
            ch = getutf8char(s, idx)
            if "\xed\xb0\x80" <= ch <= "\xed\xb3\xbf":
                # have to re-escape existing U+DCxx characters
                ch = unichr(0xdc00 + ord(s[idx])).encode('utf-8', _utf8strict)
                idx += 1
            else:
                idx += len(ch)
        except UnicodeDecodeError:
            # invalid byte: map it into the surrogate escape range
            ch = unichr(0xdc00 + ord(s[idx])).encode('utf-8', _utf8strict)
            idx += 1
        out += ch
    return out
536
541
def fromutf8b(s):
    '''Given a UTF-8b string, return a local, possibly-binary string.

    return the original binary string. This is a round-trip process
    for strings like filenames, but metadata that was passed through
    tolocal will remain in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = b"\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip(b"\\xc2\\xc2\\x80")
    True
    >>> roundtrip(b"\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
    True
    '''

    if isasciistr(s):
        return s
    # fast path - look for uDxxx prefixes in s
    if "\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    s = pycompat.bytestr(s)
    out = ""
    idx = 0
    end = len(s)
    while idx < end:
        ch = getutf8char(s, idx)
        idx += len(ch)
        # unescape U+DCxx characters: recover the original byte from
        # the low 8 bits of the surrogate code point
        if "\xed\xb0\x80" <= ch <= "\xed\xb3\xbf":
            ch = pycompat.bytechr(ord(ch.decode("utf-8", _utf8strict)) & 0xff)
        out += ch
    return out
General Comments 0
You need to be logged in to leave comments. Login now