encoding: extract stub for fast JSON escape...
Yuya Nishihara
r33925:b9101467 default
--- a/mercurial/encoding.py
+++ b/mercurial/encoding.py
@@ -1,591 +1,571 @@
 # encoding.py - character transcoding support for Mercurial
 #
 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
 #
 # This software may be used and distributed according to the terms of the
 # GNU General Public License version 2 or any later version.
 
 from __future__ import absolute_import
 
-import array
 import io
 import locale
 import os
 import unicodedata
 
 from . import (
     error,
     policy,
     pycompat,
 )
 
+from .pure import (
+    charencode as charencodepure,
+)
+
 charencode = policy.importmod(r'charencode')
 
 asciilower = charencode.asciilower
 asciiupper = charencode.asciiupper
+_jsonescapeu8fast = charencodepure.jsonescapeu8fast # TODO: no "pure"
 
 _sysstr = pycompat.sysstr
 
 if pycompat.ispy3:
     unichr = chr
 
 # These unicode characters are ignored by HFS+ (Apple Technote 1150,
 # "Unicode Subtleties"), so we need to ignore them in some places for
 # sanity.
 _ignore = [unichr(int(x, 16)).encode("utf-8") for x in
            "200c 200d 200e 200f 202a 202b 202c 202d 202e "
            "206a 206b 206c 206d 206e 206f feff".split()]
 # verify the next function will work
 assert all(i.startswith(("\xe2", "\xef")) for i in _ignore)
 
 def hfsignoreclean(s):
     """Remove codepoints ignored by HFS+ from s.
 
     >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
     '.hg'
     >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
     '.hg'
     """
     if "\xe2" in s or "\xef" in s:
         for c in _ignore:
             s = s.replace(c, '')
     return s
 
 # encoding.environ is provided read-only, which may not be used to modify
 # the process environment
 _nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ)
 if not pycompat.ispy3:
     environ = os.environ # re-exports
 elif _nativeenviron:
     environ = os.environb # re-exports
 else:
     # preferred encoding isn't known yet; use utf-8 to avoid unicode error
     # and recreate it once encoding is settled
     environ = dict((k.encode(u'utf-8'), v.encode(u'utf-8'))
                    for k, v in os.environ.items()) # re-exports
 
 _encodingfixers = {
     '646': lambda: 'ascii',
     'ANSI_X3.4-1968': lambda: 'ascii',
 }
 
 try:
     encoding = environ.get("HGENCODING")
     if not encoding:
         encoding = locale.getpreferredencoding().encode('ascii') or 'ascii'
         encoding = _encodingfixers.get(encoding, lambda: encoding)()
 except locale.Error:
     encoding = 'ascii'
 encodingmode = environ.get("HGENCODINGMODE", "strict")
 fallbackencoding = 'ISO-8859-1'
 
 class localstr(bytes):
     '''This class allows strings that are unmodified to be
     round-tripped to the local encoding and back'''
     def __new__(cls, u, l):
         s = bytes.__new__(cls, l)
         s._utf8 = u
         return s
     def __hash__(self):
         return hash(self._utf8) # avoid collisions in local string space
 
 def tolocal(s):
     """
     Convert a string from internal UTF-8 to local encoding
 
     All internal strings should be UTF-8 but some repos before the
     implementation of locale support may contain latin1 or possibly
     other character sets. We attempt to decode everything strictly
     using UTF-8, then Latin-1, and failing that, we use UTF-8 and
     replace unknown characters.
 
     The localstr class is used to cache the known UTF-8 encoding of
     strings next to their local representation to allow lossless
     round-trip conversion back to UTF-8.
 
     >>> u = 'foo: \\xc3\\xa4' # utf-8
     >>> l = tolocal(u)
     >>> l
     'foo: ?'
     >>> fromlocal(l)
     'foo: \\xc3\\xa4'
     >>> u2 = 'foo: \\xc3\\xa1'
     >>> d = { l: 1, tolocal(u2): 2 }
     >>> len(d) # no collision
     2
     >>> 'foo: ?' in d
     False
     >>> l1 = 'foo: \\xe4' # historical latin1 fallback
     >>> l = tolocal(l1)
     >>> l
     'foo: ?'
     >>> fromlocal(l) # magically in utf-8
     'foo: \\xc3\\xa4'
     """
 
     try:
         try:
             # make sure string is actually stored in UTF-8
             u = s.decode('UTF-8')
             if encoding == 'UTF-8':
                 # fast path
                 return s
             r = u.encode(_sysstr(encoding), u"replace")
             if u == r.decode(_sysstr(encoding)):
                 # r is a safe, non-lossy encoding of s
                 return r
             return localstr(s, r)
         except UnicodeDecodeError:
             # we should only get here if we're looking at an ancient changeset
             try:
                 u = s.decode(_sysstr(fallbackencoding))
                 r = u.encode(_sysstr(encoding), u"replace")
                 if u == r.decode(_sysstr(encoding)):
                     # r is a safe, non-lossy encoding of s
                     return r
                 return localstr(u.encode('UTF-8'), r)
             except UnicodeDecodeError:
                 u = s.decode("utf-8", "replace") # last ditch
                 # can't round-trip
                 return u.encode(_sysstr(encoding), u"replace")
     except LookupError as k:
         raise error.Abort(k, hint="please check your locale settings")
 
 def fromlocal(s):
     """
     Convert a string from the local character encoding to UTF-8
 
     We attempt to decode strings using the encoding mode set by
     HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
     characters will cause an error message. Other modes include
     'replace', which replaces unknown characters with a special
     Unicode character, and 'ignore', which drops the character.
     """
 
     # can we do a lossless round-trip?
     if isinstance(s, localstr):
         return s._utf8
 
     try:
         u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
         return u.encode("utf-8")
     except UnicodeDecodeError as inst:
         sub = s[max(0, inst.start - 10):inst.start + 10]
         raise error.Abort("decoding near '%s': %s!" % (sub, inst))
     except LookupError as k:
         raise error.Abort(k, hint="please check your locale settings")
 
 def unitolocal(u):
     """Convert a unicode string to a byte string of local encoding"""
     return tolocal(u.encode('utf-8'))
 
 def unifromlocal(s):
     """Convert a byte string of local encoding to a unicode string"""
     return fromlocal(s).decode('utf-8')
 
 def unimethod(bytesfunc):
     """Create a proxy method that forwards __unicode__() and __str__() of
     Python 3 to __bytes__()"""
     def unifunc(obj):
         return unifromlocal(bytesfunc(obj))
     return unifunc
 
 # converter functions between native str and byte string. use these if the
 # character encoding is not known (e.g. exception messages) or is known to
 # be locale dependent (e.g. date formatting.)
 if pycompat.ispy3:
     strtolocal = unitolocal
     strfromlocal = unifromlocal
     strmethod = unimethod
 else:
     strtolocal = pycompat.identity
     strfromlocal = pycompat.identity
     strmethod = pycompat.identity
 
 if not _nativeenviron:
     # now encoding and helper functions are available, recreate the environ
     # dict to be exported to other modules
     environ = dict((tolocal(k.encode(u'utf-8')), tolocal(v.encode(u'utf-8')))
                    for k, v in os.environ.items()) # re-exports
 
 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
 _wide = _sysstr(environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
                 and "WFA" or "WF")
 
 def colwidth(s):
     "Find the column width of a string for display in the local encoding"
     return ucolwidth(s.decode(_sysstr(encoding), u'replace'))
 
 def ucolwidth(d):
     "Find the column width of a Unicode string for display"
     eaw = getattr(unicodedata, 'east_asian_width', None)
     if eaw is not None:
         return sum([eaw(c) in _wide and 2 or 1 for c in d])
     return len(d)
 
 def getcols(s, start, c):
     '''Use colwidth to find a c-column substring of s starting at byte
     index start'''
     for x in xrange(start + c, len(s)):
         t = s[start:x]
         if colwidth(t) == c:
             return t
 
 def trim(s, width, ellipsis='', leftside=False):
     """Trim string 's' to at most 'width' columns (including 'ellipsis').
 
     If 'leftside' is True, the left side of string 's' is trimmed.
     'ellipsis' is always placed at the trimmed side.
 
     >>> ellipsis = '+++'
     >>> from . import encoding
     >>> encoding.encoding = 'utf-8'
     >>> t = '1234567890'
     >>> print trim(t, 12, ellipsis=ellipsis)
     1234567890
     >>> print trim(t, 10, ellipsis=ellipsis)
     1234567890
     >>> print trim(t, 8, ellipsis=ellipsis)
     12345+++
     >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
     +++67890
     >>> print trim(t, 8)
     12345678
     >>> print trim(t, 8, leftside=True)
     34567890
     >>> print trim(t, 3, ellipsis=ellipsis)
     +++
     >>> print trim(t, 1, ellipsis=ellipsis)
     +
     >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
     >>> t = u.encode(encoding.encoding)
     >>> print trim(t, 12, ellipsis=ellipsis)
     \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
     >>> print trim(t, 10, ellipsis=ellipsis)
     \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
     >>> print trim(t, 8, ellipsis=ellipsis)
     \xe3\x81\x82\xe3\x81\x84+++
     >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
     +++\xe3\x81\x88\xe3\x81\x8a
     >>> print trim(t, 5)
     \xe3\x81\x82\xe3\x81\x84
     >>> print trim(t, 5, leftside=True)
     \xe3\x81\x88\xe3\x81\x8a
     >>> print trim(t, 4, ellipsis=ellipsis)
     +++
     >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
     +++
     >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
     >>> print trim(t, 12, ellipsis=ellipsis)
     \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
     >>> print trim(t, 10, ellipsis=ellipsis)
     \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
     >>> print trim(t, 8, ellipsis=ellipsis)
     \x11\x22\x33\x44\x55+++
     >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
     +++\x66\x77\x88\x99\xaa
     >>> print trim(t, 8)
     \x11\x22\x33\x44\x55\x66\x77\x88
     >>> print trim(t, 8, leftside=True)
     \x33\x44\x55\x66\x77\x88\x99\xaa
     >>> print trim(t, 3, ellipsis=ellipsis)
     +++
     >>> print trim(t, 1, ellipsis=ellipsis)
     +
     """
     try:
         u = s.decode(_sysstr(encoding))
     except UnicodeDecodeError:
         if len(s) <= width: # trimming is not needed
             return s
         width -= len(ellipsis)
         if width <= 0: # not enough room even for ellipsis
             return ellipsis[:width + len(ellipsis)]
         if leftside:
             return ellipsis + s[-width:]
         return s[:width] + ellipsis
 
     if ucolwidth(u) <= width: # trimming is not needed
         return s
 
     width -= len(ellipsis)
     if width <= 0: # not enough room even for ellipsis
         return ellipsis[:width + len(ellipsis)]
 
     if leftside:
         uslice = lambda i: u[i:]
         concat = lambda s: ellipsis + s
     else:
         uslice = lambda i: u[:-i]
         concat = lambda s: s + ellipsis
     for i in xrange(1, len(u)):
         usub = uslice(i)
         if ucolwidth(usub) <= width:
             return concat(usub.encode(_sysstr(encoding)))
     return ellipsis # not enough room for multi-column characters
 
 def lower(s):
     "best-effort encoding-aware case-folding of local string s"
     try:
         return asciilower(s)
     except UnicodeDecodeError:
         pass
     try:
         if isinstance(s, localstr):
             u = s._utf8.decode("utf-8")
         else:
             u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
 
         lu = u.lower()
         if u == lu:
             return s # preserve localstring
         return lu.encode(_sysstr(encoding))
     except UnicodeError:
         return s.lower() # we don't know how to fold this except in ASCII
     except LookupError as k:
         raise error.Abort(k, hint="please check your locale settings")
 
 def upper(s):
     "best-effort encoding-aware case-folding of local string s"
     try:
         return asciiupper(s)
     except UnicodeDecodeError:
         return upperfallback(s)
 
 def upperfallback(s):
     try:
         if isinstance(s, localstr):
             u = s._utf8.decode("utf-8")
         else:
             u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
 
         uu = u.upper()
         if u == uu:
             return s # preserve localstring
         return uu.encode(_sysstr(encoding))
     except UnicodeError:
         return s.upper() # we don't know how to fold this except in ASCII
     except LookupError as k:
         raise error.Abort(k, hint="please check your locale settings")
 
 class normcasespecs(object):
     '''what a platform's normcase does to ASCII strings
 
     This is specified per platform, and should be consistent with what normcase
     on that platform actually does.
 
     lower: normcase lowercases ASCII strings
     upper: normcase uppercases ASCII strings
     other: the fallback function should always be called
 
     This should be kept in sync with normcase_spec in util.h.'''
     lower = -1
     upper = 1
     other = 0
 
-_jsonmap = []
-_jsonmap.extend("\\u%04x" % x for x in range(32))
-_jsonmap.extend(pycompat.bytechr(x) for x in range(32, 127))
-_jsonmap.append('\\u007f')
-_jsonmap[0x09] = '\\t'
-_jsonmap[0x0a] = '\\n'
-_jsonmap[0x22] = '\\"'
-_jsonmap[0x5c] = '\\\\'
-_jsonmap[0x08] = '\\b'
-_jsonmap[0x0c] = '\\f'
-_jsonmap[0x0d] = '\\r'
-_paranoidjsonmap = _jsonmap[:]
-_paranoidjsonmap[0x3c] = '\\u003c' # '<' (e.g. escape "</script>")
-_paranoidjsonmap[0x3e] = '\\u003e' # '>'
-_jsonmap.extend(pycompat.bytechr(x) for x in range(128, 256))
-
 def jsonescape(s, paranoid=False):
     '''returns a string suitable for JSON
 
     JSON is problematic for us because it doesn't support non-Unicode
     bytes. To deal with this, we take the following approach:
 
     - localstr objects are converted back to UTF-8
     - valid UTF-8/ASCII strings are passed as-is
     - other strings are converted to UTF-8b surrogate encoding
     - apply JSON-specified string escaping
 
     (escapes are doubled in these tests)
 
     >>> jsonescape('this is a test')
     'this is a test'
     >>> jsonescape('escape characters: \\0 \\x0b \\x7f')
     'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
     >>> jsonescape('escape characters: \\t \\n \\r \\" \\\\')
     'escape characters: \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
     >>> jsonescape('a weird byte: \\xdd')
     'a weird byte: \\xed\\xb3\\x9d'
     >>> jsonescape('utf-8: caf\\xc3\\xa9')
     'utf-8: caf\\xc3\\xa9'
     >>> jsonescape('')
     ''
 
     If paranoid, non-ASCII and common troublesome characters are also escaped.
     This is suitable for web output.
 
     >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
     'escape boundary: ~ \\\\u007f \\\\u0080'
     >>> jsonescape('a weird byte: \\xdd', paranoid=True)
     'a weird byte: \\\\udcdd'
     >>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True)
     'utf-8: caf\\\\u00e9'
     >>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
     'non-BMP: \\\\ud834\\\\udd1e'
     >>> jsonescape('<foo@example.org>', paranoid=True)
     '\\\\u003cfoo@example.org\\\\u003e'
     '''
 
-    if paranoid:
-        jm = _paranoidjsonmap
-    else:
-        jm = _jsonmap
-
     u8chars = toutf8b(s)
     try:
-        return ''.join(jm[x] for x in bytearray(u8chars)) # fast path
-    except IndexError:
+        return _jsonescapeu8fast(u8chars, paranoid)
+    except ValueError:
         pass
-    # non-BMP char is represented as UTF-16 surrogate pair
-    u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
-    u16codes.pop(0) # drop BOM
-    return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)
+    return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
 
 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
 
 def getutf8char(s, pos):
     '''get the next full utf-8 character in the given string, starting at pos
 
     Raises a UnicodeError if the given location does not start a valid
     utf-8 character.
     '''
 
     # find how many bytes to attempt decoding from first nibble
     l = _utf8len[ord(s[pos]) >> 4]
     if not l: # ascii
         return s[pos]
 
     c = s[pos:pos + l]
     # validate with attempted decode
     c.decode("utf-8")
     return c
 
 def toutf8b(s):
     '''convert a local, possibly-binary string into UTF-8b
 
     This is intended as a generic method to preserve data when working
     with schemes like JSON and XML that have no provision for
     arbitrary byte strings. As Mercurial often doesn't know
     what encoding data is in, we use so-called UTF-8b.
 
     If a string is already valid UTF-8 (or ASCII), it passes unmodified.
     Otherwise, unsupported bytes are mapped to the UTF-16 surrogate range,
     uDC00-uDCFF.
 
     Principles of operation:
 
     - ASCII and UTF-8 data successfully round-trips and is understood
       by Unicode-oriented clients
     - filenames and file contents in arbitrary other encodings can
       be round-tripped or recovered by clueful clients
     - local strings that have a cached known UTF-8 encoding (aka
       localstr) get sent as UTF-8 so Unicode-oriented clients get the
       Unicode data they want
     - because we must preserve UTF-8 bytestring in places such as
       filenames, metadata can't be roundtripped without help
 
     (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
     arbitrary bytes into an internal Unicode format that can be
     re-encoded back into the original. Here we are exposing the
     internal surrogate encoding as a UTF-8 string.)
     '''
 
     if "\xed" not in s:
         if isinstance(s, localstr):
             return s._utf8
         try:
             s.decode('utf-8')
             return s
         except UnicodeDecodeError:
             pass
 
     r = ""
     pos = 0
     l = len(s)
     while pos < l:
         try:
             c = getutf8char(s, pos)
             if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
                 # have to re-escape existing U+DCxx characters
                 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
                 pos += 1
             else:
                 pos += len(c)
         except UnicodeDecodeError:
             c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
             pos += 1
         r += c
     return r
 
 def fromutf8b(s):
     '''Given a UTF-8b string, return a local, possibly-binary string.
 
     This returns the original binary string. It is a round-trip process
     for strings like filenames, but metadata that was passed through
     tolocal will remain in UTF-8.
 
     >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
     >>> m = "\\xc3\\xa9\\x99abcd"
     >>> toutf8b(m)
     '\\xc3\\xa9\\xed\\xb2\\x99abcd'
     >>> roundtrip(m)
     True
     >>> roundtrip("\\xc2\\xc2\\x80")
     True
     >>> roundtrip("\\xef\\xbf\\xbd")
     True
     >>> roundtrip("\\xef\\xef\\xbf\\xbd")
     True
     >>> roundtrip("\\xf1\\x80\\x80\\x80\\x80")
     True
     '''
 
     # fast path - look for uDxxx prefixes in s
     if "\xed" not in s:
         return s
 
     # We could do this with the unicode type but some Python builds
     # use UTF-16 internally (issue5031) which causes non-BMP code
     # points to be escaped. Instead, we use our handy getutf8char
     # helper again to walk the string without "decoding" it.
 
     r = ""
     pos = 0
     l = len(s)
     while pos < l:
         c = getutf8char(s, pos)
         pos += len(c)
         # unescape U+DCxx characters
         if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
             c = chr(ord(c.decode("utf-8")) & 0xff)
         r += c
     return r
 
 if pycompat.ispy3:
     class strio(io.TextIOWrapper):
         """Wrapper around TextIOWrapper that respects hg's encoding assumptions.
 
         Also works around Python closing streams.
         """
 
         def __init__(self, buffer):
             super(strio, self).__init__(buffer, encoding=_sysstr(encoding))
 
         def __del__(self):
             """Override __del__ so it doesn't close the underlying stream."""
 else:
     strio = pycompat.identity
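The core of the change is visible in `jsonescape()` above: the escape-table lookup now lives behind `_jsonescapeu8fast()`, and `ValueError` is the agreed signal for "take the slow path". Below is a minimal, self-contained sketch of that contract for reviewers who want to run it. It uses Python 3 `bytes`/`str` rather than Mercurial's Python 2 bytestrings and only builds the 128-entry ASCII table (the paranoid case), so it approximates the patched code rather than reproducing it.

```python
import array

# ASCII-only escape table, mirroring _paranoidjsonmap in the patch.
_jsonmap = ['\\u%04x' % x for x in range(32)]      # C0 control characters
_jsonmap.extend(chr(x) for x in range(32, 127))    # printable ASCII, as-is
_jsonmap.append('\\u007f')                         # DEL
for code, esc in ((0x08, '\\b'), (0x09, '\\t'), (0x0a, '\\n'), (0x0c, '\\f'),
                  (0x0d, '\\r'), (0x22, '\\"'), (0x5c, '\\\\')):
    _jsonmap[code] = esc

def jsonescapeu8fast(u8chars):
    """Fast path: one table lookup per byte; ValueError means 'cannot'."""
    try:
        return ''.join(_jsonmap[x] for x in bytearray(u8chars))
    except IndexError:  # a byte >= 0x80 has no table entry
        raise ValueError

def jsonescapeu8fallback(u8chars):
    """Slow path: escape non-ASCII as \\uXXXX, non-BMP as surrogate pairs."""
    u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
    u16codes.pop(0)  # drop the BOM that encode('utf-16') prepends
    return ''.join(_jsonmap[x] if x < 128 else '\\u%04x' % x
                   for x in u16codes)

def jsonescape(u8chars):
    """Dispatch the same way the patched encoding.jsonescape() does."""
    try:
        return jsonescapeu8fast(u8chars)
    except ValueError:
        return jsonescapeu8fallback(u8chars)

print(jsonescape(b'tab\there'))                # -> tab\there
print(jsonescape(u'caf\xe9'.encode('utf-8')))  # -> caf\u00e9
```

Signalling "needs the slow path" with an exception rather than a flag keeps the common case a single expression, and gives a future C implementation of the fast path a natural way to punt without computing a partial result.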
--- a/mercurial/pure/charencode.py
+++ b/mercurial/pure/charencode.py
@@ -1,22 +1,72 @@
 # charencode.py - miscellaneous character encoding
 #
 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
 #
 # This software may be used and distributed according to the terms of the
 # GNU General Public License version 2 or any later version.
 
 from __future__ import absolute_import
 
+import array
+
+from .. import (
+    pycompat,
+)
+
 def asciilower(s):
     '''convert a string to lowercase if ASCII
 
     Raises UnicodeDecodeError if non-ASCII characters are found.'''
     s.decode('ascii')
     return s.lower()
 
 def asciiupper(s):
     '''convert a string to uppercase if ASCII
 
     Raises UnicodeDecodeError if non-ASCII characters are found.'''
     s.decode('ascii')
     return s.upper()
+
+_jsonmap = []
+_jsonmap.extend("\\u%04x" % x for x in range(32))
+_jsonmap.extend(pycompat.bytechr(x) for x in range(32, 127))
+_jsonmap.append('\\u007f')
+_jsonmap[0x09] = '\\t'
+_jsonmap[0x0a] = '\\n'
+_jsonmap[0x22] = '\\"'
+_jsonmap[0x5c] = '\\\\'
+_jsonmap[0x08] = '\\b'
+_jsonmap[0x0c] = '\\f'
+_jsonmap[0x0d] = '\\r'
+_paranoidjsonmap = _jsonmap[:]
+_paranoidjsonmap[0x3c] = '\\u003c' # '<' (e.g. escape "</script>")
+_paranoidjsonmap[0x3e] = '\\u003e' # '>'
+_jsonmap.extend(pycompat.bytechr(x) for x in range(128, 256))
+
+def jsonescapeu8fast(u8chars, paranoid):
+    """Convert a UTF-8 byte string to JSON-escaped form (fast path)
+
+    Raises ValueError if non-ASCII characters have to be escaped.
+    """
+    if paranoid:
+        jm = _paranoidjsonmap
+    else:
+        jm = _jsonmap
+    try:
+        return ''.join(jm[x] for x in bytearray(u8chars))
+    except IndexError:
+        raise ValueError
+
+def jsonescapeu8fallback(u8chars, paranoid):
+    """Convert a UTF-8 byte string to JSON-escaped form (slow path)
+
+    Escapes all non-ASCII characters even if paranoid is False.
+    """
+    if paranoid:
+        jm = _paranoidjsonmap
+    else:
+        jm = _jsonmap
+    # non-BMP char is represented as UTF-16 surrogate pair
+    u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
+    u16codes.pop(0) # drop BOM
+    return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)
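Why land these helpers in `mercurial/pure/charencode.py`? Because `policy.importmod(r'charencode')` already resolves `charencode` to a compiled module when one is available, with the `pure` package as the fallback; extracting `jsonescapeu8fast()` here gives a future C implementation a slot to fill. A rough sketch of that selection pattern follows; it is illustrative only, not the actual `policy` implementation (the real `policy.importmod` also honours the `HGMODULEPOLICY` environment variable and checks module compatibility):

```python
import importlib

def importmod(modname):
    """Prefer a compiled implementation of modname, else the pure one.

    Simplified stand-in for mercurial.policy.importmod.
    """
    try:
        # e.g. mercurial.cext.charencode, built from C sources
        return importlib.import_module('mercurial.cext.' + modname)
    except ImportError:
        # e.g. mercurial.pure.charencode, the module shown above
        return importlib.import_module('mercurial.pure.' + modname)

charencode = importmod('charencode')
```

Until the `# TODO: no "pure"` in encoding.py is resolved, `_jsonescapeu8fast` is still taken directly from the pure module, so this changeset reshuffles structure without changing behaviour; `jsonescapeu8fallback()` is presumably meant to stay the pure-Python slow path, since it only runs on escape-heavy inputs.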