##// END OF EJS Templates
py3: use 'surrogatepass' error handler to process U+DCxx transparently...
Yuya Nishihara -
r34215:aa877860 default
parent child Browse files
Show More
@@ -1,590 +1,597 b''
1 # encoding.py - character transcoding support for Mercurial
1 # encoding.py - character transcoding support for Mercurial
2 #
2 #
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8 from __future__ import absolute_import, print_function
8 from __future__ import absolute_import, print_function
9
9
10 import io
10 import io
11 import locale
11 import locale
12 import os
12 import os
13 import unicodedata
13 import unicodedata
14
14
15 from . import (
15 from . import (
16 error,
16 error,
17 policy,
17 policy,
18 pycompat,
18 pycompat,
19 )
19 )
20
20
21 from .pure import (
21 from .pure import (
22 charencode as charencodepure,
22 charencode as charencodepure,
23 )
23 )
24
24
25 charencode = policy.importmod(r'charencode')
25 charencode = policy.importmod(r'charencode')
26
26
27 isasciistr = charencode.isasciistr
27 isasciistr = charencode.isasciistr
28 asciilower = charencode.asciilower
28 asciilower = charencode.asciilower
29 asciiupper = charencode.asciiupper
29 asciiupper = charencode.asciiupper
30 _jsonescapeu8fast = charencode.jsonescapeu8fast
30 _jsonescapeu8fast = charencode.jsonescapeu8fast
31
31
32 _sysstr = pycompat.sysstr
32 _sysstr = pycompat.sysstr
33
33
if pycompat.ispy3:
    # py3 has no unichr(); chr() covers the full codepoint range there
    unichr = chr

# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity. They are zero-width/formatting codepoints (joiners, directional
# marks) plus U+FEFF (BOM), stored here as their UTF-8 byte sequences.
_ignore = [unichr(int(x, 16)).encode("utf-8") for x in
           "200c 200d 200e 200f 202a 202b 202c 202d 202e "
           "206a 206b 206c 206d 206e 206f feff".split()]
# verify the next function will work: hfsignoreclean()'s fast-path substring
# test only checks for these two lead bytes
assert all(i.startswith(("\xe2", "\xef")) for i in _ignore)
45
45
def hfsignoreclean(s):
    """Remove codepoints ignored by HFS+ from s.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # every ignored codepoint's UTF-8 form begins with 0xe2 or 0xef (see the
    # assert above), so a cheap substring probe lets us skip the replace loop
    if not ("\xe2" in s or "\xef" in s):
        return s
    for seq in _ignore:
        s = s.replace(seq, '')
    return s
58
58
# encoding.environ is provided read-only, which may not be used to modify
# the process environment
_nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ)
if not pycompat.ispy3:
    environ = os.environ  # re-exports
elif _nativeenviron:
    environ = os.environb  # re-exports
else:
    # preferred encoding isn't known yet; use utf-8 to avoid unicode error
    # and recreate it once encoding is settled
    environ = dict((k.encode(u'utf-8'), v.encode(u'utf-8'))
                   for k, v in os.environ.items())  # re-exports

# map locale names that are not usable codec names to a real codec
_encodingfixers = {
    '646': lambda: 'ascii',
    'ANSI_X3.4-1968': lambda: 'ascii',
}

try:
    # HGENCODING overrides the locale's preferred encoding
    encoding = environ.get("HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding().encode('ascii') or 'ascii'
        encoding = _encodingfixers.get(encoding, lambda: encoding)()
except locale.Error:
    encoding = 'ascii'
# how undecodable bytes are handled: 'strict', 'replace' or 'ignore'
encodingmode = environ.get("HGENCODINGMODE", "strict")
fallbackencoding = 'ISO-8859-1'
86
86
class localstr(bytes):
    '''This class allows strings that are unmodified to be
    round-tripped to the local encoding and back'''
    def __new__(cls, u, l):
        # u: the known UTF-8 encoding; l: the (possibly lossy) local
        # encoding. The byte content of the instance is the local form.
        s = bytes.__new__(cls, l)
        s._utf8 = u
        return s
    def __hash__(self):
        return hash(self._utf8) # avoid collisions in local string space
96
96
def tolocal(s):
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    >>> u = b'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = b'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> b'foo: ?' in d
    False
    >>> l1 = b'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    # pure-ASCII input is identical in UTF-8 and any local encoding
    if isasciistr(s):
        return s

    try:
        try:
            # make sure string is actually stored in UTF-8
            u = s.decode('UTF-8')
            if encoding == 'UTF-8':
                # fast path
                return s
            r = u.encode(_sysstr(encoding), u"replace")
            if u == r.decode(_sysstr(encoding)):
                # r is a safe, non-lossy encoding of s
                return r
            # lossy: remember the original UTF-8 form for round-tripping
            return localstr(s, r)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                u = s.decode(_sysstr(fallbackencoding))
                r = u.encode(_sysstr(encoding), u"replace")
                if u == r.decode(_sysstr(encoding)):
                    # r is a safe, non-lossy encoding of s
                    return r
                return localstr(u.encode('UTF-8'), r)
            except UnicodeDecodeError:
                u = s.decode("utf-8", "replace") # last ditch
                # can't round-trip
                return u.encode(_sysstr(encoding), u"replace")
    except LookupError as k:
        # unknown codec name in `encoding`/`fallbackencoding`
        raise error.Abort(k, hint="please check your locale settings")
161
161
def fromlocal(s):
    """
    Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.
    """

    # can we do a lossless round-trip?
    if isinstance(s, localstr):
        return s._utf8
    if isasciistr(s):
        return s

    try:
        u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
        return u.encode("utf-8")
    except UnicodeDecodeError as inst:
        # show ~10 bytes of context on either side of the bad byte
        sub = s[max(0, inst.start - 10):inst.start + 10]
        raise error.Abort("decoding near '%s': %s!" % (sub, inst))
    except LookupError as k:
        # unknown codec name in `encoding`
        raise error.Abort(k, hint="please check your locale settings")
187
187
def unitolocal(u):
    """Convert a unicode string to a byte string of local encoding"""
    utf8bytes = u.encode('utf-8')
    return tolocal(utf8bytes)
191
191
def unifromlocal(s):
    """Convert a byte string of local encoding to a unicode string"""
    utf8bytes = fromlocal(s)
    return utf8bytes.decode('utf-8')
195
195
def unimethod(bytesfunc):
    """Create a proxy method that forwards __unicode__() and __str__() of
    Python 3 to __bytes__()"""
    # bytesfunc is the __bytes__ implementation; the returned closure
    # converts its byte-string result to a native (unicode) str
    def unifunc(obj):
        return unifromlocal(bytesfunc(obj))
    return unifunc
202
202
# converter functions between native str and byte string. use these if the
# character encoding is not aware (e.g. exception message) or is known to
# be locale dependent (e.g. date formatting.)
if pycompat.ispy3:
    strtolocal = unitolocal
    strfromlocal = unifromlocal
    strmethod = unimethod
else:
    # on py2 native str is already the local byte string
    strtolocal = pycompat.identity
    strfromlocal = pycompat.identity
    strmethod = pycompat.identity

if not _nativeenviron:
    # now encoding and helper functions are available, recreate the environ
    # dict to be exported to other modules
    environ = dict((tolocal(k.encode(u'utf-8')), tolocal(v.encode(u'utf-8')))
                   for k, v in os.environ.items())  # re-exports

# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
# The letters are east_asian_width() categories counted as two columns:
# W(ide), F(ullwidth), and optionally A(mbiguous).
_wide = _sysstr(environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
                and "WFA" or "WF")
224
224
def colwidth(s):
    "Find the column width of a string for display in the local encoding"
    decoded = s.decode(_sysstr(encoding), u'replace')
    return ucolwidth(decoded)
228
228
def ucolwidth(d):
    "Find the column width of a Unicode string for display"
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        # no east-asian-width data available; count one column per character
        return len(d)
    return sum(2 if eaw(c) in _wide else 1 for c in d)
235
235
def getcols(s, start, c):
    '''Use colwidth to find a c-column substring of s starting at byte
    index start'''
    # implicitly returns None if no prefix occupies exactly c columns
    for x in xrange(start + c, len(s)):
        t = s[start:x]
        if colwidth(t) == c:
            return t
243
243
def trim(s, width, ellipsis='', leftside=False):
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    >>> from .node import bin
    >>> def bprint(s):
    ...     print(pycompat.sysstr(s))
    >>> ellipsis = b'+++'
    >>> from . import encoding
    >>> encoding.encoding = b'utf-8'
    >>> t = b'1234567890'
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    12345+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++67890
    >>> bprint(trim(t, 8))
    12345678
    >>> bprint(trim(t, 8, leftside=True))
    34567890
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(pycompat.sysstr(encoding.encoding))
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 5))
    \xe3\x81\x82\xe3\x81\x84
    >>> bprint(trim(t, 5, leftside=True))
    \xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 4, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
    +++
    >>> t = bin(b'112233445566778899aa') # invalid byte sequence
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8))
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> bprint(trim(t, 8, leftside=True))
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    """
    try:
        u = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        # undecodable input: fall back to trimming by raw byte count
        if len(s) <= width: # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0: # no enough room even for ellipsis
            return ellipsis[:width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width: # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0: # no enough room even for ellipsis
        return ellipsis[:width + len(ellipsis)]

    # slicer/joiner pair for the side being trimmed
    if leftside:
        uslice = lambda i: u[i:]
        concat = lambda s: ellipsis + s
    else:
        uslice = lambda i: u[:-i]
        concat = lambda s: s + ellipsis
    # drop one character at a time until the result fits
    for i in xrange(1, len(u)):
        usub = uslice(i)
        if ucolwidth(usub) <= width:
            return concat(usub.encode(_sysstr(encoding)))
    return ellipsis # no enough room for multi-column characters
339
339
def lower(s):
    "best-effort encoding-aware case-folding of local string s"
    try:
        # fast path for pure-ASCII strings
        return asciilower(s)
    except UnicodeDecodeError:
        pass
    try:
        if isinstance(s, localstr):
            # use the cached lossless UTF-8 form
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        lu = u.lower()
        if u == lu:
            return s # preserve localstring
        return lu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.lower() # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
360
360
def upper(s):
    "best-effort encoding-aware case-folding of local string s"
    try:
        # fast path for pure-ASCII strings
        return asciiupper(s)
    except UnicodeDecodeError:
        return upperfallback(s)
367
367
def upperfallback(s):
    # encoding-aware uppercasing for non-ASCII strings; mirrors lower()
    try:
        if isinstance(s, localstr):
            # use the cached lossless UTF-8 form
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        uu = u.upper()
        if u == uu:
            return s # preserve localstring
        return uu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.upper() # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
383
383
class normcasespecs(object):
    '''what a platform's normcase does to ASCII strings

    This is specified per platform, and should be consistent with what normcase
    on that platform actually does.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h.'''
    # integer values mirror the C-side constants (see util.h)
    lower = -1
    upper = 1
    other = 0
398
398
def jsonescape(s, paranoid=False):
    '''returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape(b'this is a test')
    'this is a test'
    >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
    'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
    >>> jsonescape(b'a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape(b'')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> s = b'escape characters: \\0 \\x0b \\x7f'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape(b'<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    '''

    u8chars = toutf8b(s)
    try:
        return _jsonescapeu8fast(u8chars, paranoid)
    except ValueError:
        pass
    # the fast escaper can refuse some inputs (ValueError); fall back to
    # the pure-Python implementation
    return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
450
450
# We need to decode/encode U+DCxx codes transparently since invalid UTF-8
# bytes are mapped to that range ('surrogatepass' lets lone surrogates
# round-trip through utf-8 on py3; py2 codecs pass them already).
if pycompat.ispy3:
    _utf8strict = r'surrogatepass'
else:
    _utf8strict = r'strict'

# utf-8 sequence length indexed by the high nibble of the lead byte:
# 0x0-0x7 -> ascii (0), 0xc-0xd -> 2 bytes, 0xe -> 3 bytes, 0xf -> 4 bytes.
# 0x8-0xb are continuation bytes; they map to 1 here and are rejected by
# the validating decode in getutf8char().
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
452
459
def getutf8char(s, pos):
    '''get the next full utf-8 character in the given string, starting at pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    '''
    lead = s[pos:pos + 1]
    # sequence length is determined by the lead byte's high nibble
    nbytes = _utf8len[ord(lead) >> 4]
    if not nbytes: # ascii
        return lead

    c = s[pos:pos + nbytes]
    # validate with attempted decode
    c.decode("utf-8", _utf8strict)
    return c
469
476
470 def toutf8b(s):
477 def toutf8b(s):
471 '''convert a local, possibly-binary string into UTF-8b
478 '''convert a local, possibly-binary string into UTF-8b
472
479
473 This is intended as a generic method to preserve data when working
480 This is intended as a generic method to preserve data when working
474 with schemes like JSON and XML that have no provision for
481 with schemes like JSON and XML that have no provision for
475 arbitrary byte strings. As Mercurial often doesn't know
482 arbitrary byte strings. As Mercurial often doesn't know
476 what encoding data is in, we use so-called UTF-8b.
483 what encoding data is in, we use so-called UTF-8b.
477
484
478 If a string is already valid UTF-8 (or ASCII), it passes unmodified.
485 If a string is already valid UTF-8 (or ASCII), it passes unmodified.
479 Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
486 Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
480 uDC00-uDCFF.
487 uDC00-uDCFF.
481
488
482 Principles of operation:
489 Principles of operation:
483
490
484 - ASCII and UTF-8 data successfully round-trips and is understood
491 - ASCII and UTF-8 data successfully round-trips and is understood
485 by Unicode-oriented clients
492 by Unicode-oriented clients
486 - filenames and file contents in arbitrary other encodings can have
493 - filenames and file contents in arbitrary other encodings can have
487 be round-tripped or recovered by clueful clients
494 be round-tripped or recovered by clueful clients
488 - local strings that have a cached known UTF-8 encoding (aka
495 - local strings that have a cached known UTF-8 encoding (aka
489 localstr) get sent as UTF-8 so Unicode-oriented clients get the
496 localstr) get sent as UTF-8 so Unicode-oriented clients get the
490 Unicode data they want
497 Unicode data they want
491 - because we must preserve UTF-8 bytestring in places such as
498 - because we must preserve UTF-8 bytestring in places such as
492 filenames, metadata can't be roundtripped without help
499 filenames, metadata can't be roundtripped without help
493
500
494 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
501 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
495 arbitrary bytes into an internal Unicode format that can be
502 arbitrary bytes into an internal Unicode format that can be
496 re-encoded back into the original. Here we are exposing the
503 re-encoded back into the original. Here we are exposing the
497 internal surrogate encoding as a UTF-8 string.)
504 internal surrogate encoding as a UTF-8 string.)
498 '''
505 '''
499
506
500 if not isinstance(s, localstr) and isasciistr(s):
507 if not isinstance(s, localstr) and isasciistr(s):
501 return s
508 return s
502 if "\xed" not in s:
509 if "\xed" not in s:
503 if isinstance(s, localstr):
510 if isinstance(s, localstr):
504 return s._utf8
511 return s._utf8
505 try:
512 try:
506 s.decode('utf-8')
513 s.decode('utf-8', _utf8strict)
507 return s
514 return s
508 except UnicodeDecodeError:
515 except UnicodeDecodeError:
509 pass
516 pass
510
517
511 s = pycompat.bytestr(s)
518 s = pycompat.bytestr(s)
512 r = ""
519 r = ""
513 pos = 0
520 pos = 0
514 l = len(s)
521 l = len(s)
515 while pos < l:
522 while pos < l:
516 try:
523 try:
517 c = getutf8char(s, pos)
524 c = getutf8char(s, pos)
518 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
525 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
519 # have to re-escape existing U+DCxx characters
526 # have to re-escape existing U+DCxx characters
520 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
527 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict)
521 pos += 1
528 pos += 1
522 else:
529 else:
523 pos += len(c)
530 pos += len(c)
524 except UnicodeDecodeError:
531 except UnicodeDecodeError:
525 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
532 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict)
526 pos += 1
533 pos += 1
527 r += c
534 r += c
528 return r
535 return r
529
536
530 def fromutf8b(s):
537 def fromutf8b(s):
531 '''Given a UTF-8b string, return a local, possibly-binary string.
538 '''Given a UTF-8b string, return a local, possibly-binary string.
532
539
533 return the original binary string. This
540 return the original binary string. This
534 is a round-trip process for strings like filenames, but metadata
541 is a round-trip process for strings like filenames, but metadata
535 that's was passed through tolocal will remain in UTF-8.
542 that's was passed through tolocal will remain in UTF-8.
536
543
537 >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
544 >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
538 >>> m = b"\\xc3\\xa9\\x99abcd"
545 >>> m = b"\\xc3\\xa9\\x99abcd"
539 >>> toutf8b(m)
546 >>> toutf8b(m)
540 '\\xc3\\xa9\\xed\\xb2\\x99abcd'
547 '\\xc3\\xa9\\xed\\xb2\\x99abcd'
541 >>> roundtrip(m)
548 >>> roundtrip(m)
542 True
549 True
543 >>> roundtrip(b"\\xc2\\xc2\\x80")
550 >>> roundtrip(b"\\xc2\\xc2\\x80")
544 True
551 True
545 >>> roundtrip(b"\\xef\\xbf\\xbd")
552 >>> roundtrip(b"\\xef\\xbf\\xbd")
546 True
553 True
547 >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
554 >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
548 True
555 True
549 >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
556 >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
550 True
557 True
551 '''
558 '''
552
559
553 if isasciistr(s):
560 if isasciistr(s):
554 return s
561 return s
555 # fast path - look for uDxxx prefixes in s
562 # fast path - look for uDxxx prefixes in s
556 if "\xed" not in s:
563 if "\xed" not in s:
557 return s
564 return s
558
565
559 # We could do this with the unicode type but some Python builds
566 # We could do this with the unicode type but some Python builds
560 # use UTF-16 internally (issue5031) which causes non-BMP code
567 # use UTF-16 internally (issue5031) which causes non-BMP code
561 # points to be escaped. Instead, we use our handy getutf8char
568 # points to be escaped. Instead, we use our handy getutf8char
562 # helper again to walk the string without "decoding" it.
569 # helper again to walk the string without "decoding" it.
563
570
564 s = pycompat.bytestr(s)
571 s = pycompat.bytestr(s)
565 r = ""
572 r = ""
566 pos = 0
573 pos = 0
567 l = len(s)
574 l = len(s)
568 while pos < l:
575 while pos < l:
569 c = getutf8char(s, pos)
576 c = getutf8char(s, pos)
570 pos += len(c)
577 pos += len(c)
571 # unescape U+DCxx characters
578 # unescape U+DCxx characters
572 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
579 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
573 c = chr(ord(c.decode("utf-8")) & 0xff)
580 c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xff)
574 r += c
581 r += c
575 return r
582 return r
576
583
577 if pycompat.ispy3:
584 if pycompat.ispy3:
578 class strio(io.TextIOWrapper):
585 class strio(io.TextIOWrapper):
579 """Wrapper around TextIOWrapper that respects hg's encoding assumptions.
586 """Wrapper around TextIOWrapper that respects hg's encoding assumptions.
580
587
581 Also works around Python closing streams.
588 Also works around Python closing streams.
582 """
589 """
583
590
584 def __init__(self, buffer):
591 def __init__(self, buffer):
585 super(strio, self).__init__(buffer, encoding=_sysstr(encoding))
592 super(strio, self).__init__(buffer, encoding=_sysstr(encoding))
586
593
587 def __del__(self):
594 def __del__(self):
588 """Override __del__ so it doesn't close the underlying stream."""
595 """Override __del__ so it doesn't close the underlying stream."""
589 else:
596 else:
590 strio = pycompat.identity
597 strio = pycompat.identity
@@ -1,79 +1,85 b''
1 # charencode.py - miscellaneous character encoding
1 # charencode.py - miscellaneous character encoding
2 #
2 #
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8 from __future__ import absolute_import
8 from __future__ import absolute_import
9
9
10 import array
10 import array
11
11
12 from .. import (
12 from .. import (
13 pycompat,
13 pycompat,
14 )
14 )
15
15
16 def isasciistr(s):
16 def isasciistr(s):
17 try:
17 try:
18 s.decode('ascii')
18 s.decode('ascii')
19 return True
19 return True
20 except UnicodeDecodeError:
20 except UnicodeDecodeError:
21 return False
21 return False
22
22
23 def asciilower(s):
23 def asciilower(s):
24 '''convert a string to lowercase if ASCII
24 '''convert a string to lowercase if ASCII
25
25
26 Raises UnicodeDecodeError if non-ASCII characters are found.'''
26 Raises UnicodeDecodeError if non-ASCII characters are found.'''
27 s.decode('ascii')
27 s.decode('ascii')
28 return s.lower()
28 return s.lower()
29
29
30 def asciiupper(s):
30 def asciiupper(s):
31 '''convert a string to uppercase if ASCII
31 '''convert a string to uppercase if ASCII
32
32
33 Raises UnicodeDecodeError if non-ASCII characters are found.'''
33 Raises UnicodeDecodeError if non-ASCII characters are found.'''
34 s.decode('ascii')
34 s.decode('ascii')
35 return s.upper()
35 return s.upper()
36
36
37 _jsonmap = []
37 _jsonmap = []
38 _jsonmap.extend("\\u%04x" % x for x in range(32))
38 _jsonmap.extend("\\u%04x" % x for x in range(32))
39 _jsonmap.extend(pycompat.bytechr(x) for x in range(32, 127))
39 _jsonmap.extend(pycompat.bytechr(x) for x in range(32, 127))
40 _jsonmap.append('\\u007f')
40 _jsonmap.append('\\u007f')
41 _jsonmap[0x09] = '\\t'
41 _jsonmap[0x09] = '\\t'
42 _jsonmap[0x0a] = '\\n'
42 _jsonmap[0x0a] = '\\n'
43 _jsonmap[0x22] = '\\"'
43 _jsonmap[0x22] = '\\"'
44 _jsonmap[0x5c] = '\\\\'
44 _jsonmap[0x5c] = '\\\\'
45 _jsonmap[0x08] = '\\b'
45 _jsonmap[0x08] = '\\b'
46 _jsonmap[0x0c] = '\\f'
46 _jsonmap[0x0c] = '\\f'
47 _jsonmap[0x0d] = '\\r'
47 _jsonmap[0x0d] = '\\r'
48 _paranoidjsonmap = _jsonmap[:]
48 _paranoidjsonmap = _jsonmap[:]
49 _paranoidjsonmap[0x3c] = '\\u003c' # '<' (e.g. escape "</script>")
49 _paranoidjsonmap[0x3c] = '\\u003c' # '<' (e.g. escape "</script>")
50 _paranoidjsonmap[0x3e] = '\\u003e' # '>'
50 _paranoidjsonmap[0x3e] = '\\u003e' # '>'
51 _jsonmap.extend(pycompat.bytechr(x) for x in range(128, 256))
51 _jsonmap.extend(pycompat.bytechr(x) for x in range(128, 256))
52
52
53 def jsonescapeu8fast(u8chars, paranoid):
53 def jsonescapeu8fast(u8chars, paranoid):
54 """Convert a UTF-8 byte string to JSON-escaped form (fast path)
54 """Convert a UTF-8 byte string to JSON-escaped form (fast path)
55
55
56 Raises ValueError if non-ASCII characters have to be escaped.
56 Raises ValueError if non-ASCII characters have to be escaped.
57 """
57 """
58 if paranoid:
58 if paranoid:
59 jm = _paranoidjsonmap
59 jm = _paranoidjsonmap
60 else:
60 else:
61 jm = _jsonmap
61 jm = _jsonmap
62 try:
62 try:
63 return ''.join(jm[x] for x in bytearray(u8chars))
63 return ''.join(jm[x] for x in bytearray(u8chars))
64 except IndexError:
64 except IndexError:
65 raise ValueError
65 raise ValueError
66
66
67 if pycompat.ispy3:
68 _utf8strict = r'surrogatepass'
69 else:
70 _utf8strict = r'strict'
71
67 def jsonescapeu8fallback(u8chars, paranoid):
72 def jsonescapeu8fallback(u8chars, paranoid):
68 """Convert a UTF-8 byte string to JSON-escaped form (slow path)
73 """Convert a UTF-8 byte string to JSON-escaped form (slow path)
69
74
70 Escapes all non-ASCII characters no matter if paranoid is False.
75 Escapes all non-ASCII characters no matter if paranoid is False.
71 """
76 """
72 if paranoid:
77 if paranoid:
73 jm = _paranoidjsonmap
78 jm = _paranoidjsonmap
74 else:
79 else:
75 jm = _jsonmap
80 jm = _jsonmap
76 # non-BMP char is represented as UTF-16 surrogate pair
81 # non-BMP char is represented as UTF-16 surrogate pair
77 u16codes = array.array(r'H', u8chars.decode('utf-8').encode('utf-16'))
82 u16b = u8chars.decode('utf-8', _utf8strict).encode('utf-16', _utf8strict)
83 u16codes = array.array(r'H', u16b)
78 u16codes.pop(0) # drop BOM
84 u16codes.pop(0) # drop BOM
79 return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)
85 return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)
@@ -1,81 +1,81 b''
1 # this is hack to make sure no escape characters are inserted into the output
1 # this is hack to make sure no escape characters are inserted into the output
2
2
3 from __future__ import absolute_import
3 from __future__ import absolute_import
4
4
5 import doctest
5 import doctest
6 import os
6 import os
7 import re
7 import re
8 import sys
8 import sys
9
9
10 ispy3 = (sys.version_info[0] >= 3)
10 ispy3 = (sys.version_info[0] >= 3)
11
11
12 if 'TERM' in os.environ:
12 if 'TERM' in os.environ:
13 del os.environ['TERM']
13 del os.environ['TERM']
14
14
15 class py3docchecker(doctest.OutputChecker):
15 class py3docchecker(doctest.OutputChecker):
16 def check_output(self, want, got, optionflags):
16 def check_output(self, want, got, optionflags):
17 want2 = re.sub(r'''\bu(['"])(.*?)\1''', r'\1\2\1', want) # py2: u''
17 want2 = re.sub(r'''\bu(['"])(.*?)\1''', r'\1\2\1', want) # py2: u''
18 got2 = re.sub(r'''\bb(['"])(.*?)\1''', r'\1\2\1', got) # py3: b''
18 got2 = re.sub(r'''\bb(['"])(.*?)\1''', r'\1\2\1', got) # py3: b''
19 # py3: <exc.name>: b'<msg>' -> <name>: <msg>
19 # py3: <exc.name>: b'<msg>' -> <name>: <msg>
20 # <exc.name>: <others> -> <name>: <others>
20 # <exc.name>: <others> -> <name>: <others>
21 got2 = re.sub(r'''^mercurial\.\w+\.(\w+): (['"])(.*?)\2''', r'\1: \3',
21 got2 = re.sub(r'''^mercurial\.\w+\.(\w+): (['"])(.*?)\2''', r'\1: \3',
22 got2, re.MULTILINE)
22 got2, re.MULTILINE)
23 got2 = re.sub(r'^mercurial\.\w+\.(\w+): ', r'\1: ', got2, re.MULTILINE)
23 got2 = re.sub(r'^mercurial\.\w+\.(\w+): ', r'\1: ', got2, re.MULTILINE)
24 return any(doctest.OutputChecker.check_output(self, w, g, optionflags)
24 return any(doctest.OutputChecker.check_output(self, w, g, optionflags)
25 for w, g in [(want, got), (want2, got2)])
25 for w, g in [(want, got), (want2, got2)])
26
26
27 # TODO: migrate doctests to py3 and enable them on both versions
27 # TODO: migrate doctests to py3 and enable them on both versions
28 def testmod(name, optionflags=0, testtarget=None, py2=True, py3=True):
28 def testmod(name, optionflags=0, testtarget=None, py2=True, py3=True):
29 if not (not ispy3 and py2 or ispy3 and py3):
29 if not (not ispy3 and py2 or ispy3 and py3):
30 return
30 return
31 __import__(name)
31 __import__(name)
32 mod = sys.modules[name]
32 mod = sys.modules[name]
33 if testtarget is not None:
33 if testtarget is not None:
34 mod = getattr(mod, testtarget)
34 mod = getattr(mod, testtarget)
35
35
36 # minimal copy of doctest.testmod()
36 # minimal copy of doctest.testmod()
37 finder = doctest.DocTestFinder()
37 finder = doctest.DocTestFinder()
38 checker = None
38 checker = None
39 if ispy3:
39 if ispy3:
40 checker = py3docchecker()
40 checker = py3docchecker()
41 runner = doctest.DocTestRunner(checker=checker, optionflags=optionflags)
41 runner = doctest.DocTestRunner(checker=checker, optionflags=optionflags)
42 for test in finder.find(mod, name):
42 for test in finder.find(mod, name):
43 runner.run(test)
43 runner.run(test)
44 runner.summarize()
44 runner.summarize()
45
45
46 testmod('mercurial.changegroup')
46 testmod('mercurial.changegroup')
47 testmod('mercurial.changelog')
47 testmod('mercurial.changelog')
48 testmod('mercurial.color')
48 testmod('mercurial.color')
49 testmod('mercurial.config')
49 testmod('mercurial.config')
50 testmod('mercurial.context')
50 testmod('mercurial.context')
51 testmod('mercurial.dagparser', optionflags=doctest.NORMALIZE_WHITESPACE)
51 testmod('mercurial.dagparser', optionflags=doctest.NORMALIZE_WHITESPACE)
52 testmod('mercurial.dispatch')
52 testmod('mercurial.dispatch')
53 testmod('mercurial.encoding', py3=False) # py3: multiple encoding issues
53 testmod('mercurial.encoding')
54 testmod('mercurial.formatter', py3=False) # py3: write bytes to stdout
54 testmod('mercurial.formatter', py3=False) # py3: write bytes to stdout
55 testmod('mercurial.hg')
55 testmod('mercurial.hg')
56 testmod('mercurial.hgweb.hgwebdir_mod', py3=False) # py3: repr(bytes) ?
56 testmod('mercurial.hgweb.hgwebdir_mod', py3=False) # py3: repr(bytes) ?
57 testmod('mercurial.match')
57 testmod('mercurial.match')
58 testmod('mercurial.mdiff')
58 testmod('mercurial.mdiff')
59 testmod('mercurial.minirst')
59 testmod('mercurial.minirst')
60 testmod('mercurial.patch', py3=False) # py3: bytes[n], etc. ?
60 testmod('mercurial.patch', py3=False) # py3: bytes[n], etc. ?
61 testmod('mercurial.pathutil', py3=False) # py3: os.sep
61 testmod('mercurial.pathutil', py3=False) # py3: os.sep
62 testmod('mercurial.parser')
62 testmod('mercurial.parser')
63 testmod('mercurial.pycompat')
63 testmod('mercurial.pycompat')
64 testmod('mercurial.revsetlang')
64 testmod('mercurial.revsetlang')
65 testmod('mercurial.smartset')
65 testmod('mercurial.smartset')
66 testmod('mercurial.store')
66 testmod('mercurial.store')
67 testmod('mercurial.subrepo')
67 testmod('mercurial.subrepo')
68 testmod('mercurial.templatefilters')
68 testmod('mercurial.templatefilters')
69 testmod('mercurial.templater')
69 testmod('mercurial.templater')
70 testmod('mercurial.ui')
70 testmod('mercurial.ui')
71 testmod('mercurial.url')
71 testmod('mercurial.url')
72 testmod('mercurial.util', py3=False) # py3: multiple bytes/unicode issues
72 testmod('mercurial.util', py3=False) # py3: multiple bytes/unicode issues
73 testmod('mercurial.util', testtarget='platform')
73 testmod('mercurial.util', testtarget='platform')
74 testmod('hgext.convert.convcmd', py3=False) # py3: use of str() ?
74 testmod('hgext.convert.convcmd', py3=False) # py3: use of str() ?
75 testmod('hgext.convert.cvsps')
75 testmod('hgext.convert.cvsps')
76 testmod('hgext.convert.filemap')
76 testmod('hgext.convert.filemap')
77 testmod('hgext.convert.p4')
77 testmod('hgext.convert.p4')
78 testmod('hgext.convert.subversion')
78 testmod('hgext.convert.subversion')
79 testmod('hgext.mq')
79 testmod('hgext.mq')
80 # Helper scripts in tests/ that have doctests:
80 # Helper scripts in tests/ that have doctests:
81 testmod('drawdag')
81 testmod('drawdag')
General Comments 0
You need to be logged in to leave comments. Login now