##// END OF EJS Templates
encoding: add comment-based type hints for pytype...
Augie Fackler -
r44187:2ade00f3 default
parent child Browse files
Show More
@@ -1,660 +1,695
1 # encoding.py - character transcoding support for Mercurial
1 # encoding.py - character transcoding support for Mercurial
2 #
2 #
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8 from __future__ import absolute_import, print_function
8 from __future__ import absolute_import, print_function
9
9
10 import locale
10 import locale
11 import os
11 import os
12 import unicodedata
12 import unicodedata
13
13
14 from .pycompat import getattr
14 from .pycompat import getattr
15 from . import (
15 from . import (
16 error,
16 error,
17 policy,
17 policy,
18 pycompat,
18 pycompat,
19 )
19 )
20
20
21 from .pure import charencode as charencodepure
21 from .pure import charencode as charencodepure
22
22
# Import typing names for the comment-based annotations below.  At module
# scope globals() is never empty (it always holds __name__ etc.), so this
# block never executes at runtime; pytype still analyzes it, which keeps
# the typing import invisible to ordinary users and to Python 2.
if not globals():  # hide this from non-pytype users
    from typing import (
        Any,
        Callable,
        List,
        Text,
        Type,
        TypeVar,
        Union,
    )

    # keep pyflakes happy
    for t in (Any, Callable, List, Text, Type, Union):
        assert t

    # NOTE(review): 'localstr' is defined further down in this file; the
    # forward reference only works because this block never runs at
    # runtime and pytype resolves it over the whole module.
    _Tlocalstr = TypeVar('_Tlocalstr', bound=localstr)
39
# Select the C or pure-Python charencode implementation per policy.
charencode = policy.importmod(r'charencode')

isasciistr = charencode.isasciistr
asciilower = charencode.asciilower
asciiupper = charencode.asciiupper
_jsonescapeu8fast = charencode.jsonescapeu8fast

_sysstr = pycompat.sysstr

if pycompat.ispy3:
    # unichr does not exist on Python 3; chr is the equivalent there.
    unichr = chr

# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
_ignore = [
    unichr(int(x, 16)).encode("utf-8")
    for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e "
    b"206a 206b 206c 206d 206e 206f feff".split()
]
# verify the next function will work
assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)
45
62
46
63
def hfsignoreclean(s):
    # type: (bytes) -> bytes
    """Remove codepoints ignored by HFS+ from s.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # All ignored sequences start with byte 0xe2 or 0xef; when neither
    # occurs we can skip the replacement loop entirely.
    if b"\xe2" not in s and b"\xef" not in s:
        return s
    for ignored in _ignore:
        s = s.replace(ignored, b'')
    return s
59
77
60
78
# encoding.environ is provided read-only, which may not be used to modify
# the process environment
_nativeenviron = not pycompat.ispy3 or os.supports_bytes_environ
if not pycompat.ispy3:
    environ = os.environ  # re-exports
elif _nativeenviron:
    environ = os.environb  # re-exports
else:
    # preferred encoding isn't known yet; use utf-8 to avoid unicode error
    # and recreate it once encoding is settled
    environ = dict(
        (k.encode(r'utf-8'), v.encode(r'utf-8'))
        for k, v in os.environ.items()  # re-exports
    )

# Map locale names that Python reports but cannot look up as codecs to
# their canonical equivalents.
_encodingrewrites = {
    b'646': b'ascii',
    b'ANSI_X3.4-1968': b'ascii',
}
# cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
# No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
# https://bugs.python.org/issue13216
if pycompat.iswindows and not pycompat.ispy3:
    _encodingrewrites[b'cp65001'] = b'utf-8'

# Determine the working character encoding: HGENCODING wins, then the
# locale's preferred encoding, with ASCII as the final fallback.
try:
    encoding = environ.get(b"HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding().encode('ascii') or b'ascii'
    encoding = _encodingrewrites.get(encoding, encoding)
except locale.Error:
    encoding = b'ascii'
# Error-handler name used when decoding local strings ('strict' aborts on
# unknown bytes; 'replace'/'ignore' degrade gracefully).
encodingmode = environ.get(b"HGENCODINGMODE", b"strict")
# Encoding tried for repository data that predates locale support.
fallbackencoding = b'ISO-8859-1'
95
113
96
114
class localstr(bytes):
    '''This class allows strings that are unmodified to be
    round-tripped to the local encoding and back.

    The byte content of the instance is the local-encoding form; the
    original UTF-8 form is cached on the ``_utf8`` attribute so the
    conversion back is lossless.
    '''

    def __new__(cls, u, l):
        # type: (Type[_Tlocalstr], bytes, bytes) -> _Tlocalstr
        # 'u' is the UTF-8 byte string and 'l' its local-encoding form.
        # Both are bytes: every construction site in this module passes
        # byte strings (the previous 'Text' annotation for 'u' matched no
        # call site).
        s = bytes.__new__(cls, l)
        s._utf8 = u
        return s

    def __hash__(self):
        return hash(self._utf8)  # avoid collisions in local string space
108
127
109
128
class safelocalstr(bytes):
    """A local-encoding byte string known to round-trip to UTF-8.

    Unlike localstr, no separate UTF-8 copy is carried along: the type
    tag alone records that the conversion was lossless.  Instances
    therefore compare and hash exactly like plain bytes:

    >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
    >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
    >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
    >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
    """
119
138
120
139
def tolocal(s):
    # type: (bytes) -> bytes
    # annotation fix: the argument is a byte string (it is decoded with
    # s.decode() below), not Text as previously claimed.
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    >>> u = b'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = b'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> b'foo: ?' in d
    False
    >>> l1 = b'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    if isasciistr(s):
        return s

    try:
        try:
            # make sure string is actually stored in UTF-8
            u = s.decode('UTF-8')
            if encoding == b'UTF-8':
                # fast path
                return s
            r = u.encode(_sysstr(encoding), r"replace")
            if u == r.decode(_sysstr(encoding)):
                # r is a safe, non-lossy encoding of s
                return safelocalstr(r)
            return localstr(s, r)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                u = s.decode(_sysstr(fallbackencoding))
                r = u.encode(_sysstr(encoding), r"replace")
                if u == r.decode(_sysstr(encoding)):
                    # r is a safe, non-lossy encoding of s
                    return safelocalstr(r)
                return localstr(u.encode('UTF-8'), r)
            except UnicodeDecodeError:
                u = s.decode("utf-8", "replace")  # last ditch
                # can't round-trip
                return u.encode(_sysstr(encoding), r"replace")
    except LookupError as k:
        # the configured encoding name is unknown to the codecs registry
        raise error.Abort(k, hint=b"please check your locale settings")
185
205
186
206
def fromlocal(s):
    # type: (bytes) -> bytes
    # annotation fix: every return path yields bytes (s itself, s._utf8,
    # or u.encode("utf-8")), so the previous '-> Text' was wrong.
    """
    Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.
    """

    # can we do a lossless round-trip?
    if isinstance(s, localstr):
        return s._utf8
    if isasciistr(s):
        return s

    try:
        u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
        return u.encode("utf-8")
    except UnicodeDecodeError as inst:
        # show a window of context around the offending byte
        sub = s[max(0, inst.start - 10) : inst.start + 10]
        raise error.Abort(
            b"decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
        )
    except LookupError as k:
        # the configured encoding name is unknown to the codecs registry
        raise error.Abort(k, hint=b"please check your locale settings")
214
235
215
236
def unitolocal(u):
    # type: (Text) -> bytes
    """Convert a unicode string to a byte string of local encoding"""
    # Route through UTF-8 so tolocal() can apply its caching/round-trip
    # machinery.
    utf8bytes = u.encode('utf-8')
    return tolocal(utf8bytes)
219
241
220
242
def unifromlocal(s):
    # type: (bytes) -> Text
    """Convert a byte string of local encoding to a unicode string"""
    # fromlocal() yields UTF-8 bytes, which always decode cleanly.
    utf8bytes = fromlocal(s)
    return utf8bytes.decode('utf-8')
224
247
225
248
def unimethod(bytesfunc):
    # type: (Callable[[Any], bytes]) -> Callable[[Any], Text]
    """Wrap a __bytes__-style method so it can serve as __unicode__()
    or __str__() on Python 3.

    The returned callable invokes ``bytesfunc`` and converts its byte
    result to a unicode string in the local encoding.
    """

    def unifunc(obj):
        return unifromlocal(bytesfunc(obj))

    return unifunc
234
258
235
259
# converter functions between native str and byte string. use these if the
# character encoding is not aware (e.g. exception message) or is known to
# be locale dependent (e.g. date formatting.)
if pycompat.ispy3:
    strtolocal = unitolocal
    strfromlocal = unifromlocal
    strmethod = unimethod
else:
    # On Python 2 the native str type already is a byte string, so the
    # converters are identity functions.

    def strtolocal(s):
        # type: (str) -> bytes
        return s

    def strfromlocal(s):
        # type: (bytes) -> str
        return s

    strmethod = pycompat.identity
254
278
if not _nativeenviron:
    # now encoding and helper functions are available, recreate the environ
    # dict to be exported to other modules (the provisional utf-8 dict
    # built earlier used a guessed encoding)
    environ = dict(
        (tolocal(k.encode(r'utf-8')), tolocal(v.encode(r'utf-8')))
        for k, v in os.environ.items()  # re-exports
    )
262
286
if pycompat.ispy3:
    # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
    # returns bytes.
    if pycompat.iswindows:
        # Python 3 on Windows issues a DeprecationWarning about using the bytes
        # API when os.getcwdb() is called.
        getcwd = lambda: strtolocal(os.getcwd())  # re-exports
    else:
        getcwd = os.getcwdb  # re-exports
else:
    getcwd = os.getcwd  # re-exports

# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
# The letters are unicodedata.east_asian_width() categories counted as
# two columns by ucolwidth(): W(ide), F(ullwidth), and optionally
# A(mbiguous).
_wide = _sysstr(
    environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
    and b"WFA"
    or b"WF"
)
281
305
282
306
def colwidth(s):
    # type: (bytes) -> int
    b"Find the column width of a string for display in the local encoding"
    # Undecodable bytes are replaced rather than raising, so any input
    # yields a width.
    decoded = s.decode(_sysstr(encoding), r'replace')
    return ucolwidth(decoded)
286
311
287
312
def ucolwidth(d):
    # type: (Text) -> int
    b"Find the column width of a Unicode string for display"
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        # no East Asian width data in this Python build; count every
        # character as one column
        return len(d)
    # characters whose width category is in _wide occupy two columns
    return sum([2 if eaw(c) in _wide else 1 for c in d])
294
320
295
321
def getcols(s, start, c):
    # type: (bytes, int, int) -> bytes
    '''Use colwidth to find a c-column substring of s starting at byte
    index start'''
    # Grow the candidate one byte at a time until it renders as exactly
    # c columns; like the historical implementation, the probe stops
    # before the end of s and falls through (returning None) when no
    # prefix matches.
    x = start + c
    while x < len(s):
        t = s[start:x]
        if colwidth(t) == c:
            return t
        x += 1
303
330
304
331
def trim(s, width, ellipsis=b'', leftside=False):
    # type: (bytes, int, bytes, bool) -> bytes
    """Trim byte string 's' to at most 'width' display columns.

    'ellipsis' (included in the budget) is placed at the trimmed side;
    with leftside=True the left end of 's' is trimmed instead of the
    right.  When 's' cannot be decoded in the local encoding, trimming
    falls back to counting bytes instead of display columns.
    """
    try:
        u = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        # Undecodable input: every byte counts as one column.
        if len(s) <= width:  # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0:  # no enough room even for ellipsis
            return ellipsis[: width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width:  # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0:  # no enough room even for ellipsis
        return ellipsis[: width + len(ellipsis)]

    # Drop characters from the trimmed side one at a time until the
    # remainder fits in the column budget.
    if leftside:
        uslice = lambda i: u[i:]
        concat = lambda t: ellipsis + t
    else:
        uslice = lambda i: u[:-i]
        concat = lambda t: t + ellipsis
    for i in pycompat.xrange(1, len(u)):
        usub = uslice(i)
        if ucolwidth(usub) <= width:
            return concat(usub.encode(_sysstr(encoding)))
    return ellipsis  # no enough room for multi-column characters
400
428
401
429
def lower(s):
    # type: (bytes) -> bytes
    b"best-effort encoding-aware case-folding of local string s"
    # Pure-ASCII input takes the fast C path.
    try:
        return asciilower(s)
    except UnicodeDecodeError:
        pass
    try:
        if isinstance(s, localstr):
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        lu = u.lower()
        # Unchanged by folding: hand back the original object so a
        # localstr tag survives.
        if lu == u:
            return s  # preserve localstring
        return lu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.lower()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint=b"please check your locale settings")
422
451
423
452
def upper(s):
    # type: (bytes) -> bytes
    b"best-effort encoding-aware case-folding of local string s"
    # ASCII-only input takes the fast C path; anything containing
    # non-ASCII bytes defers to the slower encoding-aware fallback.
    try:
        return asciiupper(s)
    except UnicodeDecodeError:
        return upperfallback(s)
430
460
431
461
def upperfallback(s):
    # type: (bytes) -> bytes
    # annotation fix: was (Any) -> Any, but the body decodes 's' as bytes
    # and every return path produces bytes.
    """Encoding-aware uppercasing for strings asciiupper() rejected.

    Folds in Unicode space using the local encoding, preserving the
    original object (and any localstr tag) when folding is a no-op.
    """
    try:
        if isinstance(s, localstr):
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        uu = u.upper()
        if u == uu:
            return s  # preserve localstring
        return uu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.upper()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint=b"please check your locale settings")
447
478
448
479
class normcasespecs(object):
    '''Constants describing what a platform's normcase does to ASCII.

    Each platform declares one of these, and the declaration should
    agree with the platform's actual normcase behavior:

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h.'''

    lower = -1
    upper = 1
    other = 0
464
495
465
496
def jsonescape(s, paranoid=False):
    # type: (bytes, bool) -> bytes
    # annotation fix: replaces the uninformative (Any, Any) -> Any with
    # the types every use below exhibits — byte strings in and out (see
    # the doctests), and a boolean flag defaulting to False.
    '''returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr/safelocalstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape(b'this is a test')
    'this is a test'
    >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
    'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
    >>> jsonescape(b'a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape(b'')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> s = b'escape characters: \\0 \\x0b \\x7f'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape(b'<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    '''

    u8chars = toutf8b(s)
    try:
        # fast path: the accelerated helper signals unsupported input by
        # raising ValueError
        return _jsonescapeu8fast(u8chars, paranoid)
    except ValueError:
        pass
    return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
517
549
518
550
519 # We need to decode/encode U+DCxx codes transparently since invalid UTF-8
551 # We need to decode/encode U+DCxx codes transparently since invalid UTF-8
520 # bytes are mapped to that range.
552 # bytes are mapped to that range.
521 if pycompat.ispy3:
553 if pycompat.ispy3:
522 _utf8strict = r'surrogatepass'
554 _utf8strict = r'surrogatepass'
523 else:
555 else:
524 _utf8strict = r'strict'
556 _utf8strict = r'strict'
525
557
526 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
558 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
527
559
528
560
def getutf8char(s, pos):
    # type: (Any, Any) -> Any
    '''Return the complete utf-8 encoded character of ``s`` starting at ``pos``.

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    '''

    # The high nibble of the lead byte encodes the sequence width
    # (0 entries in the table mean a plain single ASCII byte).
    width = _utf8len[ord(s[pos : pos + 1]) >> 4]
    if not width:
        # ascii fast path: one byte, nothing to validate
        return s[pos : pos + 1]

    char = s[pos : pos + width]
    # validation: decoding raises UnicodeError on a malformed sequence
    char.decode("utf-8", _utf8strict)
    return char
545
578
546
579
def toutf8b(s):
    # type: (Any) -> Any
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can have
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    if isinstance(s, localstr):
        # the cached UTF-8 form is assumed to never contain invalid
        # characters in the U+DCxx range
        return s._utf8
    if isinstance(s, safelocalstr):
        # s was already verified to be non-lossy in the legacy encoding,
        # which shouldn't contain characters in U+DCxx range
        return fromlocal(s)
    if isasciistr(s):
        return s
    if b"\xed" not in s:
        # no potential surrogate bytes present; if the whole string
        # decodes as UTF-8 it can pass through unmodified
        try:
            s.decode('utf-8', _utf8strict)
            return s
        except UnicodeDecodeError:
            pass

    s = pycompat.bytestr(s)
    pieces = []
    i = 0
    end = len(s)
    while i < end:
        try:
            ch = getutf8char(s, i)
            if b"\xed\xb0\x80" <= ch <= b"\xed\xb3\xbf":
                # an existing U+DCxx character must itself be re-escaped
                # so that decoding remains unambiguous
                ch = unichr(0xDC00 + ord(s[i])).encode('utf-8', _utf8strict)
                i += 1
            else:
                i += len(ch)
        except UnicodeDecodeError:
            # invalid byte: map it into the U+DC00-U+DCFF surrogate range
            ch = unichr(0xDC00 + ord(s[i])).encode('utf-8', _utf8strict)
            i += 1
        pieces.append(ch)
    return b"".join(pieces)
613
647
614
648
def fromutf8b(s):
    # type: (Text) -> bytes
    '''Given a UTF-8b string, return a local, possibly-binary string.

    return the original binary string. This
    is a round-trip process for strings like filenames, but metadata
    that's was passed through tolocal will remain in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = b"\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip(b"\\xc2\\xc2\\x80")
    True
    >>> roundtrip(b"\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
    True
    '''

    if isasciistr(s):
        return s
    # fast path: no \xed byte means no U+DCxx escapes can be present
    if b"\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    s = pycompat.bytestr(s)
    pieces = []
    i = 0
    end = len(s)
    while i < end:
        ch = getutf8char(s, i)
        i += len(ch)
        if b"\xed\xb0\x80" <= ch <= b"\xed\xb3\xbf":
            # a U+DCxx escape: recover the original raw byte from the
            # low 8 bits of the decoded code point
            ch = pycompat.bytechr(ord(ch.decode("utf-8", _utf8strict)) & 0xFF)
        pieces.append(ch)
    return b"".join(pieces)
@@ -1,27 +1,28
1 #require test-repo pyflakes hg10
1 #require test-repo pyflakes hg10
2
2
3 $ . "$TESTDIR/helpers-testrepo.sh"
3 $ . "$TESTDIR/helpers-testrepo.sh"
4
4
5 run pyflakes on all tracked files ending in .py or without a file ending
5 run pyflakes on all tracked files ending in .py or without a file ending
6 (skipping binary file random-seed)
6 (skipping binary file random-seed)
7
7
8 $ cat > test.py <<EOF
8 $ cat > test.py <<EOF
9 > print(undefinedname)
9 > print(undefinedname)
10 > EOF
10 > EOF
11 $ pyflakes test.py 2>/dev/null | "$TESTDIR/filterpyflakes.py"
11 $ pyflakes test.py 2>/dev/null | "$TESTDIR/filterpyflakes.py"
12 test.py:1: undefined name 'undefinedname'
12 test.py:1: undefined name 'undefinedname'
13
13
14 $ cd "`dirname "$TESTDIR"`"
14 $ cd "`dirname "$TESTDIR"`"
15
15
16 $ testrepohg locate 'set:**.py or grep("^#!.*python")' \
16 $ testrepohg locate 'set:**.py or grep("^#!.*python")' \
17 > -X hgext/fsmonitor/pywatchman \
17 > -X hgext/fsmonitor/pywatchman \
18 > -X mercurial/pycompat.py -X contrib/python-zstandard \
18 > -X mercurial/pycompat.py -X contrib/python-zstandard \
19 > -X mercurial/thirdparty/cbor \
19 > -X mercurial/thirdparty/cbor \
20 > -X mercurial/thirdparty/concurrent \
20 > -X mercurial/thirdparty/concurrent \
21 > -X mercurial/thirdparty/zope \
21 > -X mercurial/thirdparty/zope \
22 > 2>/dev/null \
22 > 2>/dev/null \
23 > | xargs pyflakes 2>/dev/null | "$TESTDIR/filterpyflakes.py"
23 > | xargs pyflakes 2>/dev/null | "$TESTDIR/filterpyflakes.py"
24 contrib/perf.py:*: undefined name 'xrange' (glob) (?)
24 contrib/perf.py:*: undefined name 'xrange' (glob) (?)
25 mercurial/hgweb/server.py:*: undefined name 'reload' (glob) (?)
25 mercurial/hgweb/server.py:*: undefined name 'reload' (glob) (?)
26 mercurial/util.py:*: undefined name 'file' (glob) (?)
26 mercurial/util.py:*: undefined name 'file' (glob) (?)
27 mercurial/encoding.py:*: undefined name 'localstr' (glob) (?)
27
28
General Comments 0
You need to be logged in to leave comments. Login now