##// END OF EJS Templates
encoding: add comment-based type hints for pytype...
Augie Fackler -
r44187:2ade00f3 default
parent child Browse files
Show More
@@ -1,660 +1,695
1 1 # encoding.py - character transcoding support for Mercurial
2 2 #
3 3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 from __future__ import absolute_import, print_function
9 9
10 10 import locale
11 11 import os
12 12 import unicodedata
13 13
14 14 from .pycompat import getattr
15 15 from . import (
16 16 error,
17 17 policy,
18 18 pycompat,
19 19 )
20 20
21 21 from .pure import charencode as charencodepure
22 22
if not globals():  # hide this from non-pytype users
    from typing import (
        Any,
        Callable,
        List,
        Text,
        Type,
        TypeVar,
        Union,
    )

    # keep pyflakes happy
    for t in (Any, Callable, List, Text, Type, Union):
        assert t

    # NOTE(review): `localstr` is defined later in this module, so pyflakes
    # reports it as an undefined name here; the `if not globals()` guard
    # means this line never executes at runtime — it exists only so pytype
    # can see the TypeVar bound.
    _Tlocalstr = TypeVar('_Tlocalstr', bound=localstr)
# Character-encoding helpers, C-accelerated when available; the policy
# module picks between the cext/cffi and pure-Python implementations.
charencode = policy.importmod(r'charencode')

isasciistr = charencode.isasciistr
asciilower = charencode.asciilower
asciiupper = charencode.asciiupper
_jsonescapeu8fast = charencode.jsonescapeu8fast

# shorthand for converting bytes to the native str type of this Python
_sysstr = pycompat.sysstr

if pycompat.ispy3:
    unichr = chr

# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
_ignore = [
    unichr(int(x, 16)).encode("utf-8")
    for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e "
    b"206a 206b 206c 206d 206e 206f feff".split()
]
# verify the next function will work
assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)
45 62
46 63
def hfsignoreclean(s):
    # type: (bytes) -> bytes
    """Strip from ``s`` every codepoint that HFS+ ignores.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # All ignored codepoints encode to UTF-8 sequences whose first byte
    # is 0xe2 or 0xef (asserted at module load), so a quick byte scan
    # lets most strings pass through untouched.
    if b"\xe2" not in s and b"\xef" not in s:
        return s
    for junk in _ignore:
        s = s.replace(junk, b'')
    return s
59 77
60 78
# encoding.environ is provided read-only, which may not be used to modify
# the process environment
_nativeenviron = not pycompat.ispy3 or os.supports_bytes_environ
if not pycompat.ispy3:
    environ = os.environ  # re-exports
elif _nativeenviron:
    environ = os.environb  # re-exports
else:
    # preferred encoding isn't known yet; use utf-8 to avoid unicode error
    # and recreate it once encoding is settled
    environ = dict(
        (k.encode(r'utf-8'), v.encode(r'utf-8'))
        for k, v in os.environ.items()  # re-exports
    )

# Map locale-reported encoding names to the canonical names Python knows.
_encodingrewrites = {
    b'646': b'ascii',
    b'ANSI_X3.4-1968': b'ascii',
}
# cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
# No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
# https://bugs.python.org/issue13216
if pycompat.iswindows and not pycompat.ispy3:
    _encodingrewrites[b'cp65001'] = b'utf-8'

# Determine the local encoding: HGENCODING wins, then the locale's
# preferred encoding, then ASCII as a last resort (also used when the
# locale itself is broken and raises locale.Error).
try:
    encoding = environ.get(b"HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding().encode('ascii') or b'ascii'
    encoding = _encodingrewrites.get(encoding, encoding)
except locale.Error:
    encoding = b'ascii'
# How undecodable input is reported: 'strict' (abort), 'replace' or 'ignore'.
encodingmode = environ.get(b"HGENCODINGMODE", b"strict")
fallbackencoding = b'ISO-8859-1'
95 113
96 114
class localstr(bytes):
    """A byte string that remembers its original UTF-8 form.

    Instances behave as their locally-encoded bytes, but carry the UTF-8
    text they were converted from, so the (possibly lossy) conversion to
    the local encoding can be reversed without data loss.
    """

    def __new__(cls, u, l):
        # type: (Type[_Tlocalstr], Text, bytes) -> _Tlocalstr
        inst = bytes.__new__(cls, l)
        inst._utf8 = u
        return inst

    def __hash__(self):
        # Hash the UTF-8 form, so two different UTF-8 strings that map to
        # the same lossy local representation do not collide in dicts/sets.
        return hash(self._utf8)
109 128
class safelocalstr(bytes):
    """Tagged string denoting it was previously an internal UTF-8 string,
    and can be converted back to UTF-8 losslessly

    Unlike localstr, this tag carries no extra payload: equality and
    hashing are inherited from bytes, so it interoperates transparently
    with plain byte strings in dicts and sets.

    >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
    >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
    >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
    >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
    """
119 138
120 139
def tolocal(s):
    # type: (bytes) -> bytes
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    >>> u = b'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = b'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> b'foo: ?' in d
    False
    >>> l1 = b'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    # ASCII is valid in every supported encoding: nothing to convert.
    if isasciistr(s):
        return s

    try:
        try:
            # make sure string is actually stored in UTF-8
            u = s.decode('UTF-8')
            if encoding == b'UTF-8':
                # fast path
                return s
            r = u.encode(_sysstr(encoding), r"replace")
            if u == r.decode(_sysstr(encoding)):
                # r is a safe, non-lossy encoding of s
                return safelocalstr(r)
            # lossy conversion: tag the result with the original UTF-8
            return localstr(s, r)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            # stored before locale support; retry with the latin-1 fallback
            try:
                u = s.decode(_sysstr(fallbackencoding))
                r = u.encode(_sysstr(encoding), r"replace")
                if u == r.decode(_sysstr(encoding)):
                    # r is a safe, non-lossy encoding of s
                    return safelocalstr(r)
                return localstr(u.encode('UTF-8'), r)
            except UnicodeDecodeError:
                u = s.decode("utf-8", "replace")  # last ditch
                # can't round-trip
                return u.encode(_sysstr(encoding), r"replace")
    except LookupError as k:
        # the configured encoding name is unknown to Python
        raise error.Abort(k, hint=b"please check your locale settings")
185 205
186 206
def fromlocal(s):
    # type: (bytes) -> bytes
    """
    Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.
    """

    # can we do a lossless round-trip?
    if isinstance(s, localstr):
        return s._utf8
    if isasciistr(s):
        # ASCII is a subset of UTF-8: nothing to convert
        return s

    try:
        u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
        return u.encode("utf-8")
    except UnicodeDecodeError as inst:
        # show a small window of context around the undecodable byte
        sub = s[max(0, inst.start - 10) : inst.start + 10]
        raise error.Abort(
            b"decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
        )
    except LookupError as k:
        # the configured encoding name is unknown to Python
        raise error.Abort(k, hint=b"please check your locale settings")
214 235
215 236
def unitolocal(u):
    # type: (Text) -> bytes
    """Encode a unicode string into bytes in the local encoding."""
    utf8bytes = u.encode('utf-8')
    return tolocal(utf8bytes)
219 241
220 242
def unifromlocal(s):
    # type: (bytes) -> Text
    """Decode a byte string in the local encoding into unicode."""
    utf8bytes = fromlocal(s)
    return utf8bytes.decode('utf-8')
224 247
225 248
def unimethod(bytesfunc):
    # type: (Callable[[Any], bytes]) -> Callable[[Any], Text]
    """Wrap a __bytes__() implementation so that it can also serve as
    __unicode__()/__str__() on Python 3."""

    def unifunc(obj):
        # Produce bytes via the wrapped method, then decode them through
        # the local-encoding helpers.
        return unifromlocal(bytesfunc(obj))

    return unifunc
234 258
235 259
# converter functions between native str and byte string. use these if the
# character encoding is not aware (e.g. exception message) or is known to
# be locale dependent (e.g. date formatting.)
if pycompat.ispy3:
    # On Python 3 the native str is unicode, so str<->local conversion is
    # exactly the unicode<->local conversion defined above.
    strtolocal = unitolocal
    strfromlocal = unifromlocal
    strmethod = unimethod
else:

    def strtolocal(s):
        # type: (str) -> bytes
        # Python 2 native str is already a locally-encoded byte string.
        return s

    def strfromlocal(s):
        # type: (bytes) -> str
        # Same identity in the other direction on Python 2.
        return s

    strmethod = pycompat.identity
254 278
if not _nativeenviron:
    # now encoding and helper functions are available, recreate the environ
    # dict to be exported to other modules
    environ = dict(
        (tolocal(k.encode(r'utf-8')), tolocal(v.encode(r'utf-8')))
        for k, v in os.environ.items()  # re-exports
    )

if pycompat.ispy3:
    # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
    # returns bytes.
    if pycompat.iswindows:
        # Python 3 on Windows issues a DeprecationWarning about using the bytes
        # API when os.getcwdb() is called.
        getcwd = lambda: strtolocal(os.getcwd())  # re-exports
    else:
        getcwd = os.getcwdb  # re-exports
else:
    getcwd = os.getcwd  # re-exports

# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
# _wide becomes the set of unicodedata.east_asian_width() classes counted
# as two columns: "WF" (Wide, Fullwidth), plus "A" (Ambiguous) on request.
_wide = _sysstr(
    environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
    and b"WFA"
    or b"WF"
)
281 305
282 306
def colwidth(s):
    # type: (bytes) -> int
    """Return the display width, in columns, of a local-encoding string."""
    decoded = s.decode(_sysstr(encoding), r'replace')
    return ucolwidth(decoded)
286 311
287 312
def ucolwidth(d):
    # type: (Text) -> int
    """Return the display width, in columns, of a unicode string."""
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        # No east-asian-width data available: count one column per char.
        return len(d)
    # Characters in a _wide width class occupy two columns, all others one.
    return sum(2 if eaw(ch) in _wide else 1 for ch in d)
294 320
295 321
def getcols(s, start, c):
    # type: (bytes, int, int) -> bytes
    """Return a substring of ``s`` starting at byte index ``start`` that
    occupies exactly ``c`` display columns.

    Falls off the end (returning None) when no prefix of the remainder
    measures exactly ``c`` columns.
    """
    end = len(s)
    stop = start + c
    while stop < end:
        candidate = s[start:stop]
        if colwidth(candidate) == c:
            return candidate
        stop += 1
303 330
304 331
def trim(s, width, ellipsis=b'', leftside=False):
    # type: (bytes, int, bytes, bool) -> bytes
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    >>> from .node import bin
    >>> def bprint(s):
    ...     print(pycompat.sysstr(s))
    >>> ellipsis = b'+++'
    >>> from . import encoding
    >>> encoding.encoding = b'utf-8'
    >>> t = b'1234567890'
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    12345+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++67890
    >>> bprint(trim(t, 8))
    12345678
    >>> bprint(trim(t, 8, leftside=True))
    34567890
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(pycompat.sysstr(encoding.encoding))
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 5))
    \xe3\x81\x82\xe3\x81\x84
    >>> bprint(trim(t, 5, leftside=True))
    \xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 4, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
    +++
    >>> t = bin(b'112233445566778899aa') # invalid byte sequence
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8))
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> bprint(trim(t, 8, leftside=True))
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    """
    try:
        u = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        # Undecodable input: fall back to trimming by raw byte count
        # instead of by display column.
        if len(s) <= width:  # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0:  # no enough room even for ellipsis
            return ellipsis[: width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width:  # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0:  # no enough room even for ellipsis
        return ellipsis[: width + len(ellipsis)]

    if leftside:
        uslice = lambda i: u[i:]
        concat = lambda s: ellipsis + s
    else:
        uslice = lambda i: u[:-i]
        concat = lambda s: s + ellipsis
    # Drop characters one at a time from the trimmed side until the rest
    # fits: wide (2-column) characters make the fit non-linear in i.
    for i in pycompat.xrange(1, len(u)):
        usub = uslice(i)
        if ucolwidth(usub) <= width:
            return concat(usub.encode(_sysstr(encoding)))
    return ellipsis  # no enough room for multi-column characters
400 428
401 429
def lower(s):
    # type: (bytes) -> bytes
    """Best-effort, encoding-aware lowercasing of local string ``s``."""
    try:
        # Fast path: pure-ASCII strings are folded in C.
        return asciilower(s)
    except UnicodeDecodeError:
        pass
    try:
        if isinstance(s, localstr):
            decoded = s._utf8.decode("utf-8")
        else:
            decoded = s.decode(_sysstr(encoding), _sysstr(encodingmode))
        folded = decoded.lower()
        if folded == decoded:
            # Folding is a no-op: return the original object so any
            # localstr tagging survives.
            return s
        return folded.encode(_sysstr(encoding))
    except UnicodeError:
        # we don't know how to fold this except in ASCII
        return s.lower()
    except LookupError as k:
        raise error.Abort(k, hint=b"please check your locale settings")
422 451
423 452
def upper(s):
    # type: (bytes) -> bytes
    """Best-effort, encoding-aware uppercasing of local string ``s``."""
    try:
        # Fast path: pure-ASCII strings are folded in C.
        return asciiupper(s)
    except UnicodeDecodeError:
        return upperfallback(s)
430 460
431 461
def upperfallback(s):
    # type: (Any) -> Any
    """Uppercase ``s`` via a full decode/encode round trip.

    Used when the ASCII fast path in upper() fails.
    """
    try:
        if isinstance(s, localstr):
            decoded = s._utf8.decode("utf-8")
        else:
            decoded = s.decode(_sysstr(encoding), _sysstr(encodingmode))
        folded = decoded.upper()
        if folded == decoded:
            # Folding is a no-op: keep the original (possibly tagged) object.
            return s
        return folded.encode(_sysstr(encoding))
    except UnicodeError:
        # we don't know how to fold this except in ASCII
        return s.upper()
    except LookupError as k:
        raise error.Abort(k, hint=b"please check your locale settings")
447 478
448 479
class normcasespecs(object):
    '''what a platform's normcase does to ASCII strings

    This is specified per platform, and should be consistent with what normcase
    on that platform actually does.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h.'''

    lower = -1  # normcase lowercases ASCII strings
    upper = 1  # normcase uppercases ASCII strings
    other = 0  # platform-specific fallback must always be consulted
464 495
465 496
def jsonescape(s, paranoid=False):
    # type: (Any, Any) -> Any
    '''returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr/safelocalstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape(b'this is a test')
    'this is a test'
    >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
    'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
    >>> jsonescape(b'a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape(b'')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> s = b'escape characters: \\0 \\x0b \\x7f'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape(b'<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    '''

    u8chars = toutf8b(s)
    try:
        # Prefer the C-accelerated escaper; it raises ValueError on input
        # it cannot handle, in which case we fall back to pure Python.
        return _jsonescapeu8fast(u8chars, paranoid)
    except ValueError:
        pass
    return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
517 549
518 550
# We need to decode/encode U+DCxx codes transparently since invalid UTF-8
# bytes are mapped to that range.
if pycompat.ispy3:
    _utf8strict = r'surrogatepass'
else:
    _utf8strict = r'strict'

# UTF-8 sequence length, indexed by the high nibble of the first byte:
# 0x0-0x7 are ASCII (length 0 means "return the byte without decoding"),
# 0x8-0xB are stray continuation bytes (length 1; the validating decode
# then fails), 0xC-0xF are multi-byte lead bytes (lengths 2-4).
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
527 559
528 560
def getutf8char(s, pos):
    # type: (bytes, int) -> bytes
    '''get the next full utf-8 character in the given string, starting at pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    '''

    # find how many bytes to attempt decoding from first nibble
    # (slicing rather than indexing keeps py2/py3 bytes semantics the same)
    l = _utf8len[ord(s[pos : pos + 1]) >> 4]
    if not l:  # ascii
        return s[pos : pos + 1]

    c = s[pos : pos + l]
    # validate with attempted decode
    c.decode("utf-8", _utf8strict)
    return c
545 578
546 579
def toutf8b(s):
    # type: (bytes) -> bytes
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can have
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    if isinstance(s, localstr):
        # assume that the original UTF-8 sequence would never contain
        # invalid characters in U+DCxx range
        return s._utf8
    elif isinstance(s, safelocalstr):
        # already verified that s is non-lossy in legacy encoding, which
        # shouldn't contain characters in U+DCxx range
        return fromlocal(s)
    elif isasciistr(s):
        return s
    if b"\xed" not in s:
        # no lead byte of a U+DCxx sequence present: if the whole string
        # is valid UTF-8 it can pass through unchanged
        try:
            s.decode('utf-8', _utf8strict)
            return s
        except UnicodeDecodeError:
            pass

    # slow path: walk the string character by character, escaping invalid
    # bytes (and pre-existing U+DCxx sequences) into the surrogate range
    s = pycompat.bytestr(s)
    r = b""
    pos = 0
    l = len(s)
    while pos < l:
        try:
            c = getutf8char(s, pos)
            if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
                # have to re-escape existing U+DCxx characters
                c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
                pos += 1
            else:
                pos += len(c)
        except UnicodeDecodeError:
            # invalid byte: map it into U+DC00-U+DCFF
            c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
            pos += 1
        r += c
    return r
613 647
614 648
def fromutf8b(s):
    # type: (bytes) -> bytes
    '''Given a UTF-8b string, return a local, possibly-binary string.

    return the original binary string. This
    is a round-trip process for strings like filenames, but metadata
    that was passed through tolocal will remain in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = b"\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip(b"\\xc2\\xc2\\x80")
    True
    >>> roundtrip(b"\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
    True
    '''

    if isasciistr(s):
        return s
    # fast path - look for uDxxx prefixes in s
    if b"\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    s = pycompat.bytestr(s)
    r = b""
    pos = 0
    l = len(s)
    while pos < l:
        c = getutf8char(s, pos)
        pos += len(c)
        # unescape U+DCxx characters
        if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
            # recover the original byte from the low 8 bits of the
            # surrogate codepoint
            c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF)
        r += c
    return r
@@ -1,27 +1,28
#require test-repo pyflakes hg10

  $ . "$TESTDIR/helpers-testrepo.sh"

run pyflakes on all tracked files ending in .py or without a file ending
(skipping binary file random-seed)

  $ cat > test.py <<EOF
  > print(undefinedname)
  > EOF
  $ pyflakes test.py 2>/dev/null | "$TESTDIR/filterpyflakes.py"
  test.py:1: undefined name 'undefinedname'

  $ cd "`dirname "$TESTDIR"`"

the encoding.py warning below is expected: its pytype-only typing block
references localstr before the class is defined, but that block is guarded
so it never executes at runtime

  $ testrepohg locate 'set:**.py or grep("^#!.*python")' \
  > -X hgext/fsmonitor/pywatchman \
  > -X mercurial/pycompat.py -X contrib/python-zstandard \
  > -X mercurial/thirdparty/cbor \
  > -X mercurial/thirdparty/concurrent \
  > -X mercurial/thirdparty/zope \
  > 2>/dev/null \
  > | xargs pyflakes 2>/dev/null | "$TESTDIR/filterpyflakes.py"
  contrib/perf.py:*: undefined name 'xrange' (glob) (?)
  mercurial/hgweb/server.py:*: undefined name 'reload' (glob) (?)
  mercurial/util.py:*: undefined name 'file' (glob) (?)
  mercurial/encoding.py:*: undefined name 'localstr' (glob) (?)
General Comments 0
You need to be logged in to leave comments. Login now