encoding: drop circular import by proxying through '<policy>.charencode'...
Yuya Nishihara
r33756:f5fc54e7 default
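
In brief: encoding.py previously deferred importing the C 'parsers' module until the first call, to dodge the cycle util => i18n => encoding => parsers => util. This changeset moves the ASCII case-folding helpers into a dedicated 'charencode' module, which has no dependency back into that cycle, so the import can happen once at load time. A condensed before/after view, assembled from the hunks below:

    # Before (encoding.py): import deferred to the first call; the
    # module-level name is then rebound so later calls skip the lookup.
    def asciilower(s):
        parsers = policy.importmod(r'parsers')
        impl = getattr(parsers, 'asciilower', _asciilower)
        global asciilower
        asciilower = impl
        return impl(s)

    # After (encoding.py): one eager import when the module loads.
    charencode = policy.importmod(r'charencode')
    asciilower = charencode.asciilower
    asciiupper = charencode.asciiupper
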
@@ -0,0 +1,22 @@
1 # charencode.py - miscellaneous character encoding
2 #
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 #
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
7
8 from __future__ import absolute_import
9
10 def asciilower(s):
11 '''convert a string to lowercase if ASCII
12
13 Raises UnicodeDecodeError if non-ASCII characters are found.'''
14 s.decode('ascii')
15 return s.lower()
16
17 def asciiupper(s):
18 '''convert a string to uppercase if ASCII
19
20 Raises UnicodeDecodeError if non-ASCII characters are found.'''
21 s.decode('ascii')
22 return s.upper()
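
Usage sketch for the new helpers (a hypothetical doctest session; Python 2 semantics, where str is a byte string). The bare s.decode('ascii') call exists only for its side effect: it raises UnicodeDecodeError on any non-ASCII byte before the plain str case-folding runs:

    >>> asciilower('MiXeD')
    'mixed'
    >>> asciiupper('MiXeD')
    'MIXED'
    >>> asciilower('caf\xc3\xa9')
    Traceback (most recent call last):
        ...
    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 3: ordinal not in range(128)
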
@@ -1,602 +1,575 @@
1 1 # encoding.py - character transcoding support for Mercurial
2 2 #
3 3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 from __future__ import absolute_import
9 9
10 10 import array
11 11 import locale
12 12 import os
13 13 import unicodedata
14 14
15 15 from . import (
16 16 error,
17 17 policy,
18 18 pycompat,
19 19 )
20 20
21 charencode = policy.importmod(r'charencode')
22
23 asciilower = charencode.asciilower
24 asciiupper = charencode.asciiupper
25
21 26 _sysstr = pycompat.sysstr
22 27
23 28 if pycompat.ispy3:
24 29 unichr = chr
25 30
26 31 # These unicode characters are ignored by HFS+ (Apple Technote 1150,
27 32 # "Unicode Subtleties"), so we need to ignore them in some places for
28 33 # sanity.
29 34 _ignore = [unichr(int(x, 16)).encode("utf-8") for x in
30 35 "200c 200d 200e 200f 202a 202b 202c 202d 202e "
31 36 "206a 206b 206c 206d 206e 206f feff".split()]
32 37 # verify the next function will work
33 38 assert all(i.startswith(("\xe2", "\xef")) for i in _ignore)
34 39
35 40 def hfsignoreclean(s):
36 41 """Remove codepoints ignored by HFS+ from s.
37 42
38 43 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
39 44 '.hg'
40 45 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
41 46 '.hg'
42 47 """
43 48 if "\xe2" in s or "\xef" in s:
44 49 for c in _ignore:
45 50 s = s.replace(c, '')
46 51 return s
47 52
48 53 # encoding.environ is provided read-only; it must not be used to modify
49 54 # the process environment
50 55 _nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ)
51 56 if not pycompat.ispy3:
52 57 environ = os.environ # re-exports
53 58 elif _nativeenviron:
54 59 environ = os.environb # re-exports
55 60 else:
56 61 # preferred encoding isn't known yet; use utf-8 to avoid unicode error
57 62 # and recreate it once encoding is settled
58 63 environ = dict((k.encode(u'utf-8'), v.encode(u'utf-8'))
59 64 for k, v in os.environ.items()) # re-exports
60 65
61 66 _encodingfixers = {
62 67 '646': lambda: 'ascii',
63 68 'ANSI_X3.4-1968': lambda: 'ascii',
64 69 }
65 70
66 71 try:
67 72 encoding = environ.get("HGENCODING")
68 73 if not encoding:
69 74 encoding = locale.getpreferredencoding().encode('ascii') or 'ascii'
70 75 encoding = _encodingfixers.get(encoding, lambda: encoding)()
71 76 except locale.Error:
72 77 encoding = 'ascii'
73 78 encodingmode = environ.get("HGENCODINGMODE", "strict")
74 79 fallbackencoding = 'ISO-8859-1'
75 80
76 81 class localstr(str):
77 82 '''This class allows strings that are unmodified to be
78 83 round-tripped to the local encoding and back'''
79 84 def __new__(cls, u, l):
80 85 s = str.__new__(cls, l)
81 86 s._utf8 = u
82 87 return s
83 88 def __hash__(self):
84 89 return hash(self._utf8) # avoid collisions in local string space
85 90
86 91 def tolocal(s):
87 92 """
88 93 Convert a string from internal UTF-8 to local encoding
89 94
90 95 All internal strings should be UTF-8 but some repos before the
91 96 implementation of locale support may contain latin1 or possibly
92 97 other character sets. We attempt to decode everything strictly
93 98 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
94 99 replace unknown characters.
95 100
96 101 The localstr class is used to cache the known UTF-8 encoding of
97 102 strings next to their local representation to allow lossless
98 103 round-trip conversion back to UTF-8.
99 104
100 105 >>> u = 'foo: \\xc3\\xa4' # utf-8
101 106 >>> l = tolocal(u)
102 107 >>> l
103 108 'foo: ?'
104 109 >>> fromlocal(l)
105 110 'foo: \\xc3\\xa4'
106 111 >>> u2 = 'foo: \\xc3\\xa1'
107 112 >>> d = { l: 1, tolocal(u2): 2 }
108 113 >>> len(d) # no collision
109 114 2
110 115 >>> 'foo: ?' in d
111 116 False
112 117 >>> l1 = 'foo: \\xe4' # historical latin1 fallback
113 118 >>> l = tolocal(l1)
114 119 >>> l
115 120 'foo: ?'
116 121 >>> fromlocal(l) # magically in utf-8
117 122 'foo: \\xc3\\xa4'
118 123 """
119 124
120 125 try:
121 126 try:
122 127 # make sure string is actually stored in UTF-8
123 128 u = s.decode('UTF-8')
124 129 if encoding == 'UTF-8':
125 130 # fast path
126 131 return s
127 132 r = u.encode(_sysstr(encoding), u"replace")
128 133 if u == r.decode(_sysstr(encoding)):
129 134 # r is a safe, non-lossy encoding of s
130 135 return r
131 136 return localstr(s, r)
132 137 except UnicodeDecodeError:
133 138 # we should only get here if we're looking at an ancient changeset
134 139 try:
135 140 u = s.decode(_sysstr(fallbackencoding))
136 141 r = u.encode(_sysstr(encoding), u"replace")
137 142 if u == r.decode(_sysstr(encoding)):
138 143 # r is a safe, non-lossy encoding of s
139 144 return r
140 145 return localstr(u.encode('UTF-8'), r)
141 146 except UnicodeDecodeError:
142 147 u = s.decode("utf-8", "replace") # last ditch
143 148 # can't round-trip
144 149 return u.encode(_sysstr(encoding), u"replace")
145 150 except LookupError as k:
146 151 raise error.Abort(k, hint="please check your locale settings")
147 152
148 153 def fromlocal(s):
149 154 """
150 155 Convert a string from the local character encoding to UTF-8
151 156
152 157 We attempt to decode strings using the encoding mode set by
153 158 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
154 159 characters will cause an error message. Other modes include
155 160 'replace', which replaces unknown characters with a special
156 161 Unicode character, and 'ignore', which drops the character.
157 162 """
158 163
159 164 # can we do a lossless round-trip?
160 165 if isinstance(s, localstr):
161 166 return s._utf8
162 167
163 168 try:
164 169 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
165 170 return u.encode("utf-8")
166 171 except UnicodeDecodeError as inst:
167 172 sub = s[max(0, inst.start - 10):inst.start + 10]
168 173 raise error.Abort("decoding near '%s': %s!" % (sub, inst))
169 174 except LookupError as k:
170 175 raise error.Abort(k, hint="please check your locale settings")
171 176
172 177 def unitolocal(u):
173 178 """Convert a unicode string to a byte string of local encoding"""
174 179 return tolocal(u.encode('utf-8'))
175 180
176 181 def unifromlocal(s):
177 182 """Convert a byte string of local encoding to a unicode string"""
178 183 return fromlocal(s).decode('utf-8')
179 184
180 185 def unimethod(bytesfunc):
181 186 """Create a proxy method that forwards __unicode__() and __str__() of
182 187 Python 3 to __bytes__()"""
183 188 def unifunc(obj):
184 189 return unifromlocal(bytesfunc(obj))
185 190 return unifunc
186 191
187 192 # converter functions between native str and byte string. use these if the
188 193 # character encoding is not known (e.g. exception messages) or is known to
189 194 # be locale dependent (e.g. date formatting).
190 195 if pycompat.ispy3:
191 196 strtolocal = unitolocal
192 197 strfromlocal = unifromlocal
193 198 strmethod = unimethod
194 199 else:
195 200 strtolocal = pycompat.identity
196 201 strfromlocal = pycompat.identity
197 202 strmethod = pycompat.identity
198 203
199 204 if not _nativeenviron:
200 205 # now encoding and helper functions are available, recreate the environ
201 206 # dict to be exported to other modules
202 207 environ = dict((tolocal(k.encode(u'utf-8')), tolocal(v.encode(u'utf-8')))
203 208 for k, v in os.environ.items()) # re-exports
204 209
205 210 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
206 211 _wide = _sysstr(environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
207 212 and "WFA" or "WF")
208 213
209 214 def colwidth(s):
210 215 "Find the column width of a string for display in the local encoding"
211 216 return ucolwidth(s.decode(_sysstr(encoding), u'replace'))
212 217
213 218 def ucolwidth(d):
214 219 "Find the column width of a Unicode string for display"
215 220 eaw = getattr(unicodedata, 'east_asian_width', None)
216 221 if eaw is not None:
217 222 return sum([eaw(c) in _wide and 2 or 1 for c in d])
218 223 return len(d)
219 224
220 225 def getcols(s, start, c):
221 226 '''Use colwidth to find a c-column substring of s starting at byte
222 227 index start'''
223 228 for x in xrange(start + c, len(s)):
224 229 t = s[start:x]
225 230 if colwidth(t) == c:
226 231 return t
227 232
228 233 def trim(s, width, ellipsis='', leftside=False):
229 234 """Trim string 's' to at most 'width' columns (including 'ellipsis').
230 235
231 236 If 'leftside' is True, the left side of string 's' is trimmed.
232 237 'ellipsis' is always placed at the trimmed side.
233 238
234 239 >>> ellipsis = '+++'
235 240 >>> from . import encoding
236 241 >>> encoding.encoding = 'utf-8'
237 242 >>> t = '1234567890'
238 243 >>> print trim(t, 12, ellipsis=ellipsis)
239 244 1234567890
240 245 >>> print trim(t, 10, ellipsis=ellipsis)
241 246 1234567890
242 247 >>> print trim(t, 8, ellipsis=ellipsis)
243 248 12345+++
244 249 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
245 250 +++67890
246 251 >>> print trim(t, 8)
247 252 12345678
248 253 >>> print trim(t, 8, leftside=True)
249 254 34567890
250 255 >>> print trim(t, 3, ellipsis=ellipsis)
251 256 +++
252 257 >>> print trim(t, 1, ellipsis=ellipsis)
253 258 +
254 259 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
255 260 >>> t = u.encode(encoding.encoding)
256 261 >>> print trim(t, 12, ellipsis=ellipsis)
257 262 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
258 263 >>> print trim(t, 10, ellipsis=ellipsis)
259 264 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
260 265 >>> print trim(t, 8, ellipsis=ellipsis)
261 266 \xe3\x81\x82\xe3\x81\x84+++
262 267 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
263 268 +++\xe3\x81\x88\xe3\x81\x8a
264 269 >>> print trim(t, 5)
265 270 \xe3\x81\x82\xe3\x81\x84
266 271 >>> print trim(t, 5, leftside=True)
267 272 \xe3\x81\x88\xe3\x81\x8a
268 273 >>> print trim(t, 4, ellipsis=ellipsis)
269 274 +++
270 275 >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
271 276 +++
272 277 >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
273 278 >>> print trim(t, 12, ellipsis=ellipsis)
274 279 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
275 280 >>> print trim(t, 10, ellipsis=ellipsis)
276 281 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
277 282 >>> print trim(t, 8, ellipsis=ellipsis)
278 283 \x11\x22\x33\x44\x55+++
279 284 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
280 285 +++\x66\x77\x88\x99\xaa
281 286 >>> print trim(t, 8)
282 287 \x11\x22\x33\x44\x55\x66\x77\x88
283 288 >>> print trim(t, 8, leftside=True)
284 289 \x33\x44\x55\x66\x77\x88\x99\xaa
285 290 >>> print trim(t, 3, ellipsis=ellipsis)
286 291 +++
287 292 >>> print trim(t, 1, ellipsis=ellipsis)
288 293 +
289 294 """
290 295 try:
291 296 u = s.decode(_sysstr(encoding))
292 297 except UnicodeDecodeError:
293 298 if len(s) <= width: # trimming is not needed
294 299 return s
295 300 width -= len(ellipsis)
296 301 if width <= 0: # not enough room even for ellipsis
297 302 return ellipsis[:width + len(ellipsis)]
298 303 if leftside:
299 304 return ellipsis + s[-width:]
300 305 return s[:width] + ellipsis
301 306
302 307 if ucolwidth(u) <= width: # trimming is not needed
303 308 return s
304 309
305 310 width -= len(ellipsis)
306 311 if width <= 0: # not enough room even for ellipsis
307 312 return ellipsis[:width + len(ellipsis)]
308 313
309 314 if leftside:
310 315 uslice = lambda i: u[i:]
311 316 concat = lambda s: ellipsis + s
312 317 else:
313 318 uslice = lambda i: u[:-i]
314 319 concat = lambda s: s + ellipsis
315 320 for i in xrange(1, len(u)):
316 321 usub = uslice(i)
317 322 if ucolwidth(usub) <= width:
318 323 return concat(usub.encode(_sysstr(encoding)))
319 324 return ellipsis # not enough room for multi-column characters
320 325
321 def _asciilower(s):
322 '''convert a string to lowercase if ASCII
323
324 Raises UnicodeDecodeError if non-ASCII characters are found.'''
325 s.decode('ascii')
326 return s.lower()
327
328 def asciilower(s):
329 # delaying the import avoids a cyclic dependency around "parsers" in
330 # the pure Python build (util => i18n => encoding => parsers => util)
331 parsers = policy.importmod(r'parsers')
332 impl = getattr(parsers, 'asciilower', _asciilower)
333 global asciilower
334 asciilower = impl
335 return impl(s)
336
337 def _asciiupper(s):
338 '''convert a string to uppercase if ASCII
339
340 Raises UnicodeDecodeError if non-ASCII characters are found.'''
341 s.decode('ascii')
342 return s.upper()
343
344 def asciiupper(s):
345 # delaying the import avoids a cyclic dependency around "parsers" in
346 # the pure Python build (util => i18n => encoding => parsers => util)
347 parsers = policy.importmod(r'parsers')
348 impl = getattr(parsers, 'asciiupper', _asciiupper)
349 global asciiupper
350 asciiupper = impl
351 return impl(s)
352
353 326 def lower(s):
354 327 "best-effort encoding-aware case-folding of local string s"
355 328 try:
356 329 return asciilower(s)
357 330 except UnicodeDecodeError:
358 331 pass
359 332 try:
360 333 if isinstance(s, localstr):
361 334 u = s._utf8.decode("utf-8")
362 335 else:
363 336 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
364 337
365 338 lu = u.lower()
366 339 if u == lu:
367 340 return s # preserve localstring
368 341 return lu.encode(_sysstr(encoding))
369 342 except UnicodeError:
370 343 return s.lower() # we don't know how to fold this except in ASCII
371 344 except LookupError as k:
372 345 raise error.Abort(k, hint="please check your locale settings")
373 346
374 347 def upper(s):
375 348 "best-effort encoding-aware case-folding of local string s"
376 349 try:
377 350 return asciiupper(s)
378 351 except UnicodeDecodeError:
379 352 return upperfallback(s)
380 353
381 354 def upperfallback(s):
382 355 try:
383 356 if isinstance(s, localstr):
384 357 u = s._utf8.decode("utf-8")
385 358 else:
386 359 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
387 360
388 361 uu = u.upper()
389 362 if u == uu:
390 363 return s # preserve localstring
391 364 return uu.encode(_sysstr(encoding))
392 365 except UnicodeError:
393 366 return s.upper() # we don't know how to fold this except in ASCII
394 367 except LookupError as k:
395 368 raise error.Abort(k, hint="please check your locale settings")
396 369
397 370 class normcasespecs(object):
398 371 '''what a platform's normcase does to ASCII strings
399 372
400 373 This is specified per platform, and should be consistent with what normcase
401 374 on that platform actually does.
402 375
403 376 lower: normcase lowercases ASCII strings
404 377 upper: normcase uppercases ASCII strings
405 378 other: the fallback function should always be called
406 379
407 380 This should be kept in sync with normcase_spec in util.h.'''
408 381 lower = -1
409 382 upper = 1
410 383 other = 0
411 384
412 385 _jsonmap = []
413 386 _jsonmap.extend("\\u%04x" % x for x in range(32))
414 387 _jsonmap.extend(pycompat.bytechr(x) for x in range(32, 127))
415 388 _jsonmap.append('\\u007f')
416 389 _jsonmap[0x09] = '\\t'
417 390 _jsonmap[0x0a] = '\\n'
418 391 _jsonmap[0x22] = '\\"'
419 392 _jsonmap[0x5c] = '\\\\'
420 393 _jsonmap[0x08] = '\\b'
421 394 _jsonmap[0x0c] = '\\f'
422 395 _jsonmap[0x0d] = '\\r'
423 396 _paranoidjsonmap = _jsonmap[:]
424 397 _paranoidjsonmap[0x3c] = '\\u003c' # '<' (e.g. escape "</script>")
425 398 _paranoidjsonmap[0x3e] = '\\u003e' # '>'
426 399 _jsonmap.extend(pycompat.bytechr(x) for x in range(128, 256))
427 400
428 401 def jsonescape(s, paranoid=False):
429 402 '''returns a string suitable for JSON
430 403
431 404 JSON is problematic for us because it doesn't support non-Unicode
432 405 bytes. To deal with this, we take the following approach:
433 406
434 407 - localstr objects are converted back to UTF-8
435 408 - valid UTF-8/ASCII strings are passed as-is
436 409 - other strings are converted to UTF-8b surrogate encoding
437 410 - apply JSON-specified string escaping
438 411
439 412 (escapes are doubled in these tests)
440 413
441 414 >>> jsonescape('this is a test')
442 415 'this is a test'
443 416 >>> jsonescape('escape characters: \\0 \\x0b \\x7f')
444 417 'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
445 418 >>> jsonescape('escape characters: \\t \\n \\r \\" \\\\')
446 419 'escape characters: \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
447 420 >>> jsonescape('a weird byte: \\xdd')
448 421 'a weird byte: \\xed\\xb3\\x9d'
449 422 >>> jsonescape('utf-8: caf\\xc3\\xa9')
450 423 'utf-8: caf\\xc3\\xa9'
451 424 >>> jsonescape('')
452 425 ''
453 426
454 427 If paranoid, non-ascii and common troublesome characters are also escaped.
455 428 This is suitable for web output.
456 429
457 430 >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
458 431 'escape boundary: ~ \\\\u007f \\\\u0080'
459 432 >>> jsonescape('a weird byte: \\xdd', paranoid=True)
460 433 'a weird byte: \\\\udcdd'
461 434 >>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True)
462 435 'utf-8: caf\\\\u00e9'
463 436 >>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
464 437 'non-BMP: \\\\ud834\\\\udd1e'
465 438 >>> jsonescape('<foo@example.org>', paranoid=True)
466 439 '\\\\u003cfoo@example.org\\\\u003e'
467 440 '''
468 441
469 442 if paranoid:
470 443 jm = _paranoidjsonmap
471 444 else:
472 445 jm = _jsonmap
473 446
474 447 u8chars = toutf8b(s)
475 448 try:
476 449 return ''.join(jm[x] for x in bytearray(u8chars)) # fast path
477 450 except IndexError:
478 451 pass
479 452 # non-BMP char is represented as UTF-16 surrogate pair
480 453 u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
481 454 u16codes.pop(0) # drop BOM
482 455 return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)
483 456
484 457 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
485 458
486 459 def getutf8char(s, pos):
487 460 '''get the next full utf-8 character in the given string, starting at pos
488 461
489 462 Raises a UnicodeError if the given location does not start a valid
490 463 utf-8 character.
491 464 '''
492 465
493 466 # find how many bytes to attempt decoding from first nibble
494 467 l = _utf8len[ord(s[pos]) >> 4]
495 468 if not l: # ascii
496 469 return s[pos]
497 470
498 471 c = s[pos:pos + l]
499 472 # validate with attempted decode
500 473 c.decode("utf-8")
501 474 return c
502 475
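
A worked example of the _utf8len table (a hypothetical doctest session): the high nibble of the lead byte selects the sequence length, and zero marks a single-byte ASCII character that is returned as-is:

    >>> ord('\xe3') >> 4    # lead byte of '\xe3\x81\x82' (U+3042)
    14
    >>> _utf8len[14]        # nibble 0xe -> 3-byte sequence
    3
    >>> getutf8char('\xe3\x81\x82abc', 0)
    '\xe3\x81\x82'
    >>> getutf8char('\xe3\x81\x82abc', 3)   # pos 3 starts plain ASCII
    'a'
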
503 476 def toutf8b(s):
504 477 '''convert a local, possibly-binary string into UTF-8b
505 478
506 479 This is intended as a generic method to preserve data when working
507 480 with schemes like JSON and XML that have no provision for
508 481 arbitrary byte strings. As Mercurial often doesn't know
509 482 what encoding data is in, we use so-called UTF-8b.
510 483
511 484 If a string is already valid UTF-8 (or ASCII), it passes unmodified.
512 485 Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
513 486 U+DC00-U+DCFF.
514 487
515 488 Principles of operation:
516 489
517 490 - ASCII and UTF-8 data successfully round-trips and is understood
518 491 by Unicode-oriented clients
519 492 - filenames and file contents in arbitrary other encodings can
520 493 be round-tripped or recovered by clueful clients
521 494 - local strings that have a cached known UTF-8 encoding (aka
522 495 localstr) get sent as UTF-8 so Unicode-oriented clients get the
523 496 Unicode data they want
524 497 - because we must preserve UTF-8 bytestrings in places such as
525 498 filenames, metadata can't be roundtripped without help
526 499
527 500 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
528 501 arbitrary bytes into an internal Unicode format that can be
529 502 re-encoded back into the original. Here we are exposing the
530 503 internal surrogate encoding as a UTF-8 string.)
531 504 '''
532 505
533 506 if "\xed" not in s:
534 507 if isinstance(s, localstr):
535 508 return s._utf8
536 509 try:
537 510 s.decode('utf-8')
538 511 return s
539 512 except UnicodeDecodeError:
540 513 pass
541 514
542 515 r = ""
543 516 pos = 0
544 517 l = len(s)
545 518 while pos < l:
546 519 try:
547 520 c = getutf8char(s, pos)
548 521 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
549 522 # have to re-escape existing U+DCxx characters
550 523 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
551 524 pos += 1
552 525 else:
553 526 pos += len(c)
554 527 except UnicodeDecodeError:
555 528 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
556 529 pos += 1
557 530 r += c
558 531 return r
559 532
560 533 def fromutf8b(s):
561 534 '''Given a UTF-8b string, return a local, possibly-binary string.
562 535
563 536 This reverses toutf8b and returns the original binary string. It
564 537 is a round-trip process for strings like filenames, but metadata
565 538 that was passed through tolocal will remain in UTF-8.
566 539
567 540 >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
568 541 >>> m = "\\xc3\\xa9\\x99abcd"
569 542 >>> toutf8b(m)
570 543 '\\xc3\\xa9\\xed\\xb2\\x99abcd'
571 544 >>> roundtrip(m)
572 545 True
573 546 >>> roundtrip("\\xc2\\xc2\\x80")
574 547 True
575 548 >>> roundtrip("\\xef\\xbf\\xbd")
576 549 True
577 550 >>> roundtrip("\\xef\\xef\\xbf\\xbd")
578 551 True
579 552 >>> roundtrip("\\xf1\\x80\\x80\\x80\\x80")
580 553 True
581 554 '''
582 555
583 556 # fast path - look for uDxxx prefixes in s
584 557 if "\xed" not in s:
585 558 return s
586 559
587 560 # We could do this with the unicode type but some Python builds
588 561 # use UTF-16 internally (issue5031) which causes non-BMP code
589 562 # points to be escaped. Instead, we use our handy getutf8char
590 563 # helper again to walk the string without "decoding" it.
591 564
592 565 r = ""
593 566 pos = 0
594 567 l = len(s)
595 568 while pos < l:
596 569 c = getutf8char(s, pos)
597 570 pos += len(c)
598 571 # unescape U+DCxx characters
599 572 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
600 573 c = chr(ord(c.decode("utf-8")) & 0xff)
601 574 r += c
602 575 return r
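
To make the surrogate mapping concrete, a hypothetical doctest session: each uninterpretable byte 0xNN is mapped to U+DCNN, whose three-byte UTF-8 form always begins with 0xed, which is why both toutf8b and fromutf8b can use the cheap "\xed" in s test as a fast path:

    >>> unichr(0xdc00 + 0xdd).encode('utf-8')    # byte \xdd -> U+DCDD
    '\xed\xb3\x9d'
    >>> toutf8b('a weird byte: \xdd')
    'a weird byte: \xed\xb3\x9d'
    >>> fromutf8b('a weird byte: \xed\xb3\x9d')  # ...and back
    'a weird byte: \xdd'
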
@@ -1,114 +1,116 @@
1 1 # policy.py - module policy logic for Mercurial.
2 2 #
3 3 # Copyright 2015 Gregory Szorc <gregory.szorc@gmail.com>
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 from __future__ import absolute_import
9 9
10 10 import os
11 11 import sys
12 12
13 13 # Rules for how modules can be loaded. Values are:
14 14 #
15 15 # c - require C extensions
16 16 # allow - allow pure Python implementation when C loading fails
17 17 # cffi - required cffi versions (implemented within pure module)
18 18 # cffi-allow - allow pure Python implementation if cffi version is missing
19 19 # py - only load pure Python modules
20 20 #
21 21 # By default, fall back to the pure modules so the in-place build can
22 22 # run without recompiling the C extensions. This will be overridden by
23 23 # __modulepolicy__ generated by setup.py.
24 24 policy = b'allow'
25 25 _packageprefs = {
26 26 # policy: (versioned package, pure package)
27 27 b'c': (r'cext', None),
28 28 b'allow': (r'cext', r'pure'),
29 29 b'cffi': (r'cffi', None),
30 30 b'cffi-allow': (r'cffi', r'pure'),
31 31 b'py': (None, r'pure'),
32 32 }
33 33
34 34 try:
35 35 from . import __modulepolicy__
36 36 policy = __modulepolicy__.modulepolicy
37 37 except ImportError:
38 38 pass
39 39
40 40 # PyPy doesn't load C extensions.
41 41 #
42 42 # The canonical way to do this is to test platform.python_implementation().
43 43 # But we don't import platform and don't bloat for it here.
44 44 if r'__pypy__' in sys.builtin_module_names:
45 45 policy = b'cffi'
46 46
47 47 # Our C extensions aren't yet compatible with Python 3. So use pure Python
48 48 # on Python 3 for now.
49 49 if sys.version_info[0] >= 3:
50 50 policy = b'py'
51 51
52 52 # Environment variable can always force settings.
53 53 if sys.version_info[0] >= 3:
54 54 if r'HGMODULEPOLICY' in os.environ:
55 55 policy = os.environ[r'HGMODULEPOLICY'].encode(r'utf-8')
56 56 else:
57 57 policy = os.environ.get(r'HGMODULEPOLICY', policy)
58 58
59 59 def _importfrom(pkgname, modname):
60 60 # from .<pkgname> import <modname> (where . is looked through this module)
61 61 fakelocals = {}
62 62 pkg = __import__(pkgname, globals(), fakelocals, [modname], level=1)
63 63 try:
64 64 fakelocals[modname] = mod = getattr(pkg, modname)
65 65 except AttributeError:
66 66 raise ImportError(r'cannot import name %s' % modname)
67 67 # force import; fakelocals[modname] may be replaced with the real module
68 68 getattr(mod, r'__doc__', None)
69 69 return fakelocals[modname]
70 70
71 71 # keep in sync with "version" in C modules
72 72 _cextversions = {
73 73 (r'cext', r'base85'): 1,
74 74 (r'cext', r'bdiff'): 1,
75 75 (r'cext', r'diffhelpers'): 1,
76 76 (r'cext', r'mpatch'): 1,
77 77 (r'cext', r'osutil'): 1,
78 78 (r'cext', r'parsers'): 1,
79 79 }
80 80
81 81 # map import request to other package or module
82 82 _modredirects = {
83 (r'cext', r'charencode'): (r'cext', r'parsers'),
83 84 (r'cffi', r'base85'): (r'pure', r'base85'),
85 (r'cffi', r'charencode'): (r'pure', r'charencode'),
84 86 (r'cffi', r'diffhelpers'): (r'pure', r'diffhelpers'),
85 87 (r'cffi', r'parsers'): (r'pure', r'parsers'),
86 88 }
87 89
88 90 def _checkmod(pkgname, modname, mod):
89 91 expected = _cextversions.get((pkgname, modname))
90 92 actual = getattr(mod, r'version', None)
91 93 if actual != expected:
92 94 raise ImportError(r'cannot import module %s.%s '
93 95 r'(expected version: %d, actual: %r)'
94 96 % (pkgname, modname, expected, actual))
95 97
96 98 def importmod(modname):
97 99 """Import module according to policy and check API version"""
98 100 try:
99 101 verpkg, purepkg = _packageprefs[policy]
100 102 except KeyError:
101 103 raise ImportError(r'invalid HGMODULEPOLICY %r' % policy)
102 104 assert verpkg or purepkg
103 105 if verpkg:
104 106 pn, mn = _modredirects.get((verpkg, modname), (verpkg, modname))
105 107 try:
106 108 mod = _importfrom(pn, mn)
107 109 if pn == verpkg:
108 110 _checkmod(pn, mn, mod)
109 111 return mod
110 112 except ImportError:
111 113 if not purepkg:
112 114 raise
113 115 pn, mn = _modredirects.get((purepkg, modname), (purepkg, modname))
114 116 return _importfrom(pn, mn)
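
Resolution walk-through for the new import (a sketch based only on the tables above, not additional API): under the default b'allow' policy, importmod(r'charencode') is redirected to the C parsers module, which embeds the C implementations of the helpers; only a pure build falls through to the charencode.py added by this changeset:

    # importmod(r'charencode') with policy = b'allow':
    verpkg, purepkg = _packageprefs[b'allow']        # (r'cext', r'pure')
    pn, mn = _modredirects.get((verpkg, r'charencode'),
                               (verpkg, r'charencode'))
    # -> (r'cext', r'parsers'); since pn == verpkg, _checkmod also verifies
    # that cext.parsers carries the expected version before returning it.
    #
    # If that import fails (pure build), the fallback lookup
    # _modredirects.get((r'pure', r'charencode'), ...) finds no redirect,
    # so the pure charencode module (the new file above) is imported.
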