doctest: do not embed non-ascii characters in docstring...
Yuya Nishihara
r34138:414a3513 default
@@ -1,585 +1,586
1 1 # encoding.py - character transcoding support for Mercurial
2 2 #
3 3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 from __future__ import absolute_import
9 9
10 10 import io
11 11 import locale
12 12 import os
13 13 import unicodedata
14 14
15 15 from . import (
16 16 error,
17 17 policy,
18 18 pycompat,
19 19 )
20 20
21 21 from .pure import (
22 22 charencode as charencodepure,
23 23 )
24 24
25 25 charencode = policy.importmod(r'charencode')
26 26
27 27 isasciistr = charencode.isasciistr
28 28 asciilower = charencode.asciilower
29 29 asciiupper = charencode.asciiupper
30 30 _jsonescapeu8fast = charencode.jsonescapeu8fast
31 31
32 32 _sysstr = pycompat.sysstr
33 33
34 34 if pycompat.ispy3:
35 35 unichr = chr
36 36
37 37 # These unicode characters are ignored by HFS+ (Apple Technote 1150,
38 38 # "Unicode Subtleties"), so we need to ignore them in some places for
39 39 # sanity.
40 40 _ignore = [unichr(int(x, 16)).encode("utf-8") for x in
41 41 "200c 200d 200e 200f 202a 202b 202c 202d 202e "
42 42 "206a 206b 206c 206d 206e 206f feff".split()]
43 43 # verify the next function will work
44 44 assert all(i.startswith(("\xe2", "\xef")) for i in _ignore)
45 45
46 46 def hfsignoreclean(s):
47 47 """Remove codepoints ignored by HFS+ from s.
48 48
49 49 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
50 50 '.hg'
51 51 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
52 52 '.hg'
53 53 """
54 54 if "\xe2" in s or "\xef" in s:
55 55 for c in _ignore:
56 56 s = s.replace(c, '')
57 57 return s
58 58
59 59 # encoding.environ is provided read-only; it must not be used to modify
60 60 # the process environment
61 61 _nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ)
62 62 if not pycompat.ispy3:
63 63 environ = os.environ # re-exports
64 64 elif _nativeenviron:
65 65 environ = os.environb # re-exports
66 66 else:
67 67 # preferred encoding isn't known yet; use utf-8 to avoid unicode error
68 68 # and recreate it once encoding is settled
69 69 environ = dict((k.encode(u'utf-8'), v.encode(u'utf-8'))
70 70 for k, v in os.environ.items()) # re-exports
71 71
72 72 _encodingfixers = {
73 73 '646': lambda: 'ascii',
74 74 'ANSI_X3.4-1968': lambda: 'ascii',
75 75 }
76 76
77 77 try:
78 78 encoding = environ.get("HGENCODING")
79 79 if not encoding:
80 80 encoding = locale.getpreferredencoding().encode('ascii') or 'ascii'
81 81 encoding = _encodingfixers.get(encoding, lambda: encoding)()
82 82 except locale.Error:
83 83 encoding = 'ascii'
84 84 encodingmode = environ.get("HGENCODINGMODE", "strict")
85 85 fallbackencoding = 'ISO-8859-1'
86 86
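The lookup order above (HGENCODING first, then the locale's preferred encoding, then ASCII, with alias fixups) can be mirrored in a standalone sketch; the helper name below is hypothetical, not Mercurial's API:

    import locale
    import os

    def detect_encoding():
        # hypothetical mirror of the lookup implemented above
        enc = os.environ.get('HGENCODING')
        if not enc:
            try:
                enc = locale.getpreferredencoding() or 'ascii'
            except locale.Error:
                enc = 'ascii'
        # normalize the aliases handled by _encodingfixers
        return {'646': 'ascii', 'ANSI_X3.4-1968': 'ascii'}.get(enc, enc)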
87 87 class localstr(bytes):
88 88 '''This class allows strings that are unmodified to be
89 89 round-tripped to the local encoding and back'''
90 90 def __new__(cls, u, l):
91 91 s = bytes.__new__(cls, l)
92 92 s._utf8 = u
93 93 return s
94 94 def __hash__(self):
95 95 return hash(self._utf8) # avoid collisions in local string space
96 96
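A hand-built instance illustrates the contract (tolocal() below is what normally constructs these; this is illustration only):

    ls = localstr(b'caf\xc3\xa9', b'caf?')   # (utf-8 form, lossy local form)
    assert bytes(ls) == b'caf?'              # behaves as the local string
    assert ls._utf8 == b'caf\xc3\xa9'        # but remembers the UTF-8 original
    assert hash(ls) == hash(b'caf\xc3\xa9')  # hashed by the UTF-8 form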
97 97 def tolocal(s):
98 98 """
99 99 Convert a string from internal UTF-8 to local encoding
100 100
101 101 All internal strings should be UTF-8 but some repos before the
102 102 implementation of locale support may contain latin1 or possibly
103 103 other character sets. We attempt to decode everything strictly
104 104 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
105 105 replace unknown characters.
106 106
107 107 The localstr class is used to cache the known UTF-8 encoding of
108 108 strings next to their local representation to allow lossless
109 109 round-trip conversion back to UTF-8.
110 110
111 111 >>> u = b'foo: \\xc3\\xa4' # utf-8
112 112 >>> l = tolocal(u)
113 113 >>> l
114 114 'foo: ?'
115 115 >>> fromlocal(l)
116 116 'foo: \\xc3\\xa4'
117 117 >>> u2 = b'foo: \\xc3\\xa1'
118 118 >>> d = { l: 1, tolocal(u2): 2 }
119 119 >>> len(d) # no collision
120 120 2
121 121 >>> b'foo: ?' in d
122 122 False
123 123 >>> l1 = b'foo: \\xe4' # historical latin1 fallback
124 124 >>> l = tolocal(l1)
125 125 >>> l
126 126 'foo: ?'
127 127 >>> fromlocal(l) # magically in utf-8
128 128 'foo: \\xc3\\xa4'
129 129 """
130 130
131 131 if isasciistr(s):
132 132 return s
133 133
134 134 try:
135 135 try:
136 136 # make sure string is actually stored in UTF-8
137 137 u = s.decode('UTF-8')
138 138 if encoding == 'UTF-8':
139 139 # fast path
140 140 return s
141 141 r = u.encode(_sysstr(encoding), u"replace")
142 142 if u == r.decode(_sysstr(encoding)):
143 143 # r is a safe, non-lossy encoding of s
144 144 return r
145 145 return localstr(s, r)
146 146 except UnicodeDecodeError:
147 147 # we should only get here if we're looking at an ancient changeset
148 148 try:
149 149 u = s.decode(_sysstr(fallbackencoding))
150 150 r = u.encode(_sysstr(encoding), u"replace")
151 151 if u == r.decode(_sysstr(encoding)):
152 152 # r is a safe, non-lossy encoding of s
153 153 return r
154 154 return localstr(u.encode('UTF-8'), r)
155 155 except UnicodeDecodeError:
156 156 u = s.decode("utf-8", "replace") # last ditch
157 157 # can't round-trip
158 158 return u.encode(_sysstr(encoding), u"replace")
159 159 except LookupError as k:
160 160 raise error.Abort(k, hint="please check your locale settings")
161 161
162 162 def fromlocal(s):
163 163 """
164 164 Convert a string from the local character encoding to UTF-8
165 165
166 166 We attempt to decode strings using the encoding mode set by
167 167 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
168 168 characters will cause an error message. Other modes include
169 169 'replace', which replaces unknown characters with a special
170 170 Unicode character, and 'ignore', which drops the character.
171 171 """
172 172
173 173 # can we do a lossless round-trip?
174 174 if isinstance(s, localstr):
175 175 return s._utf8
176 176 if isasciistr(s):
177 177 return s
178 178
179 179 try:
180 180 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
181 181 return u.encode("utf-8")
182 182 except UnicodeDecodeError as inst:
183 183 sub = s[max(0, inst.start - 10):inst.start + 10]
184 184 raise error.Abort("decoding near '%s': %s!" % (sub, inst))
185 185 except LookupError as k:
186 186 raise error.Abort(k, hint="please check your locale settings")
187 187
188 188 def unitolocal(u):
189 189 """Convert a unicode string to a byte string of local encoding"""
190 190 return tolocal(u.encode('utf-8'))
191 191
192 192 def unifromlocal(s):
193 193 """Convert a byte string of local encoding to a unicode string"""
194 194 return fromlocal(s).decode('utf-8')
195 195
196 196 def unimethod(bytesfunc):
197 197 """Create a proxy method that forwards __unicode__() and __str__() of
198 198 Python 3 to __bytes__()"""
199 199 def unifunc(obj):
200 200 return unifromlocal(bytesfunc(obj))
201 201 return unifunc
202 202
203 203 # converter functions between native str and byte string. use these if the
204 204 # character encoding is unknown (e.g. exception messages) or is known to
205 205 # be locale dependent (e.g. date formatting).
206 206 if pycompat.ispy3:
207 207 strtolocal = unitolocal
208 208 strfromlocal = unifromlocal
209 209 strmethod = unimethod
210 210 else:
211 211 strtolocal = pycompat.identity
212 212 strfromlocal = pycompat.identity
213 213 strmethod = pycompat.identity
214 214
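For example, a native exception message can be brought into Mercurial's byte-string world with these helpers (a sketch assuming the module is importable as mercurial.encoding):

    from mercurial import encoding

    try:
        raise ValueError('boom')
    except ValueError as inst:
        # native str -> byte string in the local encoding (identity on Python 2)
        msg = encoding.strtolocal(str(inst))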
215 215 if not _nativeenviron:
216 216 # now encoding and helper functions are available, recreate the environ
217 217 # dict to be exported to other modules
218 218 environ = dict((tolocal(k.encode(u'utf-8')), tolocal(v.encode(u'utf-8')))
219 219 for k, v in os.environ.items()) # re-exports
220 220
221 221 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
222 222 _wide = _sysstr(environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
223 223 and "WFA" or "WF")
224 224
225 225 def colwidth(s):
226 226 "Find the column width of a string for display in the local encoding"
227 227 return ucolwidth(s.decode(_sysstr(encoding), u'replace'))
228 228
229 229 def ucolwidth(d):
230 230 "Find the column width of a Unicode string for display"
231 231 eaw = getattr(unicodedata, 'east_asian_width', None)
232 232 if eaw is not None:
233 233 return sum([eaw(c) in _wide and 2 or 1 for c in d])
234 234 return len(d)
235 235
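The width logic relies on unicodedata.east_asian_width: 'W' and 'F' characters always count as two columns, and 'A' (ambiguous) does so only when HGENCODINGAMBIGUOUS=wide. A standalone check:

    import unicodedata

    assert unicodedata.east_asian_width(u'\u3042') == 'W'  # HIRAGANA A: wide
    assert unicodedata.east_asian_width(u'a') == 'Na'      # Latin a: narrow
    # hence u'\u3042\u3044' occupies 4 columns where u'ab' occupies 2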
236 236 def getcols(s, start, c):
237 237 '''Use colwidth to find a c-column substring of s starting at byte
238 238 index start'''
239 239 for x in xrange(start + c, len(s)):
240 240 t = s[start:x]
241 241 if colwidth(t) == c:
242 242 return t
243 243
244 244 def trim(s, width, ellipsis='', leftside=False):
245 245 """Trim string 's' to at most 'width' columns (including 'ellipsis').
246 246
247 247 If 'leftside' is True, left side of string 's' is trimmed.
248 248 'ellipsis' is always placed at trimmed side.
249 249
250 >>> from .node import bin
250 251 >>> ellipsis = b'+++'
251 252 >>> from . import encoding
252 253 >>> encoding.encoding = b'utf-8'
253 254 >>> t = b'1234567890'
254 255 >>> print trim(t, 12, ellipsis=ellipsis)
255 256 1234567890
256 257 >>> print trim(t, 10, ellipsis=ellipsis)
257 258 1234567890
258 259 >>> print trim(t, 8, ellipsis=ellipsis)
259 260 12345+++
260 261 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
261 262 +++67890
262 263 >>> print trim(t, 8)
263 264 12345678
264 265 >>> print trim(t, 8, leftside=True)
265 266 34567890
266 267 >>> print trim(t, 3, ellipsis=ellipsis)
267 268 +++
268 269 >>> print trim(t, 1, ellipsis=ellipsis)
269 270 +
270 271 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
271 272 >>> t = u.encode(pycompat.sysstr(encoding.encoding))
272 273 >>> print trim(t, 12, ellipsis=ellipsis)
273 274 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
274 275 >>> print trim(t, 10, ellipsis=ellipsis)
275 276 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
276 277 >>> print trim(t, 8, ellipsis=ellipsis)
277 278 \xe3\x81\x82\xe3\x81\x84+++
278 279 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
279 280 +++\xe3\x81\x88\xe3\x81\x8a
280 281 >>> print trim(t, 5)
281 282 \xe3\x81\x82\xe3\x81\x84
282 283 >>> print trim(t, 5, leftside=True)
283 284 \xe3\x81\x88\xe3\x81\x8a
284 285 >>> print trim(t, 4, ellipsis=ellipsis)
285 286 +++
286 287 >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
287 288 +++
288 >>> t = b'\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
289 >>> t = bin(b'112233445566778899aa') # invalid byte sequence
289 290 >>> print trim(t, 12, ellipsis=ellipsis)
290 291 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
291 292 >>> print trim(t, 10, ellipsis=ellipsis)
292 293 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
293 294 >>> print trim(t, 8, ellipsis=ellipsis)
294 295 \x11\x22\x33\x44\x55+++
295 296 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
296 297 +++\x66\x77\x88\x99\xaa
297 298 >>> print trim(t, 8)
298 299 \x11\x22\x33\x44\x55\x66\x77\x88
299 300 >>> print trim(t, 8, leftside=True)
300 301 \x33\x44\x55\x66\x77\x88\x99\xaa
301 302 >>> print trim(t, 3, ellipsis=ellipsis)
302 303 +++
303 304 >>> print trim(t, 1, ellipsis=ellipsis)
304 305 +
305 306 """
306 307 try:
307 308 u = s.decode(_sysstr(encoding))
308 309 except UnicodeDecodeError:
309 310 if len(s) <= width: # trimming is not needed
310 311 return s
311 312 width -= len(ellipsis)
312 313 if width <= 0: # not enough room even for ellipsis
313 314 return ellipsis[:width + len(ellipsis)]
314 315 if leftside:
315 316 return ellipsis + s[-width:]
316 317 return s[:width] + ellipsis
317 318
318 319 if ucolwidth(u) <= width: # trimming is not needed
319 320 return s
320 321
321 322 width -= len(ellipsis)
322 323 if width <= 0: # not enough room even for ellipsis
323 324 return ellipsis[:width + len(ellipsis)]
324 325
325 326 if leftside:
326 327 uslice = lambda i: u[i:]
327 328 concat = lambda s: ellipsis + s
328 329 else:
329 330 uslice = lambda i: u[:-i]
330 331 concat = lambda s: s + ellipsis
331 332 for i in xrange(1, len(u)):
332 333 usub = uslice(i)
333 334 if ucolwidth(usub) <= width:
334 335 return concat(usub.encode(_sysstr(encoding)))
335 336 return ellipsis # not enough room for multi-column characters
336 337
337 338 def lower(s):
338 339 "best-effort encoding-aware case-folding of local string s"
339 340 try:
340 341 return asciilower(s)
341 342 except UnicodeDecodeError:
342 343 pass
343 344 try:
344 345 if isinstance(s, localstr):
345 346 u = s._utf8.decode("utf-8")
346 347 else:
347 348 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
348 349
349 350 lu = u.lower()
350 351 if u == lu:
351 352 return s # preserve localstring
352 353 return lu.encode(_sysstr(encoding))
353 354 except UnicodeError:
354 355 return s.lower() # we don't know how to fold this except in ASCII
355 356 except LookupError as k:
356 357 raise error.Abort(k, hint="please check your locale settings")
357 358
358 359 def upper(s):
359 360 "best-effort encoding-aware case-folding of local string s"
360 361 try:
361 362 return asciiupper(s)
362 363 except UnicodeDecodeError:
363 364 return upperfallback(s)
364 365
365 366 def upperfallback(s):
366 367 try:
367 368 if isinstance(s, localstr):
368 369 u = s._utf8.decode("utf-8")
369 370 else:
370 371 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
371 372
372 373 uu = u.upper()
373 374 if u == uu:
374 375 return s # preserve localstring
375 376 return uu.encode(_sysstr(encoding))
376 377 except UnicodeError:
377 378 return s.upper() # we don't know how to fold this except in ASCII
378 379 except LookupError as k:
379 380 raise error.Abort(k, hint="please check your locale settings")
380 381
381 382 class normcasespecs(object):
382 383 '''what a platform's normcase does to ASCII strings
383 384
384 385 This is specified per platform, and should be consistent with what normcase
385 386 on that platform actually does.
386 387
387 388 lower: normcase lowercases ASCII strings
388 389 upper: normcase uppercases ASCII strings
389 390 other: the fallback function should always be called
390 391
391 392 This should be kept in sync with normcase_spec in util.h.'''
392 393 lower = -1
393 394 upper = 1
394 395 other = 0
395 396
396 397 def jsonescape(s, paranoid=False):
397 398 '''returns a string suitable for JSON
398 399
399 400 JSON is problematic for us because it doesn't support non-Unicode
400 401 bytes. To deal with this, we take the following approach:
401 402
402 403 - localstr objects are converted back to UTF-8
403 404 - valid UTF-8/ASCII strings are passed as-is
404 405 - other strings are converted to UTF-8b surrogate encoding
405 406 - apply JSON-specified string escaping
406 407
407 408 (escapes are doubled in these tests)
408 409
409 410 >>> jsonescape(b'this is a test')
410 411 'this is a test'
411 412 >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
412 413 'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
413 414 >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
414 415 'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
415 416 >>> jsonescape(b'a weird byte: \\xdd')
416 417 'a weird byte: \\xed\\xb3\\x9d'
417 418 >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
418 419 'utf-8: caf\\xc3\\xa9'
419 420 >>> jsonescape(b'')
420 421 ''
421 422
422 423 If paranoid, non-ascii and common troublesome characters are also escaped.
423 424 This is suitable for web output.
424 425
425 426 >>> s = b'escape characters: \\0 \\x0b \\x7f'
426 427 >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
427 428 >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
428 429 >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
429 430 >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
430 431 'escape boundary: ~ \\\\u007f \\\\u0080'
431 432 >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
432 433 'a weird byte: \\\\udcdd'
433 434 >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
434 435 'utf-8: caf\\\\u00e9'
435 436 >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
436 437 'non-BMP: \\\\ud834\\\\udd1e'
437 438 >>> jsonescape(b'<foo@example.org>', paranoid=True)
438 439 '\\\\u003cfoo@example.org\\\\u003e'
439 440 '''
440 441
441 442 u8chars = toutf8b(s)
442 443 try:
443 444 return _jsonescapeu8fast(u8chars, paranoid)
444 445 except ValueError:
445 446 pass
446 447 return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
447 448
448 449 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
449 450
450 451 def getutf8char(s, pos):
451 452 '''get the next full utf-8 character in the given string, starting at pos
452 453
453 454 Raises a UnicodeError if the given location does not start a valid
454 455 utf-8 character.
455 456 '''
456 457
457 458 # find how many bytes to attempt decoding from first nibble
458 459 l = _utf8len[ord(s[pos]) >> 4]
459 460 if not l: # ascii
460 461 return s[pos]
461 462
462 463 c = s[pos:pos + l]
463 464 # validate with attempted decode
464 465 c.decode("utf-8")
465 466 return c
466 467
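The table maps the high nibble of a lead byte to the expected sequence length (0 meaning single-byte ASCII); continuation-byte nibbles map to 1 so that the validating decode in getutf8char fails. A quick standalone check:

    _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
    assert _utf8len[0xc3 >> 4] == 2  # 0b110xxxxx leads a 2-byte sequence
    assert _utf8len[0xe3 >> 4] == 3  # 0b1110xxxx leads a 3-byte sequence
    assert _utf8len[0xf0 >> 4] == 4  # 0b11110xxx leads a 4-byte sequence
    assert _utf8len[0x80 >> 4] == 1  # bare continuation byte: decode raises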
467 468 def toutf8b(s):
468 469 '''convert a local, possibly-binary string into UTF-8b
469 470
470 471 This is intended as a generic method to preserve data when working
471 472 with schemes like JSON and XML that have no provision for
472 473 arbitrary byte strings. As Mercurial often doesn't know
473 474 what encoding data is in, we use so-called UTF-8b.
474 475
475 476 If a string is already valid UTF-8 (or ASCII), it passes unmodified.
476 477 Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
477 478 uDC00-uDCFF.
478 479
479 480 Principles of operation:
480 481
481 482 - ASCII and UTF-8 data successfully round-trips and is understood
482 483 by Unicode-oriented clients
483 484 - filenames and file contents in arbitrary other encodings can be
484 485 round-tripped or recovered by clueful clients
485 486 - local strings that have a cached known UTF-8 encoding (aka
486 487 localstr) get sent as UTF-8 so Unicode-oriented clients get the
487 488 Unicode data they want
488 489 - because we must preserve UTF-8 bytestrings in places such as
489 490 filenames, metadata can't be round-tripped without help
490 491
491 492 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
492 493 arbitrary bytes into an internal Unicode format that can be
493 494 re-encoded back into the original. Here we are exposing the
494 495 internal surrogate encoding as a UTF-8 string.)
495 496 '''
496 497
497 498 if not isinstance(s, localstr) and isasciistr(s):
498 499 return s
499 500 if "\xed" not in s:
500 501 if isinstance(s, localstr):
501 502 return s._utf8
502 503 try:
503 504 s.decode('utf-8')
504 505 return s
505 506 except UnicodeDecodeError:
506 507 pass
507 508
508 509 r = ""
509 510 pos = 0
510 511 l = len(s)
511 512 while pos < l:
512 513 try:
513 514 c = getutf8char(s, pos)
514 515 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
515 516 # have to re-escape existing U+DCxx characters
516 517 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
517 518 pos += 1
518 519 else:
519 520 pos += len(c)
520 521 except UnicodeDecodeError:
521 522 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
522 523 pos += 1
523 524 r += c
524 525 return r
525 526
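Python 3's 'surrogateescape' error handler implements the same escaping scheme, which gives a compact standalone illustration of the U+DC00..U+DCFF mapping (Python 3 only):

    raw = b'caf\xe9'  # latin-1 bytes; not valid UTF-8
    u = raw.decode('utf-8', 'surrogateescape')          # '\xe9' -> U+DCE9
    assert u == 'caf\udce9'
    assert u.encode('utf-8', 'surrogateescape') == raw  # lossless round-trip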
526 527 def fromutf8b(s):
527 528 '''Given a UTF-8b string, return a local, possibly-binary string.
528 529
529 530 It undoes toutf8b's surrogate escaping and returns the original
530 531 binary string. This is a round-trip process for strings like filenames,
531 532 but metadata that was passed through tolocal will remain in UTF-8.
532 533
533 534 >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
534 535 >>> m = b"\\xc3\\xa9\\x99abcd"
535 536 >>> toutf8b(m)
536 537 '\\xc3\\xa9\\xed\\xb2\\x99abcd'
537 538 >>> roundtrip(m)
538 539 True
539 540 >>> roundtrip(b"\\xc2\\xc2\\x80")
540 541 True
541 542 >>> roundtrip(b"\\xef\\xbf\\xbd")
542 543 True
543 544 >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
544 545 True
545 546 >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
546 547 True
547 548 '''
548 549
549 550 if isasciistr(s):
550 551 return s
551 552 # fast path - look for uDxxx prefixes in s
552 553 if "\xed" not in s:
553 554 return s
554 555
555 556 # We could do this with the unicode type but some Python builds
556 557 # use UTF-16 internally (issue5031) which causes non-BMP code
557 558 # points to be escaped. Instead, we use our handy getutf8char
558 559 # helper again to walk the string without "decoding" it.
559 560
560 561 r = ""
561 562 pos = 0
562 563 l = len(s)
563 564 while pos < l:
564 565 c = getutf8char(s, pos)
565 566 pos += len(c)
566 567 # unescape U+DCxx characters
567 568 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
568 569 c = chr(ord(c.decode("utf-8")) & 0xff)
569 570 r += c
570 571 return r
571 572
572 573 if pycompat.ispy3:
573 574 class strio(io.TextIOWrapper):
574 575 """Wrapper around TextIOWrapper that respects hg's encoding assumptions.
575 576
576 577 Also works around TextIOWrapper closing the underlying stream on delete.
577 578 """
578 579
579 580 def __init__(self, buffer):
580 581 super(strio, self).__init__(buffer, encoding=_sysstr(encoding))
581 582
582 583 def __del__(self):
583 584 """Override __del__ so it doesn't close the underlying stream."""
584 585 else:
585 586 strio = pycompat.identity
@@ -1,575 +1,575
1 1 # store.py - repository store handling for Mercurial
2 2 #
3 3 # Copyright 2008 Matt Mackall <mpm@selenic.com>
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 from __future__ import absolute_import
9 9
10 10 import errno
11 11 import hashlib
12 12 import os
13 13 import stat
14 14
15 15 from .i18n import _
16 16 from . import (
17 17 error,
18 18 policy,
19 19 pycompat,
20 20 util,
21 21 vfs as vfsmod,
22 22 )
23 23
24 24 parsers = policy.importmod(r'parsers')
25 25
26 26 # This avoids a collision between a file named foo and a dir named
27 27 # foo.i or foo.d
28 28 def _encodedir(path):
29 29 '''
30 30 >>> _encodedir(b'data/foo.i')
31 31 'data/foo.i'
32 32 >>> _encodedir(b'data/foo.i/bla.i')
33 33 'data/foo.i.hg/bla.i'
34 34 >>> _encodedir(b'data/foo.i.hg/bla.i')
35 35 'data/foo.i.hg.hg/bla.i'
36 36 >>> _encodedir(b'data/foo.i\\ndata/foo.i/bla.i\\ndata/foo.i.hg/bla.i\\n')
37 37 'data/foo.i\\ndata/foo.i.hg/bla.i\\ndata/foo.i.hg.hg/bla.i\\n'
38 38 '''
39 39 return (path
40 40 .replace(".hg/", ".hg.hg/")
41 41 .replace(".i/", ".i.hg/")
42 42 .replace(".d/", ".d.hg/"))
43 43
44 44 encodedir = getattr(parsers, 'encodedir', _encodedir)
45 45
46 46 def decodedir(path):
47 47 '''
48 48 >>> decodedir(b'data/foo.i')
49 49 'data/foo.i'
50 50 >>> decodedir(b'data/foo.i.hg/bla.i')
51 51 'data/foo.i/bla.i'
52 52 >>> decodedir(b'data/foo.i.hg.hg/bla.i')
53 53 'data/foo.i.hg/bla.i'
54 54 '''
55 55 if ".hg/" not in path:
56 56 return path
57 57 return (path
58 58 .replace(".d.hg/", ".d/")
59 59 .replace(".i.hg/", ".i/")
60 60 .replace(".hg.hg/", ".hg/"))
61 61
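The two directory codecs are inverses on store paths; with the functions above in scope, a quick sanity sketch:

    for p in ('data/foo.i', 'data/foo.i/bla.i', 'data/foo.d/bar.hg/baz.i'):
        assert decodedir(_encodedir(p)) == p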
62 62 def _reserved():
63 63 ''' characters that are problematic for filesystems
64 64
65 65 * ascii control characters (0..31)
66 66 * ascii hi (126..255)
67 67 * windows specials
68 68
69 69 these characters will be escaped by the encode functions
70 70 '''
71 71 winreserved = [ord(x) for x in u'\\:*?"<>|']
72 72 for x in range(32):
73 73 yield x
74 74 for x in range(126, 256):
75 75 yield x
76 76 for x in winreserved:
77 77 yield x
78 78
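Concretely, the generator yields the control range, the high range, and the Windows-special characters; a quick membership check using the function above:

    reserved = set(_reserved())
    assert ord(':') in reserved and ord('?') in reserved  # windows specials
    assert 0x1f in reserved and 0xff in reserved          # control / hi ascii
    assert ord('a') not in reserved                       # plain ascii passes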
79 79 def _buildencodefun():
80 80 '''
81 81 >>> enc, dec = _buildencodefun()
82 82
83 83 >>> enc(b'nothing/special.txt')
84 84 'nothing/special.txt'
85 85 >>> dec(b'nothing/special.txt')
86 86 'nothing/special.txt'
87 87
88 88 >>> enc(b'HELLO')
89 89 '_h_e_l_l_o'
90 90 >>> dec(b'_h_e_l_l_o')
91 91 'HELLO'
92 92
93 93 >>> enc(b'hello:world?')
94 94 'hello~3aworld~3f'
95 95 >>> dec(b'hello~3aworld~3f')
96 96 'hello:world?'
97 97
98 >>> enc(b'the\x07quick\xADshot')
98 >>> enc(b'the\\x07quick\\xADshot')
99 99 'the~07quick~adshot'
100 100 >>> dec(b'the~07quick~adshot')
101 101 'the\\x07quick\\xadshot'
102 102 '''
103 103 e = '_'
104 104 xchr = pycompat.bytechr
105 105 asciistr = list(map(xchr, range(127)))
106 106 capitals = list(range(ord("A"), ord("Z") + 1))
107 107
108 108 cmap = dict((x, x) for x in asciistr)
109 109 for x in _reserved():
110 110 cmap[xchr(x)] = "~%02x" % x
111 111 for x in capitals + [ord(e)]:
112 112 cmap[xchr(x)] = e + xchr(x).lower()
113 113
114 114 dmap = {}
115 115 for k, v in cmap.iteritems():
116 116 dmap[v] = k
117 117 def decode(s):
118 118 i = 0
119 119 while i < len(s):
120 120 for l in xrange(1, 4):
121 121 try:
122 122 yield dmap[s[i:i + l]]
123 123 i += l
124 124 break
125 125 except KeyError:
126 126 pass
127 127 else:
128 128 raise KeyError
129 129 return (lambda s: ''.join([cmap[s[c:c + 1]] for c in xrange(len(s))]),
130 130 lambda s: ''.join(list(decode(s))))
131 131
132 132 _encodefname, _decodefname = _buildencodefun()
133 133
134 134 def encodefilename(s):
135 135 '''
136 136 >>> encodefilename(b'foo.i/bar.d/bla.hg/hi:world?/HELLO')
137 137 'foo.i.hg/bar.d.hg/bla.hg.hg/hi~3aworld~3f/_h_e_l_l_o'
138 138 '''
139 139 return _encodefname(encodedir(s))
140 140
141 141 def decodefilename(s):
142 142 '''
143 143 >>> decodefilename(b'foo.i.hg/bar.d.hg/bla.hg.hg/hi~3aworld~3f/_h_e_l_l_o')
144 144 'foo.i/bar.d/bla.hg/hi:world?/HELLO'
145 145 '''
146 146 return decodedir(_decodefname(s))
147 147
148 148 def _buildlowerencodefun():
149 149 '''
150 150 >>> f = _buildlowerencodefun()
151 151 >>> f(b'nothing/special.txt')
152 152 'nothing/special.txt'
153 153 >>> f(b'HELLO')
154 154 'hello'
155 155 >>> f(b'hello:world?')
156 156 'hello~3aworld~3f'
157 >>> f(b'the\x07quick\xADshot')
157 >>> f(b'the\\x07quick\\xADshot')
158 158 'the~07quick~adshot'
159 159 '''
160 160 cmap = dict([(chr(x), chr(x)) for x in xrange(127)])
161 161 for x in _reserved():
162 162 cmap[chr(x)] = "~%02x" % x
163 163 for x in range(ord("A"), ord("Z") + 1):
164 164 cmap[chr(x)] = chr(x).lower()
165 165 return lambda s: "".join([cmap[c] for c in s])
166 166
167 167 lowerencode = getattr(parsers, 'lowerencode', None) or _buildlowerencodefun()
168 168
169 169 # Windows reserved names: con, prn, aux, nul, com1..com9, lpt1..lpt9
170 170 _winres3 = ('aux', 'con', 'prn', 'nul') # length 3
171 171 _winres4 = ('com', 'lpt') # length 4 (with trailing 1..9)
172 172 def _auxencode(path, dotencode):
173 173 '''
174 174 Encodes filenames containing names reserved by Windows or which end in
175 175 period or space. Does not touch other single reserved characters c.
176 176 Specifically, c in '\\:*?"<>|' or ord(c) <= 31 are *not* encoded here.
177 177 Additionally encodes space or period at the beginning, if dotencode is
178 178 True. Parameter path is assumed to be all lowercase.
179 179 A segment only needs encoding if a reserved name appears as a
180 180 basename (e.g. "aux", "aux.foo"). A directory or file named "foo.aux"
181 181 doesn't need encoding.
182 182
183 183 >>> s = b'.foo/aux.txt/txt.aux/con/prn/nul/foo.'
184 184 >>> _auxencode(s.split(b'/'), True)
185 185 ['~2efoo', 'au~78.txt', 'txt.aux', 'co~6e', 'pr~6e', 'nu~6c', 'foo~2e']
186 186 >>> s = b'.com1com2/lpt9.lpt4.lpt1/conprn/com0/lpt0/foo.'
187 187 >>> _auxencode(s.split(b'/'), False)
188 188 ['.com1com2', 'lp~749.lpt4.lpt1', 'conprn', 'com0', 'lpt0', 'foo~2e']
189 189 >>> _auxencode([b'foo. '], True)
190 190 ['foo.~20']
191 191 >>> _auxencode([b' .foo'], True)
192 192 ['~20.foo']
193 193 '''
194 194 for i, n in enumerate(path):
195 195 if not n:
196 196 continue
197 197 if dotencode and n[0] in '. ':
198 198 n = "~%02x" % ord(n[0:1]) + n[1:]
199 199 path[i] = n
200 200 else:
201 201 l = n.find('.')
202 202 if l == -1:
203 203 l = len(n)
204 204 if ((l == 3 and n[:3] in _winres3) or
205 205 (l == 4 and n[3:4] <= '9' and n[3:4] >= '1'
206 206 and n[:3] in _winres4)):
207 207 # encode third letter ('aux' -> 'au~78')
208 208 ec = "~%02x" % ord(n[2:3])
209 209 n = n[0:2] + ec + n[3:]
210 210 path[i] = n
211 211 if n[-1] in '. ':
212 212 # encode last period or space ('foo...' -> 'foo..~2e')
213 213 path[i] = n[:-1] + "~%02x" % ord(n[-1:])
214 214 return path
215 215
216 216 _maxstorepathlen = 120
217 217 _dirprefixlen = 8
218 218 _maxshortdirslen = 8 * (_dirprefixlen + 1) - 4
219 219
220 220 def _hashencode(path, dotencode):
221 221 digest = hashlib.sha1(path).hexdigest()
222 222 le = lowerencode(path[5:]).split('/') # skips prefix 'data/' or 'meta/'
223 223 parts = _auxencode(le, dotencode)
224 224 basename = parts[-1]
225 225 _root, ext = os.path.splitext(basename)
226 226 sdirs = []
227 227 sdirslen = 0
228 228 for p in parts[:-1]:
229 229 d = p[:_dirprefixlen]
230 230 if d[-1] in '. ':
231 231 # Windows can't access dirs ending in period or space
232 232 d = d[:-1] + '_'
233 233 if sdirslen == 0:
234 234 t = len(d)
235 235 else:
236 236 t = sdirslen + 1 + len(d)
237 237 if t > _maxshortdirslen:
238 238 break
239 239 sdirs.append(d)
240 240 sdirslen = t
241 241 dirs = '/'.join(sdirs)
242 242 if len(dirs) > 0:
243 243 dirs += '/'
244 244 res = 'dh/' + dirs + digest + ext
245 245 spaceleft = _maxstorepathlen - len(res)
246 246 if spaceleft > 0:
247 247 filler = basename[:spaceleft]
248 248 res = 'dh/' + dirs + filler + digest + ext
249 249 return res
250 250
251 251 def _hybridencode(path, dotencode):
252 252 '''encodes path with a length limit
253 253
254 254 Encodes all paths that begin with 'data/', according to the following.
255 255
256 256 Default encoding (reversible):
257 257
258 258 Encodes all uppercase letters 'X' as '_x'. All reserved or illegal
259 259 characters are encoded as '~xx', where xx is the two digit hex code
260 260 of the character (see encodefilename).
261 261 Relevant path components consisting of Windows reserved filenames are
262 262 masked by encoding the third character ('aux' -> 'au~78', see _auxencode).
263 263
264 264 Hashed encoding (not reversible):
265 265
266 266 If the default-encoded path is longer than _maxstorepathlen, a
267 267 non-reversible hybrid hashing of the path is done instead.
268 268 This encoding uses up to _dirprefixlen characters of all directory
269 269 levels of the lowerencoded path, but not more levels than can fit into
270 270 _maxshortdirslen.
271 271 Then follows the filler followed by the sha digest of the full path.
272 272 The filler is the beginning of the basename of the lowerencoded path
273 273 (the basename is everything after the last path separator). The filler
274 274 is as long as possible, filling in characters from the basename until
275 275 the encoded path has _maxstorepathlen characters (or all chars of the
276 276 basename have been taken).
277 277 The extension (e.g. '.i' or '.d') is preserved.
278 278
279 279 The string 'data/' at the beginning is replaced with 'dh/', if the hashed
280 280 encoding was used.
281 281 '''
282 282 path = encodedir(path)
283 283 ef = _encodefname(path).split('/')
284 284 res = '/'.join(_auxencode(ef, dotencode))
285 285 if len(res) > _maxstorepathlen:
286 286 res = _hashencode(path, dotencode)
287 287 return res
288 288
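The hashed fallback described above can be sketched standalone. This simplified version (names mine) skips lowerencode/_auxencode and the _maxshortdirslen cap on the directory prefix, but shows the budget arithmetic:

    import hashlib
    import posixpath

    def hashencode_sketch(path, maxlen=120, dirprefixlen=8):
        digest = hashlib.sha1(path.encode('utf-8')).hexdigest()
        dirs, basename = posixpath.split(path)
        ext = posixpath.splitext(basename)[1]
        # keep at most dirprefixlen characters of each directory level
        prefix = '/'.join(d[:dirprefixlen] for d in dirs.split('/') if d)
        if prefix:
            prefix += '/'
        res = 'dh/' + prefix + digest + ext
        # fill any remaining budget with the start of the basename
        spaceleft = maxlen - len(res)
        if spaceleft > 0:
            res = 'dh/' + prefix + basename[:spaceleft] + digest + ext
        return res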
289 289 def _pathencode(path):
290 290 de = encodedir(path)
291 291 if len(path) > _maxstorepathlen:
292 292 return _hashencode(de, True)
293 293 ef = _encodefname(de).split('/')
294 294 res = '/'.join(_auxencode(ef, True))
295 295 if len(res) > _maxstorepathlen:
296 296 return _hashencode(de, True)
297 297 return res
298 298
299 299 _pathencode = getattr(parsers, 'pathencode', _pathencode)
300 300
301 301 def _plainhybridencode(f):
302 302 return _hybridencode(f, False)
303 303
304 304 def _calcmode(vfs):
305 305 try:
306 306 # files in .hg/ will be created using this mode
307 307 mode = vfs.stat().st_mode
308 308 # avoid some useless chmods
309 309 if (0o777 & ~util.umask) == (0o777 & mode):
310 310 mode = None
311 311 except OSError:
312 312 mode = None
313 313 return mode
314 314
315 315 _data = ('data meta 00manifest.d 00manifest.i 00changelog.d 00changelog.i'
316 316 ' phaseroots obsstore')
317 317
318 318 class basicstore(object):
319 319 '''base class for local repository stores'''
320 320 def __init__(self, path, vfstype):
321 321 vfs = vfstype(path)
322 322 self.path = vfs.base
323 323 self.createmode = _calcmode(vfs)
324 324 vfs.createmode = self.createmode
325 325 self.rawvfs = vfs
326 326 self.vfs = vfsmod.filtervfs(vfs, encodedir)
327 327 self.opener = self.vfs
328 328
329 329 def join(self, f):
330 330 return self.path + '/' + encodedir(f)
331 331
332 332 def _walk(self, relpath, recurse):
333 333 '''yields (unencoded, encoded, size)'''
334 334 path = self.path
335 335 if relpath:
336 336 path += '/' + relpath
337 337 striplen = len(self.path) + 1
338 338 l = []
339 339 if self.rawvfs.isdir(path):
340 340 visit = [path]
341 341 readdir = self.rawvfs.readdir
342 342 while visit:
343 343 p = visit.pop()
344 344 for f, kind, st in readdir(p, stat=True):
345 345 fp = p + '/' + f
346 346 if kind == stat.S_IFREG and f[-2:] in ('.d', '.i'):
347 347 n = util.pconvert(fp[striplen:])
348 348 l.append((decodedir(n), n, st.st_size))
349 349 elif kind == stat.S_IFDIR and recurse:
350 350 visit.append(fp)
351 351 l.sort()
352 352 return l
353 353
354 354 def datafiles(self):
355 355 return self._walk('data', True) + self._walk('meta', True)
356 356
357 357 def topfiles(self):
358 358 # yield manifest before changelog
359 359 return reversed(self._walk('', False))
360 360
361 361 def walk(self):
362 362 '''yields (unencoded, encoded, size)'''
363 363 # yield data files first
364 364 for x in self.datafiles():
365 365 yield x
366 366 for x in self.topfiles():
367 367 yield x
368 368
369 369 def copylist(self):
370 370 return ['requires'] + _data.split()
371 371
372 372 def write(self, tr):
373 373 pass
374 374
375 375 def invalidatecaches(self):
376 376 pass
377 377
378 378 def markremoved(self, fn):
379 379 pass
380 380
381 381 def __contains__(self, path):
382 382 '''Checks if the store contains path'''
383 383 path = "/".join(("data", path))
384 384 # file?
385 385 if self.vfs.exists(path + ".i"):
386 386 return True
387 387 # dir?
388 388 if not path.endswith("/"):
389 389 path = path + "/"
390 390 return self.vfs.exists(path)
391 391
392 392 class encodedstore(basicstore):
393 393 def __init__(self, path, vfstype):
394 394 vfs = vfstype(path + '/store')
395 395 self.path = vfs.base
396 396 self.createmode = _calcmode(vfs)
397 397 vfs.createmode = self.createmode
398 398 self.rawvfs = vfs
399 399 self.vfs = vfsmod.filtervfs(vfs, encodefilename)
400 400 self.opener = self.vfs
401 401
402 402 def datafiles(self):
403 403 for a, b, size in super(encodedstore, self).datafiles():
404 404 try:
405 405 a = decodefilename(a)
406 406 except KeyError:
407 407 a = None
408 408 yield a, b, size
409 409
410 410 def join(self, f):
411 411 return self.path + '/' + encodefilename(f)
412 412
413 413 def copylist(self):
414 414 return (['requires', '00changelog.i'] +
415 415 ['store/' + f for f in _data.split()])
416 416
417 417 class fncache(object):
418 418 # the filename used to be partially encoded
419 419 # hence the encodedir/decodedir dance
420 420 def __init__(self, vfs):
421 421 self.vfs = vfs
422 422 self.entries = None
423 423 self._dirty = False
424 424
425 425 def _load(self):
426 426 '''fill the entries from the fncache file'''
427 427 self._dirty = False
428 428 try:
429 429 fp = self.vfs('fncache', mode='rb')
430 430 except IOError:
431 431 # skip nonexistent file
432 432 self.entries = set()
433 433 return
434 434 self.entries = set(decodedir(fp.read()).splitlines())
435 435 if '' in self.entries:
436 436 fp.seek(0)
437 437 for n, line in enumerate(util.iterfile(fp)):
438 438 if not line.rstrip('\n'):
439 439 t = _('invalid entry in fncache, line %d') % (n + 1)
440 440 raise error.Abort(t)
441 441 fp.close()
442 442
443 443 def write(self, tr):
444 444 if self._dirty:
445 445 tr.addbackup('fncache')
446 446 fp = self.vfs('fncache', mode='wb', atomictemp=True)
447 447 if self.entries:
448 448 fp.write(encodedir('\n'.join(self.entries) + '\n'))
449 449 fp.close()
450 450 self._dirty = False
451 451
452 452 def add(self, fn):
453 453 if self.entries is None:
454 454 self._load()
455 455 if fn not in self.entries:
456 456 self._dirty = True
457 457 self.entries.add(fn)
458 458
459 459 def remove(self, fn):
460 460 if self.entries is None:
461 461 self._load()
462 462 try:
463 463 self.entries.remove(fn)
464 464 self._dirty = True
465 465 except KeyError:
466 466 pass
467 467
468 468 def __contains__(self, fn):
469 469 if self.entries is None:
470 470 self._load()
471 471 return fn in self.entries
472 472
473 473 def __iter__(self):
474 474 if self.entries is None:
475 475 self._load()
476 476 return iter(self.entries)
477 477
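On disk the fncache is simply newline-separated, dir-encoded store paths, so its contents can be inspected by hand (illustration, not Mercurial API):

    # from a repository root: .hg/store/fncache
    with open('.hg/store/fncache', 'rb') as fp:
        entries = set(fp.read().splitlines())  # entries are still dir-encoded
    # a typical entry looks like b'data/some/file.txt.i'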
478 478 class _fncachevfs(vfsmod.abstractvfs, vfsmod.proxyvfs):
479 479 def __init__(self, vfs, fnc, encode):
480 480 vfsmod.proxyvfs.__init__(self, vfs)
481 481 self.fncache = fnc
482 482 self.encode = encode
483 483
484 484 def __call__(self, path, mode='r', *args, **kw):
485 485 if mode not in ('r', 'rb') and (path.startswith('data/') or
486 486 path.startswith('meta/')):
487 487 self.fncache.add(path)
488 488 return self.vfs(self.encode(path), mode, *args, **kw)
489 489
490 490 def join(self, path):
491 491 if path:
492 492 return self.vfs.join(self.encode(path))
493 493 else:
494 494 return self.vfs.join(path)
495 495
496 496 class fncachestore(basicstore):
497 497 def __init__(self, path, vfstype, dotencode):
498 498 if dotencode:
499 499 encode = _pathencode
500 500 else:
501 501 encode = _plainhybridencode
502 502 self.encode = encode
503 503 vfs = vfstype(path + '/store')
504 504 self.path = vfs.base
505 505 self.pathsep = self.path + '/'
506 506 self.createmode = _calcmode(vfs)
507 507 vfs.createmode = self.createmode
508 508 self.rawvfs = vfs
509 509 fnc = fncache(vfs)
510 510 self.fncache = fnc
511 511 self.vfs = _fncachevfs(vfs, fnc, encode)
512 512 self.opener = self.vfs
513 513
514 514 def join(self, f):
515 515 return self.pathsep + self.encode(f)
516 516
517 517 def getsize(self, path):
518 518 return self.rawvfs.stat(path).st_size
519 519
520 520 def datafiles(self):
521 521 for f in sorted(self.fncache):
522 522 ef = self.encode(f)
523 523 try:
524 524 yield f, ef, self.getsize(ef)
525 525 except OSError as err:
526 526 if err.errno != errno.ENOENT:
527 527 raise
528 528
529 529 def copylist(self):
530 530 d = ('data meta dh fncache phaseroots obsstore'
531 531 ' 00manifest.d 00manifest.i 00changelog.d 00changelog.i')
532 532 return (['requires', '00changelog.i'] +
533 533 ['store/' + f for f in d.split()])
534 534
535 535 def write(self, tr):
536 536 self.fncache.write(tr)
537 537
538 538 def invalidatecaches(self):
539 539 self.fncache.entries = None
540 540
541 541 def markremoved(self, fn):
542 542 self.fncache.remove(fn)
543 543
544 544 def _exists(self, f):
545 545 ef = self.encode(f)
546 546 try:
547 547 self.getsize(ef)
548 548 return True
549 549 except OSError as err:
550 550 if err.errno != errno.ENOENT:
551 551 raise
552 552 # nonexistent entry
553 553 return False
554 554
555 555 def __contains__(self, path):
556 556 '''Checks if the store contains path'''
557 557 path = "/".join(("data", path))
558 558 # check for files (exact match)
559 559 e = path + '.i'
560 560 if e in self.fncache and self._exists(e):
561 561 return True
562 562 # now check for directories (prefix match)
563 563 if not path.endswith('/'):
564 564 path += '/'
565 565 for e in self.fncache:
566 566 if e.startswith(path) and self._exists(e):
567 567 return True
568 568 return False
569 569
570 570 def store(requirements, path, vfstype):
571 571 if 'store' in requirements:
572 572 if 'fncache' in requirements:
573 573 return fncachestore(path, vfstype, 'dotencode' in requirements)
574 574 return encodedstore(path, vfstype)
575 575 return basicstore(path, vfstype)
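The selection above, restated as a tiny standalone table for illustration (labels instead of store objects):

    def pick_store(requirements):
        # mirrors store() above; 'dotencode' only toggles the path encoder
        if 'store' in requirements:
            if 'fncache' in requirements:
                return 'fncachestore'
            return 'encodedstore'
        return 'basicstore'

    assert pick_store({'store', 'fncache', 'dotencode'}) == 'fncachestore'
    assert pick_store({'store'}) == 'encodedstore'
    assert pick_store(set()) == 'basicstore'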