censor: implement censoring for revlogv2...
marmoute - r48250:f8330a3f default
@@ -1,3297 +1,3293 @@
1 1 # revlog.py - storage back-end for mercurial
2 2 # coding: utf8
3 3 #
4 4 # Copyright 2005-2007 Olivia Mackall <olivia@selenic.com>
5 5 #
6 6 # This software may be used and distributed according to the terms of the
7 7 # GNU General Public License version 2 or any later version.
8 8
9 9 """Storage back-end for Mercurial.
10 10
11 11 This provides efficient delta storage with O(1) retrieve and append
12 12 and O(changes) merge between branches.
13 13 """
14 14
15 15 from __future__ import absolute_import
16 16
17 17 import binascii
18 18 import collections
19 19 import contextlib
20 20 import errno
21 21 import io
22 22 import os
23 23 import struct
24 24 import zlib
25 25
26 26 # import stuff from node for others to import from revlog
27 27 from .node import (
28 28 bin,
29 29 hex,
30 30 nullrev,
31 31 sha1nodeconstants,
32 32 short,
33 33 wdirrev,
34 34 )
35 35 from .i18n import _
36 36 from .pycompat import getattr
37 37 from .revlogutils.constants import (
38 38 ALL_KINDS,
39 39 CHANGELOGV2,
40 40 COMP_MODE_DEFAULT,
41 41 COMP_MODE_INLINE,
42 42 COMP_MODE_PLAIN,
43 43 FEATURES_BY_VERSION,
44 44 FLAG_GENERALDELTA,
45 45 FLAG_INLINE_DATA,
46 46 INDEX_HEADER,
47 47 KIND_CHANGELOG,
48 48 REVLOGV0,
49 49 REVLOGV1,
50 50 REVLOGV1_FLAGS,
51 51 REVLOGV2,
52 52 REVLOGV2_FLAGS,
53 53 REVLOG_DEFAULT_FLAGS,
54 54 REVLOG_DEFAULT_FORMAT,
55 55 REVLOG_DEFAULT_VERSION,
56 56 SUPPORTED_FLAGS,
57 57 )
58 58 from .revlogutils.flagutil import (
59 59 REVIDX_DEFAULT_FLAGS,
60 60 REVIDX_ELLIPSIS,
61 61 REVIDX_EXTSTORED,
62 62 REVIDX_FLAGS_ORDER,
63 63 REVIDX_HASCOPIESINFO,
64 64 REVIDX_ISCENSORED,
65 65 REVIDX_RAWTEXT_CHANGING_FLAGS,
66 66 )
67 67 from .thirdparty import attr
68 68 from . import (
69 69 ancestor,
70 70 dagop,
71 71 error,
72 72 mdiff,
73 73 policy,
74 74 pycompat,
75 75 revlogutils,
76 76 templatefilters,
77 77 util,
78 78 )
79 79 from .interfaces import (
80 80 repository,
81 81 util as interfaceutil,
82 82 )
83 83 from .revlogutils import (
84 84 censor,
85 85 deltas as deltautil,
86 86 docket as docketutil,
87 87 flagutil,
88 88 nodemap as nodemaputil,
89 89 randomaccessfile,
90 90 revlogv0,
91 91 sidedata as sidedatautil,
92 92 )
93 93 from .utils import (
94 94 storageutil,
95 95 stringutil,
96 96 )
97 97
98 98 # blanket usage of all the names to prevent pyflakes "unused import" complaints
99 99 # We need these names available in the module for extensions.
100 100
101 101 REVLOGV0
102 102 REVLOGV1
103 103 REVLOGV2
104 104 FLAG_INLINE_DATA
105 105 FLAG_GENERALDELTA
106 106 REVLOG_DEFAULT_FLAGS
107 107 REVLOG_DEFAULT_FORMAT
108 108 REVLOG_DEFAULT_VERSION
109 109 REVLOGV1_FLAGS
110 110 REVLOGV2_FLAGS
111 111 REVIDX_ISCENSORED
112 112 REVIDX_ELLIPSIS
113 113 REVIDX_HASCOPIESINFO
114 114 REVIDX_EXTSTORED
115 115 REVIDX_DEFAULT_FLAGS
116 116 REVIDX_FLAGS_ORDER
117 117 REVIDX_RAWTEXT_CHANGING_FLAGS
118 118
119 119 parsers = policy.importmod('parsers')
120 120 rustancestor = policy.importrust('ancestor')
121 121 rustdagop = policy.importrust('dagop')
122 122 rustrevlog = policy.importrust('revlog')
123 123
124 124 # Aliased for performance.
125 125 _zlibdecompress = zlib.decompress
126 126
127 127 # max size of revlog with inline data
128 128 _maxinline = 131072
129 129
130 130 # Flag processors for REVIDX_ELLIPSIS.
131 131 def ellipsisreadprocessor(rl, text):
132 132 return text, False
133 133
134 134
135 135 def ellipsiswriteprocessor(rl, text):
136 136 return text, False
137 137
138 138
139 139 def ellipsisrawprocessor(rl, text):
140 140 return False
141 141
142 142
143 143 ellipsisprocessor = (
144 144 ellipsisreadprocessor,
145 145 ellipsiswriteprocessor,
146 146 ellipsisrawprocessor,
147 147 )
148 148
149 149
150 150 def _verify_revision(rl, skipflags, state, node):
151 151 """Verify the integrity of the given revlog ``node`` while providing a hook
152 152 point for extensions to influence the operation."""
153 153 if skipflags:
154 154 state[b'skipread'].add(node)
155 155 else:
156 156 # Side-effect: read content and verify hash.
157 157 rl.revision(node)
158 158
159 159
160 160 # True if a fast implementation for persistent-nodemap is available
161 161 #
162 162 # We also consider that we have a "fast" implementation in "pure" python because
163 163 # people using pure python don't really have performance considerations (and a
164 164 # wheelbarrow of other slowness sources)
165 165 HAS_FAST_PERSISTENT_NODEMAP = rustrevlog is not None or util.safehasattr(
166 166 parsers, 'BaseIndexObject'
167 167 )
168 168
169 169
170 170 @interfaceutil.implementer(repository.irevisiondelta)
171 171 @attr.s(slots=True)
172 172 class revlogrevisiondelta(object):
173 173 node = attr.ib()
174 174 p1node = attr.ib()
175 175 p2node = attr.ib()
176 176 basenode = attr.ib()
177 177 flags = attr.ib()
178 178 baserevisionsize = attr.ib()
179 179 revision = attr.ib()
180 180 delta = attr.ib()
181 181 sidedata = attr.ib()
182 182 protocol_flags = attr.ib()
183 183 linknode = attr.ib(default=None)
184 184
185 185
186 186 @interfaceutil.implementer(repository.iverifyproblem)
187 187 @attr.s(frozen=True)
188 188 class revlogproblem(object):
189 189 warning = attr.ib(default=None)
190 190 error = attr.ib(default=None)
191 191 node = attr.ib(default=None)
192 192
193 193
194 194 def parse_index_v1(data, inline):
195 195 # call the C implementation to parse the index data
196 196 index, cache = parsers.parse_index2(data, inline)
197 197 return index, cache
198 198
199 199
200 200 def parse_index_v2(data, inline):
201 201 # call the C implementation to parse the index data
202 202 index, cache = parsers.parse_index2(data, inline, revlogv2=True)
203 203 return index, cache
204 204
205 205
206 206 def parse_index_cl_v2(data, inline):
207 207 # call the C implementation to parse the index data
208 208 assert not inline
209 209 from .pure.parsers import parse_index_cl_v2
210 210
211 211 index, cache = parse_index_cl_v2(data)
212 212 return index, cache
213 213
214 214
215 215 if util.safehasattr(parsers, 'parse_index_devel_nodemap'):
216 216
217 217 def parse_index_v1_nodemap(data, inline):
218 218 index, cache = parsers.parse_index_devel_nodemap(data, inline)
219 219 return index, cache
220 220
221 221
222 222 else:
223 223 parse_index_v1_nodemap = None
224 224
225 225
226 226 def parse_index_v1_mixed(data, inline):
227 227 index, cache = parse_index_v1(data, inline)
228 228 return rustrevlog.MixedIndex(index), cache
229 229
230 230
231 231 # corresponds to uncompressed length of indexformatng (2 gigs, 4-byte
232 232 # signed integer)
233 233 _maxentrysize = 0x7FFFFFFF
234 234
235 235 FILE_TOO_SHORT_MSG = _(
236 236 b'cannot read from revlog %s;'
237 237 b' expected %d bytes from offset %d, data size is %d'
238 238 )
239 239
240 240
241 241 class revlog(object):
242 242 """
243 243 the underlying revision storage object
244 244
245 245 A revlog consists of two parts, an index and the revision data.
246 246
247 247 The index is a file with a fixed record size containing
248 248 information on each revision, including its nodeid (hash), the
249 249 nodeids of its parents, the position and offset of its data within
250 250 the data file, and the revision it's based on. Finally, each entry
251 251 contains a linkrev entry that can serve as a pointer to external
252 252 data.
253 253
254 254 The revision data itself is a linear collection of data chunks.
255 255 Each chunk represents a revision and is usually represented as a
256 256 delta against the previous chunk. To bound lookup time, runs of
257 257 deltas are limited to about 2 times the length of the original
258 258 version data. This makes retrieval of a version proportional to
259 259 its size, or O(1) relative to the number of revisions.
260 260
261 261 Both pieces of the revlog are written to in an append-only
262 262 fashion, which means we never need to rewrite a file to insert or
263 263 remove data, and can use some simple techniques to avoid the need
264 264 for locking while reading.
265 265
266 266 If checkambig, indexfile is opened with checkambig=True at
267 267 writing, to avoid file stat ambiguity.
268 268
269 269 If mmaplargeindex is True, and an mmapindexthreshold is set, the
270 270 index will be mmapped rather than read if it is larger than the
271 271 configured threshold.
272 272
273 273 If censorable is True, the revlog can have censored revisions.
274 274
275 275 If `upperboundcomp` is not None, this is the expected maximal gain from
276 276 compression for the data content.
277 277
278 278 `concurrencychecker` is an optional function that receives 3 arguments: a
279 279 file handle, a filename, and an expected position. It should check whether
280 280 the current position in the file handle is valid, and log/warn/fail (by
281 281 raising).
282 282
283 283 See mercurial/revlogutils/constants.py for details about the content of an
284 284 index entry.
285 285 """
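# Quick reference for the index entry tuple, derived only from the accessors
# further down in this class (start(), flags(), length(), rawsize(),
# chainbase(), linkrev(), parentrevs(), node(), sidedata_cut_off(),
# sidedata_length(), _chunk()); see mercurial/revlogutils/constants.py for
# the authoritative layout:
#
#   entry[0]        offset (upper bits) and flags (lower 16 bits), packed
#   entry[1]        length of the stored (possibly compressed) revision data
#   entry[2]        uncompressed length of the revision text (rawsize)
#   entry[3]        base revision of the delta chain
#   entry[4]        linkrev
#   entry[5], [6]   parent revisions
#   entry[7]        node id
#   entry[8]        sidedata offset
#   entry[9]        sidedata length
#   entry[10]       compression mode of the data chunk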
286 286
287 287 _flagserrorclass = error.RevlogError
288 288
289 289 def __init__(
290 290 self,
291 291 opener,
292 292 target,
293 293 radix,
294 294 postfix=None, # only exists for `tmpcensored` now
295 295 checkambig=False,
296 296 mmaplargeindex=False,
297 297 censorable=False,
298 298 upperboundcomp=None,
299 299 persistentnodemap=False,
300 300 concurrencychecker=None,
301 301 trypending=False,
302 302 ):
303 303 """
304 304 create a revlog object
305 305
306 306 opener is a function that abstracts the file opening operation
307 307 and can be used to implement COW semantics or the like.
308 308
309 309 `target`: a (KIND, ID) tuple that identifies the content stored in
310 310 this revlog. It helps the rest of the code understand what the revlog
311 311 is about without having to resort to heuristics and index filename
312 312 analysis. Note that this must reliably be set by normal code, but
313 313 test, debug, or performance measurement code might not set it to an
314 314 accurate value.
315 315 """
316 316 self.upperboundcomp = upperboundcomp
317 317
318 318 self.radix = radix
319 319
320 320 self._docket_file = None
321 321 self._indexfile = None
322 322 self._datafile = None
323 323 self._sidedatafile = None
324 324 self._nodemap_file = None
325 325 self.postfix = postfix
326 326 self._trypending = trypending
327 327 self.opener = opener
328 328 if persistentnodemap:
329 329 self._nodemap_file = nodemaputil.get_nodemap_file(self)
330 330
331 331 assert target[0] in ALL_KINDS
332 332 assert len(target) == 2
333 333 self.target = target
334 334 # When True, indexfile is opened with checkambig=True at writing, to
335 335 # avoid file stat ambiguity.
336 336 self._checkambig = checkambig
337 337 self._mmaplargeindex = mmaplargeindex
338 338 self._censorable = censorable
339 339 # 3-tuple of (node, rev, text) for a raw revision.
340 340 self._revisioncache = None
341 341 # Maps rev to chain base rev.
342 342 self._chainbasecache = util.lrucachedict(100)
343 343 # 2-tuple of (offset, data) of raw data from the revlog at an offset.
344 344 self._chunkcache = (0, b'')
345 345 # How much data to read and cache into the raw revlog data cache.
346 346 self._chunkcachesize = 65536
347 347 self._maxchainlen = None
348 348 self._deltabothparents = True
349 349 self.index = None
350 350 self._docket = None
351 351 self._nodemap_docket = None
352 352 # Mapping of partial identifiers to full nodes.
353 353 self._pcache = {}
355 355 self._compengine = b'zlib'
356 356 self._compengineopts = {}
357 357 self._maxdeltachainspan = -1
358 358 self._withsparseread = False
359 359 self._sparserevlog = False
360 360 self.hassidedata = False
361 361 self._srdensitythreshold = 0.50
362 362 self._srmingapsize = 262144
363 363
364 364 # Make copy of flag processors so each revlog instance can support
365 365 # custom flags.
366 366 self._flagprocessors = dict(flagutil.flagprocessors)
367 367
368 368 # 3-tuple of file handles being used for active writing.
369 369 self._writinghandles = None
370 370 # prevent nesting of addgroup
371 371 self._adding_group = None
372 372
373 373 self._loadindex()
374 374
375 375 self._concurrencychecker = concurrencychecker
376 376
377 377 def _init_opts(self):
378 378 """process options (from above/config) to set up the associated default revlog mode
379 379
380 380 These values might be affected when actually reading on-disk information.
381 381
382 382 The relevant values are returned for use in _loadindex().
383 383
384 384 * newversionflags:
385 385 version header to use if we need to create a new revlog
386 386
387 387 * mmapindexthreshold:
388 388 minimal index size at which to start using mmap
389 389
390 390 * force_nodemap:
391 391 force the usage of a "development" version of the nodemap code
392 392 """
393 393 mmapindexthreshold = None
394 394 opts = self.opener.options
395 395
396 396 if b'changelogv2' in opts and self.revlog_kind == KIND_CHANGELOG:
397 397 new_header = CHANGELOGV2
398 398 elif b'revlogv2' in opts:
399 399 new_header = REVLOGV2
400 400 elif b'revlogv1' in opts:
401 401 new_header = REVLOGV1 | FLAG_INLINE_DATA
402 402 if b'generaldelta' in opts:
403 403 new_header |= FLAG_GENERALDELTA
404 404 elif b'revlogv0' in self.opener.options:
405 405 new_header = REVLOGV0
406 406 else:
407 407 new_header = REVLOG_DEFAULT_VERSION
408 408
409 409 if b'chunkcachesize' in opts:
410 410 self._chunkcachesize = opts[b'chunkcachesize']
411 411 if b'maxchainlen' in opts:
412 412 self._maxchainlen = opts[b'maxchainlen']
413 413 if b'deltabothparents' in opts:
414 414 self._deltabothparents = opts[b'deltabothparents']
415 415 self._lazydelta = bool(opts.get(b'lazydelta', True))
416 416 self._lazydeltabase = False
417 417 if self._lazydelta:
418 418 self._lazydeltabase = bool(opts.get(b'lazydeltabase', False))
419 419 if b'compengine' in opts:
420 420 self._compengine = opts[b'compengine']
421 421 if b'zlib.level' in opts:
422 422 self._compengineopts[b'zlib.level'] = opts[b'zlib.level']
423 423 if b'zstd.level' in opts:
424 424 self._compengineopts[b'zstd.level'] = opts[b'zstd.level']
425 425 if b'maxdeltachainspan' in opts:
426 426 self._maxdeltachainspan = opts[b'maxdeltachainspan']
427 427 if self._mmaplargeindex and b'mmapindexthreshold' in opts:
428 428 mmapindexthreshold = opts[b'mmapindexthreshold']
429 429 self._sparserevlog = bool(opts.get(b'sparse-revlog', False))
430 430 withsparseread = bool(opts.get(b'with-sparse-read', False))
431 431 # sparse-revlog forces sparse-read
432 432 self._withsparseread = self._sparserevlog or withsparseread
433 433 if b'sparse-read-density-threshold' in opts:
434 434 self._srdensitythreshold = opts[b'sparse-read-density-threshold']
435 435 if b'sparse-read-min-gap-size' in opts:
436 436 self._srmingapsize = opts[b'sparse-read-min-gap-size']
437 437 if opts.get(b'enableellipsis'):
438 438 self._flagprocessors[REVIDX_ELLIPSIS] = ellipsisprocessor
439 439
440 440 # revlog v0 doesn't have flag processors
441 441 for flag, processor in pycompat.iteritems(
442 442 opts.get(b'flagprocessors', {})
443 443 ):
444 444 flagutil.insertflagprocessor(flag, processor, self._flagprocessors)
445 445
446 446 if self._chunkcachesize <= 0:
447 447 raise error.RevlogError(
448 448 _(b'revlog chunk cache size %r is not greater than 0')
449 449 % self._chunkcachesize
450 450 )
451 451 elif self._chunkcachesize & (self._chunkcachesize - 1):
452 452 raise error.RevlogError(
453 453 _(b'revlog chunk cache size %r is not a power of 2')
454 454 % self._chunkcachesize
455 455 )
456 456 force_nodemap = opts.get(b'devel-force-nodemap', False)
457 457 return new_header, mmapindexthreshold, force_nodemap
458 458
459 459 def _get_data(self, filepath, mmap_threshold, size=None):
460 460 """return a file's content, with or without mmap
461 461
462 462 If the file is missing, return an empty string"""
463 463 try:
464 464 with self.opener(filepath) as fp:
465 465 if mmap_threshold is not None:
466 466 file_size = self.opener.fstat(fp).st_size
467 467 if file_size >= mmap_threshold:
468 468 if size is not None:
469 469 # avoid potential mmap crash
470 470 size = min(file_size, size)
471 471 # TODO: should .close() to release resources without
472 472 # relying on Python GC
473 473 if size is None:
474 474 return util.buffer(util.mmapread(fp))
475 475 else:
476 476 return util.buffer(util.mmapread(fp, size))
477 477 if size is None:
478 478 return fp.read()
479 479 else:
480 480 return fp.read(size)
481 481 except IOError as inst:
482 482 if inst.errno != errno.ENOENT:
483 483 raise
484 484 return b''
485 485
486 486 def _loadindex(self, docket=None):
487 487
488 488 new_header, mmapindexthreshold, force_nodemap = self._init_opts()
489 489
490 490 if self.postfix is not None:
491 491 entry_point = b'%s.i.%s' % (self.radix, self.postfix)
492 492 elif self._trypending and self.opener.exists(b'%s.i.a' % self.radix):
493 493 entry_point = b'%s.i.a' % self.radix
494 494 else:
495 495 entry_point = b'%s.i' % self.radix
496 496
497 497 if docket is not None:
498 498 self._docket = docket
499 499 self._docket_file = entry_point
500 500 else:
501 501 entry_data = b''
502 502 self._initempty = True
503 503 entry_data = self._get_data(entry_point, mmapindexthreshold)
504 504 if len(entry_data) > 0:
505 505 header = INDEX_HEADER.unpack(entry_data[:4])[0]
506 506 self._initempty = False
507 507 else:
508 508 header = new_header
509 509
510 510 self._format_flags = header & ~0xFFFF
511 511 self._format_version = header & 0xFFFF
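# For instance, with the usual constant values (REVLOGV1 == 1,
# FLAG_INLINE_DATA == 1 << 16, FLAG_GENERALDELTA == 1 << 17), an inline,
# general-delta revlogv1 header of 0x00030001 splits into
# _format_flags == 0x00030000 and _format_version == 0x0001.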
512 512
513 513 supported_flags = SUPPORTED_FLAGS.get(self._format_version)
514 514 if supported_flags is None:
515 515 msg = _(b'unknown version (%d) in revlog %s')
516 516 msg %= (self._format_version, self.display_id)
517 517 raise error.RevlogError(msg)
518 518 elif self._format_flags & ~supported_flags:
519 519 msg = _(b'unknown flags (%#04x) in version %d revlog %s')
520 520 display_flag = self._format_flags >> 16
521 521 msg %= (display_flag, self._format_version, self.display_id)
522 522 raise error.RevlogError(msg)
523 523
524 524 features = FEATURES_BY_VERSION[self._format_version]
525 525 self._inline = features[b'inline'](self._format_flags)
526 526 self._generaldelta = features[b'generaldelta'](self._format_flags)
527 527 self.hassidedata = features[b'sidedata']
528 528
529 529 if not features[b'docket']:
530 530 self._indexfile = entry_point
531 531 index_data = entry_data
532 532 else:
533 533 self._docket_file = entry_point
534 534 if self._initempty:
535 535 self._docket = docketutil.default_docket(self, header)
536 536 else:
537 537 self._docket = docketutil.parse_docket(
538 538 self, entry_data, use_pending=self._trypending
539 539 )
540 540
541 541 if self._docket is not None:
542 542 self._indexfile = self._docket.index_filepath()
543 543 index_data = b''
544 544 index_size = self._docket.index_end
545 545 if index_size > 0:
546 546 index_data = self._get_data(
547 547 self._indexfile, mmapindexthreshold, size=index_size
548 548 )
549 549 if len(index_data) < index_size:
550 550 msg = _(b'too few index data for %s: got %d, expected %d')
551 551 msg %= (self.display_id, len(index_data), index_size)
552 552 raise error.RevlogError(msg)
553 553
554 554 self._inline = False
555 555 # generaldelta is implied by version 2 revlogs.
556 556 self._generaldelta = True
557 557 # the logic for persistent nodemap will be dealt with within the
558 558 # main docket, so disable it for now.
559 559 self._nodemap_file = None
560 560
561 561 if self._docket is not None:
562 562 self._datafile = self._docket.data_filepath()
563 563 self._sidedatafile = self._docket.sidedata_filepath()
564 564 elif self.postfix is None:
565 565 self._datafile = b'%s.d' % self.radix
566 566 else:
567 567 self._datafile = b'%s.d.%s' % (self.radix, self.postfix)
568 568
569 569 self.nodeconstants = sha1nodeconstants
570 570 self.nullid = self.nodeconstants.nullid
571 571
572 572 # sparse-revlog can't be on without general-delta (issue6056)
573 573 if not self._generaldelta:
574 574 self._sparserevlog = False
575 575
576 576 self._storedeltachains = True
577 577
578 578 devel_nodemap = (
579 579 self._nodemap_file
580 580 and force_nodemap
581 581 and parse_index_v1_nodemap is not None
582 582 )
583 583
584 584 use_rust_index = False
585 585 if rustrevlog is not None:
586 586 if self._nodemap_file is not None:
587 587 use_rust_index = True
588 588 else:
589 589 use_rust_index = self.opener.options.get(b'rust.index')
590 590
591 591 self._parse_index = parse_index_v1
592 592 if self._format_version == REVLOGV0:
593 593 self._parse_index = revlogv0.parse_index_v0
594 594 elif self._format_version == REVLOGV2:
595 595 self._parse_index = parse_index_v2
596 596 elif self._format_version == CHANGELOGV2:
597 597 self._parse_index = parse_index_cl_v2
598 598 elif devel_nodemap:
599 599 self._parse_index = parse_index_v1_nodemap
600 600 elif use_rust_index:
601 601 self._parse_index = parse_index_v1_mixed
602 602 try:
603 603 d = self._parse_index(index_data, self._inline)
604 604 index, chunkcache = d
605 605 use_nodemap = (
606 606 not self._inline
607 607 and self._nodemap_file is not None
608 608 and util.safehasattr(index, 'update_nodemap_data')
609 609 )
610 610 if use_nodemap:
611 611 nodemap_data = nodemaputil.persisted_data(self)
612 612 if nodemap_data is not None:
613 613 docket = nodemap_data[0]
614 614 if (
615 615 len(d[0]) > docket.tip_rev
616 616 and d[0][docket.tip_rev][7] == docket.tip_node
617 617 ):
618 618 # no changelog tampering
619 619 self._nodemap_docket = docket
620 620 index.update_nodemap_data(*nodemap_data)
621 621 except (ValueError, IndexError):
622 622 raise error.RevlogError(
623 623 _(b"index %s is corrupted") % self.display_id
624 624 )
625 625 self.index = index
626 626 self._segmentfile = randomaccessfile.randomaccessfile(
627 627 self.opener,
628 628 (self._indexfile if self._inline else self._datafile),
629 629 self._chunkcachesize,
630 630 chunkcache,
631 631 )
632 632 self._segmentfile_sidedata = randomaccessfile.randomaccessfile(
633 633 self.opener,
634 634 self._sidedatafile,
635 635 self._chunkcachesize,
636 636 )
637 637 # revnum -> (chain-length, sum-delta-length)
638 638 self._chaininfocache = util.lrucachedict(500)
639 639 # revlog header -> revlog compressor
640 640 self._decompressors = {}
641 641
642 642 @util.propertycache
643 643 def revlog_kind(self):
644 644 return self.target[0]
645 645
646 646 @util.propertycache
647 647 def display_id(self):
648 648 """The public facing "ID" of the revlog that we use in message"""
649 649 # Maybe we should build a user facing representation of
650 650 # revlog.target instead of using `self.radix`
651 651 return self.radix
652 652
653 653 def _get_decompressor(self, t):
654 654 try:
655 655 compressor = self._decompressors[t]
656 656 except KeyError:
657 657 try:
658 658 engine = util.compengines.forrevlogheader(t)
659 659 compressor = engine.revlogcompressor(self._compengineopts)
660 660 self._decompressors[t] = compressor
661 661 except KeyError:
662 662 raise error.RevlogError(
663 663 _(b'unknown compression type %s') % binascii.hexlify(t)
664 664 )
665 665 return compressor
666 666
667 667 @util.propertycache
668 668 def _compressor(self):
669 669 engine = util.compengines[self._compengine]
670 670 return engine.revlogcompressor(self._compengineopts)
671 671
672 672 @util.propertycache
673 673 def _decompressor(self):
674 674 """the default decompressor"""
675 675 if self._docket is None:
676 676 return None
677 677 t = self._docket.default_compression_header
678 678 c = self._get_decompressor(t)
679 679 return c.decompress
680 680
681 681 def _indexfp(self):
682 682 """file object for the revlog's index file"""
683 683 return self.opener(self._indexfile, mode=b"r")
684 684
685 685 def __index_write_fp(self):
686 686 # You should not use this directly; use `_writing` instead
687 687 try:
688 688 f = self.opener(
689 689 self._indexfile, mode=b"r+", checkambig=self._checkambig
690 690 )
691 691 if self._docket is None:
692 692 f.seek(0, os.SEEK_END)
693 693 else:
694 694 f.seek(self._docket.index_end, os.SEEK_SET)
695 695 return f
696 696 except IOError as inst:
697 697 if inst.errno != errno.ENOENT:
698 698 raise
699 699 return self.opener(
700 700 self._indexfile, mode=b"w+", checkambig=self._checkambig
701 701 )
702 702
703 703 def __index_new_fp(self):
704 704 # You should not use this unless you are upgrading from inline revlog
705 705 return self.opener(
706 706 self._indexfile,
707 707 mode=b"w",
708 708 checkambig=self._checkambig,
709 709 atomictemp=True,
710 710 )
711 711
712 712 def _datafp(self, mode=b'r'):
713 713 """file object for the revlog's data file"""
714 714 return self.opener(self._datafile, mode=mode)
715 715
716 716 @contextlib.contextmanager
717 717 def _sidedatareadfp(self):
718 718 """file object suitable to read sidedata"""
719 719 if self._writinghandles:
720 720 yield self._writinghandles[2]
721 721 else:
722 722 with self.opener(self._sidedatafile) as fp:
723 723 yield fp
724 724
725 725 def tiprev(self):
726 726 return len(self.index) - 1
727 727
728 728 def tip(self):
729 729 return self.node(self.tiprev())
730 730
731 731 def __contains__(self, rev):
732 732 return 0 <= rev < len(self)
733 733
734 734 def __len__(self):
735 735 return len(self.index)
736 736
737 737 def __iter__(self):
738 738 return iter(pycompat.xrange(len(self)))
739 739
740 740 def revs(self, start=0, stop=None):
741 741 """iterate over all revs in this revlog (from start to stop)"""
742 742 return storageutil.iterrevs(len(self), start=start, stop=stop)
743 743
744 744 @property
745 745 def nodemap(self):
746 746 msg = (
747 747 b"revlog.nodemap is deprecated, "
748 748 b"use revlog.index.[has_node|rev|get_rev]"
749 749 )
750 750 util.nouideprecwarn(msg, b'5.3', stacklevel=2)
751 751 return self.index.nodemap
752 752
753 753 @property
754 754 def _nodecache(self):
755 755 msg = b"revlog._nodecache is deprecated, use revlog.index.nodemap"
756 756 util.nouideprecwarn(msg, b'5.3', stacklevel=2)
757 757 return self.index.nodemap
758 758
759 759 def hasnode(self, node):
760 760 try:
761 761 self.rev(node)
762 762 return True
763 763 except KeyError:
764 764 return False
765 765
766 766 def candelta(self, baserev, rev):
767 767 """whether two revisions (baserev, rev) can be delta-ed or not"""
768 768 # Disable delta if either rev requires a content-changing flag
769 769 # processor (ex. LFS). This is because such flag processor can alter
770 770 # the rawtext content that the delta will be based on, and two clients
771 771 # could have a same revlog node with different flags (i.e. different
772 772 # rawtext contents) and the delta could be incompatible.
773 773 if (self.flags(baserev) & REVIDX_RAWTEXT_CHANGING_FLAGS) or (
774 774 self.flags(rev) & REVIDX_RAWTEXT_CHANGING_FLAGS
775 775 ):
776 776 return False
777 777 return True
778 778
779 779 def update_caches(self, transaction):
780 780 if self._nodemap_file is not None:
781 781 if transaction is None:
782 782 nodemaputil.update_persistent_nodemap(self)
783 783 else:
784 784 nodemaputil.setup_persistent_nodemap(transaction, self)
785 785
786 786 def clearcaches(self):
787 787 self._revisioncache = None
788 788 self._chainbasecache.clear()
789 789 self._segmentfile.clear_cache()
790 790 self._segmentfile_sidedata.clear_cache()
791 791 self._pcache = {}
792 792 self._nodemap_docket = None
793 793 self.index.clearcaches()
794 794 # The python code is the one responsible for validating the docket, so we
795 795 # end up having to refresh it here.
796 796 use_nodemap = (
797 797 not self._inline
798 798 and self._nodemap_file is not None
799 799 and util.safehasattr(self.index, 'update_nodemap_data')
800 800 )
801 801 if use_nodemap:
802 802 nodemap_data = nodemaputil.persisted_data(self)
803 803 if nodemap_data is not None:
804 804 self._nodemap_docket = nodemap_data[0]
805 805 self.index.update_nodemap_data(*nodemap_data)
806 806
807 807 def rev(self, node):
808 808 try:
809 809 return self.index.rev(node)
810 810 except TypeError:
811 811 raise
812 812 except error.RevlogError:
813 813 # parsers.c radix tree lookup failed
814 814 if (
815 815 node == self.nodeconstants.wdirid
816 816 or node in self.nodeconstants.wdirfilenodeids
817 817 ):
818 818 raise error.WdirUnsupported
819 819 raise error.LookupError(node, self.display_id, _(b'no node'))
820 820
821 821 # Accessors for index entries.
822 822
823 823 # First tuple entry is 8 bytes. First 6 bytes are offset. Last 2 bytes
824 824 # are flags.
825 825 def start(self, rev):
826 826 return int(self.index[rev][0] >> 16)
827 827
828 828 def sidedata_cut_off(self, rev):
829 829 sd_cut_off = self.index[rev][8]
830 830 if sd_cut_off != 0:
831 831 return sd_cut_off
832 832 # This is some annoying dance, because entries without sidedata
833 833 # currently use 0 as their offset (instead of previous-offset +
834 834 # previous-size).
835 835 #
836 836 # We should reconsider this "sidedata → 0 sidedata_offset" policy.
837 837 # In the meantime, we need this.
838 838 while 0 <= rev:
839 839 e = self.index[rev]
840 840 if e[9] != 0:
841 841 return e[8] + e[9]
842 842 rev -= 1
843 843 return 0
844 844
845 845 def flags(self, rev):
846 846 return self.index[rev][0] & 0xFFFF
847 847
848 848 def length(self, rev):
849 849 return self.index[rev][1]
850 850
851 851 def sidedata_length(self, rev):
852 852 if not self.hassidedata:
853 853 return 0
854 854 return self.index[rev][9]
855 855
856 856 def rawsize(self, rev):
857 857 """return the length of the uncompressed text for a given revision"""
858 858 l = self.index[rev][2]
859 859 if l >= 0:
860 860 return l
861 861
862 862 t = self.rawdata(rev)
863 863 return len(t)
864 864
865 865 def size(self, rev):
866 866 """length of non-raw text (processed by a "read" flag processor)"""
867 867 # fast path: if no "read" flag processor could change the content,
868 868 # size is rawsize. note: ELLIPSIS is known to not change the content.
869 869 flags = self.flags(rev)
870 870 if flags & (flagutil.REVIDX_KNOWN_FLAGS ^ REVIDX_ELLIPSIS) == 0:
871 871 return self.rawsize(rev)
872 872
873 873 return len(self.revision(rev, raw=False))
874 874
875 875 def chainbase(self, rev):
876 876 base = self._chainbasecache.get(rev)
877 877 if base is not None:
878 878 return base
879 879
880 880 index = self.index
881 881 iterrev = rev
882 882 base = index[iterrev][3]
883 883 while base != iterrev:
884 884 iterrev = base
885 885 base = index[iterrev][3]
886 886
887 887 self._chainbasecache[rev] = base
888 888 return base
889 889
890 890 def linkrev(self, rev):
891 891 return self.index[rev][4]
892 892
893 893 def parentrevs(self, rev):
894 894 try:
895 895 entry = self.index[rev]
896 896 except IndexError:
897 897 if rev == wdirrev:
898 898 raise error.WdirUnsupported
899 899 raise
900 900 if entry[5] == nullrev:
901 901 return entry[6], entry[5]
902 902 else:
903 903 return entry[5], entry[6]
904 904
905 905 # fast parentrevs(rev) where rev isn't filtered
906 906 _uncheckedparentrevs = parentrevs
907 907
908 908 def node(self, rev):
909 909 try:
910 910 return self.index[rev][7]
911 911 except IndexError:
912 912 if rev == wdirrev:
913 913 raise error.WdirUnsupported
914 914 raise
915 915
916 916 # Derived from index values.
917 917
918 918 def end(self, rev):
919 919 return self.start(rev) + self.length(rev)
920 920
921 921 def parents(self, node):
922 922 i = self.index
923 923 d = i[self.rev(node)]
924 924 # inline node() to avoid function call overhead
925 925 if d[5] == self.nullid:
926 926 return i[d[6]][7], i[d[5]][7]
927 927 else:
928 928 return i[d[5]][7], i[d[6]][7]
929 929
930 930 def chainlen(self, rev):
931 931 return self._chaininfo(rev)[0]
932 932
933 933 def _chaininfo(self, rev):
934 934 chaininfocache = self._chaininfocache
935 935 if rev in chaininfocache:
936 936 return chaininfocache[rev]
937 937 index = self.index
938 938 generaldelta = self._generaldelta
939 939 iterrev = rev
940 940 e = index[iterrev]
941 941 clen = 0
942 942 compresseddeltalen = 0
943 943 while iterrev != e[3]:
944 944 clen += 1
945 945 compresseddeltalen += e[1]
946 946 if generaldelta:
947 947 iterrev = e[3]
948 948 else:
949 949 iterrev -= 1
950 950 if iterrev in chaininfocache:
951 951 t = chaininfocache[iterrev]
952 952 clen += t[0]
953 953 compresseddeltalen += t[1]
954 954 break
955 955 e = index[iterrev]
956 956 else:
957 957 # Add text length of base since decompressing that also takes
958 958 # work. For cache hits the length is already included.
959 959 compresseddeltalen += e[1]
960 960 r = (clen, compresseddeltalen)
961 961 chaininfocache[rev] = r
962 962 return r
963 963
964 964 def _deltachain(self, rev, stoprev=None):
965 965 """Obtain the delta chain for a revision.
966 966
967 967 ``stoprev`` specifies a revision to stop at. If not specified, we
968 968 stop at the base of the chain.
969 969
970 970 Returns a 2-tuple of (chain, stopped) where ``chain`` is a list of
971 971 revs in ascending order and ``stopped`` is a bool indicating whether
972 972 ``stoprev`` was hit.
973 973 """
974 974 # Try C implementation.
975 975 try:
976 976 return self.index.deltachain(rev, stoprev, self._generaldelta)
977 977 except AttributeError:
978 978 pass
979 979
980 980 chain = []
981 981
982 982 # Alias to prevent attribute lookup in tight loop.
983 983 index = self.index
984 984 generaldelta = self._generaldelta
985 985
986 986 iterrev = rev
987 987 e = index[iterrev]
988 988 while iterrev != e[3] and iterrev != stoprev:
989 989 chain.append(iterrev)
990 990 if generaldelta:
991 991 iterrev = e[3]
992 992 else:
993 993 iterrev -= 1
994 994 e = index[iterrev]
995 995
996 996 if iterrev == stoprev:
997 997 stopped = True
998 998 else:
999 999 chain.append(iterrev)
1000 1000 stopped = False
1001 1001
1002 1002 chain.reverse()
1003 1003 return chain, stopped
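# Illustrative sketch (``rl`` names a revlog instance; the real reconstruction
# path also handles caching and raw-text flags): a caller wanting the full
# text of ``rev`` walks the chain returned above, starting from the base
# revision and applying each delta in order:
#
#   chain, stopped = rl._deltachain(rev)
#   text = rl._chunk(chain[0])   # full text of the base (when not stopped)
#   text = mdiff.patches(text, [rl._chunk(r) for r in chain[1:]])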
1004 1004
1005 1005 def ancestors(self, revs, stoprev=0, inclusive=False):
1006 1006 """Generate the ancestors of 'revs' in reverse revision order.
1007 1007 Does not generate revs lower than stoprev.
1008 1008
1009 1009 See the documentation for ancestor.lazyancestors for more details."""
1010 1010
1011 1011 # first, make sure start revisions aren't filtered
1012 1012 revs = list(revs)
1013 1013 checkrev = self.node
1014 1014 for r in revs:
1015 1015 checkrev(r)
1016 1016 # and we're sure ancestors aren't filtered as well
1017 1017
1018 1018 if rustancestor is not None and self.index.rust_ext_compat:
1019 1019 lazyancestors = rustancestor.LazyAncestors
1020 1020 arg = self.index
1021 1021 else:
1022 1022 lazyancestors = ancestor.lazyancestors
1023 1023 arg = self._uncheckedparentrevs
1024 1024 return lazyancestors(arg, revs, stoprev=stoprev, inclusive=inclusive)
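# Illustrative example (``rl`` names a revlog instance): on a linear revlog
# 0 <- 1 <- 2 <- 3, ``rl.ancestors([3])`` yields 2, 1, 0 in reverse revision
# order, ``rl.ancestors([3], stoprev=1)`` yields only 2 and 1, and passing
# ``inclusive=True`` makes revision 3 itself part of the output.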
1025 1025
1026 1026 def descendants(self, revs):
1027 1027 return dagop.descendantrevs(revs, self.revs, self.parentrevs)
1028 1028
1029 1029 def findcommonmissing(self, common=None, heads=None):
1030 1030 """Return a tuple of the ancestors of common and the ancestors of heads
1031 1031 that are not ancestors of common. In revset terminology, we return the
1032 1032 tuple:
1033 1033
1034 1034 ::common, (::heads) - (::common)
1035 1035
1036 1036 The list is sorted by revision number, meaning it is
1037 1037 topologically sorted.
1038 1038
1039 1039 'heads' and 'common' are both lists of node IDs. If heads is
1040 1040 not supplied, uses all of the revlog's heads. If common is not
1041 1041 supplied, uses nullid."""
1042 1042 if common is None:
1043 1043 common = [self.nullid]
1044 1044 if heads is None:
1045 1045 heads = self.heads()
1046 1046
1047 1047 common = [self.rev(n) for n in common]
1048 1048 heads = [self.rev(n) for n in heads]
1049 1049
1050 1050 # we want the ancestors, but inclusive
1051 1051 class lazyset(object):
1052 1052 def __init__(self, lazyvalues):
1053 1053 self.addedvalues = set()
1054 1054 self.lazyvalues = lazyvalues
1055 1055
1056 1056 def __contains__(self, value):
1057 1057 return value in self.addedvalues or value in self.lazyvalues
1058 1058
1059 1059 def __iter__(self):
1060 1060 added = self.addedvalues
1061 1061 for r in added:
1062 1062 yield r
1063 1063 for r in self.lazyvalues:
1064 1064 if not r in added:
1065 1065 yield r
1066 1066
1067 1067 def add(self, value):
1068 1068 self.addedvalues.add(value)
1069 1069
1070 1070 def update(self, values):
1071 1071 self.addedvalues.update(values)
1072 1072
1073 1073 has = lazyset(self.ancestors(common))
1074 1074 has.add(nullrev)
1075 1075 has.update(common)
1076 1076
1077 1077 # take all ancestors from heads that aren't in has
1078 1078 missing = set()
1079 1079 visit = collections.deque(r for r in heads if r not in has)
1080 1080 while visit:
1081 1081 r = visit.popleft()
1082 1082 if r in missing:
1083 1083 continue
1084 1084 else:
1085 1085 missing.add(r)
1086 1086 for p in self.parentrevs(r):
1087 1087 if p not in has:
1088 1088 visit.append(p)
1089 1089 missing = list(missing)
1090 1090 missing.sort()
1091 1091 return has, [self.node(miss) for miss in missing]
1092 1092
1093 1093 def incrementalmissingrevs(self, common=None):
1094 1094 """Return an object that can be used to incrementally compute the
1095 1095 revision numbers of the ancestors of arbitrary sets that are not
1096 1096 ancestors of common. This is an ancestor.incrementalmissingancestors
1097 1097 object.
1098 1098
1099 1099 'common' is a list of revision numbers. If common is not supplied, uses
1100 1100 nullrev.
1101 1101 """
1102 1102 if common is None:
1103 1103 common = [nullrev]
1104 1104
1105 1105 if rustancestor is not None and self.index.rust_ext_compat:
1106 1106 return rustancestor.MissingAncestors(self.index, common)
1107 1107 return ancestor.incrementalmissingancestors(self.parentrevs, common)
1108 1108
1109 1109 def findmissingrevs(self, common=None, heads=None):
1110 1110 """Return the revision numbers of the ancestors of heads that
1111 1111 are not ancestors of common.
1112 1112
1113 1113 More specifically, return a list of revision numbers corresponding to
1114 1114 nodes N such that every N satisfies the following constraints:
1115 1115
1116 1116 1. N is an ancestor of some node in 'heads'
1117 1117 2. N is not an ancestor of any node in 'common'
1118 1118
1119 1119 The list is sorted by revision number, meaning it is
1120 1120 topologically sorted.
1121 1121
1122 1122 'heads' and 'common' are both lists of revision numbers. If heads is
1123 1123 not supplied, uses all of the revlog's heads. If common is not
1124 1124 supplied, uses nullrev."""
1125 1125 if common is None:
1126 1126 common = [nullrev]
1127 1127 if heads is None:
1128 1128 heads = self.headrevs()
1129 1129
1130 1130 inc = self.incrementalmissingrevs(common=common)
1131 1131 return inc.missingancestors(heads)
1132 1132
1133 1133 def findmissing(self, common=None, heads=None):
1134 1134 """Return the ancestors of heads that are not ancestors of common.
1135 1135
1136 1136 More specifically, return a list of nodes N such that every N
1137 1137 satisfies the following constraints:
1138 1138
1139 1139 1. N is an ancestor of some node in 'heads'
1140 1140 2. N is not an ancestor of any node in 'common'
1141 1141
1142 1142 The list is sorted by revision number, meaning it is
1143 1143 topologically sorted.
1144 1144
1145 1145 'heads' and 'common' are both lists of node IDs. If heads is
1146 1146 not supplied, uses all of the revlog's heads. If common is not
1147 1147 supplied, uses nullid."""
1148 1148 if common is None:
1149 1149 common = [self.nullid]
1150 1150 if heads is None:
1151 1151 heads = self.heads()
1152 1152
1153 1153 common = [self.rev(n) for n in common]
1154 1154 heads = [self.rev(n) for n in heads]
1155 1155
1156 1156 inc = self.incrementalmissingrevs(common=common)
1157 1157 return [self.node(r) for r in inc.missingancestors(heads)]
1158 1158
1159 1159 def nodesbetween(self, roots=None, heads=None):
1160 1160 """Return a topological path from 'roots' to 'heads'.
1161 1161
1162 1162 Return a tuple (nodes, outroots, outheads) where 'nodes' is a
1163 1163 topologically sorted list of all nodes N that satisfy both of
1164 1164 these constraints:
1165 1165
1166 1166 1. N is a descendant of some node in 'roots'
1167 1167 2. N is an ancestor of some node in 'heads'
1168 1168
1169 1169 Every node is considered to be both a descendant and an ancestor
1170 1170 of itself, so every reachable node in 'roots' and 'heads' will be
1171 1171 included in 'nodes'.
1172 1172
1173 1173 'outroots' is the list of reachable nodes in 'roots', i.e., the
1174 1174 subset of 'roots' that is returned in 'nodes'. Likewise,
1175 1175 'outheads' is the subset of 'heads' that is also in 'nodes'.
1176 1176
1177 1177 'roots' and 'heads' are both lists of node IDs. If 'roots' is
1178 1178 unspecified, uses nullid as the only root. If 'heads' is
1179 1179 unspecified, uses list of all of the revlog's heads."""
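# Illustrative example: on a linear history 0 <- 1 <- 2 <- 3, calling
# nodesbetween(roots=[node(1)], heads=[node(3)]) returns
# ([node(1), node(2), node(3)], [node(1)], [node(3)]): the topologically
# sorted nodes that are both descendants of a root and ancestors of a head,
# plus the subsets of 'roots' and 'heads' that made it into that list.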
1180 1180 nonodes = ([], [], [])
1181 1181 if roots is not None:
1182 1182 roots = list(roots)
1183 1183 if not roots:
1184 1184 return nonodes
1185 1185 lowestrev = min([self.rev(n) for n in roots])
1186 1186 else:
1187 1187 roots = [self.nullid] # Everybody's a descendant of nullid
1188 1188 lowestrev = nullrev
1189 1189 if (lowestrev == nullrev) and (heads is None):
1190 1190 # We want _all_ the nodes!
1191 1191 return (
1192 1192 [self.node(r) for r in self],
1193 1193 [self.nullid],
1194 1194 list(self.heads()),
1195 1195 )
1196 1196 if heads is None:
1197 1197 # All nodes are ancestors, so the latest ancestor is the last
1198 1198 # node.
1199 1199 highestrev = len(self) - 1
1200 1200 # Set ancestors to None to signal that every node is an ancestor.
1201 1201 ancestors = None
1202 1202 # Set heads to an empty dictionary for later discovery of heads
1203 1203 heads = {}
1204 1204 else:
1205 1205 heads = list(heads)
1206 1206 if not heads:
1207 1207 return nonodes
1208 1208 ancestors = set()
1209 1209 # Turn heads into a dictionary so we can remove 'fake' heads.
1210 1210 # Also, later we will be using it to filter out the heads we can't
1211 1211 # find from roots.
1212 1212 heads = dict.fromkeys(heads, False)
1213 1213 # Start at the top and keep marking parents until we're done.
1214 1214 nodestotag = set(heads)
1215 1215 # Remember where the top was so we can use it as a limit later.
1216 1216 highestrev = max([self.rev(n) for n in nodestotag])
1217 1217 while nodestotag:
1218 1218 # grab a node to tag
1219 1219 n = nodestotag.pop()
1220 1220 # Never tag nullid
1221 1221 if n == self.nullid:
1222 1222 continue
1223 1223 # A node's revision number represents its place in a
1224 1224 # topologically sorted list of nodes.
1225 1225 r = self.rev(n)
1226 1226 if r >= lowestrev:
1227 1227 if n not in ancestors:
1228 1228 # If we are possibly a descendant of one of the roots
1229 1229 # and we haven't already been marked as an ancestor
1230 1230 ancestors.add(n) # Mark as ancestor
1231 1231 # Add non-nullid parents to list of nodes to tag.
1232 1232 nodestotag.update(
1233 1233 [p for p in self.parents(n) if p != self.nullid]
1234 1234 )
1235 1235 elif n in heads: # We've seen it before, is it a fake head?
1236 1236 # So it is, real heads should not be the ancestors of
1237 1237 # any other heads.
1238 1238 heads.pop(n)
1239 1239 if not ancestors:
1240 1240 return nonodes
1241 1241 # Now that we have our set of ancestors, we want to remove any
1242 1242 # roots that are not ancestors.
1243 1243
1244 1244 # If one of the roots was nullid, everything is included anyway.
1245 1245 if lowestrev > nullrev:
1246 1246 # But, since we weren't, let's recompute the lowest rev to not
1247 1247 # include roots that aren't ancestors.
1248 1248
1249 1249 # Filter out roots that aren't ancestors of heads
1250 1250 roots = [root for root in roots if root in ancestors]
1251 1251 # Recompute the lowest revision
1252 1252 if roots:
1253 1253 lowestrev = min([self.rev(root) for root in roots])
1254 1254 else:
1255 1255 # No more roots? Return empty list
1256 1256 return nonodes
1257 1257 else:
1258 1258 # We are descending from nullid, and don't need to care about
1259 1259 # any other roots.
1260 1260 lowestrev = nullrev
1261 1261 roots = [self.nullid]
1262 1262 # Transform our roots list into a set.
1263 1263 descendants = set(roots)
1264 1264 # Also, keep the original roots so we can filter out roots that aren't
1265 1265 # 'real' roots (i.e. are descended from other roots).
1266 1266 roots = descendants.copy()
1267 1267 # Our topologically sorted list of output nodes.
1268 1268 orderedout = []
1269 1269 # Don't start at nullid since we don't want nullid in our output list,
1270 1270 # and if nullid shows up in descendants, empty parents will look like
1271 1271 # they're descendants.
1272 1272 for r in self.revs(start=max(lowestrev, 0), stop=highestrev + 1):
1273 1273 n = self.node(r)
1274 1274 isdescendant = False
1275 1275 if lowestrev == nullrev: # Everybody is a descendant of nullid
1276 1276 isdescendant = True
1277 1277 elif n in descendants:
1278 1278 # n is already a descendant
1279 1279 isdescendant = True
1280 1280 # This check only needs to be done here because all the roots
1281 1281 # will start being marked as descendants before the loop.
1282 1282 if n in roots:
1283 1283 # If n was a root, check if it's a 'real' root.
1284 1284 p = tuple(self.parents(n))
1285 1285 # If any of its parents are descendants, it's not a root.
1286 1286 if (p[0] in descendants) or (p[1] in descendants):
1287 1287 roots.remove(n)
1288 1288 else:
1289 1289 p = tuple(self.parents(n))
1290 1290 # A node is a descendant if either of its parents is a
1291 1291 # descendant. (We seeded the descendants set with the roots
1292 1292 # up there, remember?)
1293 1293 if (p[0] in descendants) or (p[1] in descendants):
1294 1294 descendants.add(n)
1295 1295 isdescendant = True
1296 1296 if isdescendant and ((ancestors is None) or (n in ancestors)):
1297 1297 # Only include nodes that are both descendants and ancestors.
1298 1298 orderedout.append(n)
1299 1299 if (ancestors is not None) and (n in heads):
1300 1300 # We're trying to figure out which heads are reachable
1301 1301 # from roots.
1302 1302 # Mark this head as having been reached
1303 1303 heads[n] = True
1304 1304 elif ancestors is None:
1305 1305 # Otherwise, we're trying to discover the heads.
1306 1306 # Assume this is a head because if it isn't, the next step
1307 1307 # will eventually remove it.
1308 1308 heads[n] = True
1309 1309 # But, obviously its parents aren't.
1310 1310 for p in self.parents(n):
1311 1311 heads.pop(p, None)
1312 1312 heads = [head for head, flag in pycompat.iteritems(heads) if flag]
1313 1313 roots = list(roots)
1314 1314 assert orderedout
1315 1315 assert roots
1316 1316 assert heads
1317 1317 return (orderedout, roots, heads)
1318 1318
1319 1319 def headrevs(self, revs=None):
1320 1320 if revs is None:
1321 1321 try:
1322 1322 return self.index.headrevs()
1323 1323 except AttributeError:
1324 1324 return self._headrevs()
1325 1325 if rustdagop is not None and self.index.rust_ext_compat:
1326 1326 return rustdagop.headrevs(self.index, revs)
1327 1327 return dagop.headrevs(revs, self._uncheckedparentrevs)
1328 1328
1329 1329 def computephases(self, roots):
1330 1330 return self.index.computephasesmapsets(roots)
1331 1331
1332 1332 def _headrevs(self):
1333 1333 count = len(self)
1334 1334 if not count:
1335 1335 return [nullrev]
1336 1336 # we won't iterate over filtered revs, so nobody is a head at the start
1337 1337 ishead = [0] * (count + 1)
1338 1338 index = self.index
1339 1339 for r in self:
1340 1340 ishead[r] = 1 # I may be a head
1341 1341 e = index[r]
1342 1342 ishead[e[5]] = ishead[e[6]] = 0 # my parents are not
1343 1343 return [r for r, val in enumerate(ishead) if val]
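# The pure-python fallback above assumes every revision is a head and then
# clears the mark for anything that appears as a parent.  For example, with
# revisions 0 <- 1 <- 2 and 0 <- 3, revisions 0 and 1 are somebody's parent
# and get cleared, leaving 2 and 3 as the head revisions.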
1344 1344
1345 1345 def heads(self, start=None, stop=None):
1346 1346 """return the list of all nodes that have no children
1347 1347
1348 1348 if start is specified, only heads that are descendants of
1349 1349 start will be returned
1350 1350 if stop is specified, it will consider all the revs from stop
1351 1351 as if they had no children
1352 1352 """
1353 1353 if start is None and stop is None:
1354 1354 if not len(self):
1355 1355 return [self.nullid]
1356 1356 return [self.node(r) for r in self.headrevs()]
1357 1357
1358 1358 if start is None:
1359 1359 start = nullrev
1360 1360 else:
1361 1361 start = self.rev(start)
1362 1362
1363 1363 stoprevs = {self.rev(n) for n in stop or []}
1364 1364
1365 1365 revs = dagop.headrevssubset(
1366 1366 self.revs, self.parentrevs, startrev=start, stoprevs=stoprevs
1367 1367 )
1368 1368
1369 1369 return [self.node(rev) for rev in revs]
1370 1370
1371 1371 def children(self, node):
1372 1372 """find the children of a given node"""
1373 1373 c = []
1374 1374 p = self.rev(node)
1375 1375 for r in self.revs(start=p + 1):
1376 1376 prevs = [pr for pr in self.parentrevs(r) if pr != nullrev]
1377 1377 if prevs:
1378 1378 for pr in prevs:
1379 1379 if pr == p:
1380 1380 c.append(self.node(r))
1381 1381 elif p == nullrev:
1382 1382 c.append(self.node(r))
1383 1383 return c
1384 1384
1385 1385 def commonancestorsheads(self, a, b):
1386 1386 """calculate all the heads of the common ancestors of nodes a and b"""
1387 1387 a, b = self.rev(a), self.rev(b)
1388 1388 ancs = self._commonancestorsheads(a, b)
1389 1389 return pycompat.maplist(self.node, ancs)
1390 1390
1391 1391 def _commonancestorsheads(self, *revs):
1392 1392 """calculate all the heads of the common ancestors of revs"""
1393 1393 try:
1394 1394 ancs = self.index.commonancestorsheads(*revs)
1395 1395 except (AttributeError, OverflowError): # C implementation failed
1396 1396 ancs = ancestor.commonancestorsheads(self.parentrevs, *revs)
1397 1397 return ancs
1398 1398
1399 1399 def isancestor(self, a, b):
1400 1400 """return True if node a is an ancestor of node b
1401 1401
1402 1402 A revision is considered an ancestor of itself."""
1403 1403 a, b = self.rev(a), self.rev(b)
1404 1404 return self.isancestorrev(a, b)
1405 1405
1406 1406 def isancestorrev(self, a, b):
1407 1407 """return True if revision a is an ancestor of revision b
1408 1408
1409 1409 A revision is considered an ancestor of itself.
1410 1410
1411 1411 The implementation of this is trivial but the use of
1412 1412 reachableroots is not."""
1413 1413 if a == nullrev:
1414 1414 return True
1415 1415 elif a == b:
1416 1416 return True
1417 1417 elif a > b:
1418 1418 return False
1419 1419 return bool(self.reachableroots(a, [b], [a], includepath=False))
1420 1420
1421 1421 def reachableroots(self, minroot, heads, roots, includepath=False):
1422 1422 """return (heads(::(<roots> and <roots>::<heads>)))
1423 1423
1424 1424 If includepath is True, return (<roots>::<heads>)."""
1425 1425 try:
1426 1426 return self.index.reachableroots2(
1427 1427 minroot, heads, roots, includepath
1428 1428 )
1429 1429 except AttributeError:
1430 1430 return dagop._reachablerootspure(
1431 1431 self.parentrevs, minroot, roots, heads, includepath
1432 1432 )
1433 1433
1434 1434 def ancestor(self, a, b):
1435 1435 """calculate the "best" common ancestor of nodes a and b"""
1436 1436
1437 1437 a, b = self.rev(a), self.rev(b)
1438 1438 try:
1439 1439 ancs = self.index.ancestors(a, b)
1440 1440 except (AttributeError, OverflowError):
1441 1441 ancs = ancestor.ancestors(self.parentrevs, a, b)
1442 1442 if ancs:
1443 1443 # choose a consistent winner when there's a tie
1444 1444 return min(map(self.node, ancs))
1445 1445 return self.nullid
1446 1446
1447 1447 def _match(self, id):
1448 1448 if isinstance(id, int):
1449 1449 # rev
1450 1450 return self.node(id)
1451 1451 if len(id) == self.nodeconstants.nodelen:
1452 1452 # possibly a binary node
1453 1453 # odds of a binary node being all hex in ASCII are 1 in 10**25
1454 1454 try:
1455 1455 node = id
1456 1456 self.rev(node) # quick search the index
1457 1457 return node
1458 1458 except error.LookupError:
1459 1459 pass # may be partial hex id
1460 1460 try:
1461 1461 # str(rev)
1462 1462 rev = int(id)
1463 1463 if b"%d" % rev != id:
1464 1464 raise ValueError
1465 1465 if rev < 0:
1466 1466 rev = len(self) + rev
1467 1467 if rev < 0 or rev >= len(self):
1468 1468 raise ValueError
1469 1469 return self.node(rev)
1470 1470 except (ValueError, OverflowError):
1471 1471 pass
1472 1472 if len(id) == 2 * self.nodeconstants.nodelen:
1473 1473 try:
1474 1474 # a full hex nodeid?
1475 1475 node = bin(id)
1476 1476 self.rev(node)
1477 1477 return node
1478 1478 except (TypeError, error.LookupError):
1479 1479 pass
1480 1480
1481 1481 def _partialmatch(self, id):
1482 1482 # we don't care about wdirfilenodeids as they should always be full hashes
1483 1483 maybewdir = self.nodeconstants.wdirhex.startswith(id)
1484 1484 ambiguous = False
1485 1485 try:
1486 1486 partial = self.index.partialmatch(id)
1487 1487 if partial and self.hasnode(partial):
1488 1488 if maybewdir:
1489 1489 # single 'ff...' match in radix tree, ambiguous with wdir
1490 1490 ambiguous = True
1491 1491 else:
1492 1492 return partial
1493 1493 elif maybewdir:
1494 1494 # no 'ff...' match in radix tree, wdir identified
1495 1495 raise error.WdirUnsupported
1496 1496 else:
1497 1497 return None
1498 1498 except error.RevlogError:
1499 1499 # parsers.c radix tree lookup gave multiple matches
1500 1500 # fast path: for unfiltered changelog, radix tree is accurate
1501 1501 if not getattr(self, 'filteredrevs', None):
1502 1502 ambiguous = True
1503 1503 # fall through to slow path that filters hidden revisions
1504 1504 except (AttributeError, ValueError):
1505 1505 # we are pure python, or key was too short to search radix tree
1506 1506 pass
1507 1507 if ambiguous:
1508 1508 raise error.AmbiguousPrefixLookupError(
1509 1509 id, self.display_id, _(b'ambiguous identifier')
1510 1510 )
1511 1511
1512 1512 if id in self._pcache:
1513 1513 return self._pcache[id]
1514 1514
1515 1515 if len(id) <= 40:
1516 1516 try:
1517 1517 # hex(node)[:...]
1518 1518 l = len(id) // 2 # grab an even number of digits
1519 1519 prefix = bin(id[: l * 2])
1520 1520 nl = [e[7] for e in self.index if e[7].startswith(prefix)]
1521 1521 nl = [
1522 1522 n for n in nl if hex(n).startswith(id) and self.hasnode(n)
1523 1523 ]
1524 1524 if self.nodeconstants.nullhex.startswith(id):
1525 1525 nl.append(self.nullid)
1526 1526 if len(nl) > 0:
1527 1527 if len(nl) == 1 and not maybewdir:
1528 1528 self._pcache[id] = nl[0]
1529 1529 return nl[0]
1530 1530 raise error.AmbiguousPrefixLookupError(
1531 1531 id, self.display_id, _(b'ambiguous identifier')
1532 1532 )
1533 1533 if maybewdir:
1534 1534 raise error.WdirUnsupported
1535 1535 return None
1536 1536 except TypeError:
1537 1537 pass
1538 1538
1539 1539 def lookup(self, id):
1540 1540 """locate a node based on:
1541 1541 - revision number or str(revision number)
1542 1542 - nodeid or subset of hex nodeid
1543 1543 """
1544 1544 n = self._match(id)
1545 1545 if n is not None:
1546 1546 return n
1547 1547 n = self._partialmatch(id)
1548 1548 if n:
1549 1549 return n
1550 1550
1551 1551 raise error.LookupError(id, self.display_id, _(b'no match found'))
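# Illustrative usage (``rl`` names a revlog instance): ``rl.lookup(b'0')``
# resolves a revision number given as a string, a full binary node or full
# hex nodeid resolves directly, and an unambiguous hex prefix such as
# ``rl.lookup(b'1a2b3c')`` works too; anything else raises the LookupError
# above.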
1552 1552
1553 1553 def shortest(self, node, minlength=1):
1554 1554 """Find the shortest unambiguous prefix that matches node."""
1555 1555
1556 1556 def isvalid(prefix):
1557 1557 try:
1558 1558 matchednode = self._partialmatch(prefix)
1559 1559 except error.AmbiguousPrefixLookupError:
1560 1560 return False
1561 1561 except error.WdirUnsupported:
1562 1562 # single 'ff...' match
1563 1563 return True
1564 1564 if matchednode is None:
1565 1565 raise error.LookupError(node, self.display_id, _(b'no node'))
1566 1566 return True
1567 1567
1568 1568 def maybewdir(prefix):
1569 1569 return all(c == b'f' for c in pycompat.iterbytestr(prefix))
1570 1570
1571 1571 hexnode = hex(node)
1572 1572
1573 1573 def disambiguate(hexnode, minlength):
1574 1574 """Disambiguate against wdirid."""
1575 1575 for length in range(minlength, len(hexnode) + 1):
1576 1576 prefix = hexnode[:length]
1577 1577 if not maybewdir(prefix):
1578 1578 return prefix
1579 1579
1580 1580 if not getattr(self, 'filteredrevs', None):
1581 1581 try:
1582 1582 length = max(self.index.shortest(node), minlength)
1583 1583 return disambiguate(hexnode, length)
1584 1584 except error.RevlogError:
1585 1585 if node != self.nodeconstants.wdirid:
1586 1586 raise error.LookupError(
1587 1587 node, self.display_id, _(b'no node')
1588 1588 )
1589 1589 except AttributeError:
1590 1590 # Fall through to pure code
1591 1591 pass
1592 1592
1593 1593 if node == self.nodeconstants.wdirid:
1594 1594 for length in range(minlength, len(hexnode) + 1):
1595 1595 prefix = hexnode[:length]
1596 1596 if isvalid(prefix):
1597 1597 return prefix
1598 1598
1599 1599 for length in range(minlength, len(hexnode) + 1):
1600 1600 prefix = hexnode[:length]
1601 1601 if isvalid(prefix):
1602 1602 return disambiguate(hexnode, length)
1603 1603
1604 1604 def cmp(self, node, text):
1605 1605 """compare text with a given file revision
1606 1606
1607 1607 returns True if text is different than what is stored.
1608 1608 """
1609 1609 p1, p2 = self.parents(node)
1610 1610 return storageutil.hashrevisionsha1(text, p1, p2) != node
1611 1611
1612 1612 def _getsegmentforrevs(self, startrev, endrev, df=None):
1613 1613 """Obtain a segment of raw data corresponding to a range of revisions.
1614 1614
1615 1615 Accepts the start and end revisions and an optional already-open
1616 1616 file handle to be used for reading. If the file handle is read, its
1617 1617 seek position will not be preserved.
1618 1618
1619 1619 Requests for data may be satisfied by a cache.
1620 1620
1621 1621 Returns a 2-tuple of (offset, data) for the requested range of
1622 1622 revisions. Offset is the integer offset from the beginning of the
1623 1623 revlog and data is a str or buffer of the raw byte data.
1624 1624
1625 1625 Callers will need to call ``self.start(rev)`` and ``self.length(rev)``
1626 1626 to determine where each revision's data begins and ends.
1627 1627 """
1628 1628 # Inlined self.start(startrev) & self.end(endrev) for perf reasons
1629 1629 # (functions are expensive).
1630 1630 index = self.index
1631 1631 istart = index[startrev]
1632 1632 start = int(istart[0] >> 16)
1633 1633 if startrev == endrev:
1634 1634 end = start + istart[1]
1635 1635 else:
1636 1636 iend = index[endrev]
1637 1637 end = int(iend[0] >> 16) + iend[1]
1638 1638
1639 1639 if self._inline:
1640 1640 start += (startrev + 1) * self.index.entry_size
1641 1641 end += (endrev + 1) * self.index.entry_size
1642 1642 length = end - start
1643 1643
1644 1644 return start, self._segmentfile.read_chunk(start, length, df)
1645 1645
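# Editorial sketch (not part of revlog.py): for a non-inline revlog, callers
# such as _chunks() below slice the returned segment roughly like this,
# assuming startrev <= rev <= endrev:
#
#     offset, data = rl._getsegmentforrevs(startrev, endrev)
#     chunk = util.buffer(data, rl.start(rev) - offset, rl.length(rev))
#
# For inline revlogs both the offset and the per-revision starts are shifted
# by index.entry_size, as _chunks() does below.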
1646 1646 def _chunk(self, rev, df=None):
1647 1647 """Obtain a single decompressed chunk for a revision.
1648 1648
1649 1649 Accepts an integer revision and an optional already-open file handle
1650 1650 to be used for reading. If used, the seek position of the file will not
1651 1651 be preserved.
1652 1652
1653 1653 Returns a str holding uncompressed data for the requested revision.
1654 1654 """
1655 1655 compression_mode = self.index[rev][10]
1656 1656 data = self._getsegmentforrevs(rev, rev, df=df)[1]
1657 1657 if compression_mode == COMP_MODE_PLAIN:
1658 1658 return data
1659 1659 elif compression_mode == COMP_MODE_DEFAULT:
1660 1660 return self._decompressor(data)
1661 1661 elif compression_mode == COMP_MODE_INLINE:
1662 1662 return self.decompress(data)
1663 1663 else:
1664 1664 msg = b'unknown compression mode %d'
1665 1665 msg %= compression_mode
1666 1666 raise error.RevlogError(msg)
1667 1667
1668 1668 def _chunks(self, revs, df=None, targetsize=None):
1669 1669 """Obtain decompressed chunks for the specified revisions.
1670 1670
1671 1671 Accepts an iterable of numeric revisions that are assumed to be in
1672 1672 ascending order. Also accepts an optional already-open file handle
1673 1673 to be used for reading. If used, the seek position of the file will
1674 1674 not be preserved.
1675 1675
1676 1676 This function is similar to calling ``self._chunk()`` multiple times,
1677 1677 but is faster.
1678 1678
1679 1679 Returns a list with decompressed data for each requested revision.
1680 1680 """
1681 1681 if not revs:
1682 1682 return []
1683 1683 start = self.start
1684 1684 length = self.length
1685 1685 inline = self._inline
1686 1686 iosize = self.index.entry_size
1687 1687 buffer = util.buffer
1688 1688
1689 1689 l = []
1690 1690 ladd = l.append
1691 1691
1692 1692 if not self._withsparseread:
1693 1693 slicedchunks = (revs,)
1694 1694 else:
1695 1695 slicedchunks = deltautil.slicechunk(
1696 1696 self, revs, targetsize=targetsize
1697 1697 )
1698 1698
1699 1699 for revschunk in slicedchunks:
1700 1700 firstrev = revschunk[0]
1701 1701 # Skip trailing revisions with empty diff
1702 1702 for lastrev in revschunk[::-1]:
1703 1703 if length(lastrev) != 0:
1704 1704 break
1705 1705
1706 1706 try:
1707 1707 offset, data = self._getsegmentforrevs(firstrev, lastrev, df=df)
1708 1708 except OverflowError:
1709 1709 # issue4215 - we can't cache a run of chunks greater than
1710 1710 # 2G on Windows
1711 1711 return [self._chunk(rev, df=df) for rev in revschunk]
1712 1712
1713 1713 decomp = self.decompress
1714 1714 # self._decompressor might be None, but will not be used in that case
1715 1715 def_decomp = self._decompressor
1716 1716 for rev in revschunk:
1717 1717 chunkstart = start(rev)
1718 1718 if inline:
1719 1719 chunkstart += (rev + 1) * iosize
1720 1720 chunklength = length(rev)
1721 1721 comp_mode = self.index[rev][10]
1722 1722 c = buffer(data, chunkstart - offset, chunklength)
1723 1723 if comp_mode == COMP_MODE_PLAIN:
1724 1724 ladd(c)
1725 1725 elif comp_mode == COMP_MODE_INLINE:
1726 1726 ladd(decomp(c))
1727 1727 elif comp_mode == COMP_MODE_DEFAULT:
1728 1728 ladd(def_decomp(c))
1729 1729 else:
1730 1730 msg = b'unknown compression mode %d'
1731 1731 msg %= comp_mode
1732 1732 raise error.RevlogError(msg)
1733 1733
1734 1734 return l
1735 1735
1736 1736 def deltaparent(self, rev):
1737 1737 """return deltaparent of the given revision"""
1738 1738 base = self.index[rev][3]
1739 1739 if base == rev:
1740 1740 return nullrev
1741 1741 elif self._generaldelta:
1742 1742 return base
1743 1743 else:
1744 1744 return rev - 1
1745 1745
1746 1746 def issnapshot(self, rev):
1747 1747 """tells whether rev is a snapshot"""
1748 1748 if not self._sparserevlog:
1749 1749 return self.deltaparent(rev) == nullrev
1750 1750 elif util.safehasattr(self.index, b'issnapshot'):
1751 1751 # directly assign the method to cache the testing and access
1752 1752 self.issnapshot = self.index.issnapshot
1753 1753 return self.issnapshot(rev)
1754 1754 if rev == nullrev:
1755 1755 return True
1756 1756 entry = self.index[rev]
1757 1757 base = entry[3]
1758 1758 if base == rev:
1759 1759 return True
1760 1760 if base == nullrev:
1761 1761 return True
1762 1762 p1 = entry[5]
1763 1763 p2 = entry[6]
1764 1764 if base == p1 or base == p2:
1765 1765 return False
1766 1766 return self.issnapshot(base)
1767 1767
1768 1768 def snapshotdepth(self, rev):
1769 1769 """number of snapshots in the chain before this one"""
1770 1770 if not self.issnapshot(rev):
1771 1771 raise error.ProgrammingError(b'revision %d not a snapshot' % rev)
1772 1772 return len(self._deltachain(rev)[0]) - 1
1773 1773
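# Editorial note (not part of revlog.py): with sparse-revlog, a revision is a
# snapshot when its delta base is not one of its parents and is itself a
# snapshot, bottoming out at a full text or nullrev. A hypothetical caller:
#
#     if rl.issnapshot(rev):
#         depth = rl.snapshotdepth(rev)   # snapshots earlier in the chain
#
# snapshotdepth() above raises ProgrammingError for non-snapshot revisions.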
1774 1774 def revdiff(self, rev1, rev2):
1775 1775 """return or calculate a delta between two revisions
1776 1776
1777 1777 The delta calculated is in binary form and is intended to be written to
1778 1778 revlog data directly. So this function needs raw revision data.
1779 1779 """
1780 1780 if rev1 != nullrev and self.deltaparent(rev2) == rev1:
1781 1781 return bytes(self._chunk(rev2))
1782 1782
1783 1783 return mdiff.textdiff(self.rawdata(rev1), self.rawdata(rev2))
1784 1784
1785 1785 def _processflags(self, text, flags, operation, raw=False):
1786 1786 """deprecated entry point to access flag processors"""
1787 1787 msg = b'_processflag(...) use the specialized variant'
1788 1788 util.nouideprecwarn(msg, b'5.2', stacklevel=2)
1789 1789 if raw:
1790 1790 return text, flagutil.processflagsraw(self, text, flags)
1791 1791 elif operation == b'read':
1792 1792 return flagutil.processflagsread(self, text, flags)
1793 1793 else: # write operation
1794 1794 return flagutil.processflagswrite(self, text, flags)
1795 1795
1796 1796 def revision(self, nodeorrev, _df=None, raw=False):
1797 1797 """return an uncompressed revision of a given node or revision
1798 1798 number.
1799 1799
1800 1800 _df - an existing file handle to read from. (internal-only)
1801 1801 raw - an optional argument specifying if the revision data is to be
1802 1802 treated as raw data when applying flag transforms. 'raw' should be set
1803 1803 to True when generating changegroups or in debug commands.
1804 1804 """
1805 1805 if raw:
1806 1806 msg = (
1807 1807 b'revlog.revision(..., raw=True) is deprecated, '
1808 1808 b'use revlog.rawdata(...)'
1809 1809 )
1810 1810 util.nouideprecwarn(msg, b'5.2', stacklevel=2)
1811 1811 return self._revisiondata(nodeorrev, _df, raw=raw)
1812 1812
1813 1813 def sidedata(self, nodeorrev, _df=None):
1814 1814 """a map of extra data related to the changeset but not part of the hash
1815 1815
1816 1816 This function currently returns a dictionary. However, a more advanced
1817 1817 mapping object will likely be used in the future for more
1818 1818 efficient/lazy code.
1819 1819 """
1820 1820 # deal with <nodeorrev> argument type
1821 1821 if isinstance(nodeorrev, int):
1822 1822 rev = nodeorrev
1823 1823 else:
1824 1824 rev = self.rev(nodeorrev)
1825 1825 return self._sidedata(rev)
1826 1826
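# Editorial note (not part of revlog.py): sidedata() above accepts either a
# revision number or a node and currently returns a plain dict; revisions
# stored without sidedata yield {}, e.g.
#
#     assert rl.sidedata(rev) == {}   # only for revisions lacking sidedata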
1827 1827 def _revisiondata(self, nodeorrev, _df=None, raw=False):
1828 1828 # deal with <nodeorrev> argument type
1829 1829 if isinstance(nodeorrev, int):
1830 1830 rev = nodeorrev
1831 1831 node = self.node(rev)
1832 1832 else:
1833 1833 node = nodeorrev
1834 1834 rev = None
1835 1835
1836 1836 # fast path the special `nullid` rev
1837 1837 if node == self.nullid:
1838 1838 return b""
1839 1839
1840 1840 # ``rawtext`` is the text as stored inside the revlog. Might be the
1841 1841 # revision or might need to be processed to retrieve the revision.
1842 1842 rev, rawtext, validated = self._rawtext(node, rev, _df=_df)
1843 1843
1844 1844 if raw and validated:
1845 1845 # if we don't want to process the raw text and the raw
1846 1846 # text is cached, we can exit early.
1847 1847 return rawtext
1848 1848 if rev is None:
1849 1849 rev = self.rev(node)
1850 1850 # the revlog's flags for this revision
1851 1851 # (they usually alter its state or content)
1852 1852 flags = self.flags(rev)
1853 1853
1854 1854 if validated and flags == REVIDX_DEFAULT_FLAGS:
1855 1855 # no extra flags set, no flag processor runs, text = rawtext
1856 1856 return rawtext
1857 1857
1858 1858 if raw:
1859 1859 validatehash = flagutil.processflagsraw(self, rawtext, flags)
1860 1860 text = rawtext
1861 1861 else:
1862 1862 r = flagutil.processflagsread(self, rawtext, flags)
1863 1863 text, validatehash = r
1864 1864 if validatehash:
1865 1865 self.checkhash(text, node, rev=rev)
1866 1866 if not validated:
1867 1867 self._revisioncache = (node, rev, rawtext)
1868 1868
1869 1869 return text
1870 1870
1871 1871 def _rawtext(self, node, rev, _df=None):
1872 1872 """return the possibly unvalidated rawtext for a revision
1873 1873
1874 1874 returns (rev, rawtext, validated)
1875 1875 """
1876 1876
1877 1877 # revision in the cache (could be useful to apply delta)
1878 1878 cachedrev = None
1879 1879 # An intermediate text to apply deltas to
1880 1880 basetext = None
1881 1881
1882 1882 # Check if we have the entry in cache
1883 1883 # The cache entry looks like (node, rev, rawtext)
1884 1884 if self._revisioncache:
1885 1885 if self._revisioncache[0] == node:
1886 1886 return (rev, self._revisioncache[2], True)
1887 1887 cachedrev = self._revisioncache[1]
1888 1888
1889 1889 if rev is None:
1890 1890 rev = self.rev(node)
1891 1891
1892 1892 chain, stopped = self._deltachain(rev, stoprev=cachedrev)
1893 1893 if stopped:
1894 1894 basetext = self._revisioncache[2]
1895 1895
1896 1896 # drop cache to save memory, the caller is expected to
1897 1897 # update self._revisioncache after validating the text
1898 1898 self._revisioncache = None
1899 1899
1900 1900 targetsize = None
1901 1901 rawsize = self.index[rev][2]
1902 1902 if 0 <= rawsize:
1903 1903 targetsize = 4 * rawsize
1904 1904
1905 1905 bins = self._chunks(chain, df=_df, targetsize=targetsize)
1906 1906 if basetext is None:
1907 1907 basetext = bytes(bins[0])
1908 1908 bins = bins[1:]
1909 1909
1910 1910 rawtext = mdiff.patches(basetext, bins)
1911 1911 del basetext # let us have a chance to free memory early
1912 1912 return (rev, rawtext, False)
1913 1913
1914 1914 def _sidedata(self, rev):
1915 1915 """Return the sidedata for a given revision number."""
1916 1916 index_entry = self.index[rev]
1917 1917 sidedata_offset = index_entry[8]
1918 1918 sidedata_size = index_entry[9]
1919 1919
1920 1920 if self._inline:
1921 1921 sidedata_offset += self.index.entry_size * (1 + rev)
1922 1922 if sidedata_size == 0:
1923 1923 return {}
1924 1924
1925 1925 if self._docket.sidedata_end < sidedata_offset + sidedata_size:
1926 1926 filename = self._sidedatafile
1927 1927 end = self._docket.sidedata_end
1928 1928 offset = sidedata_offset
1929 1929 length = sidedata_size
1930 1930 m = FILE_TOO_SHORT_MSG % (filename, length, offset, end)
1931 1931 raise error.RevlogError(m)
1932 1932
1933 1933 comp_segment = self._segmentfile_sidedata.read_chunk(
1934 1934 sidedata_offset, sidedata_size
1935 1935 )
1936 1936
1937 1937 comp = self.index[rev][11]
1938 1938 if comp == COMP_MODE_PLAIN:
1939 1939 segment = comp_segment
1940 1940 elif comp == COMP_MODE_DEFAULT:
1941 1941 segment = self._decompressor(comp_segment)
1942 1942 elif comp == COMP_MODE_INLINE:
1943 1943 segment = self.decompress(comp_segment)
1944 1944 else:
1945 1945 msg = b'unknown compression mode %d'
1946 1946 msg %= comp
1947 1947 raise error.RevlogError(msg)
1948 1948
1949 1949 sidedata = sidedatautil.deserialize_sidedata(segment)
1950 1950 return sidedata
1951 1951
1952 1952 def rawdata(self, nodeorrev, _df=None):
1953 1953 """return the uncompressed raw data of a given node or revision number.
1954 1954
1955 1955 _df - an existing file handle to read from. (internal-only)
1956 1956 """
1957 1957 return self._revisiondata(nodeorrev, _df, raw=True)
1958 1958
1959 1959 def hash(self, text, p1, p2):
1960 1960 """Compute a node hash.
1961 1961
1962 1962 Available as a function so that subclasses can replace the hash
1963 1963 as needed.
1964 1964 """
1965 1965 return storageutil.hashrevisionsha1(text, p1, p2)
1966 1966
1967 1967 def checkhash(self, text, node, p1=None, p2=None, rev=None):
1968 1968 """Check node hash integrity.
1969 1969
1970 1970 Available as a function so that subclasses can extend hash mismatch
1971 1971 behaviors as needed.
1972 1972 """
1973 1973 try:
1974 1974 if p1 is None and p2 is None:
1975 1975 p1, p2 = self.parents(node)
1976 1976 if node != self.hash(text, p1, p2):
1977 1977 # Clear the revision cache on hash failure. The revision cache
1978 1978 # only stores the raw revision and clearing the cache does have
1979 1979 # the side-effect that we won't have a cache hit when the raw
1980 1980 # revision data is accessed. But this case should be rare and
1981 1981 # it is extra work to teach the cache about the hash
1982 1982 # verification state.
1983 1983 if self._revisioncache and self._revisioncache[0] == node:
1984 1984 self._revisioncache = None
1985 1985
1986 1986 revornode = rev
1987 1987 if revornode is None:
1988 1988 revornode = templatefilters.short(hex(node))
1989 1989 raise error.RevlogError(
1990 1990 _(b"integrity check failed on %s:%s")
1991 1991 % (self.display_id, pycompat.bytestr(revornode))
1992 1992 )
1993 1993 except error.RevlogError:
1994 1994 if self._censorable and storageutil.iscensoredtext(text):
1995 1995 raise error.CensoredNodeError(self.display_id, node, text)
1996 1996 raise
1997 1997
1998 1998 def _enforceinlinesize(self, tr):
1999 1999 """Check if the revlog is too big for inline and convert if so.
2000 2000
2001 2001 This should be called after revisions are added to the revlog. If the
2002 2002 revlog has grown too large to be an inline revlog, it will convert it
2003 2003 to use multiple index and data files.
2004 2004 """
2005 2005 tiprev = len(self) - 1
2006 2006 total_size = self.start(tiprev) + self.length(tiprev)
2007 2007 if not self._inline or total_size < _maxinline:
2008 2008 return
2009 2009
2010 2010 troffset = tr.findoffset(self._indexfile)
2011 2011 if troffset is None:
2012 2012 raise error.RevlogError(
2013 2013 _(b"%s not found in the transaction") % self._indexfile
2014 2014 )
2015 2015 trindex = 0
2016 2016 tr.add(self._datafile, 0)
2017 2017
2018 2018 existing_handles = False
2019 2019 if self._writinghandles is not None:
2020 2020 existing_handles = True
2021 2021 fp = self._writinghandles[0]
2022 2022 fp.flush()
2023 2023 fp.close()
2024 2024 # We can't use the cached file handle after close(). So prevent
2025 2025 # its usage.
2026 2026 self._writinghandles = None
2027 2027 self._segmentfile.writing_handle = None
2028 2028 # No need to deal with sidedata writing handle as it is only
2029 2029 # relevant with revlog-v2 which is never inline, not reaching
2030 2030 # this code
2031 2031
2032 2032 new_dfh = self._datafp(b'w+')
2033 2033 new_dfh.truncate(0) # drop any potentially existing data
2034 2034 try:
2035 2035 with self._indexfp() as read_ifh:
2036 2036 for r in self:
2037 2037 new_dfh.write(self._getsegmentforrevs(r, r, df=read_ifh)[1])
2038 2038 if troffset <= self.start(r) + r * self.index.entry_size:
2039 2039 trindex = r
2040 2040 new_dfh.flush()
2041 2041
2042 2042 with self.__index_new_fp() as fp:
2043 2043 self._format_flags &= ~FLAG_INLINE_DATA
2044 2044 self._inline = False
2045 2045 for i in self:
2046 2046 e = self.index.entry_binary(i)
2047 2047 if i == 0 and self._docket is None:
2048 2048 header = self._format_flags | self._format_version
2049 2049 header = self.index.pack_header(header)
2050 2050 e = header + e
2051 2051 fp.write(e)
2052 2052 if self._docket is not None:
2053 2053 self._docket.index_end = fp.tell()
2054 2054
2055 2055 # There is a small transactional race here. If the rename of
2056 2056 # the index fails, we should remove the datafile. It is more
2057 2057 # important to ensure that the data file is not truncated
2058 2058 # when the index is replaced as otherwise data is lost.
2059 2059 tr.replace(self._datafile, self.start(trindex))
2060 2060
2061 2061 # the temp file replaces the real index when we exit the context
2062 2062 # manager
2063 2063
2064 2064 tr.replace(self._indexfile, trindex * self.index.entry_size)
2065 2065 nodemaputil.setup_persistent_nodemap(tr, self)
2066 2066 self._segmentfile = randomaccessfile.randomaccessfile(
2067 2067 self.opener,
2068 2068 self._datafile,
2069 2069 self._chunkcachesize,
2070 2070 )
2071 2071
2072 2072 if existing_handles:
2073 2073 # switched from inline to conventional; reopen the index
2074 2074 ifh = self.__index_write_fp()
2075 2075 self._writinghandles = (ifh, new_dfh, None)
2076 2076 self._segmentfile.writing_handle = new_dfh
2077 2077 new_dfh = None
2078 2078 # No need to deal with sidedata writing handle as it is only
2079 2079 # relevant with revlog-v2 which is never inline, not reaching
2080 2080 # this code
2081 2081 finally:
2082 2082 if new_dfh is not None:
2083 2083 new_dfh.close()
2084 2084
2085 2085 def _nodeduplicatecallback(self, transaction, node):
2086 2086 """called when trying to add a node already stored."""
2087 2087
2088 2088 @contextlib.contextmanager
2089 2089 def _writing(self, transaction):
2090 2090 if self._trypending:
2091 2091 msg = b'try to write in a `trypending` revlog: %s'
2092 2092 msg %= self.display_id
2093 2093 raise error.ProgrammingError(msg)
2094 2094 if self._writinghandles is not None:
2095 2095 yield
2096 2096 else:
2097 2097 ifh = dfh = sdfh = None
2098 2098 try:
2099 2099 r = len(self)
2100 2100 # opening the data file.
2101 2101 dsize = 0
2102 2102 if r:
2103 2103 dsize = self.end(r - 1)
2104 2104 dfh = None
2105 2105 if not self._inline:
2106 2106 try:
2107 2107 dfh = self._datafp(b"r+")
2108 2108 if self._docket is None:
2109 2109 dfh.seek(0, os.SEEK_END)
2110 2110 else:
2111 2111 dfh.seek(self._docket.data_end, os.SEEK_SET)
2112 2112 except IOError as inst:
2113 2113 if inst.errno != errno.ENOENT:
2114 2114 raise
2115 2115 dfh = self._datafp(b"w+")
2116 2116 transaction.add(self._datafile, dsize)
2117 2117 if self._sidedatafile is not None:
2118 2118 try:
2119 2119 sdfh = self.opener(self._sidedatafile, mode=b"r+")
2120 2120 dfh.seek(self._docket.sidedata_end, os.SEEK_SET)
2121 2121 except IOError as inst:
2122 2122 if inst.errno != errno.ENOENT:
2123 2123 raise
2124 2124 sdfh = self.opener(self._sidedatafile, mode=b"w+")
2125 2125 transaction.add(
2126 2126 self._sidedatafile, self._docket.sidedata_end
2127 2127 )
2128 2128
2129 2129 # opening the index file.
2130 2130 isize = r * self.index.entry_size
2131 2131 ifh = self.__index_write_fp()
2132 2132 if self._inline:
2133 2133 transaction.add(self._indexfile, dsize + isize)
2134 2134 else:
2135 2135 transaction.add(self._indexfile, isize)
2136 2136 # exposing all file handles for writing.
2137 2137 self._writinghandles = (ifh, dfh, sdfh)
2138 2138 self._segmentfile.writing_handle = ifh if self._inline else dfh
2139 2139 self._segmentfile_sidedata.writing_handle = sdfh
2140 2140 yield
2141 2141 if self._docket is not None:
2142 2142 self._write_docket(transaction)
2143 2143 finally:
2144 2144 self._writinghandles = None
2145 2145 self._segmentfile.writing_handle = None
2146 2146 self._segmentfile_sidedata.writing_handle = None
2147 2147 if dfh is not None:
2148 2148 dfh.close()
2149 2149 if sdfh is not None:
2150 2150 sdfh.close()
2151 2151 # closing the index file last to avoid exposing references to
2152 2152 # potentially unflushed data content.
2153 2153 if ifh is not None:
2154 2154 ifh.close()
2155 2155
2156 2156 def _write_docket(self, transaction):
2157 2157 """write the current docket on disk
2158 2158
2159 2159 Exists as a method to help the changelog implement its transaction logic.
2160 2160
2161 2161 We could also imagine using the same transaction logic for all revlogs
2162 2162 since dockets are cheap."""
2163 2163 self._docket.write(transaction)
2164 2164
2165 2165 def addrevision(
2166 2166 self,
2167 2167 text,
2168 2168 transaction,
2169 2169 link,
2170 2170 p1,
2171 2171 p2,
2172 2172 cachedelta=None,
2173 2173 node=None,
2174 2174 flags=REVIDX_DEFAULT_FLAGS,
2175 2175 deltacomputer=None,
2176 2176 sidedata=None,
2177 2177 ):
2178 2178 """add a revision to the log
2179 2179
2180 2180 text - the revision data to add
2181 2181 transaction - the transaction object used for rollback
2182 2182 link - the linkrev data to add
2183 2183 p1, p2 - the parent nodeids of the revision
2184 2184 cachedelta - an optional precomputed delta
2185 2185 node - nodeid of revision; typically node is not specified, and it is
2186 2186 computed by default as hash(text, p1, p2); however, subclasses might
2187 2187 use a different hashing method (and override checkhash() in such a case)
2188 2188 flags - the known flags to set on the revision
2189 2189 deltacomputer - an optional deltacomputer instance shared between
2190 2190 multiple calls
2191 2191 """
2192 2192 if link == nullrev:
2193 2193 raise error.RevlogError(
2194 2194 _(b"attempted to add linkrev -1 to %s") % self.display_id
2195 2195 )
2196 2196
2197 2197 if sidedata is None:
2198 2198 sidedata = {}
2199 2199 elif sidedata and not self.hassidedata:
2200 2200 raise error.ProgrammingError(
2201 2201 _(b"trying to add sidedata to a revlog that does not support them")
2202 2202 )
2203 2203
2204 2204 if flags:
2205 2205 node = node or self.hash(text, p1, p2)
2206 2206
2207 2207 rawtext, validatehash = flagutil.processflagswrite(self, text, flags)
2208 2208
2209 2209 # If the flag processor modifies the revision data, ignore any provided
2210 2210 # cachedelta.
2211 2211 if rawtext != text:
2212 2212 cachedelta = None
2213 2213
2214 2214 if len(rawtext) > _maxentrysize:
2215 2215 raise error.RevlogError(
2216 2216 _(
2217 2217 b"%s: size of %d bytes exceeds maximum revlog storage of 2GiB"
2218 2218 )
2219 2219 % (self.display_id, len(rawtext))
2220 2220 )
2221 2221
2222 2222 node = node or self.hash(rawtext, p1, p2)
2223 2223 rev = self.index.get_rev(node)
2224 2224 if rev is not None:
2225 2225 return rev
2226 2226
2227 2227 if validatehash:
2228 2228 self.checkhash(rawtext, node, p1=p1, p2=p2)
2229 2229
2230 2230 return self.addrawrevision(
2231 2231 rawtext,
2232 2232 transaction,
2233 2233 link,
2234 2234 p1,
2235 2235 p2,
2236 2236 node,
2237 2237 flags,
2238 2238 cachedelta=cachedelta,
2239 2239 deltacomputer=deltacomputer,
2240 2240 sidedata=sidedata,
2241 2241 )
2242 2242
2243 2243 def addrawrevision(
2244 2244 self,
2245 2245 rawtext,
2246 2246 transaction,
2247 2247 link,
2248 2248 p1,
2249 2249 p2,
2250 2250 node,
2251 2251 flags,
2252 2252 cachedelta=None,
2253 2253 deltacomputer=None,
2254 2254 sidedata=None,
2255 2255 ):
2256 2256 """add a raw revision with known flags, node and parents
2257 2257 useful when reusing a revision not stored in this revlog (ex: received
2258 2258 over wire, or read from an external bundle).
2259 2259 """
2260 2260 with self._writing(transaction):
2261 2261 return self._addrevision(
2262 2262 node,
2263 2263 rawtext,
2264 2264 transaction,
2265 2265 link,
2266 2266 p1,
2267 2267 p2,
2268 2268 flags,
2269 2269 cachedelta,
2270 2270 deltacomputer=deltacomputer,
2271 2271 sidedata=sidedata,
2272 2272 )
2273 2273
2274 2274 def compress(self, data):
2275 2275 """Generate a possibly-compressed representation of data."""
2276 2276 if not data:
2277 2277 return b'', data
2278 2278
2279 2279 compressed = self._compressor.compress(data)
2280 2280
2281 2281 if compressed:
2282 2282 # The revlog compressor added the header in the returned data.
2283 2283 return b'', compressed
2284 2284
2285 2285 if data[0:1] == b'\0':
2286 2286 return b'', data
2287 2287 return b'u', data
2288 2288
2289 2289 def decompress(self, data):
2290 2290 """Decompress a revlog chunk.
2291 2291
2292 2292 The chunk is expected to begin with a header identifying the
2293 2293 format type so it can be routed to an appropriate decompressor.
2294 2294 """
2295 2295 if not data:
2296 2296 return data
2297 2297
2298 2298 # Revlogs are read much more frequently than they are written and many
2299 2299 # chunks only take microseconds to decompress, so performance is
2300 2300 # important here.
2301 2301 #
2302 2302 # We can make a few assumptions about revlogs:
2303 2303 #
2304 2304 # 1) the majority of chunks will be compressed (as opposed to inline
2305 2305 # raw data).
2306 2306 # 2) decompressing *any* data will likely be at least 10x slower than
2307 2307 # returning raw inline data.
2308 2308 # 3) we want to prioritize common and officially supported compression
2309 2309 # engines
2310 2310 #
2311 2311 # It follows that we want to optimize for "decompress compressed data
2312 2312 # when encoded with common and officially supported compression engines"
2313 2313 # case over "raw data" and "data encoded by less common or non-official
2314 2314 # compression engines." That is why we have the inline lookup first
2315 2315 # followed by the compengines lookup.
2316 2316 #
2317 2317 # According to `hg perfrevlogchunks`, this is ~0.5% faster for zlib
2318 2318 # compressed chunks. And this matters for changelog and manifest reads.
2319 2319 t = data[0:1]
2320 2320
2321 2321 if t == b'x':
2322 2322 try:
2323 2323 return _zlibdecompress(data)
2324 2324 except zlib.error as e:
2325 2325 raise error.RevlogError(
2326 2326 _(b'revlog decompress error: %s')
2327 2327 % stringutil.forcebytestr(e)
2328 2328 )
2329 2329 # '\0' is more common than 'u' so it goes first.
2330 2330 elif t == b'\0':
2331 2331 return data
2332 2332 elif t == b'u':
2333 2333 return util.buffer(data, 1)
2334 2334
2335 2335 compressor = self._get_decompressor(t)
2336 2336
2337 2337 return compressor.decompress(data)
2338 2338
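# Editorial note (not part of revlog.py): compress() above returns a
# (header, data) pair and decompress() dispatches on the first byte of the
# stored chunk, so a round trip looks roughly like:
#
#     header, packed = rl.compress(rawtext)
#     assert rl.decompress(header + packed) == rawtext
#
# The header is b'u' for data stored uncompressed, and b'' either when the
# data already starts with b'\0' or when the compressor emitted its own
# header (b'x' for zlib, for example), which decompress() recognizes.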
2339 2339 def _addrevision(
2340 2340 self,
2341 2341 node,
2342 2342 rawtext,
2343 2343 transaction,
2344 2344 link,
2345 2345 p1,
2346 2346 p2,
2347 2347 flags,
2348 2348 cachedelta,
2349 2349 alwayscache=False,
2350 2350 deltacomputer=None,
2351 2351 sidedata=None,
2352 2352 ):
2353 2353 """internal function to add revisions to the log
2354 2354
2355 2355 see addrevision for argument descriptions.
2356 2356
2357 2357 note: "addrevision" takes non-raw text, "_addrevision" takes raw text.
2358 2358
2359 2359 if "deltacomputer" is not provided or None, a defaultdeltacomputer will
2360 2360 be used.
2361 2361
2362 2362 invariants:
2363 2363 - rawtext is optional (can be None); if not set, cachedelta must be set.
2364 2364 if both are set, they must correspond to each other.
2365 2365 """
2366 2366 if node == self.nullid:
2367 2367 raise error.RevlogError(
2368 2368 _(b"%s: attempt to add null revision") % self.display_id
2369 2369 )
2370 2370 if (
2371 2371 node == self.nodeconstants.wdirid
2372 2372 or node in self.nodeconstants.wdirfilenodeids
2373 2373 ):
2374 2374 raise error.RevlogError(
2375 2375 _(b"%s: attempt to add wdir revision") % self.display_id
2376 2376 )
2377 2377 if self._writinghandles is None:
2378 2378 msg = b'adding revision outside `revlog._writing` context'
2379 2379 raise error.ProgrammingError(msg)
2380 2380
2381 2381 if self._inline:
2382 2382 fh = self._writinghandles[0]
2383 2383 else:
2384 2384 fh = self._writinghandles[1]
2385 2385
2386 2386 btext = [rawtext]
2387 2387
2388 2388 curr = len(self)
2389 2389 prev = curr - 1
2390 2390
2391 2391 offset = self._get_data_offset(prev)
2392 2392
2393 2393 if self._concurrencychecker:
2394 2394 ifh, dfh, sdfh = self._writinghandles
2395 2395 # XXX no checking for the sidedata file
2396 2396 if self._inline:
2397 2397 # offset is "as if" it were in the .d file, so we need to add on
2398 2398 # the size of the entry metadata.
2399 2399 self._concurrencychecker(
2400 2400 ifh, self._indexfile, offset + curr * self.index.entry_size
2401 2401 )
2402 2402 else:
2403 2403 # Entries in the .i are a consistent size.
2404 2404 self._concurrencychecker(
2405 2405 ifh, self._indexfile, curr * self.index.entry_size
2406 2406 )
2407 2407 self._concurrencychecker(dfh, self._datafile, offset)
2408 2408
2409 2409 p1r, p2r = self.rev(p1), self.rev(p2)
2410 2410
2411 2411 # full versions are inserted when the needed deltas
2412 2412 # become comparable to the uncompressed text
2413 2413 if rawtext is None:
2414 2414 # need rawtext size, before changed by flag processors, which is
2415 2415 # the non-raw size. use revlog explicitly to avoid filelog's extra
2416 2416 # logic that might remove metadata size.
2417 2417 textlen = mdiff.patchedsize(
2418 2418 revlog.size(self, cachedelta[0]), cachedelta[1]
2419 2419 )
2420 2420 else:
2421 2421 textlen = len(rawtext)
2422 2422
2423 2423 if deltacomputer is None:
2424 2424 deltacomputer = deltautil.deltacomputer(self)
2425 2425
2426 2426 revinfo = revlogutils.revisioninfo(
2427 2427 node,
2428 2428 p1,
2429 2429 p2,
2430 2430 btext,
2431 2431 textlen,
2432 2432 cachedelta,
2433 2433 flags,
2434 2434 )
2435 2435
2436 2436 deltainfo = deltacomputer.finddeltainfo(revinfo, fh)
2437 2437
2438 2438 compression_mode = COMP_MODE_INLINE
2439 2439 if self._docket is not None:
2440 2440 default_comp = self._docket.default_compression_header
2441 2441 r = deltautil.delta_compression(default_comp, deltainfo)
2442 2442 compression_mode, deltainfo = r
2443 2443
2444 2444 sidedata_compression_mode = COMP_MODE_INLINE
2445 2445 if sidedata and self.hassidedata:
2446 2446 sidedata_compression_mode = COMP_MODE_PLAIN
2447 2447 serialized_sidedata = sidedatautil.serialize_sidedata(sidedata)
2448 2448 sidedata_offset = self._docket.sidedata_end
2449 2449 h, comp_sidedata = self.compress(serialized_sidedata)
2450 2450 if (
2451 2451 h != b'u'
2452 2452 and comp_sidedata[0:1] != b'\0'
2453 2453 and len(comp_sidedata) < len(serialized_sidedata)
2454 2454 ):
2455 2455 assert not h
2456 2456 if (
2457 2457 comp_sidedata[0:1]
2458 2458 == self._docket.default_compression_header
2459 2459 ):
2460 2460 sidedata_compression_mode = COMP_MODE_DEFAULT
2461 2461 serialized_sidedata = comp_sidedata
2462 2462 else:
2463 2463 sidedata_compression_mode = COMP_MODE_INLINE
2464 2464 serialized_sidedata = comp_sidedata
2465 2465 else:
2466 2466 serialized_sidedata = b""
2467 2467 # Don't store the offset if the sidedata is empty; that way
2468 2468 # we can easily detect empty sidedata, and it will be no different
2469 2469 # from sidedata we add manually.
2470 2470 sidedata_offset = 0
2471 2471
2472 2472 e = revlogutils.entry(
2473 2473 flags=flags,
2474 2474 data_offset=offset,
2475 2475 data_compressed_length=deltainfo.deltalen,
2476 2476 data_uncompressed_length=textlen,
2477 2477 data_compression_mode=compression_mode,
2478 2478 data_delta_base=deltainfo.base,
2479 2479 link_rev=link,
2480 2480 parent_rev_1=p1r,
2481 2481 parent_rev_2=p2r,
2482 2482 node_id=node,
2483 2483 sidedata_offset=sidedata_offset,
2484 2484 sidedata_compressed_length=len(serialized_sidedata),
2485 2485 sidedata_compression_mode=sidedata_compression_mode,
2486 2486 )
2487 2487
2488 2488 self.index.append(e)
2489 2489 entry = self.index.entry_binary(curr)
2490 2490 if curr == 0 and self._docket is None:
2491 2491 header = self._format_flags | self._format_version
2492 2492 header = self.index.pack_header(header)
2493 2493 entry = header + entry
2494 2494 self._writeentry(
2495 2495 transaction,
2496 2496 entry,
2497 2497 deltainfo.data,
2498 2498 link,
2499 2499 offset,
2500 2500 serialized_sidedata,
2501 2501 sidedata_offset,
2502 2502 )
2503 2503
2504 2504 rawtext = btext[0]
2505 2505
2506 2506 if alwayscache and rawtext is None:
2507 2507 rawtext = deltacomputer.buildtext(revinfo, fh)
2508 2508
2509 2509 if type(rawtext) == bytes: # only accept immutable objects
2510 2510 self._revisioncache = (node, curr, rawtext)
2511 2511 self._chainbasecache[curr] = deltainfo.chainbase
2512 2512 return curr
2513 2513
2514 2514 def _get_data_offset(self, prev):
2515 2515 """Returns the current offset in the (in-transaction) data file.
2516 2516 Versions < 2 of the revlog can get this in O(1); revlog v2 needs a docket
2517 2517 file to store that information: since sidedata can be rewritten to the
2518 2518 end of the data file within a transaction, you can have cases where, for
2519 2519 example, rev `n` does not have sidedata while rev `n - 1` does, leading
2520 2520 to `n - 1`'s sidedata being written after `n`'s data.
2521 2521
2522 2522 TODO cache this in a docket file before getting out of experimental."""
2523 2523 if self._docket is None:
2524 2524 return self.end(prev)
2525 2525 else:
2526 2526 return self._docket.data_end
2527 2527
2528 2528 def _writeentry(
2529 2529 self, transaction, entry, data, link, offset, sidedata, sidedata_offset
2530 2530 ):
2531 2531 # Files opened in a+ mode have inconsistent behavior on various
2532 2532 # platforms. Windows requires that a file positioning call be made
2533 2533 # when the file handle transitions between reads and writes. See
2534 2534 # 3686fa2b8eee and the mixedfilemodewrapper in windows.py. On other
2535 2535 # platforms, Python or the platform itself can be buggy. Some versions
2536 2536 # of Solaris have been observed to not append at the end of the file
2537 2537 # if the file was seeked to before the end. See issue4943 for more.
2538 2538 #
2539 2539 # We work around this issue by inserting a seek() before writing.
2540 2540 # Note: This is likely not necessary on Python 3. However, because
2541 2541 # the file handle is reused for reads and may be seeked there, we need
2542 2542 # to be careful before changing this.
2543 2543 if self._writinghandles is None:
2544 2544 msg = b'adding revision outside `revlog._writing` context'
2545 2545 raise error.ProgrammingError(msg)
2546 2546 ifh, dfh, sdfh = self._writinghandles
2547 2547 if self._docket is None:
2548 2548 ifh.seek(0, os.SEEK_END)
2549 2549 else:
2550 2550 ifh.seek(self._docket.index_end, os.SEEK_SET)
2551 2551 if dfh:
2552 2552 if self._docket is None:
2553 2553 dfh.seek(0, os.SEEK_END)
2554 2554 else:
2555 2555 dfh.seek(self._docket.data_end, os.SEEK_SET)
2556 2556 if sdfh:
2557 2557 sdfh.seek(self._docket.sidedata_end, os.SEEK_SET)
2558 2558
2559 2559 curr = len(self) - 1
2560 2560 if not self._inline:
2561 2561 transaction.add(self._datafile, offset)
2562 2562 if self._sidedatafile:
2563 2563 transaction.add(self._sidedatafile, sidedata_offset)
2564 2564 transaction.add(self._indexfile, curr * len(entry))
2565 2565 if data[0]:
2566 2566 dfh.write(data[0])
2567 2567 dfh.write(data[1])
2568 2568 if sidedata:
2569 2569 sdfh.write(sidedata)
2570 2570 ifh.write(entry)
2571 2571 else:
2572 2572 offset += curr * self.index.entry_size
2573 2573 transaction.add(self._indexfile, offset)
2574 2574 ifh.write(entry)
2575 2575 ifh.write(data[0])
2576 2576 ifh.write(data[1])
2577 2577 assert not sidedata
2578 2578 self._enforceinlinesize(transaction)
2579 2579 if self._docket is not None:
2580 2580 self._docket.index_end = self._writinghandles[0].tell()
2581 2581 self._docket.data_end = self._writinghandles[1].tell()
2582 2582 self._docket.sidedata_end = self._writinghandles[2].tell()
2583 2583
2584 2584 nodemaputil.setup_persistent_nodemap(transaction, self)
2585 2585
2586 2586 def addgroup(
2587 2587 self,
2588 2588 deltas,
2589 2589 linkmapper,
2590 2590 transaction,
2591 2591 alwayscache=False,
2592 2592 addrevisioncb=None,
2593 2593 duplicaterevisioncb=None,
2594 2594 ):
2595 2595 """
2596 2596 add a delta group
2597 2597
2598 2598 given a set of deltas, add them to the revision log. the
2599 2599 first delta is against its parent, which should be in our
2600 2600 log, the rest are against the previous delta.
2601 2601
2602 2602 If ``addrevisioncb`` is defined, it will be called with arguments of
2603 2603 this revlog and the node that was added.
2604 2604 """
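# Editorial note (not part of revlog.py): each element of ``deltas`` is
# unpacked below as an 8-tuple, roughly
#
#     (node, p1, p2, linknode, deltabase, delta, flags, sidedata)
#
# where ``delta`` is a binary patch against the text of ``deltabase`` (or a
# full replacement when the base revision is censored, as checked further
# down).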
2605 2605
2606 2606 if self._adding_group:
2607 2607 raise error.ProgrammingError(b'cannot nest addgroup() calls')
2608 2608
2609 2609 self._adding_group = True
2610 2610 empty = True
2611 2611 try:
2612 2612 with self._writing(transaction):
2613 2613 deltacomputer = deltautil.deltacomputer(self)
2614 2614 # loop through our set of deltas
2615 2615 for data in deltas:
2616 2616 (
2617 2617 node,
2618 2618 p1,
2619 2619 p2,
2620 2620 linknode,
2621 2621 deltabase,
2622 2622 delta,
2623 2623 flags,
2624 2624 sidedata,
2625 2625 ) = data
2626 2626 link = linkmapper(linknode)
2627 2627 flags = flags or REVIDX_DEFAULT_FLAGS
2628 2628
2629 2629 rev = self.index.get_rev(node)
2630 2630 if rev is not None:
2631 2631 # this can happen if two branches make the same change
2632 2632 self._nodeduplicatecallback(transaction, rev)
2633 2633 if duplicaterevisioncb:
2634 2634 duplicaterevisioncb(self, rev)
2635 2635 empty = False
2636 2636 continue
2637 2637
2638 2638 for p in (p1, p2):
2639 2639 if not self.index.has_node(p):
2640 2640 raise error.LookupError(
2641 2641 p, self.radix, _(b'unknown parent')
2642 2642 )
2643 2643
2644 2644 if not self.index.has_node(deltabase):
2645 2645 raise error.LookupError(
2646 2646 deltabase, self.display_id, _(b'unknown delta base')
2647 2647 )
2648 2648
2649 2649 baserev = self.rev(deltabase)
2650 2650
2651 2651 if baserev != nullrev and self.iscensored(baserev):
2652 2652 # if base is censored, delta must be full replacement in a
2653 2653 # single patch operation
2654 2654 hlen = struct.calcsize(b">lll")
2655 2655 oldlen = self.rawsize(baserev)
2656 2656 newlen = len(delta) - hlen
2657 2657 if delta[:hlen] != mdiff.replacediffheader(
2658 2658 oldlen, newlen
2659 2659 ):
2660 2660 raise error.CensoredBaseError(
2661 2661 self.display_id, self.node(baserev)
2662 2662 )
2663 2663
2664 2664 if not flags and self._peek_iscensored(baserev, delta):
2665 2665 flags |= REVIDX_ISCENSORED
2666 2666
2667 2667 # We assume consumers of addrevisioncb will want to retrieve
2668 2668 # the added revision, which will require a call to
2669 2669 # revision(). revision() will fast path if there is a cache
2670 2670 # hit. So, we tell _addrevision() to always cache in this case.
2671 2671 # We're only using addgroup() in the context of changegroup
2672 2672 # generation so the revision data can always be handled as raw
2673 2673 # by the flagprocessor.
2674 2674 rev = self._addrevision(
2675 2675 node,
2676 2676 None,
2677 2677 transaction,
2678 2678 link,
2679 2679 p1,
2680 2680 p2,
2681 2681 flags,
2682 2682 (baserev, delta),
2683 2683 alwayscache=alwayscache,
2684 2684 deltacomputer=deltacomputer,
2685 2685 sidedata=sidedata,
2686 2686 )
2687 2687
2688 2688 if addrevisioncb:
2689 2689 addrevisioncb(self, rev)
2690 2690 empty = False
2691 2691 finally:
2692 2692 self._adding_group = False
2693 2693 return not empty
2694 2694
2695 2695 def iscensored(self, rev):
2696 2696 """Check if a file revision is censored."""
2697 2697 if not self._censorable:
2698 2698 return False
2699 2699
2700 2700 return self.flags(rev) & REVIDX_ISCENSORED
2701 2701
2702 2702 def _peek_iscensored(self, baserev, delta):
2703 2703 """Quickly check if a delta produces a censored revision."""
2704 2704 if not self._censorable:
2705 2705 return False
2706 2706
2707 2707 return storageutil.deltaiscensored(delta, baserev, self.rawsize)
2708 2708
2709 2709 def getstrippoint(self, minlink):
2710 2710 """find the minimum rev that must be stripped to strip the linkrev
2711 2711
2712 2712 Returns a tuple containing the minimum rev and a set of all revs that
2713 2713 have linkrevs that will be broken by this strip.
2714 2714 """
2715 2715 return storageutil.resolvestripinfo(
2716 2716 minlink,
2717 2717 len(self) - 1,
2718 2718 self.headrevs(),
2719 2719 self.linkrev,
2720 2720 self.parentrevs,
2721 2721 )
2722 2722
2723 2723 def strip(self, minlink, transaction):
2724 2724 """truncate the revlog on the first revision with a linkrev >= minlink
2725 2725
2726 2726 This function is called when we're stripping revision minlink and
2727 2727 its descendants from the repository.
2728 2728
2729 2729 We have to remove all revisions with linkrev >= minlink, because
2730 2730 the equivalent changelog revisions will be renumbered after the
2731 2731 strip.
2732 2732
2733 2733 So we truncate the revlog on the first of these revisions, and
2734 2734 trust that the caller has saved the revisions that shouldn't be
2735 2735 removed and that it'll re-add them after this truncation.
2736 2736 """
2737 2737 if len(self) == 0:
2738 2738 return
2739 2739
2740 2740 rev, _ = self.getstrippoint(minlink)
2741 2741 if rev == len(self):
2742 2742 return
2743 2743
2744 2744 # first truncate the files on disk
2745 2745 data_end = self.start(rev)
2746 2746 if not self._inline:
2747 2747 transaction.add(self._datafile, data_end)
2748 2748 end = rev * self.index.entry_size
2749 2749 else:
2750 2750 end = data_end + (rev * self.index.entry_size)
2751 2751
2752 2752 if self._sidedatafile:
2753 2753 sidedata_end = self.sidedata_cut_off(rev)
2754 2754 transaction.add(self._sidedatafile, sidedata_end)
2755 2755
2756 2756 transaction.add(self._indexfile, end)
2757 2757 if self._docket is not None:
2758 2758 # XXX we could leverage the docket while stripping. However it is
2759 2759 # not powerful enough at the time of this comment
2760 2760 self._docket.index_end = end
2761 2761 self._docket.data_end = data_end
2762 2762 self._docket.sidedata_end = sidedata_end
2763 2763 self._docket.write(transaction, stripping=True)
2764 2764
2765 2765 # then reset internal state in memory to forget those revisions
2766 2766 self._revisioncache = None
2767 2767 self._chaininfocache = util.lrucachedict(500)
2768 2768 self._segmentfile.clear_cache()
2769 2769 self._segmentfile_sidedata.clear_cache()
2770 2770
2771 2771 del self.index[rev:-1]
2772 2772
2773 2773 def checksize(self):
2774 2774 """Check size of index and data files
2775 2775
2776 2776 return a (dd, di) tuple.
2777 2777 - dd: extra bytes for the "data" file
2778 2778 - di: extra bytes for the "index" file
2779 2779
2780 2780 A healthy revlog will return (0, 0).
2781 2781 """
2782 2782 expected = 0
2783 2783 if len(self):
2784 2784 expected = max(0, self.end(len(self) - 1))
2785 2785
2786 2786 try:
2787 2787 with self._datafp() as f:
2788 2788 f.seek(0, io.SEEK_END)
2789 2789 actual = f.tell()
2790 2790 dd = actual - expected
2791 2791 except IOError as inst:
2792 2792 if inst.errno != errno.ENOENT:
2793 2793 raise
2794 2794 dd = 0
2795 2795
2796 2796 try:
2797 2797 f = self.opener(self._indexfile)
2798 2798 f.seek(0, io.SEEK_END)
2799 2799 actual = f.tell()
2800 2800 f.close()
2801 2801 s = self.index.entry_size
2802 2802 i = max(0, actual // s)
2803 2803 di = actual - (i * s)
2804 2804 if self._inline:
2805 2805 databytes = 0
2806 2806 for r in self:
2807 2807 databytes += max(0, self.length(r))
2808 2808 dd = 0
2809 2809 di = actual - len(self) * s - databytes
2810 2810 except IOError as inst:
2811 2811 if inst.errno != errno.ENOENT:
2812 2812 raise
2813 2813 di = 0
2814 2814
2815 2815 return (dd, di)
2816 2816
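# Editorial note (not part of revlog.py): a healthy revlog returns (0, 0)
# from checksize() above; a caller might use it roughly as
#
#     dd, di = rl.checksize()
#     if dd or di:
#         pass  # report the extra bytes in the data / index files
#
# which mirrors how verifyintegrity() further below turns dd and di into
# revlogproblem entries.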
2817 2817 def files(self):
2818 2818 res = [self._indexfile]
2819 2819 if self._docket_file is None:
2820 2820 if not self._inline:
2821 2821 res.append(self._datafile)
2822 2822 else:
2823 2823 res.append(self._docket_file)
2824 2824 res.extend(self._docket.old_index_filepaths(include_empty=False))
2825 2825 if self._docket.data_end:
2826 2826 res.append(self._datafile)
2827 2827 res.extend(self._docket.old_data_filepaths(include_empty=False))
2828 2828 if self._docket.sidedata_end:
2829 2829 res.append(self._sidedatafile)
2830 2830 res.extend(self._docket.old_sidedata_filepaths(include_empty=False))
2831 2831 return res
2832 2832
2833 2833 def emitrevisions(
2834 2834 self,
2835 2835 nodes,
2836 2836 nodesorder=None,
2837 2837 revisiondata=False,
2838 2838 assumehaveparentrevisions=False,
2839 2839 deltamode=repository.CG_DELTAMODE_STD,
2840 2840 sidedata_helpers=None,
2841 2841 ):
2842 2842 if nodesorder not in (b'nodes', b'storage', b'linear', None):
2843 2843 raise error.ProgrammingError(
2844 2844 b'unhandled value for nodesorder: %s' % nodesorder
2845 2845 )
2846 2846
2847 2847 if nodesorder is None and not self._generaldelta:
2848 2848 nodesorder = b'storage'
2849 2849
2850 2850 if (
2851 2851 not self._storedeltachains
2852 2852 and deltamode != repository.CG_DELTAMODE_PREV
2853 2853 ):
2854 2854 deltamode = repository.CG_DELTAMODE_FULL
2855 2855
2856 2856 return storageutil.emitrevisions(
2857 2857 self,
2858 2858 nodes,
2859 2859 nodesorder,
2860 2860 revlogrevisiondelta,
2861 2861 deltaparentfn=self.deltaparent,
2862 2862 candeltafn=self.candelta,
2863 2863 rawsizefn=self.rawsize,
2864 2864 revdifffn=self.revdiff,
2865 2865 flagsfn=self.flags,
2866 2866 deltamode=deltamode,
2867 2867 revisiondata=revisiondata,
2868 2868 assumehaveparentrevisions=assumehaveparentrevisions,
2869 2869 sidedata_helpers=sidedata_helpers,
2870 2870 )
2871 2871
2872 2872 DELTAREUSEALWAYS = b'always'
2873 2873 DELTAREUSESAMEREVS = b'samerevs'
2874 2874 DELTAREUSENEVER = b'never'
2875 2875
2876 2876 DELTAREUSEFULLADD = b'fulladd'
2877 2877
2878 2878 DELTAREUSEALL = {b'always', b'samerevs', b'never', b'fulladd'}
2879 2879
2880 2880 def clone(
2881 2881 self,
2882 2882 tr,
2883 2883 destrevlog,
2884 2884 addrevisioncb=None,
2885 2885 deltareuse=DELTAREUSESAMEREVS,
2886 2886 forcedeltabothparents=None,
2887 2887 sidedata_helpers=None,
2888 2888 ):
2889 2889 """Copy this revlog to another, possibly with format changes.
2890 2890
2891 2891 The destination revlog will contain the same revisions and nodes.
2892 2892 However, it may not be bit-for-bit identical due to e.g. delta encoding
2893 2893 differences.
2894 2894
2895 2895 The ``deltareuse`` argument controls how deltas from the existing revlog
2896 2896 are preserved in the destination revlog. The argument can have the
2897 2897 following values:
2898 2898
2899 2899 DELTAREUSEALWAYS
2900 2900 Deltas will always be reused (if possible), even if the destination
2901 2901 revlog would not select the same revisions for the delta. This is the
2902 2902 fastest mode of operation.
2903 2903 DELTAREUSESAMEREVS
2904 2904 Deltas will be reused if the destination revlog would pick the same
2905 2905 revisions for the delta. This mode strikes a balance between speed
2906 2906 and optimization.
2907 2907 DELTAREUSENEVER
2908 2908 Deltas will never be reused. This is the slowest mode of execution.
2909 2909 This mode can be used to recompute deltas (e.g. if the diff/delta
2910 2910 algorithm changes).
2911 2911 DELTAREUSEFULLADD
2912 2912 Revisions will be re-added as if they were new content. This is
2913 2913 slower than DELTAREUSEALWAYS but allows more mechanisms to kick in,
2914 2914 e.g. large file detection and handling.
2915 2915
2916 2916 Delta computation can be slow, so the choice of delta reuse policy can
2917 2917 significantly affect run time.
2918 2918
2919 2919 The default policy (``DELTAREUSESAMEREVS``) strikes a balance between
2920 2920 two extremes. Deltas will be reused if they are appropriate. But if the
2921 2921 delta could choose a better revision, it will do so. This means if you
2922 2922 are converting a non-generaldelta revlog to a generaldelta revlog,
2923 2923 deltas will be recomputed if the delta's parent isn't a parent of the
2924 2924 revision.
2925 2925
2926 2926 In addition to the delta policy, the ``forcedeltabothparents``
2927 2927 argument controls whether to force computing deltas against both parents
2928 2928 for merges. If unset, the destination revlog's current setting is used.
2929 2929
2930 2930 See `revlogutil.sidedata.get_sidedata_helpers` for the doc on
2931 2931 `sidedata_helpers`.
2932 2932 """
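# Editorial sketch (not part of revlog.py): a hypothetical call that clones
# this revlog into an empty destination while recomputing every delta:
#
#     src.clone(tr, dst, deltareuse=src.DELTAREUSENEVER)
#
# where ``tr`` is an open transaction and ``dst`` an empty destination
# revlog; the names are illustrative only.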
2933 2933 if deltareuse not in self.DELTAREUSEALL:
2934 2934 raise ValueError(
2935 2935 _(b'value for deltareuse invalid: %s') % deltareuse
2936 2936 )
2937 2937
2938 2938 if len(destrevlog):
2939 2939 raise ValueError(_(b'destination revlog is not empty'))
2940 2940
2941 2941 if getattr(self, 'filteredrevs', None):
2942 2942 raise ValueError(_(b'source revlog has filtered revisions'))
2943 2943 if getattr(destrevlog, 'filteredrevs', None):
2944 2944 raise ValueError(_(b'destination revlog has filtered revisions'))
2945 2945
2946 2946 # lazydelta and lazydeltabase controls whether to reuse a cached delta,
2947 2947 # if possible.
2948 2948 oldlazydelta = destrevlog._lazydelta
2949 2949 oldlazydeltabase = destrevlog._lazydeltabase
2950 2950 oldamd = destrevlog._deltabothparents
2951 2951
2952 2952 try:
2953 2953 if deltareuse == self.DELTAREUSEALWAYS:
2954 2954 destrevlog._lazydeltabase = True
2955 2955 destrevlog._lazydelta = True
2956 2956 elif deltareuse == self.DELTAREUSESAMEREVS:
2957 2957 destrevlog._lazydeltabase = False
2958 2958 destrevlog._lazydelta = True
2959 2959 elif deltareuse == self.DELTAREUSENEVER:
2960 2960 destrevlog._lazydeltabase = False
2961 2961 destrevlog._lazydelta = False
2962 2962
2963 2963 destrevlog._deltabothparents = forcedeltabothparents or oldamd
2964 2964
2965 2965 self._clone(
2966 2966 tr,
2967 2967 destrevlog,
2968 2968 addrevisioncb,
2969 2969 deltareuse,
2970 2970 forcedeltabothparents,
2971 2971 sidedata_helpers,
2972 2972 )
2973 2973
2974 2974 finally:
2975 2975 destrevlog._lazydelta = oldlazydelta
2976 2976 destrevlog._lazydeltabase = oldlazydeltabase
2977 2977 destrevlog._deltabothparents = oldamd
2978 2978
2979 2979 def _clone(
2980 2980 self,
2981 2981 tr,
2982 2982 destrevlog,
2983 2983 addrevisioncb,
2984 2984 deltareuse,
2985 2985 forcedeltabothparents,
2986 2986 sidedata_helpers,
2987 2987 ):
2988 2988 """perform the core duty of `revlog.clone` after parameter processing"""
2989 2989 deltacomputer = deltautil.deltacomputer(destrevlog)
2990 2990 index = self.index
2991 2991 for rev in self:
2992 2992 entry = index[rev]
2993 2993
2994 2994 # Some classes override linkrev to take filtered revs into
2995 2995 # account. Use raw entry from index.
2996 2996 flags = entry[0] & 0xFFFF
2997 2997 linkrev = entry[4]
2998 2998 p1 = index[entry[5]][7]
2999 2999 p2 = index[entry[6]][7]
3000 3000 node = entry[7]
3001 3001
3002 3002 # (Possibly) reuse the delta from the revlog if allowed and
3003 3003 # the revlog chunk is a delta.
3004 3004 cachedelta = None
3005 3005 rawtext = None
3006 3006 if deltareuse == self.DELTAREUSEFULLADD:
3007 3007 text = self._revisiondata(rev)
3008 3008 sidedata = self.sidedata(rev)
3009 3009
3010 3010 if sidedata_helpers is not None:
3011 3011 (sidedata, new_flags) = sidedatautil.run_sidedata_helpers(
3012 3012 self, sidedata_helpers, sidedata, rev
3013 3013 )
3014 3014 flags = flags | new_flags[0] & ~new_flags[1]
3015 3015
3016 3016 destrevlog.addrevision(
3017 3017 text,
3018 3018 tr,
3019 3019 linkrev,
3020 3020 p1,
3021 3021 p2,
3022 3022 cachedelta=cachedelta,
3023 3023 node=node,
3024 3024 flags=flags,
3025 3025 deltacomputer=deltacomputer,
3026 3026 sidedata=sidedata,
3027 3027 )
3028 3028 else:
3029 3029 if destrevlog._lazydelta:
3030 3030 dp = self.deltaparent(rev)
3031 3031 if dp != nullrev:
3032 3032 cachedelta = (dp, bytes(self._chunk(rev)))
3033 3033
3034 3034 sidedata = None
3035 3035 if not cachedelta:
3036 3036 rawtext = self._revisiondata(rev)
3037 3037 sidedata = self.sidedata(rev)
3038 3038 if sidedata is None:
3039 3039 sidedata = self.sidedata(rev)
3040 3040
3041 3041 if sidedata_helpers is not None:
3042 3042 (sidedata, new_flags) = sidedatautil.run_sidedata_helpers(
3043 3043 self, sidedata_helpers, sidedata, rev
3044 3044 )
3045 3045 flags = flags | new_flags[0] & ~new_flags[1]
3046 3046
3047 3047 with destrevlog._writing(tr):
3048 3048 destrevlog._addrevision(
3049 3049 node,
3050 3050 rawtext,
3051 3051 tr,
3052 3052 linkrev,
3053 3053 p1,
3054 3054 p2,
3055 3055 flags,
3056 3056 cachedelta,
3057 3057 deltacomputer=deltacomputer,
3058 3058 sidedata=sidedata,
3059 3059 )
3060 3060
3061 3061 if addrevisioncb:
3062 3062 addrevisioncb(self, rev, node)
3063 3063
3064 3064 def censorrevision(self, tr, censornode, tombstone=b''):
3065 3065 if self._format_version == REVLOGV0:
3066 3066 raise error.RevlogError(
3067 3067 _(b'cannot censor with version %d revlogs')
3068 3068 % self._format_version
3069 3069 )
3070 3070 elif self._format_version == REVLOGV1:
3071 3071 censor.v1_censor(self, tr, censornode, tombstone)
3072 3072 else:
3073 # revlog v2
3074 raise error.RevlogError(
3075 _(b'cannot censor with version %d revlogs')
3076 % self._format_version
3077 )
3073 censor.v2_censor(self, tr, censornode, tombstone)
3078 3074
3079 3075 def verifyintegrity(self, state):
3080 3076 """Verifies the integrity of the revlog.
3081 3077
3082 3078 Yields ``revlogproblem`` instances describing problems that are
3083 3079 found.
3084 3080 """
3085 3081 dd, di = self.checksize()
3086 3082 if dd:
3087 3083 yield revlogproblem(error=_(b'data length off by %d bytes') % dd)
3088 3084 if di:
3089 3085 yield revlogproblem(error=_(b'index contains %d extra bytes') % di)
3090 3086
3091 3087 version = self._format_version
3092 3088
3093 3089 # The verifier tells us what version revlog we should be.
3094 3090 if version != state[b'expectedversion']:
3095 3091 yield revlogproblem(
3096 3092 warning=_(b"warning: '%s' uses revlog format %d; expected %d")
3097 3093 % (self.display_id, version, state[b'expectedversion'])
3098 3094 )
3099 3095
3100 3096 state[b'skipread'] = set()
3101 3097 state[b'safe_renamed'] = set()
3102 3098
3103 3099 for rev in self:
3104 3100 node = self.node(rev)
3105 3101
3106 3102 # Verify contents. 4 cases to care about:
3107 3103 #
3108 3104 # common: the most common case
3109 3105 # rename: with a rename
3110 3106 # meta: file content starts with b'\1\n', the metadata
3111 3107 # header defined in filelog.py, but without a rename
3112 3108 # ext: content stored externally
3113 3109 #
3114 3110 # More formally, their differences are shown below:
3115 3111 #
3116 3112 # | common | rename | meta | ext
3117 3113 # -------------------------------------------------------
3118 3114 # flags() | 0 | 0 | 0 | not 0
3119 3115 # renamed() | False | True | False | ?
3120 3116 # rawtext[0:2]=='\1\n'| False | True | True | ?
3121 3117 #
3122 3118 # "rawtext" means the raw text stored in revlog data, which
3123 3119 # could be retrieved by "rawdata(rev)". "text"
3124 3120 # mentioned below is "revision(rev)".
3125 3121 #
3126 3122 # There are 3 different lengths stored physically:
3127 3123 # 1. L1: rawsize, stored in revlog index
3128 3124 # 2. L2: len(rawtext), stored in revlog data
3129 3125 # 3. L3: len(text), stored in revlog data if flags==0, or
3130 3126 # possibly somewhere else if flags!=0
3131 3127 #
3132 3128 # L1 should be equal to L2. L3 could be different from them.
3133 3129 # "text" may or may not affect commit hash depending on flag
3134 3130 # processors (see flagutil.addflagprocessor).
3135 3131 #
3136 3132 # | common | rename | meta | ext
3137 3133 # -------------------------------------------------
3138 3134 # rawsize() | L1 | L1 | L1 | L1
3139 3135 # size() | L1 | L2-LM | L1(*) | L1 (?)
3140 3136 # len(rawtext) | L2 | L2 | L2 | L2
3141 3137 # len(text) | L2 | L2 | L2 | L3
3142 3138 # len(read()) | L2 | L2-LM | L2-LM | L3 (?)
3143 3139 #
3144 3140 # LM: length of metadata, depending on rawtext
3145 3141 # (*): not ideal, see comment in filelog.size
3146 3142 # (?): could be "- len(meta)" if the resolved content has
3147 3143 # rename metadata
3148 3144 #
3149 3145 # Checks needed to be done:
3150 3146 # 1. length check: L1 == L2, in all cases.
3151 3147 # 2. hash check: depending on flag processor, we may need to
3152 3148 # use either "text" (external), or "rawtext" (in revlog).
3153 3149
3154 3150 try:
3155 3151 skipflags = state.get(b'skipflags', 0)
3156 3152 if skipflags:
3157 3153 skipflags &= self.flags(rev)
3158 3154
3159 3155 _verify_revision(self, skipflags, state, node)
3160 3156
3161 3157 l1 = self.rawsize(rev)
3162 3158 l2 = len(self.rawdata(node))
3163 3159
3164 3160 if l1 != l2:
3165 3161 yield revlogproblem(
3166 3162 error=_(b'unpacked size is %d, %d expected') % (l2, l1),
3167 3163 node=node,
3168 3164 )
3169 3165
3170 3166 except error.CensoredNodeError:
3171 3167 if state[b'erroroncensored']:
3172 3168 yield revlogproblem(
3173 3169 error=_(b'censored file data'), node=node
3174 3170 )
3175 3171 state[b'skipread'].add(node)
3176 3172 except Exception as e:
3177 3173 yield revlogproblem(
3178 3174 error=_(b'unpacking %s: %s')
3179 3175 % (short(node), stringutil.forcebytestr(e)),
3180 3176 node=node,
3181 3177 )
3182 3178 state[b'skipread'].add(node)
3183 3179
3184 3180 def storageinfo(
3185 3181 self,
3186 3182 exclusivefiles=False,
3187 3183 sharedfiles=False,
3188 3184 revisionscount=False,
3189 3185 trackedsize=False,
3190 3186 storedsize=False,
3191 3187 ):
3192 3188 d = {}
3193 3189
3194 3190 if exclusivefiles:
3195 3191 d[b'exclusivefiles'] = [(self.opener, self._indexfile)]
3196 3192 if not self._inline:
3197 3193 d[b'exclusivefiles'].append((self.opener, self._datafile))
3198 3194
3199 3195 if sharedfiles:
3200 3196 d[b'sharedfiles'] = []
3201 3197
3202 3198 if revisionscount:
3203 3199 d[b'revisionscount'] = len(self)
3204 3200
3205 3201 if trackedsize:
3206 3202 d[b'trackedsize'] = sum(map(self.rawsize, iter(self)))
3207 3203
3208 3204 if storedsize:
3209 3205 d[b'storedsize'] = sum(
3210 3206 self.opener.stat(path).st_size for path in self.files()
3211 3207 )
3212 3208
3213 3209 return d
3214 3210
3215 3211 def rewrite_sidedata(self, transaction, helpers, startrev, endrev):
3216 3212 if not self.hassidedata:
3217 3213 return
3218 3214 # revlog formats with sidedata support do not support inline
3219 3215 assert not self._inline
3220 3216 if not helpers[1] and not helpers[2]:
3221 3217 # Nothing to generate or remove
3222 3218 return
3223 3219
3224 3220 new_entries = []
3225 3221 # append the new sidedata
3226 3222 with self._writing(transaction):
3227 3223 ifh, dfh, sdfh = self._writinghandles
3228 3224 dfh.seek(self._docket.sidedata_end, os.SEEK_SET)
3229 3225
3230 3226 current_offset = sdfh.tell()
3231 3227 for rev in range(startrev, endrev + 1):
3232 3228 entry = self.index[rev]
3233 3229 new_sidedata, flags = sidedatautil.run_sidedata_helpers(
3234 3230 store=self,
3235 3231 sidedata_helpers=helpers,
3236 3232 sidedata={},
3237 3233 rev=rev,
3238 3234 )
3239 3235
3240 3236 serialized_sidedata = sidedatautil.serialize_sidedata(
3241 3237 new_sidedata
3242 3238 )
3243 3239
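# Decide how the serialized sidedata should be flagged in the index: keep
# it uncompressed (PLAIN) unless compression actually saves space, and in
# that case record whether the chunk uses the docket's default compression
# engine (DEFAULT) or carries its own compression header (INLINE).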
3244 3240 sidedata_compression_mode = COMP_MODE_INLINE
3245 3241 if serialized_sidedata and self.hassidedata:
3246 3242 sidedata_compression_mode = COMP_MODE_PLAIN
3247 3243 h, comp_sidedata = self.compress(serialized_sidedata)
3248 3244 if (
3249 3245 h != b'u'
3250 3246 and comp_sidedata[0] != b'\0'
3251 3247 and len(comp_sidedata) < len(serialized_sidedata)
3252 3248 ):
3253 3249 assert not h
3254 3250 if (
3255 3251 comp_sidedata[0]
3256 3252 == self._docket.default_compression_header
3257 3253 ):
3258 3254 sidedata_compression_mode = COMP_MODE_DEFAULT
3259 3255 serialized_sidedata = comp_sidedata
3260 3256 else:
3261 3257 sidedata_compression_mode = COMP_MODE_INLINE
3262 3258 serialized_sidedata = comp_sidedata
3263 3259 if entry[8] != 0 or entry[9] != 0:
3264 3260 # rewriting entries that already have sidedata is not
3265 3261 # supported yet, because it introduces garbage data in the
3266 3262 # revlog.
3267 3263 msg = b"rewriting existing sidedata is not supported yet"
3268 3264 raise error.Abort(msg)
3269 3265
3270 3266 # Apply (potential) flags to add and to remove after running
3271 3267 # the sidedata helpers
3272 3268 new_offset_flags = entry[0] | flags[0] & ~flags[1]
3273 3269 entry_update = (
3274 3270 current_offset,
3275 3271 len(serialized_sidedata),
3276 3272 new_offset_flags,
3277 3273 sidedata_compression_mode,
3278 3274 )
3279 3275
3280 3276 # the sidedata computation might have moved the file cursors around
3281 3277 sdfh.seek(current_offset, os.SEEK_SET)
3282 3278 sdfh.write(serialized_sidedata)
3283 3279 new_entries.append(entry_update)
3284 3280 current_offset += len(serialized_sidedata)
3285 3281 self._docket.sidedata_end = sdfh.tell()
3286 3282
3287 3283 # rewrite the new index entries
3288 3284 ifh.seek(startrev * self.index.entry_size)
3289 3285 for i, e in enumerate(new_entries):
3290 3286 rev = startrev + i
3291 3287 self.index.replace_sidedata_info(rev, *e)
3292 3288 packed = self.index.entry_binary(rev)
3293 3289 if rev == 0 and self._docket is None:
3294 3290 header = self._format_flags | self._format_version
3295 3291 header = self.index.pack_header(header)
3296 3292 packed = header + packed
3297 3293 ifh.write(packed)
@@ -1,97 +1,359 b''
1 1 # censor code related to censoring revision
2 # coding: utf8
2 3 #
3 4 # Copyright 2021 Pierre-Yves David <pierre-yves.david@octobus.net>
4 5 # Copyright 2015 Google, Inc <martinvonz@google.com>
5 6 #
6 7 # This software may be used and distributed according to the terms of the
7 8 # GNU General Public License version 2 or any later version.
8 9
10 import contextlib
11 import os
12
9 13 from ..node import (
10 14 nullrev,
11 15 )
16 from .constants import (
17 COMP_MODE_PLAIN,
18 ENTRY_DATA_COMPRESSED_LENGTH,
19 ENTRY_DATA_COMPRESSION_MODE,
20 ENTRY_DATA_OFFSET,
21 ENTRY_DATA_UNCOMPRESSED_LENGTH,
22 ENTRY_DELTA_BASE,
23 ENTRY_LINK_REV,
24 ENTRY_NODE_ID,
25 ENTRY_PARENT_1,
26 ENTRY_PARENT_2,
27 ENTRY_SIDEDATA_COMPRESSED_LENGTH,
28 ENTRY_SIDEDATA_COMPRESSION_MODE,
29 ENTRY_SIDEDATA_OFFSET,
30 REVLOGV0,
31 REVLOGV1,
32 )
12 33 from ..i18n import _
34
13 35 from .. import (
14 36 error,
37 pycompat,
38 revlogutils,
39 util,
15 40 )
16 41 from ..utils import (
17 42 storageutil,
18 43 )
19 from . import constants
44 from . import (
45 constants,
46 deltas,
47 )
20 48
21 49
22 50 def v1_censor(rl, tr, censornode, tombstone=b''):
23 51 """censors a revision in a "version 1" revlog"""
24 52 assert rl._format_version == constants.REVLOGV1, rl._format_version
25 53
26 54 # avoid cycle
27 55 from .. import revlog
28 56
29 57 censorrev = rl.rev(censornode)
30 58 tombstone = storageutil.packmeta({b'censored': tombstone}, b'')
31 59
32 60 # Rewriting the revlog in place is hard. Our strategy for censoring is
33 61 # to create a new revlog, copy all revisions to it, then replace the
34 62 # revlogs on transaction close.
35 63 #
36 64 # This is a bit dangerous. We could easily have a mismatch of state.
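# Concretely: the copy is written under a 'tmpcensored' postfix, the
# original index/data files are registered as transaction backups, and the
# temporary files are then renamed over the originals before caches and
# index are reloaded.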
37 65 newrl = revlog.revlog(
38 66 rl.opener,
39 67 target=rl.target,
40 68 radix=rl.radix,
41 69 postfix=b'tmpcensored',
42 70 censorable=True,
43 71 )
44 72 newrl._format_version = rl._format_version
45 73 newrl._format_flags = rl._format_flags
46 74 newrl._generaldelta = rl._generaldelta
47 75 newrl._parse_index = rl._parse_index
48 76
49 77 for rev in rl.revs():
50 78 node = rl.node(rev)
51 79 p1, p2 = rl.parents(node)
52 80
53 81 if rev == censorrev:
54 82 newrl.addrawrevision(
55 83 tombstone,
56 84 tr,
57 85 rl.linkrev(censorrev),
58 86 p1,
59 87 p2,
60 88 censornode,
61 89 constants.REVIDX_ISCENSORED,
62 90 )
63 91
64 92 if newrl.deltaparent(rev) != nullrev:
65 93 m = _(b'censored revision stored as delta; cannot censor')
66 94 h = _(
67 95 b'censoring of revlogs is not fully implemented;'
68 96 b' please report this bug'
69 97 )
70 98 raise error.Abort(m, hint=h)
71 99 continue
72 100
73 101 if rl.iscensored(rev):
74 102 if rl.deltaparent(rev) != nullrev:
75 103 m = _(
76 104 b'cannot censor due to censored '
77 105 b'revision having delta stored'
78 106 )
79 107 raise error.Abort(m)
80 108 rawtext = rl._chunk(rev)
81 109 else:
82 110 rawtext = rl.rawdata(rev)
83 111
84 112 newrl.addrawrevision(
85 113 rawtext, tr, rl.linkrev(rev), p1, p2, node, rl.flags(rev)
86 114 )
87 115
88 116 tr.addbackup(rl._indexfile, location=b'store')
89 117 if not rl._inline:
90 118 tr.addbackup(rl._datafile, location=b'store')
91 119
92 120 rl.opener.rename(newrl._indexfile, rl._indexfile)
93 121 if not rl._inline:
94 122 rl.opener.rename(newrl._datafile, rl._datafile)
95 123
96 124 rl.clearcaches()
97 125 rl._loadindex()
126
127
128 def v2_censor(rl, tr, censornode, tombstone=b''):
129 """censors a revision in a "version 2" revlog"""
130 # General principle
131 #
132 # We create new revlog files (index/data/sidedata) and copy the existing
133 # content into them, leaving the censored data out.
134 #
135 # We need to compute a new delta for any revision that used the censored
136 # revision as its delta base. As the cumulative size of the new deltas may
137 # be large, we store them in a temporary file until they reach their
138 # final destination.
139 #
140 # All data before the censored data can be blindly copied. The rest needs
141 # to be copied as we go and the associated index entry needs adjustment.
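#
# In outline, the code below proceeds in three steps:
#   1. recompute a delta, against another base, for every revision that
#      currently deltas against the censored one (kept in a temp file),
#   2. copy the index, data and sidedata files up to the censored revision,
#   3. append a tombstone entry for the censored revision, then re-append
#      every later revision with adjusted offsets (reusing the recomputed
#      deltas where needed).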
142
143 assert rl._format_version != REVLOGV0, rl._format_version
144 assert rl._format_version != REVLOGV1, rl._format_version
145
146 old_index = rl.index
147 docket = rl._docket
148
149 censor_rev = rl.rev(censornode)
150 tombstone = storageutil.packmeta({b'censored': tombstone}, b'')
151
152 censored_entry = rl.index[censor_rev]
153 index_cutoff = rl.index.entry_size * censor_rev
154 data_cutoff = censored_entry[ENTRY_DATA_OFFSET] >> 16
155 sidedata_cutoff = rl.sidedata_cut_off(censor_rev)
156
157 # rev → (new_base, data_start, data_end, comp_mode)
158 rewritten_entries = {}
159
160 dc = deltas.deltacomputer(rl)
161 excl = [censor_rev]
162
163 with pycompat.unnamedtempfile(mode=b"w+b") as tmp_storage:
164 with rl._segmentfile._open_read() as dfh:
165 for rev in range(censor_rev + 1, len(old_index)):
166 entry = old_index[rev]
167 if censor_rev != entry[ENTRY_DELTA_BASE]:
168 continue
169 # This is a revision that uses the censored revision as the base
170 # for its delta. We need to compute a new delta for it.
171 if entry[ENTRY_DATA_UNCOMPRESSED_LENGTH] == 0:
172 # this revision is empty, we can delta against nullrev
173 rewritten_entries[rev] = (nullrev, 0, 0, COMP_MODE_PLAIN)
174 else:
175
176 text = rl.rawdata(rev, _df=dfh)
177 info = revlogutils.revisioninfo(
178 node=entry[ENTRY_NODE_ID],
179 p1=rl.node(entry[ENTRY_PARENT_1]),
180 p2=rl.node(entry[ENTRY_PARENT_2]),
181 btext=[text],
182 textlen=len(text),
183 cachedelta=None,
184 flags=entry[ENTRY_DATA_OFFSET] & 0xFFFF,
185 )
186 d = dc.finddeltainfo(
187 info, dfh, excluded_bases=excl, target_rev=rev
188 )
189 default_comp = rl._docket.default_compression_header
190 comp_mode, d = deltas.delta_compression(default_comp, d)
191 # using `tell` is a bit lazy, but we are not here for speed
192 start = tmp_storage.tell()
193 tmp_storage.write(d.data[1])
194 end = tmp_storage.tell()
195 rewritten_entries[rev] = (d.base, start, end, comp_mode)
196
197 old_index_filepath = rl.opener.join(docket.index_filepath())
198 old_data_filepath = rl.opener.join(docket.data_filepath())
199 old_sidedata_filepath = rl.opener.join(docket.sidedata_filepath())
200
201 new_index_filepath = rl.opener.join(docket.new_index_file())
202 new_data_filepath = rl.opener.join(docket.new_data_file())
203 new_sidedata_filepath = rl.opener.join(docket.new_sidedata_file())
204
205 util.copyfile(
206 old_index_filepath, new_index_filepath, nb_bytes=index_cutoff
207 )
208 util.copyfile(
209 old_data_filepath, new_data_filepath, nb_bytes=data_cutoff
210 )
211 util.copyfile(
212 old_sidedata_filepath,
213 new_sidedata_filepath,
214 nb_bytes=sidedata_cutoff,
215 )
216 rl.opener.register_file(docket.index_filepath())
217 rl.opener.register_file(docket.data_filepath())
218 rl.opener.register_file(docket.sidedata_filepath())
219
220 docket.index_end = index_cutoff
221 docket.data_end = data_cutoff
222 docket.sidedata_end = sidedata_cutoff
223
224 # reload the revlog internal information
225 rl.clearcaches()
226 rl._loadindex(docket=docket)
227
228 @contextlib.contextmanager
229 def all_files():
230 # hide opening in a helper function to please check-code, black
231 # and various python versions at the same time
232 with open(old_data_filepath, 'rb') as old_data_file:
233 with open(old_sidedata_filepath, 'rb') as old_sidedata_file:
234 with open(new_index_filepath, 'r+b') as new_index_file:
235 with open(new_data_filepath, 'r+b') as new_data_file:
236 with open(
237 new_sidedata_filepath, 'r+b'
238 ) as new_sidedata_file:
239 yield (
240 old_data_file,
241 old_sidedata_file,
242 new_index_file,
243 new_data_file,
244 new_sidedata_file,
245 )
246
247 # we don't need to open the old index file since its content already
248 # exists in a usable form in `old_index`.
249 with all_files() as (
250 old_data_file,
251 old_sidedata_file,
252 new_index_file,
253 new_data_file,
254 new_sidedata_file,
255 ):
256 new_index_file.seek(0, os.SEEK_END)
257 assert new_index_file.tell() == index_cutoff
258 new_data_file.seek(0, os.SEEK_END)
259 assert new_data_file.tell() == data_cutoff
260 new_sidedata_file.seek(0, os.SEEK_END)
261 assert new_sidedata_file.tell() == sidedata_cutoff
262
263 ### writing the censored revision
264 entry = old_index[censor_rev]
265
266 # XXX consider trying the default compression too
267 new_data_size = len(tombstone)
268 new_data_offset = new_data_file.tell()
269 new_data_file.write(tombstone)
270
271 # we are not adding any sidedata as they might leak info about the censored version
272
273 new_entry = revlogutils.entry(
274 flags=constants.REVIDX_ISCENSORED,
275 data_offset=new_data_offset,
276 data_compressed_length=new_data_size,
277 data_uncompressed_length=new_data_size,
278 data_delta_base=censor_rev,
279 link_rev=entry[ENTRY_LINK_REV],
280 parent_rev_1=entry[ENTRY_PARENT_1],
281 parent_rev_2=entry[ENTRY_PARENT_2],
282 node_id=entry[ENTRY_NODE_ID],
283 sidedata_offset=0,
284 sidedata_compressed_length=0,
285 data_compression_mode=COMP_MODE_PLAIN,
286 sidedata_compression_mode=COMP_MODE_PLAIN,
287 )
288 rl.index.append(new_entry)
289 entry_bin = rl.index.entry_binary(censor_rev)
290 new_index_file.write(entry_bin)
291 docket.index_end = new_index_file.tell()
292 docket.data_end = new_data_file.tell()
293
294 #### Writing all subsequent revisions
295 for rev in range(censor_rev + 1, len(old_index)):
296 entry = old_index[rev]
297 flags = entry[ENTRY_DATA_OFFSET] & 0xFFFF
298 old_data_offset = entry[ENTRY_DATA_OFFSET] >> 16
299
300 if rev not in rewritten_entries:
301 old_data_file.seek(old_data_offset)
302 new_data_size = entry[ENTRY_DATA_COMPRESSED_LENGTH]
303 new_data = old_data_file.read(new_data_size)
304 data_delta_base = entry[ENTRY_DELTA_BASE]
305 d_comp_mode = entry[ENTRY_DATA_COMPRESSION_MODE]
306 else:
307 (
308 data_delta_base,
309 start,
310 end,
311 d_comp_mode,
312 ) = rewritten_entries[rev]
313 new_data_size = end - start
314 tmp_storage.seek(start)
315 new_data = tmp_storage.read(new_data_size)
316
317 # It might be faster to group continuous read/write operations,
318 # however, this is censor, an operation that is not focused
319 # on stellar performance. So I have not written this
320 # optimisation yet.
321 new_data_offset = new_data_file.tell()
322 new_data_file.write(new_data)
323
324 sidedata_size = entry[ENTRY_SIDEDATA_COMPRESSED_LENGTH]
325 new_sidedata_offset = new_sidedata_file.tell()
326 if 0 < sidedata_size:
327 old_sidedata_offset = entry[ENTRY_SIDEDATA_OFFSET]
328 old_sidedata_file.seek(old_sidedata_offset)
329 new_sidedata = old_sidedata_file.read(sidedata_size)
330 new_sidedata_file.write(new_sidedata)
331
332 data_uncompressed_length = entry[ENTRY_DATA_UNCOMPRESSED_LENGTH]
333 sd_com_mode = entry[ENTRY_SIDEDATA_COMPRESSION_MODE]
334 assert data_delta_base <= rev, (data_delta_base, rev)
335
336 new_entry = revlogutils.entry(
337 flags=flags,
338 data_offset=new_data_offset,
339 data_compressed_length=new_data_size,
340 data_uncompressed_length=data_uncompressed_length,
341 data_delta_base=data_delta_base,
342 link_rev=entry[ENTRY_LINK_REV],
343 parent_rev_1=entry[ENTRY_PARENT_1],
344 parent_rev_2=entry[ENTRY_PARENT_2],
345 node_id=entry[ENTRY_NODE_ID],
346 sidedata_offset=new_sidedata_offset,
347 sidedata_compressed_length=sidedata_size,
348 data_compression_mode=d_comp_mode,
349 sidedata_compression_mode=sd_com_mode,
350 )
351 rl.index.append(new_entry)
352 entry_bin = rl.index.entry_binary(rev)
353 new_index_file.write(entry_bin)
354
355 docket.index_end = new_index_file.tell()
356 docket.data_end = new_data_file.tell()
357 docket.sidedata_end = new_sidedata_file.tell()
358
359 docket.write(transaction=None, stripping=True)
@@ -1,1147 +1,1147 b''
1 1 # revlogdeltas.py - Logic around delta computation for revlog
2 2 #
3 3 # Copyright 2005-2007 Olivia Mackall <olivia@selenic.com>
4 4 # Copyright 2018 Octobus <contact@octobus.net>
5 5 #
6 6 # This software may be used and distributed according to the terms of the
7 7 # GNU General Public License version 2 or any later version.
8 8 """Helper class to compute deltas stored inside revlogs"""
9 9
10 10 from __future__ import absolute_import
11 11
12 12 import collections
13 13 import struct
14 14
15 15 # import stuff from node for others to import from revlog
16 16 from ..node import nullrev
17 17 from ..i18n import _
18 18 from ..pycompat import getattr
19 19
20 20 from .constants import (
21 21 COMP_MODE_DEFAULT,
22 22 COMP_MODE_INLINE,
23 23 COMP_MODE_PLAIN,
24 24 REVIDX_ISCENSORED,
25 25 REVIDX_RAWTEXT_CHANGING_FLAGS,
26 26 )
27 27
28 28 from ..thirdparty import attr
29 29
30 30 from .. import (
31 31 error,
32 32 mdiff,
33 33 util,
34 34 )
35 35
36 36 from . import flagutil
37 37
38 38 # maximum <delta-chain-data>/<revision-text-length> ratio
39 39 LIMIT_DELTA2TEXT = 2
40 40
41 41
42 42 class _testrevlog(object):
43 43 """minimalist fake revlog to use in doctests"""
44 44
45 45 def __init__(self, data, density=0.5, mingap=0, snapshot=()):
46 46 """data is an list of revision payload boundaries"""
47 47 self._data = data
48 48 self._srdensitythreshold = density
49 49 self._srmingapsize = mingap
50 50 self._snapshot = set(snapshot)
51 51 self.index = None
52 52
53 53 def start(self, rev):
54 54 if rev == nullrev:
55 55 return 0
56 56 if rev == 0:
57 57 return 0
58 58 return self._data[rev - 1]
59 59
60 60 def end(self, rev):
61 61 if rev == nullrev:
62 62 return 0
63 63 return self._data[rev]
64 64
65 65 def length(self, rev):
66 66 return self.end(rev) - self.start(rev)
67 67
68 68 def __len__(self):
69 69 return len(self._data)
70 70
71 71 def issnapshot(self, rev):
72 72 if rev == nullrev:
73 73 return True
74 74 return rev in self._snapshot
75 75
76 76
77 77 def slicechunk(revlog, revs, targetsize=None):
78 78 """slice revs to reduce the amount of unrelated data to be read from disk.
79 79
80 80 ``revs`` is sliced into groups that should be read in one time.
81 81 Assume that revs are sorted.
82 82
83 83 The initial chunk is sliced until the overall density (payload/chunks-span
84 84 ratio) is above `revlog._srdensitythreshold`. No gap smaller than
85 85 `revlog._srmingapsize` is skipped.
86 86
87 87 If `targetsize` is set, no chunk larger than `targetsize` will be yielded.
88 88 For consistency with other slicing choices, this limit won't go lower than
89 89 `revlog._srmingapsize`.
90 90
91 91 If individual revision chunks are larger than this limit, they will still
92 92 be yielded individually.
93 93
94 94 >>> data = [
95 95 ... 5, #00 (5)
96 96 ... 10, #01 (5)
97 97 ... 12, #02 (2)
98 98 ... 12, #03 (empty)
99 99 ... 27, #04 (15)
100 100 ... 31, #05 (4)
101 101 ... 31, #06 (empty)
102 102 ... 42, #07 (11)
103 103 ... 47, #08 (5)
104 104 ... 47, #09 (empty)
105 105 ... 48, #10 (1)
106 106 ... 51, #11 (3)
107 107 ... 74, #12 (23)
108 108 ... 85, #13 (11)
109 109 ... 86, #14 (1)
110 110 ... 91, #15 (5)
111 111 ... ]
112 112 >>> revlog = _testrevlog(data, snapshot=range(16))
113 113
114 114 >>> list(slicechunk(revlog, list(range(16))))
115 115 [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]
116 116 >>> list(slicechunk(revlog, [0, 15]))
117 117 [[0], [15]]
118 118 >>> list(slicechunk(revlog, [0, 11, 15]))
119 119 [[0], [11], [15]]
120 120 >>> list(slicechunk(revlog, [0, 11, 13, 15]))
121 121 [[0], [11, 13, 15]]
122 122 >>> list(slicechunk(revlog, [1, 2, 3, 5, 8, 10, 11, 14]))
123 123 [[1, 2], [5, 8, 10, 11], [14]]
124 124
125 125 Slicing with a maximum chunk size
126 126 >>> list(slicechunk(revlog, [0, 11, 13, 15], targetsize=15))
127 127 [[0], [11], [13], [15]]
128 128 >>> list(slicechunk(revlog, [0, 11, 13, 15], targetsize=20))
129 129 [[0], [11], [13, 15]]
130 130
131 131 Slicing involving nullrev
132 132 >>> list(slicechunk(revlog, [-1, 0, 11, 13, 15], targetsize=20))
133 133 [[-1, 0], [11], [13, 15]]
134 134 >>> list(slicechunk(revlog, [-1, 13, 15], targetsize=5))
135 135 [[-1], [13], [15]]
136 136 """
137 137 if targetsize is not None:
138 138 targetsize = max(targetsize, revlog._srmingapsize)
139 139 # targetsize should not be specified when evaluating delta candidates:
140 140 # * targetsize is used to ensure we stay within specification when reading,
141 141 densityslicing = getattr(revlog.index, 'slicechunktodensity', None)
142 142 if densityslicing is None:
143 143 densityslicing = lambda x, y, z: _slicechunktodensity(revlog, x, y, z)
144 144 for chunk in densityslicing(
145 145 revs, revlog._srdensitythreshold, revlog._srmingapsize
146 146 ):
147 147 for subchunk in _slicechunktosize(revlog, chunk, targetsize):
148 148 yield subchunk
149 149
150 150
151 151 def _slicechunktosize(revlog, revs, targetsize=None):
152 152 """slice revs to match the target size
153 153
154 154 This is intended to be used on chunks that density slicing selected but that
155 155 are still too large compared to the read guarantee of revlog. This might
156 156 happen when "minimal gap size" interrupted the slicing or when chains are
157 157 built in a way that creates large blocks next to each other.
158 158
159 159 >>> data = [
160 160 ... 3, #0 (3)
161 161 ... 5, #1 (2)
162 162 ... 6, #2 (1)
163 163 ... 8, #3 (2)
164 164 ... 8, #4 (empty)
165 165 ... 11, #5 (3)
166 166 ... 12, #6 (1)
167 167 ... 13, #7 (1)
168 168 ... 14, #8 (1)
169 169 ... ]
170 170
171 171 == All snapshots cases ==
172 172 >>> revlog = _testrevlog(data, snapshot=range(9))
173 173
174 174 Cases where chunk is already small enough
175 175 >>> list(_slicechunktosize(revlog, [0], 3))
176 176 [[0]]
177 177 >>> list(_slicechunktosize(revlog, [6, 7], 3))
178 178 [[6, 7]]
179 179 >>> list(_slicechunktosize(revlog, [0], None))
180 180 [[0]]
181 181 >>> list(_slicechunktosize(revlog, [6, 7], None))
182 182 [[6, 7]]
183 183
184 184 cases where we need actual slicing
185 185 >>> list(_slicechunktosize(revlog, [0, 1], 3))
186 186 [[0], [1]]
187 187 >>> list(_slicechunktosize(revlog, [1, 3], 3))
188 188 [[1], [3]]
189 189 >>> list(_slicechunktosize(revlog, [1, 2, 3], 3))
190 190 [[1, 2], [3]]
191 191 >>> list(_slicechunktosize(revlog, [3, 5], 3))
192 192 [[3], [5]]
193 193 >>> list(_slicechunktosize(revlog, [3, 4, 5], 3))
194 194 [[3], [5]]
195 195 >>> list(_slicechunktosize(revlog, [5, 6, 7, 8], 3))
196 196 [[5], [6, 7, 8]]
197 197 >>> list(_slicechunktosize(revlog, [0, 1, 2, 3, 4, 5, 6, 7, 8], 3))
198 198 [[0], [1, 2], [3], [5], [6, 7, 8]]
199 199
200 200 Case with too large individual chunk (must return valid chunk)
201 201 >>> list(_slicechunktosize(revlog, [0, 1], 2))
202 202 [[0], [1]]
203 203 >>> list(_slicechunktosize(revlog, [1, 3], 1))
204 204 [[1], [3]]
205 205 >>> list(_slicechunktosize(revlog, [3, 4, 5], 2))
206 206 [[3], [5]]
207 207
208 208 == No Snapshot cases ==
209 209 >>> revlog = _testrevlog(data)
210 210
211 211 Cases where chunk is already small enough
212 212 >>> list(_slicechunktosize(revlog, [0], 3))
213 213 [[0]]
214 214 >>> list(_slicechunktosize(revlog, [6, 7], 3))
215 215 [[6, 7]]
216 216 >>> list(_slicechunktosize(revlog, [0], None))
217 217 [[0]]
218 218 >>> list(_slicechunktosize(revlog, [6, 7], None))
219 219 [[6, 7]]
220 220
221 221 cases where we need actual slicing
222 222 >>> list(_slicechunktosize(revlog, [0, 1], 3))
223 223 [[0], [1]]
224 224 >>> list(_slicechunktosize(revlog, [1, 3], 3))
225 225 [[1], [3]]
226 226 >>> list(_slicechunktosize(revlog, [1, 2, 3], 3))
227 227 [[1], [2, 3]]
228 228 >>> list(_slicechunktosize(revlog, [3, 5], 3))
229 229 [[3], [5]]
230 230 >>> list(_slicechunktosize(revlog, [3, 4, 5], 3))
231 231 [[3], [4, 5]]
232 232 >>> list(_slicechunktosize(revlog, [5, 6, 7, 8], 3))
233 233 [[5], [6, 7, 8]]
234 234 >>> list(_slicechunktosize(revlog, [0, 1, 2, 3, 4, 5, 6, 7, 8], 3))
235 235 [[0], [1, 2], [3], [5], [6, 7, 8]]
236 236
237 237 Case with too large individual chunk (must return valid chunk)
238 238 >>> list(_slicechunktosize(revlog, [0, 1], 2))
239 239 [[0], [1]]
240 240 >>> list(_slicechunktosize(revlog, [1, 3], 1))
241 241 [[1], [3]]
242 242 >>> list(_slicechunktosize(revlog, [3, 4, 5], 2))
243 243 [[3], [5]]
244 244
245 245 == mixed case ==
246 246 >>> revlog = _testrevlog(data, snapshot=[0, 1, 2])
247 247 >>> list(_slicechunktosize(revlog, list(range(9)), 5))
248 248 [[0, 1], [2], [3, 4, 5], [6, 7, 8]]
249 249 """
250 250 assert targetsize is None or 0 <= targetsize
251 251 startdata = revlog.start(revs[0])
252 252 enddata = revlog.end(revs[-1])
253 253 fullspan = enddata - startdata
254 254 if targetsize is None or fullspan <= targetsize:
255 255 yield revs
256 256 return
257 257
258 258 startrevidx = 0
259 259 endrevidx = 1
260 260 iterrevs = enumerate(revs)
261 261 next(iterrevs) # skip first rev.
262 262 # first step: get snapshots out of the way
263 263 for idx, r in iterrevs:
264 264 span = revlog.end(r) - startdata
265 265 snapshot = revlog.issnapshot(r)
266 266 if span <= targetsize and snapshot:
267 267 endrevidx = idx + 1
268 268 else:
269 269 chunk = _trimchunk(revlog, revs, startrevidx, endrevidx)
270 270 if chunk:
271 271 yield chunk
272 272 startrevidx = idx
273 273 startdata = revlog.start(r)
274 274 endrevidx = idx + 1
275 275 if not snapshot:
276 276 break
277 277
278 278 # for the others, we use binary slicing to quickly converge toward valid
279 279 # chunks (otherwise, we might end up looking for start/end of many
280 280 # revisions). This logic is not looking for the perfect slicing point, it
281 281 # focuses on quickly converging toward valid chunks.
282 282 nbitem = len(revs)
283 283 while (enddata - startdata) > targetsize:
284 284 endrevidx = nbitem
285 285 if nbitem - startrevidx <= 1:
286 286 break # protect against individual chunk larger than limit
287 287 localenddata = revlog.end(revs[endrevidx - 1])
288 288 span = localenddata - startdata
289 289 while span > targetsize:
290 290 if endrevidx - startrevidx <= 1:
291 291 break # protect against individual chunk larger than limit
292 292 endrevidx -= (endrevidx - startrevidx) // 2
293 293 localenddata = revlog.end(revs[endrevidx - 1])
294 294 span = localenddata - startdata
295 295 chunk = _trimchunk(revlog, revs, startrevidx, endrevidx)
296 296 if chunk:
297 297 yield chunk
298 298 startrevidx = endrevidx
299 299 startdata = revlog.start(revs[startrevidx])
300 300
301 301 chunk = _trimchunk(revlog, revs, startrevidx)
302 302 if chunk:
303 303 yield chunk
304 304
305 305
306 306 def _slicechunktodensity(revlog, revs, targetdensity=0.5, mingapsize=0):
307 307 """slice revs to reduce the amount of unrelated data to be read from disk.
308 308
309 309 ``revs`` is sliced into groups that should be read in one time.
310 310 Assume that revs are sorted.
311 311
312 312 The initial chunk is sliced until the overall density (payload/chunks-span
313 313 ratio) is above `targetdensity`. No gap smaller than `mingapsize` is
314 314 skipped.
315 315
316 316 >>> revlog = _testrevlog([
317 317 ... 5, #00 (5)
318 318 ... 10, #01 (5)
319 319 ... 12, #02 (2)
320 320 ... 12, #03 (empty)
321 321 ... 27, #04 (15)
322 322 ... 31, #05 (4)
323 323 ... 31, #06 (empty)
324 324 ... 42, #07 (11)
325 325 ... 47, #08 (5)
326 326 ... 47, #09 (empty)
327 327 ... 48, #10 (1)
328 328 ... 51, #11 (3)
329 329 ... 74, #12 (23)
330 330 ... 85, #13 (11)
331 331 ... 86, #14 (1)
332 332 ... 91, #15 (5)
333 333 ... ])
334 334
335 335 >>> list(_slicechunktodensity(revlog, list(range(16))))
336 336 [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]
337 337 >>> list(_slicechunktodensity(revlog, [0, 15]))
338 338 [[0], [15]]
339 339 >>> list(_slicechunktodensity(revlog, [0, 11, 15]))
340 340 [[0], [11], [15]]
341 341 >>> list(_slicechunktodensity(revlog, [0, 11, 13, 15]))
342 342 [[0], [11, 13, 15]]
343 343 >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14]))
344 344 [[1, 2], [5, 8, 10, 11], [14]]
345 345 >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14],
346 346 ... mingapsize=20))
347 347 [[1, 2, 3, 5, 8, 10, 11], [14]]
348 348 >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14],
349 349 ... targetdensity=0.95))
350 350 [[1, 2], [5], [8, 10, 11], [14]]
351 351 >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14],
352 352 ... targetdensity=0.95, mingapsize=12))
353 353 [[1, 2], [5, 8, 10, 11], [14]]
354 354 """
355 355 start = revlog.start
356 356 length = revlog.length
357 357
358 358 if len(revs) <= 1:
359 359 yield revs
360 360 return
361 361
362 362 deltachainspan = segmentspan(revlog, revs)
363 363
364 364 if deltachainspan < mingapsize:
365 365 yield revs
366 366 return
367 367
368 368 readdata = deltachainspan
369 369 chainpayload = sum(length(r) for r in revs)
370 370
371 371 if deltachainspan:
372 372 density = chainpayload / float(deltachainspan)
373 373 else:
374 374 density = 1.0
375 375
376 376 if density >= targetdensity:
377 377 yield revs
378 378 return
379 379
380 380 # Store the gaps in a heap to have them sorted by decreasing size
381 381 gaps = []
382 382 prevend = None
383 383 for i, rev in enumerate(revs):
384 384 revstart = start(rev)
385 385 revlen = length(rev)
386 386
387 387 # Skip empty revisions to form larger holes
388 388 if revlen == 0:
389 389 continue
390 390
391 391 if prevend is not None:
392 392 gapsize = revstart - prevend
393 393 # only consider holes that are large enough
394 394 if gapsize > mingapsize:
395 395 gaps.append((gapsize, i))
396 396
397 397 prevend = revstart + revlen
398 398 # sort the gaps to pop them from largest to small
399 399 gaps.sort()
400 400
401 401 # Collect the indices of the largest holes until the density is acceptable
402 402 selected = []
403 403 while gaps and density < targetdensity:
404 404 gapsize, gapidx = gaps.pop()
405 405
406 406 selected.append(gapidx)
407 407
408 408 # the gap sizes are stored as negatives to be sorted decreasingly
409 409 # by the heap
410 410 readdata -= gapsize
411 411 if readdata > 0:
412 412 density = chainpayload / float(readdata)
413 413 else:
414 414 density = 1.0
415 415 selected.sort()
416 416
417 417 # Cut the revs at collected indices
418 418 previdx = 0
419 419 for idx in selected:
420 420
421 421 chunk = _trimchunk(revlog, revs, previdx, idx)
422 422 if chunk:
423 423 yield chunk
424 424
425 425 previdx = idx
426 426
427 427 chunk = _trimchunk(revlog, revs, previdx)
428 428 if chunk:
429 429 yield chunk
430 430
431 431
432 432 def _trimchunk(revlog, revs, startidx, endidx=None):
433 433 """returns revs[startidx:endidx] without empty trailing revs
434 434
435 435 Doctest Setup
436 436 >>> revlog = _testrevlog([
437 437 ... 5, #0
438 438 ... 10, #1
439 439 ... 12, #2
440 440 ... 12, #3 (empty)
441 441 ... 17, #4
442 442 ... 21, #5
443 443 ... 21, #6 (empty)
444 444 ... ])
445 445
446 446 Contiguous cases:
447 447 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0)
448 448 [0, 1, 2, 3, 4, 5]
449 449 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0, 5)
450 450 [0, 1, 2, 3, 4]
451 451 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0, 4)
452 452 [0, 1, 2]
453 453 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 2, 4)
454 454 [2]
455 455 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 3)
456 456 [3, 4, 5]
457 457 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 3, 5)
458 458 [3, 4]
459 459
460 460 Discontiguous cases:
461 461 >>> _trimchunk(revlog, [1, 3, 5, 6], 0)
462 462 [1, 3, 5]
463 463 >>> _trimchunk(revlog, [1, 3, 5, 6], 0, 2)
464 464 [1]
465 465 >>> _trimchunk(revlog, [1, 3, 5, 6], 1, 3)
466 466 [3, 5]
467 467 >>> _trimchunk(revlog, [1, 3, 5, 6], 1)
468 468 [3, 5]
469 469 """
470 470 length = revlog.length
471 471
472 472 if endidx is None:
473 473 endidx = len(revs)
474 474
475 475 # If we have a non-empty delta candidate, there is nothing to trim
476 476 if revs[endidx - 1] < len(revlog):
477 477 # Trim empty revs at the end, except the very first revision of a chain
478 478 while (
479 479 endidx > 1 and endidx > startidx and length(revs[endidx - 1]) == 0
480 480 ):
481 481 endidx -= 1
482 482
483 483 return revs[startidx:endidx]
484 484
485 485
486 486 def segmentspan(revlog, revs):
487 487 """Get the byte span of a segment of revisions
488 488
489 489 revs is a sorted array of revision numbers
490 490
491 491 >>> revlog = _testrevlog([
492 492 ... 5, #0
493 493 ... 10, #1
494 494 ... 12, #2
495 495 ... 12, #3 (empty)
496 496 ... 17, #4
497 497 ... ])
498 498
499 499 >>> segmentspan(revlog, [0, 1, 2, 3, 4])
500 500 17
501 501 >>> segmentspan(revlog, [0, 4])
502 502 17
503 503 >>> segmentspan(revlog, [3, 4])
504 504 5
505 505 >>> segmentspan(revlog, [1, 2, 3,])
506 506 7
507 507 >>> segmentspan(revlog, [1, 3])
508 508 7
509 509 """
510 510 if not revs:
511 511 return 0
512 512 end = revlog.end(revs[-1])
513 513 return end - revlog.start(revs[0])
514 514
515 515
516 516 def _textfromdelta(fh, revlog, baserev, delta, p1, p2, flags, expectednode):
517 517 """build full text from a (base, delta) pair and other metadata"""
518 518 # special case deltas which replace entire base; no need to decode
519 519 # base revision. this neatly avoids censored bases, which throw when
520 520 # they're decoded.
521 521 hlen = struct.calcsize(b">lll")
522 522 if delta[:hlen] == mdiff.replacediffheader(
523 523 revlog.rawsize(baserev), len(delta) - hlen
524 524 ):
525 525 fulltext = delta[hlen:]
526 526 else:
527 527 # deltabase is rawtext before changed by flag processors, which is
528 528 # equivalent to non-raw text
529 529 basetext = revlog.revision(baserev, _df=fh, raw=False)
530 530 fulltext = mdiff.patch(basetext, delta)
531 531
532 532 try:
533 533 validatehash = flagutil.processflagsraw(revlog, fulltext, flags)
534 534 if validatehash:
535 535 revlog.checkhash(fulltext, expectednode, p1=p1, p2=p2)
536 536 if flags & REVIDX_ISCENSORED:
537 537 raise error.StorageError(
538 538 _(b'node %s is not censored') % expectednode
539 539 )
540 540 except error.CensoredNodeError:
541 541 # must pass the censored index flag to add censored revisions
542 542 if not flags & REVIDX_ISCENSORED:
543 543 raise
544 544 return fulltext
545 545
546 546
547 547 @attr.s(slots=True, frozen=True)
548 548 class _deltainfo(object):
549 549 distance = attr.ib()
550 550 deltalen = attr.ib()
551 551 data = attr.ib()
552 552 base = attr.ib()
553 553 chainbase = attr.ib()
554 554 chainlen = attr.ib()
555 555 compresseddeltalen = attr.ib()
556 556 snapshotdepth = attr.ib()
557 557
558 558
559 559 def drop_u_compression(delta):
560 560 """turn into a "u" (no-compression) into no-compression without header
561 561
562 562 This is useful for revlog format that has better compression method.
563 563 """
564 564 assert delta.data[0] == b'u', delta.data[0]
565 565 return _deltainfo(
566 566 delta.distance,
567 567 delta.deltalen - 1,
568 568 (b'', delta.data[1]),
569 569 delta.base,
570 570 delta.chainbase,
571 571 delta.chainlen,
572 572 delta.compresseddeltalen,
573 573 delta.snapshotdepth,
574 574 )
575 575
576 576
577 577 def isgooddeltainfo(revlog, deltainfo, revinfo):
578 578 """Returns True if the given delta is good. Good means that it is within
579 579 the disk span, disk size, and chain length bounds that we know to be
580 580 performant."""
581 581 if deltainfo is None:
582 582 return False
583 583
584 584 # - 'deltainfo.distance' is the distance from the base revision --
585 585 # bounding it limits the amount of I/O we need to do.
586 586 # - 'deltainfo.compresseddeltalen' is the sum of the total size of
587 587 # deltas we need to apply -- bounding it limits the amount of CPU
588 588 # we consume.
589 589
590 590 textlen = revinfo.textlen
591 591 defaultmax = textlen * 4
592 592 maxdist = revlog._maxdeltachainspan
593 593 if not maxdist:
594 594 maxdist = deltainfo.distance # ensure the conditional pass
595 595 maxdist = max(maxdist, defaultmax)
596 596
597 597 # Bad delta from read span:
598 598 #
599 599 # If the span of data read is larger than the maximum allowed.
600 600 #
601 601 # In the sparse-revlog case, we rely on the associated "sparse reading"
602 602 # to avoid issue related to the span of data. In theory, it would be
603 603 # possible to build pathological revlog where delta pattern would lead
604 604 # to too many reads. However, they do not happen in practice at all. So
605 605 # we skip the span check entirely.
606 606 if not revlog._sparserevlog and maxdist < deltainfo.distance:
607 607 return False
608 608
609 609 # Bad delta from new delta size:
610 610 #
611 611 # If the delta size is larger than the target text, storing the
612 612 # delta will be inefficient.
613 613 if textlen < deltainfo.deltalen:
614 614 return False
615 615
616 616 # Bad delta from cumulated payload size:
617 617 #
618 618 # If the sum of delta get larger than K * target text length.
619 619 if textlen * LIMIT_DELTA2TEXT < deltainfo.compresseddeltalen:
620 620 return False
621 621
622 622 # Bad delta from chain length:
623 623 #
624 624 # If the number of delta in the chain gets too high.
625 625 if revlog._maxchainlen and revlog._maxchainlen < deltainfo.chainlen:
626 626 return False
627 627
628 628 # bad delta from intermediate snapshot size limit
629 629 #
630 630 # If an intermediate snapshot size is higher than the limit. The
631 631 # limit exist to prevent endless chain of intermediate delta to be
632 632 # created.
633 633 if (
634 634 deltainfo.snapshotdepth is not None
635 635 and (textlen >> deltainfo.snapshotdepth) < deltainfo.deltalen
636 636 ):
637 637 return False
638 638
639 639 # bad delta if new intermediate snapshot is larger than the previous
640 640 # snapshot
641 641 if (
642 642 deltainfo.snapshotdepth
643 643 and revlog.length(deltainfo.base) < deltainfo.deltalen
644 644 ):
645 645 return False
646 646
647 647 return True
648 648
649 649
650 650 # If a revision's full text is that much bigger than a base candidate full
651 651 # text's, it is very unlikely that it will produce a valid delta. We no longer
652 652 # consider these candidates.
653 653 LIMIT_BASE2TEXT = 500
654 654
655 655
656 656 def _candidategroups(revlog, textlen, p1, p2, cachedelta):
657 657 """Provides group of revision to be tested as delta base
658 658
659 659 This top level function focus on emitting groups with unique and worthwhile
660 660 content. See _raw_candidate_groups for details about the group order.
661 661 """
662 662 # should we try to build a delta?
663 663 if not (len(revlog) and revlog._storedeltachains):
664 664 yield None
665 665 return
666 666
667 667 deltalength = revlog.length
668 668 deltaparent = revlog.deltaparent
669 669 sparse = revlog._sparserevlog
670 670 good = None
671 671
672 672 deltas_limit = textlen * LIMIT_DELTA2TEXT
673 673
674 674 tested = {nullrev}
675 675 candidates = _refinedgroups(revlog, p1, p2, cachedelta)
676 676 while True:
677 677 temptative = candidates.send(good)
678 678 if temptative is None:
679 679 break
680 680 group = []
681 681 for rev in temptative:
682 682 # skip over empty delta (no need to include them in a chain)
683 683 while revlog._generaldelta and not (
684 684 rev == nullrev or rev in tested or deltalength(rev)
685 685 ):
686 686 tested.add(rev)
687 687 rev = deltaparent(rev)
688 688 # no need to try a delta against nullrev, this will be done as a
689 689 # last resort.
690 690 if rev == nullrev:
691 691 continue
692 692 # filter out revision we tested already
693 693 if rev in tested:
694 694 continue
695 695 tested.add(rev)
696 696 # filter out delta bases that will never produce a good delta
697 697 if deltas_limit < revlog.length(rev):
698 698 continue
699 699 if sparse and revlog.rawsize(rev) < (textlen // LIMIT_BASE2TEXT):
700 700 continue
701 701 # no delta for rawtext-changing revs (see "candelta" for why)
702 702 if revlog.flags(rev) & REVIDX_RAWTEXT_CHANGING_FLAGS:
703 703 continue
704 704 # If we reach here, we are about to build and test a delta.
705 705 # The delta building process will compute the chaininfo in all
706 706 # case, since that computation is cached, it is fine to access it
707 707 # here too.
708 708 chainlen, chainsize = revlog._chaininfo(rev)
709 709 # if chain will be too long, skip base
710 710 if revlog._maxchainlen and chainlen >= revlog._maxchainlen:
711 711 continue
712 712 # if chain already have too much data, skip base
713 713 if deltas_limit < chainsize:
714 714 continue
715 715 if sparse and revlog.upperboundcomp is not None:
716 716 maxcomp = revlog.upperboundcomp
717 717 basenotsnap = (p1, p2, nullrev)
718 718 if rev not in basenotsnap and revlog.issnapshot(rev):
719 719 snapshotdepth = revlog.snapshotdepth(rev)
720 720 # If text is significantly larger than the base, we can
721 721 # expect the resulting delta to be proportional to the size
722 722 # difference
723 723 revsize = revlog.rawsize(rev)
724 724 rawsizedistance = max(textlen - revsize, 0)
725 725 # use an estimate of the compression upper bound.
726 726 lowestrealisticdeltalen = rawsizedistance // maxcomp
727 727
728 728 # check the absolute constraint on the delta size
729 729 snapshotlimit = textlen >> snapshotdepth
730 730 if snapshotlimit < lowestrealisticdeltalen:
731 731 # delta lower bound is larger than accepted upper bound
732 732 continue
733 733
734 734 # check the relative constraint on the delta size
735 735 revlength = revlog.length(rev)
736 736 if revlength < lowestrealisticdeltalen:
737 737 # delta probable lower bound is larger than target base
738 738 continue
739 739
740 740 group.append(rev)
741 741 if group:
742 742 # XXX: in the sparse revlog case, group can become large,
743 743 # impacting performance. Some bounding or slicing mechanism
744 744 # would help to reduce this impact.
745 745 good = yield tuple(group)
746 746 yield None
747 747
748 748
749 749 def _findsnapshots(revlog, cache, start_rev):
750 750 """find snapshot from start_rev to tip"""
751 751 if util.safehasattr(revlog.index, b'findsnapshots'):
752 752 revlog.index.findsnapshots(cache, start_rev)
753 753 else:
754 754 deltaparent = revlog.deltaparent
755 755 issnapshot = revlog.issnapshot
756 756 for rev in revlog.revs(start_rev):
757 757 if issnapshot(rev):
758 758 cache[deltaparent(rev)].append(rev)
759 759
760 760
761 761 def _refinedgroups(revlog, p1, p2, cachedelta):
762 762 good = None
763 763 # First we try to reuse the delta contained in the bundle.
764 764 # (or from the source revlog)
765 765 #
766 766 # This logic only applies to general delta repositories and can be disabled
767 767 # through configuration. Disabling reuse source delta is useful when
768 768 # we want to make sure we recomputed "optimal" deltas.
769 769 if cachedelta and revlog._generaldelta and revlog._lazydeltabase:
770 770 # Assume what we received from the server is a good choice
771 771 # build delta will reuse the cache
772 772 good = yield (cachedelta[0],)
773 773 if good is not None:
774 774 yield None
775 775 return
776 776 snapshots = collections.defaultdict(list)
777 777 for candidates in _rawgroups(revlog, p1, p2, cachedelta, snapshots):
778 778 good = yield candidates
779 779 if good is not None:
780 780 break
781 781
782 782 # If sparse revlog is enabled, we can try to refine the available deltas
783 783 if not revlog._sparserevlog:
784 784 yield None
785 785 return
786 786
787 787 # if we have a refinable value, try to refine it
788 788 if good is not None and good not in (p1, p2) and revlog.issnapshot(good):
789 789 # refine snapshot down
790 790 previous = None
791 791 while previous != good:
792 792 previous = good
793 793 base = revlog.deltaparent(good)
794 794 if base == nullrev:
795 795 break
796 796 good = yield (base,)
797 797 # refine snapshot up
798 798 if not snapshots:
799 799 _findsnapshots(revlog, snapshots, good + 1)
800 800 previous = None
801 801 while good != previous:
802 802 previous = good
803 803 children = tuple(sorted(c for c in snapshots[good]))
804 804 good = yield children
805 805
806 806 # we have found nothing
807 807 yield None
808 808
809 809
810 810 def _rawgroups(revlog, p1, p2, cachedelta, snapshots=None):
811 811 """Provides group of revision to be tested as delta base
812 812
813 813 This lower level function focus on emitting delta theorically interresting
814 814 without looking it any practical details.
815 815
816 816 The group order aims at providing fast or small candidates first.
817 817 """
818 818 gdelta = revlog._generaldelta
819 819 # gate sparse behind general-delta because of issue6056
820 820 sparse = gdelta and revlog._sparserevlog
821 821 curr = len(revlog)
822 822 prev = curr - 1
823 823 deltachain = lambda rev: revlog._deltachain(rev)[0]
824 824
825 825 if gdelta:
826 826 # exclude already lazy tested base if any
827 827 parents = [p for p in (p1, p2) if p != nullrev]
828 828
829 829 if not revlog._deltabothparents and len(parents) == 2:
830 830 parents.sort()
831 831 # To minimize the chance of having to build a fulltext,
832 832 # pick first whichever parent is closest to us (max rev)
833 833 yield (parents[1],)
834 834 # then the other one (min rev) if the first did not fit
835 835 yield (parents[0],)
836 836 elif len(parents) > 0:
837 837 # Test all parents (1 or 2), and keep the best candidate
838 838 yield parents
839 839
840 840 if sparse and parents:
841 841 if snapshots is None:
842 842 # map: base-rev: snapshot-rev
843 843 snapshots = collections.defaultdict(list)
844 844 # See if there is an existing snapshot in the parent chains we can use as
845 845 # a base for a new intermediate-snapshot
846 846 #
847 847 # search for snapshot in parents delta chain
848 848 # map: snapshot-level: snapshot-rev
849 849 parents_snaps = collections.defaultdict(set)
850 850 candidate_chains = [deltachain(p) for p in parents]
851 851 for chain in candidate_chains:
852 852 for idx, s in enumerate(chain):
853 853 if not revlog.issnapshot(s):
854 854 break
855 855 parents_snaps[idx].add(s)
856 856 snapfloor = min(parents_snaps[0]) + 1
857 857 _findsnapshots(revlog, snapshots, snapfloor)
858 858 # search for the highest "unrelated" revision
859 859 #
860 860 # Adding snapshots used by "unrelated" revisions increases the odds we
861 861 # reuse an independent, yet better snapshot chain.
862 862 #
863 863 # XXX instead of building a set of revisions, we could lazily enumerate
864 864 # over the chains. That would be more efficient, however we stick to
865 865 # simple code for now.
866 866 all_revs = set()
867 867 for chain in candidate_chains:
868 868 all_revs.update(chain)
869 869 other = None
870 870 for r in revlog.revs(prev, snapfloor):
871 871 if r not in all_revs:
872 872 other = r
873 873 break
874 874 if other is not None:
875 875 # To avoid unfair competition, we won't use unrelated intermediate
876 876 # snapshots that are deeper than the ones from the parent delta
877 877 # chain.
878 878 max_depth = max(parents_snaps.keys())
879 879 chain = deltachain(other)
880 880 for idx, s in enumerate(chain):
881 881 if s < snapfloor:
882 882 continue
883 883 if max_depth < idx:
884 884 break
885 885 if not revlog.issnapshot(s):
886 886 break
887 887 parents_snaps[idx].add(s)
888 888 # Test them as possible intermediate snapshot bases
889 889 # We test them from highest to lowest level. High level ones are more
890 890 # likely to result in small deltas
891 891 floor = None
892 892 for idx, snaps in sorted(parents_snaps.items(), reverse=True):
893 893 siblings = set()
894 894 for s in snaps:
895 895 siblings.update(snapshots[s])
896 896 # Before considering making a new intermediate snapshot, we check
897 897 # if an existing snapshot, a child of the base we consider, would be
898 898 # suitable.
899 899 #
900 900 # It gives a chance to reuse a delta chain "unrelated" to the
901 901 # current revision instead of starting our own. Without such
902 902 # re-use, topological branches would keep reopening new chains,
903 903 # creating more and more snapshots as the repository grows.
904 904
905 905 if floor is not None:
906 906 # We only do this for siblings created after the one in our
907 907 # parent's delta chain. Those created before have less chance
908 908 # to be a valid base since our ancestors had to create a new
909 909 # snapshot.
910 910 siblings = [r for r in siblings if floor < r]
911 911 yield tuple(sorted(siblings))
912 912 # then test the base from our parent's delta chain.
913 913 yield tuple(sorted(snaps))
914 914 floor = min(snaps)
915 915 # No suitable base found in the parent chain, search if any full
916 916 # snapshots emitted since parent's base would be a suitable base for an
917 917 # intermediate snapshot.
918 918 #
919 919 # It gives a chance to reuse a delta chain unrelated to the current
920 920 # revisions instead of starting our own. Without such re-use,
921 921 # topological branches would keep reopening new full chains, creating
922 922 # more and more snapshots as the repository grows.
923 923 yield tuple(snapshots[nullrev])
924 924
925 925 if not sparse:
926 926 # other approaches failed, try against prev to hopefully save us a
927 927 # fulltext.
928 928 yield (prev,)
929 929
930 930
931 931 class deltacomputer(object):
932 932 def __init__(self, revlog):
933 933 self.revlog = revlog
934 934
935 935 def buildtext(self, revinfo, fh):
936 936 """Builds a fulltext version of a revision
937 937
938 938 revinfo: revisioninfo instance that contains all needed info
939 939 fh: file handle to either the .i or the .d revlog file,
940 940 depending on whether it is inlined or not
941 941 """
942 942 btext = revinfo.btext
943 943 if btext[0] is not None:
944 944 return btext[0]
945 945
946 946 revlog = self.revlog
947 947 cachedelta = revinfo.cachedelta
948 948 baserev = cachedelta[0]
949 949 delta = cachedelta[1]
950 950
951 951 fulltext = btext[0] = _textfromdelta(
952 952 fh,
953 953 revlog,
954 954 baserev,
955 955 delta,
956 956 revinfo.p1,
957 957 revinfo.p2,
958 958 revinfo.flags,
959 959 revinfo.node,
960 960 )
961 961 return fulltext
962 962
963 963 def _builddeltadiff(self, base, revinfo, fh):
964 964 revlog = self.revlog
965 965 t = self.buildtext(revinfo, fh)
966 966 if revlog.iscensored(base):
967 967 # deltas based on a censored revision must replace the
968 968 # full content in one patch, so delta works everywhere
969 969 header = mdiff.replacediffheader(revlog.rawsize(base), len(t))
970 970 delta = header + t
971 971 else:
972 972 ptext = revlog.rawdata(base, _df=fh)
973 973 delta = mdiff.textdiff(ptext, t)
974 974
975 975 return delta
976 976
977 977 def _builddeltainfo(self, revinfo, base, fh):
978 978 # can we use the cached delta?
979 979 revlog = self.revlog
980 980 chainbase = revlog.chainbase(base)
981 981 if revlog._generaldelta:
982 982 deltabase = base
983 983 else:
984 984 deltabase = chainbase
985 985 snapshotdepth = None
986 986 if revlog._sparserevlog and deltabase == nullrev:
987 987 snapshotdepth = 0
988 988 elif revlog._sparserevlog and revlog.issnapshot(deltabase):
989 989 # A delta chain should always be one full snapshot,
990 990 # zero or more semi-snapshots, and zero or more deltas
991 991 p1, p2 = revlog.rev(revinfo.p1), revlog.rev(revinfo.p2)
992 992 if deltabase not in (p1, p2) and revlog.issnapshot(deltabase):
993 993 snapshotdepth = len(revlog._deltachain(deltabase)[0])
994 994 delta = None
995 995 if revinfo.cachedelta:
996 996 cachebase, cachediff = revinfo.cachedelta
997 997 # check if the diff still applies
998 998 currentbase = cachebase
999 999 while (
1000 1000 currentbase != nullrev
1001 1001 and currentbase != base
1002 1002 and self.revlog.length(currentbase) == 0
1003 1003 ):
1004 1004 currentbase = self.revlog.deltaparent(currentbase)
1005 1005 if self.revlog._lazydelta and currentbase == base:
1006 1006 delta = revinfo.cachedelta[1]
1007 1007 if delta is None:
1008 1008 delta = self._builddeltadiff(base, revinfo, fh)
1009 1009 # snapshotdepth needs to be neither None nor a 0-level snapshot
1010 1010 if revlog.upperboundcomp is not None and snapshotdepth:
1011 1011 lowestrealisticdeltalen = len(delta) // revlog.upperboundcomp
1012 1012 snapshotlimit = revinfo.textlen >> snapshotdepth
1013 1013 if snapshotlimit < lowestrealisticdeltalen:
1014 1014 return None
1015 1015 if revlog.length(base) < lowestrealisticdeltalen:
1016 1016 return None
1017 1017 header, data = revlog.compress(delta)
1018 1018 deltalen = len(header) + len(data)
1019 1019 offset = revlog.end(len(revlog) - 1)
1020 1020 dist = deltalen + offset - revlog.start(chainbase)
1021 1021 chainlen, compresseddeltalen = revlog._chaininfo(base)
1022 1022 chainlen += 1
1023 1023 compresseddeltalen += deltalen
1024 1024
1025 1025 return _deltainfo(
1026 1026 dist,
1027 1027 deltalen,
1028 1028 (header, data),
1029 1029 deltabase,
1030 1030 chainbase,
1031 1031 chainlen,
1032 1032 compresseddeltalen,
1033 1033 snapshotdepth,
1034 1034 )
1035 1035
1036 1036 def _fullsnapshotinfo(self, fh, revinfo, curr):
1037 1037 rawtext = self.buildtext(revinfo, fh)
1038 1038 data = self.revlog.compress(rawtext)
1039 1039 compresseddeltalen = deltalen = dist = len(data[1]) + len(data[0])
1040 1040 deltabase = chainbase = curr
1041 1041 snapshotdepth = 0
1042 1042 chainlen = 1
1043 1043
1044 1044 return _deltainfo(
1045 1045 dist,
1046 1046 deltalen,
1047 1047 data,
1048 1048 deltabase,
1049 1049 chainbase,
1050 1050 chainlen,
1051 1051 compresseddeltalen,
1052 1052 snapshotdepth,
1053 1053 )
1054 1054
1055 1055 def finddeltainfo(self, revinfo, fh, excluded_bases=None, target_rev=None):
1056 1056 """Find an acceptable delta against a candidate revision
1057 1057
1058 1058 revinfo: information about the revision (instance of _revisioninfo)
1059 1059 fh: file handle to either the .i or the .d revlog file,
1060 1060 depending on whether it is inlined or not
1061 1061
1062 1062 Returns the first acceptable candidate revision, as ordered by
1063 1063 _candidategroups
1064 1064
1065 1065 If no suitable deltabase is found, we return delta info for a full
1066 1066 snapshot.
1067 1067
1068 1068 `excluded_bases` is an optional set of revisions that cannot be used as
1069 1069 a delta base. Use this to recompute deltas suitable in a censor or strip
1070 1070 context.
1071 1071 """
1072 1072 if target_rev is None:
1073 curr = len(self.revlog)
1073 target_rev = len(self.revlog)
1074 1074
1075 1075 if not revinfo.textlen:
1076 1076 return self._fullsnapshotinfo(fh, revinfo, target_rev)
1077 1077
1078 1078 if excluded_bases is None:
1079 1079 excluded_bases = set()
1080 1080
1081 1081 # no delta for flag processor revision (see "candelta" for why)
1082 1082 # not calling candelta since only one revision needs test, also to
1083 1083 # avoid overhead fetching flags again.
1084 1084 if revinfo.flags & REVIDX_RAWTEXT_CHANGING_FLAGS:
1085 1085 return self._fullsnapshotinfo(fh, revinfo, target_rev)
1086 1086
1087 1087 cachedelta = revinfo.cachedelta
1088 1088 p1 = revinfo.p1
1089 1089 p2 = revinfo.p2
1090 1090 revlog = self.revlog
1091 1091
1092 1092 deltainfo = None
1093 1093 p1r, p2r = revlog.rev(p1), revlog.rev(p2)
1094 1094 groups = _candidategroups(
1095 1095 self.revlog, revinfo.textlen, p1r, p2r, cachedelta
1096 1096 )
1097 1097 candidaterevs = next(groups)
1098 1098 while candidaterevs is not None:
1099 1099 nominateddeltas = []
1100 1100 if deltainfo is not None:
1101 1101 # if we already found a good delta,
1102 1102 # challenge it against refined candidates
1103 1103 nominateddeltas.append(deltainfo)
1104 1104 for candidaterev in candidaterevs:
1105 1105 if candidaterev in excluded_bases:
1106 1106 continue
1107 1107 if candidaterev >= target_rev:
1108 1108 continue
1109 1109 candidatedelta = self._builddeltainfo(revinfo, candidaterev, fh)
1110 1110 if candidatedelta is not None:
1111 1111 if isgooddeltainfo(self.revlog, candidatedelta, revinfo):
1112 1112 nominateddeltas.append(candidatedelta)
1113 1113 if nominateddeltas:
1114 1114 deltainfo = min(nominateddeltas, key=lambda x: x.deltalen)
1115 1115 if deltainfo is not None:
1116 1116 candidaterevs = groups.send(deltainfo.base)
1117 1117 else:
1118 1118 candidaterevs = next(groups)
1119 1119
1120 1120 if deltainfo is None:
1121 1121 deltainfo = self._fullsnapshotinfo(fh, revinfo, target_rev)
1122 1122 return deltainfo
1123 1123
1124 1124
1125 1125 def delta_compression(default_compression_header, deltainfo):
1126 1126 """return (COMPRESSION_MODE, deltainfo)
1127 1127
1128 1128 used by revlog v2+ format to dispatch between PLAIN and DEFAULT
1129 1129 compression.
1130 1130 """
1131 1131 h, d = deltainfo.data
1132 1132 compression_mode = COMP_MODE_INLINE
1133 1133 if not h and not d:
1134 1134 # no data to store at all... declare them uncompressed
1135 1135 compression_mode = COMP_MODE_PLAIN
1136 1136 elif not h:
1137 1137 t = d[0:1]
1138 1138 if t == b'\0':
1139 1139 compression_mode = COMP_MODE_PLAIN
1140 1140 elif t == default_compression_header:
1141 1141 compression_mode = COMP_MODE_DEFAULT
1142 1142 elif h == b'u':
1143 1143 # we have a more efficient way to declare uncompressed
1144 1144 h = b''
1145 1145 compression_mode = COMP_MODE_PLAIN
1146 1146 deltainfo = drop_u_compression(deltainfo)
1147 1147 return compression_mode, deltainfo
@@ -1,507 +1,565 b''
1 1 #require no-reposimplestore
2 #testcases revlogv1 revlogv2
3
4 #if revlogv2
5
6 $ cat >> $HGRCPATH <<EOF
7 > [experimental]
8 > revlogv2=enable-unstable-format-and-corrupt-my-data
9 > EOF
10
11 #endif
2 12
3 13 $ cat >> $HGRCPATH <<EOF
4 14 > [extensions]
5 15 > censor=
6 16 > EOF
7 17 $ cp $HGRCPATH $HGRCPATH.orig
8 18
9 19 Create repo with unimpeachable content
10 20
11 21 $ hg init r
12 22 $ cd r
13 23 $ echo 'Initially untainted file' > target
14 24 $ echo 'Normal file here' > bystander
15 25 $ hg add target bystander
16 26 $ hg ci -m init
17 27
18 28 Clone repo so we can test pull later
19 29
20 30 $ cd ..
21 31 $ hg clone r rpull
22 32 updating to branch default
23 33 2 files updated, 0 files merged, 0 files removed, 0 files unresolved
24 34 $ cd r
25 35
26 36 Introduce content which will ultimately require censorship. Name the first
27 37 censored node C1, second C2, and so on
28 38
29 39 $ echo 'Tainted file' > target
30 40 $ echo 'Passwords: hunter2' >> target
31 41 $ hg ci -m taint target
32 42 $ C1=`hg id --debug -i`
33 43
34 44 $ echo 'hunter3' >> target
35 45 $ echo 'Normal file v2' > bystander
36 46 $ hg ci -m moretaint target bystander
37 47 $ C2=`hg id --debug -i`
38 48
39 49 Add new sanitized versions to correct our mistake. Name the first head H1,
40 50 the second head H2, and so on
41 51
42 52 $ echo 'Tainted file is now sanitized' > target
43 53 $ hg ci -m sanitized target
44 54 $ H1=`hg id --debug -i`
45 55
46 56 $ hg update -r $C2
47 57 1 files updated, 0 files merged, 0 files removed, 0 files unresolved
48 58 $ echo 'Tainted file now super sanitized' > target
49 59 $ hg ci -m 'super sanitized' target
50 60 created new head
51 61 $ H2=`hg id --debug -i`
52 62
53 63 Verify target contents before censorship at each revision
54 64
55 65 $ hg cat -r $H1 target | head -n 10
56 66 Tainted file is now sanitized
57 67 $ hg cat -r $H2 target | head -n 10
58 68 Tainted file now super sanitized
59 69 $ hg cat -r $C2 target | head -n 10
60 70 Tainted file
61 71 Passwords: hunter2
62 72 hunter3
63 73 $ hg cat -r $C1 target | head -n 10
64 74 Tainted file
65 75 Passwords: hunter2
66 76 $ hg cat -r 0 target | head -n 10
67 77 Initially untainted file
68 78
69 79 Censor revision with 2 offenses
70 80
71 81 (this also tests file pattern matching: path relative to cwd case)
72 82
73 83 $ mkdir -p foo/bar/baz
74 84 $ hg --cwd foo/bar/baz censor -r $C2 -t "remove password" ../../../target
75 85 $ hg cat -r $H1 target | head -n 10
76 86 Tainted file is now sanitized
77 87 $ hg cat -r $H2 target | head -n 10
78 88 Tainted file now super sanitized
79 89 $ hg cat -r $C2 target | head -n 10
80 90 abort: censored node: 1e0247a9a4b7
81 91 (set censor.policy to ignore errors)
82 92 $ hg cat -r $C1 target | head -n 10
83 93 Tainted file
84 94 Passwords: hunter2
85 95 $ hg cat -r 0 target | head -n 10
86 96 Initially untainted file
87 97
88 98 Censor revision with 1 offense
89 99
90 100 (this also tests file pattern matching: with 'path:' scheme)
91 101
92 102 $ hg --cwd foo/bar/baz censor -r $C1 path:target
93 103 $ hg cat -r $H1 target | head -n 10
94 104 Tainted file is now sanitized
95 105 $ hg cat -r $H2 target | head -n 10
96 106 Tainted file now super sanitized
97 107 $ hg cat -r $C2 target | head -n 10
98 108 abort: censored node: 1e0247a9a4b7
99 109 (set censor.policy to ignore errors)
100 110 $ hg cat -r $C1 target | head -n 10
101 111 abort: censored node: 613bc869fceb
102 112 (set censor.policy to ignore errors)
103 113 $ hg cat -r 0 target | head -n 10
104 114 Initially untainted file
105 115
106 116 Can only check out target at uncensored revisions; -X is a workaround for --all
107 117
108 118 $ hg revert -r $C2 target | head -n 10
109 119 abort: censored node: 1e0247a9a4b7
110 120 (set censor.policy to ignore errors)
111 121 $ hg revert -r $C1 target | head -n 10
112 122 abort: censored node: 613bc869fceb
113 123 (set censor.policy to ignore errors)
114 124 $ hg revert -r $C1 --all
115 125 reverting bystander
116 126 reverting target
117 127 abort: censored node: 613bc869fceb
118 128 (set censor.policy to ignore errors)
119 129 [255]
120 130 $ hg revert -r $C1 --all -X target
121 131 $ cat target | head -n 10
122 132 Tainted file now super sanitized
123 133 $ hg revert -r 0 --all
124 134 reverting target
125 135 $ cat target | head -n 10
126 136 Initially untainted file
127 137 $ hg revert -r $H2 --all
128 138 reverting bystander
129 139 reverting target
130 140 $ cat target | head -n 10
131 141 Tainted file now super sanitized
132 142
133 143 Uncensored file can be viewed at any revision
134 144
135 145 $ hg cat -r $H1 bystander | head -n 10
136 146 Normal file v2
137 147 $ hg cat -r $C2 bystander | head -n 10
138 148 Normal file v2
139 149 $ hg cat -r $C1 bystander | head -n 10
140 150 Normal file here
141 151 $ hg cat -r 0 bystander | head -n 10
142 152 Normal file here
143 153
144 154 Can update to children of censored revision
145 155
146 156 $ hg update -r $H1
147 157 1 files updated, 0 files merged, 0 files removed, 0 files unresolved
148 158 $ cat target | head -n 10
149 159 Tainted file is now sanitized
150 160 $ hg update -r $H2
151 161 1 files updated, 0 files merged, 0 files removed, 0 files unresolved
152 162 $ cat target | head -n 10
153 163 Tainted file now super sanitized
154 164
155 165 Set censor policy to abort in trusted $HGRC so hg verify fails
156 166
157 167 $ cp $HGRCPATH.orig $HGRCPATH
158 168 $ cat >> $HGRCPATH <<EOF
159 169 > [censor]
160 170 > policy = abort
161 171 > EOF
162 172
163 173 Repo fails verification due to censorship
164 174
165 175 $ hg verify
166 176 checking changesets
167 177 checking manifests
168 178 crosschecking files in changesets and manifests
169 179 checking files
170 180 target@1: censored file data
171 181 target@2: censored file data
172 182 checked 5 changesets with 7 changes to 2 files
173 183 2 integrity errors encountered!
174 184 (first damaged changeset appears to be 1)
175 185 [1]
176 186
177 187 Cannot update to revision with censored data
178 188
179 189 $ hg update -r $C2
180 190 abort: censored node: 1e0247a9a4b7
181 191 (set censor.policy to ignore errors)
182 192 [255]
183 193 $ hg update -r $C1
184 194 abort: censored node: 613bc869fceb
185 195 (set censor.policy to ignore errors)
186 196 [255]
187 197 $ hg update -r 0
188 198 2 files updated, 0 files merged, 0 files removed, 0 files unresolved
189 199 $ hg update -r $H2
190 200 2 files updated, 0 files merged, 0 files removed, 0 files unresolved
191 201
192 202 Set censor policy to ignore in trusted $HGRC so hg verify passes
193 203
194 204 $ cp $HGRCPATH.orig $HGRCPATH
195 205 $ cat >> $HGRCPATH <<EOF
196 206 > [censor]
197 207 > policy = ignore
198 208 > EOF
199 209
200 210 Repo passes verification with warnings with explicit config
201 211
202 212 $ hg verify
203 213 checking changesets
204 214 checking manifests
205 215 crosschecking files in changesets and manifests
206 216 checking files
207 217 checked 5 changesets with 7 changes to 2 files
208 218
209 219 May update to revision with censored data with explicit config
210 220
211 221 $ hg update -r $C2
212 222 1 files updated, 0 files merged, 0 files removed, 0 files unresolved
213 223 $ cat target | head -n 10
214 224 $ hg update -r $C1
215 225 2 files updated, 0 files merged, 0 files removed, 0 files unresolved
216 226 $ cat target | head -n 10
217 227 $ hg update -r 0
218 228 1 files updated, 0 files merged, 0 files removed, 0 files unresolved
219 229 $ cat target | head -n 10
220 230 Initially untainted file
221 231 $ hg update -r $H2
222 232 2 files updated, 0 files merged, 0 files removed, 0 files unresolved
223 233 $ cat target | head -n 10
224 234 Tainted file now super sanitized
225 235
226 236 Can merge in revision with censored data. Test requires one branch of history
227 237 with the file censored, but we can't censor at a head, so advance H1.
228 238
229 239 $ hg update -r $H1
230 240 1 files updated, 0 files merged, 0 files removed, 0 files unresolved
231 241 $ C3=$H1
232 242 $ echo 'advanced head H1' > target
233 243 $ hg ci -m 'advance head H1' target
234 244 $ H1=`hg id --debug -i`
235 245 $ hg censor -r $C3 target
236 246 $ hg update -r $H2
237 247 1 files updated, 0 files merged, 0 files removed, 0 files unresolved
238 248 $ hg merge -r $C3
239 249 merging target
240 250 0 files updated, 1 files merged, 0 files removed, 0 files unresolved
241 251 (branch merge, don't forget to commit)
242 252
243 253 Revisions present in repository heads may not be censored
244 254
245 255 $ hg update -C -r $H2
246 256 1 files updated, 0 files merged, 0 files removed, 0 files unresolved
247 257 $ hg censor -r $H2 target
248 258 abort: cannot censor file in heads (78a8fc215e79)
249 259 (clean/delete and commit first)
250 260 [255]
251 261 $ echo 'twiddling thumbs' > bystander
252 262 $ hg ci -m 'bystander commit'
253 263 $ H2=`hg id --debug -i`
254 264 $ hg censor -r "$H2^" target
255 265 abort: cannot censor file in heads (efbe78065929)
256 266 (clean/delete and commit first)
257 267 [255]
258 268
259 269 Cannot censor working directory
260 270
261 271 $ echo 'seriously no passwords' > target
262 272 $ hg ci -m 'extend second head arbitrarily' target
263 273 $ H2=`hg id --debug -i`
264 274 $ hg update -r "$H2^"
265 275 1 files updated, 0 files merged, 0 files removed, 0 files unresolved
266 276 $ hg censor -r . target
267 277 abort: cannot censor working directory
268 278 (clean/delete/update first)
269 279 [255]
270 280 $ hg update -r $H2
271 281 1 files updated, 0 files merged, 0 files removed, 0 files unresolved
272 282
273 283 Can re-add file after being deleted + censored
274 284
275 285 $ C4=$H2
276 286 $ hg rm target
277 287 $ hg ci -m 'delete target so it may be censored'
278 288 $ H2=`hg id --debug -i`
279 289 $ hg censor -r $C4 target
280 290 $ hg cat -r $C4 target | head -n 10
281 291 $ hg cat -r "$H2^^" target | head -n 10
282 292 Tainted file now super sanitized
283 293 $ echo 'fresh start' > target
284 294 $ hg add target
285 295 $ hg ci -m reincarnated target
286 296 $ H2=`hg id --debug -i`
287 297 $ hg cat -r $H2 target | head -n 10
288 298 fresh start
289 299 $ hg cat -r "$H2^" target | head -n 10
290 300 target: no such file in rev 452ec1762369
291 301 $ hg cat -r $C4 target | head -n 10
292 302 $ hg cat -r "$H2^^^" target | head -n 10
293 303 Tainted file now super sanitized
294 304
295 305 Can censor after revlog has expanded to no longer permit inline storage
296 306
297 307 $ for x in `"$PYTHON" $TESTDIR/seq.py 0 50000`
298 308 > do
299 309 > echo "Password: hunter$x" >> target
300 310 > done
301 311 $ hg ci -m 'add 100k passwords'
302 312 $ H2=`hg id --debug -i`
303 313 $ C5=$H2
304 314 $ hg revert -r "$H2^" target
305 315 $ hg ci -m 'cleaned 100k passwords'
306 316 $ H2=`hg id --debug -i`
307 317 $ hg censor -r $C5 target
308 318 $ hg cat -r $C5 target | head -n 10
309 319 $ hg cat -r $H2 target | head -n 10
310 320 fresh start
311 321
312 322 Repo with censored nodes can be cloned and cloned nodes are censored
313 323
314 324 $ cd ..
315 325 $ hg clone r rclone
316 326 updating to branch default
317 327 2 files updated, 0 files merged, 0 files removed, 0 files unresolved
318 328 $ cd rclone
319 329 $ hg cat -r $H1 target | head -n 10
320 330 advanced head H1
321 331 $ hg cat -r $H2~5 target | head -n 10
322 332 Tainted file now super sanitized
323 333 $ hg cat -r $C2 target | head -n 10
324 334 $ hg cat -r $C1 target | head -n 10
325 335 $ hg cat -r 0 target | head -n 10
326 336 Initially untainted file
327 337 $ hg verify
328 338 checking changesets
329 339 checking manifests
330 340 crosschecking files in changesets and manifests
331 341 checking files
332 342 checked 12 changesets with 13 changes to 2 files
333 343
334 344 Repo cloned before tainted content introduced can pull censored nodes
335 345
336 346 $ cd ../rpull
337 347 $ hg cat -r tip target | head -n 10
338 348 Initially untainted file
339 349 $ hg verify
340 350 checking changesets
341 351 checking manifests
342 352 crosschecking files in changesets and manifests
343 353 checking files
344 354 checked 1 changesets with 2 changes to 2 files
345 355 $ hg pull -r $H1 -r $H2
346 356 pulling from $TESTTMP/r
347 357 searching for changes
348 358 adding changesets
349 359 adding manifests
350 360 adding file changes
351 361 added 11 changesets with 11 changes to 2 files (+1 heads)
352 362 new changesets 186fb27560c3:683e4645fded
353 363 (run 'hg heads' to see heads, 'hg merge' to merge)
354 364 $ hg update 4
355 365 2 files updated, 0 files merged, 0 files removed, 0 files unresolved
356 366 $ cat target | head -n 10
357 367 Tainted file now super sanitized
358 368 $ hg cat -r $H1 target | head -n 10
359 369 advanced head H1
360 370 $ hg cat -r $H2~5 target | head -n 10
361 371 Tainted file now super sanitized
362 372 $ hg cat -r $C2 target | head -n 10
363 373 $ hg cat -r $C1 target | head -n 10
364 374 $ hg cat -r 0 target | head -n 10
365 375 Initially untainted file
366 376 $ hg verify
367 377 checking changesets
368 378 checking manifests
369 379 crosschecking files in changesets and manifests
370 380 checking files
371 381 checked 12 changesets with 13 changes to 2 files
372 382
373 383 Censored nodes can be pushed if they censor previously unexchanged nodes
374 384
375 385 $ echo 'Passwords: hunter2hunter2' > target
376 386 $ hg ci -m 're-add password from clone' target
377 387 created new head
378 388 $ H3=`hg id --debug -i`
379 389 $ REV=$H3
380 390 $ echo 'Re-sanitized; nothing to see here' > target
381 391 $ hg ci -m 're-sanitized' target
382 392 $ H2=`hg id --debug -i`
383 393 $ CLEANREV=$H2
384 394 $ hg cat -r $REV target | head -n 10
385 395 Passwords: hunter2hunter2
386 396 $ hg censor -r $REV target
387 397 $ hg cat -r $REV target | head -n 10
388 398 $ hg cat -r $CLEANREV target | head -n 10
389 399 Re-sanitized; nothing to see here
390 400 $ hg push -f -r $H2
391 401 pushing to $TESTTMP/r
392 402 searching for changes
393 403 adding changesets
394 404 adding manifests
395 405 adding file changes
396 406 added 2 changesets with 2 changes to 1 files (+1 heads)
397 407
398 408 $ cd ../r
399 409 $ hg cat -r $REV target | head -n 10
400 410 $ hg cat -r $CLEANREV target | head -n 10
401 411 Re-sanitized; nothing to see here
402 412 $ hg update $CLEANREV
403 413 2 files updated, 0 files merged, 0 files removed, 0 files unresolved
404 414 $ cat target | head -n 10
405 415 Re-sanitized; nothing to see here
406 416
407 417 Censored nodes can be bundled up and unbundled in another repo
408 418
409 419 $ hg bundle --base 0 ../pwbundle
410 420 13 changesets found
411 421 $ cd ../rclone
412 422 $ hg unbundle ../pwbundle
413 423 adding changesets
414 424 adding manifests
415 425 adding file changes
416 426 added 2 changesets with 2 changes to 2 files (+1 heads)
417 427 new changesets 075be80ac777:dcbaf17bf3a1 (2 drafts)
418 428 (run 'hg heads .' to see heads, 'hg merge' to merge)
419 429 $ hg cat -r $REV target | head -n 10
420 430 $ hg cat -r $CLEANREV target | head -n 10
421 431 Re-sanitized; nothing to see here
422 432 $ hg update $CLEANREV
423 433 2 files updated, 0 files merged, 0 files removed, 0 files unresolved
424 434 $ cat target | head -n 10
425 435 Re-sanitized; nothing to see here
426 436 $ hg verify
427 437 checking changesets
428 438 checking manifests
429 439 crosschecking files in changesets and manifests
430 440 checking files
431 441 checked 14 changesets with 15 changes to 2 files
432 442
433 443 Grepping only warns, doesn't error out
434 444
435 445 $ cd ../rpull
436 446 $ hg grep 'Normal file'
437 447 bystander:Normal file v2
438 448 $ hg grep nothing
439 449 target:Re-sanitized; nothing to see here
440 450 $ hg grep --diff 'Normal file'
441 451 cannot search in censored file: target:7
442 452 cannot search in censored file: target:10
443 453 cannot search in censored file: target:12
444 454 bystander:6:-:Normal file v2
445 455 cannot search in censored file: target:1
446 456 cannot search in censored file: target:2
447 457 cannot search in censored file: target:3
448 458 bystander:2:-:Normal file here
449 459 bystander:2:+:Normal file v2
450 460 bystander:0:+:Normal file here
451 461 $ hg grep --diff nothing
452 462 cannot search in censored file: target:7
453 463 cannot search in censored file: target:10
454 464 cannot search in censored file: target:12
455 465 target:13:+:Re-sanitized; nothing to see here
456 466 cannot search in censored file: target:1
457 467 cannot search in censored file: target:2
458 468 cannot search in censored file: target:3
459 469
460 470 Censored nodes can be imported on top of censored nodes, consecutively
461 471
462 472 $ hg init ../rimport
463 473 $ hg bundle --base 1 ../rimport/splitbundle
464 474 12 changesets found
465 475 $ cd ../rimport
466 476 $ hg pull -r $H1 -r $H2 ../r
467 477 pulling from ../r
468 478 adding changesets
469 479 adding manifests
470 480 adding file changes
471 481 added 8 changesets with 10 changes to 2 files (+1 heads)
472 482 new changesets e97f55b2665a:dcbaf17bf3a1
473 483 (run 'hg heads' to see heads, 'hg merge' to merge)
474 484 $ hg unbundle splitbundle
475 485 adding changesets
476 486 adding manifests
477 487 adding file changes
478 488 added 6 changesets with 5 changes to 2 files (+1 heads)
479 489 new changesets efbe78065929:683e4645fded (6 drafts)
480 490 (run 'hg heads .' to see heads, 'hg merge' to merge)
481 491 $ hg update $H2
482 492 2 files updated, 0 files merged, 0 files removed, 0 files unresolved
483 493 $ cat target | head -n 10
484 494 Re-sanitized; nothing to see here
485 495 $ hg verify
486 496 checking changesets
487 497 checking manifests
488 498 crosschecking files in changesets and manifests
489 499 checking files
490 500 checked 14 changesets with 15 changes to 2 files
491 501 $ cd ../r
492 502
493 503 Can import bundle where first revision of a file is censored
494 504
495 505 $ hg init ../rinit
496 506 $ hg censor -r 0 target
497 507 $ hg bundle -r 0 --base null ../rinit/initbundle
498 508 1 changesets found
499 509 $ cd ../rinit
500 510 $ hg unbundle initbundle
501 511 adding changesets
502 512 adding manifests
503 513 adding file changes
504 514 added 1 changesets with 2 changes to 2 files
505 515 new changesets e97f55b2665a (1 drafts)
506 516 (run 'hg update' to get a working copy)
507 517 $ hg cat -r 0 target | head -n 10
518
519 #if revlogv2
520
521 Testing a feature that does not work in revlog v1
522 =================================================
523
524 Censoring a revision that is used as a delta base
525 -------------------------------------------------
526
527 $ cd ..
528 $ hg init censor-with-delta
529 $ cd censor-with-delta
530 $ echo root > target
531 $ hg add target
532 $ hg commit -m root
533 $ B0=`hg id --debug -i`
534 $ for x in `"$PYTHON" $TESTDIR/seq.py 0 50000`
535 > do
536 > echo "Password: hunter$x" >> target
537 > done
538 $ hg ci -m 'write a long file'
539 $ B1=`hg id --debug -i`
540 $ echo 'small change (should create a delta)' >> target
541 $ hg ci -m 'create a delta over the password'
542 (should show that the last revision is a delta, not a snapshot)
543 $ B2=`hg id --debug -i`
544
545 Make sure the last revision is a delta against the revision we will censor
546
547 $ hg debugdeltachain target -T '{rev} {chainid} {chainlen} {prevrev}\n'
548 0 1 1 -1
549 1 2 1 -1
550 2 2 2 1
551
552 Censor the file
553
554 $ hg cat -r $B1 target | wc -l
555 50002 (re)
556 $ hg censor -r $B1 target
557 $ hg cat -r $B1 target | wc -l
558 0 (re)
559
560 Check that the child is fine
561
562 $ hg cat -r $B2 target | wc -l
563 50003 (re)
564
565 #endif
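
The revlogv2-only block above exercises censoring a revision that another revision deltas against. As a toy illustration (pure Python, no Mercurial APIs; the "delta" format here is invented for the example) of why that case needs the new handling: once the base is replaced by an empty tombstone, the stored delta no longer reconstructs the child, so the censor pass must re-delta the child against another base or store it as a full snapshot.

    def make_delta(base, target):
        # naive "delta": length of the shared prefix plus the new tail
        common = 0
        while common < min(len(base), len(target)) and base[common] == target[common]:
            common += 1
        return common, target[common:]

    def apply_delta(base, delta):
        common, tail = delta
        return base[:common] + tail

    base = b"Password: hunter2\n" * 3              # revision that will be censored
    child = base + b"small change\n"               # stored as a delta against base
    delta = make_delta(base, child)

    assert apply_delta(base, delta) == child       # fine while the base is intact
    tombstone = b""                                # what censoring leaves behind
    assert apply_delta(tombstone, delta) != child  # old delta is useless, so re-snapshot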