diff --git a/mercurial/debugcommands.py b/mercurial/debugcommands.py --- a/mercurial/debugcommands.py +++ b/mercurial/debugcommands.py @@ -88,6 +88,10 @@ from .utils import ( stringutil, ) +from .revlogutils import ( + deltas as deltautil +) + release = lockmod.release command = registrar.command() @@ -706,7 +710,7 @@ def debugdeltachain(ui, repo, file_=None largestblock = 0 srchunks = 0 - for revschunk in revlog._slicechunk(r, chain): + for revschunk in deltautil.slicechunk(r, chain): srchunks += 1 blkend = start(revschunk[-1]) + length(revschunk[-1]) blksize = blkend - start(revschunk[0]) diff --git a/mercurial/revlog.py b/mercurial/revlog.py --- a/mercurial/revlog.py +++ b/mercurial/revlog.py @@ -17,7 +17,6 @@ import collections import contextlib import errno import hashlib -import heapq import os import re import struct @@ -39,7 +38,6 @@ from .i18n import _ from .revlogutils.constants import ( FLAG_GENERALDELTA, FLAG_INLINE_DATA, - LIMIT_DELTA2TEXT, REVIDX_DEFAULT_FLAGS, REVIDX_ELLIPSIS, REVIDX_EXTSTORED, @@ -69,6 +67,9 @@ from . import ( templatefilters, util, ) +from .revlogutils import ( + deltas as deltautil, +) from .utils import ( interfaceutil, stringutil, @@ -211,610 +212,6 @@ def hash(text, p1, p2): s.update(text) return s.digest() -class _testrevlog(object): - """minimalist fake revlog to use in doctests""" - - def __init__(self, data, density=0.5, mingap=0): - """data is an list of revision payload boundaries""" - self._data = data - self._srdensitythreshold = density - self._srmingapsize = mingap - - def start(self, rev): - if rev == 0: - return 0 - return self._data[rev - 1] - - def end(self, rev): - return self._data[rev] - - def length(self, rev): - return self.end(rev) - self.start(rev) - - def __len__(self): - return len(self._data) - -def _trimchunk(revlog, revs, startidx, endidx=None): - """returns revs[startidx:endidx] without empty trailing revs - - Doctest Setup - >>> revlog = _testrevlog([ - ... 5, #0 - ... 10, #1 - ... 12, #2 - ... 12, #3 (empty) - ... 17, #4 - ... 21, #5 - ... 21, #6 (empty) - ... ]) - - Contiguous cases: - >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0) - [0, 1, 2, 3, 4, 5] - >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0, 5) - [0, 1, 2, 3, 4] - >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0, 4) - [0, 1, 2] - >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 2, 4) - [2] - >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 3) - [3, 4, 5] - >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 3, 5) - [3, 4] - - Discontiguous cases: - >>> _trimchunk(revlog, [1, 3, 5, 6], 0) - [1, 3, 5] - >>> _trimchunk(revlog, [1, 3, 5, 6], 0, 2) - [1] - >>> _trimchunk(revlog, [1, 3, 5, 6], 1, 3) - [3, 5] - >>> _trimchunk(revlog, [1, 3, 5, 6], 1) - [3, 5] - """ - length = revlog.length - - if endidx is None: - endidx = len(revs) - - # If we have a non-emtpy delta candidate, there are nothing to trim - if revs[endidx - 1] < len(revlog): - # Trim empty revs at the end, except the very first revision of a chain - while (endidx > 1 - and endidx > startidx - and length(revs[endidx - 1]) == 0): - endidx -= 1 - - return revs[startidx:endidx] - -def _segmentspan(revlog, revs, deltainfo=None): - """Get the byte span of a segment of revisions - - revs is a sorted array of revision numbers - - >>> revlog = _testrevlog([ - ... 5, #0 - ... 10, #1 - ... 12, #2 - ... 12, #3 (empty) - ... 17, #4 - ... 
]) - - >>> _segmentspan(revlog, [0, 1, 2, 3, 4]) - 17 - >>> _segmentspan(revlog, [0, 4]) - 17 - >>> _segmentspan(revlog, [3, 4]) - 5 - >>> _segmentspan(revlog, [1, 2, 3,]) - 7 - >>> _segmentspan(revlog, [1, 3]) - 7 - """ - if not revs: - return 0 - if deltainfo is not None and len(revlog) <= revs[-1]: - if len(revs) == 1: - return deltainfo.deltalen - offset = revlog.end(len(revlog) - 1) - end = deltainfo.deltalen + offset - else: - end = revlog.end(revs[-1]) - return end - revlog.start(revs[0]) - -def _slicechunk(revlog, revs, deltainfo=None, targetsize=None): - """slice revs to reduce the amount of unrelated data to be read from disk. - - ``revs`` is sliced into groups that should be read in one time. - Assume that revs are sorted. - - The initial chunk is sliced until the overall density (payload/chunks-span - ratio) is above `revlog._srdensitythreshold`. No gap smaller than - `revlog._srmingapsize` is skipped. - - If `targetsize` is set, no chunk larger than `targetsize` will be yield. - For consistency with other slicing choice, this limit won't go lower than - `revlog._srmingapsize`. - - If individual revisions chunk are larger than this limit, they will still - be raised individually. - - >>> revlog = _testrevlog([ - ... 5, #00 (5) - ... 10, #01 (5) - ... 12, #02 (2) - ... 12, #03 (empty) - ... 27, #04 (15) - ... 31, #05 (4) - ... 31, #06 (empty) - ... 42, #07 (11) - ... 47, #08 (5) - ... 47, #09 (empty) - ... 48, #10 (1) - ... 51, #11 (3) - ... 74, #12 (23) - ... 85, #13 (11) - ... 86, #14 (1) - ... 91, #15 (5) - ... ]) - - >>> list(_slicechunk(revlog, list(range(16)))) - [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]] - >>> list(_slicechunk(revlog, [0, 15])) - [[0], [15]] - >>> list(_slicechunk(revlog, [0, 11, 15])) - [[0], [11], [15]] - >>> list(_slicechunk(revlog, [0, 11, 13, 15])) - [[0], [11, 13, 15]] - >>> list(_slicechunk(revlog, [1, 2, 3, 5, 8, 10, 11, 14])) - [[1, 2], [5, 8, 10, 11], [14]] - - Slicing with a maximum chunk size - >>> list(_slicechunk(revlog, [0, 11, 13, 15], targetsize=15)) - [[0], [11], [13], [15]] - >>> list(_slicechunk(revlog, [0, 11, 13, 15], targetsize=20)) - [[0], [11], [13, 15]] - """ - if targetsize is not None: - targetsize = max(targetsize, revlog._srmingapsize) - # targetsize should not be specified when evaluating delta candidates: - # * targetsize is used to ensure we stay within specification when reading, - # * deltainfo is used to pick are good delta chain when writing. - if not (deltainfo is None or targetsize is None): - msg = 'cannot use `targetsize` with a `deltainfo`' - raise error.ProgrammingError(msg) - for chunk in _slicechunktodensity(revlog, revs, - deltainfo, - revlog._srdensitythreshold, - revlog._srmingapsize): - for subchunk in _slicechunktosize(revlog, chunk, targetsize): - yield subchunk - -def _slicechunktosize(revlog, revs, targetsize=None): - """slice revs to match the target size - - This is intended to be used on chunk that density slicing selected by that - are still too large compared to the read garantee of revlog. This might - happens when "minimal gap size" interrupted the slicing or when chain are - built in a way that create large blocks next to each other. - - >>> revlog = _testrevlog([ - ... 3, #0 (3) - ... 5, #1 (2) - ... 6, #2 (1) - ... 8, #3 (2) - ... 8, #4 (empty) - ... 11, #5 (3) - ... 12, #6 (1) - ... 13, #7 (1) - ... 14, #8 (1) - ... 
]) - - Cases where chunk is already small enough - >>> list(_slicechunktosize(revlog, [0], 3)) - [[0]] - >>> list(_slicechunktosize(revlog, [6, 7], 3)) - [[6, 7]] - >>> list(_slicechunktosize(revlog, [0], None)) - [[0]] - >>> list(_slicechunktosize(revlog, [6, 7], None)) - [[6, 7]] - - cases where we need actual slicing - >>> list(_slicechunktosize(revlog, [0, 1], 3)) - [[0], [1]] - >>> list(_slicechunktosize(revlog, [1, 3], 3)) - [[1], [3]] - >>> list(_slicechunktosize(revlog, [1, 2, 3], 3)) - [[1, 2], [3]] - >>> list(_slicechunktosize(revlog, [3, 5], 3)) - [[3], [5]] - >>> list(_slicechunktosize(revlog, [3, 4, 5], 3)) - [[3], [5]] - >>> list(_slicechunktosize(revlog, [5, 6, 7, 8], 3)) - [[5], [6, 7, 8]] - >>> list(_slicechunktosize(revlog, [0, 1, 2, 3, 4, 5, 6, 7, 8], 3)) - [[0], [1, 2], [3], [5], [6, 7, 8]] - - Case with too large individual chunk (must return valid chunk) - >>> list(_slicechunktosize(revlog, [0, 1], 2)) - [[0], [1]] - >>> list(_slicechunktosize(revlog, [1, 3], 1)) - [[1], [3]] - >>> list(_slicechunktosize(revlog, [3, 4, 5], 2)) - [[3], [5]] - """ - assert targetsize is None or 0 <= targetsize - if targetsize is None or _segmentspan(revlog, revs) <= targetsize: - yield revs - return - - startrevidx = 0 - startdata = revlog.start(revs[0]) - endrevidx = 0 - iterrevs = enumerate(revs) - next(iterrevs) # skip first rev. - for idx, r in iterrevs: - span = revlog.end(r) - startdata - if span <= targetsize: - endrevidx = idx - else: - chunk = _trimchunk(revlog, revs, startrevidx, endrevidx + 1) - if chunk: - yield chunk - startrevidx = idx - startdata = revlog.start(r) - endrevidx = idx - yield _trimchunk(revlog, revs, startrevidx) - -def _slicechunktodensity(revlog, revs, deltainfo=None, targetdensity=0.5, - mingapsize=0): - """slice revs to reduce the amount of unrelated data to be read from disk. - - ``revs`` is sliced into groups that should be read in one time. - Assume that revs are sorted. - - ``deltainfo`` is a _deltainfo instance of a revision that we would append - to the top of the revlog. - - The initial chunk is sliced until the overall density (payload/chunks-span - ratio) is above `targetdensity`. No gap smaller than `mingapsize` is - skipped. - - >>> revlog = _testrevlog([ - ... 5, #00 (5) - ... 10, #01 (5) - ... 12, #02 (2) - ... 12, #03 (empty) - ... 27, #04 (15) - ... 31, #05 (4) - ... 31, #06 (empty) - ... 42, #07 (11) - ... 47, #08 (5) - ... 47, #09 (empty) - ... 48, #10 (1) - ... 51, #11 (3) - ... 74, #12 (23) - ... 85, #13 (11) - ... 86, #14 (1) - ... 91, #15 (5) - ... ]) - - >>> list(_slicechunktodensity(revlog, list(range(16)))) - [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]] - >>> list(_slicechunktodensity(revlog, [0, 15])) - [[0], [15]] - >>> list(_slicechunktodensity(revlog, [0, 11, 15])) - [[0], [11], [15]] - >>> list(_slicechunktodensity(revlog, [0, 11, 13, 15])) - [[0], [11, 13, 15]] - >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14])) - [[1, 2], [5, 8, 10, 11], [14]] - >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14], - ... mingapsize=20)) - [[1, 2, 3, 5, 8, 10, 11], [14]] - >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14], - ... targetdensity=0.95)) - [[1, 2], [5], [8, 10, 11], [14]] - >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14], - ... 
targetdensity=0.95, mingapsize=12)) - [[1, 2], [5, 8, 10, 11], [14]] - """ - start = revlog.start - length = revlog.length - - if len(revs) <= 1: - yield revs - return - - nextrev = len(revlog) - nextoffset = revlog.end(nextrev - 1) - - if deltainfo is None: - deltachainspan = _segmentspan(revlog, revs) - chainpayload = sum(length(r) for r in revs) - else: - deltachainspan = deltainfo.distance - chainpayload = deltainfo.compresseddeltalen - - if deltachainspan < mingapsize: - yield revs - return - - readdata = deltachainspan - - if deltachainspan: - density = chainpayload / float(deltachainspan) - else: - density = 1.0 - - if density >= targetdensity: - yield revs - return - - if deltainfo is not None and deltainfo.deltalen: - revs = list(revs) - revs.append(nextrev) - - # Store the gaps in a heap to have them sorted by decreasing size - gapsheap = [] - heapq.heapify(gapsheap) - prevend = None - for i, rev in enumerate(revs): - if rev < nextrev: - revstart = start(rev) - revlen = length(rev) - else: - revstart = nextoffset - revlen = deltainfo.deltalen - - # Skip empty revisions to form larger holes - if revlen == 0: - continue - - if prevend is not None: - gapsize = revstart - prevend - # only consider holes that are large enough - if gapsize > mingapsize: - heapq.heappush(gapsheap, (-gapsize, i)) - - prevend = revstart + revlen - - # Collect the indices of the largest holes until the density is acceptable - indicesheap = [] - heapq.heapify(indicesheap) - while gapsheap and density < targetdensity: - oppgapsize, gapidx = heapq.heappop(gapsheap) - - heapq.heappush(indicesheap, gapidx) - - # the gap sizes are stored as negatives to be sorted decreasingly - # by the heap - readdata -= (-oppgapsize) - if readdata > 0: - density = chainpayload / float(readdata) - else: - density = 1.0 - - # Cut the revs at collected indices - previdx = 0 - while indicesheap: - idx = heapq.heappop(indicesheap) - - chunk = _trimchunk(revlog, revs, previdx, idx) - if chunk: - yield chunk - - previdx = idx - - chunk = _trimchunk(revlog, revs, previdx) - if chunk: - yield chunk - -@attr.s(slots=True, frozen=True) -class _deltainfo(object): - distance = attr.ib() - deltalen = attr.ib() - data = attr.ib() - base = attr.ib() - chainbase = attr.ib() - chainlen = attr.ib() - compresseddeltalen = attr.ib() - snapshotdepth = attr.ib() - -class _deltacomputer(object): - def __init__(self, revlog): - self.revlog = revlog - - def _getcandidaterevs(self, p1, p2, cachedelta): - """ - Provides revisions that present an interest to be diffed against, - grouped by level of easiness. - """ - revlog = self.revlog - gdelta = revlog._generaldelta - curr = len(revlog) - prev = curr - 1 - p1r, p2r = revlog.rev(p1), revlog.rev(p2) - - # should we try to build a delta? - if prev != nullrev and revlog._storedeltachains: - tested = set() - # This condition is true most of the time when processing - # changegroup data into a generaldelta repo. The only time it - # isn't true is if this is the first revision in a delta chain - # or if ``format.generaldelta=true`` disabled ``lazydeltabase``. 
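
As an illustration (not part of the patch), the gap-skipping loop of `_slicechunktodensity` above condenses to a small standalone sketch; `sketch_density`, `starts`, and `sizes` are illustrative names:

import heapq

def sketch_density(starts, sizes, revs, targetdensity=0.5, mingapsize=0):
    """Return (density, skipped) after dropping the largest gaps.

    starts[r]/sizes[r] give the offset and length of revision r on disk.
    """
    payload = sum(sizes[r] for r in revs)          # bytes actually needed
    span = starts[revs[-1]] + sizes[revs[-1]] - starts[revs[0]]
    gaps = []
    prevend = None
    for r in revs:
        if sizes[r] == 0:
            continue                               # empty revs merge gaps
        if prevend is not None:
            gap = starts[r] - prevend
            if gap > mingapsize:
                heapq.heappush(gaps, -gap)         # negate for a max-heap
        prevend = starts[r] + sizes[r]
    readdata = span
    density = payload / float(span) if span else 1.0
    skipped = 0
    while gaps and density < targetdensity:
        readdata += heapq.heappop(gaps)            # removes the largest gap
        skipped += 1
        density = payload / float(readdata) if readdata > 0 else 1.0
    return density, skipped

# two 5-byte revisions separated by a 100-byte gap: one skip suffices
print(sketch_density({0: 0, 1: 105}, {0: 5, 1: 5}, [0, 1]))  # (1.0, 1)
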
- if cachedelta and gdelta and revlog._lazydeltabase: - # Assume what we received from the server is a good choice - # build delta will reuse the cache - yield (cachedelta[0],) - tested.add(cachedelta[0]) - - if gdelta: - # exclude already lazy tested base if any - parents = [p for p in (p1r, p2r) - if p != nullrev and p not in tested] - - if not revlog._deltabothparents and len(parents) == 2: - parents.sort() - # To minimize the chance of having to build a fulltext, - # pick first whichever parent is closest to us (max rev) - yield (parents[1],) - # then the other one (min rev) if the first did not fit - yield (parents[0],) - tested.update(parents) - elif len(parents) > 0: - # Test all parents (1 or 2), and keep the best candidate - yield parents - tested.update(parents) - - if prev not in tested: - # other approach failed try against prev to hopefully save us a - # fulltext. - yield (prev,) - tested.add(prev) - - def buildtext(self, revinfo, fh): - """Builds a fulltext version of a revision - - revinfo: _revisioninfo instance that contains all needed info - fh: file handle to either the .i or the .d revlog file, - depending on whether it is inlined or not - """ - btext = revinfo.btext - if btext[0] is not None: - return btext[0] - - revlog = self.revlog - cachedelta = revinfo.cachedelta - flags = revinfo.flags - node = revinfo.node - - baserev = cachedelta[0] - delta = cachedelta[1] - # special case deltas which replace entire base; no need to decode - # base revision. this neatly avoids censored bases, which throw when - # they're decoded. - hlen = struct.calcsize(">lll") - if delta[:hlen] == mdiff.replacediffheader(revlog.rawsize(baserev), - len(delta) - hlen): - btext[0] = delta[hlen:] - else: - # deltabase is rawtext before changed by flag processors, which is - # equivalent to non-raw text - basetext = revlog.revision(baserev, _df=fh, raw=False) - btext[0] = mdiff.patch(basetext, delta) - - try: - res = revlog._processflags(btext[0], flags, 'read', raw=True) - btext[0], validatehash = res - if validatehash: - revlog.checkhash(btext[0], node, p1=revinfo.p1, p2=revinfo.p2) - if flags & REVIDX_ISCENSORED: - raise RevlogError(_('node %s is not censored') % node) - except CensoredNodeError: - # must pass the censored index flag to add censored revisions - if not flags & REVIDX_ISCENSORED: - raise - return btext[0] - - def _builddeltadiff(self, base, revinfo, fh): - revlog = self.revlog - t = self.buildtext(revinfo, fh) - if revlog.iscensored(base): - # deltas based on a censored revision must replace the - # full content in one patch, so delta works everywhere - header = mdiff.replacediffheader(revlog.rawsize(base), len(t)) - delta = header + t - else: - ptext = revlog.revision(base, _df=fh, raw=True) - delta = mdiff.textdiff(ptext, t) - - return delta - - def _builddeltainfo(self, revinfo, base, fh): - # can we use the cached delta? 
- if revinfo.cachedelta and revinfo.cachedelta[0] == base: - delta = revinfo.cachedelta[1] - else: - delta = self._builddeltadiff(base, revinfo, fh) - revlog = self.revlog - header, data = revlog.compress(delta) - deltalen = len(header) + len(data) - chainbase = revlog.chainbase(base) - offset = revlog.end(len(revlog) - 1) - dist = deltalen + offset - revlog.start(chainbase) - if revlog._generaldelta: - deltabase = base - else: - deltabase = chainbase - chainlen, compresseddeltalen = revlog._chaininfo(base) - chainlen += 1 - compresseddeltalen += deltalen - - revlog = self.revlog - snapshotdepth = None - if deltabase == nullrev: - snapshotdepth = 0 - elif revlog._sparserevlog and revlog.issnapshot(deltabase): - # A delta chain should always be one full snapshot, - # zero or more semi-snapshots, and zero or more deltas - p1, p2 = revlog.rev(revinfo.p1), revlog.rev(revinfo.p2) - if deltabase not in (p1, p2) and revlog.issnapshot(deltabase): - snapshotdepth = len(revlog._deltachain(deltabase)[0]) - - return _deltainfo(dist, deltalen, (header, data), deltabase, - chainbase, chainlen, compresseddeltalen, - snapshotdepth) - - def finddeltainfo(self, revinfo, fh): - """Find an acceptable delta against a candidate revision - - revinfo: information about the revision (instance of _revisioninfo) - fh: file handle to either the .i or the .d revlog file, - depending on whether it is inlined or not - - Returns the first acceptable candidate revision, as ordered by - _getcandidaterevs - """ - if not revinfo.textlen: - return None # empty file do not need delta - - cachedelta = revinfo.cachedelta - p1 = revinfo.p1 - p2 = revinfo.p2 - revlog = self.revlog - - deltalength = self.revlog.length - deltaparent = self.revlog.deltaparent - - deltainfo = None - deltas_limit = revinfo.textlen * LIMIT_DELTA2TEXT - for candidaterevs in self._getcandidaterevs(p1, p2, cachedelta): - # filter out delta base that will never produce good delta - candidaterevs = [r for r in candidaterevs - if self.revlog.length(r) <= deltas_limit] - nominateddeltas = [] - for candidaterev in candidaterevs: - # skip over empty delta (no need to include them in a chain) - while candidaterev != nullrev and not deltalength(candidaterev): - candidaterev = deltaparent(candidaterev) - # no need to try a delta against nullid, this will be handled - # by fulltext later. 
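
The surrounding loop boils down to a simple pattern: try candidate groups from cheapest to costliest, and within the first group that yields an acceptable delta, keep the smallest one. A minimal sketch (not part of the patch), with `Delta`, `builddelta`, and `isgood` as hypothetical stand-ins for `_deltainfo`, `_builddeltainfo`, and the goodness check:

import collections

Delta = collections.namedtuple('Delta', 'base deltalen')

def sketch_finddeltainfo(candidate_groups, builddelta, isgood):
    for group in candidate_groups:
        nominated = [d for d in map(builddelta, group) if isgood(d)]
        if nominated:
            # first acceptable group wins; prefer the smallest delta in it
            return min(nominated, key=lambda d: d.deltalen)
    return None  # no good candidate: the caller stores a fulltext

# parents first, then the previous revision, as in _getcandidaterevs
groups = [(7, 3), (9,)]
build = lambda base: Delta(base, deltalen=100 + base)
print(sketch_finddeltainfo(groups, build, lambda d: d.deltalen < 110))
# Delta(base=3, deltalen=103)
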
- if candidaterev == nullrev: - continue - # no delta for rawtext-changing revs (see "candelta" for why) - if revlog.flags(candidaterev) & REVIDX_RAWTEXT_CHANGING_FLAGS: - continue - candidatedelta = self._builddeltainfo(revinfo, candidaterev, fh) - if revlog._isgooddeltainfo(candidatedelta, revinfo): - nominateddeltas.append(candidatedelta) - if nominateddeltas: - deltainfo = min(nominateddeltas, key=lambda x: x.deltalen) - break - - return deltainfo - @attr.s(slots=True, frozen=True) class _revisioninfo(object): """Information about a revision that allows building its fulltext @@ -2095,7 +1492,8 @@ class revlog(object): if not self._withsparseread: slicedchunks = (revs,) else: - slicedchunks = _slicechunk(self, revs, targetsize=targetsize) + slicedchunks = deltautil.slicechunk(self, revs, + targetsize=targetsize) for revschunk in slicedchunks: firstrev = revschunk[0] @@ -2393,7 +1791,7 @@ class revlog(object): computed by default as hash(text, p1, p2), however subclasses might use different hashing method (and override checkhash() in such case) flags - the known flags to set on the revision - deltacomputer - an optional _deltacomputer instance shared between + deltacomputer - an optional deltacomputer instance shared between multiple calls """ if link == nullrev: @@ -2516,95 +1914,6 @@ class revlog(object): return compressor.decompress(data) - def _isgooddeltainfo(self, deltainfo, revinfo): - """Returns True if the given delta is good. Good means that it is within - the disk span, disk size, and chain length bounds that we know to be - performant.""" - if deltainfo is None: - return False - - # - 'deltainfo.distance' is the distance from the base revision -- - # bounding it limits the amount of I/O we need to do. - # - 'deltainfo.compresseddeltalen' is the sum of the total size of - # deltas we need to apply -- bounding it limits the amount of CPU - # we consume. - - if self._sparserevlog: - # As sparse-read will be used, we can consider that the distance, - # instead of being the span of the whole chunk, - # is the span of the largest read chunk - base = deltainfo.base - - if base != nullrev: - deltachain = self._deltachain(base)[0] - else: - deltachain = [] - - # search for the first non-snapshot revision - for idx, r in enumerate(deltachain): - if not self.issnapshot(r): - break - deltachain = deltachain[idx:] - chunks = _slicechunk(self, deltachain, deltainfo) - all_span = [_segmentspan(self, revs, deltainfo) for revs in chunks] - distance = max(all_span) - else: - distance = deltainfo.distance - - textlen = revinfo.textlen - defaultmax = textlen * 4 - maxdist = self._maxdeltachainspan - if not maxdist: - maxdist = distance # ensure the conditional pass - maxdist = max(maxdist, defaultmax) - if self._sparserevlog and maxdist < self._srmingapsize: - # In multiple place, we are ignoring irrelevant data range below a - # certain size. Be also apply this tradeoff here and relax span - # constraint for small enought content. - maxdist = self._srmingapsize - - # Bad delta from read span: - # - # If the span of data read is larger than the maximum allowed. - if maxdist < distance: - return False - - # Bad delta from new delta size: - # - # If the delta size is larger than the target text, storing the - # delta will be inefficient. - if textlen < deltainfo.deltalen: - return False - - # Bad delta from cumulated payload size: - # - # If the sum of delta get larger than K * target text length. 
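
For reference, the read-span bound derived above can be summarized on its own; `sketch_spancheck` is an illustrative stand-in (not part of the patch) mirroring the `maxdist` computation in `_isgooddeltainfo`:

def sketch_spancheck(distance, textlen, maxdeltachainspan, srmingapsize,
                     sparserevlog):
    maxdist = maxdeltachainspan
    if not maxdist:
        maxdist = distance               # no configured limit: always pass
    maxdist = max(maxdist, textlen * 4)  # reading 4x the fulltext is fine
    if sparserevlog and maxdist < srmingapsize:
        # gaps below srmingapsize are ignored when reading, so a tighter
        # span bound than that would be pointless
        maxdist = srmingapsize
    return distance <= maxdist

# a 1000-byte text tolerates up to a 4000-byte read span by default
print(sketch_spancheck(3500, 1000, 2000, 262144, False))  # True
print(sketch_spancheck(4500, 1000, 2000, 262144, False))  # False
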
- if textlen * LIMIT_DELTA2TEXT < deltainfo.compresseddeltalen: - return False - - # Bad delta from chain length: - # - # If the number of delta in the chain gets too high. - if self._maxchainlen and self._maxchainlen < deltainfo.chainlen: - return False - - # bad delta from intermediate snapshot size limit - # - # If an intermediate snapshot size is higher than the limit. The - # limit exist to prevent endless chain of intermediate delta to be - # created. - if (deltainfo.snapshotdepth is not None and - (textlen >> deltainfo.snapshotdepth) < deltainfo.deltalen): - return False - - # bad delta if new intermediate snapshot is larger than the previous - # snapshot - if (deltainfo.snapshotdepth - and self.length(deltainfo.base) < deltainfo.deltalen): - return False - - return True - def _addrevision(self, node, rawtext, transaction, link, p1, p2, flags, cachedelta, ifh, dfh, alwayscache=False, deltacomputer=None): @@ -2652,7 +1961,7 @@ class revlog(object): textlen = len(rawtext) if deltacomputer is None: - deltacomputer = _deltacomputer(self) + deltacomputer = deltautil.deltacomputer(self) revinfo = _revisioninfo(node, p1, p2, btext, textlen, cachedelta, flags) @@ -2754,7 +2063,7 @@ class revlog(object): dfh.flush() ifh.flush() try: - deltacomputer = _deltacomputer(self) + deltacomputer = deltautil.deltacomputer(self) # loop through our set of deltas for data in deltas: node, p1, p2, linknode, deltabase, delta, flags = data @@ -3127,7 +2436,7 @@ class revlog(object): populatecachedelta = deltareuse in (self.DELTAREUSEALWAYS, self.DELTAREUSESAMEREVS) - deltacomputer = _deltacomputer(destrevlog) + deltacomputer = deltautil.deltacomputer(destrevlog) index = self.index for rev in self: entry = index[rev] diff --git a/mercurial/revlogutils/constants.py b/mercurial/revlogutils/constants.py --- a/mercurial/revlogutils/constants.py +++ b/mercurial/revlogutils/constants.py @@ -41,6 +41,3 @@ REVIDX_FLAGS_ORDER = [ REVIDX_KNOWN_FLAGS = util.bitsfrom(REVIDX_FLAGS_ORDER) # bitmark for flags that could cause rawdata content change REVIDX_RAWTEXT_CHANGING_FLAGS = REVIDX_ISCENSORED | REVIDX_EXTSTORED - -# maximum / ratio -LIMIT_DELTA2TEXT = 2 diff --git a/mercurial/revlog.py b/mercurial/revlogutils/deltas.py copy from mercurial/revlog.py copy to mercurial/revlogutils/deltas.py --- a/mercurial/revlog.py +++ b/mercurial/revlogutils/deltas.py @@ -1,215 +1,42 @@ -# revlog.py - storage back-end for mercurial +# revlogdeltas.py - Logic around delta computation for revlog # # Copyright 2005-2007 Matt Mackall +# Copyright 2018 Octobus # # This software may be used and distributed according to the terms of the # GNU General Public License version 2 or any later version. - -"""Storage back-end for Mercurial. - -This provides efficient delta storage with O(1) retrieve and append -and O(changes) merge between branches. 
-""" +"""Helper class to compute deltas stored inside revlogs""" from __future__ import absolute_import -import collections -import contextlib -import errno -import hashlib import heapq -import os -import re import struct -import zlib # import stuff from node for others to import from revlog -from .node import ( - bin, - hex, - nullhex, - nullid, +from ..node import ( nullrev, - wdirfilenodeids, - wdirhex, - wdirid, - wdirrev, ) -from .i18n import _ -from .revlogutils.constants import ( - FLAG_GENERALDELTA, - FLAG_INLINE_DATA, - LIMIT_DELTA2TEXT, - REVIDX_DEFAULT_FLAGS, - REVIDX_ELLIPSIS, - REVIDX_EXTSTORED, - REVIDX_FLAGS_ORDER, +from ..i18n import _ + +from .constants import ( REVIDX_ISCENSORED, - REVIDX_KNOWN_FLAGS, REVIDX_RAWTEXT_CHANGING_FLAGS, - REVLOGV0, - REVLOGV1, - REVLOGV1_FLAGS, - REVLOGV2, - REVLOGV2_FLAGS, - REVLOG_DEFAULT_FLAGS, - REVLOG_DEFAULT_FORMAT, - REVLOG_DEFAULT_VERSION, ) -from .thirdparty import ( + +from ..thirdparty import ( attr, ) -from . import ( - ancestor, + +from .. import ( error, mdiff, - policy, - pycompat, - repository, - templatefilters, - util, ) -from .utils import ( - interfaceutil, - stringutil, -) - -# blanked usage of all the name to prevent pyflakes constraints -# We need these name available in the module for extensions. -REVLOGV0 -REVLOGV1 -REVLOGV2 -FLAG_INLINE_DATA -FLAG_GENERALDELTA -REVLOG_DEFAULT_FLAGS -REVLOG_DEFAULT_FORMAT -REVLOG_DEFAULT_VERSION -REVLOGV1_FLAGS -REVLOGV2_FLAGS -REVIDX_ISCENSORED -REVIDX_ELLIPSIS -REVIDX_EXTSTORED -REVIDX_DEFAULT_FLAGS -REVIDX_FLAGS_ORDER -REVIDX_KNOWN_FLAGS -REVIDX_RAWTEXT_CHANGING_FLAGS - -parsers = policy.importmod(r'parsers') - -# Aliased for performance. -_zlibdecompress = zlib.decompress - -# max size of revlog with inline data -_maxinline = 131072 -_chunksize = 1048576 RevlogError = error.RevlogError -LookupError = error.LookupError -AmbiguousPrefixLookupError = error.AmbiguousPrefixLookupError CensoredNodeError = error.CensoredNodeError -ProgrammingError = error.ProgrammingError - -# Store flag processors (cf. 'addflagprocessor()' to register) -_flagprocessors = { - REVIDX_ISCENSORED: None, -} - -_mdre = re.compile('\1\n') -def parsemeta(text): - """return (metadatadict, metadatasize)""" - # text can be buffer, so we can't use .startswith or .index - if text[:2] != '\1\n': - return None, None - s = _mdre.search(text, 2).start() - mtext = text[2:s] - meta = {} - for l in mtext.splitlines(): - k, v = l.split(": ", 1) - meta[k] = v - return meta, (s + 2) - -def packmeta(meta, text): - keys = sorted(meta) - metatext = "".join("%s: %s\n" % (k, meta[k]) for k in keys) - return "\1\n%s\1\n%s" % (metatext, text) - -def _censoredtext(text): - m, offs = parsemeta(text) - return m and "censored" in m - -def addflagprocessor(flag, processor): - """Register a flag processor on a revision data flag. - Invariant: - - Flags need to be defined in REVIDX_KNOWN_FLAGS and REVIDX_FLAGS_ORDER, - and REVIDX_RAWTEXT_CHANGING_FLAGS if they can alter rawtext. - - Only one flag processor can be registered on a specific flag. - - flagprocessors must be 3-tuples of functions (read, write, raw) with the - following signatures: - - (read) f(self, rawtext) -> text, bool - - (write) f(self, text) -> rawtext, bool - - (raw) f(self, rawtext) -> bool - "text" is presented to the user. "rawtext" is stored in revlog data, not - directly visible to the user. - The boolean returned by these transforms is used to determine whether - the returned text can be used for hash integrity checking. 
For example, - if "write" returns False, then "text" is used to generate hash. If - "write" returns True, that basically means "rawtext" returned by "write" - should be used to generate hash. Usually, "write" and "read" return - different booleans. And "raw" returns a same boolean as "write". - - Note: The 'raw' transform is used for changegroup generation and in some - debug commands. In this case the transform only indicates whether the - contents can be used for hash integrity checks. - """ - if not flag & REVIDX_KNOWN_FLAGS: - msg = _("cannot register processor on unknown flag '%#x'.") % (flag) - raise ProgrammingError(msg) - if flag not in REVIDX_FLAGS_ORDER: - msg = _("flag '%#x' undefined in REVIDX_FLAGS_ORDER.") % (flag) - raise ProgrammingError(msg) - if flag in _flagprocessors: - msg = _("cannot register multiple processors on flag '%#x'.") % (flag) - raise error.Abort(msg) - _flagprocessors[flag] = processor - -def getoffset(q): - return int(q >> 16) - -def gettype(q): - return int(q & 0xFFFF) - -def offset_type(offset, type): - if (type & ~REVIDX_KNOWN_FLAGS) != 0: - raise ValueError('unknown revlog index flags') - return int(int(offset) << 16 | type) - -_nullhash = hashlib.sha1(nullid) - -def hash(text, p1, p2): - """generate a hash from the given text and its parent hashes - - This hash combines both the current file contents and its history - in a manner that makes it easy to distinguish nodes with the same - content in the revision graph. - """ - # As of now, if one of the parent node is null, p2 is null - if p2 == nullid: - # deep copy of a hash is faster than creating one - s = _nullhash.copy() - s.update(p1) - else: - # none of the parent nodes are nullid - if p1 < p2: - a = p1 - b = p2 - else: - a = p2 - b = p1 - s = hashlib.sha1(a) - s.update(b) - s.update(text) - return s.digest() +# maximum / ratio +LIMIT_DELTA2TEXT = 2 class _testrevlog(object): """minimalist fake revlog to use in doctests""" @@ -234,95 +61,7 @@ class _testrevlog(object): def __len__(self): return len(self._data) -def _trimchunk(revlog, revs, startidx, endidx=None): - """returns revs[startidx:endidx] without empty trailing revs - - Doctest Setup - >>> revlog = _testrevlog([ - ... 5, #0 - ... 10, #1 - ... 12, #2 - ... 12, #3 (empty) - ... 17, #4 - ... 21, #5 - ... 21, #6 (empty) - ... ]) - - Contiguous cases: - >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0) - [0, 1, 2, 3, 4, 5] - >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0, 5) - [0, 1, 2, 3, 4] - >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0, 4) - [0, 1, 2] - >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 2, 4) - [2] - >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 3) - [3, 4, 5] - >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 3, 5) - [3, 4] - - Discontiguous cases: - >>> _trimchunk(revlog, [1, 3, 5, 6], 0) - [1, 3, 5] - >>> _trimchunk(revlog, [1, 3, 5, 6], 0, 2) - [1] - >>> _trimchunk(revlog, [1, 3, 5, 6], 1, 3) - [3, 5] - >>> _trimchunk(revlog, [1, 3, 5, 6], 1) - [3, 5] - """ - length = revlog.length - - if endidx is None: - endidx = len(revs) - - # If we have a non-emtpy delta candidate, there are nothing to trim - if revs[endidx - 1] < len(revlog): - # Trim empty revs at the end, except the very first revision of a chain - while (endidx > 1 - and endidx > startidx - and length(revs[endidx - 1]) == 0): - endidx -= 1 - - return revs[startidx:endidx] - -def _segmentspan(revlog, revs, deltainfo=None): - """Get the byte span of a segment of revisions - - revs is a sorted array of revision numbers - - >>> revlog = _testrevlog([ - ... 5, #0 - ... 
10, #1 - ... 12, #2 - ... 12, #3 (empty) - ... 17, #4 - ... ]) - - >>> _segmentspan(revlog, [0, 1, 2, 3, 4]) - 17 - >>> _segmentspan(revlog, [0, 4]) - 17 - >>> _segmentspan(revlog, [3, 4]) - 5 - >>> _segmentspan(revlog, [1, 2, 3,]) - 7 - >>> _segmentspan(revlog, [1, 3]) - 7 - """ - if not revs: - return 0 - if deltainfo is not None and len(revlog) <= revs[-1]: - if len(revs) == 1: - return deltainfo.deltalen - offset = revlog.end(len(revlog) - 1) - end = deltainfo.deltalen + offset - else: - end = revlog.end(revs[-1]) - return end - revlog.start(revs[0]) - -def _slicechunk(revlog, revs, deltainfo=None, targetsize=None): +def slicechunk(revlog, revs, deltainfo=None, targetsize=None): """slice revs to reduce the amount of unrelated data to be read from disk. ``revs`` is sliced into groups that should be read in one time. @@ -358,21 +97,21 @@ def _slicechunk(revlog, revs, deltainfo= ... 91, #15 (5) ... ]) - >>> list(_slicechunk(revlog, list(range(16)))) + >>> list(slicechunk(revlog, list(range(16)))) [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]] - >>> list(_slicechunk(revlog, [0, 15])) + >>> list(slicechunk(revlog, [0, 15])) [[0], [15]] - >>> list(_slicechunk(revlog, [0, 11, 15])) + >>> list(slicechunk(revlog, [0, 11, 15])) [[0], [11], [15]] - >>> list(_slicechunk(revlog, [0, 11, 13, 15])) + >>> list(slicechunk(revlog, [0, 11, 13, 15])) [[0], [11, 13, 15]] - >>> list(_slicechunk(revlog, [1, 2, 3, 5, 8, 10, 11, 14])) + >>> list(slicechunk(revlog, [1, 2, 3, 5, 8, 10, 11, 14])) [[1, 2], [5, 8, 10, 11], [14]] Slicing with a maximum chunk size - >>> list(_slicechunk(revlog, [0, 11, 13, 15], targetsize=15)) + >>> list(slicechunk(revlog, [0, 11, 13, 15], targetsize=15)) [[0], [11], [13], [15]] - >>> list(_slicechunk(revlog, [0, 11, 13, 15], targetsize=20)) + >>> list(slicechunk(revlog, [0, 11, 13, 15], targetsize=20)) [[0], [11], [13, 15]] """ if targetsize is not None: @@ -445,7 +184,7 @@ def _slicechunktosize(revlog, revs, targ [[3], [5]] """ assert targetsize is None or 0 <= targetsize - if targetsize is None or _segmentspan(revlog, revs) <= targetsize: + if targetsize is None or segmentspan(revlog, revs) <= targetsize: yield revs return @@ -531,7 +270,7 @@ def _slicechunktodensity(revlog, revs, d nextoffset = revlog.end(nextrev - 1) if deltainfo is None: - deltachainspan = _segmentspan(revlog, revs) + deltachainspan = segmentspan(revlog, revs) chainpayload = sum(length(r) for r in revs) else: deltachainspan = deltainfo.distance @@ -611,6 +350,94 @@ def _slicechunktodensity(revlog, revs, d if chunk: yield chunk +def _trimchunk(revlog, revs, startidx, endidx=None): + """returns revs[startidx:endidx] without empty trailing revs + + Doctest Setup + >>> revlog = _testrevlog([ + ... 5, #0 + ... 10, #1 + ... 12, #2 + ... 12, #3 (empty) + ... 17, #4 + ... 21, #5 + ... 21, #6 (empty) + ... 
])
+
+    Contiguous cases:
+    >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0)
+    [0, 1, 2, 3, 4, 5]
+    >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0, 5)
+    [0, 1, 2, 3, 4]
+    >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0, 4)
+    [0, 1, 2]
+    >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 2, 4)
+    [2]
+    >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 3)
+    [3, 4, 5]
+    >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 3, 5)
+    [3, 4]
+
+    Discontiguous cases:
+    >>> _trimchunk(revlog, [1, 3, 5, 6], 0)
+    [1, 3, 5]
+    >>> _trimchunk(revlog, [1, 3, 5, 6], 0, 2)
+    [1]
+    >>> _trimchunk(revlog, [1, 3, 5, 6], 1, 3)
+    [3, 5]
+    >>> _trimchunk(revlog, [1, 3, 5, 6], 1)
+    [3, 5]
+    """
+    length = revlog.length
+
+    if endidx is None:
+        endidx = len(revs)
+
+    # If we have a non-empty delta candidate, there is nothing to trim
+    if revs[endidx - 1] < len(revlog):
+        # Trim empty revs at the end, except the very first revision of a chain
+        while (endidx > 1
+               and endidx > startidx
+               and length(revs[endidx - 1]) == 0):
+            endidx -= 1
+
+    return revs[startidx:endidx]
+
+def segmentspan(revlog, revs, deltainfo=None):
+    """Get the byte span of a segment of revisions
+
+    revs is a sorted array of revision numbers
+
+    >>> revlog = _testrevlog([
+    ...  5,  #0
+    ...  10, #1
+    ...  12, #2
+    ...  12, #3 (empty)
+    ...  17, #4
+    ...  ])
+
+    >>> segmentspan(revlog, [0, 1, 2, 3, 4])
+    17
+    >>> segmentspan(revlog, [0, 4])
+    17
+    >>> segmentspan(revlog, [3, 4])
+    5
+    >>> segmentspan(revlog, [1, 2, 3,])
+    7
+    >>> segmentspan(revlog, [1, 3])
+    7
+    """
+    if not revs:
+        return 0
+    if deltainfo is not None and len(revlog) <= revs[-1]:
+        if len(revs) == 1:
+            return deltainfo.deltalen
+        offset = revlog.end(len(revlog) - 1)
+        end = deltainfo.deltalen + offset
+    else:
+        end = revlog.end(revs[-1])
+    return end - revlog.start(revs[0])
+
 @attr.s(slots=True, frozen=True)
 class _deltainfo(object):
     distance = attr.ib()
@@ -622,7 +449,98 @@ class _deltainfo(object):
     compresseddeltalen = attr.ib()
     snapshotdepth = attr.ib()
 
-class _deltacomputer(object):
+def isgooddeltainfo(revlog, deltainfo, revinfo):
+    """Returns True if the given delta is good. Good means that it is within
+    the disk span, disk size, and chain length bounds that we know to be
+    performant."""
+    if deltainfo is None:
+        return False
+
+    # - 'deltainfo.distance' is the distance from the base revision --
+    #   bounding it limits the amount of I/O we need to do.
+    # - 'deltainfo.compresseddeltalen' is the sum of the total size of
+    #   deltas we need to apply -- bounding it limits the amount of CPU
+    #   we consume.
+
+    if revlog._sparserevlog:
+        # As sparse-read will be used, we can consider that the distance,
+        # instead of being the span of the whole chunk,
+        # is the span of the largest read chunk
+        base = deltainfo.base
+
+        if base != nullrev:
+            deltachain = revlog._deltachain(base)[0]
+        else:
+            deltachain = []
+
+        # search for the first non-snapshot revision
+        for idx, r in enumerate(deltachain):
+            if not revlog.issnapshot(r):
+                break
+        deltachain = deltachain[idx:]
+        chunks = slicechunk(revlog, deltachain, deltainfo)
+        all_span = [segmentspan(revlog, revs, deltainfo)
+                    for revs in chunks]
+        distance = max(all_span)
+    else:
+        distance = deltainfo.distance
+
+    textlen = revinfo.textlen
+    defaultmax = textlen * 4
+    maxdist = revlog._maxdeltachainspan
+    if not maxdist:
+        maxdist = distance # ensure the conditional passes
+    maxdist = max(maxdist, defaultmax)
+    if revlog._sparserevlog and maxdist < revlog._srmingapsize:
+        # In multiple places, we are ignoring irrelevant data ranges below a
+        # certain size. We also apply this tradeoff here and relax the span
+        # constraint for small enough content.
+        maxdist = revlog._srmingapsize
+
+    # Bad delta from read span:
+    #
+    #   If the span of data read is larger than the maximum allowed.
+    if maxdist < distance:
+        return False
+
+    # Bad delta from new delta size:
+    #
+    #   If the delta size is larger than the target text, storing the
+    #   delta will be inefficient.
+    if textlen < deltainfo.deltalen:
+        return False
+
+    # Bad delta from cumulated payload size:
+    #
+    #   If the sum of the deltas gets larger than K * target text length.
+    if textlen * LIMIT_DELTA2TEXT < deltainfo.compresseddeltalen:
+        return False
+
+    # Bad delta from chain length:
+    #
+    #   If the number of deltas in the chain gets too high.
+    if (revlog._maxchainlen
+            and revlog._maxchainlen < deltainfo.chainlen):
+        return False
+
+    # bad delta from intermediate snapshot size limit
+    #
+    #   If an intermediate snapshot size is higher than the limit. The
+    #   limit exists to prevent endless chains of intermediate deltas from
+    #   being created.
+    if (deltainfo.snapshotdepth is not None and
+            (textlen >> deltainfo.snapshotdepth) < deltainfo.deltalen):
+        return False
+
+    # bad delta if the new intermediate snapshot is larger than the previous
+    # snapshot
+    if (deltainfo.snapshotdepth
+            and revlog.length(deltainfo.base) < deltainfo.deltalen):
+        return False
+
+    return True
+
+class deltacomputer(object):
     def __init__(self, revlog):
         self.revlog = revlog
 
@@ -807,2374 +725,10 @@ class _deltacomputer(object):
             if revlog.flags(candidaterev) & REVIDX_RAWTEXT_CHANGING_FLAGS:
                 continue
             candidatedelta = self._builddeltainfo(revinfo, candidaterev, fh)
-            if revlog._isgooddeltainfo(candidatedelta, revinfo):
+            if isgooddeltainfo(self.revlog, candidatedelta, revinfo):
                 nominateddeltas.append(candidatedelta)
         if nominateddeltas:
             deltainfo = min(nominateddeltas, key=lambda x: x.deltalen)
             break
 
     return deltainfo
-
-@attr.s(slots=True, frozen=True)
-class _revisioninfo(object):
-    """Information about a revision that allows building its fulltext
-    node:       expected hash of the revision
-    p1, p2:     parent revs of the revision
-    btext:      built text cache consisting of a one-element list
-    cachedelta: (baserev, uncompressed_delta) or None
-    flags:      flags associated to the revision storage
-
-    One of btext[0] or cachedelta must be set.
- """ - node = attr.ib() - p1 = attr.ib() - p2 = attr.ib() - btext = attr.ib() - textlen = attr.ib() - cachedelta = attr.ib() - flags = attr.ib() - -@interfaceutil.implementer(repository.irevisiondelta) -@attr.s(slots=True, frozen=True) -class revlogrevisiondelta(object): - node = attr.ib() - p1node = attr.ib() - p2node = attr.ib() - basenode = attr.ib() - linknode = attr.ib() - flags = attr.ib() - baserevisionsize = attr.ib() - revision = attr.ib() - delta = attr.ib() - -# index v0: -# 4 bytes: offset -# 4 bytes: compressed length -# 4 bytes: base rev -# 4 bytes: link rev -# 20 bytes: parent 1 nodeid -# 20 bytes: parent 2 nodeid -# 20 bytes: nodeid -indexformatv0 = struct.Struct(">4l20s20s20s") -indexformatv0_pack = indexformatv0.pack -indexformatv0_unpack = indexformatv0.unpack - -class revlogoldindex(list): - def __getitem__(self, i): - if i == -1: - return (0, 0, 0, -1, -1, -1, -1, nullid) - return list.__getitem__(self, i) - -class revlogoldio(object): - def __init__(self): - self.size = indexformatv0.size - - def parseindex(self, data, inline): - s = self.size - index = [] - nodemap = {nullid: nullrev} - n = off = 0 - l = len(data) - while off + s <= l: - cur = data[off:off + s] - off += s - e = indexformatv0_unpack(cur) - # transform to revlogv1 format - e2 = (offset_type(e[0], 0), e[1], -1, e[2], e[3], - nodemap.get(e[4], nullrev), nodemap.get(e[5], nullrev), e[6]) - index.append(e2) - nodemap[e[6]] = n - n += 1 - - return revlogoldindex(index), nodemap, None - - def packentry(self, entry, node, version, rev): - if gettype(entry[0]): - raise RevlogError(_('index entry flags need revlog version 1')) - e2 = (getoffset(entry[0]), entry[1], entry[3], entry[4], - node(entry[5]), node(entry[6]), entry[7]) - return indexformatv0_pack(*e2) - -# index ng: -# 6 bytes: offset -# 2 bytes: flags -# 4 bytes: compressed length -# 4 bytes: uncompressed length -# 4 bytes: base rev -# 4 bytes: link rev -# 4 bytes: parent 1 rev -# 4 bytes: parent 2 rev -# 32 bytes: nodeid -indexformatng = struct.Struct(">Qiiiiii20s12x") -indexformatng_pack = indexformatng.pack -versionformat = struct.Struct(">I") -versionformat_pack = versionformat.pack -versionformat_unpack = versionformat.unpack - -# corresponds to uncompressed length of indexformatng (2 gigs, 4-byte -# signed integer) -_maxentrysize = 0x7fffffff - -class revlogio(object): - def __init__(self): - self.size = indexformatng.size - - def parseindex(self, data, inline): - # call the C implementation to parse the index data - index, cache = parsers.parse_index2(data, inline) - return index, getattr(index, 'nodemap', None), cache - - def packentry(self, entry, node, version, rev): - p = indexformatng_pack(*entry) - if rev == 0: - p = versionformat_pack(version) + p[4:] - return p - -class revlog(object): - """ - the underlying revision storage object - - A revlog consists of two parts, an index and the revision data. - - The index is a file with a fixed record size containing - information on each revision, including its nodeid (hash), the - nodeids of its parents, the position and offset of its data within - the data file, and the revision it's based on. Finally, each entry - contains a linkrev entry that can serve as a pointer to external - data. - - The revision data itself is a linear collection of data chunks. - Each chunk represents a revision and is usually represented as a - delta against the previous chunk. To bound lookup time, runs of - deltas are limited to about 2 times the length of the original - version data. 
This makes retrieval of a version proportional to - its size, or O(1) relative to the number of revisions. - - Both pieces of the revlog are written to in an append-only - fashion, which means we never need to rewrite a file to insert or - remove data, and can use some simple techniques to avoid the need - for locking while reading. - - If checkambig, indexfile is opened with checkambig=True at - writing, to avoid file stat ambiguity. - - If mmaplargeindex is True, and an mmapindexthreshold is set, the - index will be mmapped rather than read if it is larger than the - configured threshold. - - If censorable is True, the revlog can have censored revisions. - """ - def __init__(self, opener, indexfile, datafile=None, checkambig=False, - mmaplargeindex=False, censorable=False): - """ - create a revlog object - - opener is a function that abstracts the file opening operation - and can be used to implement COW semantics or the like. - """ - self.indexfile = indexfile - self.datafile = datafile or (indexfile[:-2] + ".d") - self.opener = opener - # When True, indexfile is opened with checkambig=True at writing, to - # avoid file stat ambiguity. - self._checkambig = checkambig - self._censorable = censorable - # 3-tuple of (node, rev, text) for a raw revision. - self._cache = None - # Maps rev to chain base rev. - self._chainbasecache = util.lrucachedict(100) - # 2-tuple of (offset, data) of raw data from the revlog at an offset. - self._chunkcache = (0, '') - # How much data to read and cache into the raw revlog data cache. - self._chunkcachesize = 65536 - self._maxchainlen = None - self._deltabothparents = True - self.index = [] - # Mapping of partial identifiers to full nodes. - self._pcache = {} - # Mapping of revision integer to full node. - self._nodecache = {nullid: nullrev} - self._nodepos = None - self._compengine = 'zlib' - self._maxdeltachainspan = -1 - self._withsparseread = False - self._sparserevlog = False - self._srdensitythreshold = 0.50 - self._srmingapsize = 262144 - - mmapindexthreshold = None - v = REVLOG_DEFAULT_VERSION - opts = getattr(opener, 'options', None) - if opts is not None: - if 'revlogv2' in opts: - # version 2 revlogs always use generaldelta. 
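
The version word read from the first index entry packs the on-disk format into its low 16 bits and feature flags into the high bits, which is what the parsing below extracts. A small illustration (not part of the patch), using the values defined in revlogutils/constants.py:

REVLOGV1 = 1                  # low 16 bits: format version
FLAG_INLINE_DATA = 1 << 16    # high bits: feature flags
FLAG_GENERALDELTA = 1 << 17

v = REVLOGV1 | FLAG_GENERALDELTA | FLAG_INLINE_DATA
assert v & 0xFFFF == REVLOGV1              # fmt, validated on open
assert bool(v & FLAG_INLINE_DATA)          # becomes revlog._inline
assert bool(v & FLAG_GENERALDELTA)         # becomes revlog._generaldelta
assert v & ~0xFFFF == FLAG_INLINE_DATA | FLAG_GENERALDELTA
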
- v = REVLOGV2 | FLAG_GENERALDELTA | FLAG_INLINE_DATA - elif 'revlogv1' in opts: - if 'generaldelta' in opts: - v |= FLAG_GENERALDELTA - else: - v = 0 - if 'chunkcachesize' in opts: - self._chunkcachesize = opts['chunkcachesize'] - if 'maxchainlen' in opts: - self._maxchainlen = opts['maxchainlen'] - if 'deltabothparents' in opts: - self._deltabothparents = opts['deltabothparents'] - self._lazydeltabase = bool(opts.get('lazydeltabase', False)) - if 'compengine' in opts: - self._compengine = opts['compengine'] - if 'maxdeltachainspan' in opts: - self._maxdeltachainspan = opts['maxdeltachainspan'] - if mmaplargeindex and 'mmapindexthreshold' in opts: - mmapindexthreshold = opts['mmapindexthreshold'] - self._sparserevlog = bool(opts.get('sparse-revlog', False)) - withsparseread = bool(opts.get('with-sparse-read', False)) - # sparse-revlog forces sparse-read - self._withsparseread = self._sparserevlog or withsparseread - if 'sparse-read-density-threshold' in opts: - self._srdensitythreshold = opts['sparse-read-density-threshold'] - if 'sparse-read-min-gap-size' in opts: - self._srmingapsize = opts['sparse-read-min-gap-size'] - - if self._chunkcachesize <= 0: - raise RevlogError(_('revlog chunk cache size %r is not greater ' - 'than 0') % self._chunkcachesize) - elif self._chunkcachesize & (self._chunkcachesize - 1): - raise RevlogError(_('revlog chunk cache size %r is not a power ' - 'of 2') % self._chunkcachesize) - - indexdata = '' - self._initempty = True - try: - with self._indexfp() as f: - if (mmapindexthreshold is not None and - self.opener.fstat(f).st_size >= mmapindexthreshold): - indexdata = util.buffer(util.mmapread(f)) - else: - indexdata = f.read() - if len(indexdata) > 0: - v = versionformat_unpack(indexdata[:4])[0] - self._initempty = False - except IOError as inst: - if inst.errno != errno.ENOENT: - raise - - self.version = v - self._inline = v & FLAG_INLINE_DATA - self._generaldelta = v & FLAG_GENERALDELTA - flags = v & ~0xFFFF - fmt = v & 0xFFFF - if fmt == REVLOGV0: - if flags: - raise RevlogError(_('unknown flags (%#04x) in version %d ' - 'revlog %s') % - (flags >> 16, fmt, self.indexfile)) - elif fmt == REVLOGV1: - if flags & ~REVLOGV1_FLAGS: - raise RevlogError(_('unknown flags (%#04x) in version %d ' - 'revlog %s') % - (flags >> 16, fmt, self.indexfile)) - elif fmt == REVLOGV2: - if flags & ~REVLOGV2_FLAGS: - raise RevlogError(_('unknown flags (%#04x) in version %d ' - 'revlog %s') % - (flags >> 16, fmt, self.indexfile)) - else: - raise RevlogError(_('unknown version (%d) in revlog %s') % - (fmt, self.indexfile)) - - self._storedeltachains = True - - self._io = revlogio() - if self.version == REVLOGV0: - self._io = revlogoldio() - try: - d = self._io.parseindex(indexdata, self._inline) - except (ValueError, IndexError): - raise RevlogError(_("index %s is corrupted") % (self.indexfile)) - self.index, nodemap, self._chunkcache = d - if nodemap is not None: - self.nodemap = self._nodecache = nodemap - if not self._chunkcache: - self._chunkclear() - # revnum -> (chain-length, sum-delta-length) - self._chaininfocache = {} - # revlog header -> revlog compressor - self._decompressors = {} - - @util.propertycache - def _compressor(self): - return util.compengines[self._compengine].revlogcompressor() - - def _indexfp(self, mode='r'): - """file object for the revlog's index file""" - args = {r'mode': mode} - if mode != 'r': - args[r'checkambig'] = self._checkambig - if mode == 'w': - args[r'atomictemp'] = True - return self.opener(self.indexfile, **args) - - def _datafp(self, 
mode='r'): - """file object for the revlog's data file""" - return self.opener(self.datafile, mode=mode) - - @contextlib.contextmanager - def _datareadfp(self, existingfp=None): - """file object suitable to read data""" - if existingfp is not None: - yield existingfp - else: - if self._inline: - func = self._indexfp - else: - func = self._datafp - with func() as fp: - yield fp - - def tip(self): - return self.node(len(self.index) - 1) - def __contains__(self, rev): - return 0 <= rev < len(self) - def __len__(self): - return len(self.index) - def __iter__(self): - return iter(pycompat.xrange(len(self))) - def revs(self, start=0, stop=None): - """iterate over all rev in this revlog (from start to stop)""" - step = 1 - length = len(self) - if stop is not None: - if start > stop: - step = -1 - stop += step - if stop > length: - stop = length - else: - stop = length - return pycompat.xrange(start, stop, step) - - @util.propertycache - def nodemap(self): - if self.index: - # populate mapping down to the initial node - node0 = self.index[0][7] # get around changelog filtering - self.rev(node0) - return self._nodecache - - def hasnode(self, node): - try: - self.rev(node) - return True - except KeyError: - return False - - def candelta(self, baserev, rev): - """whether two revisions (baserev, rev) can be delta-ed or not""" - # Disable delta if either rev requires a content-changing flag - # processor (ex. LFS). This is because such flag processor can alter - # the rawtext content that the delta will be based on, and two clients - # could have a same revlog node with different flags (i.e. different - # rawtext contents) and the delta could be incompatible. - if ((self.flags(baserev) & REVIDX_RAWTEXT_CHANGING_FLAGS) - or (self.flags(rev) & REVIDX_RAWTEXT_CHANGING_FLAGS)): - return False - return True - - def clearcaches(self): - self._cache = None - self._chainbasecache.clear() - self._chunkcache = (0, '') - self._pcache = {} - - try: - self._nodecache.clearcaches() - except AttributeError: - self._nodecache = {nullid: nullrev} - self._nodepos = None - - def rev(self, node): - try: - return self._nodecache[node] - except TypeError: - raise - except RevlogError: - # parsers.c radix tree lookup failed - if node == wdirid or node in wdirfilenodeids: - raise error.WdirUnsupported - raise LookupError(node, self.indexfile, _('no node')) - except KeyError: - # pure python cache lookup failed - n = self._nodecache - i = self.index - p = self._nodepos - if p is None: - p = len(i) - 1 - else: - assert p < len(i) - for r in pycompat.xrange(p, -1, -1): - v = i[r][7] - n[v] = r - if v == node: - self._nodepos = r - 1 - return r - if node == wdirid or node in wdirfilenodeids: - raise error.WdirUnsupported - raise LookupError(node, self.indexfile, _('no node')) - - # Accessors for index entries. - - # First tuple entry is 8 bytes. First 6 bytes are offset. Last 2 bytes - # are flags. - def start(self, rev): - return int(self.index[rev][0] >> 16) - - def flags(self, rev): - return self.index[rev][0] & 0xFFFF - - def length(self, rev): - return self.index[rev][1] - - def rawsize(self, rev): - """return the length of the uncompressed text for a given revision""" - l = self.index[rev][2] - if l >= 0: - return l - - t = self.revision(rev, raw=True) - return len(t) - - def size(self, rev): - """length of non-raw text (processed by a "read" flag processor)""" - # fast path: if no "read" flag processor could change the content, - # size is rawsize. note: ELLIPSIS is known to not change the content. 
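
The `start()` and `flags()` accessors above rely on the packing performed by `offset_type()`: the first field of each index entry stores the byte offset in its high bits and the 16 flag bits in its low bits. A minimal sketch (not part of the patch):

def sketch_offset_type(offset, flags):
    # mirror of offset_type(); getoffset()/gettype() invert it
    return (int(offset) << 16) | flags

packed = sketch_offset_type(1024, 0)
assert packed >> 16 == 1024      # what revlog.start(rev) extracts
assert packed & 0xFFFF == 0      # what revlog.flags(rev) extracts
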
- flags = self.flags(rev) - if flags & (REVIDX_KNOWN_FLAGS ^ REVIDX_ELLIPSIS) == 0: - return self.rawsize(rev) - - return len(self.revision(rev, raw=False)) - - def chainbase(self, rev): - base = self._chainbasecache.get(rev) - if base is not None: - return base - - index = self.index - iterrev = rev - base = index[iterrev][3] - while base != iterrev: - iterrev = base - base = index[iterrev][3] - - self._chainbasecache[rev] = base - return base - - def linkrev(self, rev): - return self.index[rev][4] - - def parentrevs(self, rev): - try: - entry = self.index[rev] - except IndexError: - if rev == wdirrev: - raise error.WdirUnsupported - raise - - return entry[5], entry[6] - - def node(self, rev): - try: - return self.index[rev][7] - except IndexError: - if rev == wdirrev: - raise error.WdirUnsupported - raise - - # Derived from index values. - - def end(self, rev): - return self.start(rev) + self.length(rev) - - def parents(self, node): - i = self.index - d = i[self.rev(node)] - return i[d[5]][7], i[d[6]][7] # map revisions to nodes inline - - def chainlen(self, rev): - return self._chaininfo(rev)[0] - - def _chaininfo(self, rev): - chaininfocache = self._chaininfocache - if rev in chaininfocache: - return chaininfocache[rev] - index = self.index - generaldelta = self._generaldelta - iterrev = rev - e = index[iterrev] - clen = 0 - compresseddeltalen = 0 - while iterrev != e[3]: - clen += 1 - compresseddeltalen += e[1] - if generaldelta: - iterrev = e[3] - else: - iterrev -= 1 - if iterrev in chaininfocache: - t = chaininfocache[iterrev] - clen += t[0] - compresseddeltalen += t[1] - break - e = index[iterrev] - else: - # Add text length of base since decompressing that also takes - # work. For cache hits the length is already included. - compresseddeltalen += e[1] - r = (clen, compresseddeltalen) - chaininfocache[rev] = r - return r - - def _deltachain(self, rev, stoprev=None): - """Obtain the delta chain for a revision. - - ``stoprev`` specifies a revision to stop at. If not specified, we - stop at the base of the chain. - - Returns a 2-tuple of (chain, stopped) where ``chain`` is a list of - revs in ascending order and ``stopped`` is a bool indicating whether - ``stoprev`` was hit. - """ - # Try C implementation. - try: - return self.index.deltachain(rev, stoprev, self._generaldelta) - except AttributeError: - pass - - chain = [] - - # Alias to prevent attribute lookup in tight loop. - index = self.index - generaldelta = self._generaldelta - - iterrev = rev - e = index[iterrev] - while iterrev != e[3] and iterrev != stoprev: - chain.append(iterrev) - if generaldelta: - iterrev = e[3] - else: - iterrev -= 1 - e = index[iterrev] - - if iterrev == stoprev: - stopped = True - else: - chain.append(iterrev) - stopped = False - - chain.reverse() - return chain, stopped - - def ancestors(self, revs, stoprev=0, inclusive=False): - """Generate the ancestors of 'revs' in reverse topological order. - Does not generate revs lower than stoprev. - - See the documentation for ancestor.lazyancestors for more details.""" - - return ancestor.lazyancestors(self.parentrevs, revs, stoprev=stoprev, - inclusive=inclusive) - - def descendants(self, revs): - """Generate the descendants of 'revs' in revision order. - - Yield a sequence of revision numbers starting with a child of - some rev in revs, i.e., each revision is *not* considered a - descendant of itself. 
Results are ordered by revision number (a - topological sort).""" - first = min(revs) - if first == nullrev: - for i in self: - yield i - return - - seen = set(revs) - for i in self.revs(start=first + 1): - for x in self.parentrevs(i): - if x != nullrev and x in seen: - seen.add(i) - yield i - break - - def findcommonmissing(self, common=None, heads=None): - """Return a tuple of the ancestors of common and the ancestors of heads - that are not ancestors of common. In revset terminology, we return the - tuple: - - ::common, (::heads) - (::common) - - The list is sorted by revision number, meaning it is - topologically sorted. - - 'heads' and 'common' are both lists of node IDs. If heads is - not supplied, uses all of the revlog's heads. If common is not - supplied, uses nullid.""" - if common is None: - common = [nullid] - if heads is None: - heads = self.heads() - - common = [self.rev(n) for n in common] - heads = [self.rev(n) for n in heads] - - # we want the ancestors, but inclusive - class lazyset(object): - def __init__(self, lazyvalues): - self.addedvalues = set() - self.lazyvalues = lazyvalues - - def __contains__(self, value): - return value in self.addedvalues or value in self.lazyvalues - - def __iter__(self): - added = self.addedvalues - for r in added: - yield r - for r in self.lazyvalues: - if not r in added: - yield r - - def add(self, value): - self.addedvalues.add(value) - - def update(self, values): - self.addedvalues.update(values) - - has = lazyset(self.ancestors(common)) - has.add(nullrev) - has.update(common) - - # take all ancestors from heads that aren't in has - missing = set() - visit = collections.deque(r for r in heads if r not in has) - while visit: - r = visit.popleft() - if r in missing: - continue - else: - missing.add(r) - for p in self.parentrevs(r): - if p not in has: - visit.append(p) - missing = list(missing) - missing.sort() - return has, [self.node(miss) for miss in missing] - - def incrementalmissingrevs(self, common=None): - """Return an object that can be used to incrementally compute the - revision numbers of the ancestors of arbitrary sets that are not - ancestors of common. This is an ancestor.incrementalmissingancestors - object. - - 'common' is a list of revision numbers. If common is not supplied, uses - nullrev. - """ - if common is None: - common = [nullrev] - - return ancestor.incrementalmissingancestors(self.parentrevs, common) - - def findmissingrevs(self, common=None, heads=None): - """Return the revision numbers of the ancestors of heads that - are not ancestors of common. - - More specifically, return a list of revision numbers corresponding to - nodes N such that every N satisfies the following constraints: - - 1. N is an ancestor of some node in 'heads' - 2. N is not an ancestor of any node in 'common' - - The list is sorted by revision number, meaning it is - topologically sorted. - - 'heads' and 'common' are both lists of revision numbers. If heads is - not supplied, uses all of the revlog's heads. If common is not - supplied, uses nullid.""" - if common is None: - common = [nullrev] - if heads is None: - heads = self.headrevs() - - inc = self.incrementalmissingrevs(common=common) - return inc.missingancestors(heads) - - def findmissing(self, common=None, heads=None): - """Return the ancestors of heads that are not ancestors of common. - - More specifically, return a list of nodes N such that every N - satisfies the following constraints: - - 1. N is an ancestor of some node in 'heads' - 2. 
N is not an ancestor of any node in 'common' - - The list is sorted by revision number, meaning it is - topologically sorted. - - 'heads' and 'common' are both lists of node IDs. If heads is - not supplied, uses all of the revlog's heads. If common is not - supplied, uses nullid.""" - if common is None: - common = [nullid] - if heads is None: - heads = self.heads() - - common = [self.rev(n) for n in common] - heads = [self.rev(n) for n in heads] - - inc = self.incrementalmissingrevs(common=common) - return [self.node(r) for r in inc.missingancestors(heads)] - - def nodesbetween(self, roots=None, heads=None): - """Return a topological path from 'roots' to 'heads'. - - Return a tuple (nodes, outroots, outheads) where 'nodes' is a - topologically sorted list of all nodes N that satisfy both of - these constraints: - - 1. N is a descendant of some node in 'roots' - 2. N is an ancestor of some node in 'heads' - - Every node is considered to be both a descendant and an ancestor - of itself, so every reachable node in 'roots' and 'heads' will be - included in 'nodes'. - - 'outroots' is the list of reachable nodes in 'roots', i.e., the - subset of 'roots' that is returned in 'nodes'. Likewise, - 'outheads' is the subset of 'heads' that is also in 'nodes'. - - 'roots' and 'heads' are both lists of node IDs. If 'roots' is - unspecified, uses nullid as the only root. If 'heads' is - unspecified, uses list of all of the revlog's heads.""" - nonodes = ([], [], []) - if roots is not None: - roots = list(roots) - if not roots: - return nonodes - lowestrev = min([self.rev(n) for n in roots]) - else: - roots = [nullid] # Everybody's a descendant of nullid - lowestrev = nullrev - if (lowestrev == nullrev) and (heads is None): - # We want _all_ the nodes! - return ([self.node(r) for r in self], [nullid], list(self.heads())) - if heads is None: - # All nodes are ancestors, so the latest ancestor is the last - # node. - highestrev = len(self) - 1 - # Set ancestors to None to signal that every node is an ancestor. - ancestors = None - # Set heads to an empty dictionary for later discovery of heads - heads = {} - else: - heads = list(heads) - if not heads: - return nonodes - ancestors = set() - # Turn heads into a dictionary so we can remove 'fake' heads. - # Also, later we will be using it to filter out the heads we can't - # find from roots. - heads = dict.fromkeys(heads, False) - # Start at the top and keep marking parents until we're done. - nodestotag = set(heads) - # Remember where the top was so we can use it as a limit later. - highestrev = max([self.rev(n) for n in nodestotag]) - while nodestotag: - # grab a node to tag - n = nodestotag.pop() - # Never tag nullid - if n == nullid: - continue - # A node's revision number represents its place in a - # topologically sorted list of nodes. - r = self.rev(n) - if r >= lowestrev: - if n not in ancestors: - # If we are possibly a descendant of one of the roots - # and we haven't already been marked as an ancestor - ancestors.add(n) # Mark as ancestor - # Add non-nullid parents to list of nodes to tag. - nodestotag.update([p for p in self.parents(n) if - p != nullid]) - elif n in heads: # We've seen it before, is it a fake head? - # So it is, real heads should not be the ancestors of - # any other heads. - heads.pop(n) - if not ancestors: - return nonodes - # Now that we have our set of ancestors, we want to remove any - # roots that are not ancestors. - - # If one of the roots was nullid, everything is included anyway. 
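The tagging loop above is, at heart, a reverse walk over parent pointers. A minimal standalone sketch of the same idea, with a hypothetical `parents` mapping standing in for the revlog index (all names here are illustrative, not revlog API):

NULL = b'\0' * 20   # stand-in for revlog's nullid sentinel

def mark_ancestors(parents, heads):
    # `parents` is a hypothetical dict: node -> tuple of parent nodes.
    seen = set()
    todo = set(heads)
    while todo:
        n = todo.pop()
        if n == NULL or n in seen:
            continue
        seen.add(n)
        todo.update(parents[n])
    return seen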
-        if lowestrev > nullrev:
-            # But, since we weren't, let's recompute the lowest rev to not
-            # include roots that aren't ancestors.
-
-            # Filter out roots that aren't ancestors of heads
-            roots = [root for root in roots if root in ancestors]
-            # Recompute the lowest revision
-            if roots:
-                lowestrev = min([self.rev(root) for root in roots])
-            else:
-                # No more roots?  Return empty list
-                return nonodes
-        else:
-            # We are descending from nullid, and don't need to care about
-            # any other roots.
-            lowestrev = nullrev
-            roots = [nullid]
-        # Transform our roots list into a set.
-        descendants = set(roots)
-        # Also, keep the original roots so we can filter out roots that aren't
-        # 'real' roots (i.e. are descended from other roots).
-        roots = descendants.copy()
-        # Our topologically sorted list of output nodes.
-        orderedout = []
-        # Don't start at nullid since we don't want nullid in our output list,
-        # and if nullid shows up in descendants, empty parents will look like
-        # they're descendants.
-        for r in self.revs(start=max(lowestrev, 0), stop=highestrev + 1):
-            n = self.node(r)
-            isdescendant = False
-            if lowestrev == nullrev:  # Everybody is a descendant of nullid
-                isdescendant = True
-            elif n in descendants:
-                # n is already a descendant
-                isdescendant = True
-                # This check only needs to be done here because all the roots
-                # will start being marked as descendants before the loop.
-                if n in roots:
-                    # If n was a root, check if it's a 'real' root.
-                    p = tuple(self.parents(n))
-                    # If any of its parents are descendants, it's not a root.
-                    if (p[0] in descendants) or (p[1] in descendants):
-                        roots.remove(n)
-            else:
-                p = tuple(self.parents(n))
-                # A node is a descendant if either of its parents are
-                # descendants.  (We seeded the descendants list with the roots
-                # up there, remember?)
-                if (p[0] in descendants) or (p[1] in descendants):
-                    descendants.add(n)
-                    isdescendant = True
-            if isdescendant and ((ancestors is None) or (n in ancestors)):
-                # Only include nodes that are both descendants and ancestors.
-                orderedout.append(n)
-                if (ancestors is not None) and (n in heads):
-                    # We're trying to figure out which heads are reachable
-                    # from roots.
-                    # Mark this head as having been reached
-                    heads[n] = True
-                elif ancestors is None:
-                    # Otherwise, we're trying to discover the heads.
-                    # Assume this is a head because if it isn't, the next step
-                    # will eventually remove it.
-                    heads[n] = True
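The discovery branch above leans on a simple invariant: assume every node is a head until it turns up as somebody's parent. A hedged sketch of that invariant over a hypothetical list of (rev, parentrevs) pairs in topological order, before the code resumes:

def find_heads(entries):
    ishead = {}
    for rev, parentrevs in entries:
        ishead.setdefault(rev, True)
        for p in parentrevs:
            ishead[p] = False  # a node with a child is never a head
    return [r for r, flag in sorted(ishead.items()) if flag]

# Example: 0 <- 1 <- 2 and 0 <- 3 leaves exactly 2 and 3 as heads.
assert find_heads([(0, ()), (1, (0,)), (2, (1,)), (3, (0,))]) == [2, 3]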
-                    # But, obviously its parents aren't.
-                    for p in self.parents(n):
-                        heads.pop(p, None)
-        heads = [head for head, flag in heads.iteritems() if flag]
-        roots = list(roots)
-        assert orderedout
-        assert roots
-        assert heads
-        return (orderedout, roots, heads)
-
-    def headrevs(self):
-        try:
-            return self.index.headrevs()
-        except AttributeError:
-            return self._headrevs()
-
-    def computephases(self, roots):
-        return self.index.computephasesmapsets(roots)
-
-    def _headrevs(self):
-        count = len(self)
-        if not count:
-            return [nullrev]
-        # we won't iterate over filtered revs, so nobody is a head at start
-        ishead = [0] * (count + 1)
-        index = self.index
-        for r in self:
-            ishead[r] = 1  # I may be a head
-            e = index[r]
-            ishead[e[5]] = ishead[e[6]] = 0  # my parents are not
-        return [r for r, val in enumerate(ishead) if val]
-
-    def heads(self, start=None, stop=None):
-        """return the list of all nodes that have no children
-
-        if start is specified, only heads that are descendants of
-        start will be returned
-        if stop is specified, it will consider all the revs from stop
-        as if they had no children
-        """
-        if start is None and stop is None:
-            if not len(self):
-                return [nullid]
-            return [self.node(r) for r in self.headrevs()]
-
-        if start is None:
-            start = nullid
-        if stop is None:
-            stop = []
-        stoprevs = set([self.rev(n) for n in stop])
-        startrev = self.rev(start)
-        reachable = {startrev}
-        heads = {startrev}
-
-        parentrevs = self.parentrevs
-        for r in self.revs(start=startrev + 1):
-            for p in parentrevs(r):
-                if p in reachable:
-                    if r not in stoprevs:
-                        reachable.add(r)
-                        heads.add(r)
-                if p in heads and p not in stoprevs:
-                    heads.remove(p)
-
-        return [self.node(r) for r in heads]
-
-    def children(self, node):
-        """find the children of a given node"""
-        c = []
-        p = self.rev(node)
-        for r in self.revs(start=p + 1):
-            prevs = [pr for pr in self.parentrevs(r) if pr != nullrev]
-            if prevs:
-                for pr in prevs:
-                    if pr == p:
-                        c.append(self.node(r))
-            elif p == nullrev:
-                c.append(self.node(r))
-        return c
-
-    def commonancestorsheads(self, a, b):
-        """calculate all the heads of the common ancestors of nodes a and b"""
-        a, b = self.rev(a), self.rev(b)
-        ancs = self._commonancestorsheads(a, b)
-        return pycompat.maplist(self.node, ancs)
-
-    def _commonancestorsheads(self, *revs):
-        """calculate all the heads of the common ancestors of revs"""
-        try:
-            ancs = self.index.commonancestorsheads(*revs)
-        except (AttributeError, OverflowError):  # C implementation failed
-            ancs = ancestor.commonancestorsheads(self.parentrevs, *revs)
-        return ancs
-
-    def isancestor(self, a, b):
-        """return True if node a is an ancestor of node b
-
-        A revision is considered an ancestor of itself."""
-        a, b = self.rev(a), self.rev(b)
-        return self.isancestorrev(a, b)
-
-    def isancestorrev(self, a, b):
-        """return True if revision a is an ancestor of revision b
-
-        A revision is considered an ancestor of itself.
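A naive equivalent makes the semantics concrete. This is a sketch only, using a plain `parentrevs` callable and -1 for nullrev; the real method delegates to commonancestorsheads instead, for the reason given in the docstring:

def isancestor_naive(parentrevs, a, b):
    if a == -1:              # nullrev is an ancestor of every revision
        return True
    stack, seen = [b], set()
    while stack:
        r = stack.pop()
        if r == a:
            return True
        if r < a or r in seen:   # ancestors always have smaller rev numbers
            continue
        seen.add(r)
        stack.extend(p for p in parentrevs(r) if p != -1)
    return False

# 0 <- 1 <- 2 and 0 <- 3: rev 0 is an ancestor of 2, rev 1 is not one of 3.
pr = {0: (-1, -1), 1: (0, -1), 2: (1, -1), 3: (0, -1)}.__getitem__
assert isancestor_naive(pr, 0, 2) and not isancestor_naive(pr, 1, 3)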
- - The implementation of this is trivial but the use of - commonancestorsheads is not.""" - if a == nullrev: - return True - elif a == b: - return True - elif a > b: - return False - return a in self._commonancestorsheads(a, b) - - def ancestor(self, a, b): - """calculate the "best" common ancestor of nodes a and b""" - - a, b = self.rev(a), self.rev(b) - try: - ancs = self.index.ancestors(a, b) - except (AttributeError, OverflowError): - ancs = ancestor.ancestors(self.parentrevs, a, b) - if ancs: - # choose a consistent winner when there's a tie - return min(map(self.node, ancs)) - return nullid - - def _match(self, id): - if isinstance(id, int): - # rev - return self.node(id) - if len(id) == 20: - # possibly a binary node - # odds of a binary node being all hex in ASCII are 1 in 10**25 - try: - node = id - self.rev(node) # quick search the index - return node - except LookupError: - pass # may be partial hex id - try: - # str(rev) - rev = int(id) - if "%d" % rev != id: - raise ValueError - if rev < 0: - rev = len(self) + rev - if rev < 0 or rev >= len(self): - raise ValueError - return self.node(rev) - except (ValueError, OverflowError): - pass - if len(id) == 40: - try: - # a full hex nodeid? - node = bin(id) - self.rev(node) - return node - except (TypeError, LookupError): - pass - - def _partialmatch(self, id): - # we don't care wdirfilenodeids as they should be always full hash - maybewdir = wdirhex.startswith(id) - try: - partial = self.index.partialmatch(id) - if partial and self.hasnode(partial): - if maybewdir: - # single 'ff...' match in radix tree, ambiguous with wdir - raise RevlogError - return partial - if maybewdir: - # no 'ff...' match in radix tree, wdir identified - raise error.WdirUnsupported - return None - except RevlogError: - # parsers.c radix tree lookup gave multiple matches - # fast path: for unfiltered changelog, radix tree is accurate - if not getattr(self, 'filteredrevs', None): - raise AmbiguousPrefixLookupError(id, self.indexfile, - _('ambiguous identifier')) - # fall through to slow path that filters hidden revisions - except (AttributeError, ValueError): - # we are pure python, or key was too short to search radix tree - pass - - if id in self._pcache: - return self._pcache[id] - - if len(id) <= 40: - try: - # hex(node)[:...] - l = len(id) // 2 # grab an even number of digits - prefix = bin(id[:l * 2]) - nl = [e[7] for e in self.index if e[7].startswith(prefix)] - nl = [n for n in nl if hex(n).startswith(id) and - self.hasnode(n)] - if nullhex.startswith(id): - nl.append(nullid) - if len(nl) > 0: - if len(nl) == 1 and not maybewdir: - self._pcache[id] = nl[0] - return nl[0] - raise AmbiguousPrefixLookupError(id, self.indexfile, - _('ambiguous identifier')) - if maybewdir: - raise error.WdirUnsupported - return None - except TypeError: - pass - - def lookup(self, id): - """locate a node based on: - - revision number or str(revision number) - - nodeid or subset of hex nodeid - """ - n = self._match(id) - if n is not None: - return n - n = self._partialmatch(id) - if n: - return n - - raise LookupError(id, self.indexfile, _('no match found')) - - def shortest(self, node, minlength=1): - """Find the shortest unambiguous prefix that matches node.""" - def isvalid(prefix): - try: - node = self._partialmatch(prefix) - except error.RevlogError: - return False - except error.WdirUnsupported: - # single 'ff...' 
match - return True - if node is None: - raise LookupError(node, self.indexfile, _('no node')) - return True - - def maybewdir(prefix): - return all(c == 'f' for c in prefix) - - hexnode = hex(node) - - def disambiguate(hexnode, minlength): - """Disambiguate against wdirid.""" - for length in range(minlength, 41): - prefix = hexnode[:length] - if not maybewdir(prefix): - return prefix - - if not getattr(self, 'filteredrevs', None): - try: - length = max(self.index.shortest(node), minlength) - return disambiguate(hexnode, length) - except RevlogError: - if node != wdirid: - raise LookupError(node, self.indexfile, _('no node')) - except AttributeError: - # Fall through to pure code - pass - - if node == wdirid: - for length in range(minlength, 41): - prefix = hexnode[:length] - if isvalid(prefix): - return prefix - - for length in range(minlength, 41): - prefix = hexnode[:length] - if isvalid(prefix): - return disambiguate(hexnode, length) - - def cmp(self, node, text): - """compare text with a given file revision - - returns True if text is different than what is stored. - """ - p1, p2 = self.parents(node) - return hash(text, p1, p2) != node - - def _cachesegment(self, offset, data): - """Add a segment to the revlog cache. - - Accepts an absolute offset and the data that is at that location. - """ - o, d = self._chunkcache - # try to add to existing cache - if o + len(d) == offset and len(d) + len(data) < _chunksize: - self._chunkcache = o, d + data - else: - self._chunkcache = offset, data - - def _readsegment(self, offset, length, df=None): - """Load a segment of raw data from the revlog. - - Accepts an absolute offset, length to read, and an optional existing - file handle to read from. - - If an existing file handle is passed, it will be seeked and the - original seek position will NOT be restored. - - Returns a str or buffer of raw byte data. - """ - # Cache data both forward and backward around the requested - # data, in a fixed size window. This helps speed up operations - # involving reading the revlog backwards. - cachesize = self._chunkcachesize - realoffset = offset & ~(cachesize - 1) - reallength = (((offset + length + cachesize) & ~(cachesize - 1)) - - realoffset) - with self._datareadfp(df) as df: - df.seek(realoffset) - d = df.read(reallength) - self._cachesegment(realoffset, d) - if offset != realoffset or reallength != length: - return util.buffer(d, offset - realoffset, length) - return d - - def _getsegment(self, offset, length, df=None): - """Obtain a segment of raw data from the revlog. - - Accepts an absolute offset, length of bytes to obtain, and an - optional file handle to the already-opened revlog. If the file - handle is used, it's original seek position will not be preserved. - - Requests for data may be returned from a cache. - - Returns a str or a buffer instance of raw byte data. - """ - o, d = self._chunkcache - l = len(d) - - # is it in the cache? - cachestart = offset - o - cacheend = cachestart + length - if cachestart >= 0 and cacheend <= l: - if cachestart == 0 and cacheend == l: - return d # avoid a copy - return util.buffer(d, cachestart, cacheend - cachestart) - - return self._readsegment(offset, length, df=df) - - def _getsegmentforrevs(self, startrev, endrev, df=None): - """Obtain a segment of raw data corresponding to a range of revisions. - - Accepts the start and end revisions and an optional already-open - file handle to be used for reading. If the file handle is read, its - seek position will not be preserved. 
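The rounding arithmetic in _readsegment above is easier to see in isolation. A sketch of the window computation, assuming (as the code does) a power-of-two cache size; the 64 KiB figure is illustrative:

def cache_window(offset, length, cachesize=65536):
    # Widen [offset, offset + length) to cachesize-aligned boundaries so
    # that neighbouring reads, forward or backward, hit the cached segment.
    realoffset = offset & ~(cachesize - 1)
    reallength = (((offset + length + cachesize) & ~(cachesize - 1))
                  - realoffset)
    return realoffset, reallength

# A 10-byte read at offset 70000 is served by one aligned 64 KiB window.
assert cache_window(70000, 10) == (65536, 65536)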
- - Requests for data may be satisfied by a cache. - - Returns a 2-tuple of (offset, data) for the requested range of - revisions. Offset is the integer offset from the beginning of the - revlog and data is a str or buffer of the raw byte data. - - Callers will need to call ``self.start(rev)`` and ``self.length(rev)`` - to determine where each revision's data begins and ends. - """ - # Inlined self.start(startrev) & self.end(endrev) for perf reasons - # (functions are expensive). - index = self.index - istart = index[startrev] - start = int(istart[0] >> 16) - if startrev == endrev: - end = start + istart[1] - else: - iend = index[endrev] - end = int(iend[0] >> 16) + iend[1] - - if self._inline: - start += (startrev + 1) * self._io.size - end += (endrev + 1) * self._io.size - length = end - start - - return start, self._getsegment(start, length, df=df) - - def _chunk(self, rev, df=None): - """Obtain a single decompressed chunk for a revision. - - Accepts an integer revision and an optional already-open file handle - to be used for reading. If used, the seek position of the file will not - be preserved. - - Returns a str holding uncompressed data for the requested revision. - """ - return self.decompress(self._getsegmentforrevs(rev, rev, df=df)[1]) - - def _chunks(self, revs, df=None, targetsize=None): - """Obtain decompressed chunks for the specified revisions. - - Accepts an iterable of numeric revisions that are assumed to be in - ascending order. Also accepts an optional already-open file handle - to be used for reading. If used, the seek position of the file will - not be preserved. - - This function is similar to calling ``self._chunk()`` multiple times, - but is faster. - - Returns a list with decompressed data for each requested revision. - """ - if not revs: - return [] - start = self.start - length = self.length - inline = self._inline - iosize = self._io.size - buffer = util.buffer - - l = [] - ladd = l.append - - if not self._withsparseread: - slicedchunks = (revs,) - else: - slicedchunks = _slicechunk(self, revs, targetsize=targetsize) - - for revschunk in slicedchunks: - firstrev = revschunk[0] - # Skip trailing revisions with empty diff - for lastrev in revschunk[::-1]: - if length(lastrev) != 0: - break - - try: - offset, data = self._getsegmentforrevs(firstrev, lastrev, df=df) - except OverflowError: - # issue4215 - we can't cache a run of chunks greater than - # 2G on Windows - return [self._chunk(rev, df=df) for rev in revschunk] - - decomp = self.decompress - for rev in revschunk: - chunkstart = start(rev) - if inline: - chunkstart += (rev + 1) * iosize - chunklength = length(rev) - ladd(decomp(buffer(data, chunkstart - offset, chunklength))) - - return l - - def _chunkclear(self): - """Clear the raw chunk cache.""" - self._chunkcache = (0, '') - - def deltaparent(self, rev): - """return deltaparent of the given revision""" - base = self.index[rev][3] - if base == rev: - return nullrev - elif self._generaldelta: - return base - else: - return rev - 1 - - def issnapshot(self, rev): - """tells whether rev is a snapshot - """ - if rev == nullrev: - return True - deltap = self.deltaparent(rev) - if deltap == nullrev: - return True - p1, p2 = self.parentrevs(rev) - if deltap in (p1, p2): - return False - return self.issnapshot(deltap) - - def snapshotdepth(self, rev): - """number of snapshot in the chain before this one""" - if not self.issnapshot(rev): - raise ProgrammingError('revision %d not a snapshot') - return len(self._deltachain(rev)[0]) - 1 - - def revdiff(self, rev1, 
rev2): - """return or calculate a delta between two revisions - - The delta calculated is in binary form and is intended to be written to - revlog data directly. So this function needs raw revision data. - """ - if rev1 != nullrev and self.deltaparent(rev2) == rev1: - return bytes(self._chunk(rev2)) - - return mdiff.textdiff(self.revision(rev1, raw=True), - self.revision(rev2, raw=True)) - - def revision(self, nodeorrev, _df=None, raw=False): - """return an uncompressed revision of a given node or revision - number. - - _df - an existing file handle to read from. (internal-only) - raw - an optional argument specifying if the revision data is to be - treated as raw data when applying flag transforms. 'raw' should be set - to True when generating changegroups or in debug commands. - """ - if isinstance(nodeorrev, int): - rev = nodeorrev - node = self.node(rev) - else: - node = nodeorrev - rev = None - - cachedrev = None - flags = None - rawtext = None - if node == nullid: - return "" - if self._cache: - if self._cache[0] == node: - # _cache only stores rawtext - if raw: - return self._cache[2] - # duplicated, but good for perf - if rev is None: - rev = self.rev(node) - if flags is None: - flags = self.flags(rev) - # no extra flags set, no flag processor runs, text = rawtext - if flags == REVIDX_DEFAULT_FLAGS: - return self._cache[2] - # rawtext is reusable. need to run flag processor - rawtext = self._cache[2] - - cachedrev = self._cache[1] - - # look up what we need to read - if rawtext is None: - if rev is None: - rev = self.rev(node) - - chain, stopped = self._deltachain(rev, stoprev=cachedrev) - if stopped: - rawtext = self._cache[2] - - # drop cache to save memory - self._cache = None - - targetsize = None - rawsize = self.index[rev][2] - if 0 <= rawsize: - targetsize = 4 * rawsize - - bins = self._chunks(chain, df=_df, targetsize=targetsize) - if rawtext is None: - rawtext = bytes(bins[0]) - bins = bins[1:] - - rawtext = mdiff.patches(rawtext, bins) - self._cache = (node, rev, rawtext) - - if flags is None: - if rev is None: - rev = self.rev(node) - flags = self.flags(rev) - - text, validatehash = self._processflags(rawtext, flags, 'read', raw=raw) - if validatehash: - self.checkhash(text, node, rev=rev) - - return text - - def hash(self, text, p1, p2): - """Compute a node hash. - - Available as a function so that subclasses can replace the hash - as needed. - """ - return hash(text, p1, p2) - - def _processflags(self, text, flags, operation, raw=False): - """Inspect revision data flags and applies transforms defined by - registered flag processors. - - ``text`` - the revision data to process - ``flags`` - the revision flags - ``operation`` - the operation being performed (read or write) - ``raw`` - an optional argument describing if the raw transform should be - applied. - - This method processes the flags in the order (or reverse order if - ``operation`` is 'write') defined by REVIDX_FLAGS_ORDER, applying the - flag processors registered for present flags. The order of flags defined - in REVIDX_FLAGS_ORDER needs to be stable to allow non-commutativity. - - Returns a 2-tuple of ``(text, validatehash)`` where ``text`` is the - processed text and ``validatehash`` is a bool indicating whether the - returned text should be checked for hash integrity. - - Note: If the ``raw`` argument is set, it has precedence over the - operation and will only update the value of ``validatehash``. 
- """ - # fast path: no flag processors will run - if flags == 0: - return text, True - if not operation in ('read', 'write'): - raise ProgrammingError(_("invalid '%s' operation ") % (operation)) - # Check all flags are known. - if flags & ~REVIDX_KNOWN_FLAGS: - raise RevlogError(_("incompatible revision flag '%#x'") % - (flags & ~REVIDX_KNOWN_FLAGS)) - validatehash = True - # Depending on the operation (read or write), the order might be - # reversed due to non-commutative transforms. - orderedflags = REVIDX_FLAGS_ORDER - if operation == 'write': - orderedflags = reversed(orderedflags) - - for flag in orderedflags: - # If a flagprocessor has been registered for a known flag, apply the - # related operation transform and update result tuple. - if flag & flags: - vhash = True - - if flag not in _flagprocessors: - message = _("missing processor for flag '%#x'") % (flag) - raise RevlogError(message) - - processor = _flagprocessors[flag] - if processor is not None: - readtransform, writetransform, rawtransform = processor - - if raw: - vhash = rawtransform(self, text) - elif operation == 'read': - text, vhash = readtransform(self, text) - else: # write operation - text, vhash = writetransform(self, text) - validatehash = validatehash and vhash - - return text, validatehash - - def checkhash(self, text, node, p1=None, p2=None, rev=None): - """Check node hash integrity. - - Available as a function so that subclasses can extend hash mismatch - behaviors as needed. - """ - try: - if p1 is None and p2 is None: - p1, p2 = self.parents(node) - if node != self.hash(text, p1, p2): - revornode = rev - if revornode is None: - revornode = templatefilters.short(hex(node)) - raise RevlogError(_("integrity check failed on %s:%s") - % (self.indexfile, pycompat.bytestr(revornode))) - except RevlogError: - if self._censorable and _censoredtext(text): - raise error.CensoredNodeError(self.indexfile, node, text) - raise - - def _enforceinlinesize(self, tr, fp=None): - """Check if the revlog is too big for inline and convert if so. - - This should be called after revisions are added to the revlog. If the - revlog has grown too large to be an inline revlog, it will convert it - to use multiple index and data files. 
- """ - tiprev = len(self) - 1 - if (not self._inline or - (self.start(tiprev) + self.length(tiprev)) < _maxinline): - return - - trinfo = tr.find(self.indexfile) - if trinfo is None: - raise RevlogError(_("%s not found in the transaction") - % self.indexfile) - - trindex = trinfo[2] - if trindex is not None: - dataoff = self.start(trindex) - else: - # revlog was stripped at start of transaction, use all leftover data - trindex = len(self) - 1 - dataoff = self.end(tiprev) - - tr.add(self.datafile, dataoff) - - if fp: - fp.flush() - fp.close() - - with self._datafp('w') as df: - for r in self: - df.write(self._getsegmentforrevs(r, r)[1]) - - with self._indexfp('w') as fp: - self.version &= ~FLAG_INLINE_DATA - self._inline = False - io = self._io - for i in self: - e = io.packentry(self.index[i], self.node, self.version, i) - fp.write(e) - - # the temp file replace the real index when we exit the context - # manager - - tr.replace(self.indexfile, trindex * self._io.size) - self._chunkclear() - - def addrevision(self, text, transaction, link, p1, p2, cachedelta=None, - node=None, flags=REVIDX_DEFAULT_FLAGS, deltacomputer=None): - """add a revision to the log - - text - the revision data to add - transaction - the transaction object used for rollback - link - the linkrev data to add - p1, p2 - the parent nodeids of the revision - cachedelta - an optional precomputed delta - node - nodeid of revision; typically node is not specified, and it is - computed by default as hash(text, p1, p2), however subclasses might - use different hashing method (and override checkhash() in such case) - flags - the known flags to set on the revision - deltacomputer - an optional _deltacomputer instance shared between - multiple calls - """ - if link == nullrev: - raise RevlogError(_("attempted to add linkrev -1 to %s") - % self.indexfile) - - if flags: - node = node or self.hash(text, p1, p2) - - rawtext, validatehash = self._processflags(text, flags, 'write') - - # If the flag processor modifies the revision data, ignore any provided - # cachedelta. - if rawtext != text: - cachedelta = None - - if len(rawtext) > _maxentrysize: - raise RevlogError( - _("%s: size of %d bytes exceeds maximum revlog storage of 2GiB") - % (self.indexfile, len(rawtext))) - - node = node or self.hash(rawtext, p1, p2) - if node in self.nodemap: - return node - - if validatehash: - self.checkhash(rawtext, node, p1=p1, p2=p2) - - return self.addrawrevision(rawtext, transaction, link, p1, p2, node, - flags, cachedelta=cachedelta, - deltacomputer=deltacomputer) - - def addrawrevision(self, rawtext, transaction, link, p1, p2, node, flags, - cachedelta=None, deltacomputer=None): - """add a raw revision with known flags, node and parents - useful when reusing a revision not stored in this revlog (ex: received - over wire, or read from an external bundle). - """ - dfh = None - if not self._inline: - dfh = self._datafp("a+") - ifh = self._indexfp("a+") - try: - return self._addrevision(node, rawtext, transaction, link, p1, p2, - flags, cachedelta, ifh, dfh, - deltacomputer=deltacomputer) - finally: - if dfh: - dfh.close() - ifh.close() - - def compress(self, data): - """Generate a possibly-compressed representation of data.""" - if not data: - return '', data - - compressed = self._compressor.compress(data) - - if compressed: - # The revlog compressor added the header in the returned data. - return '', compressed - - if data[0:1] == '\0': - return '', data - return 'u', data - - def decompress(self, data): - """Decompress a revlog chunk. 
-
-        The chunk is expected to begin with a header identifying the
-        format type so it can be routed to an appropriate decompressor.
-        """
-        if not data:
-            return data
-
-        # Revlogs are read much more frequently than they are written and many
-        # chunks only take microseconds to decompress, so performance is
-        # important here.
-        #
-        # We can make a few assumptions about revlogs:
-        #
-        # 1) the majority of chunks will be compressed (as opposed to inline
-        #    raw data).
-        # 2) decompressing *any* data will likely be at least 10x slower than
-        #    returning raw inline data.
-        # 3) we want to prioritize common and officially supported compression
-        #    engines
-        #
-        # It follows that we want to optimize for the "decompress compressed
-        # data encoded with common and officially supported compression
-        # engines" case over "raw data" and "data encoded by less common or
-        # non-official compression engines." That is why we have the inline
-        # lookup first followed by the compengines lookup.
-        #
-        # According to `hg perfrevlogchunks`, this is ~0.5% faster for zlib
-        # compressed chunks. And this matters for changelog and manifest reads.
-        t = data[0:1]
-
-        if t == 'x':
-            try:
-                return _zlibdecompress(data)
-            except zlib.error as e:
-                raise RevlogError(_('revlog decompress error: %s') %
-                                  stringutil.forcebytestr(e))
-        # '\0' is more common than 'u' so it goes first.
-        elif t == '\0':
-            return data
-        elif t == 'u':
-            return util.buffer(data, 1)
-
-        try:
-            compressor = self._decompressors[t]
-        except KeyError:
-            try:
-                engine = util.compengines.forrevlogheader(t)
-                compressor = engine.revlogcompressor()
-                self._decompressors[t] = compressor
-            except KeyError:
-                raise RevlogError(_('unknown compression type %r') % t)
-
-        return compressor.decompress(data)
-
-    def _isgooddeltainfo(self, deltainfo, revinfo):
-        """Returns True if the given delta is good. Good means that it is
-        within the disk span, disk size, and chain length bounds that we know
-        to be performant."""
-        if deltainfo is None:
-            return False
-
-        # - 'deltainfo.distance' is the distance from the base revision --
-        #   bounding it limits the amount of I/O we need to do.
-        # - 'deltainfo.compresseddeltalen' is the sum of the total size of
-        #   deltas we need to apply -- bounding it limits the amount of CPU
-        #   we consume.
-
-        if self._sparserevlog:
-            # As sparse-read will be used, we can consider that the distance,
-            # instead of being the span of the whole chunk,
-            # is the span of the largest read chunk
-            base = deltainfo.base
-
-            if base != nullrev:
-                deltachain = self._deltachain(base)[0]
-            else:
-                deltachain = []
-
-            # search for the first non-snapshot revision
-            for idx, r in enumerate(deltachain):
-                if not self.issnapshot(r):
-                    break
-            deltachain = deltachain[idx:]
-            chunks = _slicechunk(self, deltachain, deltainfo)
-            all_span = [_segmentspan(self, revs, deltainfo) for revs in chunks]
-            distance = max(all_span)
-        else:
-            distance = deltainfo.distance
-
-        textlen = revinfo.textlen
-        defaultmax = textlen * 4
-        maxdist = self._maxdeltachainspan
-        if not maxdist:
-            maxdist = distance  # ensure the conditional passes
-        maxdist = max(maxdist, defaultmax)
-        if self._sparserevlog and maxdist < self._srmingapsize:
-            # In multiple places, we are ignoring irrelevant data ranges below
-            # a certain size. We also apply this tradeoff here and relax the
-            # span constraint for small enough content.
-            maxdist = self._srmingapsize
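Pulled out of context, the bound computed above looks like the following sketch (illustrative names only; the 262144 default mirrors a typical sparse-read minimum gap size):

def max_read_span(textlen, maxdeltachainspan, distance,
                  sparse=False, srmingapsize=262144):
    maxdist = maxdeltachainspan
    if not maxdist:
        maxdist = distance            # no configured limit: let the check pass
    maxdist = max(maxdist, 4 * textlen)
    if sparse and maxdist < srmingapsize:
        maxdist = srmingapsize        # never stricter than the minimum gap
    return maxdist

# A chain spanning more than max_read_span() bytes is rejected just below.
assert max_read_span(1000, 0, 5000) == 5000
assert max_read_span(1000, 0, 3000, sparse=True) == 262144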
-
-        # Bad delta from read span:
-        #
-        #   If the span of data read is larger than the maximum allowed.
-        if maxdist < distance:
-            return False
-
-        # Bad delta from new delta size:
-        #
-        #   If the delta size is larger than the target text, storing the
-        #   delta will be inefficient.
-        if textlen < deltainfo.deltalen:
-            return False
-
-        # Bad delta from cumulated payload size:
-        #
-        #   If the sum of the deltas gets larger than K * target text length.
-        if textlen * LIMIT_DELTA2TEXT < deltainfo.compresseddeltalen:
-            return False
-
-        # Bad delta from chain length:
-        #
-        #   If the number of deltas in the chain gets too high.
-        if self._maxchainlen and self._maxchainlen < deltainfo.chainlen:
-            return False
-
-        # bad delta from intermediate snapshot size limit
-        #
-        #   If an intermediate snapshot size is higher than the limit.  The
-        #   limit exists to prevent endless chains of intermediate deltas from
-        #   being created.
-        if (deltainfo.snapshotdepth is not None and
-                (textlen >> deltainfo.snapshotdepth) < deltainfo.deltalen):
-            return False
-
-        # bad delta if the new intermediate snapshot is larger than the
-        # previous snapshot
-        if (deltainfo.snapshotdepth
-                and self.length(deltainfo.base) < deltainfo.deltalen):
-            return False
-
-        return True
-
-    def _addrevision(self, node, rawtext, transaction, link, p1, p2, flags,
-                     cachedelta, ifh, dfh, alwayscache=False,
-                     deltacomputer=None):
-        """internal function to add revisions to the log
-
-        see addrevision for argument descriptions.
-
-        note: "addrevision" takes non-raw text, "_addrevision" takes raw text.
-
-        if "deltacomputer" is not provided or None, a defaultdeltacomputer will
-        be used.
-
-        invariants:
-        - rawtext is optional (can be None); if not set, cachedelta must be set.
-          if both are set, they must correspond to each other.
-        """
-        if node == nullid:
-            raise RevlogError(_("%s: attempt to add null revision") %
-                              (self.indexfile))
-        if node == wdirid or node in wdirfilenodeids:
-            raise RevlogError(_("%s: attempt to add wdir revision") %
-                              (self.indexfile))
-
-        if self._inline:
-            fh = ifh
-        else:
-            fh = dfh
-
-        btext = [rawtext]
-
-        curr = len(self)
-        prev = curr - 1
-        offset = self.end(prev)
-        p1r, p2r = self.rev(p1), self.rev(p2)
-
-        # full versions are inserted when the needed deltas
-        # become comparable to the uncompressed text
-        if rawtext is None:
-            # need rawtext size, before it is changed by flag processors,
-            # which is the non-raw size. use revlog explicitly to avoid
-            # filelog's extra logic that might remove metadata size.
-            textlen = mdiff.patchedsize(revlog.size(self, cachedelta[0]),
-                                        cachedelta[1])
-        else:
-            textlen = len(rawtext)
-
-        if deltacomputer is None:
-            deltacomputer = _deltacomputer(self)
-
-        revinfo = _revisioninfo(node, p1, p2, btext, textlen, cachedelta, flags)
-
-        # no delta for flag processor revisions (see "candelta" for why)
-        # not calling candelta since only one revision needs testing, also to
-        # avoid the overhead of fetching flags again.
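One detail of the index entry built a few lines below: its first field packs the byte offset together with the revision flags, mirroring the offset_type() helper and the `istart[0] >> 16` unpacking seen earlier. A sketch:

def pack_offset_flags(offset, flags):
    assert 0 <= flags < (1 << 16)     # low 16 bits are reserved for flags
    return (offset << 16) | flags

def unpack_offset(field):
    return field >> 16

assert unpack_offset(pack_offset_flags(12345, 0x1)) == 12345
assert pack_offset_flags(12345, 0x1) & 0xffff == 0x1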
- if flags & REVIDX_RAWTEXT_CHANGING_FLAGS: - deltainfo = None - else: - deltainfo = deltacomputer.finddeltainfo(revinfo, fh) - - if deltainfo is not None: - base = deltainfo.base - chainbase = deltainfo.chainbase - data = deltainfo.data - l = deltainfo.deltalen - else: - rawtext = deltacomputer.buildtext(revinfo, fh) - data = self.compress(rawtext) - l = len(data[1]) + len(data[0]) - base = chainbase = curr - - e = (offset_type(offset, flags), l, textlen, - base, link, p1r, p2r, node) - self.index.append(e) - self.nodemap[node] = curr - - entry = self._io.packentry(e, self.node, self.version, curr) - self._writeentry(transaction, ifh, dfh, entry, data, link, offset) - - if alwayscache and rawtext is None: - rawtext = deltacomputer.buildtext(revinfo, fh) - - if type(rawtext) == bytes: # only accept immutable objects - self._cache = (node, curr, rawtext) - self._chainbasecache[curr] = chainbase - return node - - def _writeentry(self, transaction, ifh, dfh, entry, data, link, offset): - # Files opened in a+ mode have inconsistent behavior on various - # platforms. Windows requires that a file positioning call be made - # when the file handle transitions between reads and writes. See - # 3686fa2b8eee and the mixedfilemodewrapper in windows.py. On other - # platforms, Python or the platform itself can be buggy. Some versions - # of Solaris have been observed to not append at the end of the file - # if the file was seeked to before the end. See issue4943 for more. - # - # We work around this issue by inserting a seek() before writing. - # Note: This is likely not necessary on Python 3. - ifh.seek(0, os.SEEK_END) - if dfh: - dfh.seek(0, os.SEEK_END) - - curr = len(self) - 1 - if not self._inline: - transaction.add(self.datafile, offset) - transaction.add(self.indexfile, curr * len(entry)) - if data[0]: - dfh.write(data[0]) - dfh.write(data[1]) - ifh.write(entry) - else: - offset += curr * self._io.size - transaction.add(self.indexfile, offset, curr) - ifh.write(entry) - ifh.write(data[0]) - ifh.write(data[1]) - self._enforceinlinesize(transaction, ifh) - - def addgroup(self, deltas, linkmapper, transaction, addrevisioncb=None): - """ - add a delta group - - given a set of deltas, add them to the revision log. the - first delta is against its parent, which should be in our - log, the rest are against the previous delta. - - If ``addrevisioncb`` is defined, it will be called with arguments of - this revlog and the node that was added. 
- """ - - nodes = [] - - r = len(self) - end = 0 - if r: - end = self.end(r - 1) - ifh = self._indexfp("a+") - isize = r * self._io.size - if self._inline: - transaction.add(self.indexfile, end + isize, r) - dfh = None - else: - transaction.add(self.indexfile, isize, r) - transaction.add(self.datafile, end) - dfh = self._datafp("a+") - def flush(): - if dfh: - dfh.flush() - ifh.flush() - try: - deltacomputer = _deltacomputer(self) - # loop through our set of deltas - for data in deltas: - node, p1, p2, linknode, deltabase, delta, flags = data - link = linkmapper(linknode) - flags = flags or REVIDX_DEFAULT_FLAGS - - nodes.append(node) - - if node in self.nodemap: - # this can happen if two branches make the same change - continue - - for p in (p1, p2): - if p not in self.nodemap: - raise LookupError(p, self.indexfile, - _('unknown parent')) - - if deltabase not in self.nodemap: - raise LookupError(deltabase, self.indexfile, - _('unknown delta base')) - - baserev = self.rev(deltabase) - - if baserev != nullrev and self.iscensored(baserev): - # if base is censored, delta must be full replacement in a - # single patch operation - hlen = struct.calcsize(">lll") - oldlen = self.rawsize(baserev) - newlen = len(delta) - hlen - if delta[:hlen] != mdiff.replacediffheader(oldlen, newlen): - raise error.CensoredBaseError(self.indexfile, - self.node(baserev)) - - if not flags and self._peek_iscensored(baserev, delta, flush): - flags |= REVIDX_ISCENSORED - - # We assume consumers of addrevisioncb will want to retrieve - # the added revision, which will require a call to - # revision(). revision() will fast path if there is a cache - # hit. So, we tell _addrevision() to always cache in this case. - # We're only using addgroup() in the context of changegroup - # generation so the revision data can always be handled as raw - # by the flagprocessor. - self._addrevision(node, None, transaction, link, - p1, p2, flags, (baserev, delta), - ifh, dfh, - alwayscache=bool(addrevisioncb), - deltacomputer=deltacomputer) - - if addrevisioncb: - addrevisioncb(self, node) - - if not dfh and not self._inline: - # addrevision switched from inline to conventional - # reopen the index - ifh.close() - dfh = self._datafp("a+") - ifh = self._indexfp("a+") - finally: - if dfh: - dfh.close() - ifh.close() - - return nodes - - def iscensored(self, rev): - """Check if a file revision is censored.""" - if not self._censorable: - return False - - return self.flags(rev) & REVIDX_ISCENSORED - - def _peek_iscensored(self, baserev, delta, flush): - """Quickly check if a delta produces a censored revision.""" - if not self._censorable: - return False - - # Fragile heuristic: unless new file meta keys are added alphabetically - # preceding "censored", all censored revisions are prefixed by - # "\1\ncensored:". A delta producing such a censored revision must be a - # full-replacement delta, so we inspect the first and only patch in the - # delta for this prefix. - hlen = struct.calcsize(">lll") - if len(delta) <= hlen: - return False - - oldlen = self.rawsize(baserev) - newlen = len(delta) - hlen - if delta[:hlen] != mdiff.replacediffheader(oldlen, newlen): - return False - - add = "\1\ncensored:" - addlen = len(add) - return newlen >= addlen and delta[hlen:hlen + addlen] == add - - def getstrippoint(self, minlink): - """find the minimum rev that must be stripped to strip the linkrev - - Returns a tuple containing the minimum rev and a set of all revs that - have linkrevs that will be broken by this strip. 
- """ - brokenrevs = set() - strippoint = len(self) - - heads = {} - futurelargelinkrevs = set() - for head in self.headrevs(): - headlinkrev = self.linkrev(head) - heads[head] = headlinkrev - if headlinkrev >= minlink: - futurelargelinkrevs.add(headlinkrev) - - # This algorithm involves walking down the rev graph, starting at the - # heads. Since the revs are topologically sorted according to linkrev, - # once all head linkrevs are below the minlink, we know there are - # no more revs that could have a linkrev greater than minlink. - # So we can stop walking. - while futurelargelinkrevs: - strippoint -= 1 - linkrev = heads.pop(strippoint) - - if linkrev < minlink: - brokenrevs.add(strippoint) - else: - futurelargelinkrevs.remove(linkrev) - - for p in self.parentrevs(strippoint): - if p != nullrev: - plinkrev = self.linkrev(p) - heads[p] = plinkrev - if plinkrev >= minlink: - futurelargelinkrevs.add(plinkrev) - - return strippoint, brokenrevs - - def strip(self, minlink, transaction): - """truncate the revlog on the first revision with a linkrev >= minlink - - This function is called when we're stripping revision minlink and - its descendants from the repository. - - We have to remove all revisions with linkrev >= minlink, because - the equivalent changelog revisions will be renumbered after the - strip. - - So we truncate the revlog on the first of these revisions, and - trust that the caller has saved the revisions that shouldn't be - removed and that it'll re-add them after this truncation. - """ - if len(self) == 0: - return - - rev, _ = self.getstrippoint(minlink) - if rev == len(self): - return - - # first truncate the files on disk - end = self.start(rev) - if not self._inline: - transaction.add(self.datafile, end) - end = rev * self._io.size - else: - end += rev * self._io.size - - transaction.add(self.indexfile, end) - - # then reset internal state in memory to forget those revisions - self._cache = None - self._chaininfocache = {} - self._chunkclear() - for x in pycompat.xrange(rev, len(self)): - del self.nodemap[self.node(x)] - - del self.index[rev:-1] - self._nodepos = None - - def checksize(self): - expected = 0 - if len(self): - expected = max(0, self.end(len(self) - 1)) - - try: - with self._datafp() as f: - f.seek(0, 2) - actual = f.tell() - dd = actual - expected - except IOError as inst: - if inst.errno != errno.ENOENT: - raise - dd = 0 - - try: - f = self.opener(self.indexfile) - f.seek(0, 2) - actual = f.tell() - f.close() - s = self._io.size - i = max(0, actual // s) - di = actual - (i * s) - if self._inline: - databytes = 0 - for r in self: - databytes += max(0, self.length(r)) - dd = 0 - di = actual - len(self) * s - databytes - except IOError as inst: - if inst.errno != errno.ENOENT: - raise - di = 0 - - return (dd, di) - - def files(self): - res = [self.indexfile] - if not self._inline: - res.append(self.datafile) - return res - - def emitrevisiondeltas(self, requests): - frev = self.rev - - prevrev = None - for request in requests: - node = request.node - rev = frev(node) - - if prevrev is None: - prevrev = self.index[rev][5] - - # Requesting a full revision. - if request.basenode == nullid: - baserev = nullrev - # Requesting an explicit revision. - elif request.basenode is not None: - baserev = frev(request.basenode) - # Allowing us to choose. - else: - p1rev, p2rev = self.parentrevs(rev) - deltaparentrev = self.deltaparent(rev) - - # Avoid sending full revisions when delta parent is null. Pick - # prev in that case. 
It's tempting to pick p1 in this case, as - # p1 will be smaller in the common case. However, computing a - # delta against p1 may require resolving the raw text of p1, - # which could be expensive. The revlog caches should have prev - # cached, meaning less CPU for delta generation. There is - # likely room to add a flag and/or config option to control this - # behavior. - if deltaparentrev == nullrev and self._storedeltachains: - baserev = prevrev - - # Revlog is configured to use full snapshot for a reason. - # Stick to full snapshot. - elif deltaparentrev == nullrev: - baserev = nullrev - - # Pick previous when we can't be sure the base is available - # on consumer. - elif deltaparentrev not in (p1rev, p2rev, prevrev): - baserev = prevrev - else: - baserev = deltaparentrev - - if baserev != nullrev and not self.candelta(baserev, rev): - baserev = nullrev - - revision = None - delta = None - baserevisionsize = None - - if self.iscensored(baserev) or self.iscensored(rev): - try: - revision = self.revision(node, raw=True) - except error.CensoredNodeError as e: - revision = e.tombstone - - if baserev != nullrev: - baserevisionsize = self.rawsize(baserev) - - elif baserev == nullrev: - revision = self.revision(node, raw=True) - else: - delta = self.revdiff(baserev, rev) - - extraflags = REVIDX_ELLIPSIS if request.ellipsis else 0 - - yield revlogrevisiondelta( - node=node, - p1node=request.p1node, - p2node=request.p2node, - linknode=request.linknode, - basenode=self.node(baserev), - flags=self.flags(rev) | extraflags, - baserevisionsize=baserevisionsize, - revision=revision, - delta=delta) - - prevrev = rev - - DELTAREUSEALWAYS = 'always' - DELTAREUSESAMEREVS = 'samerevs' - DELTAREUSENEVER = 'never' - - DELTAREUSEFULLADD = 'fulladd' - - DELTAREUSEALL = {'always', 'samerevs', 'never', 'fulladd'} - - def clone(self, tr, destrevlog, addrevisioncb=None, - deltareuse=DELTAREUSESAMEREVS, deltabothparents=None): - """Copy this revlog to another, possibly with format changes. - - The destination revlog will contain the same revisions and nodes. - However, it may not be bit-for-bit identical due to e.g. delta encoding - differences. - - The ``deltareuse`` argument control how deltas from the existing revlog - are preserved in the destination revlog. The argument can have the - following values: - - DELTAREUSEALWAYS - Deltas will always be reused (if possible), even if the destination - revlog would not select the same revisions for the delta. This is the - fastest mode of operation. - DELTAREUSESAMEREVS - Deltas will be reused if the destination revlog would pick the same - revisions for the delta. This mode strikes a balance between speed - and optimization. - DELTAREUSENEVER - Deltas will never be reused. This is the slowest mode of execution. - This mode can be used to recompute deltas (e.g. if the diff/delta - algorithm changes). - - Delta computation can be slow, so the choice of delta reuse policy can - significantly affect run time. - - The default policy (``DELTAREUSESAMEREVS``) strikes a balance between - two extremes. Deltas will be reused if they are appropriate. But if the - delta could choose a better revision, it will do so. This means if you - are converting a non-generaldelta revlog to a generaldelta revlog, - deltas will be recomputed if the delta's parent isn't a parent of the - revision. - - In addition to the delta policy, the ``deltabothparents`` argument - controls whether to compute deltas against both parents for merges. - By default, the current default is used. 
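A hypothetical usage sketch for the slowest policy described above: recomputing every delta while converting a revlog. `srclog`, `dstlog` and `tr` stand for a source revlog, an empty destination revlog and an open transaction; none of them are constructed here:

def recompute_deltas(srclog, dstlog, tr):
    # DELTAREUSENEVER forces every delta to be recomputed in the destination.
    srclog.clone(tr, dstlog, deltareuse=srclog.DELTAREUSENEVER,
                 deltabothparents=None)  # None keeps the destination default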
- """ - if deltareuse not in self.DELTAREUSEALL: - raise ValueError(_('value for deltareuse invalid: %s') % deltareuse) - - if len(destrevlog): - raise ValueError(_('destination revlog is not empty')) - - if getattr(self, 'filteredrevs', None): - raise ValueError(_('source revlog has filtered revisions')) - if getattr(destrevlog, 'filteredrevs', None): - raise ValueError(_('destination revlog has filtered revisions')) - - # lazydeltabase controls whether to reuse a cached delta, if possible. - oldlazydeltabase = destrevlog._lazydeltabase - oldamd = destrevlog._deltabothparents - - try: - if deltareuse == self.DELTAREUSEALWAYS: - destrevlog._lazydeltabase = True - elif deltareuse == self.DELTAREUSESAMEREVS: - destrevlog._lazydeltabase = False - - destrevlog._deltabothparents = deltabothparents or oldamd - - populatecachedelta = deltareuse in (self.DELTAREUSEALWAYS, - self.DELTAREUSESAMEREVS) - - deltacomputer = _deltacomputer(destrevlog) - index = self.index - for rev in self: - entry = index[rev] - - # Some classes override linkrev to take filtered revs into - # account. Use raw entry from index. - flags = entry[0] & 0xffff - linkrev = entry[4] - p1 = index[entry[5]][7] - p2 = index[entry[6]][7] - node = entry[7] - - # (Possibly) reuse the delta from the revlog if allowed and - # the revlog chunk is a delta. - cachedelta = None - rawtext = None - if populatecachedelta: - dp = self.deltaparent(rev) - if dp != nullrev: - cachedelta = (dp, bytes(self._chunk(rev))) - - if not cachedelta: - rawtext = self.revision(rev, raw=True) - - - if deltareuse == self.DELTAREUSEFULLADD: - destrevlog.addrevision(rawtext, tr, linkrev, p1, p2, - cachedelta=cachedelta, - node=node, flags=flags, - deltacomputer=deltacomputer) - else: - ifh = destrevlog.opener(destrevlog.indexfile, 'a+', - checkambig=False) - dfh = None - if not destrevlog._inline: - dfh = destrevlog.opener(destrevlog.datafile, 'a+') - try: - destrevlog._addrevision(node, rawtext, tr, linkrev, p1, - p2, flags, cachedelta, ifh, dfh, - deltacomputer=deltacomputer) - finally: - if dfh: - dfh.close() - ifh.close() - - if addrevisioncb: - addrevisioncb(self, rev, node) - finally: - destrevlog._lazydeltabase = oldlazydeltabase - destrevlog._deltabothparents = oldamd