# HG changeset patch # User Pierre-Yves David # Date 2021-04-06 03:20:24 # Node ID 4c041c71ec01356c6e9fb329b5fdc2c34041bb39 # Parent 3e381eb557f3c6869301c86d711904c5098ed06f revlog: introduce an explicit tracking of what the revlog is about Since the dawn of time, people have been forced to rely to lossy introspection of the index filename to determine what the purpose and role of the revlog they encounter is. This is hacky, error prone, inflexible, abstraction-leaky, . In f63299ee7e4d Raphaël introduced a new attribute to track this information: `revlog_kind`. However it is initialized in an odd place and various instances end up not having it set. In addition is only tracking some of the information we end up having to introspect in various pieces of code. So we add a new attribute that holds more data and is more strictly enforced. This work is done in collaboration with Raphaël. The `revlog_kind` one will be removed/adapted in the next changeset. We expect to be able to clean up various existing piece of code and to simplify coming work around the newer revlog format. Differential Revision: https://phab.mercurial-scm.org/D10352 diff --git a/contrib/dumprevlog b/contrib/dumprevlog --- a/contrib/dumprevlog +++ b/contrib/dumprevlog @@ -13,6 +13,10 @@ from mercurial import ( ) from mercurial.utils import procutil +from mercurial.revlogutils import ( + constants as revlog_constants, +) + for fp in (sys.stdin, sys.stdout, sys.stderr): procutil.setbinary(fp) @@ -32,7 +36,11 @@ def printb(data, end=b'\n'): for f in sys.argv[1:]: - r = revlog.revlog(binopen, encoding.strtolocal(f)) + r = revlog.revlog( + binopen, + target=(revlog_constants.KIND_OTHER, b'dump-revlog'), + indexfile=encoding.strtolocal(f), + ) print("file:", f) for i in r: n = r.node(i) diff --git a/contrib/perf.py b/contrib/perf.py --- a/contrib/perf.py +++ b/contrib/perf.py @@ -66,6 +66,8 @@ import sys import tempfile import threading import time + +import mercurial.revlog from mercurial import ( changegroup, cmdutil, @@ -76,7 +78,6 @@ from mercurial import ( hg, mdiff, merge, - revlog, util, ) @@ -119,6 +120,21 @@ try: except ImportError: profiling = None +try: + from mercurial.revlogutils import constants as revlog_constants + + perf_rl_kind = (revlog_constants.KIND_OTHER, b'created-by-perf') + + def revlog(opener, *args, **kwargs): + return mercurial.revlog.revlog(opener, perf_rl_kind, *args, **kwargs) + + +except (ImportError, AttributeError): + perf_rl_kind = None + + def revlog(opener, *args, **kwargs): + return mercurial.revlog.revlog(opener, *args, **kwargs) + def identity(a): return a @@ -1809,7 +1825,8 @@ def perfnodelookup(ui, repo, rev, **opts mercurial.revlog._prereadsize = 2 ** 24 # disable lazy parser in old hg n = scmutil.revsingle(repo, rev).node() - cl = mercurial.revlog.revlog(getsvfs(repo), b"00changelog.i") + + cl = revlog(getsvfs(repo), indexfile=b"00changelog.i") def d(): cl.rev(n) @@ -2602,9 +2619,9 @@ def perfrevlogindex(ui, repo, file_=None else: raise error.Abort(b'unsupported revlog version: %d' % version) - parse_index_v1 = getattr(revlog, 'parse_index_v1', None) + parse_index_v1 = getattr(mercurial.revlog, 'parse_index_v1', None) if parse_index_v1 is None: - parse_index_v1 = revlog.revlogio().parseindex + parse_index_v1 = mercurial.revlog.revlogio().parseindex rllen = len(rl) @@ -2620,7 +2637,7 @@ def perfrevlogindex(ui, repo, file_=None allnodesrev = list(reversed(allnodes)) def constructor(): - revlog.revlog(opener, indexfile) + revlog(opener, indexfile=indexfile) def read(): with opener(indexfile) as fh: @@ -3042,7 +3059,7 @@ def _temprevlog(ui, orig, truncaterev): vfs = vfsmod.vfs(tmpdir) vfs.options = getattr(orig.opener, 'options', None) - dest = revlog.revlog( + dest = revlog( vfs, indexfile=indexname, datafile=dataname, **revlogkwargs ) if dest._inline: diff --git a/contrib/undumprevlog b/contrib/undumprevlog --- a/contrib/undumprevlog +++ b/contrib/undumprevlog @@ -15,6 +15,10 @@ from mercurial import ( ) from mercurial.utils import procutil +from mercurial.revlogutils import ( + constants as revlog_constants, +) + for fp in (sys.stdin, sys.stdout, sys.stderr): procutil.setbinary(fp) @@ -28,7 +32,11 @@ while True: break if l.startswith("file:"): f = encoding.strtolocal(l[6:-1]) - r = revlog.revlog(opener, f) + r = revlog.revlog( + opener, + target=(revlog_constants.KIND_OTHER, b'undump-revlog'), + indexfile=f, + ) procutil.stdout.write(b'%s\n' % f) elif l.startswith("node:"): n = bin(l[6:-1]) diff --git a/mercurial/bundlerepo.py b/mercurial/bundlerepo.py --- a/mercurial/bundlerepo.py +++ b/mercurial/bundlerepo.py @@ -46,9 +46,13 @@ from .utils import ( urlutil, ) +from .revlogutils import ( + constants as revlog_constants, +) + class bundlerevlog(revlog.revlog): - def __init__(self, opener, indexfile, cgunpacker, linkmapper): + def __init__(self, opener, target, indexfile, cgunpacker, linkmapper): # How it works: # To retrieve a revision, we need to know the offset of the revision in # the bundle (an unbundle object). We store this offset in the index @@ -57,7 +61,7 @@ class bundlerevlog(revlog.revlog): # To differentiate a rev in the bundle from a rev in the revlog, we # check revision against repotiprev. opener = vfsmod.readonlyvfs(opener) - revlog.revlog.__init__(self, opener, indexfile) + revlog.revlog.__init__(self, opener, target=target, indexfile=indexfile) self.bundle = cgunpacker n = len(self) self.repotiprev = n - 1 @@ -171,7 +175,12 @@ class bundlechangelog(bundlerevlog, chan changelog.changelog.__init__(self, opener) linkmapper = lambda x: x bundlerevlog.__init__( - self, opener, self.indexfile, cgunpacker, linkmapper + self, + opener, + (revlog_constants.KIND_CHANGELOG, None), + self.indexfile, + cgunpacker, + linkmapper, ) @@ -187,7 +196,12 @@ class bundlemanifest(bundlerevlog, manif ): manifest.manifestrevlog.__init__(self, nodeconstants, opener, tree=dir) bundlerevlog.__init__( - self, opener, self.indexfile, cgunpacker, linkmapper + self, + opener, + (revlog_constants.KIND_MANIFESTLOG, dir), + self.indexfile, + cgunpacker, + linkmapper, ) if dirlogstarts is None: dirlogstarts = {} @@ -214,7 +228,12 @@ class bundlefilelog(filelog.filelog): def __init__(self, opener, path, cgunpacker, linkmapper): filelog.filelog.__init__(self, opener, path) self._revlog = bundlerevlog( - opener, self.indexfile, cgunpacker, linkmapper + opener, + # XXX should use the unencoded path + target=(revlog_constants.KIND_FILELOG, path), + indexfile=self.indexfile, + cgunpacker=cgunpacker, + linkmapper=linkmapper, ) diff --git a/mercurial/changelog.py b/mercurial/changelog.py --- a/mercurial/changelog.py +++ b/mercurial/changelog.py @@ -25,7 +25,10 @@ from .utils import ( dateutil, stringutil, ) -from .revlogutils import flagutil +from .revlogutils import ( + constants as revlog_constants, + flagutil, +) _defaultextra = {b'branch': b'default'} @@ -401,7 +404,8 @@ class changelog(revlog.revlog): revlog.revlog.__init__( self, opener, - indexfile, + target=(revlog_constants.KIND_CHANGELOG, None), + indexfile=indexfile, datafile=datafile, checkambig=True, mmaplargeindex=True, diff --git a/mercurial/cmdutil.py b/mercurial/cmdutil.py --- a/mercurial/cmdutil.py +++ b/mercurial/cmdutil.py @@ -61,6 +61,10 @@ from .utils import ( stringutil, ) +from .revlogutils import ( + constants as revlog_constants, +) + if pycompat.TYPE_CHECKING: from typing import ( Any, @@ -1428,8 +1432,12 @@ def openstorage(repo, cmd, file_, opts, raise error.CommandError(cmd, _(b'invalid arguments')) if not os.path.isfile(file_): raise error.InputError(_(b"revlog '%s' not found") % file_) + + target = (revlog_constants.KIND_OTHER, b'free-form:%s' % file_) r = revlog.revlog( - vfsmod.vfs(encoding.getcwd(), audit=False), file_[:-2] + b".i" + vfsmod.vfs(encoding.getcwd(), audit=False), + target=target, + indexfile=file_[:-2] + b".i", ) return r diff --git a/mercurial/filelog.py b/mercurial/filelog.py --- a/mercurial/filelog.py +++ b/mercurial/filelog.py @@ -18,13 +18,20 @@ from .interfaces import ( util as interfaceutil, ) from .utils import storageutil +from .revlogutils import ( + constants as revlog_constants, +) @interfaceutil.implementer(repository.ifilestorage) class filelog(object): def __init__(self, opener, path): self._revlog = revlog.revlog( - opener, b'/'.join((b'data', path + b'.i')), censorable=True + opener, + # XXX should use the unencoded path + target=(revlog_constants.KIND_FILELOG, path), + indexfile=b'/'.join((b'data', path + b'.i')), + censorable=True, ) # Full name of the user visible file, relative to the repository root. # Used by LFS. diff --git a/mercurial/manifest.py b/mercurial/manifest.py --- a/mercurial/manifest.py +++ b/mercurial/manifest.py @@ -34,6 +34,9 @@ from .interfaces import ( repository, util as interfaceutil, ) +from .revlogutils import ( + constants as revlog_constants, +) parsers = policy.importmod('parsers') propertycache = util.propertycache @@ -1610,7 +1613,8 @@ class manifestrevlog(object): self._revlog = revlog.revlog( opener, - indexfile, + target=(revlog_constants.KIND_MANIFESTLOG, self.tree), + indexfile=indexfile, # only root indexfile is cached checkambig=not bool(tree), mmaplargeindex=True, diff --git a/mercurial/revlog.py b/mercurial/revlog.py --- a/mercurial/revlog.py +++ b/mercurial/revlog.py @@ -34,6 +34,7 @@ from .node import ( from .i18n import _ from .pycompat import getattr from .revlogutils.constants import ( + ALL_KINDS, FLAG_GENERALDELTA, FLAG_INLINE_DATA, INDEX_HEADER, @@ -287,7 +288,8 @@ class revlog(object): def __init__( self, opener, - indexfile, + target, + indexfile=None, datafile=None, checkambig=False, mmaplargeindex=False, @@ -302,6 +304,12 @@ class revlog(object): opener is a function that abstracts the file opening operation and can be used to implement COW semantics or the like. + `target`: a (KIND, ID) tuple that identify the content stored in + this revlog. It help the rest of the code to understand what the revlog + is about without having to resort to heuristic and index filename + analysis. Note: that this must be reliably be set by normal code, but + that test, debug, or performance measurement code might not set this to + accurate value. """ self.upperboundcomp = upperboundcomp self.indexfile = indexfile @@ -313,6 +321,9 @@ class revlog(object): ) self.opener = opener + assert target[0] in ALL_KINDS + assert len(target) == 2 + self.target = target # When True, indexfile is opened with checkambig=True at writing, to # avoid file stat ambiguity. self._checkambig = checkambig @@ -2869,7 +2880,13 @@ class revlog(object): newdatafile = self.datafile + b'.tmpcensored' # This is a bit dangerous. We could easily have a mismatch of state. - newrl = revlog(self.opener, newindexfile, newdatafile, censorable=True) + newrl = revlog( + self.opener, + target=self.target, + indexfile=newindexfile, + datafile=newdatafile, + censorable=True, + ) newrl.version = self.version newrl._generaldelta = self._generaldelta newrl._parse_index = self._parse_index diff --git a/mercurial/revlogutils/constants.py b/mercurial/revlogutils/constants.py --- a/mercurial/revlogutils/constants.py +++ b/mercurial/revlogutils/constants.py @@ -13,6 +13,20 @@ import struct from ..interfaces import repository +### Internal utily constants + +KIND_CHANGELOG = 1001 # over 256 to not be comparable with a bytes +KIND_MANIFESTLOG = 1002 +KIND_FILELOG = 1003 +KIND_OTHER = 1004 + +ALL_KINDS = { + KIND_CHANGELOG, + KIND_MANIFESTLOG, + KIND_FILELOG, + KIND_OTHER, +} + ### main revlog header INDEX_HEADER = struct.Struct(b">I") diff --git a/mercurial/unionrepo.py b/mercurial/unionrepo.py --- a/mercurial/unionrepo.py +++ b/mercurial/unionrepo.py @@ -41,7 +41,11 @@ class unionrevlog(revlog.revlog): # To differentiate a rev in the second revlog from a rev in the revlog, # we check revision against repotiprev. opener = vfsmod.readonlyvfs(opener) - revlog.revlog.__init__(self, opener, indexfile) + target = getattr(revlog2, 'target', None) + if target is None: + # a revlog wrapper, eg: the manifestlog that is not an actual revlog + target = revlog2._revlog.target + revlog.revlog.__init__(self, opener, target=target, indexfile=indexfile) self.revlog2 = revlog2 n = len(self) diff --git a/tests/test-revlog-raw.py b/tests/test-revlog-raw.py --- a/tests/test-revlog-raw.py +++ b/tests/test-revlog-raw.py @@ -14,6 +14,7 @@ from mercurial import ( ) from mercurial.revlogutils import ( + constants, deltas, flagutil, ) @@ -81,7 +82,9 @@ def newtransaction(): def newrevlog(name=b'_testrevlog.i', recreate=False): if recreate: tvfs.tryunlink(name) - rlog = revlog.revlog(tvfs, name) + rlog = revlog.revlog( + tvfs, target=(constants.KIND_OTHER, b'test'), indexfile=name + ) return rlog diff --git a/tests/test-revlog.t b/tests/test-revlog.t --- a/tests/test-revlog.t +++ b/tests/test-revlog.t @@ -45,9 +45,10 @@ Test for CVE-2016-3630 0 2 99e0332bd498 000000000000 000000000000 1 3 6674f57a23d8 99e0332bd498 000000000000 + >>> from mercurial.revlogutils.constants import KIND_OTHER >>> from mercurial import revlog, vfs >>> tvfs = vfs.vfs(b'.') >>> tvfs.options = {b'revlogv1': True} - >>> rl = revlog.revlog(tvfs, b'a.i') + >>> rl = revlog.revlog(tvfs, target=(KIND_OTHER, b'test'), indexfile=b'a.i') >>> rl.revision(1) mpatchError(*'patch cannot be decoded'*) (glob)