##// END OF EJS Templates
branchmap-v3: filter topo heads using node for performance reason...
branchmap-v3: filter topo heads using node for performance reasons. The branchmap currently contains heads as nodeids. If we build a set of revnums from the topological heads, we need to turn the nodeids in the branchmap into revnums to be able to check whether they are topo-heads. That nodeid → revnum lookup is "expensive" and adds up to something noticeable if you do it hundreds of thousands of times. Instead we turn all the topo-head revnums into nodes and build a set, so we can directly test membership of the nodeids stored in the branchmap. That is much faster. Ideally we would have revnums in the branchmap and could directly test a revnum against a revnum set, which would be even faster. However, that's an adventure for another time. Without this change, the branchmap format "v3" was significantly slower than the "v2" format. With this change, some of that gap is recovered. With rust + persistent nodemap, this overhead was smaller because the extra lookup did not have to build the nodemap from scratch. In addition, the mozilla-unified repository is able to use the "pure_top" mode of branchmap v3, so it was not really affected by this. Future changesets will work on the remainder of the performance gap. 
### benchmark.name = hg.command.unbundle # bin-env-vars.hg.py-re2-module = default # benchmark.variants.issue6528 = disabled # benchmark.variants.resource-usage = default # benchmark.variants.reuse-external-delta-parent = yes # benchmark.variants.revs = any-1-extra-rev # benchmark.variants.source = unbundle # benchmark.variants.validate = default # benchmark.variants.verbosity = quiet ## data-env-vars.name = netbeans-2018-08-01-zstd-sparse-revlog # bin-env-vars.hg.flavor = default branch-v2: 0.233711 ~~~~~ branch-v3 before: 0.380994 (+63.02%, +0.15) branch-v3 after: 0.368769 (+57.79%, +0.14) # bin-env-vars.hg.flavor = rust branch-v2: 0.235230 ~~~~~ branch-v3 before: 0.385060 (+63.70%, +0.15) branch-v3 after: 0.372460 (+58.34%, +0.14) ## data-env-vars.name = netbeans-2018-08-01-ds2-pnm # bin-env-vars.hg.flavor = rust branch-v2: 0.255586 ~~~~~ branch-v3 before: 0.317524 (+24.23%, +0.06) branch-v3 after: 0.318907 (+24.78%, +0.06) ## data-env-vars.name = mozilla-central-2024-03-22-zstd-sparse-revlog # bin-env-vars.hg.flavor = default branch-v2: 0.339010 ~~~~~ branch-v3 before: 0.410007 (+20.94%, +0.07) branch-v3 after: 0.349752 (+3.17%, +0.01) # bin-env-vars.hg.flavor = rust branch-v2: 0.346525 ~~~~~ branch-v3 before: 0.410428 (+18.44%, +0.06) branch-v3 after: 0.354300 (+2.24%, +0.01) ## data-env-vars.name = mozilla-central-2024-03-22-ds2-pnm # bin-env-vars.hg.flavor = rust branch-v2: 0.380202 ~~~~~ branch-v3 before: 0.393871 (+3.60%, +0.01) branch-v3 after: 0.396293 (+4.23%, +0.02) ## data-env-vars.name = mozilla-unified-2024-03-22-zstd-sparse-revlog # bin-env-vars.hg.flavor = default branch-v2: 0.412165 ~~~~~ branch-v3 before: 0.438105 (+6.29%, +0.03) branch-v3 after: 0.424769 (+3.06%, +0.01) # bin-env-vars.hg.flavor = rust branch-v2: 0.412397 ~~~~~ branch-v3 before: 0.438405 (+6.31%, +0.03) branch-v3 after: 0.421796 (+2.28%, +0.01) ## data-env-vars.name = mozilla-unified-2024-03-22-ds2-pnm # bin-env-vars.hg.flavor = rust branch-v2: 0.429501 ~~~~~ branch-v3 before: 
0.452692 (+5.40%, +0.02) branch-v3 after: 0.443849 (+3.34%, +0.01) ## data-env-vars.name = mozilla-try-2024-03-26-zstd-sparse-revlog # bin-env-vars.hg.flavor = default branch-v2: 3.403171 ~~~~~ branch-v3 before: 6.562345 (+92.83%, +3.16) branch-v3 after: 6.234055 (+83.18%, +2.83) # bin-env-vars.hg.flavor = rust branch-v2: 3.454876 ~~~~~ branch-v3 before: 6.160248 (+78.31%, +2.71) branch-v3 after: 6.307813 (+82.58%, +2.85) ## data-env-vars.name = mozilla-try-2024-03-26-ds2-pnm # bin-env-vars.hg.flavor = rust branch-v2: 3.465435 ~~~~~ branch-v3 before: 5.381648 (+55.30%, +1.92) branch-v3 after: 5.176076 (+49.36%, +1.71)

File last commit:

r52868:76416b6e default
r52869:41b8892a default
Show More
rev_cache.py
442 lines | 16.2 KiB | text/x-python | PythonLexer
rev-branch-cache: move the code in a dedicated module...
r52794 # rev_cache.py - caching branch information per revision
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.
from __future__ import annotations
rev-branch-cache: stop truncating cache file...
r52798 import os
rev-branch-cache: move the code in a dedicated module...
r52794 import struct
from ..node import (
nullrev,
)
from .. import (
encoding,
error,
util,
)
from ..utils import (
stringutil,
)
# Module-level aliases for the struct helpers used below to size, write and
# read cache records.
calcsize = struct.calcsize
pack_into = struct.pack_into
unpack_from = struct.unpack_from
# Revision branch info cache
rev-branch-cache: increment the version to "v2"...
r52799 # The "V2" version uses the same format as the "V1" but guarantees it won't be
# truncated, preventing SIGBUS when it is mmap-ed
_rbcversion = b'-v2'
rev-branch-cache: move the code in a dedicated module...
r52794 _rbcnames = b'rbc-names' + _rbcversion
_rbcrevs = b'rbc-revs' + _rbcversion
rev-branch-cache: fallback on "v1" data if no v2 is found...
r52800 _rbc_legacy_version = b'-v1'
_rbc_legacy_names = b'rbc-names' + _rbc_legacy_version
_rbc_legacy_revs = b'rbc-revs' + _rbc_legacy_version
rev-branch-cache: move the code in a dedicated module...
r52794 # [4 byte hash prefix][4 byte branch name number with sign bit indicating open]
# Big-endian record: 4 raw bytes (node hash prefix) followed by one unsigned
# 32-bit integer (branch name index plus the close flag).
_rbcrecfmt = b'>4sI'
# Size in bytes of one packed record.
_rbcrecsize = calcsize(_rbcrecfmt)
# 64 records' worth of bytes; not referenced in the visible part of this
# module.
_rbcmininc = 64 * _rbcrecsize
# Number of leading node bytes compared against the cached hash prefix.
_rbcnodelen = 4
# Low 31 bits of the integer field: index into the branch names list.
_rbcbranchidxmask = 0x7FFFFFFF
# Top bit of the integer field: set when the revision closes its branch.
_rbccloseflag = 0x80000000
rev-branch-cache: stop truncating cache file...
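r52798 # When the region past the write offset reaches this fraction of the existing rbc-revs file, the whole file is rewritten with atomic replacement.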
REWRITE_RATIO = 0.2
rev-branch-cache: move the code in a dedicated module...
class rbcrevs:
    """A byte string made of an immutable prefix followed by a mutable suffix.

    The prefix is the buffer handed to the constructor; it is left untouched
    until a modification needs it, at which point it is copied into the
    mutable suffix (a ``bytearray``). All offsets taken by the methods below
    are byte offsets into the logical concatenation of both parts.
    """

    def __init__(self, revs):
        self._prefix = revs
        self._rest = bytearray()

    @property
    def len_prefix(self):
        """Length of the prefix rounded down to a whole number of records.

        Trailing bytes that do not form a complete record are ignored.
        """
        size = len(self._prefix)
        return size - (size % _rbcrecsize)

    def __len__(self):
        return self.len_prefix + len(self._rest)

    def unpack_record(self, rbcrevidx):
        """Decode the record stored at byte offset ``rbcrevidx``."""
        if rbcrevidx < self.len_prefix:
            return unpack_from(_rbcrecfmt, util.buffer(self._prefix), rbcrevidx)
        else:
            return unpack_from(
                _rbcrecfmt,
                util.buffer(self._rest),
                rbcrevidx - self.len_prefix,
            )

    def make_mutable(self):
        """Fold the (aligned part of the) prefix into the mutable suffix."""
        if self.len_prefix > 0:
            entirety = bytearray()
            entirety[:] = self._prefix[: self.len_prefix]
            entirety.extend(self._rest)
            self._rest = entirety
            self._prefix = bytearray()

    def truncate(self, pos):
        """Drop every byte from byte offset ``pos`` onward."""
        self.make_mutable()
        del self._rest[pos:]

    def pack_into(self, rbcrevidx, node, branchidx):
        """Encode ``(node, branchidx)`` as the record at byte offset
        ``rbcrevidx``, growing the mutable suffix when needed."""
        if rbcrevidx < self.len_prefix:
            # the target lives in the immutable part: copy it first
            self.make_mutable()
        buf = self._rest
        # computed after make_mutable(), which may have emptied the prefix
        start_offset = rbcrevidx - self.len_prefix
        end_offset = start_offset + _rbcrecsize

        if len(self._rest) < end_offset:
            # bytearray doesn't allocate extra space at least in Python 3.7.
            # When multiple changesets are added in a row, precise resize would
            # result in quadratic complexity. Overallocate to compensate by
            # using the classic doubling technique for dynamic arrays instead.
            # If there was a gap in the map before, less space will be reserved.
            self._rest.extend(b'\0' * end_offset)
        return pack_into(
            _rbcrecfmt,
            buf,
            start_offset,
            node,
            branchidx,
        )

    def extend(self, extension):
        """Append raw bytes to the mutable suffix."""
        return self._rest.extend(extension)

    def slice(self, begin, end):
        """Return a copy of the bytes in ``[begin, end)``.

        The range may lie in the prefix, in the suffix, or span both.
        """
        len_prefix = self.len_prefix
        # Offsets relative to the suffix are clamped at zero: a negative
        # bound would otherwise be counted from the *end* of the suffix and
        # select unrelated bytes.
        rest_begin = max(begin - len_prefix, 0)
        rest_end = max(end - len_prefix, 0)
        if begin < len_prefix:
            acc = bytearray()
            acc[:] = self._prefix[begin : min(end, len_prefix)]
            acc.extend(self._rest[rest_begin:rest_end])
            return acc
        return self._rest[rest_begin:rest_end]
rev-branch-cache: move the code in a dedicated module...
r52794
class revbranchcache:
"""Persistent cache, mapping from revision number to branch name and close.
This is a low level cache, independent of filtering.
Branch names are stored in rbc-names in internal encoding separated by 0.
rbc-names is append-only, and each branch name is only stored once and will
thus have a unique index.
The branch info for each revision is stored in rbc-revs as constant size
records. The whole file is read into memory, but it is only 'parsed' on
demand. The file is usually append-only but will be truncated if repo
modification is detected.
The record for each revision contains the first 4 bytes of the
corresponding node hash, and the record is only used if it still matches.
Even a completely trashed rbc-revs will thus still give the right result
while converging towards full recovery ... assuming no incorrectly matching
node hashes.
The record also contains 4 bytes where 31 bits contains the index of the
branch and the last bit indicate that it is a branch close commit.
The usage pattern for rbc-revs is thus somewhat similar to 00changelog.i
and will grow with it but be 1/8th of its size.
"""
def __init__(self, repo, readonly=True):
    """Load the branch names and per-revision records from the cache files.

    The "v2" files are tried first; when the names file cannot be read the
    "v1" files are used instead and a rewrite is scheduled. When no names
    file can be read at all and ``readonly`` is true, ``branchinfo`` is
    replaced on the instance by the uncached ``_branchinfo``.
    """
    assert repo.filtername is None
    self._repo = repo
    # branch names in local encoding with static index
    self._names = []
    self._rbcrevs = rbcrevs(bytearray())
    # size in bytes of the names data that was read
    self._rbcsnameslen = 0
    self._force_overwrite = False
    from_legacy = False
    try:
        try:
            raw_names = repo.cachevfs.read(_rbcnames)
        except (IOError, OSError):
            # If we don't have "v2" data, we might have "v1" data worth
            # using.
            #
            # consider stop doing this many version after hg-6.9 release
            raw_names = repo.cachevfs.read(_rbc_legacy_names)
            from_legacy = True
            self._force_overwrite = True
        # kept for verification before writing
        self._rbcsnameslen = len(raw_names)
        if raw_names:
            self._names = list(map(encoding.tolocal, raw_names.split(b'\0')))
    except (IOError, OSError):
        if readonly:
            # don't try to use cache - fall back to the slow path
            self.branchinfo = self._branchinfo

    if self._names:
        try:
            usemmap = repo.ui.configbool(b'storage', b'revbranchcache.mmap')
            if from_legacy:
                # If we don't have "v2" data, we might have "v1" data worth
                # using.
                #
                # Consider stop doing this many version after hg-6.9
                # release.
                with repo.cachevfs(_rbc_legacy_revs) as fp:
                    raw_revs = fp.read()
            else:
                with repo.cachevfs(_rbcrevs) as fp:
                    if usemmap and repo.cachevfs.is_mmap_safe(_rbcrevs):
                        raw_revs = util.buffer(util.mmapread(fp))
                    else:
                        raw_revs = fp.read()
            self._rbcrevs = rbcrevs(raw_revs)
        except (IOError, OSError) as inst:
            repo.ui.debug(
                b"couldn't read revision branch cache: %s\n"
                % stringutil.forcebytestr(inst)
            )
    # remember number of good records on disk
    records_read = len(self._rbcrevs) // _rbcrecsize
    self._rbcrevslen = min(records_read, len(repo.changelog))
    if self._rbcrevslen == 0:
        self._names = []
    # number of names that were read from disk
    self._rbcnamescount = len(self._names)
def _clear(self):
    """Forget every cached name and record and schedule a full rewrite.

    The record buffer is replaced by a zero-filled one sized for the
    current changelog.
    """
    self._rbcsnameslen = 0
    self._names.clear()
    self._rbcnamescount = 0
    rev_count = len(self._repo.changelog)
    self._rbcrevslen = rev_count
    self._rbcrevs = rbcrevs(bytearray(rev_count * _rbcrecsize))
    util.clearcachedproperty(self, b'_namesreverse')
    self._force_overwrite = True
def invalidate(self, rev=0):
    """Drop the cached records from revision ``rev`` onward.

    The in-memory buffer is cut at the first byte of the record for
    ``rev`` and a full rewrite of the cache files is scheduled.
    """
    self._rbcrevslen = rev
    # truncate() works on byte offsets, so convert the revision number
    self._rbcrevs.truncate(rev * _rbcrecsize)
    self._force_overwrite = True
rev-branch-cache: move the code in a dedicated module...
r52794
@util.propertycache
def _namesreverse(self):
    """Mapping from branch name to its index in ``self._names``."""
    return {name: index for index, name in enumerate(self._names)}
def branchinfo(self, rev):
    """Return branch name and close flag for rev, using and updating
    persistent cache."""
    changelog = self._repo.changelog
    rbcrevidx = rev * _rbcrecsize

    # avoid negative index, changelog.read(nullrev) is fast without cache
    if rev == nullrev:
        return changelog.branchinfo(rev)

    # if requested rev isn't allocated, grow and cache the rev info
    if len(self._rbcrevs) < rbcrevidx + _rbcrecsize:
        return self._branchinfo(rev)

    # fast path: extract data from cache, use it if node is matching
    reponode = changelog.node(rev)[:_rbcnodelen]
    cachenode, branchidx = self._rbcrevs.unpack_record(rbcrevidx)
    close = bool(branchidx & _rbccloseflag)
    if close:
        branchidx &= _rbcbranchidxmask
    if cachenode == b'\0\0\0\0':
        # all-zero prefix: nothing usable is stored for this revision
        pass
    elif cachenode == reponode:
        try:
            return self._names[branchidx], close
        except IndexError:
            # recover from invalid reference to unknown branch
            self._repo.ui.debug(
                b"referenced branch names not found"
                b" - rebuilding revision branch cache from scratch\n"
            )
            self._clear()
    else:
        # rev/node map has changed, invalidate the cache from here up
        self._repo.ui.debug(
            b"history modification detected - truncating "
            b"revision branch cache to revision %d\n" % rev
        )
        truncate = rbcrevidx + _rbcrecsize
        self._rbcrevs.truncate(truncate)
        # _rbcrevslen counts records, so it is capped with the revision
        # number rather than with the byte offset used for truncation
        self._rbcrevslen = min(self._rbcrevslen, rev)

    # fall back to slow path and make sure it will be written to disk
    return self._branchinfo(rev)
def _branchinfo(self, rev):
    """Read the branch info of ``rev`` from the changelog, store it in the
    in-memory records and return ``(branch, close)``."""
    cl = self._repo.changelog
    branch, closed = cl.branchinfo(rev)
    index_of = self._namesreverse
    try:
        name_idx = index_of[branch]
    except KeyError:
        # first time this branch is seen: give it the next free index
        name_idx = len(self._names)
        self._names.append(branch)
        index_of[branch] = name_idx
    node = cl.node(rev)
    if closed:
        name_idx |= _rbccloseflag
    self._setcachedata(rev, node, name_idx)
    return branch, closed
def setdata(self, rev, changelogrevision):
    """Record the branch info carried by ``changelogrevision`` for ``rev``."""
    name, closed = changelogrevision.branchinfo
    index_of = self._namesreverse
    try:
        name_idx = index_of[name]
    except KeyError:
        # unknown branch so far: register it under the next free index
        name_idx = len(self._names)
        self._names.append(name)
        index_of[name] = name_idx
    if closed:
        name_idx |= _rbccloseflag
    self._setcachedata(rev, self._repo.changelog.node(rev), name_idx)
    # If no cache data were readable (non exists, bad permission, etc)
    # the cache was bypassing itself by setting:
    #
    #   self.branchinfo = self._branchinfo
    #
    # Since we now have data in the cache, we need to drop this bypassing.
    if 'branchinfo' in vars(self):
        del self.branchinfo
def _setcachedata(self, rev, node, branchidx):
    """Store the record for ``rev`` in the in-memory cache data.

    Nothing is stored for ``nullrev``. When a transaction is running, a
    write of the cache is registered as one of its finalize callbacks.
    """
    if rev == nullrev:
        return
    self._rbcrevs.pack_into(rev * _rbcrecsize, node, branchidx)
    # records from ``rev`` on no longer match what is on disk
    if rev < self._rbcrevslen:
        self._rbcrevslen = rev

    transaction = self._repo.currenttransaction()
    if transaction:
        transaction.addfinalize(b'write-revbranchcache', self.write)
def write(self, tr=None):
    """Write the names and records to disk when they differ from what was
    last read or written.

    The wlock is taken without waiting; I/O, abort and lock errors are
    reported through ``ui.debug`` instead of being raised.
    """
    repo = self._repo
    lock = None
    phase = b''
    try:
        # write the new names
        if self._force_overwrite or len(self._names) > self._rbcnamescount:
            lock = repo.wlock(wait=False)
            phase = b' names'
            self._writenames(repo)

        # write the new revs
        first_dirty = self._rbcrevslen * _rbcrecsize
        if self._force_overwrite or len(self._rbcrevs) != first_dirty:
            phase = b''
            if lock is None:
                lock = repo.wlock(wait=False)
            self._writerevs(repo, first_dirty)

    except (IOError, OSError, error.Abort, error.LockError) as inst:
        repo.ui.debug(
            b"couldn't write revision branch cache%s: %s\n"
            % (phase, stringutil.forcebytestr(inst))
        )
    finally:
        if lock is not None:
            lock.release()
def _writenames(self, repo):
    """Write the new branch names to revbranchcache.

    New names are appended when the on-disk file still has the size that
    was recorded at read/write time; otherwise (or when an overwrite is
    forced) the names file is rewritten from scratch after the records file
    that refers to it has been removed.
    """
    f = None
    if self._force_overwrite:
        self._rbcsnameslen = 0
        self._rbcnamescount = 0
    try:
        if self._force_overwrite or self._rbcnamescount != 0:
            f = repo.cachevfs.open(_rbcnames, b'ab')
            current_size = f.tell()
            if current_size == self._rbcsnameslen:
                # separator between the existing names and the new ones
                f.write(b'\0')
            else:
                f.close()
                if self._force_overwrite:
                    dbg = b"resetting content of %s\n"
                elif current_size > 0:
                    dbg = b"%s changed - rewriting it\n"
                else:
                    dbg = b"%s is missing - rewriting it\n"
                repo.ui.debug(dbg % _rbcnames)
                self._rbcnamescount = 0
                self._rbcrevslen = 0
        if self._rbcnamescount == 0:
            # The append handle may still be open here (forced overwrite of
            # an empty or missing file). Close it before reopening so it is
            # not leaked with a pending write.
            if f is not None:
                f.close()
            # before rewriting names, make sure references are removed
            repo.cachevfs.unlinkpath(_rbcrevs, ignoremissing=True)
            f = repo.cachevfs.open(_rbcnames, b'wb')
        names = self._names[self._rbcnamescount :]
        from_local = encoding.fromlocal
        data = b'\0'.join(from_local(b) for b in names)
        f.write(data)
        self._rbcsnameslen = f.tell()
    finally:
        if f is not None:
            f.close()
    self._rbcnamescount = len(self._names)
def _writerevs(self, repo, start):
    """Write the records from byte offset ``start`` to the records file.

    Depending on how the on-disk size compares with ``start``, the data is
    either written in place from ``start`` or the whole file is rewritten
    through an atomic temporary file.
    """
    rev_count = min(len(repo.changelog), len(self._rbcrevs) // _rbcrecsize)
    end = rev_count * _rbcrecsize
    if self._force_overwrite:
        start = 0

    # round the start offset down to a record boundary
    start -= start % _rbcrecsize

    # opening in append mode makes sure the file exists
    with repo.cachevfs.open(_rbcrevs, b'a+b'):
        pass
    with repo.cachevfs.open(_rbcrevs, b'r+b') as fp:
        fp.seek(0, os.SEEK_END)
        on_disk = fp.tell()
        if on_disk < start:
            start = 0
        if on_disk != start:
            if (max(end, on_disk) - start) >= on_disk * REWRITE_RATIO:
                start = 0
                repo.ui.debug(b"resetting content of cache/%s\n" % _rbcrevs)
            elif min(end, on_disk) - start > 0:
                # existing records are affected, overwrite the bad values
                msg = b"overwriting %d bytes from %d in cache/%s" % (
                    on_disk - start,
                    start,
                    _rbcrevs,
                )
                if end < on_disk:
                    msg += b" leaving (%d trailing bytes)" % (on_disk - end)
                msg += b'\n'
                repo.ui.debug(msg)
            else:
                # extra untouched data at the end, report it
                assert start == end  # since nothing gets written
                repo.ui.debug(
                    b"cache/%s contains %d unknown trailing bytes\n"
                    % (_rbcrevs, on_disk - start)
                )

        if start > 0:
            fp.seek(start)
            fp.write(self._rbcrevs.slice(start, end))
        else:
            fp.close()
            with repo.cachevfs.open(
                _rbcrevs,
                b'wb',
                atomictemp=True,
            ) as fresh:
                fresh.write(self._rbcrevs.slice(start, end))
    self._rbcrevslen = rev_count
    self._force_overwrite = False