rev_cache.py
442 lines
| 16.2 KiB
| text/x-python
|
PythonLexer
r52794 | # rev_cache.py - caching branch information per revision | |||
# | ||||
# This software may be used and distributed according to the terms of the | ||||
# GNU General Public License version 2 or any later version. | ||||
from __future__ import annotations | ||||
r52798 | import os | |||
r52794 | import struct | |||
from ..node import ( | ||||
nullrev, | ||||
) | ||||
from .. import ( | ||||
encoding, | ||||
error, | ||||
util, | ||||
) | ||||
from ..utils import ( | ||||
stringutil, | ||||
) | ||||
calcsize = struct.calcsize | ||||
pack_into = struct.pack_into | ||||
unpack_from = struct.unpack_from | ||||
# Revision branch info cache | ||||
r52799 | # The "V2" version use the same format as the "V1" but garantee it won't be | |||
# truncated, preventing SIGBUS when it is mmap-ed | ||||
_rbcversion = b'-v2' | ||||
r52794 | _rbcnames = b'rbc-names' + _rbcversion | |||
_rbcrevs = b'rbc-revs' + _rbcversion | ||||
r52800 | _rbc_legacy_version = b'-v1' | |||
_rbc_legacy_names = b'rbc-names' + _rbc_legacy_version | ||||
_rbc_legacy_revs = b'rbc-revs' + _rbc_legacy_version | ||||
r52794 | # [4 byte hash prefix][4 byte branch name number with sign bit indicating open] | |||
_rbcrecfmt = b'>4sI' | ||||
_rbcrecsize = calcsize(_rbcrecfmt) | ||||
_rbcmininc = 64 * _rbcrecsize | ||||
_rbcnodelen = 4 | ||||
_rbcbranchidxmask = 0x7FFFFFFF | ||||
_rbccloseflag = 0x80000000 | ||||
r52798 | # with atomic replacement. | |||
REWRITE_RATIO = 0.2 | ||||
r52794 | class rbcrevs: | |||
"""a byte string consisting of an immutable prefix followed by a mutable suffix""" | ||||
def __init__(self, revs): | ||||
self._prefix = revs | ||||
self._rest = bytearray() | ||||
r52868 | @property | |||
def len_prefix(self): | ||||
size = len(self._prefix) | ||||
return size - (size % _rbcrecsize) | ||||
r52794 | def __len__(self): | |||
r52868 | return self.len_prefix + len(self._rest) | |||
r52794 | ||||
def unpack_record(self, rbcrevidx): | ||||
r52868 | if rbcrevidx < self.len_prefix: | |||
r52794 | return unpack_from(_rbcrecfmt, util.buffer(self._prefix), rbcrevidx) | |||
else: | ||||
return unpack_from( | ||||
_rbcrecfmt, | ||||
util.buffer(self._rest), | ||||
r52868 | rbcrevidx - self.len_prefix, | |||
r52794 | ) | |||
def make_mutable(self): | ||||
r52868 | if self.len_prefix > 0: | |||
r52794 | entirety = bytearray() | |||
r52868 | entirety[:] = self._prefix[: self.len_prefix] | |||
r52794 | entirety.extend(self._rest) | |||
self._rest = entirety | ||||
self._prefix = bytearray() | ||||
def truncate(self, pos): | ||||
self.make_mutable() | ||||
del self._rest[pos:] | ||||
def pack_into(self, rbcrevidx, node, branchidx): | ||||
r52868 | if rbcrevidx < self.len_prefix: | |||
r52794 | self.make_mutable() | |||
buf = self._rest | ||||
r52868 | start_offset = rbcrevidx - self.len_prefix | |||
r52794 | end_offset = start_offset + _rbcrecsize | |||
if len(self._rest) < end_offset: | ||||
# bytearray doesn't allocate extra space at least in Python 3.7. | ||||
# When multiple changesets are added in a row, precise resize would | ||||
# result in quadratic complexity. Overallocate to compensate by | ||||
# using the classic doubling technique for dynamic arrays instead. | ||||
# If there was a gap in the map before, less space will be reserved. | ||||
self._rest.extend(b'\0' * end_offset) | ||||
return pack_into( | ||||
_rbcrecfmt, | ||||
buf, | ||||
start_offset, | ||||
node, | ||||
branchidx, | ||||
) | ||||
def extend(self, extension): | ||||
return self._rest.extend(extension) | ||||
def slice(self, begin, end): | ||||
r52868 | if begin < self.len_prefix: | |||
r52794 | acc = bytearray() | |||
r52868 | acc[:] = self._prefix[begin : min(end, self.len_prefix)] | |||
r52794 | acc.extend( | |||
r52868 | self._rest[begin - self.len_prefix : end - self.len_prefix] | |||
r52794 | ) | |||
return acc | ||||
r52868 | return self._rest[begin - self.len_prefix : end - self.len_prefix] | |||
r52794 | ||||
class revbranchcache: | ||||
"""Persistent cache, mapping from revision number to branch name and close. | ||||
This is a low level cache, independent of filtering. | ||||
Branch names are stored in rbc-names in internal encoding separated by 0. | ||||
rbc-names is append-only, and each branch name is only stored once and will | ||||
thus have a unique index. | ||||
The branch info for each revision is stored in rbc-revs as constant size | ||||
records. The whole file is read into memory, but it is only 'parsed' on | ||||
demand. The file is usually append-only but will be truncated if repo | ||||
modification is detected. | ||||
The record for each revision contains the first 4 bytes of the | ||||
corresponding node hash, and the record is only used if it still matches. | ||||
Even a completely trashed rbc-revs fill thus still give the right result | ||||
while converging towards full recovery ... assuming no incorrectly matching | ||||
node hashes. | ||||
The record also contains 4 bytes where 31 bits contains the index of the | ||||
branch and the last bit indicate that it is a branch close commit. | ||||
The usage pattern for rbc-revs is thus somewhat similar to 00changelog.i | ||||
and will grow with it but be 1/8th of its size. | ||||
""" | ||||
def __init__(self, repo, readonly=True): | ||||
assert repo.filtername is None | ||||
self._repo = repo | ||||
self._names = [] # branch names in local encoding with static index | ||||
self._rbcrevs = rbcrevs(bytearray()) | ||||
self._rbcsnameslen = 0 # length of names read at _rbcsnameslen | ||||
r52801 | self._force_overwrite = False | |||
r52800 | v1_fallback = False | |||
r52794 | try: | |||
r52800 | try: | |||
bndata = repo.cachevfs.read(_rbcnames) | ||||
except (IOError, OSError): | ||||
# If we don't have "v2" data, we might have "v1" data worth | ||||
# using. | ||||
# | ||||
# consider stop doing this many version after hg-6.9 release | ||||
bndata = repo.cachevfs.read(_rbc_legacy_names) | ||||
v1_fallback = True | ||||
r52801 | self._force_overwrite = True | |||
r52794 | self._rbcsnameslen = len(bndata) # for verification before writing | |||
if bndata: | ||||
self._names = [ | ||||
encoding.tolocal(bn) for bn in bndata.split(b'\0') | ||||
] | ||||
except (IOError, OSError): | ||||
if readonly: | ||||
# don't try to use cache - fall back to the slow path | ||||
self.branchinfo = self._branchinfo | ||||
if self._names: | ||||
try: | ||||
usemmap = repo.ui.configbool(b'storage', b'revbranchcache.mmap') | ||||
r52800 | if not v1_fallback: | |||
with repo.cachevfs(_rbcrevs) as fp: | ||||
if usemmap and repo.cachevfs.is_mmap_safe(_rbcrevs): | ||||
data = util.buffer(util.mmapread(fp)) | ||||
else: | ||||
data = fp.read() | ||||
else: | ||||
# If we don't have "v2" data, we might have "v1" data worth | ||||
# using. | ||||
# | ||||
# Consider stop doing this many version after hg-6.9 | ||||
# release. | ||||
with repo.cachevfs(_rbc_legacy_revs) as fp: | ||||
r52794 | data = fp.read() | |||
self._rbcrevs = rbcrevs(data) | ||||
except (IOError, OSError) as inst: | ||||
repo.ui.debug( | ||||
b"couldn't read revision branch cache: %s\n" | ||||
% stringutil.forcebytestr(inst) | ||||
) | ||||
# remember number of good records on disk | ||||
self._rbcrevslen = min( | ||||
len(self._rbcrevs) // _rbcrecsize, len(repo.changelog) | ||||
) | ||||
if self._rbcrevslen == 0: | ||||
self._names = [] | ||||
self._rbcnamescount = len(self._names) # number of names read at | ||||
# _rbcsnameslen | ||||
def _clear(self): | ||||
self._rbcsnameslen = 0 | ||||
del self._names[:] | ||||
self._rbcnamescount = 0 | ||||
self._rbcrevslen = len(self._repo.changelog) | ||||
self._rbcrevs = rbcrevs(bytearray(self._rbcrevslen * _rbcrecsize)) | ||||
util.clearcachedproperty(self, b'_namesreverse') | ||||
r52796 | self._force_overwrite = True | |||
def invalidate(self, rev=0): | ||||
self._rbcrevslen = rev | ||||
self._rbcrevs.truncate(rev) | ||||
self._force_overwrite = True | ||||
r52794 | ||||
@util.propertycache | ||||
def _namesreverse(self): | ||||
return {b: r for r, b in enumerate(self._names)} | ||||
def branchinfo(self, rev): | ||||
"""Return branch name and close flag for rev, using and updating | ||||
persistent cache.""" | ||||
changelog = self._repo.changelog | ||||
rbcrevidx = rev * _rbcrecsize | ||||
# avoid negative index, changelog.read(nullrev) is fast without cache | ||||
if rev == nullrev: | ||||
return changelog.branchinfo(rev) | ||||
# if requested rev isn't allocated, grow and cache the rev info | ||||
if len(self._rbcrevs) < rbcrevidx + _rbcrecsize: | ||||
return self._branchinfo(rev) | ||||
# fast path: extract data from cache, use it if node is matching | ||||
reponode = changelog.node(rev)[:_rbcnodelen] | ||||
cachenode, branchidx = self._rbcrevs.unpack_record(rbcrevidx) | ||||
close = bool(branchidx & _rbccloseflag) | ||||
if close: | ||||
branchidx &= _rbcbranchidxmask | ||||
if cachenode == b'\0\0\0\0': | ||||
pass | ||||
elif cachenode == reponode: | ||||
try: | ||||
return self._names[branchidx], close | ||||
except IndexError: | ||||
# recover from invalid reference to unknown branch | ||||
self._repo.ui.debug( | ||||
b"referenced branch names not found" | ||||
b" - rebuilding revision branch cache from scratch\n" | ||||
) | ||||
self._clear() | ||||
else: | ||||
# rev/node map has changed, invalidate the cache from here up | ||||
self._repo.ui.debug( | ||||
b"history modification detected - truncating " | ||||
b"revision branch cache to revision %d\n" % rev | ||||
) | ||||
truncate = rbcrevidx + _rbcrecsize | ||||
self._rbcrevs.truncate(truncate) | ||||
self._rbcrevslen = min(self._rbcrevslen, truncate) | ||||
# fall back to slow path and make sure it will be written to disk | ||||
return self._branchinfo(rev) | ||||
def _branchinfo(self, rev): | ||||
"""Retrieve branch info from changelog and update _rbcrevs""" | ||||
changelog = self._repo.changelog | ||||
b, close = changelog.branchinfo(rev) | ||||
if b in self._namesreverse: | ||||
branchidx = self._namesreverse[b] | ||||
else: | ||||
branchidx = len(self._names) | ||||
self._names.append(b) | ||||
self._namesreverse[b] = branchidx | ||||
reponode = changelog.node(rev) | ||||
if close: | ||||
branchidx |= _rbccloseflag | ||||
self._setcachedata(rev, reponode, branchidx) | ||||
return b, close | ||||
def setdata(self, rev, changelogrevision): | ||||
"""add new data information to the cache""" | ||||
branch, close = changelogrevision.branchinfo | ||||
if branch in self._namesreverse: | ||||
branchidx = self._namesreverse[branch] | ||||
else: | ||||
branchidx = len(self._names) | ||||
self._names.append(branch) | ||||
self._namesreverse[branch] = branchidx | ||||
if close: | ||||
branchidx |= _rbccloseflag | ||||
self._setcachedata(rev, self._repo.changelog.node(rev), branchidx) | ||||
# If no cache data were readable (non exists, bad permission, etc) | ||||
# the cache was bypassing itself by setting: | ||||
# | ||||
# self.branchinfo = self._branchinfo | ||||
# | ||||
# Since we now have data in the cache, we need to drop this bypassing. | ||||
if 'branchinfo' in vars(self): | ||||
del self.branchinfo | ||||
def _setcachedata(self, rev, node, branchidx): | ||||
"""Writes the node's branch data to the in-memory cache data.""" | ||||
if rev == nullrev: | ||||
return | ||||
rbcrevidx = rev * _rbcrecsize | ||||
self._rbcrevs.pack_into(rbcrevidx, node, branchidx) | ||||
self._rbcrevslen = min(self._rbcrevslen, rev) | ||||
tr = self._repo.currenttransaction() | ||||
if tr: | ||||
tr.addfinalize(b'write-revbranchcache', self.write) | ||||
def write(self, tr=None): | ||||
"""Save branch cache if it is dirty.""" | ||||
repo = self._repo | ||||
wlock = None | ||||
step = b'' | ||||
try: | ||||
# write the new names | ||||
r52801 | if self._force_overwrite or self._rbcnamescount < len(self._names): | |||
r52794 | wlock = repo.wlock(wait=False) | |||
step = b' names' | ||||
self._writenames(repo) | ||||
# write the new revs | ||||
start = self._rbcrevslen * _rbcrecsize | ||||
r52796 | if self._force_overwrite or start != len(self._rbcrevs): | |||
r52794 | step = b'' | |||
if wlock is None: | ||||
wlock = repo.wlock(wait=False) | ||||
self._writerevs(repo, start) | ||||
except (IOError, OSError, error.Abort, error.LockError) as inst: | ||||
repo.ui.debug( | ||||
b"couldn't write revision branch cache%s: %s\n" | ||||
% (step, stringutil.forcebytestr(inst)) | ||||
) | ||||
finally: | ||||
if wlock is not None: | ||||
wlock.release() | ||||
def _writenames(self, repo): | ||||
"""write the new branch names to revbranchcache""" | ||||
r52797 | f = None | |||
r52801 | if self._force_overwrite: | |||
self._rbcsnameslen = 0 | ||||
self._rbcnamescount = 0 | ||||
r52797 | try: | |||
r52801 | if self._force_overwrite or self._rbcnamescount != 0: | |||
r52797 | f = repo.cachevfs.open(_rbcnames, b'ab') | |||
r52801 | current_size = f.tell() | |||
if current_size == self._rbcsnameslen: | ||||
r52797 | f.write(b'\0') | |||
else: | ||||
f.close() | ||||
r52801 | if self._force_overwrite: | |||
dbg = b"resetting content of %s\n" | ||||
elif current_size > 0: | ||||
dbg = b"%s changed - rewriting it\n" | ||||
else: | ||||
dbg = b"%s is missing - rewriting it\n" | ||||
repo.ui.debug(dbg % _rbcnames) | ||||
r52797 | self._rbcnamescount = 0 | |||
self._rbcrevslen = 0 | ||||
if self._rbcnamescount == 0: | ||||
# before rewriting names, make sure references are removed | ||||
repo.cachevfs.unlinkpath(_rbcrevs, ignoremissing=True) | ||||
f = repo.cachevfs.open(_rbcnames, b'wb') | ||||
names = self._names[self._rbcnamescount :] | ||||
from_local = encoding.fromlocal | ||||
data = b'\0'.join(from_local(b) for b in names) | ||||
f.write(data) | ||||
self._rbcsnameslen = f.tell() | ||||
finally: | ||||
if f is not None: | ||||
r52794 | f.close() | |||
self._rbcnamescount = len(self._names) | ||||
def _writerevs(self, repo, start): | ||||
"""write the new revs to revbranchcache""" | ||||
revs = min(len(repo.changelog), len(self._rbcrevs) // _rbcrecsize) | ||||
r52798 | ||||
end = revs * _rbcrecsize | ||||
r52796 | if self._force_overwrite: | |||
start = 0 | ||||
r52798 | ||||
r52868 | # align start on entry boundary | |||
start = _rbcrecsize * (start // _rbcrecsize) | ||||
r52798 | with repo.cachevfs.open(_rbcrevs, b'a+b') as f: | |||
pass # this make sure the file exist… | ||||
with repo.cachevfs.open(_rbcrevs, b'r+b') as f: | ||||
f.seek(0, os.SEEK_END) | ||||
r52795 | current_size = f.tell() | |||
if current_size < start: | ||||
start = 0 | ||||
if current_size != start: | ||||
r52798 | threshold = current_size * REWRITE_RATIO | |||
r52867 | overwritten = min(end, current_size) - start | |||
if (max(end, current_size) - start) >= threshold: | ||||
start = 0 | ||||
dbg = b"resetting content of cache/%s\n" % _rbcrevs | ||||
repo.ui.debug(dbg) | ||||
elif overwritten > 0: | ||||
# end affected, let us overwrite the bad value | ||||
r52798 | dbg = b"overwriting %d bytes from %d in cache/%s" | |||
r52867 | dbg %= (current_size - start, start, _rbcrevs) | |||
r52798 | if end < current_size: | |||
extra = b" leaving (%d trailing bytes)" | ||||
extra %= current_size - end | ||||
dbg += extra | ||||
dbg += b'\n' | ||||
repo.ui.debug(dbg) | ||||
else: | ||||
r52867 | # extra untouched data at the end, lets warn about them | |||
assert start == end # since don't write anything | ||||
dbg = b"cache/%s contains %d unknown trailing bytes\n" | ||||
dbg %= (_rbcrevs, current_size - start) | ||||
r52798 | repo.ui.debug(dbg) | |||
r52867 | ||||
r52798 | if start > 0: | |||
r52794 | f.seek(start) | |||
r52798 | f.write(self._rbcrevs.slice(start, end)) | |||
else: | ||||
f.close() | ||||
with repo.cachevfs.open( | ||||
_rbcrevs, | ||||
b'wb', | ||||
atomictemp=True, | ||||
) as rev_file: | ||||
rev_file.write(self._rbcrevs.slice(start, end)) | ||||
r52794 | self._rbcrevslen = revs | |||
r52796 | self._force_overwrite = False | |||