revmap.py
259 lines
| 8.5 KiB
| text/x-python
|
PythonLexer
Augie Fackler
|
r39243 | # Copyright 2016-present Facebook. All Rights Reserved. | ||
# | ||||
# revmap: trivial hg hash - linelog rev bidirectional map | ||||
# | ||||
# This software may be used and distributed according to the terms of the | ||||
# GNU General Public License version 2 or any later version. | ||||
Matt Harbison
|
r52756 | from __future__ import annotations | ||
Augie Fackler
|
r39243 | |||
import bisect | ||||
Augie Fackler
|
r42767 | import io | ||
Augie Fackler
|
r39243 | import os | ||
import struct | ||||
from mercurial.node import hex | ||||
from mercurial import ( | ||||
error as hgerror, | ||||
) | ||||
from . import error | ||||
# the revmap file format is straightforward: | ||||
# | ||||
# 8 bytes: header | ||||
# 1 byte : flag for linelog revision 1 | ||||
# ? bytes: (optional) '\0'-terminated path string | ||||
# only exists if (flag & renameflag) != 0 | ||||
# 20 bytes: hg hash for linelog revision 1 | ||||
# 1 byte : flag for linelog revision 2 | ||||
# ? bytes: (optional) '\0'-terminated path string | ||||
# 20 bytes: hg hash for linelog revision 2 | ||||
# .... | ||||
# | ||||
# the implementation is kinda stupid: __init__ loads the whole revmap. | ||||
# no laziness. benchmark shows loading 10000 revisions is about 0.015 | ||||
# seconds, which looks enough for our use-case. if this implementation | ||||
# becomes a bottleneck, we can change it to lazily read the file | ||||
# from the end. | ||||
# whether the changeset is in the side branch. i.e. not in the linear main | ||||
# branch but only got referenced by lines in merge changesets. | ||||
sidebranchflag = 1 | ||||
# whether the changeset changes the file path (ie. is a rename) | ||||
renameflag = 2 | ||||
# len(mercurial.node.nullid) | ||||
_hshlen = 20 | ||||
Augie Fackler
|
r43346 | |||
Gregory Szorc
|
r49801 | class revmap: | ||
Augie Fackler
|
r39243 | """trivial hg bin hash - linelog rev bidirectional map | ||
also stores a flag (uint8) for each revision, and track renames. | ||||
""" | ||||
HEADER = b'REVMAP1\0' | ||||
def __init__(self, path=None): | ||||
"""create or load the revmap, optionally associate to a file | ||||
if path is None, the revmap is entirely in-memory. the caller is | ||||
responsible for locking. concurrent writes to a same file is unsafe. | ||||
the caller needs to make sure one file is associated to at most one | ||||
revmap object at a time.""" | ||||
self.path = path | ||||
self._rev2hsh = [None] | ||||
self._rev2flag = [None] | ||||
self._hsh2rev = {} | ||||
# since rename does not happen frequently, do not store path for every | ||||
# revision. self._renamerevs can be used for bisecting. | ||||
self._renamerevs = [0] | ||||
Augie Fackler
|
r43347 | self._renamepaths = [b''] | ||
Augie Fackler
|
r39243 | self._lastmaxrev = -1 | ||
if path: | ||||
if os.path.exists(path): | ||||
self._load() | ||||
else: | ||||
# write the header so "append" can do incremental updates | ||||
self.flush() | ||||
def copyfrom(self, rhs): | ||||
"""copy the map data from another revmap. do not affect self.path""" | ||||
self._rev2hsh = rhs._rev2hsh[:] | ||||
self._rev2flag = rhs._rev2flag[:] | ||||
self._hsh2rev = rhs._hsh2rev.copy() | ||||
self._renamerevs = rhs._renamerevs[:] | ||||
self._renamepaths = rhs._renamepaths[:] | ||||
self._lastmaxrev = -1 | ||||
@property | ||||
def maxrev(self): | ||||
"""return max linelog revision number""" | ||||
return len(self._rev2hsh) - 1 | ||||
def append(self, hsh, sidebranch=False, path=None, flush=False): | ||||
"""add a binary hg hash and return the mapped linelog revision. | ||||
if flush is True, incrementally update the file. | ||||
""" | ||||
if hsh in self._hsh2rev: | ||||
Augie Fackler
|
r43347 | raise error.CorruptedFileError( | ||
b'%r is in revmap already' % hex(hsh) | ||||
) | ||||
Augie Fackler
|
r39243 | if len(hsh) != _hshlen: | ||
Augie Fackler
|
r43347 | raise hgerror.ProgrammingError( | ||
b'hsh must be %d-char long' % _hshlen | ||||
) | ||||
Augie Fackler
|
r39243 | idx = len(self._rev2hsh) | ||
flag = 0 | ||||
if sidebranch: | ||||
flag |= sidebranchflag | ||||
if path is not None and path != self._renamepaths[-1]: | ||||
flag |= renameflag | ||||
self._renamerevs.append(idx) | ||||
self._renamepaths.append(path) | ||||
self._rev2hsh.append(hsh) | ||||
self._rev2flag.append(flag) | ||||
self._hsh2rev[hsh] = idx | ||||
if flush: | ||||
self.flush() | ||||
return idx | ||||
def rev2hsh(self, rev): | ||||
"""convert linelog revision to hg hash. return None if not found.""" | ||||
if rev > self.maxrev or rev < 0: | ||||
return None | ||||
return self._rev2hsh[rev] | ||||
def rev2flag(self, rev): | ||||
"""get the flag (uint8) for a given linelog revision. | ||||
return None if revision does not exist. | ||||
""" | ||||
if rev > self.maxrev or rev < 0: | ||||
return None | ||||
return self._rev2flag[rev] | ||||
def rev2path(self, rev): | ||||
"""get the path for a given linelog revision. | ||||
return None if revision does not exist. | ||||
""" | ||||
if rev > self.maxrev or rev < 0: | ||||
return None | ||||
idx = bisect.bisect_right(self._renamerevs, rev) - 1 | ||||
return self._renamepaths[idx] | ||||
def hsh2rev(self, hsh): | ||||
"""convert hg hash to linelog revision. return None if not found.""" | ||||
return self._hsh2rev.get(hsh) | ||||
def clear(self, flush=False): | ||||
"""make the map empty. if flush is True, write to disk""" | ||||
# rev 0 is reserved, real rev starts from 1 | ||||
self._rev2hsh = [None] | ||||
self._rev2flag = [None] | ||||
self._hsh2rev = {} | ||||
Augie Fackler
|
r43347 | self._rev2path = [b''] | ||
Augie Fackler
|
r39243 | self._lastmaxrev = -1 | ||
if flush: | ||||
self.flush() | ||||
def flush(self): | ||||
"""write the state down to the file""" | ||||
if not self.path: | ||||
return | ||||
Augie Fackler
|
r43346 | if self._lastmaxrev == -1: # write the entire file | ||
Matt Harbison
|
r53288 | with open(self.path, 'wb') as f: | ||
Augie Fackler
|
r39243 | f.write(self.HEADER) | ||
Manuel Jacob
|
r50179 | for i in range(1, len(self._rev2hsh)): | ||
Augie Fackler
|
r39243 | self._writerev(i, f) | ||
Augie Fackler
|
r43346 | else: # append incrementally | ||
Matt Harbison
|
r53288 | with open(self.path, 'ab') as f: | ||
Manuel Jacob
|
r50179 | for i in range(self._lastmaxrev + 1, len(self._rev2hsh)): | ||
Augie Fackler
|
r39243 | self._writerev(i, f) | ||
self._lastmaxrev = self.maxrev | ||||
def _load(self): | ||||
"""load state from file""" | ||||
if not self.path: | ||||
return | ||||
# use local variables in a loop. CPython uses LOAD_FAST for them, | ||||
# which is faster than both LOAD_CONST and LOAD_GLOBAL. | ||||
flaglen = 1 | ||||
hshlen = _hshlen | ||||
Matt Harbison
|
r53288 | with open(self.path, 'rb') as f: | ||
Augie Fackler
|
r39243 | if f.read(len(self.HEADER)) != self.HEADER: | ||
raise error.CorruptedFileError() | ||||
self.clear(flush=False) | ||||
while True: | ||||
buf = f.read(flaglen) | ||||
if not buf: | ||||
break | ||||
flag = ord(buf) | ||||
rev = len(self._rev2hsh) | ||||
if flag & renameflag: | ||||
path = self._readcstr(f) | ||||
self._renamerevs.append(rev) | ||||
self._renamepaths.append(path) | ||||
hsh = f.read(hshlen) | ||||
if len(hsh) != hshlen: | ||||
raise error.CorruptedFileError() | ||||
self._hsh2rev[hsh] = rev | ||||
self._rev2flag.append(flag) | ||||
self._rev2hsh.append(hsh) | ||||
self._lastmaxrev = self.maxrev | ||||
def _writerev(self, rev, f): | ||||
"""append a revision data to file""" | ||||
flag = self._rev2flag[rev] | ||||
hsh = self._rev2hsh[rev] | ||||
Augie Fackler
|
r43347 | f.write(struct.pack(b'B', flag)) | ||
Augie Fackler
|
r39243 | if flag & renameflag: | ||
path = self.rev2path(rev) | ||||
if path is None: | ||||
Augie Fackler
|
r43347 | raise error.CorruptedFileError(b'cannot find path for %s' % rev) | ||
Augie Fackler
|
r41151 | f.write(path + b'\0') | ||
Augie Fackler
|
r39243 | f.write(hsh) | ||
@staticmethod | ||||
def _readcstr(f): | ||||
"""read a C-language-like '\0'-terminated string""" | ||||
Augie Fackler
|
r43347 | buf = b'' | ||
Augie Fackler
|
r39243 | while True: | ||
ch = f.read(1) | ||||
Augie Fackler
|
r43346 | if not ch: # unexpected eof | ||
Augie Fackler
|
r39243 | raise error.CorruptedFileError() | ||
Augie Fackler
|
r43347 | if ch == b'\0': | ||
Augie Fackler
|
r39243 | break | ||
buf += ch | ||||
return buf | ||||
def __contains__(self, f): | ||||
"""(fctx or (node, path)) -> bool. | ||||
test if (node, path) is in the map, and is not in a side branch. | ||||
f can be either a tuple of (node, path), or a fctx. | ||||
""" | ||||
Augie Fackler
|
r43346 | if isinstance(f, tuple): # f: (node, path) | ||
Augie Fackler
|
r39243 | hsh, path = f | ||
Augie Fackler
|
r43346 | else: # f: fctx | ||
Augie Fackler
|
r39243 | hsh, path = f.node(), f.path() | ||
rev = self.hsh2rev(hsh) | ||||
if rev is None: | ||||
return False | ||||
if path is not None and path != self.rev2path(rev): | ||||
return False | ||||
return (self.rev2flag(rev) & sidebranchflag) == 0 | ||||
Augie Fackler
|
r43346 | |||
Augie Fackler
|
r39243 | def getlastnode(path): | ||
"""return the last hash in a revmap, without loading its full content. | ||||
this is equivalent to `m = revmap(path); m.rev2hsh(m.maxrev)`, but faster. | ||||
""" | ||||
hsh = None | ||||
try: | ||||
Matt Harbison
|
r53288 | with open(path, 'rb') as f: | ||
Augie Fackler
|
r42767 | f.seek(-_hshlen, io.SEEK_END) | ||
Augie Fackler
|
r39243 | if f.tell() > len(revmap.HEADER): | ||
hsh = f.read(_hshlen) | ||||
except IOError: | ||||
pass | ||||
return hsh | ||||