datapack.py
473 lines
| 15.5 KiB
| text/x-python
|
PythonLexer
Augie Fackler
|
r40530 | from __future__ import absolute_import | ||
import struct | ||||
Augie Fackler
|
r40542 | import zlib | ||
Augie Fackler
|
r40530 | |||
from mercurial.node import hex, nullid | ||||
from mercurial.i18n import _ | ||||
from mercurial import ( | ||||
pycompat, | ||||
util, | ||||
) | ||||
from . import ( | ||||
basepack, | ||||
constants, | ||||
shallowutil, | ||||
) | ||||
# Length in bytes of a binary node hash as stored in pack entries.
NODELENGTH = 20

# The indicator value in the index for a fulltext entry.
FULLTEXTINDEXMARK = -1
# The indicator value in the index for an entry whose delta base is not
# present in this pack (see mutabledatapack.createindex).
NOBASEINDEXMARK = -2

# File suffixes for the two halves of a datapack (index and pack data).
INDEXSUFFIX = b'.dataidx'
PACKSUFFIX = b'.datapack'
|
class datapackstore(basepack.basepackstore):
    """A read-only store backed by a collection of datapack files.

    Lookups are delegated to the individual ``datapack`` objects.  Each
    accessor first consults the packs already loaded; on a complete miss
    it refreshes the pack list from disk and retries before raising
    ``KeyError``.  Entries cannot be added through this store.
    """

    INDEXSUFFIX = INDEXSUFFIX
    PACKSUFFIX = PACKSUFFIX

    def __init__(self, ui, path):
        super(datapackstore, self).__init__(ui, path)

    def getpack(self, path):
        # Factory used by basepackstore to open an individual pack file.
        return datapack(path)

    def get(self, name, node):
        raise RuntimeError(b"must use getdeltachain with datapackstore")

    def _frompacks(self, fetch, name, node):
        """Run ``fetch(pack)`` against each pack until one succeeds.

        Tries the currently loaded packs first; only if every one of
        them misses is the pack list refreshed from disk (``refresh()``
        is deliberately not called up front) and the newly discovered
        packs tried as well.  Raises ``KeyError((name, hex(node)))``
        when no pack has the entry.
        """
        for pack in self.packs:
            try:
                return fetch(pack)
            except KeyError:
                pass

        for pack in self.refresh():
            try:
                return fetch(pack)
            except KeyError:
                pass

        raise KeyError((name, hex(node)))

    def getmeta(self, name, node):
        """Return the metadata dict for ``(name, node)`` from the first
        pack that has it."""
        return self._frompacks(lambda pack: pack.getmeta(name, node), name, node)

    def getdelta(self, name, node):
        """Return ``(delta, filename, deltabasenode, meta)`` for
        ``(name, node)`` from the first pack that has it."""
        return self._frompacks(lambda pack: pack.getdelta(name, node), name, node)

    def getdeltachain(self, name, node):
        """Return the delta chain for ``(name, node)`` from the first
        pack that has it."""
        return self._frompacks(
            lambda pack: pack.getdeltachain(name, node), name, node
        )

    def add(self, name, node, data):
        raise RuntimeError(b"cannot add to datapackstore")
r40530 | |||
Augie Fackler
|
r43346 | |||
Augie Fackler
|
class datapack(basepack.basepack):
    """Read-only accessor for one on-disk datapack/dataidx file pair.

    See the ``mutabledatapack`` docstring for the full byte-level format
    of both files.  Data is read from ``self._data`` (the pack) and
    ``self._index`` (the index); both are provided by the ``basepack``
    base class and, per the comments below, are presumably mmap-backed —
    hence the periodic ``freememory()`` calls.
    """

    INDEXSUFFIX = INDEXSUFFIX
    PACKSUFFIX = PACKSUFFIX

    # Format is <node><delta offset><pack data offset><pack data size>
    # See the mutabledatapack doccomment for more details.
    INDEXFORMAT = b'!20siQQ'
    INDEXENTRYLENGTH = 40

    SUPPORTED_VERSIONS = [2]

    def getmissing(self, keys):
        """Return the subset of ``keys`` (``(name, node)`` pairs) whose
        node is not present in this pack's index."""
        missing = []
        for name, node in keys:
            value = self._find(node)
            if not value:
                missing.append((name, node))
        return missing

    def get(self, name, node):
        # Fulltext retrieval is not supported; callers must reconstruct
        # content from the delta chain.
        raise RuntimeError(
            b"must use getdeltachain with datapack (%s:%s)" % (name, hex(node))
        )

    def getmeta(self, name, node):
        """Return the parsed metadata dict for ``(name, node)``.

        Walks the raw pack entry field by field (filename, nodes, delta)
        to locate the trailing metadata-list without decompressing the
        delta.  Raises KeyError if the node is not in this pack.
        """
        value = self._find(node)
        if value is None:
            raise KeyError((name, hex(node)))

        node, deltabaseoffset, offset, size = value
        rawentry = self._data[offset : offset + size]

        # see docstring of mutabledatapack for the format
        offset = 0
        offset += struct.unpack_from(b'!H', rawentry, offset)[0] + 2  # filename
        offset += 40  # node, deltabase node
        offset += struct.unpack_from(b'!Q', rawentry, offset)[0] + 8  # delta

        metalen = struct.unpack_from(b'!I', rawentry, offset)[0]
        offset += 4
        meta = shallowutil.parsepackmeta(rawentry[offset : offset + metalen])

        return meta

    def getdelta(self, name, node):
        """Return ``(delta, filename, deltabasenode, meta)`` for
        ``(name, node)``; the delta is decompressed.  Raises KeyError on
        a miss."""
        value = self._find(node)
        if value is None:
            raise KeyError((name, hex(node)))

        node, deltabaseoffset, offset, size = value
        entry = self._readentry(offset, size, getmeta=True)
        filename, node, deltabasenode, delta, meta = entry

        # If we've read a lot of data from the mmap, free some memory.
        self.freememory()
        return delta, filename, deltabasenode, meta

    def getdeltachain(self, name, node):
        """Return the delta chain for ``(name, node)`` as a list of
        ``(filename, node, filename, deltabasenode, delta)`` tuples,
        ordered from the requested node toward its base.

        The chain is first computed entirely from the index (each entry
        stores the index offset of its delta base), terminating at a
        FULLTEXTINDEXMARK or NOBASEINDEXMARK sentinel; only then is the
        pack data read, so pack reads can be sequential.
        """
        value = self._find(node)
        if value is None:
            raise KeyError((name, hex(node)))

        params = self.params

        # Precompute chains
        chain = [value]
        deltabaseoffset = value[1]
        entrylen = self.INDEXENTRYLENGTH
        while (
            deltabaseoffset != FULLTEXTINDEXMARK
            and deltabaseoffset != NOBASEINDEXMARK
        ):
            loc = params.indexstart + deltabaseoffset
            value = struct.unpack(
                self.INDEXFORMAT, self._index[loc : loc + entrylen]
            )
            deltabaseoffset = value[1]
            chain.append(value)

        # Read chain data
        deltachain = []
        for node, deltabaseoffset, offset, size in chain:
            filename, node, deltabasenode, delta = self._readentry(offset, size)
            deltachain.append((filename, node, filename, deltabasenode, delta))

        # If we've read a lot of data from the mmap, free some memory.
        self.freememory()
        return deltachain

    def _readentry(self, offset, size, getmeta=False):
        """Parse one pack entry at ``offset`` of length ``size``.

        Returns ``(filename, node, deltabasenode, delta)`` with the
        delta decompressed, plus the parsed ``meta`` dict as a fifth
        element when ``getmeta`` is True.
        """
        rawentry = self._data[offset : offset + size]
        self._pagedin += len(rawentry)

        # <2 byte len> + <filename>
        lengthsize = 2
        filenamelen = struct.unpack(b'!H', rawentry[:2])[0]
        filename = rawentry[lengthsize : lengthsize + filenamelen]

        # <20 byte node> + <20 byte deltabase>
        nodestart = lengthsize + filenamelen
        deltabasestart = nodestart + NODELENGTH
        node = rawentry[nodestart:deltabasestart]
        deltabasenode = rawentry[deltabasestart : deltabasestart + NODELENGTH]

        # <8 byte len> + <delta>
        deltastart = deltabasestart + NODELENGTH
        rawdeltalen = rawentry[deltastart : deltastart + 8]
        deltalen = struct.unpack(b'!Q', rawdeltalen)[0]

        delta = rawentry[deltastart + 8 : deltastart + 8 + deltalen]
        delta = self._decompress(delta)

        if getmeta:
            metastart = deltastart + 8 + deltalen
            metalen = struct.unpack_from(b'!I', rawentry, metastart)[0]

            rawmeta = rawentry[metastart + 4 : metastart + 4 + metalen]
            meta = shallowutil.parsepackmeta(rawmeta)
            return filename, node, deltabasenode, delta, meta
        else:
            return filename, node, deltabasenode, delta

    def _decompress(self, data):
        # Pack entries are stored zlib-compressed (see mutabledatapack).
        return zlib.decompress(data)

    def add(self, name, node, data):
        # A datapack is immutable once written; use mutabledatapack.
        raise RuntimeError(b"cannot add to datapack (%s:%s)" % (name, node))

    def _find(self, node):
        """Locate ``node`` in the index.

        Returns the unpacked index entry
        ``(node, deltabaseoffset, offset, size)`` or None if absent.
        Uses the fanout table to narrow the search window, then bisects
        within it.
        """
        params = self.params
        fanoutkey = struct.unpack(
            params.fanoutstruct, node[: params.fanoutprefix]
        )[0]
        fanout = self._fanouttable

        start = fanout[fanoutkey] + params.indexstart
        indexend = self._indexend

        # Scan forward to find the first non-same entry, which is the upper
        # bound.
        for i in pycompat.xrange(fanoutkey + 1, params.fanoutcount):
            end = fanout[i] + params.indexstart
            if end != start:
                break
        else:
            # Last populated fanout bucket: window extends to index end.
            end = indexend

        # Bisect between start and end to find node
        index = self._index
        startnode = index[start : start + NODELENGTH]
        endnode = index[end : end + NODELENGTH]
        entrylen = self.INDEXENTRYLENGTH
        if startnode == node:
            entry = index[start : start + entrylen]
        elif endnode == node:
            entry = index[end : end + entrylen]
        else:
            while start < end - entrylen:
                mid = start + (end - start) // 2
                # Snap mid back onto an entry boundary.
                mid = mid - ((mid - params.indexstart) % entrylen)
                midnode = index[mid : mid + NODELENGTH]
                if midnode == node:
                    entry = index[mid : mid + entrylen]
                    break
                if node > midnode:
                    start = mid
                elif node < midnode:
                    end = mid
            else:
                # Loop exhausted without a break: node is not present.
                return None

        return struct.unpack(self.INDEXFORMAT, entry)

    def markledger(self, ledger, options=None):
        """Record every (filename, node) in this pack as a data entry in
        the repack ledger."""
        for filename, node in self:
            ledger.markdataentry(self, filename, node)

    def cleanup(self, ledger):
        """Delete this pack's files once every key in it has been
        repacked (or GC'ed), unless the ledger says this pack was just
        created."""
        entries = ledger.sources.get(self, [])
        allkeys = set(self)
        repackedkeys = {
            (e.filename, e.node) for e in entries if e.datarepacked or e.gced
        }

        if len(allkeys - repackedkeys) == 0:
            if self.path not in ledger.created:
                util.unlinkpath(self.indexpath, ignoremissing=True)
                util.unlinkpath(self.packpath, ignoremissing=True)

    def __iter__(self):
        # Yield (filename, node) for every revision in the pack.
        for f, n, deltabase, deltalen in self.iterentries():
            yield f, n

    def iterentries(self):
        """Sequentially scan the pack, yielding
        ``(filename, node, deltabase, uncompressedlen)`` per revision.

        Note: each delta is decompressed solely to learn its
        uncompressed length (see the TODO below).
        """
        # Start at 1 to skip the header
        offset = 1
        data = self._data
        while offset < self.datasize:
            oldoffset = offset

            # <2 byte len> + <filename>
            filenamelen = struct.unpack(b'!H', data[offset : offset + 2])[0]
            offset += 2
            filename = data[offset : offset + filenamelen]
            offset += filenamelen

            # <20 byte node>
            node = data[offset : offset + constants.NODESIZE]
            offset += constants.NODESIZE
            # <20 byte deltabase>
            deltabase = data[offset : offset + constants.NODESIZE]
            offset += constants.NODESIZE

            # <8 byte len> + <delta>
            rawdeltalen = data[offset : offset + 8]
            deltalen = struct.unpack(b'!Q', rawdeltalen)[0]
            offset += 8

            # TODO(augie): we should store a header that is the
            # uncompressed size.
            uncompressedlen = len(
                self._decompress(data[offset : offset + deltalen])
            )
            offset += deltalen

            # <4 byte len> + <metadata-list>
            metalen = struct.unpack_from(b'!I', data, offset)[0]
            offset += 4 + metalen

            yield (filename, node, deltabase, uncompressedlen)

            # If we've read a lot of data from the mmap, free some memory.
            self._pagedin += offset - oldoffset
            if self.freememory():
                data = self._data
Augie Fackler
|
r43346 | |||
Augie Fackler
|
class mutabledatapack(basepack.mutablebasepack):
    """A class for constructing and serializing a datapack file and index.

    A datapack is a pair of files that contain the revision contents for various
    file revisions in Mercurial. It contains only revision contents (like file
    contents), not any history information.

    It consists of two files, with the following format. All bytes are in
    network byte order (big endian).

    .datapack
        The pack itself is a series of revision deltas with some basic header
        information on each. A revision delta may be a fulltext, represented by
        a deltabasenode equal to the nullid.

        datapack = <version: 1 byte>
                   [<revision>,...]
        revision = <filename len: 2 byte unsigned int>
                   <filename>
                   <node: 20 byte>
                   <deltabasenode: 20 byte>
                   <delta len: 8 byte unsigned int>
                   <delta>
                   <metadata-list len: 4 byte unsigned int> [1]
                   <metadata-list> [1]
        metadata-list = [<metadata-item>, ...]
        metadata-item = <metadata-key: 1 byte>
                        <metadata-value len: 2 byte unsigned>
                        <metadata-value>

        metadata-key could be METAKEYFLAG or METAKEYSIZE or other single byte
        value in the future.

    .dataidx
        The index file consists of two parts, the fanout and the index.

        The index is a list of index entries, sorted by node (one per revision
        in the pack). Each entry has:

        - node (The 20 byte node of the entry; i.e. the commit hash, file node
                hash, etc)
        - deltabase index offset (The location in the index of the deltabase for
                                  this entry. The deltabase is the next delta in
                                  the chain, with the chain eventually
                                  terminating in a full-text, represented by a
                                  deltabase offset of -1. This lets us compute
                                  delta chains from the index, then do
                                  sequential reads from the pack if the revision
                                  are nearby on disk.)
        - pack entry offset (The location of this entry in the datapack)
        - pack content size (The on-disk length of this entry's pack data)

        The fanout is a quick lookup table to reduce the number of steps for
        bisecting the index. It is a series of 4 byte pointers to positions
        within the index. It has 2^16 entries, which corresponds to hash
        prefixes [0000, 0001,..., FFFE, FFFF]. Example: the pointer in slot
        4F0A points to the index position of the first revision whose node
        starts with 4F0A. This saves log(2^16)=16 bisect steps.

        dataidx = <fanouttable>
                  <index>
        fanouttable = [<index offset: 4 byte unsigned int>,...] (2^16 entries)
        index = [<index entry>,...]
        indexentry = <node: 20 byte>
                     <deltabase location: 4 byte signed int>
                     <pack entry offset: 8 byte unsigned int>
                     <pack entry size: 8 byte unsigned int>

    [1]: new in version 1.
    """

    INDEXSUFFIX = INDEXSUFFIX
    PACKSUFFIX = PACKSUFFIX

    # v[01] index format: <node><delta offset><pack data offset><pack data size>
    INDEXFORMAT = datapack.INDEXFORMAT
    INDEXENTRYLENGTH = datapack.INDEXENTRYLENGTH

    # v1 has metadata support
    SUPPORTED_VERSIONS = [2]

    def _compress(self, data):
        # Deltas are stored zlib-compressed; datapack._decompress is the
        # matching reader.
        return zlib.compress(data)

    def add(self, name, node, deltabasenode, delta, metadata=None):
        """Append one revision to the pack.

        ``metadata`` is a dict, ex. {METAKEYFLAG: flag}.  Raises
        RuntimeError for a filename too long for the 2-byte length field
        or a node that is not 20 bytes.  Adding an already-present node
        is a silent no-op.
        """
        # The on-disk filename length field is a 2-byte unsigned int
        # ('!H'), so the maximum representable length is 2**16 - 1; the
        # previous check used '>' and let a name of exactly 2**16 bytes
        # through, only to fail later inside struct.pack.
        if len(name) >= 2 ** 16:
            raise RuntimeError(_(b"name too long %s") % name)
        if len(node) != 20:
            raise RuntimeError(_(b"node should be 20 bytes %s") % node)

        if node in self.entries:
            # The revision has already been added
            return

        # TODO: allow configurable compression
        delta = self._compress(delta)

        rawdata = b''.join(
            (
                struct.pack(b'!H', len(name)),  # unsigned 2 byte int
                name,
                node,
                deltabasenode,
                struct.pack(b'!Q', len(delta)),  # unsigned 8 byte int
                delta,
            )
        )

        # v1 support metadata
        rawmeta = shallowutil.buildpackmeta(metadata)
        rawdata += struct.pack(b'!I', len(rawmeta))  # unsigned 4 byte
        rawdata += rawmeta

        offset = self.packfp.tell()

        size = len(rawdata)

        # Record where the entry lives so createindex() can point the
        # index at it.
        self.entries[node] = (deltabasenode, offset, size)

        self.writeraw(rawdata)

    def createindex(self, nodelocations, indexoffset):
        """Serialize the index entries for everything add()ed so far.

        Entries are emitted sorted by node.  ``nodelocations`` maps a
        node to its index offset; it is used to translate each entry's
        deltabase node into a direct pointer to the deltabase's index
        entry (FULLTEXTINDEXMARK for fulltexts, NOBASEINDEXMARK when the
        base is not in this pack).  Returns the raw index bytes.
        """
        entries = sorted(
            (n, db, o, s) for n, (db, o, s) in pycompat.iteritems(self.entries)
        )

        fmt = self.INDEXFORMAT
        # Collect the packed entries and join once at the end; repeated
        # bytes += in a loop is quadratic in the number of entries.
        rawchunks = []
        for node, deltabase, offset, size in entries:
            if deltabase == nullid:
                deltabaselocation = FULLTEXTINDEXMARK
            else:
                # Instead of storing the deltabase node in the index, let's
                # store a pointer directly to the index entry for the deltabase.
                deltabaselocation = nodelocations.get(
                    deltabase, NOBASEINDEXMARK
                )

            rawchunks.append(
                struct.pack(fmt, node, deltabaselocation, offset, size)
            )

        return b''.join(rawchunks)