datapack.py
458 lines
| 15.4 KiB
| text/x-python
|
PythonLexer
Augie Fackler
|
r40530 | from __future__ import absolute_import | ||
import struct | ||||
Augie Fackler
|
r40542 | import zlib | ||
Augie Fackler
|
r40530 | |||
from mercurial.node import hex, nullid | ||||
from mercurial.i18n import _ | ||||
from mercurial import ( | ||||
pycompat, | ||||
util, | ||||
) | ||||
from . import ( | ||||
basepack, | ||||
constants, | ||||
shallowutil, | ||||
) | ||||
NODELENGTH = 20 | ||||
# The indicator value in the index for a fulltext entry. | ||||
FULLTEXTINDEXMARK = -1 | ||||
NOBASEINDEXMARK = -2 | ||||
INDEXSUFFIX = '.dataidx' | ||||
PACKSUFFIX = '.datapack' | ||||
class datapackstore(basepack.basepackstore): | ||||
INDEXSUFFIX = INDEXSUFFIX | ||||
PACKSUFFIX = PACKSUFFIX | ||||
def __init__(self, ui, path): | ||||
super(datapackstore, self).__init__(ui, path) | ||||
def getpack(self, path): | ||||
return datapack(path) | ||||
def get(self, name, node): | ||||
raise RuntimeError("must use getdeltachain with datapackstore") | ||||
def getmeta(self, name, node): | ||||
for pack in self.packs: | ||||
try: | ||||
return pack.getmeta(name, node) | ||||
except KeyError: | ||||
pass | ||||
for pack in self.refresh(): | ||||
try: | ||||
return pack.getmeta(name, node) | ||||
except KeyError: | ||||
pass | ||||
raise KeyError((name, hex(node))) | ||||
def getdelta(self, name, node): | ||||
for pack in self.packs: | ||||
try: | ||||
return pack.getdelta(name, node) | ||||
except KeyError: | ||||
pass | ||||
for pack in self.refresh(): | ||||
try: | ||||
return pack.getdelta(name, node) | ||||
except KeyError: | ||||
pass | ||||
raise KeyError((name, hex(node))) | ||||
def getdeltachain(self, name, node): | ||||
for pack in self.packs: | ||||
try: | ||||
return pack.getdeltachain(name, node) | ||||
except KeyError: | ||||
pass | ||||
for pack in self.refresh(): | ||||
try: | ||||
return pack.getdeltachain(name, node) | ||||
except KeyError: | ||||
pass | ||||
raise KeyError((name, hex(node))) | ||||
def add(self, name, node, data): | ||||
raise RuntimeError("cannot add to datapackstore") | ||||
class datapack(basepack.basepack): | ||||
INDEXSUFFIX = INDEXSUFFIX | ||||
PACKSUFFIX = PACKSUFFIX | ||||
# Format is <node><delta offset><pack data offset><pack data size> | ||||
# See the mutabledatapack doccomment for more details. | ||||
INDEXFORMAT = '!20siQQ' | ||||
INDEXENTRYLENGTH = 40 | ||||
Augie Fackler
|
r40541 | SUPPORTED_VERSIONS = [2] | ||
Augie Fackler
|
r40530 | |||
def getmissing(self, keys): | ||||
missing = [] | ||||
for name, node in keys: | ||||
value = self._find(node) | ||||
if not value: | ||||
missing.append((name, node)) | ||||
return missing | ||||
def get(self, name, node): | ||||
raise RuntimeError("must use getdeltachain with datapack (%s:%s)" | ||||
% (name, hex(node))) | ||||
def getmeta(self, name, node): | ||||
value = self._find(node) | ||||
if value is None: | ||||
raise KeyError((name, hex(node))) | ||||
node, deltabaseoffset, offset, size = value | ||||
rawentry = self._data[offset:offset + size] | ||||
# see docstring of mutabledatapack for the format | ||||
offset = 0 | ||||
offset += struct.unpack_from('!H', rawentry, offset)[0] + 2 # filename | ||||
offset += 40 # node, deltabase node | ||||
offset += struct.unpack_from('!Q', rawentry, offset)[0] + 8 # delta | ||||
metalen = struct.unpack_from('!I', rawentry, offset)[0] | ||||
offset += 4 | ||||
meta = shallowutil.parsepackmeta(rawentry[offset:offset + metalen]) | ||||
return meta | ||||
def getdelta(self, name, node): | ||||
value = self._find(node) | ||||
if value is None: | ||||
raise KeyError((name, hex(node))) | ||||
node, deltabaseoffset, offset, size = value | ||||
entry = self._readentry(offset, size, getmeta=True) | ||||
filename, node, deltabasenode, delta, meta = entry | ||||
# If we've read a lot of data from the mmap, free some memory. | ||||
self.freememory() | ||||
return delta, filename, deltabasenode, meta | ||||
def getdeltachain(self, name, node): | ||||
value = self._find(node) | ||||
if value is None: | ||||
raise KeyError((name, hex(node))) | ||||
params = self.params | ||||
# Precompute chains | ||||
chain = [value] | ||||
deltabaseoffset = value[1] | ||||
entrylen = self.INDEXENTRYLENGTH | ||||
while (deltabaseoffset != FULLTEXTINDEXMARK | ||||
and deltabaseoffset != NOBASEINDEXMARK): | ||||
loc = params.indexstart + deltabaseoffset | ||||
value = struct.unpack(self.INDEXFORMAT, | ||||
self._index[loc:loc + entrylen]) | ||||
deltabaseoffset = value[1] | ||||
chain.append(value) | ||||
# Read chain data | ||||
deltachain = [] | ||||
for node, deltabaseoffset, offset, size in chain: | ||||
filename, node, deltabasenode, delta = self._readentry(offset, size) | ||||
deltachain.append((filename, node, filename, deltabasenode, delta)) | ||||
# If we've read a lot of data from the mmap, free some memory. | ||||
self.freememory() | ||||
return deltachain | ||||
def _readentry(self, offset, size, getmeta=False): | ||||
rawentry = self._data[offset:offset + size] | ||||
self._pagedin += len(rawentry) | ||||
# <2 byte len> + <filename> | ||||
lengthsize = 2 | ||||
filenamelen = struct.unpack('!H', rawentry[:2])[0] | ||||
filename = rawentry[lengthsize:lengthsize + filenamelen] | ||||
# <20 byte node> + <20 byte deltabase> | ||||
nodestart = lengthsize + filenamelen | ||||
deltabasestart = nodestart + NODELENGTH | ||||
node = rawentry[nodestart:deltabasestart] | ||||
deltabasenode = rawentry[deltabasestart:deltabasestart + NODELENGTH] | ||||
# <8 byte len> + <delta> | ||||
deltastart = deltabasestart + NODELENGTH | ||||
rawdeltalen = rawentry[deltastart:deltastart + 8] | ||||
deltalen = struct.unpack('!Q', rawdeltalen)[0] | ||||
delta = rawentry[deltastart + 8:deltastart + 8 + deltalen] | ||||
Augie Fackler
|
r40542 | delta = self._decompress(delta) | ||
Augie Fackler
|
r40530 | |||
if getmeta: | ||||
Augie Fackler
|
r40541 | metastart = deltastart + 8 + deltalen | ||
metalen = struct.unpack_from('!I', rawentry, metastart)[0] | ||||
Augie Fackler
|
r40530 | |||
Augie Fackler
|
r40541 | rawmeta = rawentry[metastart + 4:metastart + 4 + metalen] | ||
meta = shallowutil.parsepackmeta(rawmeta) | ||||
Augie Fackler
|
r40530 | return filename, node, deltabasenode, delta, meta | ||
else: | ||||
return filename, node, deltabasenode, delta | ||||
Augie Fackler
|
r40542 | def _decompress(self, data): | ||
return zlib.decompress(data) | ||||
Augie Fackler
|
r40530 | def add(self, name, node, data): | ||
raise RuntimeError("cannot add to datapack (%s:%s)" % (name, node)) | ||||
def _find(self, node): | ||||
params = self.params | ||||
fanoutkey = struct.unpack(params.fanoutstruct, | ||||
node[:params.fanoutprefix])[0] | ||||
fanout = self._fanouttable | ||||
start = fanout[fanoutkey] + params.indexstart | ||||
indexend = self._indexend | ||||
# Scan forward to find the first non-same entry, which is the upper | ||||
# bound. | ||||
for i in pycompat.xrange(fanoutkey + 1, params.fanoutcount): | ||||
end = fanout[i] + params.indexstart | ||||
if end != start: | ||||
break | ||||
else: | ||||
end = indexend | ||||
# Bisect between start and end to find node | ||||
index = self._index | ||||
startnode = index[start:start + NODELENGTH] | ||||
endnode = index[end:end + NODELENGTH] | ||||
entrylen = self.INDEXENTRYLENGTH | ||||
if startnode == node: | ||||
entry = index[start:start + entrylen] | ||||
elif endnode == node: | ||||
entry = index[end:end + entrylen] | ||||
else: | ||||
while start < end - entrylen: | ||||
Boris Feld
|
r41668 | mid = start + (end - start) // 2 | ||
mid = mid - ((mid - params.indexstart) % entrylen) | ||||
Augie Fackler
|
r40530 | midnode = index[mid:mid + NODELENGTH] | ||
if midnode == node: | ||||
entry = index[mid:mid + entrylen] | ||||
break | ||||
if node > midnode: | ||||
start = mid | ||||
elif node < midnode: | ||||
end = mid | ||||
else: | ||||
return None | ||||
return struct.unpack(self.INDEXFORMAT, entry) | ||||
def markledger(self, ledger, options=None): | ||||
for filename, node in self: | ||||
ledger.markdataentry(self, filename, node) | ||||
def cleanup(self, ledger): | ||||
entries = ledger.sources.get(self, []) | ||||
allkeys = set(self) | ||||
repackedkeys = set((e.filename, e.node) for e in entries if | ||||
e.datarepacked or e.gced) | ||||
if len(allkeys - repackedkeys) == 0: | ||||
if self.path not in ledger.created: | ||||
util.unlinkpath(self.indexpath, ignoremissing=True) | ||||
util.unlinkpath(self.packpath, ignoremissing=True) | ||||
def __iter__(self): | ||||
for f, n, deltabase, deltalen in self.iterentries(): | ||||
yield f, n | ||||
def iterentries(self): | ||||
# Start at 1 to skip the header | ||||
offset = 1 | ||||
data = self._data | ||||
while offset < self.datasize: | ||||
oldoffset = offset | ||||
# <2 byte len> + <filename> | ||||
filenamelen = struct.unpack('!H', data[offset:offset + 2])[0] | ||||
offset += 2 | ||||
filename = data[offset:offset + filenamelen] | ||||
offset += filenamelen | ||||
# <20 byte node> | ||||
node = data[offset:offset + constants.NODESIZE] | ||||
offset += constants.NODESIZE | ||||
# <20 byte deltabase> | ||||
deltabase = data[offset:offset + constants.NODESIZE] | ||||
offset += constants.NODESIZE | ||||
# <8 byte len> + <delta> | ||||
rawdeltalen = data[offset:offset + 8] | ||||
deltalen = struct.unpack('!Q', rawdeltalen)[0] | ||||
offset += 8 | ||||
Augie Fackler
|
r40542 | # TODO(augie): we should store a header that is the | ||
# uncompressed size. | ||||
uncompressedlen = len(self._decompress( | ||||
data[offset:offset + deltalen])) | ||||
Augie Fackler
|
r40530 | offset += deltalen | ||
Augie Fackler
|
r40541 | # <4 byte len> + <metadata-list> | ||
metalen = struct.unpack_from('!I', data, offset)[0] | ||||
offset += 4 + metalen | ||||
Augie Fackler
|
r40530 | |||
yield (filename, node, deltabase, uncompressedlen) | ||||
# If we've read a lot of data from the mmap, free some memory. | ||||
self._pagedin += offset - oldoffset | ||||
if self.freememory(): | ||||
data = self._data | ||||
class mutabledatapack(basepack.mutablebasepack): | ||||
"""A class for constructing and serializing a datapack file and index. | ||||
A datapack is a pair of files that contain the revision contents for various | ||||
file revisions in Mercurial. It contains only revision contents (like file | ||||
contents), not any history information. | ||||
It consists of two files, with the following format. All bytes are in | ||||
network byte order (big endian). | ||||
.datapack | ||||
The pack itself is a series of revision deltas with some basic header | ||||
information on each. A revision delta may be a fulltext, represented by | ||||
a deltabasenode equal to the nullid. | ||||
datapack = <version: 1 byte> | ||||
[<revision>,...] | ||||
revision = <filename len: 2 byte unsigned int> | ||||
<filename> | ||||
<node: 20 byte> | ||||
<deltabasenode: 20 byte> | ||||
<delta len: 8 byte unsigned int> | ||||
<delta> | ||||
<metadata-list len: 4 byte unsigned int> [1] | ||||
<metadata-list> [1] | ||||
metadata-list = [<metadata-item>, ...] | ||||
metadata-item = <metadata-key: 1 byte> | ||||
<metadata-value len: 2 byte unsigned> | ||||
<metadata-value> | ||||
metadata-key could be METAKEYFLAG or METAKEYSIZE or other single byte | ||||
value in the future. | ||||
.dataidx | ||||
The index file consists of two parts, the fanout and the index. | ||||
The index is a list of index entries, sorted by node (one per revision | ||||
in the pack). Each entry has: | ||||
- node (The 20 byte node of the entry; i.e. the commit hash, file node | ||||
hash, etc) | ||||
- deltabase index offset (The location in the index of the deltabase for | ||||
this entry. The deltabase is the next delta in | ||||
the chain, with the chain eventually | ||||
terminating in a full-text, represented by a | ||||
deltabase offset of -1. This lets us compute | ||||
delta chains from the index, then do | ||||
sequential reads from the pack if the revision | ||||
are nearby on disk.) | ||||
- pack entry offset (The location of this entry in the datapack) | ||||
- pack content size (The on-disk length of this entry's pack data) | ||||
The fanout is a quick lookup table to reduce the number of steps for | ||||
bisecting the index. It is a series of 4 byte pointers to positions | ||||
within the index. It has 2^16 entries, which corresponds to hash | ||||
prefixes [0000, 0001,..., FFFE, FFFF]. Example: the pointer in slot | ||||
4F0A points to the index position of the first revision whose node | ||||
starts with 4F0A. This saves log(2^16)=16 bisect steps. | ||||
dataidx = <fanouttable> | ||||
<index> | ||||
fanouttable = [<index offset: 4 byte unsigned int>,...] (2^16 entries) | ||||
index = [<index entry>,...] | ||||
indexentry = <node: 20 byte> | ||||
<deltabase location: 4 byte signed int> | ||||
<pack entry offset: 8 byte unsigned int> | ||||
<pack entry size: 8 byte unsigned int> | ||||
[1]: new in version 1. | ||||
""" | ||||
INDEXSUFFIX = INDEXSUFFIX | ||||
PACKSUFFIX = PACKSUFFIX | ||||
# v[01] index format: <node><delta offset><pack data offset><pack data size> | ||||
INDEXFORMAT = datapack.INDEXFORMAT | ||||
INDEXENTRYLENGTH = datapack.INDEXENTRYLENGTH | ||||
# v1 has metadata support | ||||
Augie Fackler
|
r40541 | SUPPORTED_VERSIONS = [2] | ||
Augie Fackler
|
r40530 | |||
Augie Fackler
|
r40542 | def _compress(self, data): | ||
return zlib.compress(data) | ||||
Augie Fackler
|
r40530 | def add(self, name, node, deltabasenode, delta, metadata=None): | ||
# metadata is a dict, ex. {METAKEYFLAG: flag} | ||||
if len(name) > 2**16: | ||||
raise RuntimeError(_("name too long %s") % name) | ||||
if len(node) != 20: | ||||
raise RuntimeError(_("node should be 20 bytes %s") % node) | ||||
if node in self.entries: | ||||
# The revision has already been added | ||||
return | ||||
# TODO: allow configurable compression | ||||
Augie Fackler
|
r40542 | delta = self._compress(delta) | ||
Augie Fackler
|
r40530 | |||
rawdata = ''.join(( | ||||
struct.pack('!H', len(name)), # unsigned 2 byte int | ||||
name, | ||||
node, | ||||
deltabasenode, | ||||
struct.pack('!Q', len(delta)), # unsigned 8 byte int | ||||
delta, | ||||
)) | ||||
Augie Fackler
|
r40541 | # v1 support metadata | ||
rawmeta = shallowutil.buildpackmeta(metadata) | ||||
rawdata += struct.pack('!I', len(rawmeta)) # unsigned 4 byte | ||||
rawdata += rawmeta | ||||
Augie Fackler
|
r40530 | |||
offset = self.packfp.tell() | ||||
size = len(rawdata) | ||||
self.entries[node] = (deltabasenode, offset, size) | ||||
self.writeraw(rawdata) | ||||
def createindex(self, nodelocations, indexoffset): | ||||
entries = sorted((n, db, o, s) for n, (db, o, s) | ||||
in self.entries.iteritems()) | ||||
rawindex = '' | ||||
fmt = self.INDEXFORMAT | ||||
for node, deltabase, offset, size in entries: | ||||
if deltabase == nullid: | ||||
deltabaselocation = FULLTEXTINDEXMARK | ||||
else: | ||||
# Instead of storing the deltabase node in the index, let's | ||||
# store a pointer directly to the index entry for the deltabase. | ||||
deltabaselocation = nodelocations.get(deltabase, | ||||
NOBASEINDEXMARK) | ||||
entry = struct.pack(fmt, node, deltabaselocation, offset, size) | ||||
rawindex += entry | ||||
return rawindex | ||||