# HG changeset patch # User Pierre-Yves David # Date 2020-01-15 14:49:54 # Node ID 50ad851efd9b2ded7d278be6aab03f4572bccd32 # Parent 76a96e3a2bbb2dd7546e5f33eade0c21907f55fc nodemap: introduce append-only incremental update of the persistent data Rewriting the full nodemap for each transaction has a cost we would like to avoid. We introduce a new way to write persistent nodemap data by adding new information at the end of the file. Any new or updated block is added at the end of the file. The last block is the new root node. With this method, some of the blocks already on disk get "dereferenced" and become dead data. In later changesets, we'll start tracking the amount of dead data to eventually re-generate a full nodemap. Differential Revision: https://phab.mercurial-scm.org/D7886 diff --git a/mercurial/pure/parsers.py b/mercurial/pure/parsers.py --- a/mercurial/pure/parsers.py +++ b/mercurial/pure/parsers.py @@ -156,13 +156,31 @@ class PersistentNodeMapIndexObject(Index index.""" return nodemaputil.persistent_data(self) + def nodemap_data_incremental(self): + """Return bytes containing a incremental update to persistent nodemap + + This containst the data for an append-only update of the data provided + in the last call to `update_nodemap_data`. + """ + if self._nm_root is None: + return None + data = nodemaputil.update_persistent_data( + self, self._nm_root, self._nm_max_idx, self._nm_rev + ) + self._nm_root = self._nm_max_idx = self._nm_rev = None + return data + def update_nodemap_data(self, nm_data): """provide full blokc of persisted binary data for a nodemap The data are expected to come from disk. 
See `nodemap_data_all` for a produceur of such data.""" if nm_data is not None: - nodemaputil.parse_data(nm_data) + self._nm_root, self._nm_max_idx = nodemaputil.parse_data(nm_data) + if self._nm_root: + self._nm_rev = len(self) - 1 + else: + self._nm_root = self._nm_max_idx = self._nm_rev = None class InlinedIndexObject(BaseIndexObject): diff --git a/mercurial/revlogutils/nodemap.py b/mercurial/revlogutils/nodemap.py --- a/mercurial/revlogutils/nodemap.py +++ b/mercurial/revlogutils/nodemap.py @@ -69,12 +69,41 @@ def _persist_nodemap(tr, revlog): if revlog.nodemap_file is None: msg = "calling persist nodemap on a revlog without the feature enableb" raise error.ProgrammingError(msg) - if util.safehasattr(revlog.index, "nodemap_data_all"): - data = revlog.index.nodemap_data_all() + + can_incremental = util.safehasattr(revlog.index, "nodemap_data_incremental") + ondisk_docket = revlog._nodemap_docket + + # first attemp an incremental update of the data + if can_incremental and ondisk_docket is not None: + target_docket = revlog._nodemap_docket.copy() + data = revlog.index.nodemap_data_incremental() + datafile = _rawdata_filepath(revlog, target_docket) + # EXP-TODO: if this is a cache, this should use a cache vfs, not a + # store vfs + with revlog.opener(datafile, b'a') as fd: + fd.write(data) else: - data = persistent_data(revlog.index) - target_docket = NodeMapDocket() - datafile = _rawdata_filepath(revlog, target_docket) + # otherwise fallback to a full new export + target_docket = NodeMapDocket() + datafile = _rawdata_filepath(revlog, target_docket) + if util.safehasattr(revlog.index, "nodemap_data_all"): + data = revlog.index.nodemap_data_all() + else: + data = persistent_data(revlog.index) + # EXP-TODO: if this is a cache, this should use a cache vfs, not a + # store vfs + with revlog.opener(datafile, b'w') as fd: + fd.write(data) + # EXP-TODO: if this is a cache, this should use a cache vfs, not a + # store vfs + with revlog.opener(revlog.nodemap_file, b'w', 
atomictemp=True) as fp: + fp.write(target_docket.serialize()) + revlog._nodemap_docket = target_docket + # EXP-TODO: if the transaction abort, we should remove the new data and + # reinstall the old one. + + # search for old index file in all cases, some older process might have + # left one behind. olds = _other_rawdata_filepath(revlog, target_docket) if olds: realvfs = getattr(revlog, '_realopener', revlog.opener) @@ -85,17 +114,6 @@ def _persist_nodemap(tr, revlog): callback_id = b"revlog-cleanup-nodemap-%s" % revlog.nodemap_file tr.addpostclose(callback_id, cleanup) - # EXP-TODO: if this is a cache, this should use a cache vfs, not a - # store vfs - with revlog.opener(datafile, b'w') as fd: - fd.write(data) - # EXP-TODO: if this is a cache, this should use a cache vfs, not a - # store vfs - with revlog.opener(revlog.nodemap_file, b'w', atomictemp=True) as fp: - fp.write(target_docket.serialize()) - revlog._nodemap_docket = target_docket - # EXP-TODO: if the transaction abort, we should remove the new data and - # reinstall the old one. 
### Nodemap docket file @@ -208,6 +226,13 @@ def persistent_data(index): return _persist_trie(trie) +def update_persistent_data(index, root, max_idx, last_rev): + """return the incremental update for persistent nodemap from a given index + """ + trie = _update_trie(index, root, last_rev) + return _persist_trie(trie, existing_idx=max_idx) + + S_BLOCK = struct.Struct(">" + ("l" * 16)) NO_ENTRY = -1 @@ -260,6 +285,14 @@ def _build_trie(index): return root +def _update_trie(index, root, last_rev): + """consume""" + for rev in range(last_rev + 1, len(index)): + hex = nodemod.hex(index[rev][7]) + _insert_into_block(index, 0, root, rev, hex) + return root + + def _insert_into_block(index, level, block, current_rev, current_hex): """insert a new revision in a block @@ -269,6 +302,8 @@ def _insert_into_block(index, level, blo current_rev: the revision number we are adding current_hex: the hexadecimal representation of the of that revision """ + if block.ondisk_id is not None: + block.ondisk_id = None hex_digit = _to_int(current_hex[level : level + 1]) entry = block.get(hex_digit) if entry is None: @@ -288,15 +323,22 @@ def _insert_into_block(index, level, blo _insert_into_block(index, level + 1, new, current_rev, current_hex) -def _persist_trie(root): +def _persist_trie(root, existing_idx=None): """turn a nodemap trie into persistent binary data See `_build_trie` for nodemap trie structure""" block_map = {} + if existing_idx is not None: + base_idx = existing_idx + 1 + else: + base_idx = 0 chunks = [] for tn in _walk_trie(root): - block_map[id(tn)] = len(chunks) - chunks.append(_persist_block(tn, block_map)) + if tn.ondisk_id is not None: + block_map[id(tn)] = tn.ondisk_id + else: + block_map[id(tn)] = len(chunks) + base_idx + chunks.append(_persist_block(tn, block_map)) return b''.join(chunks) @@ -338,7 +380,7 @@ def parse_data(data): msg = "nodemap data size is not a multiple of block size (%d): %d" raise error.Abort(msg % (S_BLOCK.size, len(data))) if not data: - return 
Block() + return Block(), None block_map = {} new_blocks = [] for i in range(0, len(data), S_BLOCK.size): @@ -356,7 +398,7 @@ def parse_data(data): b[idx] = block_map[v] else: b[idx] = _transform_rev(v) - return block + return block, i // S_BLOCK.size # debug utility @@ -366,7 +408,7 @@ def check_data(ui, index, data): """verify that the provided nodemap data are valid for the given idex""" ret = 0 ui.status((b"revision in index: %d\n") % len(index)) - root = parse_data(data) + root, __ = parse_data(data) all_revs = set(_all_revisions(root)) ui.status((b"revision in nodemap: %d\n") % len(all_revs)) for r in range(len(index)): diff --git a/tests/test-persistent-nodemap.t b/tests/test-persistent-nodemap.t --- a/tests/test-persistent-nodemap.t +++ b/tests/test-persistent-nodemap.t @@ -49,8 +49,19 @@ add a new commit $ hg ci -m 'foo' $ f --size .hg/store/00changelog.n .hg/store/00changelog.n: size=18 + +(The pure code use the debug code that perform incremental update, the C code reencode from scratch) + +#if pure + $ f --sha256 .hg/store/00changelog-*.nd --size + .hg/store/00changelog-????????????????.nd: size=123072, sha256=136472751566c8198ff09e306a7d2f9bd18bd32298d614752b73da4d6df23340 (glob) + +#else $ f --sha256 .hg/store/00changelog-*.nd --size .hg/store/00changelog-????????????????.nd: size=122880, sha256=bfafebd751c4f6d116a76a37a1dee2a251747affe7efbcc4f4842ccc746d4db9 (glob) + +#endif + $ hg debugnodemap --check revision in index: 5002 revision in nodemap: 5002