# censor code related to censoring revision # coding: utf8 # # Copyright 2021 Pierre-Yves David # Copyright 2015 Google, Inc # # This software may be used and distributed according to the terms of the # GNU General Public License version 2 or any later version. from __future__ import annotations import binascii import contextlib import os import struct from ..node import ( nullrev, ) from .constants import ( COMP_MODE_PLAIN, ENTRY_DATA_COMPRESSED_LENGTH, ENTRY_DATA_COMPRESSION_MODE, ENTRY_DATA_OFFSET, ENTRY_DATA_UNCOMPRESSED_LENGTH, ENTRY_DELTA_BASE, ENTRY_LINK_REV, ENTRY_NODE_ID, ENTRY_PARENT_1, ENTRY_PARENT_2, ENTRY_SIDEDATA_COMPRESSED_LENGTH, ENTRY_SIDEDATA_COMPRESSION_MODE, ENTRY_SIDEDATA_OFFSET, REVIDX_ISCENSORED, REVLOGV0, REVLOGV1, ) from ..i18n import _ from .. import ( error, mdiff, pycompat, revlogutils, util, ) from ..utils import ( storageutil, ) from . import ( constants, deltas, ) def v1_censor(rl, tr, censor_nodes, tombstone=b''): """censors a revision in a "version 1" revlog""" assert rl._format_version == constants.REVLOGV1, rl._format_version # avoid cycle from .. import revlog censor_revs = set(rl.rev(node) for node in censor_nodes) tombstone = storageutil.packmeta({b'censored': tombstone}, b'') # Rewriting the revlog in place is hard. Our strategy for censoring is # to create a new revlog, copy all revisions to it, then replace the # revlogs on transaction close. # # This is a bit dangerous. We could easily have a mismatch of state. newrl = revlog.revlog( rl.opener, target=rl.target, radix=rl.radix, postfix=b'tmpcensored', censorable=True, data_config=rl.data_config, delta_config=rl.delta_config, feature_config=rl.feature_config, may_inline=rl._inline, ) # inline splitting will prepare some transaction work that will get # confused by the final file move. So if there is a risk of not being # inline at the end, we prevent the new revlog to be inline in the first # place. assert not (newrl._inline and not rl._inline) for rev in rl.revs(): node = rl.node(rev) p1, p2 = rl.parents(node) if rev in censor_revs: newrl.addrawrevision( tombstone, tr, rl.linkrev(rev), p1, p2, node, constants.REVIDX_ISCENSORED, ) if newrl.deltaparent(rev) != nullrev: m = _(b'censored revision stored as delta; cannot censor') h = _( b'censoring of revlogs is not fully implemented;' b' please report this bug' ) raise error.Abort(m, hint=h) continue if rl.iscensored(rev): if rl.deltaparent(rev) != nullrev: m = _( b'cannot censor due to censored ' b'revision having delta stored' ) raise error.Abort(m) rawtext = rl._inner._chunk(rev) else: rawtext = rl.rawdata(rev) newrl.addrawrevision( rawtext, tr, rl.linkrev(rev), p1, p2, node, rl.flags(rev) ) tr.addbackup(rl._indexfile, location=b'store') if not rl._inline: tr.addbackup(rl._datafile, location=b'store') rl.opener.rename(newrl._indexfile, rl._indexfile) if newrl._inline: assert rl._inline else: assert not rl._inline rl.opener.rename(newrl._datafile, rl._datafile) rl.clearcaches() index, chunk_cache = rl._loadindex() rl._load_inner(index, chunk_cache) def v2_censor(revlog, tr, censor_nodes, tombstone=b''): """censors a revision in a "version 2" revlog""" assert revlog._format_version != REVLOGV0, revlog._format_version assert revlog._format_version != REVLOGV1, revlog._format_version censor_revs = {revlog.rev(node) for node in censor_nodes} _rewrite_v2(revlog, tr, censor_revs, tombstone) def _rewrite_v2(revlog, tr, censor_revs, tombstone=b''): """rewrite a revlog to censor some of its content General principle We create new revlog files (index/data/sidedata) to copy the content of the existing data without the censored data. We need to recompute new delta for any revision that used the censored revision as delta base. As the cumulative size of the new delta may be large, we store them in a temporary file until they are stored in their final destination. All data before the censored data can be blindly copied. The rest needs to be copied as we go and the associated index entry needs adjustement. """ assert revlog._format_version != REVLOGV0, revlog._format_version assert revlog._format_version != REVLOGV1, revlog._format_version old_index = revlog.index docket = revlog._docket tombstone = storageutil.packmeta({b'censored': tombstone}, b'') first_excl_rev = min(censor_revs) first_excl_entry = revlog.index[first_excl_rev] index_cutoff = revlog.index.entry_size * first_excl_rev data_cutoff = first_excl_entry[ENTRY_DATA_OFFSET] >> 16 sidedata_cutoff = revlog.sidedata_cut_off(first_excl_rev) with pycompat.unnamedtempfile(mode=b"w+b") as tmp_storage: # rev → (new_base, data_start, data_end, compression_mode) rewritten_entries = _precompute_rewritten_delta( revlog, old_index, censor_revs, tmp_storage, ) all_files = _setup_new_files( revlog, index_cutoff, data_cutoff, sidedata_cutoff, ) # we dont need to open the old index file since its content already # exist in a usable form in `old_index`. with all_files() as open_files: ( old_data_file, old_sidedata_file, new_index_file, new_data_file, new_sidedata_file, ) = open_files # writing the censored revision # Writing all subsequent revisions for rev in range(first_excl_rev, len(old_index)): if rev in censor_revs: _rewrite_censor( revlog, old_index, open_files, rev, tombstone, ) else: _rewrite_simple( revlog, old_index, open_files, rev, rewritten_entries, tmp_storage, ) docket.write(transaction=None, stripping=True) def _precompute_rewritten_delta( revlog, old_index, excluded_revs, tmp_storage, ): """Compute new delta for revisions whose delta is based on revision that will not survive as is. Return a mapping: {rev → (new_base, data_start, data_end, compression_mode)} """ dc = deltas.deltacomputer(revlog) rewritten_entries = {} first_excl_rev = min(excluded_revs) with revlog.reading(): for rev in range(first_excl_rev, len(old_index)): if rev in excluded_revs: # this revision will be preserved as is, so we don't need to # consider recomputing a delta. continue entry = old_index[rev] if entry[ENTRY_DELTA_BASE] not in excluded_revs: continue # This is a revision that use the censored revision as the base # for its delta. We need a need new deltas if entry[ENTRY_DATA_UNCOMPRESSED_LENGTH] == 0: # this revision is empty, we can delta against nullrev rewritten_entries[rev] = (nullrev, 0, 0, COMP_MODE_PLAIN) else: text = revlog.rawdata(rev) info = revlogutils.revisioninfo( node=entry[ENTRY_NODE_ID], p1=revlog.node(entry[ENTRY_PARENT_1]), p2=revlog.node(entry[ENTRY_PARENT_2]), btext=[text], textlen=len(text), cachedelta=None, flags=entry[ENTRY_DATA_OFFSET] & 0xFFFF, ) d = dc.finddeltainfo( info, excluded_bases=excluded_revs, target_rev=rev ) default_comp = revlog._docket.default_compression_header comp_mode, d = deltas.delta_compression(default_comp, d) # using `tell` is a bit lazy, but we are not here for speed start = tmp_storage.tell() tmp_storage.write(d.data[1]) end = tmp_storage.tell() rewritten_entries[rev] = (d.base, start, end, comp_mode) return rewritten_entries def _setup_new_files( revlog, index_cutoff, data_cutoff, sidedata_cutoff, ): """ return a context manager to open all the relevant files: - old_data_file, - old_sidedata_file, - new_index_file, - new_data_file, - new_sidedata_file, The old_index_file is not here because it is accessed through the `old_index` object if the caller function. """ docket = revlog._docket old_index_filepath = revlog.opener.join(docket.index_filepath()) old_data_filepath = revlog.opener.join(docket.data_filepath()) old_sidedata_filepath = revlog.opener.join(docket.sidedata_filepath()) new_index_filepath = revlog.opener.join(docket.new_index_file()) new_data_filepath = revlog.opener.join(docket.new_data_file()) new_sidedata_filepath = revlog.opener.join(docket.new_sidedata_file()) util.copyfile(old_index_filepath, new_index_filepath, nb_bytes=index_cutoff) util.copyfile(old_data_filepath, new_data_filepath, nb_bytes=data_cutoff) util.copyfile( old_sidedata_filepath, new_sidedata_filepath, nb_bytes=sidedata_cutoff, ) revlog.opener.register_file(docket.index_filepath()) revlog.opener.register_file(docket.data_filepath()) revlog.opener.register_file(docket.sidedata_filepath()) docket.index_end = index_cutoff docket.data_end = data_cutoff docket.sidedata_end = sidedata_cutoff # reload the revlog internal information revlog.clearcaches() index, chunk_cache = revlog._loadindex(docket=docket) revlog._load_inner(index, chunk_cache) @contextlib.contextmanager def all_files_opener(): # hide opening in an helper function to please check-code, black # and various python version at the same time with open(old_data_filepath, 'rb') as old_data_file: with open(old_sidedata_filepath, 'rb') as old_sidedata_file: with open(new_index_filepath, 'r+b') as new_index_file: with open(new_data_filepath, 'r+b') as new_data_file: with open( new_sidedata_filepath, 'r+b' ) as new_sidedata_file: new_index_file.seek(0, os.SEEK_END) assert new_index_file.tell() == index_cutoff new_data_file.seek(0, os.SEEK_END) assert new_data_file.tell() == data_cutoff new_sidedata_file.seek(0, os.SEEK_END) assert new_sidedata_file.tell() == sidedata_cutoff yield ( old_data_file, old_sidedata_file, new_index_file, new_data_file, new_sidedata_file, ) return all_files_opener def _rewrite_simple( revlog, old_index, all_files, rev, rewritten_entries, tmp_storage, ): """append a normal revision to the index after the rewritten one(s)""" ( old_data_file, old_sidedata_file, new_index_file, new_data_file, new_sidedata_file, ) = all_files entry = old_index[rev] flags = entry[ENTRY_DATA_OFFSET] & 0xFFFF old_data_offset = entry[ENTRY_DATA_OFFSET] >> 16 if rev not in rewritten_entries: old_data_file.seek(old_data_offset) new_data_size = entry[ENTRY_DATA_COMPRESSED_LENGTH] new_data = old_data_file.read(new_data_size) data_delta_base = entry[ENTRY_DELTA_BASE] d_comp_mode = entry[ENTRY_DATA_COMPRESSION_MODE] else: ( data_delta_base, start, end, d_comp_mode, ) = rewritten_entries[rev] new_data_size = end - start tmp_storage.seek(start) new_data = tmp_storage.read(new_data_size) # It might be faster to group continuous read/write operation, # however, this is censor, an operation that is not focussed # around stellar performance. So I have not written this # optimisation yet. new_data_offset = new_data_file.tell() new_data_file.write(new_data) sidedata_size = entry[ENTRY_SIDEDATA_COMPRESSED_LENGTH] new_sidedata_offset = new_sidedata_file.tell() if 0 < sidedata_size: old_sidedata_offset = entry[ENTRY_SIDEDATA_OFFSET] old_sidedata_file.seek(old_sidedata_offset) new_sidedata = old_sidedata_file.read(sidedata_size) new_sidedata_file.write(new_sidedata) data_uncompressed_length = entry[ENTRY_DATA_UNCOMPRESSED_LENGTH] sd_com_mode = entry[ENTRY_SIDEDATA_COMPRESSION_MODE] assert data_delta_base <= rev, (data_delta_base, rev) new_entry = revlogutils.entry( flags=flags, data_offset=new_data_offset, data_compressed_length=new_data_size, data_uncompressed_length=data_uncompressed_length, data_delta_base=data_delta_base, link_rev=entry[ENTRY_LINK_REV], parent_rev_1=entry[ENTRY_PARENT_1], parent_rev_2=entry[ENTRY_PARENT_2], node_id=entry[ENTRY_NODE_ID], sidedata_offset=new_sidedata_offset, sidedata_compressed_length=sidedata_size, data_compression_mode=d_comp_mode, sidedata_compression_mode=sd_com_mode, ) revlog.index.append(new_entry) entry_bin = revlog.index.entry_binary(rev) new_index_file.write(entry_bin) revlog._docket.index_end = new_index_file.tell() revlog._docket.data_end = new_data_file.tell() revlog._docket.sidedata_end = new_sidedata_file.tell() def _rewrite_censor( revlog, old_index, all_files, rev, tombstone, ): """rewrite and append a censored revision""" ( old_data_file, old_sidedata_file, new_index_file, new_data_file, new_sidedata_file, ) = all_files entry = old_index[rev] # XXX consider trying the default compression too new_data_size = len(tombstone) new_data_offset = new_data_file.tell() new_data_file.write(tombstone) # we are not adding any sidedata as they might leak info about the censored version link_rev = entry[ENTRY_LINK_REV] p1 = entry[ENTRY_PARENT_1] p2 = entry[ENTRY_PARENT_2] new_entry = revlogutils.entry( flags=constants.REVIDX_ISCENSORED, data_offset=new_data_offset, data_compressed_length=new_data_size, data_uncompressed_length=new_data_size, data_delta_base=rev, link_rev=link_rev, parent_rev_1=p1, parent_rev_2=p2, node_id=entry[ENTRY_NODE_ID], sidedata_offset=0, sidedata_compressed_length=0, data_compression_mode=COMP_MODE_PLAIN, sidedata_compression_mode=COMP_MODE_PLAIN, ) revlog.index.append(new_entry) entry_bin = revlog.index.entry_binary(rev) new_index_file.write(entry_bin) revlog._docket.index_end = new_index_file.tell() revlog._docket.data_end = new_data_file.tell() def _get_filename_from_filelog_index(path): # Drop the extension and the `data/` prefix path_part = path.rsplit(b'.', 1)[0].split(b'/', 1) if len(path_part) < 2: msg = _(b"cannot recognize filelog from filename: '%s'") msg %= path raise error.Abort(msg) return path_part[1] def _filelog_from_filename(repo, path): """Returns the filelog for the given `path`. Stolen from `engine.py`""" from .. import filelog # avoid cycle fl = filelog.filelog(repo.svfs, path) return fl def _write_swapped_parents(repo, rl, rev, offset, fp): """Swaps p1 and p2 and overwrites the revlog entry for `rev` in `fp`""" from ..pure import parsers # avoid cycle if repo._currentlock(repo._lockref) is None: # Let's be paranoid about it msg = "repo needs to be locked to rewrite parents" raise error.ProgrammingError(msg) index_format = parsers.IndexObject.index_format entry = rl.index[rev] new_entry = list(entry) new_entry[5], new_entry[6] = entry[6], entry[5] packed = index_format.pack(*new_entry[:8]) fp.seek(offset) fp.write(packed) def _reorder_filelog_parents(repo, fl, to_fix): """ Swaps p1 and p2 for all `to_fix` revisions of filelog `fl` and writes the new version to disk, overwriting the old one with a rename. """ from ..pure import parsers # avoid cycle ui = repo.ui assert len(to_fix) > 0 rl = fl._revlog if rl._format_version != constants.REVLOGV1: msg = "expected version 1 revlog, got version '%d'" % rl._format_version raise error.ProgrammingError(msg) index_file = rl._indexfile new_file_path = index_file + b'.tmp-parents-fix' repaired_msg = _(b"repaired revision %d of 'filelog %s'\n") with ui.uninterruptible(): try: util.copyfile( rl.opener.join(index_file), rl.opener.join(new_file_path), checkambig=rl.data_config.check_ambig, ) with rl.opener(new_file_path, mode=b"r+") as fp: if rl._inline: index = parsers.InlinedIndexObject(fp.read()) for rev in fl.revs(): if rev in to_fix: offset = index._calculate_index(rev) _write_swapped_parents(repo, rl, rev, offset, fp) ui.write(repaired_msg % (rev, index_file)) else: index_format = parsers.IndexObject.index_format for rev in to_fix: offset = rev * index_format.size _write_swapped_parents(repo, rl, rev, offset, fp) ui.write(repaired_msg % (rev, index_file)) rl.opener.rename(new_file_path, index_file) rl.clearcaches() index, chunk_cache = rl._loadindex() rl._load_inner(index, chunk_cache) finally: util.tryunlink(new_file_path) def _is_revision_affected(fl, filerev, metadata_cache=None): full_text = lambda: fl._revlog.rawdata(filerev) parent_revs = lambda: fl._revlog.parentrevs(filerev) return _is_revision_affected_inner( full_text, parent_revs, filerev, metadata_cache ) def _is_revision_affected_inner( full_text, parents_revs, filerev, metadata_cache=None, ): """Mercurial currently (5.9rc0) uses `p1 == nullrev and p2 != nullrev` as a special meaning compared to the reverse in the context of filelog-based copytracing. issue6528 exists because new code assumed that parent ordering didn't matter, so this detects if the revision contains metadata (since it's only used for filelog-based copytracing) and its parents are in the "wrong" order.""" try: raw_text = full_text() except error.CensoredNodeError: # We don't care about censored nodes as they never carry metadata return False # raw text can be a `memoryview`, which doesn't implement `startswith` has_meta = bytes(raw_text[:2]) == b'\x01\n' if metadata_cache is not None: metadata_cache[filerev] = has_meta if has_meta: (p1, p2) = parents_revs() if p1 != nullrev and p2 == nullrev: return True return False def _is_revision_affected_fast(repo, fl, filerev, metadata_cache): rl = fl._revlog is_censored = lambda: rl.iscensored(filerev) delta_base = lambda: rl.deltaparent(filerev) delta = lambda: rl._inner._chunk(filerev) full_text = lambda: rl.rawdata(filerev) parent_revs = lambda: rl.parentrevs(filerev) # This function is used by repair_issue6528, but not by # filter_delta_issue6528. As such, we do not want to trust # parent revisions of the delta base to decide whether # the delta base has metadata. return _is_revision_affected_fast_inner( is_censored, delta_base, delta, full_text, parent_revs, None, # don't trust the parent revisions filerev, metadata_cache, ) def _is_revision_affected_fast_inner( is_censored, delta_base, delta, full_text, parent_revs, deltabase_parentrevs, filerev, metadata_cache, ): """Optimization fast-path for `_is_revision_affected`. `metadata_cache` is a dict of `{rev: has_metadata}` which allows any revision to check if its base has metadata, saving computation of the full text, instead looking at the current delta. This optimization only works if the revisions are looked at in order.""" if is_censored(): # Censored revisions don't contain metadata, so they cannot be affected metadata_cache[filerev] = False return False p1, p2 = parent_revs() if p1 == nullrev or p2 != nullrev: metadata_cache[filerev] = True return False delta_parent = delta_base() parent_has_metadata = metadata_cache.get(delta_parent) if parent_has_metadata is None: if deltabase_parentrevs is not None: deltabase_parentrevs = deltabase_parentrevs() if deltabase_parentrevs == (nullrev, nullrev): # Need to check the content itself as there is no flag. parent_has_metadata = None elif deltabase_parentrevs[0] == nullrev: # Second parent is !null, assume repository is correct # and has flagged this file revision as having metadata. parent_has_metadata = True elif deltabase_parentrevs[1] == nullrev: # First parent is !null, so assume it has no metadata. parent_has_metadata = False if parent_has_metadata is None: return _is_revision_affected_inner( full_text, parent_revs, filerev, metadata_cache, ) chunk = delta() if not len(chunk): # No diff for this revision metadata_cache[filerev] = parent_has_metadata return parent_has_metadata header_length = 12 if len(chunk) < header_length: raise error.Abort(_(b"patch cannot be decoded")) start, _end, _length = struct.unpack(b">lll", chunk[:header_length]) if start < 2: # len(b'\x01\n') == 2 # This delta does *something* to the metadata marker (if any). # Check it the slow way is_affected = _is_revision_affected_inner( full_text, parent_revs, filerev, metadata_cache, ) return is_affected # The diff did not remove or add the metadata header, it's then in the same # situation as its parent metadata_cache[filerev] = parent_has_metadata return parent_has_metadata def _from_report(ui, repo, context, from_report, dry_run): """ Fix the revisions given in the `from_report` file, but still checks if the revisions are indeed affected to prevent an unfortunate cyclic situation where we'd swap well-ordered parents again. See the doc for `debug_fix_issue6528` for the format documentation. """ ui.write(_(b"loading report file '%s'\n") % from_report) with context(), open(from_report, mode='rb') as f: for line in f.read().split(b'\n'): if not line: continue filenodes, filename = line.split(b' ', 1) fl = _filelog_from_filename(repo, filename) to_fix = set( fl.rev(binascii.unhexlify(n)) for n in filenodes.split(b',') ) excluded = set() for filerev in to_fix: if _is_revision_affected(fl, filerev): msg = b"found affected revision %d for filelog '%s'\n" ui.warn(msg % (filerev, filename)) else: msg = _(b"revision %s of file '%s' is not affected\n") msg %= (binascii.hexlify(fl.node(filerev)), filename) ui.warn(msg) excluded.add(filerev) to_fix = to_fix - excluded if not to_fix: msg = _(b"no affected revisions were found for '%s'\n") ui.write(msg % filename) continue if not dry_run: _reorder_filelog_parents(repo, fl, sorted(to_fix)) def filter_delta_issue6528(revlog, deltas_iter): """filter incomind deltas to repaire issue 6528 on the fly""" metadata_cache = {nullrev: False} deltacomputer = deltas.deltacomputer(revlog) for rev, d in enumerate(deltas_iter, len(revlog)): ( node, p1_node, p2_node, linknode, deltabase, delta, flags, sidedata, ) = d if not revlog.index.has_node(deltabase): raise error.LookupError( deltabase, revlog.radix, _(b'unknown parent') ) base_rev = revlog.rev(deltabase) if not revlog.index.has_node(p1_node): raise error.LookupError(p1_node, revlog.radix, _(b'unknown parent')) p1_rev = revlog.rev(p1_node) if not revlog.index.has_node(p2_node): raise error.LookupError(p2_node, revlog.radix, _(b'unknown parent')) p2_rev = revlog.rev(p2_node) is_censored = lambda: bool(flags & REVIDX_ISCENSORED) delta_base = lambda: base_rev parent_revs = lambda: (p1_rev, p2_rev) deltabase_parentrevs = lambda: revlog.parentrevs(base_rev) def full_text(): # note: being able to reuse the full text computation in the # underlying addrevision would be useful however this is a bit too # intrusive the for the "quick" issue6528 we are writing before the # 5.8 release textlen = mdiff.patchedsize(revlog.size(base_rev), delta) revinfo = revlogutils.revisioninfo( node, p1_node, p2_node, [None], textlen, (base_rev, delta), flags, ) return deltacomputer.buildtext(revinfo) is_affected = _is_revision_affected_fast_inner( is_censored, delta_base, lambda: delta, full_text, parent_revs, deltabase_parentrevs, rev, metadata_cache, ) if is_affected: d = ( node, p2_node, p1_node, linknode, deltabase, delta, flags, sidedata, ) yield d def repair_issue6528( ui, repo, dry_run=False, to_report=None, from_report=None, paranoid=False ): @contextlib.contextmanager def context(): if dry_run or to_report: # No need for locking yield else: with repo.wlock(), repo.lock(): yield if from_report: return _from_report(ui, repo, context, from_report, dry_run) report_entries = [] with context(): files = list( entry for entry in repo.store.data_entries() if entry.is_revlog and entry.is_filelog ) progress = ui.makeprogress( _(b"looking for affected revisions"), unit=_(b"filelogs"), total=len(files), ) found_nothing = True for entry in files: progress.increment() filename = entry.target_id fl = _filelog_from_filename(repo, entry.target_id) # Set of filerevs (or hex filenodes if `to_report`) that need fixing to_fix = set() metadata_cache = {nullrev: False} for filerev in fl.revs(): affected = _is_revision_affected_fast( repo, fl, filerev, metadata_cache ) if paranoid: slow = _is_revision_affected(fl, filerev) if slow != affected: msg = _(b"paranoid check failed for '%s' at node %s") node = binascii.hexlify(fl.node(filerev)) raise error.Abort(msg % (filename, node)) if affected: msg = b"found affected revision %d for file '%s'\n" ui.warn(msg % (filerev, filename)) found_nothing = False if not dry_run: if to_report: to_fix.add(binascii.hexlify(fl.node(filerev))) else: to_fix.add(filerev) if to_fix: to_fix = sorted(to_fix) if to_report: report_entries.append((filename, to_fix)) else: _reorder_filelog_parents(repo, fl, to_fix) if found_nothing: ui.write(_(b"no affected revisions were found\n")) if to_report and report_entries: with open(to_report, mode="wb") as f: for path, to_fix in report_entries: f.write(b"%s %s\n" % (b",".join(to_fix), path)) progress.complete()