# censor code related to censoring revision # coding: utf8 # # Copyright 2021 Pierre-Yves David # Copyright 2015 Google, Inc # # This software may be used and distributed according to the terms of the # GNU General Public License version 2 or any later version. import contextlib import os from ..node import ( nullrev, ) from .constants import ( COMP_MODE_PLAIN, ENTRY_DATA_COMPRESSED_LENGTH, ENTRY_DATA_COMPRESSION_MODE, ENTRY_DATA_OFFSET, ENTRY_DATA_UNCOMPRESSED_LENGTH, ENTRY_DELTA_BASE, ENTRY_LINK_REV, ENTRY_NODE_ID, ENTRY_PARENT_1, ENTRY_PARENT_2, ENTRY_SIDEDATA_COMPRESSED_LENGTH, ENTRY_SIDEDATA_COMPRESSION_MODE, ENTRY_SIDEDATA_OFFSET, REVLOGV0, REVLOGV1, ) from ..i18n import _ from .. import ( error, pycompat, revlogutils, util, ) from ..utils import ( storageutil, ) from . import ( constants, deltas, ) def v1_censor(rl, tr, censornode, tombstone=b''): """censors a revision in a "version 1" revlog""" assert rl._format_version == constants.REVLOGV1, rl._format_version # avoid cycle from .. import revlog censorrev = rl.rev(censornode) tombstone = storageutil.packmeta({b'censored': tombstone}, b'') # Rewriting the revlog in place is hard. Our strategy for censoring is # to create a new revlog, copy all revisions to it, then replace the # revlogs on transaction close. # # This is a bit dangerous. We could easily have a mismatch of state. 
    # The temporary revlog receives every revision of the original one,
    # with the censored revision replaced by the tombstone.
    newrl = revlog.revlog(
        rl.opener,
        target=rl.target,
        radix=rl.radix,
        postfix=b'tmpcensored',
        censorable=True,
    )
    # mirror the low-level format settings of the source revlog so the
    # rewritten copy is byte-compatible with it
    newrl._format_version = rl._format_version
    newrl._format_flags = rl._format_flags
    newrl._generaldelta = rl._generaldelta
    newrl._parse_index = rl._parse_index

    for rev in rl.revs():
        node = rl.node(rev)
        p1, p2 = rl.parents(node)

        if rev == censorrev:
            # store the tombstone in place of the censored revision,
            # flagged so future readers know it was censored
            newrl.addrawrevision(
                tombstone,
                tr,
                rl.linkrev(censorrev),
                p1,
                p2,
                censornode,
                constants.REVIDX_ISCENSORED,
            )

            # the tombstone must be stored as a full snapshot: storing it
            # as a delta would tie it to another revision's data
            if newrl.deltaparent(rev) != nullrev:
                m = _(b'censored revision stored as delta; cannot censor')
                h = _(
                    b'censoring of revlogs is not fully implemented;'
                    b' please report this bug'
                )
                raise error.Abort(m, hint=h)
            continue

        if rl.iscensored(rev):
            # an already-censored revision must be stored as a full
            # snapshot; copy its raw chunk without trying to resolve it
            if rl.deltaparent(rev) != nullrev:
                m = _(
                    b'cannot censor due to censored '
                    b'revision having delta stored'
                )
                raise error.Abort(m)
            rawtext = rl._chunk(rev)
        else:
            rawtext = rl.rawdata(rev)

        newrl.addrawrevision(
            rawtext, tr, rl.linkrev(rev), p1, p2, node, rl.flags(rev)
        )

    # register backups so the transaction can restore the original files
    # on rollback, then swap the rewritten revlog into place
    tr.addbackup(rl._indexfile, location=b'store')
    if not rl._inline:
        tr.addbackup(rl._datafile, location=b'store')

    rl.opener.rename(newrl._indexfile, rl._indexfile)
    if not rl._inline:
        rl.opener.rename(newrl._datafile, rl._datafile)

    # drop any cached state derived from the old files and reload
    rl.clearcaches()
    rl._loadindex()


def v2_censor(rl, tr, censornode, tombstone=b''):
    """censors a revision in a "version 2" revlog"""
    # General principle
    #
    # We create new revlog files (index/data/sidedata) to copy the content of
    # the existing data without the censored data.
    #
    # We need to recompute new delta for any revision that used the censored
    # revision as delta base. As the cumulative size of the new delta may be
    # large, we store them in a temporary file until they are stored in their
    # final destination.
    #
    # All data before the censored data can be blindly copied. The rest needs
    # to be copied as we go and the associated index entry needs adjustment.
    # only the "v2" family of formats is supported here; v0/v1 go through
    # v1_censor instead
    assert rl._format_version != REVLOGV0, rl._format_version
    assert rl._format_version != REVLOGV1, rl._format_version

    old_index = rl.index
    docket = rl._docket

    censor_rev = rl.rev(censornode)
    # wrap the tombstone in a b'censored' metadata entry so readers can
    # recognize the revision as censored
    tombstone = storageutil.packmeta({b'censored': tombstone}, b'')

    # compute, for each of the three files, the offset up to which the
    # existing content can be copied verbatim (everything strictly before
    # the censored revision's entry/data/sidedata)
    censored_entry = rl.index[censor_rev]
    index_cutoff = rl.index.entry_size * censor_rev
    # the index packs the data offset in the high bits and the flags in
    # the low 16 bits; shift to recover the raw data offset
    data_cutoff = censored_entry[ENTRY_DATA_OFFSET] >> 16
    sidedata_cutoff = rl.sidedata_cut_off(censor_rev)

    with pycompat.unnamedtempfile(mode=b"w+b") as tmp_storage:
        # rev → (new_base, data_start, data_end, compression_mode)
        rewritten_entries = _precompute_rewritten_delta(
            rl,
            old_index,
            {censor_rev},
            tmp_storage,
        )

        all_files = _setup_new_files(
            rl,
            index_cutoff,
            data_cutoff,
            sidedata_cutoff,
        )

        # we don't need to open the old index file since its content already
        # exists in a usable form in `old_index`.
        with all_files() as open_files:
            (
                old_data_file,
                old_sidedata_file,
                new_index_file,
                new_data_file,
                new_sidedata_file,
            ) = open_files

            # writing the censored revision
            _rewrite_censor(
                rl,
                old_index,
                open_files,
                censor_rev,
                tombstone,
            )

            # Writing all subsequent revisions
            for rev in range(censor_rev + 1, len(old_index)):
                _rewrite_simple(
                    rl,
                    old_index,
                    open_files,
                    rev,
                    rewritten_entries,
                    tmp_storage,
                )
            # persist the updated docket (file names and end offsets);
            # `stripping=True` is required here — TODO confirm the exact
            # docket semantics against the docket implementation
            docket.write(transaction=None, stripping=True)


def _precompute_rewritten_delta(
    revlog,
    old_index,
    excluded_revs,
    tmp_storage,
):
    """Compute new deltas for revisions whose delta is based on a revision
    that will not survive as is.

    The recomputed delta payloads are written to ``tmp_storage`` so they
    can be copied into the new data file later without being held in
    memory all at once.

    Return a mapping:
    {rev → (new_base, data_start, data_end, compression_mode)}
    where ``data_start``/``data_end`` delimit the payload inside
    ``tmp_storage``.
    """
    dc = deltas.deltacomputer(revlog)
    rewritten_entries = {}
    # revisions before the first excluded one cannot delta against an
    # excluded revision, so they need no inspection
    first_excl_rev = min(excluded_revs)
    with revlog._segmentfile._open_read() as dfh:
        for rev in range(first_excl_rev, len(old_index)):
            if rev in excluded_revs:
                # this revision will be preserved as is, so we don't need to
                # consider recomputing a delta.
                continue
            entry = old_index[rev]
            if entry[ENTRY_DELTA_BASE] not in excluded_revs:
                continue
            # This is a revision that uses the censored revision as the base
            # for its delta. We need to compute a new delta for it.
            if entry[ENTRY_DATA_UNCOMPRESSED_LENGTH] == 0:
                # this revision is empty, we can delta against nullrev
                rewritten_entries[rev] = (nullrev, 0, 0, COMP_MODE_PLAIN)
            else:

                # recover the full text (reusing the open data file handle)
                # and let the delta computer pick a new base that avoids the
                # excluded revisions
                text = revlog.rawdata(rev, _df=dfh)
                info = revlogutils.revisioninfo(
                    node=entry[ENTRY_NODE_ID],
                    p1=revlog.node(entry[ENTRY_PARENT_1]),
                    p2=revlog.node(entry[ENTRY_PARENT_2]),
                    btext=[text],
                    textlen=len(text),
                    cachedelta=None,
                    # low 16 bits of the packed offset field are the flags
                    flags=entry[ENTRY_DATA_OFFSET] & 0xFFFF,
                )
                d = dc.finddeltainfo(
                    info, dfh, excluded_bases=excluded_revs, target_rev=rev
                )
                default_comp = revlog._docket.default_compression_header
                comp_mode, d = deltas.delta_compression(default_comp, d)
                # using `tell` is a bit lazy, but we are not here for speed
                start = tmp_storage.tell()
                tmp_storage.write(d.data[1])
                end = tmp_storage.tell()
                rewritten_entries[rev] = (d.base, start, end, comp_mode)
    return rewritten_entries


def _setup_new_files(
    revlog,
    index_cutoff,
    data_cutoff,
    sidedata_cutoff,
):
    """Prepare the new index/data/sidedata files and pre-fill them with the
    content that precedes the censored revision.

    Return a context manager that opens all the relevant files:
    - old_data_file,
    - old_sidedata_file,
    - new_index_file,
    - new_data_file,
    - new_sidedata_file,

    The old_index_file is not here because it is accessed through the
    `old_index` object of the caller function.
""" docket = revlog._docket old_index_filepath = revlog.opener.join(docket.index_filepath()) old_data_filepath = revlog.opener.join(docket.data_filepath()) old_sidedata_filepath = revlog.opener.join(docket.sidedata_filepath()) new_index_filepath = revlog.opener.join(docket.new_index_file()) new_data_filepath = revlog.opener.join(docket.new_data_file()) new_sidedata_filepath = revlog.opener.join(docket.new_sidedata_file()) util.copyfile(old_index_filepath, new_index_filepath, nb_bytes=index_cutoff) util.copyfile(old_data_filepath, new_data_filepath, nb_bytes=data_cutoff) util.copyfile( old_sidedata_filepath, new_sidedata_filepath, nb_bytes=sidedata_cutoff, ) revlog.opener.register_file(docket.index_filepath()) revlog.opener.register_file(docket.data_filepath()) revlog.opener.register_file(docket.sidedata_filepath()) docket.index_end = index_cutoff docket.data_end = data_cutoff docket.sidedata_end = sidedata_cutoff # reload the revlog internal information revlog.clearcaches() revlog._loadindex(docket=docket) @contextlib.contextmanager def all_files_opener(): # hide opening in an helper function to please check-code, black # and various python version at the same time with open(old_data_filepath, 'rb') as old_data_file: with open(old_sidedata_filepath, 'rb') as old_sidedata_file: with open(new_index_filepath, 'r+b') as new_index_file: with open(new_data_filepath, 'r+b') as new_data_file: with open( new_sidedata_filepath, 'r+b' ) as new_sidedata_file: new_index_file.seek(0, os.SEEK_END) assert new_index_file.tell() == index_cutoff new_data_file.seek(0, os.SEEK_END) assert new_data_file.tell() == data_cutoff new_sidedata_file.seek(0, os.SEEK_END) assert new_sidedata_file.tell() == sidedata_cutoff yield ( old_data_file, old_sidedata_file, new_index_file, new_data_file, new_sidedata_file, ) return all_files_opener def _rewrite_simple( revlog, old_index, all_files, rev, rewritten_entries, tmp_storage, ): """append a normal revision to the index after the rewritten 
one(s)""" ( old_data_file, old_sidedata_file, new_index_file, new_data_file, new_sidedata_file, ) = all_files entry = old_index[rev] flags = entry[ENTRY_DATA_OFFSET] & 0xFFFF old_data_offset = entry[ENTRY_DATA_OFFSET] >> 16 if rev not in rewritten_entries: old_data_file.seek(old_data_offset) new_data_size = entry[ENTRY_DATA_COMPRESSED_LENGTH] new_data = old_data_file.read(new_data_size) data_delta_base = entry[ENTRY_DELTA_BASE] d_comp_mode = entry[ENTRY_DATA_COMPRESSION_MODE] else: ( data_delta_base, start, end, d_comp_mode, ) = rewritten_entries[rev] new_data_size = end - start tmp_storage.seek(start) new_data = tmp_storage.read(new_data_size) # It might be faster to group continuous read/write operation, # however, this is censor, an operation that is not focussed # around stellar performance. So I have not written this # optimisation yet. new_data_offset = new_data_file.tell() new_data_file.write(new_data) sidedata_size = entry[ENTRY_SIDEDATA_COMPRESSED_LENGTH] new_sidedata_offset = new_sidedata_file.tell() if 0 < sidedata_size: old_sidedata_offset = entry[ENTRY_SIDEDATA_OFFSET] old_sidedata_file.seek(old_sidedata_offset) new_sidedata = old_sidedata_file.read(sidedata_size) new_sidedata_file.write(new_sidedata) data_uncompressed_length = entry[ENTRY_DATA_UNCOMPRESSED_LENGTH] sd_com_mode = entry[ENTRY_SIDEDATA_COMPRESSION_MODE] assert data_delta_base <= rev, (data_delta_base, rev) new_entry = revlogutils.entry( flags=flags, data_offset=new_data_offset, data_compressed_length=new_data_size, data_uncompressed_length=data_uncompressed_length, data_delta_base=data_delta_base, link_rev=entry[ENTRY_LINK_REV], parent_rev_1=entry[ENTRY_PARENT_1], parent_rev_2=entry[ENTRY_PARENT_2], node_id=entry[ENTRY_NODE_ID], sidedata_offset=new_sidedata_offset, sidedata_compressed_length=sidedata_size, data_compression_mode=d_comp_mode, sidedata_compression_mode=sd_com_mode, ) revlog.index.append(new_entry) entry_bin = revlog.index.entry_binary(rev) new_index_file.write(entry_bin) 
    # keep the docket's end offsets in sync with what has been written
    revlog._docket.index_end = new_index_file.tell()
    revlog._docket.data_end = new_data_file.tell()
    revlog._docket.sidedata_end = new_sidedata_file.tell()


def _rewrite_censor(
    revlog,
    old_index,
    all_files,
    rev,
    tombstone,
):
    """rewrite and append a censored revision"""
    (
        old_data_file,
        old_sidedata_file,
        new_index_file,
        new_data_file,
        new_sidedata_file,
    ) = all_files
    entry = old_index[rev]

    # XXX consider trying the default compression too
    new_data_size = len(tombstone)
    new_data_offset = new_data_file.tell()
    # the tombstone is stored uncompressed (COMP_MODE_PLAIN below)
    new_data_file.write(tombstone)

    # we are not adding any sidedata as they might leak info about the censored version

    link_rev = entry[ENTRY_LINK_REV]

    # parents are preserved so the graph shape is unchanged
    p1 = entry[ENTRY_PARENT_1]
    p2 = entry[ENTRY_PARENT_2]

    new_entry = revlogutils.entry(
        flags=constants.REVIDX_ISCENSORED,
        data_offset=new_data_offset,
        data_compressed_length=new_data_size,
        data_uncompressed_length=new_data_size,
        # delta base pointing at the revision itself marks the entry as a
        # full snapshot rather than a delta — TODO confirm against the
        # index entry semantics
        data_delta_base=rev,
        link_rev=link_rev,
        parent_rev_1=p1,
        parent_rev_2=p2,
        node_id=entry[ENTRY_NODE_ID],
        sidedata_offset=0,
        sidedata_compressed_length=0,
        data_compression_mode=COMP_MODE_PLAIN,
        sidedata_compression_mode=COMP_MODE_PLAIN,
    )
    # append to the in-memory index first, then serialize that entry into
    # the new index file and update the docket's end offsets
    revlog.index.append(new_entry)
    entry_bin = revlog.index.entry_binary(rev)
    new_index_file.write(entry_bin)
    revlog._docket.index_end = new_index_file.tell()
    revlog._docket.data_end = new_data_file.tell()