# censor code related to censoring revision
# coding: utf8
#
# Copyright 2021 Pierre-Yves David <pierre-yves.david@octobus.net>
# Copyright 2015 Google, Inc <martinvonz@google.com>
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.
import contextlib | ||||
import os | ||||
from ..node import ( | ||||
nullrev, | ||||
) | ||||
from .constants import ( | ||||
COMP_MODE_PLAIN, | ||||
ENTRY_DATA_COMPRESSED_LENGTH, | ||||
ENTRY_DATA_COMPRESSION_MODE, | ||||
ENTRY_DATA_OFFSET, | ||||
ENTRY_DATA_UNCOMPRESSED_LENGTH, | ||||
ENTRY_DELTA_BASE, | ||||
ENTRY_LINK_REV, | ||||
ENTRY_NODE_ID, | ||||
ENTRY_PARENT_1, | ||||
ENTRY_PARENT_2, | ||||
ENTRY_SIDEDATA_COMPRESSED_LENGTH, | ||||
ENTRY_SIDEDATA_COMPRESSION_MODE, | ||||
ENTRY_SIDEDATA_OFFSET, | ||||
REVLOGV0, | ||||
REVLOGV1, | ||||
) | ||||
from ..i18n import _ | ||||
from .. import ( | ||||
error, | ||||
pycompat, | ||||
revlogutils, | ||||
util, | ||||
) | ||||
from ..utils import ( | ||||
storageutil, | ||||
) | ||||
from . import ( | ||||
constants, | ||||
deltas, | ||||
) | ||||
def v1_censor(rl, tr, censornode, tombstone=b''):
    """censors a revision in a "version 1" revlog"""
    assert rl._format_version == constants.REVLOGV1, rl._format_version

    # imported here to break an import cycle
    from .. import revlog

    target_rev = rl.rev(censornode)
    tombstone = storageutil.packmeta({b'censored': tombstone}, b'')

    # Rewriting the revlog in place is hard. Our strategy for censoring is
    # to create a new revlog, copy all revisions to it, then replace the
    # revlogs on transaction close.
    #
    # This is a bit dangerous. We could easily have a mismatch of state.
    replacement = revlog.revlog(
        rl.opener,
        target=rl.target,
        radix=rl.radix,
        postfix=b'tmpcensored',
        censorable=True,
    )
    # mirror the source revlog's format knobs on the replacement
    replacement._format_version = rl._format_version
    replacement._format_flags = rl._format_flags
    replacement._generaldelta = rl._generaldelta
    replacement._parse_index = rl._parse_index

    for rev in rl.revs():
        cur_node = rl.node(rev)
        p1, p2 = rl.parents(cur_node)

        if rev != target_rev:
            # a regular revision: replay it into the replacement revlog
            if rl.iscensored(rev):
                if rl.deltaparent(rev) != nullrev:
                    m = _(
                        b'cannot censor due to censored '
                        b'revision having delta stored'
                    )
                    raise error.Abort(m)
                # an already-censored revision is copied raw, without
                # reinterpreting its (tombstone) payload
                rawtext = rl._chunk(rev)
            else:
                rawtext = rl.rawdata(rev)

            replacement.addrawrevision(
                rawtext, tr, rl.linkrev(rev), p1, p2, cur_node, rl.flags(rev)
            )
            continue

        # the revision being censored: store the tombstone in its place
        replacement.addrawrevision(
            tombstone,
            tr,
            rl.linkrev(target_rev),
            p1,
            p2,
            censornode,
            constants.REVIDX_ISCENSORED,
        )

        if replacement.deltaparent(rev) != nullrev:
            m = _(b'censored revision stored as delta; cannot censor')
            h = _(
                b'censoring of revlogs is not fully implemented;'
                b' please report this bug'
            )
            raise error.Abort(m, hint=h)

    # back up the old files, then swap the rewritten ones into place
    tr.addbackup(rl._indexfile, location=b'store')
    if not rl._inline:
        tr.addbackup(rl._datafile, location=b'store')

    rl.opener.rename(replacement._indexfile, rl._indexfile)
    if not rl._inline:
        rl.opener.rename(replacement._datafile, rl._datafile)

    rl.clearcaches()
    rl._loadindex()
def v2_censor(revlog, tr, censornode, tombstone=b''):
    """censors a revision in a "version 2" revlog"""
    # only the docket-based (non-v0/v1) formats are supported here
    fmt = revlog._format_version
    assert fmt != REVLOGV0, fmt
    assert fmt != REVLOGV1, fmt
    # delegate to the generic multi-revision rewriting logic with a
    # single-revision censor set
    _rewrite_v2(revlog, tr, {revlog.rev(censornode)}, tombstone)
def _rewrite_v2(revlog, tr, censor_revs, tombstone=b''):
    """rewrite a revlog to censor some of its content

    General principle

    We create new revlog files (index/data/sidedata) to copy the content of
    the existing data without the censored data.

    We need to recompute new delta for any revision that used the censored
    revision as delta base. As the cumulative size of the new delta may be
    large, we store them in a temporary file until they are stored in their
    final destination.

    All data before the censored data can be blindly copied. The rest needs
    to be copied as we go and the associated index entry needs adjustment.
    """
    assert revlog._format_version != REVLOGV0, revlog._format_version
    assert revlog._format_version != REVLOGV1, revlog._format_version

    old_index = revlog.index
    docket = revlog._docket

    tombstone = storageutil.packmeta({b'censored': tombstone}, b'')

    # everything strictly before the first censored revision is unaffected;
    # compute where that unaffected prefix ends in each of the three files
    first_excl_rev = min(censor_revs)

    first_excl_entry = revlog.index[first_excl_rev]
    index_cutoff = revlog.index.entry_size * first_excl_rev
    # the offset field packs the revision flags in its low 16 bits; the
    # actual data offset lives in the high bits (see the `& 0xFFFF` users)
    data_cutoff = first_excl_entry[ENTRY_DATA_OFFSET] >> 16
    sidedata_cutoff = revlog.sidedata_cut_off(first_excl_rev)

    with pycompat.unnamedtempfile(mode=b"w+b") as tmp_storage:
        # rev → (new_base, data_start, data_end, compression_mode)
        rewritten_entries = _precompute_rewritten_delta(
            revlog,
            old_index,
            censor_revs,
            tmp_storage,
        )

        all_files = _setup_new_files(
            revlog,
            index_cutoff,
            data_cutoff,
            sidedata_cutoff,
        )

        # we don't need to open the old index file since its content already
        # exists in a usable form in `old_index`.
        with all_files() as open_files:
            (
                old_data_file,
                old_sidedata_file,
                new_index_file,
                new_data_file,
                new_sidedata_file,
            ) = open_files

            # rewrite every revision from the first censored one onward:
            # censored revisions get a tombstone, the others are copied
            # (possibly with a recomputed delta)
            for rev in range(first_excl_rev, len(old_index)):
                if rev in censor_revs:
                    _rewrite_censor(
                        revlog,
                        old_index,
                        open_files,
                        rev,
                        tombstone,
                    )
                else:
                    _rewrite_simple(
                        revlog,
                        old_index,
                        open_files,
                        rev,
                        rewritten_entries,
                        tmp_storage,
                    )
        # persist the docket's updated end markers (they were advanced by
        # the per-revision rewrite helpers above)
        docket.write(transaction=None, stripping=True)
r48257 | ||||
def _precompute_rewritten_delta(
    revlog,
    old_index,
    excluded_revs,
    tmp_storage,
):
    """Compute new delta for revisions whose delta is based on revision that
    will not survive as is.

    Return a mapping: {rev → (new_base, data_start, data_end, compression_mode)}

    The recomputed delta payloads are written to `tmp_storage`;
    (data_start, data_end) locate each one within that file.
    """
    dc = deltas.deltacomputer(revlog)
    rewritten_entries = {}
    first_excl_rev = min(excluded_revs)
    with revlog._segmentfile._open_read() as dfh:
        for rev in range(first_excl_rev, len(old_index)):
            if rev in excluded_revs:
                # this revision will be replaced by a tombstone, so there is
                # no need to compute a replacement delta for it.
                continue
            entry = old_index[rev]
            if entry[ENTRY_DELTA_BASE] not in excluded_revs:
                # its delta base survives untouched; the stored delta can be
                # copied verbatim later.
                continue
            # This is a revision that uses the censored revision as the base
            # for its delta. We need a new delta for it.
            if entry[ENTRY_DATA_UNCOMPRESSED_LENGTH] == 0:
                # this revision is empty, we can delta against nullrev
                rewritten_entries[rev] = (nullrev, 0, 0, COMP_MODE_PLAIN)
            else:
                text = revlog.rawdata(rev, _df=dfh)
                info = revlogutils.revisioninfo(
                    node=entry[ENTRY_NODE_ID],
                    p1=revlog.node(entry[ENTRY_PARENT_1]),
                    p2=revlog.node(entry[ENTRY_PARENT_2]),
                    btext=[text],
                    textlen=len(text),
                    cachedelta=None,
                    # the low 16 bits of the offset field hold the flags
                    flags=entry[ENTRY_DATA_OFFSET] & 0xFFFF,
                )
                # excluded (censored) revisions may not serve as delta base
                d = dc.finddeltainfo(
                    info, dfh, excluded_bases=excluded_revs, target_rev=rev
                )
                default_comp = revlog._docket.default_compression_header
                comp_mode, d = deltas.delta_compression(default_comp, d)
                # using `tell` is a bit lazy, but we are not here for speed
                start = tmp_storage.tell()
                tmp_storage.write(d.data[1])
                end = tmp_storage.tell()
                rewritten_entries[rev] = (d.base, start, end, comp_mode)
    return rewritten_entries
def _setup_new_files(
    revlog,
    index_cutoff,
    data_cutoff,
    sidedata_cutoff,
):
    """
    return a context manager to open all the relevant files:
    - old_data_file,
    - old_sidedata_file,
    - new_index_file,
    - new_data_file,
    - new_sidedata_file,

    The old_index_file is not here because it is accessed through the
    `old_index` object of the caller function.
    """
    docket = revlog._docket
    old_index_filepath = revlog.opener.join(docket.index_filepath())
    old_data_filepath = revlog.opener.join(docket.data_filepath())
    old_sidedata_filepath = revlog.opener.join(docket.sidedata_filepath())
    new_index_filepath = revlog.opener.join(docket.new_index_file())
    new_data_filepath = revlog.opener.join(docket.new_data_file())
    new_sidedata_filepath = revlog.opener.join(docket.new_sidedata_file())

    # seed the new files with the prefix of the old ones that is unaffected
    # by the rewrite (everything before the cutoffs)
    util.copyfile(old_index_filepath, new_index_filepath, nb_bytes=index_cutoff)
    util.copyfile(old_data_filepath, new_data_filepath, nb_bytes=data_cutoff)
    util.copyfile(
        old_sidedata_filepath,
        new_sidedata_filepath,
        nb_bytes=sidedata_cutoff,
    )
    # NOTE(review): presumably this keeps the superseded files tracked by the
    # opener while the rewrite is in flight — confirm against the vfs
    # `register_file` implementation
    revlog.opener.register_file(docket.index_filepath())
    revlog.opener.register_file(docket.data_filepath())
    revlog.opener.register_file(docket.sidedata_filepath())

    # the docket now only covers the copied, pre-censor prefix
    docket.index_end = index_cutoff
    docket.data_end = data_cutoff
    docket.sidedata_end = sidedata_cutoff

    # reload the revlog internal information
    revlog.clearcaches()
    revlog._loadindex(docket=docket)

    @contextlib.contextmanager
    def all_files_opener():
        # hide opening in an helper function to please check-code, black
        # and various python version at the same time
        with open(old_data_filepath, 'rb') as old_data_file:
            with open(old_sidedata_filepath, 'rb') as old_sidedata_file:
                with open(new_index_filepath, 'r+b') as new_index_file:
                    with open(new_data_filepath, 'r+b') as new_data_file:
                        with open(
                            new_sidedata_filepath, 'r+b'
                        ) as new_sidedata_file:
                            # sanity-check that the copied prefixes have the
                            # expected size, and position all writes at their
                            # end
                            new_index_file.seek(0, os.SEEK_END)
                            assert new_index_file.tell() == index_cutoff
                            new_data_file.seek(0, os.SEEK_END)
                            assert new_data_file.tell() == data_cutoff
                            new_sidedata_file.seek(0, os.SEEK_END)
                            assert new_sidedata_file.tell() == sidedata_cutoff

                            yield (
                                old_data_file,
                                old_sidedata_file,
                                new_index_file,
                                new_data_file,
                                new_sidedata_file,
                            )

    return all_files_opener
def _rewrite_simple(
    revlog,
    old_index,
    all_files,
    rev,
    rewritten_entries,
    tmp_storage,
):
    """append a normal revision to the index after the rewritten one(s)"""
    (
        old_data_file,
        old_sidedata_file,
        new_index_file,
        new_data_file,
        new_sidedata_file,
    ) = all_files
    entry = old_index[rev]
    # the offset field packs the revision flags in its low 16 bits and the
    # actual data offset in the remaining high bits
    flags = entry[ENTRY_DATA_OFFSET] & 0xFFFF
    old_data_offset = entry[ENTRY_DATA_OFFSET] >> 16

    if rev not in rewritten_entries:
        # the stored delta is still valid: copy the raw chunk over verbatim
        old_data_file.seek(old_data_offset)
        new_data_size = entry[ENTRY_DATA_COMPRESSED_LENGTH]
        new_data = old_data_file.read(new_data_size)
        data_delta_base = entry[ENTRY_DELTA_BASE]
        d_comp_mode = entry[ENTRY_DATA_COMPRESSION_MODE]
    else:
        # the delta was recomputed (its old base was censored); fetch the new
        # payload from the temporary storage filled by
        # `_precompute_rewritten_delta`
        (
            data_delta_base,
            start,
            end,
            d_comp_mode,
        ) = rewritten_entries[rev]
        new_data_size = end - start
        tmp_storage.seek(start)
        new_data = tmp_storage.read(new_data_size)

    # It might be faster to group continuous read/write operation,
    # however, this is censor, an operation that is not focussed
    # around stellar performance. So I have not written this
    # optimisation yet.
    new_data_offset = new_data_file.tell()
    new_data_file.write(new_data)

    # sidedata, when present, is copied over unchanged
    sidedata_size = entry[ENTRY_SIDEDATA_COMPRESSED_LENGTH]
    new_sidedata_offset = new_sidedata_file.tell()
    if 0 < sidedata_size:
        old_sidedata_offset = entry[ENTRY_SIDEDATA_OFFSET]
        old_sidedata_file.seek(old_sidedata_offset)
        new_sidedata = old_sidedata_file.read(sidedata_size)
        new_sidedata_file.write(new_sidedata)

    data_uncompressed_length = entry[ENTRY_DATA_UNCOMPRESSED_LENGTH]
    sd_com_mode = entry[ENTRY_SIDEDATA_COMPRESSION_MODE]
    assert data_delta_base <= rev, (data_delta_base, rev)

    new_entry = revlogutils.entry(
        flags=flags,
        data_offset=new_data_offset,
        data_compressed_length=new_data_size,
        data_uncompressed_length=data_uncompressed_length,
        data_delta_base=data_delta_base,
        link_rev=entry[ENTRY_LINK_REV],
        parent_rev_1=entry[ENTRY_PARENT_1],
        parent_rev_2=entry[ENTRY_PARENT_2],
        node_id=entry[ENTRY_NODE_ID],
        sidedata_offset=new_sidedata_offset,
        sidedata_compressed_length=sidedata_size,
        data_compression_mode=d_comp_mode,
        sidedata_compression_mode=sd_com_mode,
    )
    revlog.index.append(new_entry)
    entry_bin = revlog.index.entry_binary(rev)
    new_index_file.write(entry_bin)

    # keep the docket's end markers in sync with what has been written
    revlog._docket.index_end = new_index_file.tell()
    revlog._docket.data_end = new_data_file.tell()
    revlog._docket.sidedata_end = new_sidedata_file.tell()
r48259 | ||||
def _rewrite_censor(
    revlog,
    old_index,
    all_files,
    rev,
    tombstone,
):
    """rewrite and append a censored revision"""
    # only the new index and new data files are written to; explicit unused
    # names keep the unpack readable without shadowing the i18n `_`
    (
        _unused_old_data,
        _unused_old_sidedata,
        index_fh,
        data_fh,
        _unused_new_sidedata,
    ) = all_files

    old_entry = old_index[rev]

    # XXX consider trying the default compression too
    tomb_start = data_fh.tell()
    data_fh.write(tombstone)
    tomb_len = len(tombstone)

    # we are not adding any sidedata as they might leak info about the censored version

    censored_entry = revlogutils.entry(
        flags=constants.REVIDX_ISCENSORED,
        data_offset=tomb_start,
        data_compressed_length=tomb_len,
        data_uncompressed_length=tomb_len,
        data_delta_base=rev,
        link_rev=old_entry[ENTRY_LINK_REV],
        parent_rev_1=old_entry[ENTRY_PARENT_1],
        parent_rev_2=old_entry[ENTRY_PARENT_2],
        node_id=old_entry[ENTRY_NODE_ID],
        sidedata_offset=0,
        sidedata_compressed_length=0,
        data_compression_mode=COMP_MODE_PLAIN,
        sidedata_compression_mode=COMP_MODE_PLAIN,
    )
    revlog.index.append(censored_entry)
    index_fh.write(revlog.index.entry_binary(rev))

    # advance the docket's end markers past what was just written
    revlog._docket.index_end = index_fh.tell()
    revlog._docket.data_end = data_fh.tell()