censor: implement censoring for revlogv2...
marmoute
r48250:f8330a3f default
@@ -3070,11 +3070,7 b' class revlog(object):'
3070 3070 elif self._format_version == REVLOGV1:
3071 3071 censor.v1_censor(self, tr, censornode, tombstone)
3072 3072 else:
3073 # revlog v2
3074 raise error.RevlogError(
3075 _(b'cannot censor with version %d revlogs')
3076 % self._format_version
3077 )
3073 censor.v2_censor(self, tr, censornode, tombstone)
3078 3074
3079 3075 def verifyintegrity(self, state):
3080 3076 """Verifies the integrity of the revlog.
@@ -1,4 +1,5 b''
1 1 # censor code related to censoring revision
2 # coding: utf8
2 3 #
3 4 # Copyright 2021 Pierre-Yves David <pierre-yves.david@octobus.net>
4 5 # Copyright 2015 Google, Inc <martinvonz@google.com>
@@ -6,17 +7,44 b''
6 7 # This software may be used and distributed according to the terms of the
7 8 # GNU General Public License version 2 or any later version.
8 9
10 import contextlib
11 import os
12
9 13 from ..node import (
10 14 nullrev,
11 15 )
16 from .constants import (
17 COMP_MODE_PLAIN,
18 ENTRY_DATA_COMPRESSED_LENGTH,
19 ENTRY_DATA_COMPRESSION_MODE,
20 ENTRY_DATA_OFFSET,
21 ENTRY_DATA_UNCOMPRESSED_LENGTH,
22 ENTRY_DELTA_BASE,
23 ENTRY_LINK_REV,
24 ENTRY_NODE_ID,
25 ENTRY_PARENT_1,
26 ENTRY_PARENT_2,
27 ENTRY_SIDEDATA_COMPRESSED_LENGTH,
28 ENTRY_SIDEDATA_COMPRESSION_MODE,
29 ENTRY_SIDEDATA_OFFSET,
30 REVLOGV0,
31 REVLOGV1,
32 )
12 33 from ..i18n import _
34
13 35 from .. import (
14 36 error,
37 pycompat,
38 revlogutils,
39 util,
15 40 )
16 41 from ..utils import (
17 42 storageutil,
18 43 )
19 from . import constants
44 from . import (
45 constants,
46 deltas,
47 )
20 48
21 49
22 50 def v1_censor(rl, tr, censornode, tombstone=b''):
@@ -95,3 +123,237 b' def v1_censor(rl, tr, censornode, tombst'
95 123
96 124 rl.clearcaches()
97 125 rl._loadindex()
126
127
128 def v2_censor(rl, tr, censornode, tombstone=b''):
129 """censors a revision in a "version 2" revlog"""
130 # General principle
131 #
132 # We create new revlog files (index/data/sidedata) to copy the content of
133 # the existing data without the censored data.
134 #
135 # We need to recompute a new delta for any revision that used the censored
136 # revision as delta base. As the cumulative size of the new deltas may be
137 # large, we store them in a temporary file until they are stored in their
138 # final destination.
139 #
140 # All data before the censored data can be blindly copied. The rest needs
141 # to be copied as we go and the associated index entry needs adjustment.
142
143 assert rl._format_version != REVLOGV0, rl._format_version
144 assert rl._format_version != REVLOGV1, rl._format_version
145
146 old_index = rl.index
147 docket = rl._docket
148
149 censor_rev = rl.rev(censornode)
150 tombstone = storageutil.packmeta({b'censored': tombstone}, b'')
151
152 censored_entry = rl.index[censor_rev]
153 index_cutoff = rl.index.entry_size * censor_rev
154 data_cutoff = censored_entry[ENTRY_DATA_OFFSET] >> 16
155 sidedata_cutoff = rl.sidedata_cut_off(censor_rev)
156
157 # rev → (new_base, data_start, data_end, compression_mode)
158 rewritten_entries = {}
159
160 dc = deltas.deltacomputer(rl)
161 excl = [censor_rev]
162
163 with pycompat.unnamedtempfile(mode=b"w+b") as tmp_storage:
164 with rl._segmentfile._open_read() as dfh:
165 for rev in range(censor_rev + 1, len(old_index)):
166 entry = old_index[rev]
167 if censor_rev != entry[ENTRY_DELTA_BASE]:
168 continue
169 # This is a revision that uses the censored revision as the base
170 # for its delta. We need to compute a new delta for it.
171 if entry[ENTRY_DATA_UNCOMPRESSED_LENGTH] == 0:
172 # this revision is empty, we can delta against nullrev
173 rewritten_entries[rev] = (nullrev, 0, 0, COMP_MODE_PLAIN)
174 else:
175
176 text = rl.rawdata(rev, _df=dfh)
177 info = revlogutils.revisioninfo(
178 node=entry[ENTRY_NODE_ID],
179 p1=rl.node(entry[ENTRY_PARENT_1]),
180 p2=rl.node(entry[ENTRY_PARENT_2]),
181 btext=[text],
182 textlen=len(text),
183 cachedelta=None,
184 flags=entry[ENTRY_DATA_OFFSET] & 0xFFFF,
185 )
186 d = dc.finddeltainfo(
187 info, dfh, excluded_bases=excl, target_rev=rev
188 )
189 default_comp = rl._docket.default_compression_header
190 comp_mode, d = deltas.delta_compression(default_comp, d)
191 # using `tell` is a bit lazy, but we are not here for speed
192 start = tmp_storage.tell()
193 tmp_storage.write(d.data[1])
194 end = tmp_storage.tell()
195 rewritten_entries[rev] = (d.base, start, end, comp_mode)
196
197 old_index_filepath = rl.opener.join(docket.index_filepath())
198 old_data_filepath = rl.opener.join(docket.data_filepath())
199 old_sidedata_filepath = rl.opener.join(docket.sidedata_filepath())
200
201 new_index_filepath = rl.opener.join(docket.new_index_file())
202 new_data_filepath = rl.opener.join(docket.new_data_file())
203 new_sidedata_filepath = rl.opener.join(docket.new_sidedata_file())
204
205 util.copyfile(
206 old_index_filepath, new_index_filepath, nb_bytes=index_cutoff
207 )
208 util.copyfile(
209 old_data_filepath, new_data_filepath, nb_bytes=data_cutoff
210 )
211 util.copyfile(
212 old_sidedata_filepath,
213 new_sidedata_filepath,
214 nb_bytes=sidedata_cutoff,
215 )
216 rl.opener.register_file(docket.index_filepath())
217 rl.opener.register_file(docket.data_filepath())
218 rl.opener.register_file(docket.sidedata_filepath())
219
220 docket.index_end = index_cutoff
221 docket.data_end = data_cutoff
222 docket.sidedata_end = sidedata_cutoff
223
224 # reload the revlog internal information
225 rl.clearcaches()
226 rl._loadindex(docket=docket)
227
228 @contextlib.contextmanager
229 def all_files():
230 # hide opening in a helper function to please check-code, black
231 # and various python versions at the same time
232 with open(old_data_filepath, 'rb') as old_data_file:
233 with open(old_sidedata_filepath, 'rb') as old_sidedata_file:
234 with open(new_index_filepath, 'r+b') as new_index_file:
235 with open(new_data_filepath, 'r+b') as new_data_file:
236 with open(
237 new_sidedata_filepath, 'r+b'
238 ) as new_sidedata_file:
239 yield (
240 old_data_file,
241 old_sidedata_file,
242 new_index_file,
243 new_data_file,
244 new_sidedata_file,
245 )
246
247 # we don't need to open the old index file since its content already
248 # exists in a usable form in `old_index`.
249 with all_files() as (
250 old_data_file,
251 old_sidedata_file,
252 new_index_file,
253 new_data_file,
254 new_sidedata_file,
255 ):
256 new_index_file.seek(0, os.SEEK_END)
257 assert new_index_file.tell() == index_cutoff
258 new_data_file.seek(0, os.SEEK_END)
259 assert new_data_file.tell() == data_cutoff
260 new_sidedata_file.seek(0, os.SEEK_END)
261 assert new_sidedata_file.tell() == sidedata_cutoff
262
263 ### writing the censored revision
264 entry = old_index[censor_rev]
265
266 # XXX consider trying the default compression too
267 new_data_size = len(tombstone)
268 new_data_offset = new_data_file.tell()
269 new_data_file.write(tombstone)
270
271 # we are not adding any sidedata as they might leak info about the censored version
272
273 new_entry = revlogutils.entry(
274 flags=constants.REVIDX_ISCENSORED,
275 data_offset=new_data_offset,
276 data_compressed_length=new_data_size,
277 data_uncompressed_length=new_data_size,
278 data_delta_base=censor_rev,
279 link_rev=entry[ENTRY_LINK_REV],
280 parent_rev_1=entry[ENTRY_PARENT_1],
281 parent_rev_2=entry[ENTRY_PARENT_2],
282 node_id=entry[ENTRY_NODE_ID],
283 sidedata_offset=0,
284 sidedata_compressed_length=0,
285 data_compression_mode=COMP_MODE_PLAIN,
286 sidedata_compression_mode=COMP_MODE_PLAIN,
287 )
288 rl.index.append(new_entry)
289 entry_bin = rl.index.entry_binary(censor_rev)
290 new_index_file.write(entry_bin)
291 docket.index_end = new_index_file.tell()
292 docket.data_end = new_data_file.tell()
293
294 #### Writing all subsequent revisions
295 for rev in range(censor_rev + 1, len(old_index)):
296 entry = old_index[rev]
297 flags = entry[ENTRY_DATA_OFFSET] & 0xFFFF
298 old_data_offset = entry[ENTRY_DATA_OFFSET] >> 16
299
300 if rev not in rewritten_entries:
301 old_data_file.seek(old_data_offset)
302 new_data_size = entry[ENTRY_DATA_COMPRESSED_LENGTH]
303 new_data = old_data_file.read(new_data_size)
304 data_delta_base = entry[ENTRY_DELTA_BASE]
305 d_comp_mode = entry[ENTRY_DATA_COMPRESSION_MODE]
306 else:
307 (
308 data_delta_base,
309 start,
310 end,
311 d_comp_mode,
312 ) = rewritten_entries[rev]
313 new_data_size = end - start
314 tmp_storage.seek(start)
315 new_data = tmp_storage.read(new_data_size)
316
317 # It might be faster to group contiguous read/write operations;
318 # however, censor is an operation that is not focused on
319 # stellar performance, so this optimisation has not been
320 # written yet.
321 new_data_offset = new_data_file.tell()
322 new_data_file.write(new_data)
323
324 sidedata_size = entry[ENTRY_SIDEDATA_COMPRESSED_LENGTH]
325 new_sidedata_offset = new_sidedata_file.tell()
326 if 0 < sidedata_size:
327 old_sidedata_offset = entry[ENTRY_SIDEDATA_OFFSET]
328 old_sidedata_file.seek(old_sidedata_offset)
329 new_sidedata = old_sidedata_file.read(sidedata_size)
330 new_sidedata_file.write(new_sidedata)
331
332 data_uncompressed_length = entry[ENTRY_DATA_UNCOMPRESSED_LENGTH]
333 sd_com_mode = entry[ENTRY_SIDEDATA_COMPRESSION_MODE]
334 assert data_delta_base <= rev, (data_delta_base, rev)
335
336 new_entry = revlogutils.entry(
337 flags=flags,
338 data_offset=new_data_offset,
339 data_compressed_length=new_data_size,
340 data_uncompressed_length=data_uncompressed_length,
341 data_delta_base=data_delta_base,
342 link_rev=entry[ENTRY_LINK_REV],
343 parent_rev_1=entry[ENTRY_PARENT_1],
344 parent_rev_2=entry[ENTRY_PARENT_2],
345 node_id=entry[ENTRY_NODE_ID],
346 sidedata_offset=new_sidedata_offset,
347 sidedata_compressed_length=sidedata_size,
348 data_compression_mode=d_comp_mode,
349 sidedata_compression_mode=sd_com_mode,
350 )
351 rl.index.append(new_entry)
352 entry_bin = rl.index.entry_binary(rev)
353 new_index_file.write(entry_bin)
354
355 docket.index_end = new_index_file.tell()
356 docket.data_end = new_data_file.tell()
357 docket.sidedata_end = new_sidedata_file.tell()
358
359 docket.write(transaction=None, stripping=True)
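
A note on the bit manipulation above: in the revlog index, the first field of an entry packs the data offset and the 16 revision flag bits into a single integer, which is why `v2_censor` extracts `data_cutoff` with `>> 16` and the flags with `& 0xFFFF`. A minimal illustration of that convention (helper names are ours, not Mercurial API):

    def pack_offset_flags(offset, flags):
        # the offset lives in the high bits, the low 16 bits carry the flags
        return (offset << 16) | (flags & 0xFFFF)

    def unpack_offset_flags(field):
        return field >> 16, field & 0xFFFF

    # mirrors the data_cutoff / flags extraction done in v2_censor
    assert unpack_offset_flags(pack_offset_flags(1024, 0x1)) == (1024, 0x1)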
@@ -1070,7 +1070,7 b' class deltacomputer(object):'
1070 1070 context.
1071 1071 """
1072 1072 if target_rev is None:
1073 curr = len(self.revlog)
1073 target_rev = len(self.revlog)
1074 1074
1075 1075 if not revinfo.textlen:
1076 1076 return self._fullsnapshotinfo(fh, revinfo, target_rev)
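
The one-line change above fixes the `target_rev` fallback in `deltacomputer.finddeltainfo`: the old code assigned the fallback to a local `curr` instead of to `target_rev` itself, so `target_rev` could remain `None` further down. A minimal sketch of the corrected pattern (surrounding signature elided):

    # inside deltacomputer.finddeltainfo
    if target_rev is None:
        # default to the revision about to be appended
        target_rev = len(self.revlog)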
@@ -1,4 +1,14 b''
1 1 #require no-reposimplestore
2 #testcases revlogv1 revlogv2
3
4 #if revlogv2
5
6 $ cat >> $HGRCPATH <<EOF
7 > [experimental]
8 > revlogv2=enable-unstable-format-and-corrupt-my-data
9 > EOF
10
11 #endif
2 12
3 13 $ cat >> $HGRCPATH <<EOF
4 14 > [extensions]
@@ -505,3 +515,51 b' Can import bundle where first revision o'
505 515 new changesets e97f55b2665a (1 drafts)
506 516 (run 'hg update' to get a working copy)
507 517 $ hg cat -r 0 target | head -n 10
518
519 #if revlogv2
520
521 Testing feature that does not work in revlog v1
522 ===============================================
523
524 Censoring a revision that is used as delta base
525 -----------------------------------------------
526
527 $ cd ..
528 $ hg init censor-with-delta
529 $ cd censor-with-delta
530 $ echo root > target
531 $ hg add target
532 $ hg commit -m root
533 $ B0=`hg id --debug -i`
534 $ for x in `"$PYTHON" $TESTDIR/seq.py 0 50000`
535 > do
536 > echo "Password: hunter$x" >> target
537 > done
538 $ hg ci -m 'write a long file'
539 $ B1=`hg id --debug -i`
540 $ echo 'small change (should create a delta)' >> target
541 $ hg ci -m 'create a delta over the password'
542 (should show that the last revision is a delta, not a snapshot)
543 $ B2=`hg id --debug -i`
544
545 Make sure the last revision is a delta against the revision we will censor
546
547 $ hg debugdeltachain target -T '{rev} {chainid} {chainlen} {prevrev}\n'
548 0 1 1 -1
549 1 2 1 -1
550 2 2 2 1
551
552 Censor the file
553
554 $ hg cat -r $B1 target | wc -l
555 50002 (re)
556 $ hg censor -r $B1 target
557 $ hg cat -r $B1 target | wc -l
558 0 (re)
559
560 Check the children is fine
561
562 $ hg cat -r $B2 target | wc -l
563 50003 (re)
564
565 #endif