upstream/mercurial-mirror Commit - r48262:d6afe147

censor: extract the part about creating and opening new files in a function...

marmoute -

r48262:d6afe147 default

parent child

mercurial/revlogutils/rewrite.py

0 +79 -54

              # censor code related to censoring revision
              # coding: utf8
              #
              # Copyright 2021 Pierre-Yves David <pierre-yves.david@octobus.net>
              # Copyright 2015 Google, Inc <martinvonz@google.com>
              #
              # This software may be used and distributed according to the terms of the
              # GNU General Public License version 2 or any later version.
              import contextlib
              import os
              from ..node import (
                  nullrev,
              )
              from .constants import (
                  COMP_MODE_PLAIN,
                  ENTRY_DATA_COMPRESSED_LENGTH,
                  ENTRY_DATA_COMPRESSION_MODE,
                  ENTRY_DATA_OFFSET,
                  ENTRY_DATA_UNCOMPRESSED_LENGTH,
                  ENTRY_DELTA_BASE,
                  ENTRY_LINK_REV,
                  ENTRY_NODE_ID,
                  ENTRY_PARENT_1,
                  ENTRY_PARENT_2,
                  ENTRY_SIDEDATA_COMPRESSED_LENGTH,
                  ENTRY_SIDEDATA_COMPRESSION_MODE,
                  ENTRY_SIDEDATA_OFFSET,
                  REVLOGV0,
                  REVLOGV1,
              )
              from ..i18n import _
              from .. import (
                  error,
                  pycompat,
                  revlogutils,
                  util,
              )
              from ..utils import (
                  storageutil,
              )
              from . import (
                  constants,
                  deltas,
              )
              def v1_censor(rl, tr, censornode, tombstone=b''):
                  """Replace the content of ``censornode`` with a tombstone (revlog v1).

                  Every revision of ``rl`` is copied into a temporary revlog created
                  with the ``tmpcensored`` postfix; the revision being censored is
                  stored as the packed tombstone, flagged ``REVIDX_ISCENSORED``. The
                  original files are then registered as transaction backups, the
                  temporary files are renamed over them and ``rl`` reloads its index.

                  Aborts when the tombstone, or an already censored revision, turns
                  out to be stored as a delta.
                  """
                  assert rl._format_version == constants.REVLOGV1, rl._format_version
                  # imported here rather than at module level to avoid an import cycle
                  from .. import revlog

                  doomed_rev = rl.rev(censornode)
                  packed_tombstone = storageutil.packmeta({b'censored': tombstone}, b'')

                  # In-place rewriting of a revlog is hard, so the approach is to build
                  # a brand new revlog holding a copy of every revision and to swap the
                  # files afterwards.
                  #
                  # This is a bit dangerous. We could easily have a mismatch of state.
                  scratch = revlog.revlog(
                      rl.opener,
                      target=rl.target,
                      radix=rl.radix,
                      postfix=b'tmpcensored',
                      censorable=True,
                  )
                  # the copy must use the very same on-disk format as the source
                  scratch._format_version = rl._format_version
                  scratch._format_flags = rl._format_flags
                  scratch._generaldelta = rl._generaldelta
                  scratch._parse_index = rl._parse_index

                  for rev in rl.revs():
                      node = rl.node(rev)
                      p1, p2 = rl.parents(node)

                      if rev != doomed_rev:
                          # any other revision: carry its raw content over unchanged
                          if not rl.iscensored(rev):
                              rawtext = rl.rawdata(rev)
                          elif rl.deltaparent(rev) == nullrev:
                              rawtext = rl._chunk(rev)
                          else:
                              m = _(
                                  b'cannot censor due to censored '
                                  b'revision having delta stored'
                              )
                              raise error.Abort(m)
                          scratch.addrawrevision(
                              rawtext, tr, rl.linkrev(rev), p1, p2, node, rl.flags(rev)
                          )
                          continue

                      # the revision being censored: store the tombstone in its place
                      scratch.addrawrevision(
                          packed_tombstone,
                          tr,
                          rl.linkrev(doomed_rev),
                          p1,
                          p2,
                          censornode,
                          constants.REVIDX_ISCENSORED,
                      )
                      if scratch.deltaparent(rev) == nullrev:
                          continue
                      m = _(b'censored revision stored as delta; cannot censor')
                      h = _(
                          b'censoring of revlogs is not fully implemented;'
                          b' please report this bug'
                      )
                      raise error.Abort(m, hint=h)

                  # back up the current files, then move the rewritten ones over them
                  tr.addbackup(rl._indexfile, location=b'store')
                  if not rl._inline:
                      tr.addbackup(rl._datafile, location=b'store')
                  rl.opener.rename(scratch._indexfile, rl._indexfile)
                  if not rl._inline:
                      rl.opener.rename(scratch._datafile, rl._datafile)

                  # drop everything cached from the old files
                  rl.clearcaches()
                  rl._loadindex()
              def v2_censor(rl, tr, censornode, tombstone=b''):
                  """censors a revision in a "version 2" revlog

                  The revlog content is rewritten into new index/data/sidedata files:
                  everything stored before the censored revision is copied as is, the
                  censored revision is replaced by a tombstone, and every later
                  revision is appended again with an adjusted index entry. The docket
                  is written at the end.

                  `censornode` is the node of the revision to censor and `tombstone`
                  is the message packed under the `censored` metadata key. `tr` is not
                  used by the code visible in this function.
                  """
                  # General principle
                  #
                  # We create new revlog files (index/data/sidedata) to copy the content of
                  # the existing data without the censored data.
                  #
                  # We need to recompute new delta for any revision that used the censored
                  # revision as delta base. As the cumulative size of the new delta may be
                  # large, we store them in a temporary file until they are stored in their
                  # final destination.
                  #
                  # All data before the censored data can be blindly copied. The rest needs
                  # to be copied as we go and the associated index entry needs adjustment.
                  assert rl._format_version != REVLOGV0, rl._format_version
                  assert rl._format_version != REVLOGV1, rl._format_version
                  old_index = rl.index
                  docket = rl._docket
                  censor_rev = rl.rev(censornode)
                  tombstone = storageutil.packmeta({b'censored': tombstone}, b'')
                  censored_entry = rl.index[censor_rev]
                  index_cutoff = rl.index.entry_size * censor_rev
                  data_cutoff = censored_entry[ENTRY_DATA_OFFSET] >> 16
                  sidedata_cutoff = rl.sidedata_cut_off(censor_rev)
                  with pycompat.unnamedtempfile(mode=b"w+b") as tmp_storage:
                      # rev → (new_base, data_start, data_end, compression_mode)
                      rewritten_entries = _precompute_rewritten_delta(
                          rl,
                          old_index,
                          {censor_rev},
                          tmp_storage,
                      )
-                     old_index_filepath = rl.opener.join(docket.index_filepath())
-                     old_data_filepath = rl.opener.join(docket.data_filepath())
-                     old_sidedata_filepath = rl.opener.join(docket.sidedata_filepath())
-                     new_index_filepath = rl.opener.join(docket.new_index_file())
-                     new_data_filepath = rl.opener.join(docket.new_data_file())
-                     new_sidedata_filepath = rl.opener.join(docket.new_sidedata_file())
-                     util.copyfile(
-                         old_index_filepath, new_index_filepath, nb_bytes=index_cutoff
+                     )
-                     util.copyfile(
-                         old_data_filepath, new_data_filepath, nb_bytes=data_cutoff
+                     )
-                     util.copyfile(
-                         old_sidedata_filepath,
-                         new_sidedata_filepath,
-                         nb_bytes=sidedata_cutoff,
+                     )
-                     rl.opener.register_file(docket.index_filepath())
-                     rl.opener.register_file(docket.data_filepath())
-                     rl.opener.register_file(docket.sidedata_filepath())
-                     docket.index_end = index_cutoff
-                     docket.data_end = data_cutoff
-                     docket.sidedata_end = sidedata_cutoff
-                     # reload the revlog internal information
-                     rl.clearcaches()
-                     rl._loadindex(docket=docket)
-                     @contextlib.contextmanager
-                     def all_files():
-                         # hide opening in an helper function to please check-code, black
-                         # and various python ersion at the same time
-                         with open(old_data_filepath, 'rb') as old_data_file:
-                             with open(old_sidedata_filepath, 'rb') as old_sidedata_file:
-                                 with open(new_index_filepath, 'r+b') as new_index_file:
-                                     with open(new_data_filepath, 'r+b') as new_data_file:
-                                         with open(
-                                             new_sidedata_filepath, 'r+b'
-                                         ) as new_sidedata_file:
-                                             yield (
-                                                 old_data_file,
-                                                 old_sidedata_file,
-                                                 new_index_file,
-                                                 new_data_file,
-                                                 new_sidedata_file,
+                     all_files = _setup_new_files(
+                         rl,
+                         index_cutoff,
+                         data_cutoff,
+                         sidedata_cutoff,
                                              )
                      # we don't need to open the old index file since its content already
                      # exists in a usable form in `old_index`.
                      with all_files() as open_files:
                          (
                              old_data_file,
                              old_sidedata_file,
                              new_index_file,
                              new_data_file,
                              new_sidedata_file,
                          ) = open_files
-                         new_index_file.seek(0, os.SEEK_END)
-                         assert new_index_file.tell() == index_cutoff
-                         new_data_file.seek(0, os.SEEK_END)
-                         assert new_data_file.tell() == data_cutoff
-                         new_sidedata_file.seek(0, os.SEEK_END)
-                         assert new_sidedata_file.tell() == sidedata_cutoff
                          # writing the censored revision
                          _rewrite_censor(
                              rl,
                              old_index,
                              open_files,
                              censor_rev,
                              tombstone,
                          )
                          # Writing all subsequent revisions
                          for rev in range(censor_rev + 1, len(old_index)):
                              _rewrite_simple(
                                  rl,
                                  old_index,
                                  open_files,
                                  rev,
                                  rewritten_entries,
                                  tmp_storage,
                              )
                  docket.write(transaction=None, stripping=True)
              def _precompute_rewritten_delta(
                  revlog,
                  old_index,
                  excluded_revs,
                  tmp_storage,
              ):
                  """Recompute the deltas that used an excluded revision as their base.

                  Every revision from the lowest excluded one up to the end of
                  ``old_index`` is inspected. Those whose delta base is part of
                  ``excluded_revs`` get a new delta whose payload is appended to
                  ``tmp_storage``; empty revisions get a payload-less entry based on
                  ``nullrev`` instead.

                  Return a mapping: {rev → (new_base, data_start, data_end, compression_mode)}
                  where ``data_start`` and ``data_end`` are offsets into ``tmp_storage``.
                  """
                  delta_computer = deltas.deltacomputer(revlog)
                  new_deltas = {}
                  lowest_excluded = min(excluded_revs)
                  with revlog._segmentfile._open_read() as dfh:
                      for rev in range(lowest_excluded, len(old_index)):
                          if rev in excluded_revs:
                              # excluded revisions never get a recomputed delta here
                              continue
                          entry = old_index[rev]
                          if entry[ENTRY_DELTA_BASE] not in excluded_revs:
                              # the stored delta does not rely on an excluded revision
                              continue

                          # From here on, the revision uses an excluded revision as
                          # the base of its delta, so a replacement is required.
                          if entry[ENTRY_DATA_UNCOMPRESSED_LENGTH] == 0:
                              # an empty revision can simply delta against nullrev
                              new_deltas[rev] = (nullrev, 0, 0, COMP_MODE_PLAIN)
                              continue

                          raw_text = revlog.rawdata(rev, _df=dfh)
                          rev_info = revlogutils.revisioninfo(
                              node=entry[ENTRY_NODE_ID],
                              p1=revlog.node(entry[ENTRY_PARENT_1]),
                              p2=revlog.node(entry[ENTRY_PARENT_2]),
                              btext=[raw_text],
                              textlen=len(raw_text),
                              cachedelta=None,
                              flags=entry[ENTRY_DATA_OFFSET] & 0xFFFF,
                          )
                          delta_info = delta_computer.finddeltainfo(
                              rev_info,
                              dfh,
                              excluded_bases=excluded_revs,
                              target_rev=rev,
                          )
                          header = revlog._docket.default_compression_header
                          comp_mode, delta_info = deltas.delta_compression(
                              header, delta_info
                          )
                          # relying on `tell` is a bit lazy, but speed is not a goal
                          payload_start = tmp_storage.tell()
                          tmp_storage.write(delta_info.data[1])
                          payload_end = tmp_storage.tell()
                          new_deltas[rev] = (
                              delta_info.base,
                              payload_start,
                              payload_end,
                              comp_mode,
                          )
                  return new_deltas
+             def _setup_new_files(
+                 revlog,
+                 index_cutoff,
+                 data_cutoff,
+                 sidedata_cutoff,
+             ):
+                 """
+                 return a context manager to open all the relevant files:
+                 - old_data_file,
+                 - old_sidedata_file,
+                 - new_index_file,
+                 - new_data_file,
+                 - new_sidedata_file,
+                 The old_index_file is not here because it is accessed through the
+                 `old_index` object of the caller function.
+                 Before returning, the first `index_cutoff`, `data_cutoff` and
+                 `sidedata_cutoff` bytes of the old index, data and sidedata files
+                 are copied into newly named files, the docket end offsets are set
+                 to these cutoffs and the revlog index is reloaded from the docket.
+                 When entered, the context manager positions each new file at its
+                 end and asserts that this matches the corresponding cutoff.
+                 """
+                 docket = revlog._docket
+                 old_index_filepath = revlog.opener.join(docket.index_filepath())
+                 old_data_filepath = revlog.opener.join(docket.data_filepath())
+                 old_sidedata_filepath = revlog.opener.join(docket.sidedata_filepath())
+                 new_index_filepath = revlog.opener.join(docket.new_index_file())
+                 new_data_filepath = revlog.opener.join(docket.new_data_file())
+                 new_sidedata_filepath = revlog.opener.join(docket.new_sidedata_file())
+                 util.copyfile(old_index_filepath, new_index_filepath, nb_bytes=index_cutoff)
+                 util.copyfile(old_data_filepath, new_data_filepath, nb_bytes=data_cutoff)
+                 util.copyfile(
+                     old_sidedata_filepath,
+                     new_sidedata_filepath,
+                     nb_bytes=sidedata_cutoff,
+                 )
+                 revlog.opener.register_file(docket.index_filepath())
+                 revlog.opener.register_file(docket.data_filepath())
+                 revlog.opener.register_file(docket.sidedata_filepath())
+                 docket.index_end = index_cutoff
+                 docket.data_end = data_cutoff
+                 docket.sidedata_end = sidedata_cutoff
+                 # reload the revlog internal information
+                 revlog.clearcaches()
+                 revlog._loadindex(docket=docket)
+                 @contextlib.contextmanager
+                 def all_files_opener():
+                     # hide opening in a helper function to please check-code, black
+                     # and various python versions at the same time
+                     with open(old_data_filepath, 'rb') as old_data_file:
+                         with open(old_sidedata_filepath, 'rb') as old_sidedata_file:
+                             with open(new_index_filepath, 'r+b') as new_index_file:
+                                 with open(new_data_filepath, 'r+b') as new_data_file:
+                                     with open(
+                                         new_sidedata_filepath, 'r+b'
+                                     ) as new_sidedata_file:
+                                         new_index_file.seek(0, os.SEEK_END)
+                                         assert new_index_file.tell() == index_cutoff
+                                         new_data_file.seek(0, os.SEEK_END)
+                                         assert new_data_file.tell() == data_cutoff
+                                         new_sidedata_file.seek(0, os.SEEK_END)
+                                         assert new_sidedata_file.tell() == sidedata_cutoff
+                                         yield (
+                                             old_data_file,
+                                             old_sidedata_file,
+                                             new_index_file,
+                                             new_data_file,
+                                             new_sidedata_file,
+                                         )
+                 return all_files_opener
              def _rewrite_simple(
                  revlog,
                  old_index,
                  all_files,
                  rev,
                  rewritten_entries,
                  tmp_storage,
              ):
                  """Append revision ``rev`` to the new files, after the rewritten one(s).

                  The data payload comes from ``tmp_storage`` when ``rev`` has an entry
                  in ``rewritten_entries``, and from the old data file otherwise.
                  Sidedata, if any, is copied from the old sidedata file. A new index
                  entry pointing at the new offsets is appended to ``revlog.index`` and
                  written to the new index file, then the docket end offsets are
                  refreshed from the current file positions.
                  """
                  (
                      old_data_file,
                      old_sidedata_file,
                      new_index_file,
                      new_data_file,
                      new_sidedata_file,
                  ) = all_files
                  entry = old_index[rev]
                  # offset and flags share one field: flags live in the low 16 bits
                  offset_and_flags = entry[ENTRY_DATA_OFFSET]
                  flags = offset_and_flags & 0xFFFF

                  if rev in rewritten_entries:
                      # a replacement delta was precomputed into the temporary storage
                      data_delta_base, start, end, d_comp_mode = rewritten_entries[rev]
                      new_data_size = end - start
                      tmp_storage.seek(start)
                      new_data = tmp_storage.read(new_data_size)
                  else:
                      # the stored chunk is still valid: copy it verbatim
                      old_data_file.seek(offset_and_flags >> 16)
                      new_data_size = entry[ENTRY_DATA_COMPRESSED_LENGTH]
                      new_data = old_data_file.read(new_data_size)
                      data_delta_base = entry[ENTRY_DELTA_BASE]
                      d_comp_mode = entry[ENTRY_DATA_COMPRESSION_MODE]

                  # Grouping contiguous reads and writes would probably be faster, but
                  # censoring is not an operation where stellar performance matters, so
                  # that optimisation has not been written.
                  new_data_offset = new_data_file.tell()
                  new_data_file.write(new_data)

                  sidedata_size = entry[ENTRY_SIDEDATA_COMPRESSED_LENGTH]
                  new_sidedata_offset = new_sidedata_file.tell()
                  if sidedata_size > 0:
                      old_sidedata_file.seek(entry[ENTRY_SIDEDATA_OFFSET])
                      new_sidedata_file.write(old_sidedata_file.read(sidedata_size))

                  data_uncompressed_length = entry[ENTRY_DATA_UNCOMPRESSED_LENGTH]
                  sd_com_mode = entry[ENTRY_SIDEDATA_COMPRESSION_MODE]
                  assert data_delta_base <= rev, (data_delta_base, rev)

                  revlog.index.append(
                      revlogutils.entry(
                          flags=flags,
                          data_offset=new_data_offset,
                          data_compressed_length=new_data_size,
                          data_uncompressed_length=data_uncompressed_length,
                          data_delta_base=data_delta_base,
                          link_rev=entry[ENTRY_LINK_REV],
                          parent_rev_1=entry[ENTRY_PARENT_1],
                          parent_rev_2=entry[ENTRY_PARENT_2],
                          node_id=entry[ENTRY_NODE_ID],
                          sidedata_offset=new_sidedata_offset,
                          sidedata_compressed_length=sidedata_size,
                          data_compression_mode=d_comp_mode,
                          sidedata_compression_mode=sd_com_mode,
                      )
                  )
                  new_index_file.write(revlog.index.entry_binary(rev))

                  # record how far each new file has been written
                  docket = revlog._docket
                  docket.index_end = new_index_file.tell()
                  docket.data_end = new_data_file.tell()
                  docket.sidedata_end = new_sidedata_file.tell()
              def _rewrite_censor(
                  revlog,
                  old_index,
                  all_files,
                  rev,
                  tombstone,
              ):
                  """Write ``tombstone`` as the new content of revision ``rev``.

                  The tombstone is appended uncompressed to the new data file and a
                  matching index entry, flagged ``REVIDX_ISCENSORED`` and using ``rev``
                  itself as delta base, is appended to ``revlog.index`` and to the new
                  index file. Link revision, parents and node id are taken from the old
                  entry. The docket index and data end offsets are then refreshed.
                  """
                  (
                      _old_data_file,
                      _old_sidedata_file,
                      new_index_file,
                      new_data_file,
                      _new_sidedata_file,
                  ) = all_files
                  old_entry = old_index[rev]

                  # XXX consider trying the default compression too
                  tombstone_size = len(tombstone)
                  tombstone_offset = new_data_file.tell()
                  new_data_file.write(tombstone)

                  # no sidedata is written, as it might leak info about the censored
                  # version
                  censored_entry = revlogutils.entry(
                      flags=constants.REVIDX_ISCENSORED,
                      data_offset=tombstone_offset,
                      data_compressed_length=tombstone_size,
                      data_uncompressed_length=tombstone_size,
                      data_delta_base=rev,
                      link_rev=old_entry[ENTRY_LINK_REV],
                      parent_rev_1=old_entry[ENTRY_PARENT_1],
                      parent_rev_2=old_entry[ENTRY_PARENT_2],
                      node_id=old_entry[ENTRY_NODE_ID],
                      sidedata_offset=0,
                      sidedata_compressed_length=0,
                      data_compression_mode=COMP_MODE_PLAIN,
                      sidedata_compression_mode=COMP_MODE_PLAIN,
                  )
                  revlog.index.append(censored_entry)
                  new_index_file.write(revlog.index.entry_binary(rev))

                  # record how far the new index and data files have been written
                  docket = revlog._docket
                  docket.index_end = new_index_file.tell()
                  docket.data_end = new_data_file.tell()

General Comments 0

Write
Preview

You need to be logged in to leave comments. Login now

No TODOs yet

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages