##// END OF EJS Templates
errors: name arguments to Abort constructor...
errors: name arguments to Abort constructor Differential Revision: https://phab.mercurial-scm.org/D9179

File last commit:

r46243:e1f5a232 default
r46274:d2e1dcd4 default
Show More
metadata.py
918 lines | 32.3 KiB | text/x-python | PythonLexer
changing-files: add clean computation of changed file for merges...
r46260 # coding: utf8
metadata: move computation related to files touched in a dedicated module...
r45466 # metadata.py -- code related to various metadata computation and access.
#
# Copyright 2019 Google, Inc <martinvonz@google.com>
# Copyright 2020 Pierre-Yves David <pierre-yves.david@octobus.net>
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.
from __future__ import absolute_import, print_function
import multiprocessing
changing-files: rework the way we store changed files in side-data...
r46211 import struct
metadata: move computation related to files touched in a dedicated module...
r45466
from . import (
error,
files: extract code for extra filtering of the `removed` entry into copies...
r45467 node,
metadata: move computation related to files touched in a dedicated module...
r45466 pycompat,
util,
)
from .revlogutils import (
flagutil as sidedataflag,
sidedata as sidedatamod,
)
commitctx: return a richer object from _prepare_files...
class ChangingFiles(object):
    """A class recording the changes made to files by a changeset

    Actions performed on files are gathered into 3 sets:

    - added:    files actively added in the changeset.
    - merged:   files whose history got merged
    - removed:  files removed in the revision
    - salvaged: files that might have been deleted by a merge but were not
    - touched:  files affected by the merge

    and copies information is held by 2 mappings

    - copied_from_p1: {"<new-name>": "<source-name-in-p1>"} mapping for copies
    - copied_from_p2: {"<new-name>": "<source-name-in-p2>"} mapping for copies

    See their inline help for details.
    """

    def __init__(
        self,
        touched=None,
        added=None,
        removed=None,
        merged=None,
        salvaged=None,
        p1_copies=None,
        p2_copies=None,
    ):
        self._added = set() if added is None else set(added)
        self._merged = set() if merged is None else set(merged)
        self._removed = set() if removed is None else set(removed)
        self._touched = set() if touched is None else set(touched)
        self._salvaged = set() if salvaged is None else set(salvaged)
        # anything added, merged or removed is, by definition, also touched
        self._touched |= self._added
        self._touched |= self._merged
        self._touched |= self._removed
        self._p1_copies = {} if p1_copies is None else dict(p1_copies)
        self._p2_copies = {} if p2_copies is None else dict(p2_copies)

    def __eq__(self, other):
        return (
            self.added == other.added
            and self.merged == other.merged
            and self.removed == other.removed
            and self.salvaged == other.salvaged
            and self.touched == other.touched
            and self.copied_from_p1 == other.copied_from_p1
            and self.copied_from_p2 == other.copied_from_p2
        )

    def _invalidate(self, attr):
        # drop the value cached by util.propertycache (stored in the
        # instance dict), if any, so the next read recomputes it
        vars(self).pop(attr, None)

    @util.propertycache
    def added(self):
        """files actively added in the changeset

        Any file present in that revision that was absent in all the
        changeset's parents.

        In case of merge, this means a file absent in one of the parents but
        existing in the other will *not* be contained in this set. (They were
        added by an ancestor)
        """
        return frozenset(self._added)

    def mark_added(self, filename):
        self._invalidate('added')
        self._added.add(filename)
        self.mark_touched(filename)

    def update_added(self, filenames):
        for f in filenames:
            self.mark_added(f)

    @util.propertycache
    def merged(self):
        """files actively merged during a merge

        Any modified files which had modification on both size that needed
        merging.

        In this case a new filenode was created and it has two parents.
        """
        return frozenset(self._merged)

    def mark_merged(self, filename):
        self._invalidate('merged')
        self._merged.add(filename)
        self.mark_touched(filename)

    def update_merged(self, filenames):
        for f in filenames:
            self.mark_merged(f)

    @util.propertycache
    def removed(self):
        """files actively removed by the changeset

        In case of merge this will only contain the set of files removing "new"
        content. For any file absent in the current changeset:

        a) If the file exists in both parents, it is clearly "actively" removed
        by this changeset.

        b) If a file exists in only one parent and in none of the common
        ancestors, then the file was newly added in one of the merged branches
        and then got "actively" removed.

        c) If a file exists in only one parent and at least one of the common
        ancestors using the same filenode, then the file was unchanged on one
        side and deleted on the other side. The merge "passively" propagated
        that deletion, but didn't "actively" remove the file. In this case the
        file is *not* included in the `removed` set.

        d) If a file exists in only one parent and at least one of the common
        ancestors using a different filenode, then the file was changed on one
        side and removed on the other side. The merge process "actively"
        decided to drop the new change and delete the file. Unlike in the
        previous case, (c), the file included in the `removed` set.

        Summary table for merge:

        case | exists in parents | exists in gca || removed
         (a) | both              |     *         || yes
         (b) | one               |     none      || yes
         (c) | one               | same filenode || no
         (d) | one               |  new filenode || yes
        """
        return frozenset(self._removed)

    def mark_removed(self, filename):
        self._invalidate('removed')
        self._removed.add(filename)
        self.mark_touched(filename)

    def update_removed(self, filenames):
        for f in filenames:
            self.mark_removed(f)

    @util.propertycache
    def salvaged(self):
        """files that might have been deleted by a merge, but still exists.

        During a merge, the manifest merging might select some files for
        removal, or for a removed/changed conflict. If at commit time the file
        still exists, its removal was "reverted" and the file is "salvaged"
        """
        return frozenset(self._salvaged)

    def mark_salvaged(self, filename):
        self._invalidate("salvaged")
        self._salvaged.add(filename)
        self.mark_touched(filename)

    def update_salvaged(self, filenames):
        for f in filenames:
            self.mark_salvaged(f)

    @util.propertycache
    def touched(self):
        """files either actively modified, added or removed"""
        return frozenset(self._touched)

    def mark_touched(self, filename):
        self._invalidate('touched')
        self._touched.add(filename)

    def update_touched(self, filenames):
        for f in filenames:
            self.mark_touched(f)

    @util.propertycache
    def copied_from_p1(self):
        # {"<new-name>": "<source-name-in-p1>"} snapshot
        return self._p1_copies.copy()

    def mark_copied_from_p1(self, source, dest):
        self._invalidate('copied_from_p1')
        self._p1_copies[dest] = source

    def update_copies_from_p1(self, copies):
        for dest, source in copies.items():
            self.mark_copied_from_p1(source, dest)

    @util.propertycache
    def copied_from_p2(self):
        # {"<new-name>": "<source-name-in-p2>"} snapshot
        return self._p2_copies.copy()

    def mark_copied_from_p2(self, source, dest):
        self._invalidate('copied_from_p2')
        self._p2_copies[dest] = source

    def update_copies_from_p2(self, copies):
        for dest, source in copies.items():
            self.mark_copied_from_p2(source, dest)
changing-files: split the changing files computation from encoding...
def compute_all_files_changes(ctx):
    """compute the files changed by a revision

    Dispatches to the root / linear / merge variant according to how many
    real (non-null) parents the changeset has.
    """
    p1 = ctx.p1()
    p2 = ctx.p2()
    p1_is_null = p1.rev() == node.nullrev
    p2_is_null = p2.rev() == node.nullrev
    if p1_is_null and p2_is_null:
        return _process_root(ctx)
    elif not p1_is_null and p2_is_null:
        return _process_linear(p1, ctx)
    elif p1_is_null and not p2_is_null:
        # In the wild, one can encounter changeset where p1 is null but p2 is not
        return _process_linear(p1, ctx, parent=2)
    elif p1.rev() == p2.rev():
        # In the wild, one can encounter such "non-merge"
        return _process_linear(p1, ctx)
    else:
        return _process_merge(p1, p2, ctx)
changing-files: split the changing files computation from encoding...
r46255
changing-files: add clean computation of changed files for roots...
def _process_root(ctx):
    """compute the appropriate changed files for a changeset with no parents"""
    # Simple, there was nothing before it, so everything is added.
    md = ChangingFiles()
    for filename in ctx.manifest():
        md.mark_added(filename)
    return md
changing-files: add clean computation of changed files for linear changesets...
def _process_linear(parent_ctx, children_ctx, parent=1):
    """compute the appropriate changed files for a changeset with a single parent

    `parent` selects which copy mapping (p1 or p2) receives any detected
    renames; it is 2 only for the degenerate "null p1, real p2" case.
    """
    md = ChangingFiles()
    parent_manifest = parent_ctx.manifest()
    children_manifest = children_ctx.manifest()

    copies_candidate = []

    for filename, entry in parent_manifest.diff(children_manifest).items():
        if entry[1][0] is None:
            # no filenode for the "new" value, file is absent
            md.mark_removed(filename)
        else:
            copies_candidate.append(filename)
            if entry[0][0] is None:
                # no filenode for the "old" value, file was absent
                md.mark_added(filename)
            else:
                # filenode for both "old" and "new"
                md.mark_touched(filename)

    if parent == 1:
        copied = md.mark_copied_from_p1
    elif parent == 2:
        copied = md.mark_copied_from_p2
    else:
        assert False, "bad parent value %d" % parent

    for filename in copies_candidate:
        copy_info = children_ctx[filename].renamed()
        if copy_info:
            source, srcnode = copy_info
            copied(source, filename)

    return md
changing-files: add clean computation of changed file for merges...
def _process_merge(p1_ctx, p2_ctx, ctx):
    """compute the appropriate changed files for a changeset with two parents

    This is a more advance case. The information we need to record is summarise
    in the following table:

    ┌──────────────┬──────────────┬──────────────┬──────────────┬──────────────┐
    │ diff ╲ diff  │      ø       │ (Some, None) │ (None, Some) │ (Some, Some) │
    │  p2  ╲  p1   │              │              │              │              │
    ├──────────────┼──────────────┼──────────────┼──────────────┼──────────────┤
    │              │              │🄱 No Changes  │🄳 No Changes  │              │
    │      ø       │🄰 No Changes  │     OR       │     OR       │🄵 No Changes  │
    │              │              │🄲 Deleted[1]  │🄴 Salvaged[2] │     [3]      │
    ├──────────────┼──────────────┼──────────────┼──────────────┼──────────────┤
    │              │🄶 No Changes  │              │              │              │
    │ (Some, None) │     OR       │🄻 Deleted     │      ø       │      ø       │
    │              │🄷 Deleted[1]  │              │              │              │
    ├──────────────┼──────────────┼──────────────┼──────────────┼──────────────┤
    │              │🄸 No Changes  │              │              │              │
    │ (None, Some) │     OR       │      ø       │🄼 Added       │🄽 Merged     │
    │              │🄹 Salvaged[2] │              │  (copied?)   │  (copied?)   │
    ├──────────────┼──────────────┼──────────────┼──────────────┼──────────────┤
    │              │              │              │              │              │
    │ (Some, Some) │🄺 No Changes  │      ø       │🄾 Merged     │🄿 Merged     │
    │              │     [3]      │              │  (copied?)   │  (copied?)   │
    └──────────────┴──────────────┴──────────────┴──────────────┴──────────────┘

    Special case [1]:

      The situation is:
        - parent-A:     file exists,
        - parent-B:     no file,
        - working-copy: no file.

      Detecting a "deletion" will depend on the presence of actual change on
      the "parent-A" branch:

      Subcase 🄱 or 🄶 : if the state of the file in "parent-A" is unchanged
      compared to the merge ancestors, then parent-A branch left the file
      untouched while parent-B deleted it. We simply apply the change from
      "parent-B" branch the file was automatically dropped.
      The result is:
          - file is not recorded as touched by the merge.

      Subcase 🄲 or 🄷 : otherwise, the change from parent-A branch were
      explicitly dropped and the file was "deleted again". From a user
      perspective, the message about "locally changed" while "remotely
      deleted" (or the other way around) was issued and the user chose to
      deleted the file.
      The result:
          - file is recorded as touched by the merge.

    Special case [2]:

      The situation is:
        - parent-A:     no file,
        - parent-B:     file,
        - working-copy: file (same content as parent-B).

      There are three subcases depending on the ancestors contents:

      - A) the file is missing in all ancestors,
      - B) at least one ancestor has the file with filenode ≠ from parent-B,
      - C) all ancestors use the same filenode as parent-B,

      Subcase (A) is the simpler, nothing happend on parent-A side while
      parent-B added it.
      The result:
          - the file is not marked as touched by the merge.

      Subcase (B) is the counter part of "Special case [1]", the file was
      modified on parent-B side, while parent-A side deleted it. However this
      time, the conflict was solved by keeping the file (and its
      modification). We consider the file as "salvaged".
      The result:
          - the file is marked as "salvaged" by the merge.

      Subcase (C) is subtle variation of the case above. In this case, the
      file in unchanged on the parent-B side and actively removed on the
      parent-A side. So the merge machinery correctly decide it should be
      removed. However, the file was explicitly restored to its parent-B
      content before the merge was commited. The file is be marked as
      salvaged too. From the merge result perspective, this is similar to
      Subcase (B), however from the merge resolution perspective they differ
      since in (C), there was some conflict not obvious solution to the
      merge (That got reversed)

    Special case [3]:

      The situation is:
        - parent-A:     file,
        - parent-B:     file (different filenode as parent-A),
        - working-copy: file (same filenode as parent-B).

      This case is in theory much simple, for this to happens, this mean the
      filenode in parent-A is purely replacing the one in parent-B (either a
      descendant, or a full new file history, see changeset). So the merge
      introduce no changes, and the file is not affected by the merge...

      However, in the wild it is possible to find commit with the above is not
      True. For example repository have some commit where the *new* node is an
      ancestor of the node in parent-A, or where parent-A and parent-B are two
      branches of the same file history, yet not merge-filenode were created
      (while the "merge" should have led to a "modification").

      Detecting such cases (and not recording the file as modified) would be a
      nice bonus. However do not any of this yet.
    """
    md = ChangingFiles()

    m = ctx.manifest()
    p1m = p1_ctx.manifest()
    p2m = p2_ctx.manifest()
    diff_p1 = p1m.diff(m)
    diff_p2 = p2m.diff(m)

    cahs = ctx.repo().changelog.commonancestorsheads(
        p1_ctx.node(), p2_ctx.node()
    )
    if not cahs:
        cahs = [node.nullrev]
    mas = [ctx.repo()[r].manifest() for r in cahs]

    copy_candidates = []

    # Dealing with case 🄰 happens automatically.  Since there are no entry in
    # d1 nor d2, we won't iterate on it ever.

    # Iteration over d1 content will deal with all cases, but the one in the
    # first column of the table.
    for filename, d1 in diff_p1.items():

        d2 = diff_p2.pop(filename, None)

        if d2 is None:
            # this deal with the first line of the table.
            _process_other_unchanged(md, mas, filename, d1)
        else:

            if d1[0][0] is None and d2[0][0] is None:
                # case 🄼 — the file was absent from both parents: it was
                # added by the merge.  (The original comment wrongly said
                # "both deleted the file".)
                md.mark_added(filename)
                copy_candidates.append(filename)
            elif d1[1][0] is None and d2[1][0] is None:
                # case 🄻 — both deleted the file.
                md.mark_removed(filename)
            elif d1[1][0] is not None and d2[1][0] is not None:
                # case 🄽 🄾 🄿
                md.mark_merged(filename)
                copy_candidates.append(filename)
            else:
                # Impossible case, the post-merge file status cannot be None on
                # one side and Something on the other side.
                assert False, "unreachable"

    # Iteration over remaining d2 content deal with the first column of the
    # table.
    for filename, d2 in diff_p2.items():
        _process_other_unchanged(md, mas, filename, d2)

    for filename in copy_candidates:
        copy_info = ctx[filename].renamed()
        if copy_info:
            source, srcnode = copy_info
            if source in p1_ctx and p1_ctx[source].filenode() == srcnode:
                md.mark_copied_from_p1(source, filename)
            elif source in p2_ctx and p2_ctx[source].filenode() == srcnode:
                md.mark_copied_from_p2(source, filename)

    return md
def _find(manifest, filename):
"""return the associate filenode or None"""
if filename not in manifest:
return None
return manifest.find(filename)[0]
def _process_other_unchanged(md, mas, filename, diff):
    """record in `md` the fate of a file changed against only one parent

    `mas` is the list of merge-ancestor manifests and `diff` the manifest
    diff entry ((old-node, old-flag), (new-node, new-flag)) for `filename`
    against the parent that disagrees with the merge result.
    """
    source_node = diff[0][0]
    target_node = diff[1][0]
    if source_node is not None and target_node is None:
        # clearer than the original `not _find(...) == source_node`
        if any(_find(ma, filename) != source_node for ma in mas):
            # case 🄲 or 🄷  (typo fix: was "🄲 of 🄷")
            md.mark_removed(filename)
        # else, we have case 🄱 or 🄶 : no change need to be recorded
    elif source_node is None and target_node is not None:
        if any(_find(ma, filename) is not None for ma in mas):
            # case 🄴 or 🄹
            md.mark_salvaged(filename)
        # else, we have case 🄳 or 🄸 : simple merge without intervention
    elif source_node is not None and target_node is not None:
        # case 🄵 or 🄺 : simple merge without intervention
        #
        # In buggy case where source_node is not an ancestors of target_node.
        # There should have a been a new filenode created, recording this as
        # "modified". We do not deal with them yet.
        pass
    else:
        # An impossible case, the diff algorithm should not return entry if the
        # file is missing on both side.
        assert False, "unreachable"
def _missing_from_all_ancestors(mas, filename):
    """True when no merge-ancestor manifest contains `filename`"""
    return not any(_find(ma, filename) is not None for ma in mas)
metadata: move computation related to files touched in a dedicated module...
def computechangesetfilesadded(ctx):
    """return the list of files added in a changeset"""
    parents = ctx.parents()
    # a file is "added" when no parent knows about it
    return [f for f in ctx.files() if not any(f in p for p in parents)]
files: extract code for extra filtering of the `removed` entry into copies...
def get_removal_filter(ctx, x=None):
    """return a function to detect files "wrongly" detected as `removed`

    When a file is removed relative to p1 in a merge, this
    function determines whether the absence is due to a
    deletion from a parent, or whether the merge commit
    itself deletes the file. We decide this by doing a
    simplified three way merge of the manifest entry for
    the file. There are two ways we decide the merge
    itself didn't delete a file:
    - neither parent (nor the merge) contain the file
    - exactly one parent contains the file, and that
      parent has the same filelog entry as the merge
      ancestor (or all of them if there two). In other
      words, that parent left the file unchanged while the
      other one deleted it.

    One way to think about this is that deleting a file is
    similar to emptying it, so the list of changed files
    should be similar either way. The computation
    described above is not done directly in _filecommit
    when creating the list of changed files, however
    it does something very similar by comparing filelog
    nodes.
    """

    if x is not None:
        p1, p2, m1, m2 = x
    else:
        p1 = ctx.p1()
        p2 = ctx.p2()
        m1 = p1.manifest()
        m2 = p2.manifest()

    @util.cachefunc
    def mas():
        # manifests of the merge's greatest common ancestors, computed lazily
        p1n = p1.node()
        p2n = p2.node()
        cahs = ctx.repo().changelog.commonancestorsheads(p1n, p2n)
        if not cahs:
            cahs = [node.nullrev]
        return [ctx.repo()[r].manifest() for r in cahs]

    def deletionfromparent(f):
        # guard-clause form: test each parent in turn
        if f in m1:
            return f not in m2 and all(
                f in ma and ma.find(f) == m1.find(f) for ma in mas()
            )
        if f in m2:
            return all(f in ma and ma.find(f) == m2.find(f) for ma in mas())
        # neither parent knows the file: the merge did not delete it
        return True

    return deletionfromparent
metadata: move computation related to files touched in a dedicated module...
def computechangesetfilesremoved(ctx):
    """return the list of files removed in a changeset"""
    removed = [f for f in ctx.files() if f not in ctx]
    if removed:
        # filter out removals merely propagated from a parent
        rf = get_removal_filter(ctx)
        removed = [r for r in removed if not rf(r)]
    return removed
changing-files: add a utility to compute the merged files post-commit...
def computechangesetfilesmerged(ctx):
    """return the list of files merged in a changeset"""
    merged = []
    if len(ctx.parents()) < 2:
        # a merged filenode can only exist with two changeset parents
        return merged
    for f in ctx.files():
        if f in ctx:
            filectx = ctx[f]
            parents = filectx._filelog.parents(filectx._filenode)
            if parents[1] != node.nullid:
                merged.append(f)
    return merged
metadata: move computation related to files touched in a dedicated module...
def computechangesetcopies(ctx):
    """return the copies data for a changeset

    The copies data are returned as a pair of dictionnary (p1copies, p2copies).

    Each dictionnary are in the form: `{newname: oldname}`
    """
    p1copies = {}
    p2copies = {}
    p1 = ctx.p1()
    p2 = ctx.p2()
    narrowmatch = ctx._repo.narrowmatch()
    for dst in ctx.files():
        if not narrowmatch(dst) or dst not in ctx:
            continue
        rename_info = ctx[dst].renamed()
        if not rename_info:
            continue
        source, source_node = rename_info
        # attribute the copy to whichever parent actually holds that filenode
        if source in p1 and p1[source].filenode() == source_node:
            p1copies[dst] = source
        elif source in p2 and p2[source].filenode() == source_node:
            p2copies[dst] = source
    return p1copies, p2copies
def encodecopies(files, copies):
    """encode a `{dest: source}` copy mapping against the `files` index"""
    items = [
        b'%d\0%s' % (index, copies[dst])
        for index, dst in enumerate(files)
        if dst in copies
    ]
    if len(items) != len(copies):
        raise error.ProgrammingError(
            b'some copy targets missing from file list'
        )
    return b"\n".join(items)
def decodecopies(files, data):
    """decode a copy mapping produced by `encodecopies`

    Returns None when the payload cannot be parsed.
    """
    try:
        copies = {}
        if data:
            for line in data.split(b'\n'):
                strindex, source = line.split(b'\0')
                copies[files[int(strindex)]] = source
        return copies
    except (ValueError, IndexError):
        # Perhaps someone had chosen the same key name (e.g. "p1copies") and
        # used different syntax for the value.
        return None
def encodefileindices(files, subset):
    """encode the positions in `files` of every file in `subset`"""
    members = set(subset)
    return b'\n'.join(
        b'%d' % index for index, f in enumerate(files) if f in members
    )
def decodefileindices(files, data):
    """decode index positions back into the corresponding file names

    Returns None when the payload is not a valid index list.
    """
    try:
        subset = []
        if data:
            for strindex in data.split(b'\n'):
                index = int(strindex)
                if index < 0 or index >= len(files):
                    return None
                subset.append(files[index])
        return subset
    except (ValueError, IndexError):
        # Perhaps someone had chosen the same key name (e.g. "added") and
        # used different syntax for the value.
        return None
changing-files: rework the way we store changed files in side-data...
# see mercurial/helptext/internals/revlogs.txt for details about the format

# The three upper bits of the flag byte encode the action performed on a file.
ACTION_MASK = 0b11100
# note: untouched file used as copy source will as `000` for this mask.
ADDED_FLAG = 0b00100
MERGED_FLAG = 0b01000
REMOVED_FLAG = 0b01100
SALVAGED_FLAG = 0b10000
TOUCHED_FLAG = 0b10100

# The two lower bits encode the copy-tracing information.
COPIED_MASK = 0b11
COPIED_FROM_P1_FLAG = 0b10
COPIED_FROM_P2_FLAG = 0b11

# structure is <flag><filename-end><copy-source>
INDEX_HEADER = struct.Struct(">L")
INDEX_ENTRY = struct.Struct(">bLL")
sidedata: rename `encode_copies_sidedata` to `encode_files_sidedata`...
def encode_files_sidedata(files):
    """encode a ChangingFiles instance into the files sidedata payload"""
    # every touched file plus every copy source must appear in the index
    all_files = set(files.touched)
    all_files.update(files.copied_from_p1.values())
    all_files.update(files.copied_from_p2.values())
    all_files = sorted(all_files)
    file_idx = {fname: i for (i, fname) in enumerate(all_files)}
    # index 0 doubles as "no copy source" (the flag bits disambiguate)
    file_idx[None] = 0

    chunks = [INDEX_HEADER.pack(len(all_files))]

    filename_length = 0
    for fname in all_files:
        # entries store the *end* offset of each name in the name blob
        filename_length += len(fname)
        flag = 0
        if fname in files.added:
            flag |= ADDED_FLAG
        elif fname in files.merged:
            flag |= MERGED_FLAG
        elif fname in files.removed:
            flag |= REMOVED_FLAG
        elif fname in files.salvaged:
            flag |= SALVAGED_FLAG
        elif fname in files.touched:
            flag |= TOUCHED_FLAG

        copy = None
        if fname in files.copied_from_p1:
            flag |= COPIED_FROM_P1_FLAG
            copy = files.copied_from_p1.get(fname)
        elif fname in files.copied_from_p2:
            flag |= COPIED_FROM_P2_FLAG
            copy = files.copied_from_p2.get(fname)
        chunks.append(INDEX_ENTRY.pack(flag, filename_length, file_idx[copy]))
    chunks.extend(all_files)
    return {sidedatamod.SD_FILES: b''.join(chunks)}
commitctx: extract sidedata encoding inside its own function...
r45885
changing-files: drop the now useless changelogrevision argument...
def decode_files_sidedata(sidedata):
    """decode the files sidedata payload back into a ChangingFiles instance"""
    md = ChangingFiles()
    raw = sidedata.get(sidedatamod.SD_FILES)

    if raw is None:
        return md

    copies = []
    all_files = []

    assert len(raw) >= INDEX_HEADER.size
    total_files = INDEX_HEADER.unpack_from(raw, 0)[0]

    offset = INDEX_HEADER.size
    # the filename blob starts right after the fixed-size entry table
    file_offset_base = offset + (INDEX_ENTRY.size * total_files)
    file_offset_last = file_offset_base

    assert len(raw) >= file_offset_base

    for idx in range(total_files):
        flag, file_end, copy_idx = INDEX_ENTRY.unpack_from(raw, offset)
        file_end += file_offset_base
        filename = raw[file_offset_last:file_end]
        assert len(filename) == file_end - file_offset_last
        offset += INDEX_ENTRY.size
        file_offset_last = file_end
        all_files.append(filename)

        action = flag & ACTION_MASK
        if action == ADDED_FLAG:
            md.mark_added(filename)
        elif action == MERGED_FLAG:
            md.mark_merged(filename)
        elif action == REMOVED_FLAG:
            md.mark_removed(filename)
        elif action == SALVAGED_FLAG:
            md.mark_salvaged(filename)
        elif action == TOUCHED_FLAG:
            md.mark_touched(filename)

        copied = None
        copy_bits = flag & COPIED_MASK
        if copy_bits == COPIED_FROM_P1_FLAG:
            copied = md.mark_copied_from_p1
        elif copy_bits == COPIED_FROM_P2_FLAG:
            copied = md.mark_copied_from_p2
        if copied is not None:
            copies.append((copied, filename, copy_idx))

    # a copy source may appear later in the index than its destination, so
    # copy records can only be resolved once every name is known
    for copied, filename, copy_idx in copies:
        copied(all_files[copy_idx], filename)

    return md
sidedata: add a `decode_files_sidedata` function...
r46145
metadata: move computation related to files touched in a dedicated module...
def _getsidedata(srcrepo, rev):
    """compute and encode the files sidedata for revision `rev` of `srcrepo`"""
    return encode_files_sidedata(compute_all_files_changes(srcrepo[rev]))
metadata: move computation related to files touched in a dedicated module...
r45466
def getsidedataadder(srcrepo, destrepo):
    """return a sidedata-adding companion, parallel when the platform allows

    Worker-based parallelism is skipped on Windows or when disabled through
    the `experimental.worker.repository-upgrade` config knob.
    """
    use_w = srcrepo.ui.configbool(b'experimental', b'worker.repository-upgrade')
    if pycompat.iswindows or not use_w:
        return _get_simple_sidedata_adder(srcrepo, destrepo)
    return _get_worker_sidedata_adder(srcrepo, destrepo)
def _sidedata_worker(srcrepo, revs_queue, sidedata_queue, tokens):
    """The function used by worker precomputing sidedata

    It read an input queue containing revision numbers
    It write in an output queue containing (rev, <sidedata-map>)

    The `None` input value is used as a stop signal.

    The `tokens` semaphore is user to avoid having too many unprocessed
    entries. The workers needs to acquire one token before fetching a task.
    They will be released by the consumer of the produced data.
    """
    while True:
        tokens.acquire()
        rev = revs_queue.get()
        if rev is None:
            # processing of `None` is completed, release the token.
            tokens.release()
            break
        sidedata_queue.put((rev, _getsidedata(srcrepo, rev)))
# how many precomputed results each worker may keep in flight
BUFF_PER_WORKER = 50


def _get_worker_sidedata_adder(srcrepo, destrepo):
    """The parallel version of the sidedata computation

    This code spawn a pool of worker that precompute a buffer of sidedata
    before we actually need them"""
    # avoid circular import copies -> scmutil -> worker -> copies
    from . import worker

    nbworkers = worker._numworkers(srcrepo.ui)

    tokens = multiprocessing.BoundedSemaphore(nbworkers * BUFF_PER_WORKER)
    revsq = multiprocessing.Queue()
    sidedataq = multiprocessing.Queue()

    assert srcrepo.filtername is None
    # queue all tasks beforehand, revision numbers are small and it make
    # synchronisation simpler
    #
    # Since the computation for each node can be quite expensive, the overhead
    # of using a single queue is not relevant. In practice, most computation
    # are fast but some are very expensive and dominate all the other smaller
    # cost.
    for r in srcrepo.changelog.revs():
        revsq.put(r)
    # queue the "no more tasks" markers
    for i in range(nbworkers):
        revsq.put(None)

    allworkers = []
    for i in range(nbworkers):
        args = (srcrepo, revsq, sidedataq, tokens)
        w = multiprocessing.Process(target=_sidedata_worker, args=args)
        allworkers.append(w)
        w.start()

    # dictionnary to store results for revision higher than the one we are
    # looking for. For example, if we need the sidedatamap for 42, and 43 is
    # received, we shelve 43 for later use.
    staging = {}

    def sidedata_companion(revlog, rev):
        sidedata = {}
        if util.safehasattr(revlog, b'filteredrevs'):  # this is a changelog
            # Is the data previously shelved ?
            sidedata = staging.pop(rev, None)
            if sidedata is None:
                # look at the queued results until we find the one we are
                # looking for (shelve the other ones)
                r, sidedata = sidedataq.get()
                while r != rev:
                    staging[r] = sidedata
                    r, sidedata = sidedataq.get()
            tokens.release()
        return False, (), sidedata

    return sidedata_companion
def _get_simple_sidedata_adder(srcrepo, destrepo):
    """The simple version of the sidedata computation

    It just compute it in the same thread on request"""

    def sidedatacompanion(revlog, rev):
        sidedata = {}
        if util.safehasattr(revlog, 'filteredrevs'):  # this is a changelog
            sidedata = _getsidedata(srcrepo, rev)
        return False, (), sidedata

    return sidedatacompanion
def getsidedataremover(srcrepo, destrepo):
    """return a companion stripping the copy-tracing sidedata categories"""

    def sidedatacompanion(revlog, rev):
        f = ()
        if util.safehasattr(revlog, 'filteredrevs'):  # this is a changelog
            if revlog.flags(rev) & sidedataflag.REVIDX_SIDEDATA:
                f = (
                    sidedatamod.SD_P1COPIES,
                    sidedatamod.SD_P2COPIES,
                    sidedatamod.SD_FILESADDED,
                    sidedatamod.SD_FILESREMOVED,
                )
        return False, f, {}

    return sidedatacompanion