setdiscovery.py
483 lines
| 16.4 KiB
| text/x-python
|
PythonLexer
/ mercurial / setdiscovery.py
Peter Arrenbrecht
|
r14164 | # setdiscovery.py - improved discovery of common nodeset for mercurial | ||
# | ||||
# Copyright 2010 Benoit Boissinot <bboissin@gmail.com> | ||||
# and Peter Arrenbrecht <peter@arrenbrecht.ch> | ||||
# | ||||
# This software may be used and distributed according to the terms of the | ||||
# GNU General Public License version 2 or any later version. | ||||
Olle Lundberg
|
r20656 | """ | ||
Algorithm works in the following way. You have two repositories: local and
remote. They both contain a DAG of changelists.
The goal of the discovery protocol is to find one set of nodes, *common*,
the set of nodes shared by local and remote.
One of the issues with the original protocol was latency: it could
potentially require lots of roundtrips to discover that the local repo was a
subset of remote (which is a very common case, you usually have few changes
compared to upstream, while upstream probably had lots of development).
The new protocol only requires one interface for the remote repo: `known()`, | ||||
which given a set of changelists tells you if they are present in the DAG. | ||||
The algorithm then works as follows:
- We will be using three sets, `common`, `missing`, `unknown`. Originally | ||||
all nodes are in `unknown`. | ||||
- Take a sample from `unknown`, call `remote.known(sample)` | ||||
- For each node that remote knows, move it and all its ancestors to `common` | ||||
- For each node that remote doesn't know, move it and all its descendants | ||||
to `missing` | ||||
- Iterate until `unknown` is empty | ||||
There are a couple of optimizations. The first is, instead of starting with a
random sample of missing, to start by sending all heads: in the case where the
local repo is a subset, you have computed the answer in one round trip.
Then you can do something similar to the bisecting strategy used when | ||||
finding faulty changesets. Instead of random samples, you can try picking | ||||
nodes that will maximize the number of nodes that will be | ||||
classified with it (since all ancestors or descendants will be marked as well). | ||||
""" | ||||
Peter Arrenbrecht
|
r14164 | |||
Gregory Szorc
|
r25973 | from __future__ import absolute_import | ||
Martin von Zweigbergk
|
r25113 | import collections | ||
Augie Fackler
|
r20034 | import random | ||
Gregory Szorc
|
r25973 | |||
from .i18n import _ | ||||
from .node import ( | ||||
nullid, | ||||
nullrev, | ||||
) | ||||
from . import ( | ||||
Pierre-Yves David
|
r26587 | error, | ||
Georges Racinet
|
r42972 | policy, | ||
r32712 | util, | |||
Gregory Szorc
|
r25973 | ) | ||
Peter Arrenbrecht
|
r14164 | |||
Augie Fackler
|
r43346 | |||
Gregory Szorc
|
r39210 | def _updatesample(revs, heads, sample, parentfn, quicksamplesize=0): | ||
Pierre-Yves David
|
r23809 | """update an existing sample to match the expected size | ||
Gregory Szorc
|
r39204 | The sample is updated with revs exponentially distant from each head of the | ||
<revs> set. (H~1, H~2, H~4, H~8, etc). | ||||
Pierre-Yves David
|
r23809 | |||
If a target size is specified, the sampling will stop once this size is | ||||
Gregory Szorc
|
r39204 | reached. Otherwise sampling will happen until roots of the <revs> set are | ||
Pierre-Yves David
|
r23809 | reached. | ||
Gregory Szorc
|
r39204 | :revs: set of revs we want to discover (if None, assume the whole dag) | ||
Gregory Szorc
|
r39206 | :heads: set of DAG head revs | ||
Pierre-Yves David
|
r23809 | :sample: a sample to update | ||
Gregory Szorc
|
r39210 | :parentfn: a callable to resolve parents for a revision | ||
Pierre-Yves David
|
r23809 | :quicksamplesize: optional target size of the sample""" | ||
Peter Arrenbrecht
|
r14164 | dist = {} | ||
Martin von Zweigbergk
|
r25113 | visit = collections.deque(heads) | ||
Peter Arrenbrecht
|
r14164 | seen = set() | ||
factor = 1 | ||||
while visit: | ||||
curr = visit.popleft() | ||||
if curr in seen: | ||||
continue | ||||
d = dist.setdefault(curr, 1) | ||||
if d > factor: | ||||
factor *= 2 | ||||
if d == factor: | ||||
Pierre-Yves David
|
r23814 | sample.add(curr) | ||
if quicksamplesize and (len(sample) >= quicksamplesize): | ||||
return | ||||
Peter Arrenbrecht
|
r14164 | seen.add(curr) | ||
Gregory Szorc
|
r39210 | |||
for p in parentfn(curr): | ||||
if p != nullrev and (not revs or p in revs): | ||||
Peter Arrenbrecht
|
r14164 | dist.setdefault(p, d + 1) | ||
visit.append(p) | ||||
Augie Fackler
|
r43346 | |||
Georges Racinet
|
r42968 | def _limitsample(sample, desiredlen, randomize=True): | ||
"""return a random subset of sample of at most desiredlen item. | ||||
If randomize is False, though, a deterministic subset is returned. | ||||
This is meant for integration tests. | ||||
""" | ||||
if len(sample) <= desiredlen: | ||||
return sample | ||||
if randomize: | ||||
return set(random.sample(sample, desiredlen)) | ||||
sample = list(sample) | ||||
sample.sort() | ||||
return set(sample[:desiredlen]) | ||||
Pierre-Yves David
|
r23083 | |||
Augie Fackler
|
r43346 | |||
Boris Feld
|
class partialdiscovery(object):
    """bookkeeping for a discovery exchange that is still in progress

    Answers obtained from the remote repository are fed into this object,
    which sorts the tracked revisions into three states:

    - common:    revs also known remotely
    - undecided: revs nothing has been learned about yet
    - missing:   revs missing remotely
    (all tracked revisions are known locally)
    """

    def __init__(self, repo, targetheads, respectsize, randomize=True):
        self._repo = repo
        self._targetheads = targetheads
        self._common = repo.changelog.incrementalmissingrevs()
        # both of these are computed lazily, on first use
        self._undecided = None
        self._childrenmap = None
        self.missing = set()
        self._respectsize = respectsize
        self.randomize = randomize

    def addcommons(self, commons):
        """record revisions that turned out to be common"""
        self._common.addbases(commons)
        if self._undecided is None:
            # nothing has asked for the undecided set yet, nothing to prune
            return
        self._common.removeancestorsfrom(self._undecided)

    def addmissings(self, missings):
        """record revisions that turned out to be missing

        Every rev matched by the `%ld::%ld` revset over <missings> and the
        undecided set is moved from undecided to missing."""
        found = self._repo.revs(b'%ld::%ld', missings, self.undecided)
        if not found:
            return
        self.missing.update(found)
        self.undecided.difference_update(found)

    def addinfo(self, sample):
        """digest an iterable of (rev, known) tuples"""
        byanswer = {True: set(), False: set()}
        for rev, known in sample:
            byanswer[bool(known)].add(rev)
        if byanswer[True]:
            self.addcommons(byanswer[True])
        if byanswer[False]:
            self.addmissings(byanswer[False])

    def hasinfo(self):
        """return True if we have any clue about the remote state"""
        return self._common.hasbases()

    def iscomplete(self):
        """True once the undecided set has been computed and emptied"""
        return self._undecided is not None and not self._undecided

    @property
    def undecided(self):
        """the set of still undecided revs, computed on first access"""
        if self._undecided is None:
            self._undecided = set(
                self._common.missingancestors(self._targetheads)
            )
        return self._undecided

    def stats(self):
        """return a dict of counters describing the current state"""
        remaining = len(self.undecided)
        return {
            b'undecided': remaining,
        }

    def commonheads(self):
        """the heads of the known common set"""
        # common stands for common.bases plus all their ancestors, hence
        # heads(common) == heads(common.bases)
        return self._common.basesheads()

    def _parentsgetter(self):
        """return a callable giving items 5 and 6 of a rev's index entry"""
        entryof = self._repo.changelog.index.__getitem__
        return lambda rev: entryof(rev)[5:7]

    def _childrengetter(self):
        """return a callable mapping an undecided rev to its children"""
        cached = self._childrenmap
        if cached is not None:
            # The `undecided` set only ever shrinks during discovery, so the
            # map built for one iteration remains usable for the following
            # ones. It is built once and reused.
            return cached.__getitem__

        # _updatesample() looks up children over and over while walking.
        # Resolving them on demand is expensive and quadratic when done in a
        # loop, so the children of every relevant revision are computed up
        # front and the lookup becomes a plain dict access.
        childmap = {}
        self._childrenmap = childmap

        parentsof = self._parentsgetter()
        for rev in sorted(self.undecided):
            # every revision gets an entry, so lookups never hit a missing
            # key
            childmap[rev] = []
            for parent in parentsof(rev):
                if parent != nullrev and parent in childmap:
                    childmap[parent].append(rev)
        return childmap.__getitem__

    def takequicksample(self, headrevs, size):
        """takes a quick sample of size <size>

        It is meant for initial sampling and focuses on querying heads and
        close ancestors of heads.

        :headrevs: set of head revisions in local DAG to consider
        :size: the maximum size of the sample"""
        undecided = self.undecided
        if len(undecided) <= size:
            return list(undecided)

        picked = set(self._repo.revs(b'heads(%ld)', undecided))
        if len(picked) >= size:
            return _limitsample(picked, size, randomize=self.randomize)

        parentsof = self._parentsgetter()
        _updatesample(None, headrevs, picked, parentsof, quicksamplesize=size)
        return picked

    def takefullsample(self, headrevs, size):
        """takes a sample built from both ends of the undecided set

        :headrevs: not used by this method
        :size: the target size of the sample (may be raised when the
               instance was created with a false <respectsize>)"""
        undecided = self.undecided
        if len(undecided) <= size:
            return list(undecided)

        revsof = self._repo.revs
        tops = set(revsof(b'heads(%ld)', undecided))
        picked = set(tops)
        parentsof = self._parentsgetter()

        # walk down from the heads of the undecided set
        _updatesample(undecided, tops, picked, parentsof)

        # walk up from the roots of the undecided set
        bottoms = set(revsof(b'roots(%ld)', undecided))
        childrenof = self._childrengetter()
        _updatesample(undecided, bottoms, picked, childrenof)
        assert picked

        if not self._respectsize:
            size = max(size, min(len(bottoms), len(tops)))

        picked = _limitsample(picked, size, randomize=self.randomize)
        shortfall = size - len(picked)
        if shortfall > 0:
            # top the sample up with other undecided revs
            leftovers = list(undecided - picked)
            if self.randomize:
                picked.update(random.sample(leftovers, shortfall))
            else:
                leftovers.sort()
                picked.update(leftovers[:shortfall])
        return picked
Augie Fackler
|
r43346 | |||
# NOTE(review): presumably rebinds the name to the Rust `PartialDiscovery`
# implementation when `policy.importrust` can provide one, and keeps the
# pure-Python class defined above (passed as `default`) otherwise -- confirm
# against policy.importrust.
partialdiscovery = policy.importrust(
    r'discovery', member=r'PartialDiscovery', default=partialdiscovery
)
Georges Racinet
|
r42972 | |||
Augie Fackler
|
def findcommonheads(
    ui,
    local,
    remote,
    initialsamplesize=100,
    fullsamplesize=200,
    abortwhenunrelated=True,
    ancestorsof=None,
    samplegrowth=1.05,
):
    """Return a tuple (common, anyincoming, remoteheads) used to identify
    missing nodes from or in remote.

    :ui: used for debug/note/status/warn output, progress and config
    :local: local repository; only its changelog is consulted here
    :remote: peer answering the `heads` and `known` commands
    :initialsamplesize: target size of the quick sample, and cap on the first
                        `known` request when `remote.limitedarguments` is set
    :fullsamplesize: target size of the full samples
    :abortwhenunrelated: raise error.Abort when nothing is common, instead
                         of only warning
    :ancestorsof: optional nodes to use in place of the local heads
    :samplegrowth: factor applied to the full sample size after each full
                   sample, unless `remote.limitedarguments` is set

    In the returned tuple:

    - common is a collection of nodes: the heads of the common set, or
      `nullid` alone when nothing is shared
    - anyincoming is False when the remote only reports the null head or when
      all its heads are already known locally, True otherwise
    - remoteheads is the answer of the remote `heads` command (an empty list
      when both sides are empty)
    """
    begin = util.timer()
    changelog = local.changelog
    tonode = changelog.node
    torev = changelog.rev

    if ancestorsof is None:
        ownheads = [rev for rev in changelog.headrevs() if rev != nullrev]
    else:
        ownheads = [torev(n) for n in ancestorsof]

    # early exit if we know all the specified remote heads already
    ui.debug(b"query 1; heads\n")
    roundtrips = 1
    # We also ask remote about all the local heads. That set can be
    # arbitrarily large, so we used to limit its size to `initialsamplesize`.
    # We no longer do, as it proved counterproductive. The skipped heads could
    # lead to a large "undecided" set, slower to be clarified than if we asked
    # the question for all heads right away.
    #
    # We are already fetching all server heads using the `heads` command;
    # sending an equivalent number of heads the other way should not have a
    # significant impact. In addition, it is very likely that we are going to
    # have to issue `known` requests for an equivalent amount of revisions in
    # order to decide if these heads are common or missing.
    #
    # Find a detailed analysis below.
    #
    # Case A: local and server both have few heads
    #
    #   Ownheads is below initialsamplesize, limit would not have any effect.
    #
    # Case B: local has few heads and server has many
    #
    #   Ownheads is below initialsamplesize, limit would not have any effect.
    #
    # Case C: local and server both have many heads
    #
    #   We now transfer some more data, but not significantly more than is
    #   already transferred to carry the server heads.
    #
    # Case D: local has many heads, server has few
    #
    #   D.1 local heads are mostly known remotely
    #
    #     All the known heads will have to be part of a `known` request at
    #     some point for the discovery to finish. Sending them all earlier is
    #     actually helping.
    #
    #     (This case is fairly unlikely, it requires the numerous heads to all
    #     be merged server side in only a few heads)
    #
    #   D.2 local heads are mostly missing remotely
    #
    #     To determine that the heads are missing, we'll have to issue `known`
    #     requests for them or one of their ancestors. The amount of `known`
    #     requests will likely be in the same order of magnitude as the amount
    #     of local heads.
    #
    #     The only case where we can be more efficient using `known` requests
    #     on ancestors is the case where all the "missing" local heads are
    #     based on a few changesets, also "missing". This means we would have
    #     a "complex" graph (with many heads) attached to, but very independent
    #     of, the "simple" graph on the server. This is a fairly unusual case
    #     and has not been met in the wild so far.
    if remote.limitedarguments:
        # indices between sample and externalized version must match, hence
        # the conversion to a list
        sample = list(_limitsample(ownheads, initialsamplesize))
    else:
        sample = ownheads

    with remote.commandexecutor() as executor:
        headsfuture = executor.callcommand(b'heads', {})
        knownfuture = executor.callcommand(
            b'known', {b'nodes': [tonode(r) for r in sample]}
        )

    remoteheads = headsfuture.result()
    answers = knownfuture.result()

    if changelog.tip() == nullid:
        # the local changelog is empty
        if remoteheads == [nullid]:
            return [nullid], False, []
        return [nullid], True, remoteheads

    # start actual discovery (we note this before the next "if" for
    # compatibility reasons)
    ui.status(_(b"searching for changes\n"))

    sharedsrvrevs = []  # revnos of remote heads that are known locally
    for node in remoteheads:
        if node != nullid:
            try:
                rev = torev(node)
            except error.LookupError:
                # unknown or filtered node
                pass
            else:
                sharedsrvrevs.append(rev)

    if len(sharedsrvrevs) == len(remoteheads):
        ui.debug(b"all remote heads known locally\n")
        return remoteheads, False, remoteheads

    if len(sample) == len(ownheads) and all(answers):
        ui.note(_(b"all local changesets known remotely\n"))
        return [tonode(r) for r in ownheads], True, remoteheads

    # full blown discovery
    randomize = ui.configbool(b'devel', b'discovery.randomize')
    discovery = partialdiscovery(
        local, ownheads, remote.limitedarguments, randomize=randomize
    )
    # treat remote heads (and maybe own heads) as a first implicit sample
    # response
    discovery.addcommons(sharedsrvrevs)
    discovery.addinfo(zip(sample, answers))

    sampledonce = False
    progress = ui.makeprogress(_(b'searching'), unit=_(b'queries'))
    while not discovery.iscomplete():

        if sampledonce or discovery.hasinfo():
            if sampledonce:
                ui.note(_(b"sampling from both directions\n"))
            else:
                ui.debug(b"taking initial sample\n")
            picker = discovery.takefullsample
            wanted = fullsamplesize
            if not remote.limitedarguments:
                # ask for a slightly bigger sample next time around
                fullsamplesize = int(fullsamplesize * samplegrowth)
        else:
            # use even cheaper initial sample
            ui.debug(b"taking quick initial sample\n")
            picker = discovery.takequicksample
            wanted = initialsamplesize
        sample = picker(ownheads, wanted)

        roundtrips += 1
        progress.update(roundtrips)
        undecidedcount = discovery.stats()[b'undecided']
        ui.debug(
            b"query %i; still undecided: %i, sample size is: %i\n"
            % (roundtrips, undecidedcount, len(sample))
        )

        # indices between sample and externalized version must match
        sample = list(sample)

        with remote.commandexecutor() as executor:
            answers = executor.callcommand(
                b'known', {b'nodes': [tonode(r) for r in sample]}
            ).result()

        sampledonce = True

        discovery.addinfo(zip(sample, answers))

    commonrevs = discovery.commonheads()
    elapsed = util.timer() - begin
    progress.complete()
    ui.debug(b"%d total queries in %.4fs\n" % (roundtrips, elapsed))
    msg = (
        b'found %d common and %d unknown server heads,'
        b' %d roundtrips in %.4fs\n'
    )
    missing = set(commonrevs) - set(sharedsrvrevs)
    ui.log(
        b'discovery', msg, len(commonrevs), len(missing), roundtrips, elapsed
    )

    anyincoming = remoteheads != [nullid]
    if anyincoming and not commonrevs:
        # the remote has content, yet nothing of it is shared with us
        if abortwhenunrelated:
            raise error.Abort(_(b"repository is unrelated"))
        ui.warn(_(b"warning: repository is unrelated\n"))
        return (
            {nullid},
            True,
            remoteheads,
        )

    return {tonode(r) for r in commonrevs}, anyincoming, remoteheads