setdiscovery.py
258 lines
| 9.1 KiB
| text/x-python
|
PythonLexer
/ mercurial / setdiscovery.py
Peter Arrenbrecht
|
r14164 | # setdiscovery.py - improved discovery of common nodeset for mercurial | ||
# | ||||
# Copyright 2010 Benoit Boissinot <bboissin@gmail.com> | ||||
# and Peter Arrenbrecht <peter@arrenbrecht.ch> | ||||
# | ||||
# This software may be used and distributed according to the terms of the | ||||
# GNU General Public License version 2 or any later version. | ||||
Olle Lundberg
|
"""
Algorithm works in the following way. You have two repositories: local and
remote. They both contain a DAG of changelists.

The goal of the discovery protocol is to find one set of nodes, *common*,
the set of nodes shared by local and remote.

One of the issues with the original protocol was latency: it could
potentially require lots of roundtrips to discover that the local repo was a
subset of remote (which is a very common case, you usually have few changes
compared to upstream, while upstream probably had lots of development).

The new protocol only requires one interface for the remote repo: `known()`,
which given a set of changelists tells you if they are present in the DAG.

The algorithm then works as follows:

 - We will be using three sets, `common`, `missing`, `unknown`. Originally
 all nodes are in `unknown`.
 - Take a sample from `unknown`, call `remote.known(sample)`
   - For each node that remote knows, move it and all its ancestors to `common`
   - For each node that remote doesn't know, move it and all its descendants
   to `missing`
 - Iterate until `unknown` is empty

There are a couple of optimizations. The first is, instead of starting with a
random sample of missing, to start by sending all heads: in the case where the
local repo is a subset, you have computed the answer in one round trip.

Then you can do something similar to the bisecting strategy used when
finding faulty changesets. Instead of random samples, you can try picking
nodes that will maximize the number of nodes that will be
classified with it (since all ancestors or descendants will be marked as well).
"""
Peter Arrenbrecht
|
r14164 | |||
Gregory Szorc
|
r25973 | from __future__ import absolute_import | ||
Martin von Zweigbergk
|
r25113 | import collections | ||
Augie Fackler
|
r20034 | import random | ||
Gregory Szorc
|
r25973 | |||
from .i18n import _ | ||||
from .node import ( | ||||
nullid, | ||||
nullrev, | ||||
) | ||||
from . import ( | ||||
dagutil, | ||||
Pierre-Yves David
|
r26587 | error, | ||
r32712 | util, | |||
Gregory Szorc
|
r25973 | ) | ||
Peter Arrenbrecht
|
r14164 | |||
Pierre-Yves David
|
r23814 | def _updatesample(dag, nodes, sample, quicksamplesize=0): | ||
Pierre-Yves David
|
r23809 | """update an existing sample to match the expected size | ||
The sample is updated with nodes exponentially distant from each head of the | ||||
<nodes> set. (H~1, H~2, H~4, H~8, etc). | ||||
If a target size is specified, the sampling will stop once this size is | ||||
reached. Otherwise sampling will happen until roots of the <nodes> set are | ||||
reached. | ||||
:dag: a dag object from dagutil | ||||
:nodes: set of nodes we want to discover (if None, assume the whole dag) | ||||
:sample: a sample to update | ||||
:quicksamplesize: optional target size of the sample""" | ||||
Peter Arrenbrecht
|
r14164 | # if nodes is empty we scan the entire graph | ||
if nodes: | ||||
heads = dag.headsetofconnecteds(nodes) | ||||
else: | ||||
heads = dag.heads() | ||||
dist = {} | ||||
Martin von Zweigbergk
|
r25113 | visit = collections.deque(heads) | ||
Peter Arrenbrecht
|
r14164 | seen = set() | ||
factor = 1 | ||||
while visit: | ||||
curr = visit.popleft() | ||||
if curr in seen: | ||||
continue | ||||
d = dist.setdefault(curr, 1) | ||||
if d > factor: | ||||
factor *= 2 | ||||
if d == factor: | ||||
Pierre-Yves David
|
r23814 | sample.add(curr) | ||
if quicksamplesize and (len(sample) >= quicksamplesize): | ||||
return | ||||
Peter Arrenbrecht
|
r14164 | seen.add(curr) | ||
for p in dag.parents(curr): | ||||
if not nodes or p in nodes: | ||||
dist.setdefault(p, d + 1) | ||||
visit.append(p) | ||||
Pierre-Yves David
|
def _takequicksample(dag, nodes, size):
    """build a cheap sample holding at most <size> entries

    Intended for the initial sampling round: it starts from the heads of
    <nodes> and, if there is room left, adds close ancestors of heads.

    :dag: a dag object
    :nodes: set of nodes to discover
    :size: the maximum size of the sample"""
    heads = dag.headsetofconnecteds(nodes)
    if len(heads) >= size:
        # the heads alone already fill the sample
        return _limitsample(heads, size)
    # no node set is passed on, so the walk is not restricted to <nodes>
    _updatesample(dag, None, heads, quicksamplesize=size)
    return heads
def _takefullsample(dag, nodes, size):
    """build a sample of <nodes> by walking the dag in both directions

    The sample starts with the heads of <nodes>, is extended by a walk from
    the heads and by a walk over the inverted dag, is then cut down to at most
    <size> entries and finally topped up with random members of <nodes> when
    it holds fewer than <size> entries.

    :dag: a dag object
    :nodes: set of nodes to discover
    :size: the size the returned sample is brought to"""
    picked = dag.headsetofconnecteds(nodes)
    # update from heads
    _updatesample(dag, nodes, picked)
    # update from roots
    _updatesample(dag.inverse(), nodes, picked)
    assert picked
    picked = _limitsample(picked, size)
    shortfall = size - len(picked)
    if shortfall > 0:
        # fill the remaining slots with nodes that were not picked yet
        leftovers = list(nodes - picked)
        picked.update(random.sample(leftovers, shortfall))
    return picked
Pierre-Yves David
|
r23083 | def _limitsample(sample, desiredlen): | ||
"""return a random subset of sample of at most desiredlen item""" | ||||
if len(sample) > desiredlen: | ||||
sample = set(random.sample(sample, desiredlen)) | ||||
return sample | ||||
Peter Arrenbrecht
|
def findcommonheads(ui, local, remote,
                    initialsamplesize=100,
                    fullsamplesize=200,
                    abortwhenunrelated=True):
    '''Return a tuple (common, anyincoming, remoteheads) used to identify
    missing nodes from or in remote.

    The first query batches ``heads()`` with ``known()`` on a sample of the
    local heads. When that does not settle the question, further ``known()``
    queries are sent for samples of the still undecided revisions until none
    of them is left.

    :ui: used for status, note, debug, progress and log output
    :local: local repository; only its ``changelog`` is read here
    :remote: peer answering ``iterbatch()`` and ``known()``
    :initialsamplesize: size cap for the head sample and for quick samples
    :fullsamplesize: size cap for samples taken in both directions
    :abortwhenunrelated: raise error.Abort instead of only warning when no
                         common head is found although the remote heads are
                         not just nullid
    '''
    began = util.timer()

    queries = 0
    changelog = local.changelog
    dag = dagutil.revlogdag(changelog)

    # early exit if we know all the specified remote heads already
    ui.debug("query 1; heads\n")
    queries += 1
    localheads = dag.heads()
    # indices between sample and externalized version must match, hence the
    # conversion to a list
    sample = list(_limitsample(localheads, initialsamplesize))
    batch = remote.iterbatch()
    batch.heads()
    batch.known(dag.externalizeall(sample))
    batch.submit()
    remoteheadhashes, yesno = batch.results()

    if changelog.tip() == nullid:
        # the local changelog tip is the null node
        if remoteheadhashes != [nullid]:
            return [nullid], True, remoteheadhashes
        return [nullid], False, []

    # start actual discovery (we note this before the next "if" for
    # compatibility reasons)
    ui.status(_("searching for changes\n"))

    knownremoteheads = dag.internalizeall(remoteheadhashes,
                                          filterunknown=True)
    if len(knownremoteheads) == len(remoteheadhashes):
        ui.debug("all remote heads known locally\n")
        return (remoteheadhashes, False, remoteheadhashes,)

    if sample and len(localheads) <= initialsamplesize and all(yesno):
        ui.note(_("all local heads known remotely\n"))
        localheadhashes = dag.externalizeall(localheads)
        return (localheadhashes, True, remoteheadhashes,)

    # full blown discovery

    # own nodes I know we both know
    # treat remote heads (and maybe own heads) as a first implicit sample
    # response
    common = changelog.incrementalmissingrevs(knownremoteheads)
    common.addbases({n for i, n in enumerate(sample) if yesno[i]})
    # own nodes where I don't know if remote knows them
    undecided = set(common.missingancestors(localheads))
    # own nodes I know remote lacks
    missing = set()

    full = False
    while undecided:

        if sample:
            # everything descending from a node the remote lacks is lacking too
            rejected = [n for i, n in enumerate(sample) if not yesno[i]]
            missing.update(dag.descendantset(rejected, missing))

            undecided.difference_update(missing)

        if not undecided:
            break

        if not full and not common.hasbases():
            # use even cheaper initial sample
            ui.debug("taking quick initial sample\n")
            samplefunc = _takequicksample
            targetsize = initialsamplesize
        else:
            if full:
                ui.note(_("sampling from both directions\n"))
            else:
                ui.debug("taking initial sample\n")
            samplefunc = _takefullsample
            targetsize = fullsamplesize

        if len(undecided) < targetsize:
            # few enough candidates left: ask about all of them
            sample = list(undecided)
        else:
            sample = samplefunc(dag, undecided, targetsize)
            sample = _limitsample(sample, targetsize)

        queries += 1
        ui.progress(_('searching'), queries, unit=_('queries'))
        ui.debug("query %i; still undecided: %i, sample size is: %i\n"
                 % (queries, len(undecided), len(sample)))
        # indices between sample and externalized version must match
        sample = list(sample)
        yesno = remote.known(dag.externalizeall(sample))
        full = True

        if sample:
            common.addbases({n for i, n in enumerate(sample) if yesno[i]})
            common.removeancestorsfrom(undecided)

    # heads(common) == heads(common.bases) since common represents common.bases
    # and all its ancestors
    commonheads = dag.headsetofconnecteds(common.bases)
    # common.bases can include nullrev, but our contract requires us to not
    # return any heads in that case, so discard that
    commonheads.discard(nullrev)
    duration = util.timer() - began
    ui.progress(_('searching'), None)
    ui.debug("%d total queries in %.4fs\n" % (queries, duration))
    msg = ('found %d common and %d unknown server heads,'
           ' %d roundtrips in %.4fs\n')
    # common heads that are not among the locally known remote heads
    extraheads = set(commonheads) - set(knownremoteheads)
    ui.log('discovery', msg, len(commonheads), len(extraheads), queries,
           duration)

    if not commonheads and remoteheadhashes != [nullid]:
        if abortwhenunrelated:
            raise error.Abort(_("repository is unrelated"))
        ui.warn(_("warning: repository is unrelated\n"))
        return ({nullid}, True, remoteheadhashes,)

    anyincoming = (remoteheadhashes != [nullid])
    return dag.externalizeall(commonheads), anyincoming, remoteheadhashes