setdiscovery: document the '_updatesample' function...
Pierre-Yves David
r23809:9ca2eb88 default
@@ -1,243 +1,257 @@
# setdiscovery.py - improved discovery of common nodeset for mercurial
#
# Copyright 2010 Benoit Boissinot <bboissin@gmail.com>
# and Peter Arrenbrecht <peter@arrenbrecht.ch>
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.
"""
The algorithm works in the following way. You have two repositories: local
and remote. They both contain a DAG of changelists.

The goal of the discovery protocol is to find one set of nodes, *common*,
the set of nodes shared by local and remote.

One of the issues with the original protocol was latency: it could
potentially require lots of roundtrips to discover that the local repo was a
subset of remote (which is a very common case; you usually have few changes
compared to upstream, while upstream probably had lots of development).

The new protocol only requires one interface for the remote repo: `known()`,
which, given a set of changelists, tells you whether they are present in the
DAG.

The algorithm then works as follows:

 - We will be using three sets, `common`, `missing` and `unknown`. Originally
   all nodes are in `unknown`.
 - Take a sample from `unknown` and call `remote.known(sample)`.
 - For each node that remote knows, move it and all its ancestors to `common`.
 - For each node that remote doesn't know, move it and all its descendants
   to `missing`.
 - Iterate until `unknown` is empty.

There are a couple of optimizations. First, instead of starting with a random
sample of missing, start by sending all heads; in the case where the local
repo is a subset, you compute the answer in one round trip.

Second, you can do something similar to the bisecting strategy used when
finding faulty changesets: instead of random samples, try picking nodes that
will maximize the number of nodes classified with them (since all ancestors
or descendants will be marked as well). A toy simulation of this
classification loop follows the imports below, for illustration.
"""

from node import nullid, nullrev
from i18n import _
import random
import util, dagutil

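# ---------------------------------------------------------------------------
# Illustrative sketch (hypothetical, not part of the original commit): a toy,
# self-contained simulation of the classification loop described in the
# module docstring. Everything here is made up for illustration (the toy DAG
# encoding, the `remoteknown` oracle standing in for remote.known(), and the
# helper name `_toyclassify`); the real implementation below works on
# dagutil objects and a wire peer instead.

def _toyclassify(parents, remoteknown, samplesize=2):
    """classify every node of a toy DAG as common or missing

    `parents` maps each node to a list of its parents; `remoteknown` is the
    set of nodes the mock remote side has. Returns a tuple
    (common, missing, roundtrips)."""
    # build the reverse (children) adjacency so we can walk descendants
    children = {}
    for n in parents:
        children.setdefault(n, [])
    for n, ps in parents.items():
        for p in ps:
            children[p].append(n)

    def reachable(node, edges):
        # node plus everything reachable by repeatedly following `edges`
        seen, stack = set(), [node]
        while stack:
            n = stack.pop()
            if n not in seen:
                seen.add(n)
                stack.extend(edges[n])
        return seen

    common, missing = set(), set()
    unknown = set(parents)
    roundtrips = 0
    while unknown:
        # each iteration stands for one remote.known() round trip
        sample = random.sample(sorted(unknown), min(samplesize, len(unknown)))
        roundtrips += 1
        for node in sample:
            if node in remoteknown:
                # remote has the node, hence all of its ancestors too
                common |= reachable(node, parents)
            else:
                # remote lacks the node, hence all of its descendants too
                missing |= reachable(node, children)
        unknown -= common | missing
    return common, missing, roundtrips

# For the chain a <- b <- c <- d with a remote that only has a and b:
#     _toyclassify({'a': [], 'b': ['a'], 'c': ['b'], 'd': ['c']},
#                  set(['a', 'b']))
# eventually returns ({'a', 'b'}, {'c', 'd'}, <number of queries>).
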
def _updatesample(dag, nodes, sample, always, quicksamplesize=0):
    """update an existing sample to match the expected size

    The sample is updated with nodes exponentially distant from each head of
    the <nodes> set (H~1, H~2, H~4, H~8, etc.).

    If a target size is specified, the sampling will stop once this size is
    reached. Otherwise sampling will happen until roots of the <nodes> set
    are reached.

    :dag: a dag object from dagutil
    :nodes: set of nodes we want to discover (if None, assume the whole dag)
    :sample: a sample to update
    :always: set of notable nodes that will be part of the sample anyway
    :quicksamplesize: optional target size of the sample"""
    # if nodes is empty we scan the entire graph
    if nodes:
        heads = dag.headsetofconnecteds(nodes)
    else:
        heads = dag.heads()
    dist = {}
    visit = util.deque(heads)
    seen = set()
    factor = 1
    while visit:
        curr = visit.popleft()
        if curr in seen:
            continue
        d = dist.setdefault(curr, 1)
        if d > factor:
            factor *= 2
        if d == factor:
            if curr not in always: # need this check for the early exit below
                sample.add(curr)
                if quicksamplesize and (len(sample) >= quicksamplesize):
                    return
        seen.add(curr)
        for p in dag.parents(curr):
            if not nodes or p in nodes:
                dist.setdefault(p, d + 1)
                visit.append(p)

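# Illustrative sketch (hypothetical, not part of the original commit):
# driving _updatesample over a ten-revision linear chain through a minimal
# stand-in dag object. `_ChainDag` is made up for illustration; real callers
# pass a dagutil.revlogdag. Heads start at distance 1, so the retained
# distances 1, 2, 4 and 8 correspond to revisions 9, 8, 6 and 2.

class _ChainDag(object):
    """toy dag for the chain 0 <- 1 <- ... <- 9 (9 is the only head)"""
    def headsetofconnecteds(self, nodes):
        return set([max(nodes)])
    def heads(self):
        return set([9])
    def parents(self, rev):
        return [rev - 1] if rev > 0 else []

# sample = set()
# _updatesample(_ChainDag(), set(range(10)), sample, always=set())
# sample == set([9, 8, 6, 2])
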
def _setupsample(dag, nodes, size):
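    """compute the common ingredients of a discovery sample

    Returns a tuple (always, sample, desiredlen): `always` holds the heads
    of <nodes>, which every sample must contain; `sample` is the empty set
    to fill with further nodes, or None when the heads alone already meet or
    exceed <size>; `desiredlen` is the number of extra nodes wanted beyond
    `always`. (Docstring derived from the code below.)"""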
    always = dag.headsetofconnecteds(nodes)
    desiredlen = size - len(always)
    if desiredlen <= 0:
        # This could be bad if there are very many heads, all unknown to the
        # server. We're counting on long request support here.
        return always, None, desiredlen
    return always, set(), desiredlen

def _takequicksample(dag, nodes, size):
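    """take a cheap sample of at most <size> nodes

    The sample contains the heads of <nodes> plus nodes picked at
    exponentially growing distances from the graph's heads (via
    _updatesample over the whole dag), stopping as soon as the quota is
    reached. (Docstring derived from the code below.)"""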
    always, sample, desiredlen = _setupsample(dag, nodes, size)
    if sample is None:
        return always
    _updatesample(dag, None, sample, always, quicksamplesize=desiredlen)
    sample.update(always)
    return sample

def _takefullsample(dag, nodes, size):
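    """take a sample of <size> nodes spread across <nodes>

    Nodes are gathered in both directions: at exponential distances from
    the heads of <nodes> and, using the inverted dag, from its roots. The
    combined sample is capped at the desired length and padded with random
    undecided nodes when the two passes did not yield enough. (Docstring
    derived from the code below.)"""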
    always, sample, desiredlen = _setupsample(dag, nodes, size)
    if sample is None:
        return always
    # update from heads
    _updatesample(dag, nodes, sample, always)
    # update from roots
    _updatesample(dag.inverse(), nodes, sample, always)
    assert sample
    sample = _limitsample(sample, desiredlen)
    if len(sample) < desiredlen:
        more = desiredlen - len(sample)
        sample.update(random.sample(list(nodes - sample - always), more))
    sample.update(always)
    return sample

def _limitsample(sample, desiredlen):
    """return a random subset of sample of at most desiredlen items"""
    if len(sample) > desiredlen:
        sample = set(random.sample(sample, desiredlen))
    return sample

def findcommonheads(ui, local, remote,
                    initialsamplesize=100,
                    fullsamplesize=200,
                    abortwhenunrelated=True):
    '''Return a tuple (common, anyincoming, remoteheads) used to identify
    missing nodes from or in remote.
    '''
    roundtrips = 0
    cl = local.changelog
    dag = dagutil.revlogdag(cl)

    # early exit if we know all the specified remote heads already
    ui.debug("query 1; heads\n")
    roundtrips += 1
    ownheads = dag.heads()
    sample = _limitsample(ownheads, initialsamplesize)
    # indices between sample and externalized version must match
    sample = list(sample)
    if remote.local():
        # stopgap until we have a proper localpeer that supports batch()
        srvheadhashes = remote.heads()
        yesno = remote.known(dag.externalizeall(sample))
    elif remote.capable('batch'):
        batch = remote.batch()
        srvheadhashesref = batch.heads()
        yesnoref = batch.known(dag.externalizeall(sample))
        batch.submit()
        srvheadhashes = srvheadhashesref.value
        yesno = yesnoref.value
    else:
        # compatibility with pre-batch, but post-known remotes during 1.9
        # development
        srvheadhashes = remote.heads()
        sample = []

    if cl.tip() == nullid:
        if srvheadhashes != [nullid]:
            return [nullid], True, srvheadhashes
        return [nullid], False, []

    # start actual discovery (we note this before the next "if" for
    # compatibility reasons)
    ui.status(_("searching for changes\n"))

    srvheads = dag.internalizeall(srvheadhashes, filterunknown=True)
    if len(srvheads) == len(srvheadhashes):
        ui.debug("all remote heads known locally\n")
        return (srvheadhashes, False, srvheadhashes,)

    if sample and len(ownheads) <= initialsamplesize and util.all(yesno):
        ui.note(_("all local heads known remotely\n"))
        ownheadhashes = dag.externalizeall(ownheads)
        return (ownheadhashes, True, srvheadhashes,)

    # full blown discovery

    # own nodes I know we both know
    # treat remote heads (and maybe own heads) as a first implicit sample
    # response
    common = cl.incrementalmissingrevs(srvheads)
    commoninsample = set(n for i, n in enumerate(sample) if yesno[i])
    common.addbases(commoninsample)
    # own nodes where I don't know if remote knows them
    undecided = set(common.missingancestors(ownheads))
    # own nodes I know remote lacks
    missing = set()

    full = False
    while undecided:

        if sample:
            missinginsample = [n for i, n in enumerate(sample) if not yesno[i]]
            missing.update(dag.descendantset(missinginsample, missing))

        undecided.difference_update(missing)

        if not undecided:
            break

        if full or common.hasbases():
            if full:
                ui.note(_("sampling from both directions\n"))
            else:
                ui.debug("taking initial sample\n")
            samplefunc = _takefullsample
            targetsize = fullsamplesize
        else:
            # use even cheaper initial sample
            ui.debug("taking quick initial sample\n")
            samplefunc = _takequicksample
            targetsize = initialsamplesize
        if len(undecided) < targetsize:
            sample = list(undecided)
        else:
            sample = samplefunc(dag, undecided, targetsize)
            sample = _limitsample(sample, targetsize)

        roundtrips += 1
        ui.progress(_('searching'), roundtrips, unit=_('queries'))
        ui.debug("query %i; still undecided: %i, sample size is: %i\n"
                 % (roundtrips, len(undecided), len(sample)))
        # indices between sample and externalized version must match
        sample = list(sample)
        yesno = remote.known(dag.externalizeall(sample))
        full = True

        if sample:
            commoninsample = set(n for i, n in enumerate(sample) if yesno[i])
            common.addbases(commoninsample)
            common.removeancestorsfrom(undecided)

    # heads(common) == heads(common.bases) since common represents common.bases
    # and all its ancestors
    result = dag.headsetofconnecteds(common.bases)
    # common.bases can include nullrev, but our contract requires us to not
    # return any heads in that case, so discard that
    result.discard(nullrev)
    ui.progress(_('searching'), None)
    ui.debug("%d total queries\n" % roundtrips)

    if not result and srvheadhashes != [nullid]:
        if abortwhenunrelated:
            raise util.Abort(_("repository is unrelated"))
        else:
            ui.warn(_("warning: repository is unrelated\n"))
        return (set([nullid]), True, srvheadhashes,)

    anyincoming = (srvheadhashes != [nullid])
    return dag.externalizeall(result), anyincoming, srvheadhashes
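
# Illustrative usage sketch (hypothetical, not part of the original commit):
# within Mercurial this function is normally reached through
# discovery.findcommonincoming() during pull and push. A direct call,
# assuming `repo` is a local repository object and `other` a peer obtained
# from hg.peer(), would look roughly like:
#
#     common, anyincoming, remoteheads = findcommonheads(repo.ui, repo, other)
#     # common:      hashes of the heads of the shared node set
#     # anyincoming: whether the remote may have changesets the local lacks
#     # remoteheads: the head hashes advertised by the remote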