discovery: move handling of sampling special case inside sampling function...
Boris Feld - r41146:3c85a62d default
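
This is a small refactoring: `findcommonheads` previously special-cased undecided sets no larger than the target size at the call site, and that guard now lives at the top of the sampling functions themselves. An abridged before/after view, excerpted from the diff below:

    # before: the call site special-cases small undecided sets
    if len(undecided) <= targetsize:
        sample = list(undecided)
    else:
        sample = samplefunc(local, ownheads, undecided, targetsize)

    # after: each sampling function short-circuits on small inputs itself
    def _takequicksample(repo, headrevs, revs, size):
        if len(revs) <= size:
            return list(revs)
        ...

    sample = samplefunc(local, ownheads, undecided, targetsize)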
@@ -1,313 +1,314 @@
1 1 # setdiscovery.py - improved discovery of common nodeset for mercurial
2 2 #
3 3 # Copyright 2010 Benoit Boissinot <bboissin@gmail.com>
4 4 # and Peter Arrenbrecht <peter@arrenbrecht.ch>
5 5 #
6 6 # This software may be used and distributed according to the terms of the
7 7 # GNU General Public License version 2 or any later version.
8 8 """
9 9 The algorithm works in the following way. You have two repositories: local and
10 10 remote. They both contain a DAG of changelists.
11 11
12 12 The goal of the discovery protocol is to find one set of nodes, *common*,
13 13 the set of nodes shared by local and remote.
14 14
15 15 One of the issues with the original protocol was latency: it could
16 16 potentially require lots of roundtrips to discover that the local repo was a
17 17 subset of remote (which is a very common case; you usually have few changes
18 18 compared to upstream, while upstream probably has lots of development).
19 19
20 20 The new protocol only requires one interface for the remote repo: `known()`,
21 21 which, given a set of changelists, tells you if they are present in the DAG.
22 22
23 23 The algorithm then works as follows:
24 24
25 25 - We will be using three sets, `common`, `missing`, `unknown`. Originally
26 26 all nodes are in `unknown`.
27 27 - Take a sample from `unknown`, call `remote.known(sample)`
28 28 - For each node that remote knows, move it and all its ancestors to `common`
29 29 - For each node that remote doesn't know, move it and all its descendants
30 30 to `missing`
31 31 - Iterate until `unknown` is empty
32 32
33 33 There are a couple of optimizations. First, instead of starting with a random
34 34 sample of missing, start by sending all heads; in the case where the local
35 35 repo is a subset, you compute the answer in one round trip.
36 36
37 37 Then you can do something similar to the bisecting strategy used when
38 38 finding faulty changesets. Instead of random samples, you can try picking
39 39 nodes that will maximize the number of nodes that will be
40 40 classified with them (since all ancestors or descendants will be marked as well).
41 41 """
42 42
43 43 from __future__ import absolute_import
44 44
45 45 import collections
46 46 import random
47 47
48 48 from .i18n import _
49 49 from .node import (
50 50 nullid,
51 51 nullrev,
52 52 )
53 53 from . import (
54 54 error,
55 55 util,
56 56 )
57 57
58 58 def _updatesample(revs, heads, sample, parentfn, quicksamplesize=0):
59 59 """update an existing sample to match the expected size
60 60
61 61 The sample is updated with revs exponentially distant from each head of the
62 62 <revs> set. (H~1, H~2, H~4, H~8, etc).
63 63
64 64 If a target size is specified, the sampling will stop once this size is
65 65 reached. Otherwise sampling will happen until roots of the <revs> set are
66 66 reached.
67 67
68 68 :revs: set of revs we want to discover (if None, assume the whole dag)
69 69 :heads: set of DAG head revs
70 70 :sample: a sample to update
71 71 :parentfn: a callable to resolve parents for a revision
72 72 :quicksamplesize: optional target size of the sample"""
73 73 dist = {}
74 74 visit = collections.deque(heads)
75 75 seen = set()
76 76 factor = 1
77 77 while visit:
78 78 curr = visit.popleft()
79 79 if curr in seen:
80 80 continue
81 81 d = dist.setdefault(curr, 1)
82 82 if d > factor:
83 83 factor *= 2
84 84 if d == factor:
85 85 sample.add(curr)
86 86 if quicksamplesize and (len(sample) >= quicksamplesize):
87 87 return
88 88 seen.add(curr)
89 89
90 90 for p in parentfn(curr):
91 91 if p != nullrev and (not revs or p in revs):
92 92 dist.setdefault(p, d + 1)
93 93 visit.append(p)
94 94
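
As a concrete trace of the exponential sampling above, consider a toy linear chain 0..9 whose head is rev 9 and where each rev's only parent is rev - 1 (a hedged sketch, not real repository data):

    sample = set()
    _updatesample(None, [9], sample, lambda rev: [rev - 1])
    # a rev enters the sample when its distance from the head reaches the
    # doubling factor: rev 9 (distance 1), 8 (2), 6 (4), 2 (8)
    assert sorted(sample) == [2, 6, 8, 9]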
95 95 def _takequicksample(repo, headrevs, revs, size):
96 96 """takes a quick sample of size <size>
97 97
98 98 It is meant for initial sampling and focuses on querying heads and close
99 99 ancestors of heads.
100 100
101 101 :repo: the local repository object
102 102 :headrevs: set of head revisions in local DAG to consider
103 103 :revs: set of revs to discover
104 104 :size: the maximum size of the sample"""
105 if len(revs) <= size:
106 return list(revs)
105 107 sample = set(repo.revs('heads(%ld)', revs))
106 108
107 109 if len(sample) >= size:
108 110 return _limitsample(sample, size)
109 111
110 112 _updatesample(None, headrevs, sample, repo.changelog.parentrevs,
111 113 quicksamplesize=size)
112 114 return sample
113 115
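
With the guard this commit adds, a small undecided set now short-circuits inside the function itself, e.g. (hypothetical values):

    _takequicksample(repo, headrevs, {5, 7}, 200)   # -> [5, 7]
    # the revs are returned verbatim; neither the heads() revset nor the
    # sampling walk runs at all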
114 116 def _takefullsample(repo, headrevs, revs, size):
117 if len(revs) <= size:
118 return list(revs)
115 119 sample = set(repo.revs('heads(%ld)', revs))
116 120
117 121 # update from heads
118 122 revsheads = set(repo.revs('heads(%ld)', revs))
119 123 _updatesample(revs, revsheads, sample, repo.changelog.parentrevs)
120 124
121 125 # update from roots
122 126 revsroots = set(repo.revs('roots(%ld)', revs))
123 127
124 128 # _updatesample() essentially iterates over revisions to look up
125 129 # their children. This lookup is expensive and doing it in a loop is
126 130 # quadratic. We precompute the children for all relevant revisions and
127 131 # make the lookup in _updatesample() a simple dict lookup.
128 132 #
129 133 # Because this function can be called multiple times during discovery, we
130 134 # may still perform redundant work and there is room to optimize this by
131 135 # keeping a persistent cache of children across invocations.
132 136 children = {}
133 137
134 138 parentrevs = repo.changelog.parentrevs
135 139 for rev in repo.changelog.revs(start=min(revsroots)):
136 140 # Always ensure revision has an entry so we don't need to worry about
137 141 # missing keys.
138 142 children.setdefault(rev, [])
139 143
140 144 for prev in parentrevs(rev):
141 145 if prev == nullrev:
142 146 continue
143 147
144 148 children.setdefault(prev, []).append(rev)
145 149
146 150 _updatesample(revs, revsroots, sample, children.__getitem__)
147 151 assert sample
148 152 sample = _limitsample(sample, size)
149 153 if len(sample) <= size:
150 154 more = size - len(sample)
151 155 sample.update(random.sample(list(revs - sample), more))
152 156 return sample
153 157
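
The children precomputation commented above can be checked on a toy DAG 0 -> {1, 2} -> 3 (hypothetical parentrevs data, with -1 standing in for nullrev):

    toyparents = {0: (-1, -1), 1: (0, -1), 2: (0, -1), 3: (1, 2)}.__getitem__
    children = {}
    for rev in range(4):
        children.setdefault(rev, [])           # every rev gets an entry
        for prev in toyparents(rev):
            if prev != -1:                     # skip nullrev
                children.setdefault(prev, []).append(rev)
    assert children == {0: [1, 2], 1: [3], 2: [3], 3: []}
    # children.__getitem__ then gives _updatesample an O(1) child lookup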
154 158 def _limitsample(sample, desiredlen):
155 159 """return a random subset of sample of at most desiredlen item"""
156 160 if len(sample) > desiredlen:
157 161 sample = set(random.sample(sample, desiredlen))
158 162 return sample
159 163
160 164 def findcommonheads(ui, local, remote,
161 165 initialsamplesize=100,
162 166 fullsamplesize=200,
163 167 abortwhenunrelated=True,
164 168 ancestorsof=None):
165 169 '''Return a tuple (common, anyincoming, remoteheads) used to identify
166 170 missing nodes from or in remote.
167 171 '''
168 172 start = util.timer()
169 173
170 174 roundtrips = 0
171 175 cl = local.changelog
172 176 clnode = cl.node
173 177 clrev = cl.rev
174 178
175 179 if ancestorsof is not None:
176 180 ownheads = [clrev(n) for n in ancestorsof]
177 181 else:
178 182 ownheads = [rev for rev in cl.headrevs() if rev != nullrev]
179 183
180 184 # early exit if we know all the specified remote heads already
181 185 ui.debug("query 1; heads\n")
182 186 roundtrips += 1
183 187 sample = _limitsample(ownheads, initialsamplesize)
184 188 # indices between sample and externalized version must match
185 189 sample = list(sample)
186 190
187 191 with remote.commandexecutor() as e:
188 192 fheads = e.callcommand('heads', {})
189 193 fknown = e.callcommand('known', {
190 194 'nodes': [clnode(r) for r in sample],
191 195 })
192 196
193 197 srvheadhashes, yesno = fheads.result(), fknown.result()
194 198
195 199 if cl.tip() == nullid:
196 200 if srvheadhashes != [nullid]:
197 201 return [nullid], True, srvheadhashes
198 202 return [nullid], False, []
199 203
200 204 # start actual discovery (we note this before the next "if" for
201 205 # compatibility reasons)
202 206 ui.status(_("searching for changes\n"))
203 207
204 208 srvheads = []
205 209 for node in srvheadhashes:
206 210 if node == nullid:
207 211 continue
208 212
209 213 try:
210 214 srvheads.append(clrev(node))
211 215 # Catches unknown and filtered nodes.
212 216 except error.LookupError:
213 217 continue
214 218
215 219 if len(srvheads) == len(srvheadhashes):
216 220 ui.debug("all remote heads known locally\n")
217 221 return srvheadhashes, False, srvheadhashes
218 222
219 223 if len(sample) == len(ownheads) and all(yesno):
220 224 ui.note(_("all local heads known remotely\n"))
221 225 ownheadhashes = [clnode(r) for r in ownheads]
222 226 return ownheadhashes, True, srvheadhashes
223 227
224 228 # full blown discovery
225 229
226 230 # own nodes I know we both know
227 231 # treat remote heads (and maybe own heads) as a first implicit sample
228 232 # response
229 233 common = cl.incrementalmissingrevs(srvheads)
230 234 commoninsample = set(n for i, n in enumerate(sample) if yesno[i])
231 235 common.addbases(commoninsample)
232 236 # own nodes where I don't know if remote knows them
233 237 undecided = set(common.missingancestors(ownheads))
234 238 # own nodes I know remote lacks
235 239 missing = set()
236 240
237 241 full = False
238 242 progress = ui.makeprogress(_('searching'), unit=_('queries'))
239 243 while undecided:
240 244
241 245 if sample:
242 246 missinginsample = [n for i, n in enumerate(sample) if not yesno[i]]
243 247
244 248 if missing:
245 249 missing.update(local.revs('descendants(%ld) - descendants(%ld)',
246 250 missinginsample, missing))
247 251 else:
248 252 missing.update(local.revs('descendants(%ld)', missinginsample))
249 253
250 254 undecided.difference_update(missing)
251 255
252 256 if not undecided:
253 257 break
254 258
255 259 if full or common.hasbases():
256 260 if full:
257 261 ui.note(_("sampling from both directions\n"))
258 262 else:
259 263 ui.debug("taking initial sample\n")
260 264 samplefunc = _takefullsample
261 265 targetsize = fullsamplesize
262 266 else:
263 267 # use even cheaper initial sample
264 268 ui.debug("taking quick initial sample\n")
265 269 samplefunc = _takequicksample
266 270 targetsize = initialsamplesize
267 if len(undecided) <= targetsize:
268 sample = list(undecided)
269 else:
270 sample = samplefunc(local, ownheads, undecided, targetsize)
271 sample = samplefunc(local, ownheads, undecided, targetsize)
271 272
272 273 roundtrips += 1
273 274 progress.update(roundtrips)
274 275 ui.debug("query %i; still undecided: %i, sample size is: %i\n"
275 276 % (roundtrips, len(undecided), len(sample)))
276 277 # indices between sample and externalized version must match
277 278 sample = list(sample)
278 279
279 280 with remote.commandexecutor() as e:
280 281 yesno = e.callcommand('known', {
281 282 'nodes': [clnode(r) for r in sample],
282 283 }).result()
283 284
284 285 full = True
285 286
286 287 if sample:
287 288 commoninsample = set(n for i, n in enumerate(sample) if yesno[i])
288 289 common.addbases(commoninsample)
289 290 common.removeancestorsfrom(undecided)
290 291
291 292 # heads(common) == heads(common.bases) since common represents common.bases
292 293 # and all its ancestors
293 294 # The presence of nullrev will confuse heads(). So filter it out.
294 295 result = set(local.revs('heads(%ld)', common.bases - {nullrev}))
295 296 elapsed = util.timer() - start
296 297 progress.complete()
297 298 ui.debug("%d total queries in %.4fs\n" % (roundtrips, elapsed))
298 299 msg = ('found %d common and %d unknown server heads,'
299 300 ' %d roundtrips in %.4fs\n')
300 301 missing = set(result) - set(srvheads)
301 302 ui.log('discovery', msg, len(result), len(missing), roundtrips,
302 303 elapsed)
303 304
304 305 if not result and srvheadhashes != [nullid]:
305 306 if abortwhenunrelated:
306 307 raise error.Abort(_("repository is unrelated"))
307 308 else:
308 309 ui.warn(_("warning: repository is unrelated\n"))
309 310 return ({nullid}, True, srvheadhashes,)
310 311
311 312 anyincoming = (srvheadhashes != [nullid])
312 313 result = {clnode(r) for r in result}
313 314 return result, anyincoming, srvheadhashes
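
For context, a hedged usage sketch of the entry point; `repo` and `peer` (e.g. from `mercurial.hg.peer`) are assumed to exist and are not shown here:

    common, anyincoming, remoteheads = findcommonheads(
        repo.ui, repo, peer, abortwhenunrelated=False)
    # common:      node ids of the heads of the shared history
    # anyincoming: True if the remote has changesets the local repo lacks
    # remoteheads: the remote's head node ids as reported by its heads command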