discovery: be more conservative when adjusting the sample size...
marmoute
r42618:b9ff059f default
@@ -1,442 +1,442 @@
1 1 # setdiscovery.py - improved discovery of common nodeset for mercurial
2 2 #
3 3 # Copyright 2010 Benoit Boissinot <bboissin@gmail.com>
4 4 # and Peter Arrenbrecht <peter@arrenbrecht.ch>
5 5 #
6 6 # This software may be used and distributed according to the terms of the
7 7 # GNU General Public License version 2 or any later version.
8 8 """
9 9 The algorithm works in the following way. You have two repositories: local
10 10 and remote. They both contain a DAG of changelists.
11 11
12 12 The goal of the discovery protocol is to find one set of nodes, *common*,
13 13 the set of nodes shared by local and remote.
14 14
15 15 One of the issues with the original protocol was latency: it could
16 16 potentially require lots of roundtrips to discover that the local repo was a
17 17 subset of remote (which is a very common case; you usually have few changes
18 18 compared to upstream, while upstream probably has lots of development).
19 19
20 20 The new protocol only requires one interface for the remote repo: `known()`,
21 21 which, given a set of changelists, tells you if they are present in the DAG.
22 22
23 23 The algorithm then works as follows:
24 24
25 25 - We will be using three sets, `common`, `missing`, `unknown`. Originally
26 26 all nodes are in `unknown`.
27 27 - Take a sample from `unknown`, call `remote.known(sample)`
28 28 - For each node that remote knows, move it and all its ancestors to `common`
29 29 - For each node that remote doesn't know, move it and all its descendants
30 30 to `missing`
31 31 - Iterate until `unknown` is empty
32 32
33 33 There are a couple of optimizations. The first: instead of starting with a
34 34 random sample of missing, start by sending all heads; in the case where the
35 35 local repo is a subset, the answer is computed in one round trip.
36 36
37 37 Then you can do something similar to the bisecting strategy used when
38 38 finding faulty changesets: instead of random samples, you can try picking
39 39 nodes that will maximize the number of nodes classified alongside them
40 40 (since all of their ancestors or descendants will be marked as well).
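
A minimal sketch of this classification loop (illustrative only: names such as
`pick_sample` and the set-returning `ancestors`/`descendants` helpers are
assumptions; the real implementation batches queries and works on revsets
rather than plain sets):

    def classify(nodes, remote_known, pick_sample, ancestors, descendants):
        common, missing, unknown = set(), set(), set(nodes)
        while unknown:
            for node in pick_sample(unknown):
                if remote_known(node):
                    common |= (ancestors(node) & unknown) | {node}
                else:
                    missing |= (descendants(node) & unknown) | {node}
            unknown -= common | missing
        return common, missing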
41 41 """
42 42
43 43 from __future__ import absolute_import
44 44
45 45 import collections
46 46 import random
47 47
48 48 from .i18n import _
49 49 from .node import (
50 50 nullid,
51 51 nullrev,
52 52 )
53 53 from . import (
54 54 error,
55 55 util,
56 56 )
57 57
58 58 def _updatesample(revs, heads, sample, parentfn, quicksamplesize=0):
59 59 """update an existing sample to match the expected size
60 60
61 61 The sample is updated with revs exponentially distant from each head of the
62 62 <revs> set. (H~1, H~2, H~4, H~8, etc).
63 63
64 64 If a target size is specified, the sampling will stop once this size is
65 65 reached. Otherwise sampling will happen until roots of the <revs> set are
66 66 reached.
67 67
68 68 :revs: set of revs we want to discover (if None, assume the whole dag)
69 69 :heads: set of DAG head revs
70 70 :sample: a sample to update
71 71 :parentfn: a callable to resolve parents for a revision
72 72 :quicksamplesize: optional target size of the sample"""
73 73 dist = {}
74 74 visit = collections.deque(heads)
75 75 seen = set()
76 76 factor = 1
77 77 while visit:
78 78 curr = visit.popleft()
79 79 if curr in seen:
80 80 continue
81 81 d = dist.setdefault(curr, 1)
82 82 if d > factor:
83 83 factor *= 2
84 84 if d == factor:
85 85 sample.add(curr)
86 86 if quicksamplesize and (len(sample) >= quicksamplesize):
87 87 return
88 88 seen.add(curr)
89 89
90 90 for p in parentfn(curr):
91 91 if p != nullrev and (not revs or p in revs):
92 92 dist.setdefault(p, d + 1)
93 93 visit.append(p)
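# Editorial illustration, not part of the module: on a linear chain
# 0 <- 1 <- ... <- 8 with head 8, the exponential spacing keeps the revs at
# distances 1, 2, 4 and 8 from the head:
#
#     sample = set()
#     _updatesample(set(range(9)), [8], sample, lambda r: [r - 1])
#     # sample is now {8, 7, 5, 1}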
94 94
95 95 def _limitsample(sample, desiredlen):
96 96 """return a random subset of sample of at most desiredlen item"""
97 97 if len(sample) > desiredlen:
98 98 sample = set(random.sample(sample, desiredlen))
99 99 return sample
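# For example, _limitsample({1, 2, 3, 4}, 2) returns a random two-element
# subset such as {1, 3}, while a sample already within the limit is
# returned unchanged.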
100 100
101 101 class partialdiscovery(object):
102 102 """an object representing ongoing discovery
103 103
104 104 Fed with data from the remote repository, this object keeps track of the
105 105 current set of changesets in various states:
106 106
107 107 - common: revs also known remotely
108 108 - undecided: revs we don't have information on yet
109 109 - missing: revs missing remotely
110 110 (all tracked revisions are known locally)
111 111 """
112 112
113 113 def __init__(self, repo, targetheads, respectsize):
114 114 self._repo = repo
115 115 self._targetheads = targetheads
116 116 self._common = repo.changelog.incrementalmissingrevs()
117 117 self._undecided = None
118 118 self.missing = set()
119 119 self._childrenmap = None
120 120 self._respectsize = respectsize
121 121
122 122 def addcommons(self, commons):
123 123 """register nodes known as common"""
124 124 self._common.addbases(commons)
125 125 if self._undecided is not None:
126 126 self._common.removeancestorsfrom(self._undecided)
127 127
128 128 def addmissings(self, missings):
129 129 """register some nodes as missing"""
130 130 newmissing = self._repo.revs('%ld::%ld', missings, self.undecided)
131 131 if newmissing:
132 132 self.missing.update(newmissing)
133 133 self.undecided.difference_update(newmissing)
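# The revset '%ld::%ld' above selects every rev lying on a DAG path from
# a newly missing rev to an undecided rev (endpoints included): anything
# descending from a missing rev cannot be present remotely either, so the
# whole range moves to `missing` in one go.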
134 134
135 135 def addinfo(self, sample):
136 136 """consume an iterable of (rev, known) tuples"""
137 137 common = set()
138 138 missing = set()
139 139 for rev, known in sample:
140 140 if known:
141 141 common.add(rev)
142 142 else:
143 143 missing.add(rev)
144 144 if common:
145 145 self.addcommons(common)
146 146 if missing:
147 147 self.addmissings(missing)
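# Hypothetical usage, mirroring what findcommonheads() below does with a
# remote `known` reply (revs and booleans made up for illustration):
#
#     disco.addinfo(zip([12, 42, 7], [True, False, True]))
#     # 12 and 7 (plus their ancestors) become common; 42 (plus its
#     # undecided descendants) becomes missing.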
148 148
149 149 def hasinfo(self):
150 150 """return True is we have any clue about the remote state"""
151 151 return self._common.hasbases()
152 152
153 153 def iscomplete(self):
154 154 """True if all the necessary data have been gathered"""
155 155 return self._undecided is not None and not self._undecided
156 156
157 157 @property
158 158 def undecided(self):
159 159 if self._undecided is not None:
160 160 return self._undecided
161 161 self._undecided = set(self._common.missingancestors(self._targetheads))
162 162 return self._undecided
163 163
164 164 def stats(self):
165 165 return {
166 166 'undecided': len(self.undecided),
167 167 }
168 168
169 169 def commonheads(self):
170 170 """the heads of the known common set"""
171 171 # heads(common) == heads(common.bases) since common represents
172 172 # common.bases and all its ancestors
173 173 return self._common.basesheads()
174 174
175 175 def _parentsgetter(self):
176 176 getrev = self._repo.changelog.index.__getitem__
177 177 def getparents(r):
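# a changelog index entry stores the two parent revs at positions
# 5 and 6, hence the [5:7] slice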
178 178 return getrev(r)[5:7]
179 179 return getparents
180 180
181 181 def _childrengetter(self):
182 182
183 183 if self._childrenmap is not None:
184 184 # During discovery, the `undecided` set keeps shrinking.
185 185 # Therefore, the map computed for an iteration N will be
186 186 # valid for iteration N+1. Instead of computing the same
187 187 # data over and over, we cache it the first time.
188 188 return self._childrenmap.__getitem__
189 189
190 190 # _updatesample() essentially iterates over revisions to look
191 191 # up their children. This lookup is expensive and doing it in a loop is
192 192 # quadratic. We precompute the children for all relevant revisions and
193 193 # make the lookup in _updatesample() a simple dict lookup.
194 194 self._childrenmap = children = {}
195 195
196 196 parentrevs = self._parentsgetter()
197 197 revs = self.undecided
198 198
199 199 for rev in sorted(revs):
200 200 # Always ensure revision has an entry so we don't need to worry
201 201 # about missing keys.
202 202 children[rev] = []
203 203 for prev in parentrevs(rev):
204 204 if prev == nullrev:
205 205 continue
206 206 c = children.get(prev)
207 207 if c is not None:
208 208 c.append(rev)
209 209 return children.__getitem__
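# Illustration: for undecided revs forming 0 <- 1 <- 2 plus a second
# child 0 <- 3, the precomputed map is {0: [1, 3], 1: [2], 2: [], 3: []}
# and the callable returned above is a plain dict lookup.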
210 210
211 211 def takequicksample(self, headrevs, size):
212 212 """takes a quick sample of size <size>
213 213
214 214 It is meant for initial sampling and focuses on querying heads and close
215 215 ancestors of heads.
216 216
217 217 :headrevs: set of head revisions in local DAG to consider
218 218 :size: the maximum size of the sample"""
219 219 revs = self.undecided
220 220 if len(revs) <= size:
221 221 return list(revs)
222 222 sample = set(self._repo.revs('heads(%ld)', revs))
223 223
224 224 if len(sample) >= size:
225 225 return _limitsample(sample, size)
226 226
227 227 _updatesample(None, headrevs, sample, self._parentsgetter(),
228 228 quicksamplesize=size)
229 229 return sample
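# Hypothetical example: with 5000 undecided revs and size=100, the sample
# starts from the undecided heads and, if they number fewer than 100, is
# topped up with revs at exponentially growing distances from `headrevs`.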
230 230
231 231 def takefullsample(self, headrevs, size):
232 232 revs = self.undecided
233 233 if len(revs) <= size:
234 234 return list(revs)
235 235 repo = self._repo
236 236 sample = set(repo.revs('heads(%ld)', revs))
237 237 parentrevs = self._parentsgetter()
238 238
239 239 # update from heads
240 240 revsheads = sample.copy()
241 241 _updatesample(revs, revsheads, sample, parentrevs)
242 242
243 243 # update from roots
244 244 revsroots = set(repo.revs('roots(%ld)', revs))
245 if not self._respectsize:
246 size = max(size, len(revsroots))
247
248 245 childrenrevs = self._childrengetter()
249
250 246 _updatesample(revs, revsroots, sample, childrenrevs)
251 247 assert sample
248
249 if not self._respectsize:
250 size = max(size, min(len(revsroots), len(revsheads)))
251
252 252 sample = _limitsample(sample, size)
253 253 if len(sample) < size:
254 254 more = size - len(sample)
255 255 sample.update(random.sample(list(revs - sample), more))
256 256 return sample
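# Illustration of the size adjustment above (the point of this changeset):
# with size=200, 1000 undecided roots and 50 undecided heads, the previous
# code inflated the sample to max(200, 1000) = 1000 revs, whereas the new
# code asks for max(200, min(1000, 50)) = 200, only growing the sample
# when *both* sides of the undecided graph are wide.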
257 257
258 258 def findcommonheads(ui, local, remote,
259 259 initialsamplesize=100,
260 260 fullsamplesize=200,
261 261 abortwhenunrelated=True,
262 262 ancestorsof=None,
263 263 samplegrowth=1.05):
264 264 '''Return a tuple (common, anyincoming, remoteheads) used to identify
265 265 missing nodes from or in remote.
266 266 '''
267 267 start = util.timer()
268 268
269 269 roundtrips = 0
270 270 cl = local.changelog
271 271 clnode = cl.node
272 272 clrev = cl.rev
273 273
274 274 if ancestorsof is not None:
275 275 ownheads = [clrev(n) for n in ancestorsof]
276 276 else:
277 277 ownheads = [rev for rev in cl.headrevs() if rev != nullrev]
278 278
279 279 # early exit if we know all the specified remote heads already
280 280 ui.debug("query 1; heads\n")
281 281 roundtrips += 1
282 282 # We also ask remote about all the local heads. That set can be arbitrarily
283 283 # large, so we used to limit its size to `initialsamplesize`. We no longer
284 284 # do, as that proved counterproductive. The skipped heads could lead to a
285 285 # large "undecided" set, slower to clarify than if we had asked the
286 286 # question for all heads right away.
287 287 #
288 288 # We are already fetching all server heads using the `heads` command;
289 289 # sending an equivalent number of heads the other way should not have a
290 290 # significant impact. In addition, it is very likely that we will have
291 291 # to issue "known" requests for an equivalent number of revisions in
292 292 # order to decide whether these heads are common or missing.
293 293 #
294 294 # A detailed analysis follows.
295 295 #
296 296 # Case A: local and server both have few heads
297 297 #
298 298 # Ownheads is below initialsamplesize; the limit would not have any effect.
299 299 #
300 300 # Case B: local has few heads and server has many
301 301 #
302 302 # Ownheads is below initialsamplesize; the limit would not have any effect.
303 303 #
304 304 # Case C: local and server both have many heads
305 305 #
306 306 # We now transfer somewhat more data, but not significantly more than is
307 307 # already transferred to carry the server heads.
308 308 #
309 309 # Case D: local has many heads, server has few
310 310 #
311 311 # D.1 local heads are mostly known remotely
312 312 #
313 313 # All the known heads will have to be part of a `known` request at some
314 314 # point for the discovery to finish. Sending them all earlier actually
315 315 # helps.
316 316 #
317 317 # (This case is fairly unlikely; it requires the numerous local heads to
318 318 # all be merged server side into only a few heads.)
319 319 #
320 320 # D.2 local heads are mostly missing remotely
321 321 #
322 322 # To determine that the heads are missing, we will have to issue `known`
323 323 # requests for them or their ancestors. The number of `known` requests
324 324 # will likely be of the same order of magnitude as the number of local
325 325 # heads.
326 326 #
327 327 # The only case where we can be more efficient using `known` requests on
328 328 # ancestors is when all the "missing" local heads are based on a few
329 329 # changesets that are also "missing". This means we would have a "complex"
330 330 # graph (with many heads) attached to, but largely independent of, the
331 331 # "simple" graph on the server. This is a fairly unusual case and has
332 332 # not been met in the wild so far.
333 333 if remote.limitedarguments:
334 334 sample = _limitsample(ownheads, initialsamplesize)
335 335 # indices between sample and externalized version must match
336 336 sample = list(sample)
337 337 else:
338 338 sample = ownheads
339 339
340 340 with remote.commandexecutor() as e:
341 341 fheads = e.callcommand('heads', {})
342 342 fknown = e.callcommand('known', {
343 343 'nodes': [clnode(r) for r in sample],
344 344 })
345 345
346 346 srvheadhashes, yesno = fheads.result(), fknown.result()
347 347
348 348 if cl.tip() == nullid:
349 349 if srvheadhashes != [nullid]:
350 350 return [nullid], True, srvheadhashes
351 351 return [nullid], False, []
352 352
353 353 # start actual discovery (we note this before the next "if" for
354 354 # compatibility reasons)
355 355 ui.status(_("searching for changes\n"))
356 356
357 357 knownsrvheads = [] # revnos of remote heads that are known locally
358 358 for node in srvheadhashes:
359 359 if node == nullid:
360 360 continue
361 361
362 362 try:
363 363 knownsrvheads.append(clrev(node))
364 364 # Catches unknown and filtered nodes.
365 365 except error.LookupError:
366 366 continue
367 367
368 368 if len(knownsrvheads) == len(srvheadhashes):
369 369 ui.debug("all remote heads known locally\n")
370 370 return srvheadhashes, False, srvheadhashes
371 371
372 372 if len(sample) == len(ownheads) and all(yesno):
373 373 ui.note(_("all local heads known remotely\n"))
374 374 ownheadhashes = [clnode(r) for r in ownheads]
375 375 return ownheadhashes, True, srvheadhashes
376 376
377 377 # full blown discovery
378 378
379 379 disco = partialdiscovery(local, ownheads, remote.limitedarguments)
380 380 # treat remote heads (and maybe own heads) as a first implicit sample
381 381 # response
382 382 disco.addcommons(knownsrvheads)
383 383 disco.addinfo(zip(sample, yesno))
384 384
385 385 full = False
386 386 progress = ui.makeprogress(_('searching'), unit=_('queries'))
387 387 while not disco.iscomplete():
388 388
389 389 if full or disco.hasinfo():
390 390 if full:
391 391 ui.note(_("sampling from both directions\n"))
392 392 else:
393 393 ui.debug("taking initial sample\n")
394 394 samplefunc = disco.takefullsample
395 395 targetsize = fullsamplesize
396 396 if not remote.limitedarguments:
397 397 fullsamplesize = int(fullsamplesize * samplegrowth)
398 398 else:
399 399 # use even cheaper initial sample
400 400 ui.debug("taking quick initial sample\n")
401 401 samplefunc = disco.takequicksample
402 402 targetsize = initialsamplesize
403 403 sample = samplefunc(ownheads, targetsize)
404 404
405 405 roundtrips += 1
406 406 progress.update(roundtrips)
407 407 stats = disco.stats()
408 408 ui.debug("query %i; still undecided: %i, sample size is: %i\n"
409 409 % (roundtrips, stats['undecided'], len(sample)))
410 410
411 411 # indices between sample and externalized version must match
412 412 sample = list(sample)
413 413
414 414 with remote.commandexecutor() as e:
415 415 yesno = e.callcommand('known', {
416 416 'nodes': [clnode(r) for r in sample],
417 417 }).result()
418 418
419 419 full = True
420 420
421 421 disco.addinfo(zip(sample, yesno))
422 422
423 423 result = disco.commonheads()
424 424 elapsed = util.timer() - start
425 425 progress.complete()
426 426 ui.debug("%d total queries in %.4fs\n" % (roundtrips, elapsed))
427 427 msg = ('found %d common and %d unknown server heads,'
428 428 ' %d roundtrips in %.4fs\n')
429 429 missing = set(result) - set(knownsrvheads)
430 430 ui.log('discovery', msg, len(result), len(missing), roundtrips,
431 431 elapsed)
432 432
433 433 if not result and srvheadhashes != [nullid]:
434 434 if abortwhenunrelated:
435 435 raise error.Abort(_("repository is unrelated"))
436 436 else:
437 437 ui.warn(_("warning: repository is unrelated\n"))
438 438 return ({nullid}, True, srvheadhashes,)
439 439
440 440 anyincoming = (srvheadhashes != [nullid])
441 441 result = {clnode(r) for r in result}
442 442 return result, anyincoming, srvheadhashes
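# Hypothetical caller, sketching how exchange code might consume this API
# (`repo` and `remote` stand for a local repository object and a remote
# peer; this is illustrative, not quoted from mercurial.exchange):
#
#     common, anyincoming, remoteheads = findcommonheads(
#         ui, repo, remote, abortwhenunrelated=False)
#     if anyincoming:
#         pass  # fetch the changesets missing locally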