py3-discovery: using plain str in stats dict...
Georges Racinet
r43432:3c6976b1 default
@@ -1,483 +1,483 @@
1 1 # setdiscovery.py - improved discovery of common nodeset for mercurial
2 2 #
3 3 # Copyright 2010 Benoit Boissinot <bboissin@gmail.com>
4 4 # and Peter Arrenbrecht <peter@arrenbrecht.ch>
5 5 #
6 6 # This software may be used and distributed according to the terms of the
7 7 # GNU General Public License version 2 or any later version.
8 8 """
9 9 The algorithm works in the following way. You have two repositories: local and
10 10 remote. They both contain a DAG of changelists.
11 11
12 12 The goal of the discovery protocol is to find one set of nodes, *common*:
13 13 the set of nodes shared by local and remote.
14 14
15 15 One of the issues with the original protocol was latency: it could
16 16 potentially require lots of roundtrips to discover that the local repo was a
17 17 subset of remote (which is a very common case: you usually have few changes
18 18 compared to upstream, while upstream probably has lots of development).
19 19
20 20 The new protocol only requires one interface for the remote repo: `known()`,
21 21 which given a set of changelists tells you if they are present in the DAG.
22 22
23 23 The algorithm then works as follows (a sketch in code follows this docstring):
24 24
25 25 - We will be using three sets, `common`, `missing`, `unknown`. Originally
26 26 all nodes are in `unknown`.
27 27 - Take a sample from `unknown`, call `remote.known(sample)`
28 28 - For each node that remote knows, move it and all its ancestors to `common`
29 29 - For each node that remote doesn't know, move it and all its descendants
30 30 to `missing`
31 31 - Iterate until `unknown` is empty
32 32
33 33 There are a couple of optimizations. First, instead of starting with a random
34 34 sample of missing, start by sending all heads; in the case where the local
35 35 repo is a subset, you compute the answer in one round trip.
36 36
37 37 Then you can do something similar to the bisecting strategy used when
38 38 finding faulty changesets. Instead of random samples, you can try picking
39 39 nodes that will maximize the number of nodes that will be
40 40 classified with them (since all ancestors or descendants will be marked as well).
41 41 """
42 42
43 43 from __future__ import absolute_import
44 44
45 45 import collections
46 46 import random
47 47
48 48 from .i18n import _
49 49 from .node import (
50 50 nullid,
51 51 nullrev,
52 52 )
53 53 from . import (
54 54 error,
55 55 policy,
56 56 util,
57 57 )
58 58
59 59
60 60 def _updatesample(revs, heads, sample, parentfn, quicksamplesize=0):
61 61 """update an existing sample to match the expected size
62 62
63 63 The sample is updated with revs exponentially distant from each head of the
64 64 <revs> set. (H~1, H~2, H~4, H~8, etc).
65 65
66 66 If a target size is specified, the sampling will stop once this size is
67 67 reached. Otherwise sampling will happen until roots of the <revs> set are
68 68 reached.
69 69
70 70 :revs: set of revs we want to discover (if None, assume the whole dag)
71 71 :heads: set of DAG head revs
72 72 :sample: a sample to update
73 73 :parentfn: a callable to resolve parents for a revision
74 74 :quicksamplesize: optional target size of the sample"""
75 75 dist = {}
76 76 visit = collections.deque(heads)
77 77 seen = set()
78 78 factor = 1
79 79 while visit:
80 80 curr = visit.popleft()
81 81 if curr in seen:
82 82 continue
83 83 d = dist.setdefault(curr, 1)
84 84 if d > factor:
85 85 factor *= 2
86 86 if d == factor:
87 87 sample.add(curr)
88 88 if quicksamplesize and (len(sample) >= quicksamplesize):
89 89 return
90 90 seen.add(curr)
91 91
92 92 for p in parentfn(curr):
93 93 if p != nullrev and (not revs or p in revs):
94 94 dist.setdefault(p, d + 1)
95 95 visit.append(p)
96 96
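As a worked example of the exponential spacing, consider a linear chain of revisions 0..10 with a single head at 10 and parent function r -> r - 1 (a sketch only, relying on Mercurial's convention that nullrev is -1, so revision 0's parent is skipped):

heads = {10}
sample = set()
_updatesample(set(range(11)), heads, sample, lambda r: [r - 1])
print(sorted(sample))  # [3, 7, 9, 10]: the head, then H~1, H~3, H~7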
97 97
98 98 def _limitsample(sample, desiredlen, randomize=True):
99 99 """return a random subset of sample of at most desiredlen item.
100 100
101 101 If randomize is False, though, a deterministic subset is returned.
102 102 This is meant for integration tests.
103 103 """
104 104 if len(sample) <= desiredlen:
105 105 return sample
106 106 if randomize:
107 107 return set(random.sample(sample, desiredlen))
108 108 sample = list(sample)
109 109 sample.sort()
110 110 return set(sample[:desiredlen])
111 111
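A quick illustrative use of the function above; with randomize=False the smallest items survive, which is what makes it reproducible in integration tests:

revs = {5, 3, 9, 1, 7}
print(_limitsample(revs, 10))                  # returned as-is: already small enough
print(_limitsample(revs, 2, randomize=False))  # deterministic: {1, 3}
print(len(_limitsample(revs, 2)))              # 2, members chosen at random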
112 112
113 113 class partialdiscovery(object):
114 114 """an object representing ongoing discovery
115 115
116 116 Fed with data from the remote repository, this object keeps track of the
117 117 current set of changesets in various states:
118 118
119 119 - common: revs also known remotely
120 120 - undecided: revs we don't have information on yet
121 121 - missing: revs missing remotely
122 122 (all tracked revisions are known locally)
123 123 """
124 124
125 125 def __init__(self, repo, targetheads, respectsize, randomize=True):
126 126 self._repo = repo
127 127 self._targetheads = targetheads
128 128 self._common = repo.changelog.incrementalmissingrevs()
129 129 self._undecided = None
130 130 self.missing = set()
131 131 self._childrenmap = None
132 132 self._respectsize = respectsize
133 133 self.randomize = randomize
134 134
135 135 def addcommons(self, commons):
136 136 """register nodes known as common"""
137 137 self._common.addbases(commons)
138 138 if self._undecided is not None:
139 139 self._common.removeancestorsfrom(self._undecided)
140 140
141 141 def addmissings(self, missings):
142 142 """register some nodes as missing"""
143 143 newmissing = self._repo.revs(b'%ld::%ld', missings, self.undecided)
144 144 if newmissing:
145 145 self.missing.update(newmissing)
146 146 self.undecided.difference_update(newmissing)
147 147
148 148 def addinfo(self, sample):
149 149 """consume an iterable of (rev, known) tuples"""
150 150 common = set()
151 151 missing = set()
152 152 for rev, known in sample:
153 153 if known:
154 154 common.add(rev)
155 155 else:
156 156 missing.add(rev)
157 157 if common:
158 158 self.addcommons(common)
159 159 if missing:
160 160 self.addmissings(missing)
161 161
162 162 def hasinfo(self):
163 163 """return True is we have any clue about the remote state"""
164 164 return self._common.hasbases()
165 165
166 166 def iscomplete(self):
167 167 """True if all the necessary data have been gathered"""
168 168 return self._undecided is not None and not self._undecided
169 169
170 170 @property
171 171 def undecided(self):
172 172 if self._undecided is not None:
173 173 return self._undecided
174 174 self._undecided = set(self._common.missingancestors(self._targetheads))
175 175 return self._undecided
176 176
177 177 def stats(self):
178 178 return {
179 b'undecided': len(self.undecided),
179 'undecided': len(self.undecided),
180 180 }
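The one-line change above is the point of this commit: on Python 3, bytes and str never compare equal, so a dict keyed with b'undecided' cannot be read back with a plain string key. A minimal demonstration:

stats = {b'undecided': 3}
print(stats.get('undecided'))  # None on Python 3: b'undecided' != 'undecided'
stats = {'undecided': 3}
print(stats['undecided'])      # 3, hence the switch to a plain str key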
181 181
182 182 def commonheads(self):
183 183 """the heads of the known common set"""
184 184 # heads(common) == heads(common.bases) since common represents
185 185 # common.bases and all its ancestors
186 186 return self._common.basesheads()
187 187
188 188 def _parentsgetter(self):
189 189 getrev = self._repo.changelog.index.__getitem__
190 190
191 191 def getparents(r):
192 192 return getrev(r)[5:7]
193 193
194 194 return getparents
195 195
196 196 def _childrengetter(self):
197 197
198 198 if self._childrenmap is not None:
199 199 # During discovery, the `undecided` set keeps shrinking.
200 200 # Therefore, the map computed for an iteration N will be
201 201 # valid for iteration N+1. Instead of computing the same
202 202 # data over and over, we cache it the first time.
203 203 return self._childrenmap.__getitem__
204 204
205 205 # _updatesample() essentially iterates over revisions to look
206 206 # up their children. This lookup is expensive and doing it in a loop is
207 207 # quadratic. We precompute the children for all relevant revisions and
208 208 # make the lookup in _updatesample() a simple dict lookup.
209 209 self._childrenmap = children = {}
210 210
211 211 parentrevs = self._parentsgetter()
212 212 revs = self.undecided
213 213
214 214 for rev in sorted(revs):
215 215 # Always ensure revision has an entry so we don't need to worry
216 216 # about missing keys.
217 217 children[rev] = []
218 218 for prev in parentrevs(rev):
219 219 if prev == nullrev:
220 220 continue
221 221 c = children.get(prev)
222 222 if c is not None:
223 223 c.append(rev)
224 224 return children.__getitem__
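The same inversion in isolation: one pass over a toy parent table (made up for illustration) turns an expensive per-revision child lookup into a plain dict access, which is what _childrengetter() does for the undecided set:

nullrev = -1
parents = {0: [nullrev, nullrev], 1: [0, nullrev], 2: [0, nullrev], 3: [1, 2]}
children = {rev: [] for rev in parents}
for rev in sorted(parents):
    for prev in parents[rev]:
        if prev != nullrev:
            children[prev].append(rev)
print(children)  # {0: [1, 2], 1: [3], 2: [3], 3: []}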
225 225
226 226 def takequicksample(self, headrevs, size):
227 227 """takes a quick sample of size <size>
228 228
229 229 It is meant for initial sampling and focuses on querying heads and close
230 230 ancestors of heads.
231 231
232 232 :headrevs: set of head revisions in local DAG to consider
233 233 :size: the maximum size of the sample"""
234 234 revs = self.undecided
235 235 if len(revs) <= size:
236 236 return list(revs)
237 237 sample = set(self._repo.revs(b'heads(%ld)', revs))
238 238
239 239 if len(sample) >= size:
240 240 return _limitsample(sample, size, randomize=self.randomize)
241 241
242 242 _updatesample(
243 243 None, headrevs, sample, self._parentsgetter(), quicksamplesize=size
244 244 )
245 245 return sample
246 246
247 247 def takefullsample(self, headrevs, size):
248 248 revs = self.undecided
249 249 if len(revs) <= size:
250 250 return list(revs)
251 251 repo = self._repo
252 252 sample = set(repo.revs(b'heads(%ld)', revs))
253 253 parentrevs = self._parentsgetter()
254 254
255 255 # update from heads
256 256 revsheads = sample.copy()
257 257 _updatesample(revs, revsheads, sample, parentrevs)
258 258
259 259 # update from roots
260 260 revsroots = set(repo.revs(b'roots(%ld)', revs))
261 261 childrenrevs = self._childrengetter()
262 262 _updatesample(revs, revsroots, sample, childrenrevs)
263 263 assert sample
264 264
265 265 if not self._respectsize:
266 266 size = max(size, min(len(revsroots), len(revsheads)))
267 267
268 268 sample = _limitsample(sample, size, randomize=self.randomize)
269 269 if len(sample) < size:
270 270 more = size - len(sample)
271 271 takefrom = list(revs - sample)
272 272 if self.randomize:
273 273 sample.update(random.sample(takefrom, more))
274 274 else:
275 275 takefrom.sort()
276 276 sample.update(takefrom[:more])
277 277 return sample
278 278
279 279
280 280 partialdiscovery = policy.importrust(
281 281 r'discovery', member=r'PartialDiscovery', default=partialdiscovery
282 282 )
283 283
284 284
285 285 def findcommonheads(
286 286 ui,
287 287 local,
288 288 remote,
289 289 initialsamplesize=100,
290 290 fullsamplesize=200,
291 291 abortwhenunrelated=True,
292 292 ancestorsof=None,
293 293 samplegrowth=1.05,
294 294 ):
295 295 '''Return a tuple (common, anyincoming, remoteheads) used to identify
296 296 missing nodes from or in remote.
297 297 '''
298 298 start = util.timer()
299 299
300 300 roundtrips = 0
301 301 cl = local.changelog
302 302 clnode = cl.node
303 303 clrev = cl.rev
304 304
305 305 if ancestorsof is not None:
306 306 ownheads = [clrev(n) for n in ancestorsof]
307 307 else:
308 308 ownheads = [rev for rev in cl.headrevs() if rev != nullrev]
309 309
310 310 # early exit if we know all the specified remote heads already
311 311 ui.debug(b"query 1; heads\n")
312 312 roundtrips += 1
313 313 # We also ask remote about all the local heads. That set can be arbitrarily
314 314 # large, so we used to limit its size to `initialsamplesize`. We no longer
315 315 # do so, as it proved counterproductive. The skipped heads could lead to a
316 316 # large "undecided" set, slower to clarify than if we asked the
317 317 # question for all heads right away.
318 318 #
319 319 # We are already fetching all server heads using the `heads` command;
320 320 # sending an equivalent number of heads the other way should not have a
321 321 # significant impact. In addition, it is very likely that we are going to
322 322 # have to issue "known" requests for an equivalent number of revisions in
323 323 # order to decide if these heads are common or missing.
324 324 #
325 325 # A detailed analysis follows.
326 326 #
327 327 # Case A: local and server both have few heads
328 328 #
329 329 # Ownheads is below initialsamplesize; the limit would not have any effect.
330 330 #
331 331 # Case B: local has few heads and server has many
332 332 #
333 333 # Ownheads is below initialsamplesize; the limit would not have any effect.
334 334 #
335 335 # Case C: local and server both have many heads
336 336 #
337 337 # We now transfer some more data, but not significantly more than is
338 338 # already transferred to carry the server heads.
339 339 #
340 340 # Case D: local has many heads, server has few
341 341 #
342 342 # D.1 local heads are mostly known remotely
343 343 #
344 344 # All the known heads will have been part of a `known` request at some
345 345 # point for the discovery to finish. Sending them all earlier actually
346 346 # helps.
347 347 #
348 348 # (This case is fairly unlikely; it requires the numerous heads to all
349 349 # be merged server side into only a few heads.)
350 350 #
351 351 # D.2 local heads are mostly missing remotely
352 352 #
353 353 # To determine that the heads are missing, we'll have to issue `known`
354 354 # requests for them or one of their ancestors. The number of `known`
355 355 # requests will likely be of the same order of magnitude as the number
356 356 # of local heads.
357 357 #
358 358 # The only case where we can be more efficient using `known` requests on
359 359 # ancestors is when all the "missing" local heads are based on a few
360 360 # changesets, also "missing". This means we would have a "complex" graph
361 361 # (with many heads) attached to, but largely independent of, the
362 362 # "simple" graph on the server. This is a fairly unusual case and has
363 363 # not been met in the wild so far.
364 364 if remote.limitedarguments:
365 365 sample = _limitsample(ownheads, initialsamplesize)
366 366 # indices between sample and externalized version must match
367 367 sample = list(sample)
368 368 else:
369 369 sample = ownheads
370 370
371 371 with remote.commandexecutor() as e:
372 372 fheads = e.callcommand(b'heads', {})
373 373 fknown = e.callcommand(
374 374 b'known', {b'nodes': [clnode(r) for r in sample],}
375 375 )
376 376
377 377 srvheadhashes, yesno = fheads.result(), fknown.result()
378 378
379 379 if cl.tip() == nullid:
380 380 if srvheadhashes != [nullid]:
381 381 return [nullid], True, srvheadhashes
382 382 return [nullid], False, []
383 383
384 384 # start actual discovery (we note this before the next "if" for
385 385 # compatibility reasons)
386 386 ui.status(_(b"searching for changes\n"))
387 387
388 388 knownsrvheads = [] # revnos of remote heads that are known locally
389 389 for node in srvheadhashes:
390 390 if node == nullid:
391 391 continue
392 392
393 393 try:
394 394 knownsrvheads.append(clrev(node))
395 395 # Catches unknown and filtered nodes.
396 396 except error.LookupError:
397 397 continue
398 398
399 399 if len(knownsrvheads) == len(srvheadhashes):
400 400 ui.debug(b"all remote heads known locally\n")
401 401 return srvheadhashes, False, srvheadhashes
402 402
403 403 if len(sample) == len(ownheads) and all(yesno):
404 404 ui.note(_(b"all local changesets known remotely\n"))
405 405 ownheadhashes = [clnode(r) for r in ownheads]
406 406 return ownheadhashes, True, srvheadhashes
407 407
408 408 # full blown discovery
409 409
410 410 randomize = ui.configbool(b'devel', b'discovery.randomize')
411 411 disco = partialdiscovery(
412 412 local, ownheads, remote.limitedarguments, randomize=randomize
413 413 )
414 414 # treat remote heads (and maybe own heads) as a first implicit sample
415 415 # response
416 416 disco.addcommons(knownsrvheads)
417 417 disco.addinfo(zip(sample, yesno))
418 418
419 419 full = False
420 420 progress = ui.makeprogress(_(b'searching'), unit=_(b'queries'))
421 421 while not disco.iscomplete():
422 422
423 423 if full or disco.hasinfo():
424 424 if full:
425 425 ui.note(_(b"sampling from both directions\n"))
426 426 else:
427 427 ui.debug(b"taking initial sample\n")
428 428 samplefunc = disco.takefullsample
429 429 targetsize = fullsamplesize
430 430 if not remote.limitedarguments:
431 431 fullsamplesize = int(fullsamplesize * samplegrowth)
432 432 else:
433 433 # use an even cheaper initial sample
434 434 ui.debug(b"taking quick initial sample\n")
435 435 samplefunc = disco.takequicksample
436 436 targetsize = initialsamplesize
437 437 sample = samplefunc(ownheads, targetsize)
438 438
439 439 roundtrips += 1
440 440 progress.update(roundtrips)
441 441 stats = disco.stats()
442 442 ui.debug(
443 443 b"query %i; still undecided: %i, sample size is: %i\n"
444 % (roundtrips, stats[b'undecided'], len(sample))
444 % (roundtrips, stats['undecided'], len(sample))
445 445 )
446 446
447 447 # indices between sample and externalized version must match
448 448 sample = list(sample)
449 449
450 450 with remote.commandexecutor() as e:
451 451 yesno = e.callcommand(
452 452 b'known', {b'nodes': [clnode(r) for r in sample],}
453 453 ).result()
454 454
455 455 full = True
456 456
457 457 disco.addinfo(zip(sample, yesno))
458 458
459 459 result = disco.commonheads()
460 460 elapsed = util.timer() - start
461 461 progress.complete()
462 462 ui.debug(b"%d total queries in %.4fs\n" % (roundtrips, elapsed))
463 463 msg = (
464 464 b'found %d common and %d unknown server heads,'
465 465 b' %d roundtrips in %.4fs\n'
466 466 )
467 467 missing = set(result) - set(knownsrvheads)
468 468 ui.log(b'discovery', msg, len(result), len(missing), roundtrips, elapsed)
469 469
470 470 if not result and srvheadhashes != [nullid]:
471 471 if abortwhenunrelated:
472 472 raise error.Abort(_(b"repository is unrelated"))
473 473 else:
474 474 ui.warn(_(b"warning: repository is unrelated\n"))
475 475 return (
476 476 {nullid},
477 477 True,
478 478 srvheadhashes,
479 479 )
480 480
481 481 anyincoming = srvheadhashes != [nullid]
482 482 result = {clnode(r) for r in result}
483 483 return result, anyincoming, srvheadhashes
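For context, a hedged sketch of how a caller might drive findcommonheads; repo and peer stand in for a local repository and a remote peer object and are not defined in this file:

common, anyincoming, remoteheads = findcommonheads(
    repo.ui, repo, peer, abortwhenunrelated=False
)
if not anyincoming:
    repo.ui.status(b'no changes found\n')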