##// END OF EJS Templates
setdiscovery: precompute children revisions to avoid quadratic lookup...
Gregory Szorc -
r39214:274acf37 default
parent child Browse files
Show More
@@ -1,293 +1,313 b''
1 1 # setdiscovery.py - improved discovery of common nodeset for mercurial
2 2 #
3 3 # Copyright 2010 Benoit Boissinot <bboissin@gmail.com>
4 4 # and Peter Arrenbrecht <peter@arrenbrecht.ch>
5 5 #
6 6 # This software may be used and distributed according to the terms of the
7 7 # GNU General Public License version 2 or any later version.
8 8 """
Algorithm works in the following way. You have two repositories: local and
remote. They both contain a DAG of changelists.
11 11
The goal of the discovery protocol is to find one set of nodes *common*,
the set of nodes shared by local and remote.
14 14
One of the issues with the original protocol was latency, it could
potentially require lots of roundtrips to discover that the local repo was a
subset of remote (which is a very common case, you usually have few changes
18 18 compared to upstream, while upstream probably had lots of development).
19 19
20 20 The new protocol only requires one interface for the remote repo: `known()`,
21 21 which given a set of changelists tells you if they are present in the DAG.
22 22
The algorithm then works as follows:
24 24
25 25 - We will be using three sets, `common`, `missing`, `unknown`. Originally
26 26 all nodes are in `unknown`.
27 27 - Take a sample from `unknown`, call `remote.known(sample)`
28 28 - For each node that remote knows, move it and all its ancestors to `common`
29 29 - For each node that remote doesn't know, move it and all its descendants
30 30 to `missing`
31 31 - Iterate until `unknown` is empty
32 32
33 33 There are a couple optimizations, first is instead of starting with a random
34 34 sample of missing, start by sending all heads, in the case where the local
35 35 repo is a subset, you computed the answer in one round trip.
36 36
37 37 Then you can do something similar to the bisecting strategy used when
38 38 finding faulty changesets. Instead of random samples, you can try picking
39 39 nodes that will maximize the number of nodes that will be
40 40 classified with it (since all ancestors or descendants will be marked as well).
41 41 """
42 42
43 43 from __future__ import absolute_import
44 44
45 45 import collections
46 46 import random
47 47
48 48 from .i18n import _
49 49 from .node import (
50 50 nullid,
51 51 nullrev,
52 52 )
53 53 from . import (
54 54 error,
55 55 util,
56 56 )
57 57
58 58 def _updatesample(revs, heads, sample, parentfn, quicksamplesize=0):
59 59 """update an existing sample to match the expected size
60 60
61 61 The sample is updated with revs exponentially distant from each head of the
62 62 <revs> set. (H~1, H~2, H~4, H~8, etc).
63 63
64 64 If a target size is specified, the sampling will stop once this size is
65 65 reached. Otherwise sampling will happen until roots of the <revs> set are
66 66 reached.
67 67
68 68 :revs: set of revs we want to discover (if None, assume the whole dag)
69 69 :heads: set of DAG head revs
70 70 :sample: a sample to update
71 71 :parentfn: a callable to resolve parents for a revision
72 72 :quicksamplesize: optional target size of the sample"""
73 73 dist = {}
74 74 visit = collections.deque(heads)
75 75 seen = set()
76 76 factor = 1
77 77 while visit:
78 78 curr = visit.popleft()
79 79 if curr in seen:
80 80 continue
81 81 d = dist.setdefault(curr, 1)
82 82 if d > factor:
83 83 factor *= 2
84 84 if d == factor:
85 85 sample.add(curr)
86 86 if quicksamplesize and (len(sample) >= quicksamplesize):
87 87 return
88 88 seen.add(curr)
89 89
90 90 for p in parentfn(curr):
91 91 if p != nullrev and (not revs or p in revs):
92 92 dist.setdefault(p, d + 1)
93 93 visit.append(p)
94 94
def _takequicksample(repo, headrevs, revs, size):
    """takes a quick sample of size <size>

    It is meant for initial sampling and focuses on querying heads and close
    ancestors of heads.

    :repo: the local repository
    :headrevs: set of head revisions in local DAG to consider
    :revs: set of revs to discover
    :size: the maximum size of the sample"""
    # The heads of the undecided set are always the most informative revs,
    # so they seed the sample.
    sample = set(repo.revs('heads(%ld)', revs))

    # Enough heads already? Trim and return without walking the DAG.
    if size <= len(sample):
        return _limitsample(sample, size)

    # Otherwise top the sample up with close ancestors of the heads.
    _updatesample(None, headrevs, sample, repo.changelog.parentrevs,
                  quicksamplesize=size)
    return sample
113 113
def _takefullsample(repo, headrevs, revs, size):
    """Return a sample of up to <size> revisions drawn from both directions.

    Combines the heads of <revs>, exponentially spaced ancestors of those
    heads, exponentially spaced descendants of the roots of <revs>, and —
    if the quota is still not met — random revisions from <revs>.

    :repo: the local repository
    :headrevs: set of head revisions in local DAG to consider
    :revs: set of revs to discover
    :size: the maximum size of the sample"""
    cl = repo.changelog
    sample = set(repo.revs('heads(%ld)', revs))

    # sample exponentially spaced ancestors of the heads
    revsheads = set(repo.revs('heads(%ld)', revs))
    _updatesample(revs, revsheads, sample, cl.parentrevs)

    # sample exponentially spaced descendants of the roots
    revsroots = set(repo.revs('roots(%ld)', revs))

    # _updatesample() needs a child-lookup callable for the roots-to-heads
    # direction. Looking up children on the changelog one revision at a time
    # is expensive and quadratic overall, so precompute the child map in a
    # single pass and hand _updatesample() a plain dict lookup.
    #
    # Because this function can be called multiple times during discovery, we
    # may still perform redundant work and there is room to optimize this by
    # keeping a persistent cache of children across invocations.
    children = collections.defaultdict(list)

    parentrevs = cl.parentrevs
    for rev in cl.revs(start=min(revsroots)):
        for prev in parentrevs(rev):
            if prev != nullrev:
                children[prev].append(rev)

    _updatesample(revs, revsroots, sample, children.__getitem__)
    assert sample
    sample = _limitsample(sample, size)
    if len(sample) < size:
        more = size - len(sample)
        sample.update(random.sample(list(revs - sample), more))
    return sample
133 153
134 154 def _limitsample(sample, desiredlen):
135 155 """return a random subset of sample of at most desiredlen item"""
136 156 if len(sample) > desiredlen:
137 157 sample = set(random.sample(sample, desiredlen))
138 158 return sample
139 159
def findcommonheads(ui, local, remote,
                    initialsamplesize=100,
                    fullsamplesize=200,
                    abortwhenunrelated=True,
                    ancestorsof=None):
    '''Return a tuple (common, anyincoming, remoteheads) used to identify
    missing nodes from or in remote.

    :ui: the ui object, used for progress, debug and status output
    :local: the local repository
    :remote: a remote peer providing ``commandexecutor()`` with the
        ``heads`` and ``known`` wire commands
    :initialsamplesize: size of the cheap first sample (heads only)
    :fullsamplesize: size of samples in subsequent rounds
    :abortwhenunrelated: raise error.Abort when no common revs are found;
        otherwise only warn
    :ancestorsof: optional node list restricting discovery to the
        ancestors of these nodes (defaults to all local heads)

    Raises error.Abort when the repositories are unrelated and
    ``abortwhenunrelated`` is set.
    '''
    start = util.timer()

    roundtrips = 0
    cl = local.changelog
    clnode = cl.node
    clrev = cl.rev

    if ancestorsof is not None:
        ownheads = [clrev(n) for n in ancestorsof]
    else:
        # nullrev can show up in headrevs() for an empty repo; drop it.
        ownheads = [rev for rev in cl.headrevs() if rev != nullrev]

    # early exit if we know all the specified remote heads already
    ui.debug("query 1; heads\n")
    roundtrips += 1
    sample = _limitsample(ownheads, initialsamplesize)
    # indices between sample and externalized version must match
    sample = list(sample)

    # Issue both commands in one executor batch so they can share a
    # round trip where the transport supports it.
    with remote.commandexecutor() as e:
        fheads = e.callcommand('heads', {})
        fknown = e.callcommand('known', {
            'nodes': [clnode(r) for r in sample],
        })

    # yesno[i] answers whether remote knows sample[i] (parallel lists).
    srvheadhashes, yesno = fheads.result(), fknown.result()

    # Local repo is empty: everything remote has (if anything) is incoming.
    if cl.tip() == nullid:
        if srvheadhashes != [nullid]:
            return [nullid], True, srvheadhashes
        return [nullid], False, []

    # start actual discovery (we note this before the next "if" for
    # compatibility reasons)
    ui.status(_("searching for changes\n"))

    # Translate the remote head hashes into local revision numbers,
    # skipping heads we do not have locally.
    srvheads = []
    for node in srvheadhashes:
        if node == nullid:
            continue

        try:
            srvheads.append(clrev(node))
        # Catches unknown and filtered nodes.
        except error.LookupError:
            continue

    if len(srvheads) == len(srvheadhashes):
        ui.debug("all remote heads known locally\n")
        return srvheadhashes, False, srvheadhashes

    # Every one of our heads was in the sample and remote knew them all:
    # the local repo is a subset of remote.
    if len(sample) == len(ownheads) and all(yesno):
        ui.note(_("all local heads known remotely\n"))
        ownheadhashes = [clnode(r) for r in ownheads]
        return ownheadhashes, True, srvheadhashes

    # full blown discovery

    # own nodes I know we both know
    # treat remote heads (and maybe own heads) as a first implicit sample
    # response
    common = cl.incrementalmissingrevs(srvheads)
    commoninsample = set(n for i, n in enumerate(sample) if yesno[i])
    common.addbases(commoninsample)
    # own nodes where I don't know if remote knows them
    undecided = set(common.missingancestors(ownheads))
    # own nodes I know remote lacks
    missing = set()

    full = False
    progress = ui.makeprogress(_('searching'), unit=_('queries'))
    while undecided:

        # Classify everything we learned from the previous round's
        # sample before taking a new one.
        if sample:
            missinginsample = [n for i, n in enumerate(sample) if not yesno[i]]

            if missing:
                # Only walk descendants not already known to be missing.
                missing.update(local.revs('descendants(%ld) - descendants(%ld)',
                                          missinginsample, missing))
            else:
                missing.update(local.revs('descendants(%ld)', missinginsample))

            undecided.difference_update(missing)

        if not undecided:
            break

        # The first round (no known common bases yet) uses the cheap
        # head-focused sampler; later rounds sample from both directions.
        if full or common.hasbases():
            if full:
                ui.note(_("sampling from both directions\n"))
            else:
                ui.debug("taking initial sample\n")
            samplefunc = _takefullsample
            targetsize = fullsamplesize
        else:
            # use even cheaper initial sample
            ui.debug("taking quick initial sample\n")
            samplefunc = _takequicksample
            targetsize = initialsamplesize
        if len(undecided) < targetsize:
            # Fewer undecided revs than the target: just ask about all.
            sample = list(undecided)
        else:
            sample = samplefunc(local, ownheads, undecided, targetsize)

        roundtrips += 1
        progress.update(roundtrips)
        ui.debug("query %i; still undecided: %i, sample size is: %i\n"
                 % (roundtrips, len(undecided), len(sample)))
        # indices between sample and externalized version must match
        sample = list(sample)

        with remote.commandexecutor() as e:
            yesno = e.callcommand('known', {
                'nodes': [clnode(r) for r in sample],
            }).result()

        full = True

        # Fold the positive answers into the common set and shrink the
        # undecided set accordingly.
        if sample:
            commoninsample = set(n for i, n in enumerate(sample) if yesno[i])
            common.addbases(commoninsample)
            common.removeancestorsfrom(undecided)

    # heads(common) == heads(common.bases) since common represents common.bases
    # and all its ancestors
    # The presence of nullrev will confuse heads(). So filter it out.
    result = set(local.revs('heads(%ld)', common.bases - {nullrev}))
    elapsed = util.timer() - start
    progress.complete()
    ui.debug("%d total queries in %.4fs\n" % (roundtrips, elapsed))
    msg = ('found %d common and %d unknown server heads,'
           ' %d roundtrips in %.4fs\n')
    missing = set(result) - set(srvheads)
    ui.log('discovery', msg, len(result), len(missing), roundtrips,
           elapsed)

    # No common revisions at all: the repositories are unrelated.
    if not result and srvheadhashes != [nullid]:
        if abortwhenunrelated:
            raise error.Abort(_("repository is unrelated"))
        else:
            ui.warn(_("warning: repository is unrelated\n"))
        return ({nullid}, True, srvheadhashes,)

    anyincoming = (srvheadhashes != [nullid])
    # Convert revision numbers back to node hashes for the caller.
    result = {clnode(r) for r in result}
    return result, anyincoming, srvheadhashes
General Comments 0
You need to be logged in to leave comments. Login now