##// END OF EJS Templates
setdiscovery: document the '_updatesample' function...
Pierre-Yves David -
r23809:9ca2eb88 default
parent child Browse files
Show More
@@ -1,243 +1,257 b''
1 1 # setdiscovery.py - improved discovery of common nodeset for mercurial
2 2 #
3 3 # Copyright 2010 Benoit Boissinot <bboissin@gmail.com>
4 4 # and Peter Arrenbrecht <peter@arrenbrecht.ch>
5 5 #
6 6 # This software may be used and distributed according to the terms of the
7 7 # GNU General Public License version 2 or any later version.
8 8 """
9 9 Algorithm works in the following way. You have two repository: local and
10 10 remote. They both contains a DAG of changelists.
11 11
12 12 The goal of the discovery protocol is to find one set of node *common*,
13 13 the set of nodes shared by local and remote.
14 14
15 15 One of the issue with the original protocol was latency, it could
16 16 potentially require lots of roundtrips to discover that the local repo was a
17 17 subset of remote (which is a very common case, you usually have few changes
18 18 compared to upstream, while upstream probably had lots of development).
19 19
20 20 The new protocol only requires one interface for the remote repo: `known()`,
21 21 which given a set of changelists tells you if they are present in the DAG.
22 22
23 23 The algorithm then works as follow:
24 24
25 25 - We will be using three sets, `common`, `missing`, `unknown`. Originally
26 26 all nodes are in `unknown`.
27 27 - Take a sample from `unknown`, call `remote.known(sample)`
28 28 - For each node that remote knows, move it and all its ancestors to `common`
29 29 - For each node that remote doesn't know, move it and all its descendants
30 30 to `missing`
31 31 - Iterate until `unknown` is empty
32 32
33 33 There are a couple optimizations, first is instead of starting with a random
34 34 sample of missing, start by sending all heads, in the case where the local
35 35 repo is a subset, you computed the answer in one round trip.
36 36
37 37 Then you can do something similar to the bisecting strategy used when
38 38 finding faulty changesets. Instead of random samples, you can try picking
39 39 nodes that will maximize the number of nodes that will be
40 40 classified with it (since all ancestors or descendants will be marked as well).
41 41 """
42 42
43 43 from node import nullid, nullrev
44 44 from i18n import _
45 45 import random
46 46 import util, dagutil
47 47
def _updatesample(dag, nodes, sample, always, quicksamplesize=0):
    """update an existing sample to match the expected size

    The sample is updated with nodes exponentially distant from each head of the
    <nodes> set. (H~1, H~2, H~4, H~8, etc).

    If a target size is specified, the sampling will stop once this size is
    reached. Otherwise sampling will happen until roots of the <nodes> set are
    reached.

    :dag: a dag object from dagutil
    :nodes: set of nodes we want to discover (if None, assume the whole dag)
    :sample: a sample to update
    :always: set of notable nodes that will be part of the sample anyway
    :quicksamplesize: optional target size of the sample"""
    # if nodes is empty we scan the entire graph
    if nodes:
        heads = dag.headsetofconnecteds(nodes)
    else:
        heads = dag.heads()
    # distance (in edges) from the nearest head; the first value recorded
    # for a node wins (see the setdefault calls below)
    dist = {}
    # breadth-first walk starting from all heads at once
    visit = util.deque(heads)
    seen = set()
    # next power-of-two distance at which nodes are added to the sample
    factor = 1
    while visit:
        curr = visit.popleft()
        if curr in seen:
            continue
        d = dist.setdefault(curr, 1)
        # BFS visits nodes in nondecreasing distance, so a single doubling
        # is enough to catch up with the current depth
        if d > factor:
            factor *= 2
        if d == factor:
            if curr not in always: # need this check for the early exit below
                sample.add(curr)
                if quicksamplesize and (len(sample) >= quicksamplesize):
                    return
        seen.add(curr)
        for p in dag.parents(curr):
            # stay inside the undecided set when one was given
            if not nodes or p in nodes:
                dist.setdefault(p, d + 1)
                visit.append(p)
75 89
76 90 def _setupsample(dag, nodes, size):
77 91 always = dag.headsetofconnecteds(nodes)
78 92 desiredlen = size - len(always)
79 93 if desiredlen <= 0:
80 94 # This could be bad if there are very many heads, all unknown to the
81 95 # server. We're counting on long request support here.
82 96 return always, None, desiredlen
83 97 return always, set(), desiredlen
84 98
def _takequicksample(dag, nodes, size):
    """Return a cheap sample of at most <size> nodes from <nodes>.

    The heads of <nodes> are always included; any remaining room is filled
    with a single exponential walk over the whole graph.
    """
    mandatory, extra, room = _setupsample(dag, nodes, size)
    if extra is None:
        # heads alone already fill (or overflow) the requested size
        return mandatory
    _updatesample(dag, None, extra, mandatory, quicksamplesize=room)
    extra.update(mandatory)
    return extra
92 106
def _takefullsample(dag, nodes, size):
    """Return a sample of at most <size> nodes spread across <nodes>.

    The heads of <nodes> are always included. The rest of the sample is
    picked at exponential distances from both the heads and the roots, then
    trimmed (or randomly topped up) to the requested size.
    """
    mandatory, picked, room = _setupsample(dag, nodes, size)
    if picked is None:
        # heads alone already fill (or overflow) the requested size
        return mandatory
    # sample nodes at exponential distance from the heads...
    _updatesample(dag, nodes, picked, mandatory)
    # ...and from the roots, by walking the inverted dag
    _updatesample(dag.inverse(), nodes, picked, mandatory)
    assert picked
    picked = _limitsample(picked, room)
    shortfall = room - len(picked)
    if shortfall > 0:
        # top up with random nodes not selected yet
        candidates = list(nodes - picked - mandatory)
        picked.update(random.sample(candidates, shortfall))
    picked.update(mandatory)
    return picked
108 122
109 123 def _limitsample(sample, desiredlen):
110 124 """return a random subset of sample of at most desiredlen item"""
111 125 if len(sample) > desiredlen:
112 126 sample = set(random.sample(sample, desiredlen))
113 127 return sample
114 128
def findcommonheads(ui, local, remote,
                    initialsamplesize=100,
                    fullsamplesize=200,
                    abortwhenunrelated=True):
    '''Return a tuple (common, anyincoming, remoteheads) used to identify
    missing nodes from or in remote.

    :ui: user interface object, used for status/debug/progress output
    :local: the local repository
    :remote: peer object for the remote repository (queried through
             heads() and known(), batched together when supported)
    :initialsamplesize: size of the cheap first sample (local heads)
    :fullsamplesize: target size of the samples used in later rounds
    :abortwhenunrelated: raise util.Abort instead of only warning when the
                         repositories share no common node
    '''
    roundtrips = 0
    cl = local.changelog
    dag = dagutil.revlogdag(cl)

    # early exit if we know all the specified remote heads already
    ui.debug("query 1; heads\n")
    roundtrips += 1
    ownheads = dag.heads()
    sample = _limitsample(ownheads, initialsamplesize)
    # indices between sample and externalized version must match
    sample = list(sample)
    if remote.local():
        # stopgap until we have a proper localpeer that supports batch()
        srvheadhashes = remote.heads()
        yesno = remote.known(dag.externalizeall(sample))
    elif remote.capable('batch'):
        # fold the heads() and known() queries into a single roundtrip
        batch = remote.batch()
        srvheadhashesref = batch.heads()
        yesnoref = batch.known(dag.externalizeall(sample))
        batch.submit()
        srvheadhashes = srvheadhashesref.value
        yesno = yesnoref.value
    else:
        # compatibility with pre-batch, but post-known remotes during 1.9
        # development
        srvheadhashes = remote.heads()
        sample = []

    # empty local repo: everything remote has is incoming (if anything)
    if cl.tip() == nullid:
        if srvheadhashes != [nullid]:
            return [nullid], True, srvheadhashes
        return [nullid], False, []

    # start actual discovery (we note this before the next "if" for
    # compatibility reasons)
    ui.status(_("searching for changes\n"))

    # every remote head we already know locally means remote is a subset
    srvheads = dag.internalizeall(srvheadhashes, filterunknown=True)
    if len(srvheads) == len(srvheadhashes):
        ui.debug("all remote heads known locally\n")
        return (srvheadhashes, False, srvheadhashes,)

    # symmetric shortcut: remote knows every one of our heads
    if sample and len(ownheads) <= initialsamplesize and util.all(yesno):
        ui.note(_("all local heads known remotely\n"))
        ownheadhashes = dag.externalizeall(ownheads)
        return (ownheadhashes, True, srvheadhashes,)

    # full blown discovery

    # own nodes I know we both know
    # treat remote heads (and maybe own heads) as a first implicit sample
    # response
    common = cl.incrementalmissingrevs(srvheads)
    commoninsample = set(n for i, n in enumerate(sample) if yesno[i])
    common.addbases(commoninsample)
    # own nodes where I don't know if remote knows them
    undecided = set(common.missingancestors(ownheads))
    # own nodes I know remote lacks
    missing = set()

    full = False
    while undecided:

        if sample:
            # nodes remote denied: they and all their descendants are missing
            missinginsample = [n for i, n in enumerate(sample) if not yesno[i]]
            missing.update(dag.descendantset(missinginsample, missing))

            undecided.difference_update(missing)

        if not undecided:
            break

        # the full (two-direction) sampling only pays off once we have some
        # common bases to anchor it; until then use the quick variant
        if full or common.hasbases():
            if full:
                ui.note(_("sampling from both directions\n"))
            else:
                ui.debug("taking initial sample\n")
            samplefunc = _takefullsample
            targetsize = fullsamplesize
        else:
            # use even cheaper initial sample
            ui.debug("taking quick initial sample\n")
            samplefunc = _takequicksample
            targetsize = initialsamplesize
        if len(undecided) < targetsize:
            # few enough undecided nodes left to just ask about all of them
            sample = list(undecided)
        else:
            sample = samplefunc(dag, undecided, targetsize)
            sample = _limitsample(sample, targetsize)

        roundtrips += 1
        ui.progress(_('searching'), roundtrips, unit=_('queries'))
        ui.debug("query %i; still undecided: %i, sample size is: %i\n"
                 % (roundtrips, len(undecided), len(sample)))
        # indices between sample and externalized version must match
        sample = list(sample)
        yesno = remote.known(dag.externalizeall(sample))
        full = True

        if sample:
            # nodes remote confirmed: they and their ancestors are common
            commoninsample = set(n for i, n in enumerate(sample) if yesno[i])
            common.addbases(commoninsample)
            common.removeancestorsfrom(undecided)

    # heads(common) == heads(common.bases) since common represents common.bases
    # and all its ancestors
    result = dag.headsetofconnecteds(common.bases)
    # common.bases can include nullrev, but our contract requires us to not
    # return any heads in that case, so discard that
    result.discard(nullrev)
    ui.progress(_('searching'), None)
    ui.debug("%d total queries\n" % roundtrips)

    # no common head found at all: the repositories are unrelated
    if not result and srvheadhashes != [nullid]:
        if abortwhenunrelated:
            raise util.Abort(_("repository is unrelated"))
        else:
            ui.warn(_("warning: repository is unrelated\n"))
            return (set([nullid]), True, srvheadhashes,)

    anyincoming = (srvheadhashes != [nullid])
    return dag.externalizeall(result), anyincoming, srvheadhashes
General Comments 0
You need to be logged in to leave comments. Login now