setdiscovery: extract sample limitation in a `_limitsample` function...
Pierre-Yves David
r23083:ee45f5c2 stable
@@ -1,232 +1,237 @@
1 1 # setdiscovery.py - improved discovery of common nodeset for mercurial
2 2 #
3 3 # Copyright 2010 Benoit Boissinot <bboissin@gmail.com>
4 4 # and Peter Arrenbrecht <peter@arrenbrecht.ch>
5 5 #
6 6 # This software may be used and distributed according to the terms of the
7 7 # GNU General Public License version 2 or any later version.
8 8 """
9 9 The algorithm works in the following way. You have two repositories:
10 10 local and remote. They both contain a DAG of changelists.
11 11
12 12 The goal of the discovery protocol is to find the set of nodes *common*,
13 13 the set of nodes shared by local and remote.
14 14
15 15 One of the issues with the original protocol was latency; it could
16 16 potentially require lots of roundtrips to discover that the local repo was a
17 17 subset of remote (which is a very common case: you usually have few changes
18 18 compared to upstream, while upstream probably has lots of development).
19 19
20 20 The new protocol only requires one interface for the remote repo: `known()`,
22 22 which, given a set of changelists, tells you if they are present in the DAG.
22 22
23 23 The algorithm then works as follows:
24 24
25 25 - We will be using three sets, `common`, `missing`, `unknown`. Initially
26 26 all nodes are in `unknown`.
27 27 - Take a sample from `unknown`, call `remote.known(sample)`
28 28 - For each node that remote knows, move it and all its ancestors to `common`
29 29 - For each node that remote doesn't know, move it and all its descendants
30 30 to `missing`
31 31 - Iterate until `unknown` is empty
32 32
33 33 There are a couple of optimizations. First, instead of starting with a random
34 34 sample of missing, start by sending all heads; in the case where the local
35 35 repo is a subset, you compute the answer in one round trip.
36 36
37 37 Then you can do something similar to the bisecting strategy used when
38 38 finding faulty changesets. Instead of random samples, you can try picking
39 39 nodes that will maximize the number of nodes that will be
40 40 classified with them (since all ancestors or descendants will be marked as well).
41 41 """
42 42
43 43 from node import nullid
44 44 from i18n import _
45 45 import random
46 46 import util, dagutil
47 47
48 48 def _updatesample(dag, nodes, sample, always, quicksamplesize=0):
49 49 # if nodes is empty we scan the entire graph
50 50 if nodes:
51 51 heads = dag.headsetofconnecteds(nodes)
52 52 else:
53 53 heads = dag.heads()
54 54 dist = {}
55 55 visit = util.deque(heads)
56 56 seen = set()
57 57 factor = 1
58 58 while visit:
59 59 curr = visit.popleft()
60 60 if curr in seen:
61 61 continue
62 62 d = dist.setdefault(curr, 1)
63 63 if d > factor:
64 64 factor *= 2
65 65 if d == factor:
66 66 if curr not in always: # need this check for the early exit below
67 67 sample.add(curr)
68 68 if quicksamplesize and (len(sample) >= quicksamplesize):
69 69 return
70 70 seen.add(curr)
71 71 for p in dag.parents(curr):
72 72 if not nodes or p in nodes:
73 73 dist.setdefault(p, d + 1)
74 74 visit.append(p)
75 75
76 76 def _setupsample(dag, nodes, size):
77 77 if len(nodes) <= size:
78 78 return set(nodes), None, 0
79 79 always = dag.headsetofconnecteds(nodes)
80 80 desiredlen = size - len(always)
81 81 if desiredlen <= 0:
82 82 # This could be bad if there are very many heads, all unknown to the
83 83 # server. We're counting on long request support here.
84 84 return always, None, desiredlen
85 85 return always, set(), desiredlen
86 86
87 87 def _takequicksample(dag, nodes, size, initial):
88 88 always, sample, desiredlen = _setupsample(dag, nodes, size)
89 89 if sample is None:
90 90 return always
91 91 if initial:
92 92 fromset = None
93 93 else:
94 94 fromset = nodes
95 95 _updatesample(dag, fromset, sample, always, quicksamplesize=desiredlen)
96 96 sample.update(always)
97 97 return sample
98 98
99 99 def _takefullsample(dag, nodes, size):
100 100 always, sample, desiredlen = _setupsample(dag, nodes, size)
101 101 if sample is None:
102 102 return always
103 103 # update from heads
104 104 _updatesample(dag, nodes, sample, always)
105 105 # update from roots
106 106 _updatesample(dag.inverse(), nodes, sample, always)
107 107 assert sample
108 if len(sample) > desiredlen:
109 sample = set(random.sample(sample, desiredlen))
110 elif len(sample) < desiredlen:
108 sample = _limitsample(sample, desiredlen)
109 if len(sample) < desiredlen:
111 110 more = desiredlen - len(sample)
112 111 sample.update(random.sample(list(nodes - sample - always), more))
113 112 sample.update(always)
114 113 return sample
115 114
115 def _limitsample(sample, desiredlen):
116 """return a random subset of sample of at most desiredlen items"""
117 if len(sample) > desiredlen:
118 sample = set(random.sample(sample, desiredlen))
119 return sample
120
116 121 def findcommonheads(ui, local, remote,
117 122 initialsamplesize=100,
118 123 fullsamplesize=200,
119 124 abortwhenunrelated=True):
120 125 '''Return a tuple (common, anyincoming, remoteheads) used to identify
121 126 missing nodes from or in remote.
122 127 '''
123 128 roundtrips = 0
124 129 cl = local.changelog
125 130 dag = dagutil.revlogdag(cl)
126 131
127 132 # early exit if we know all the specified remote heads already
128 133 ui.debug("query 1; heads\n")
129 134 roundtrips += 1
130 135 ownheads = dag.heads()
131 136 sample = ownheads
132 137 if remote.local():
133 138 # stopgap until we have a proper localpeer that supports batch()
134 139 srvheadhashes = remote.heads()
135 140 yesno = remote.known(dag.externalizeall(sample))
136 141 elif remote.capable('batch'):
137 142 batch = remote.batch()
138 143 srvheadhashesref = batch.heads()
139 144 yesnoref = batch.known(dag.externalizeall(sample))
140 145 batch.submit()
141 146 srvheadhashes = srvheadhashesref.value
142 147 yesno = yesnoref.value
143 148 else:
144 149 # compatibility with pre-batch, but post-known remotes during 1.9
145 150 # development
146 151 srvheadhashes = remote.heads()
147 152 sample = []
148 153
149 154 if cl.tip() == nullid:
150 155 if srvheadhashes != [nullid]:
151 156 return [nullid], True, srvheadhashes
152 157 return [nullid], False, []
153 158
154 159 # start actual discovery (we note this before the next "if" for
155 160 # compatibility reasons)
156 161 ui.status(_("searching for changes\n"))
157 162
158 163 srvheads = dag.internalizeall(srvheadhashes, filterunknown=True)
159 164 if len(srvheads) == len(srvheadhashes):
160 165 ui.debug("all remote heads known locally\n")
161 166 return (srvheadhashes, False, srvheadhashes,)
162 167
163 168 if sample and util.all(yesno):
164 169 ui.note(_("all local heads known remotely\n"))
165 170 ownheadhashes = dag.externalizeall(ownheads)
166 171 return (ownheadhashes, True, srvheadhashes,)
167 172
168 173 # full blown discovery
169 174
170 175 # own nodes where I don't know if remote knows them
171 176 undecided = dag.nodeset()
172 177 # own nodes I know we both know
173 178 common = set()
174 179 # own nodes I know remote lacks
175 180 missing = set()
176 181
177 182 # treat remote heads (and maybe own heads) as a first implicit sample
178 183 # response
179 184 common.update(dag.ancestorset(srvheads))
180 185 undecided.difference_update(common)
181 186
182 187 full = False
183 188 while undecided:
184 189
185 190 if sample:
186 191 commoninsample = set(n for i, n in enumerate(sample) if yesno[i])
187 192 common.update(dag.ancestorset(commoninsample, common))
188 193
189 194 missinginsample = [n for i, n in enumerate(sample) if not yesno[i]]
190 195 missing.update(dag.descendantset(missinginsample, missing))
191 196
192 197 undecided.difference_update(missing)
193 198 undecided.difference_update(common)
194 199
195 200 if not undecided:
196 201 break
197 202
198 203 if full:
199 204 ui.note(_("sampling from both directions\n"))
200 205 sample = _takefullsample(dag, undecided, size=fullsamplesize)
201 206 elif common:
202 207 # use cheapish initial sample
203 208 ui.debug("taking initial sample\n")
204 209 sample = _takefullsample(dag, undecided, size=fullsamplesize)
205 210 else:
206 211 # use even cheaper initial sample
207 212 ui.debug("taking quick initial sample\n")
208 213 sample = _takequicksample(dag, undecided, size=initialsamplesize,
209 214 initial=True)
210 215
211 216 roundtrips += 1
212 217 ui.progress(_('searching'), roundtrips, unit=_('queries'))
213 218 ui.debug("query %i; still undecided: %i, sample size is: %i\n"
214 219 % (roundtrips, len(undecided), len(sample)))
215 220 # indices between sample and externalized version must match
216 221 sample = list(sample)
217 222 yesno = remote.known(dag.externalizeall(sample))
218 223 full = True
219 224
220 225 result = dag.headsetofconnecteds(common)
221 226 ui.progress(_('searching'), None)
222 227 ui.debug("%d total queries\n" % roundtrips)
223 228
224 229 if not result and srvheadhashes != [nullid]:
225 230 if abortwhenunrelated:
226 231 raise util.Abort(_("repository is unrelated"))
227 232 else:
228 233 ui.warn(_("warning: repository is unrelated\n"))
229 234 return (set([nullid]), True, srvheadhashes,)
230 235
231 236 anyincoming = (srvheadhashes != [nullid])
232 237 return dag.externalizeall(result), anyincoming, srvheadhashes
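
The classification loop described in the module docstring above is easy to prototype outside Mercurial. Here is a toy sketch, assuming a plain-dict DAG and a stubbed remote_known() in place of the real wire protocol; every name in it is illustrative, none are Mercurial APIs:

import random

# Toy DAG: each node maps to its set of parents; node 4 is the only head.
parents = {0: set(), 1: {0}, 2: {1}, 3: {2}, 4: {3}}

def ancestors(node):
    # A node counts as its own ancestor here, so a positive answer
    # also classifies the sampled node itself.
    seen, stack = set(), [node]
    while stack:
        n = stack.pop()
        if n not in seen:
            seen.add(n)
            stack.extend(parents[n])
    return seen

def descendants(node):
    return {n for n in parents if node in ancestors(n)}

def remote_known(sample):
    # Stub standing in for the known() wire call: pretend the remote
    # only has nodes 0..2.
    return [n <= 2 for n in sample]

unknown, common, missing = set(parents), set(), set()
while unknown:
    sample = random.sample(list(unknown), min(2, len(unknown)))
    for node, known in zip(sample, remote_known(sample)):
        if known:
            common |= ancestors(node)     # remote has it: ancestors too
        else:
            missing |= descendants(node)  # remote lacks it: descendants too
    unknown -= common | missing

print(sorted(common), sorted(missing))  # [0, 1, 2] [3, 4]

Whatever order the random samples arrive in, the loop ends with the same partition, since every answer removes at least the sampled node from unknown.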
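The "bisecting" optimization from the docstring is what _updatesample implements: walk breadth-first from the heads and keep the nodes whose distance from a head is a power of two, so that one answer about a kept node classifies a long stretch of the DAG around it. A self-contained sketch of that distance rule, under the same toy-DAG assumptions as above (sample_by_distance is a hypothetical name, not part of the module):

from collections import deque

def sample_by_distance(heads, parents, quicksamplesize=0):
    # Keep nodes whose BFS distance from a head is a power of two,
    # mirroring the factor-doubling in _updatesample.
    sample, dist, factor = set(), {}, 1
    visit, seen = deque(heads), set()
    while visit:
        curr = visit.popleft()
        if curr in seen:
            continue
        d = dist.setdefault(curr, 1)
        if d > factor:
            factor *= 2
        if d == factor:
            sample.add(curr)
            if quicksamplesize and len(sample) >= quicksamplesize:
                return sample
        seen.add(curr)
        for p in parents.get(curr, ()):
            dist.setdefault(p, d + 1)
            visit.append(p)
    return sample

# A 10-node chain where node i's parent is i - 1; node 9 is the head.
chain = {i: {i - 1} for i in range(1, 10)}
print(sorted(sample_by_distance({9}, chain)))  # [2, 6, 8, 9]

On the chain this keeps the nodes at distances 1, 2, 4 and 8 from the head, exactly the doubling pattern _updatesample produces.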
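Finally, the _limitsample helper extracted here is small enough to exercise on its own. A minimal standalone sketch, adapted for modern Python, where random.sample() requires a sequence (the original code ran under Python 2, which also accepted a set):

import random

def _limitsample(sample, desiredlen):
    """return a random subset of sample of at most desiredlen items"""
    if len(sample) > desiredlen:
        sample = set(random.sample(list(sample), desiredlen))
    return sample

# Clamp an oversized sample before querying the remote with it.
nodes = set(range(1000))
capped = _limitsample(nodes, 200)
assert len(capped) == 200 and capped <= nodes
# A sample already within the limit comes back unchanged.
assert _limitsample({1, 2, 3}, 200) == {1, 2, 3}

With the clamp factored out, _takefullsample only needs the under-sized branch; the over-sized case is handled in one shared place.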