upstream/mercurial-mirror Commit - r20656:cdecbc5a

1

# setdiscovery.py - improved discovery of common nodeset for mercurial

1

# setdiscovery.py - improved discovery of common nodeset for mercurial

2

#

2

#

3

4

# and Peter Arrenbrecht <peter@arrenbrecht.ch>

4

# and Peter Arrenbrecht <peter@arrenbrecht.ch>

5

#

5

#

6

# This software may be used and distributed according to the terms of the

6

# This software may be used and distributed according to the terms of the

7

# GNU General Public License version 2 or any later version.

7

# GNU General Public License version 2 or any later version.

8

"""

9

Algorithm works in the following way. You have two repository: local and

10

remote. They both contains a DAG of changelists.

11

12

The goal of the discovery protocol is to find one set of node *common*,

13

the set of nodes shared by local and remote.

14

15

One of the issue with the original protocol was latency, it could

16

potentially require lots of roundtrips to discover that the local repo was a

17

subset of remote (which is a very common case, you usually have few changes

18

compared to upstream, while upstream probably had lots of development).

19

20

The new protocol only requires one interface for the remote repo: `known()`,

21

which given a set of changelists tells you if they are present in the DAG.

22

23

The algorithm then works as follow:

24

25

- We will be using three sets, `common`, `missing`, `unknown`. Originally

26

all nodes are in `unknown`.

27

- Take a sample from `unknown`, call `remote.known(sample)`

28

- For each node that remote knows, move it and all its ancestors to `common`

29

- For each node that remote doesn't know, move it and all its descendants

30

to `missing`

31

- Iterate until `unknown` is empty

32

33

There are a couple optimizations, first is instead of starting with a random

34

sample of missing, start by sending all heads, in the case where the local

35

repo is a subset, you computed the answer in one round trip.

36

37

Then you can do something similar to the bisecting strategy used when

38

finding faulty changesets. Instead of random samples, you can try picking

39

nodes that will maximize the number of nodes that will be

40

classified with it (since all ancestors or descendants will be marked as well).

41

"""

8

42

9

from node import nullid

43

from node import nullid

10

from i18n import _

44

from i18n import _

11

import random

45

import random

12

import util, dagutil

46

import util, dagutil

13

47

14

def _updatesample(dag, nodes, sample, always, quicksamplesize=0):

48

def _updatesample(dag, nodes, sample, always, quicksamplesize=0):

15

# if nodes is empty we scan the entire graph

49

# if nodes is empty we scan the entire graph

16

if nodes:

50

if nodes:

17

heads = dag.headsetofconnecteds(nodes)

51

heads = dag.headsetofconnecteds(nodes)

18

else:

52

else:

19

heads = dag.heads()

53

heads = dag.heads()

20

dist = {}

54

dist = {}

21

visit = util.deque(heads)

55

visit = util.deque(heads)

22

seen = set()

56

seen = set()

23

factor = 1

57

factor = 1

24

while visit:

58

while visit:

25

curr = visit.popleft()

59

curr = visit.popleft()

26

if curr in seen:

60

if curr in seen:

27

continue

61

continue

28

d = dist.setdefault(curr, 1)

62

d = dist.setdefault(curr, 1)

29

if d > factor:

63

if d > factor:

30

factor *= 2

64

factor *= 2

31

if d == factor:

65

if d == factor:

32

if curr not in always: # need this check for the early exit below

66

if curr not in always: # need this check for the early exit below

33

sample.add(curr)

67

sample.add(curr)

34

if quicksamplesize and (len(sample) >= quicksamplesize):

68

if quicksamplesize and (len(sample) >= quicksamplesize):

35

return

69

return

36

seen.add(curr)

70

seen.add(curr)

37

for p in dag.parents(curr):

71

for p in dag.parents(curr):

38

if not nodes or p in nodes:

72

if not nodes or p in nodes:

39

dist.setdefault(p, d + 1)

73

dist.setdefault(p, d + 1)

40

visit.append(p)

74

visit.append(p)

41

75

42

def _setupsample(dag, nodes, size):

76

def _setupsample(dag, nodes, size):

43

if len(nodes) <= size:

77

if len(nodes) <= size:

44

return set(nodes), None, 0

78

return set(nodes), None, 0

45

always = dag.headsetofconnecteds(nodes)

79

always = dag.headsetofconnecteds(nodes)

46

desiredlen = size - len(always)

80

desiredlen = size - len(always)

47

if desiredlen <= 0:

81

if desiredlen <= 0:

48

# This could be bad if there are very many heads, all unknown to the

82

# This could be bad if there are very many heads, all unknown to the

49

# server. We're counting on long request support here.

83

# server. We're counting on long request support here.

50

return always, None, desiredlen

84

return always, None, desiredlen

51

return always, set(), desiredlen

85

return always, set(), desiredlen

52

86

53

def _takequicksample(dag, nodes, size, initial):

87

def _takequicksample(dag, nodes, size, initial):

54

always, sample, desiredlen = _setupsample(dag, nodes, size)

88

always, sample, desiredlen = _setupsample(dag, nodes, size)

55

if sample is None:

89

if sample is None:

56

return always

90

return always

57

if initial:

91

if initial:

58

fromset = None

92

fromset = None

59

else:

93

else:

60

fromset = nodes

94

fromset = nodes

61

_updatesample(dag, fromset, sample, always, quicksamplesize=desiredlen)

95

_updatesample(dag, fromset, sample, always, quicksamplesize=desiredlen)

62

sample.update(always)

96

sample.update(always)

63

return sample

97

return sample

64

98

65

def _takefullsample(dag, nodes, size):

99

def _takefullsample(dag, nodes, size):

66

always, sample, desiredlen = _setupsample(dag, nodes, size)

100

always, sample, desiredlen = _setupsample(dag, nodes, size)

67

if sample is None:

101

if sample is None:

68

return always

102

return always

69

# update from heads

103

# update from heads

70

_updatesample(dag, nodes, sample, always)

104

_updatesample(dag, nodes, sample, always)

71

# update from roots

105

# update from roots

72

_updatesample(dag.inverse(), nodes, sample, always)

106

_updatesample(dag.inverse(), nodes, sample, always)

73

assert sample

107

assert sample

74

if len(sample) > desiredlen:

108

if len(sample) > desiredlen:

75

sample = set(random.sample(sample, desiredlen))

109

sample = set(random.sample(sample, desiredlen))

76

elif len(sample) < desiredlen:

110

elif len(sample) < desiredlen:

77

more = desiredlen - len(sample)

111

more = desiredlen - len(sample)

78

sample.update(random.sample(list(nodes - sample - always), more))

112

sample.update(random.sample(list(nodes - sample - always), more))

79

sample.update(always)

113

sample.update(always)

80

return sample

114

return sample

81

115

82

def findcommonheads(ui, local, remote,

116

def findcommonheads(ui, local, remote,

83

initialsamplesize=100,

117

initialsamplesize=100,

84

fullsamplesize=200,

118

fullsamplesize=200,

85

abortwhenunrelated=True):

119

abortwhenunrelated=True):

86

'''Return a tuple (common, anyincoming, remoteheads) used to identify

120

'''Return a tuple (common, anyincoming, remoteheads) used to identify

87

missing nodes from or in remote.

121

missing nodes from or in remote.

88

'''

122

'''

89

roundtrips = 0

123

roundtrips = 0

90

cl = local.changelog

124

cl = local.changelog

91

dag = dagutil.revlogdag(cl)

125

dag = dagutil.revlogdag(cl)

92

126

93

# early exit if we know all the specified remote heads already

127

# early exit if we know all the specified remote heads already

94

ui.debug("query 1; heads\n")

128

ui.debug("query 1; heads\n")

95

roundtrips += 1

129

roundtrips += 1

96

ownheads = dag.heads()

130

ownheads = dag.heads()

97

sample = ownheads

131

sample = ownheads

98

if remote.local():

132

if remote.local():

99

# stopgap until we have a proper localpeer that supports batch()

133

# stopgap until we have a proper localpeer that supports batch()

100

srvheadhashes = remote.heads()

134

srvheadhashes = remote.heads()

101

yesno = remote.known(dag.externalizeall(sample))

135

yesno = remote.known(dag.externalizeall(sample))

102

elif remote.capable('batch'):

136

elif remote.capable('batch'):

103

batch = remote.batch()

137

batch = remote.batch()

104

srvheadhashesref = batch.heads()

138

srvheadhashesref = batch.heads()

105

yesnoref = batch.known(dag.externalizeall(sample))

139

yesnoref = batch.known(dag.externalizeall(sample))

106

batch.submit()

140

batch.submit()

107

srvheadhashes = srvheadhashesref.value

141

srvheadhashes = srvheadhashesref.value

108

yesno = yesnoref.value

142

yesno = yesnoref.value

109

else:

143

else:

110

# compatibility with pre-batch, but post-known remotes during 1.9

144

# compatibility with pre-batch, but post-known remotes during 1.9

111

# development

145

# development

112

srvheadhashes = remote.heads()

146

srvheadhashes = remote.heads()

113

sample = []

147

sample = []

114

148

115

if cl.tip() == nullid:

149

if cl.tip() == nullid:

116

if srvheadhashes != [nullid]:

150

if srvheadhashes != [nullid]:

117

return [nullid], True, srvheadhashes

151

return [nullid], True, srvheadhashes

118

return [nullid], False, []

152

return [nullid], False, []

119

153

120

# start actual discovery (we note this before the next "if" for

154

# start actual discovery (we note this before the next "if" for

121

# compatibility reasons)

155

# compatibility reasons)

122

ui.status(_("searching for changes\n"))

156

ui.status(_("searching for changes\n"))

123

157

124

srvheads = dag.internalizeall(srvheadhashes, filterunknown=True)

158

srvheads = dag.internalizeall(srvheadhashes, filterunknown=True)

125

if len(srvheads) == len(srvheadhashes):

159

if len(srvheads) == len(srvheadhashes):

126

ui.debug("all remote heads known locally\n")

160

ui.debug("all remote heads known locally\n")

127

return (srvheadhashes, False, srvheadhashes,)

161

return (srvheadhashes, False, srvheadhashes,)

128

162

129

if sample and util.all(yesno):

163

if sample and util.all(yesno):

130

ui.note(_("all local heads known remotely\n"))

164

ui.note(_("all local heads known remotely\n"))

131

ownheadhashes = dag.externalizeall(ownheads)

165

ownheadhashes = dag.externalizeall(ownheads)

132

return (ownheadhashes, True, srvheadhashes,)

166

return (ownheadhashes, True, srvheadhashes,)

133

167

134

# full blown discovery

168

# full blown discovery

135

169

136

# own nodes where I don't know if remote knows them

170

# own nodes where I don't know if remote knows them

137

undecided = dag.nodeset()

171

undecided = dag.nodeset()

138

# own nodes I know we both know

172

# own nodes I know we both know

139

common = set()

173

common = set()

140

# own nodes I know remote lacks

174

# own nodes I know remote lacks

141

missing = set()

175

missing = set()

142

176

143

# treat remote heads (and maybe own heads) as a first implicit sample

177

# treat remote heads (and maybe own heads) as a first implicit sample

144

# response

178

# response

145

common.update(dag.ancestorset(srvheads))

179

common.update(dag.ancestorset(srvheads))

146

undecided.difference_update(common)

180

undecided.difference_update(common)

147

181

148

full = False

182

full = False

149

while undecided:

183

while undecided:

150

184

151

if sample:

185

if sample:

152

commoninsample = set(n for i, n in enumerate(sample) if yesno[i])

186

commoninsample = set(n for i, n in enumerate(sample) if yesno[i])

153

common.update(dag.ancestorset(commoninsample, common))

187

common.update(dag.ancestorset(commoninsample, common))

154

188

155

missinginsample = [n for i, n in enumerate(sample) if not yesno[i]]

189

missinginsample = [n for i, n in enumerate(sample) if not yesno[i]]

156

missing.update(dag.descendantset(missinginsample, missing))

190

missing.update(dag.descendantset(missinginsample, missing))

157

191

158

undecided.difference_update(missing)

192

undecided.difference_update(missing)

159

undecided.difference_update(common)

193

undecided.difference_update(common)

160

194

161

if not undecided:

195

if not undecided:

162

break

196

break

163

197

164

if full:

198

if full:

165

ui.note(_("sampling from both directions\n"))

199

ui.note(_("sampling from both directions\n"))

166

sample = _takefullsample(dag, undecided, size=fullsamplesize)

200

sample = _takefullsample(dag, undecided, size=fullsamplesize)

167

elif common:

201

elif common:

168

# use cheapish initial sample

202

# use cheapish initial sample

169

ui.debug("taking initial sample\n")

203

ui.debug("taking initial sample\n")

170

sample = _takefullsample(dag, undecided, size=fullsamplesize)

204

sample = _takefullsample(dag, undecided, size=fullsamplesize)

171

else:

205

else:

172

# use even cheaper initial sample

206

# use even cheaper initial sample

173

ui.debug("taking quick initial sample\n")

207

ui.debug("taking quick initial sample\n")

174

sample = _takequicksample(dag, undecided, size=initialsamplesize,

208

sample = _takequicksample(dag, undecided, size=initialsamplesize,

175

initial=True)

209

initial=True)

176

210

177

roundtrips += 1

211

roundtrips += 1

178

ui.progress(_('searching'), roundtrips, unit=_('queries'))

212

ui.progress(_('searching'), roundtrips, unit=_('queries'))

179

ui.debug("query %i; still undecided: %i, sample size is: %i\n"

213

ui.debug("query %i; still undecided: %i, sample size is: %i\n"

180

% (roundtrips, len(undecided), len(sample)))

214

% (roundtrips, len(undecided), len(sample)))

181

# indices between sample and externalized version must match

215

# indices between sample and externalized version must match

182

sample = list(sample)

216

sample = list(sample)

183

yesno = remote.known(dag.externalizeall(sample))

217

yesno = remote.known(dag.externalizeall(sample))

184

full = True

218

full = True

185

219

186

result = dag.headsetofconnecteds(common)

220

result = dag.headsetofconnecteds(common)

187

ui.progress(_('searching'), None)

221

ui.progress(_('searching'), None)

188

ui.debug("%d total queries\n" % roundtrips)

222

ui.debug("%d total queries\n" % roundtrips)

189

223

190

if not result and srvheadhashes != [nullid]:

224

if not result and srvheadhashes != [nullid]:

191

if abortwhenunrelated:

225

if abortwhenunrelated:

192

raise util.Abort(_("repository is unrelated"))

226

raise util.Abort(_("repository is unrelated"))

193

else:

227

else:

194

ui.warn(_("warning: repository is unrelated\n"))

228

ui.warn(_("warning: repository is unrelated\n"))

195

return (set([nullid]), True, srvheadhashes,)

229

return (set([nullid]), True, srvheadhashes,)

196

230

197

anyincoming = (srvheadhashes != [nullid])

231

anyincoming = (srvheadhashes != [nullid])

198

return dag.externalizeall(result), anyincoming, srvheadhashes

232

return dag.externalizeall(result), anyincoming, srvheadhashes

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

             # setdiscovery.py - improved discovery of common nodeset for mercurial
             #
             # Copyright 2010 Benoit Boissinot <bboissin@gmail.com>
             # and Peter Arrenbrecht <peter@arrenbrecht.ch>
             #
             # This software may be used and distributed according to the terms of the
             # GNU General Public License version 2 or any later version.
+            """
+            The algorithm works in the following way. You have two repositories:
+            local and remote. They both contain a DAG of changelists.
+            The goal of the discovery protocol is to find one set of nodes, *common*:
+            the set of nodes shared by local and remote.
+            One of the issues with the original protocol was latency: it could
+            potentially require lots of roundtrips to discover that the local repo was a
+            subset of remote (which is a very common case: you usually have few changes
+            compared to upstream, while upstream probably had lots of development).
+            The new protocol only requires one interface for the remote repo: `known()`,
+            which given a set of changelists tells you if they are present in the DAG.
+            The algorithm then works as follows:
+             - We will be using three sets, `common`, `missing`, `unknown`. Originally
+             all nodes are in `unknown`.
+             - Take a sample from `unknown`, call `remote.known(sample)`
+               - For each node that remote knows, move it and all its ancestors to `common`
+               - For each node that remote doesn't know, move it and all its descendants
+               to `missing`
+             - Iterate until `unknown` is empty
+            There are a couple of optimizations. The first is, instead of starting with a
+            random sample of `unknown`, to start by sending all heads: in the case where
+            the local repo is a subset, you have computed the answer in one round trip.
+            Then you can do something similar to the bisecting strategy used when
+            finding faulty changesets. Instead of random samples, you can try picking
+            nodes that will maximize the number of nodes that will be
+            classified with it (since all ancestors or descendants will be marked as well).
+            """
             from node import nullid
             from i18n import _
             import random
             import util, dagutil
             def _updatesample(dag, nodes, sample, always, quicksamplesize=0):
                 '''Grow ``sample`` in place with nodes picked by a breadth-first walk.

                 The walk starts from ``dag.headsetofconnecteds(nodes)``, or from
                 ``dag.heads()`` when ``nodes`` is empty/None (in which case the whole
                 graph is scanned), and follows ``dag.parents()``. Start nodes are at
                 distance 1; a parent gets the distance of the node that first
                 reached it, plus one.

                 A node is added to ``sample`` when its distance equals the current
                 factor (1, doubled whenever a larger distance is met) and it is not
                 in ``always``. If ``quicksamplesize`` is non-zero, the walk returns as
                 soon as ``sample`` holds at least that many nodes. Returns None.
                 '''
                 if nodes:
                     starts = dag.headsetofconnecteds(nodes)
                 else:
                     # no restriction given: begin at the heads of the full graph
                     starts = dag.heads()
                 depth = {}
                 queue = util.deque(starts)
                 done = set()
                 step = 1
                 while queue:
                     node = queue.popleft()
                     if node in done:
                         continue
                     here = depth.setdefault(node, 1)
                     if here > step:
                         step *= 2
                     # members of ``always`` are skipped so that they do not count
                     # towards the early exit on quicksamplesize
                     if here == step and node not in always:
                         sample.add(node)
                         if quicksamplesize and len(sample) >= quicksamplesize:
                             return
                     done.add(node)
                     for parent in dag.parents(node):
                         if nodes and parent not in nodes:
                             # stay inside the requested subset
                             continue
                         depth.setdefault(parent, here + 1)
                         queue.append(parent)
             def _setupsample(dag, nodes, size):
                 if len(nodes) <= size:
                     return set(nodes), None, 0
                 always = dag.headsetofconnecteds(nodes)
                 desiredlen = size - len(always)
                 if desiredlen <= 0:
                     # This could be bad if there are very many heads, all unknown to the
                     # server. We're counting on long request support here.
                     return always, None, desiredlen
                 return always, set(), desiredlen
             def _takequicksample(dag, nodes, size, initial):
                 '''Return a cheap sample of ``nodes`` built by a single walk from heads.

                 The result always contains the set returned as ``always`` by
                 ``_setupsample``. When ``_setupsample`` reports that no further
                 picking is needed, that set is returned directly. Otherwise
                 ``_updatesample`` is run once, stopping early once the remaining
                 budget is reached. With ``initial`` true the walk is not restricted
                 to ``nodes`` (None is passed, so the entire graph is scanned).
                 '''
                 heads, picked, budget = _setupsample(dag, nodes, size)
                 if picked is not None:
                     scope = nodes
                     if initial:
                         # first sample: walk the whole graph
                         scope = None
                     _updatesample(dag, scope, picked, heads, quicksamplesize=budget)
                     picked.update(heads)
                     return picked
                 return heads
             def _takefullsample(dag, nodes, size):
                 '''Return a sample of ``nodes`` gathered from both ends of the graph.

                 The set returned as ``always`` by ``_setupsample`` is part of every
                 result; if ``_setupsample`` reports that no further picking is
                 needed, it is returned directly. Otherwise ``_updatesample`` is run on
                 ``dag`` and then on ``dag.inverse()``, and the picked nodes are
                 randomly trimmed or randomly topped up from the rest of ``nodes`` so
                 that exactly ``desiredlen`` of them accompany ``always``.
                 '''
                 heads, picked, budget = _setupsample(dag, nodes, size)
                 if picked is None:
                     return heads
                 # walk down from the heads first ...
                 _updatesample(dag, nodes, picked, heads)
                 # ... then walk the inverted graph, i.e. up from the roots
                 reverse = dag.inverse()
                 _updatesample(reverse, nodes, picked, heads)
                 assert picked
                 surplus = len(picked) - budget
                 if surplus > 0:
                     # too many candidates: keep a random subset
                     picked = set(random.sample(picked, budget))
                 elif surplus < 0:
                     # too few: fill up with random nodes not selected so far
                     pool = list(nodes - picked - heads)
                     picked.update(random.sample(pool, -surplus))
                 picked.update(heads)
                 return picked
             def findcommonheads(ui, local, remote,
                                 initialsamplesize=100,
                                 fullsamplesize=200,
                                 abortwhenunrelated=True):
                 '''Return a tuple (common, anyincoming, remoteheads) used to identify
                 missing nodes from or in remote.

                 ui is used for debug/status/note/progress/warn output.
                 local provides the changelog the discovery DAG is built from.
                 remote is queried through heads() and known() (batched in one
                 request when it advertises the 'batch' capability).
                 initialsamplesize is the sample size for the quick first sample,
                 fullsamplesize the one for every full sample.
                 abortwhenunrelated selects what happens when no common head is found
                 although remote is not empty: raise util.Abort when true, otherwise
                 warn and return (set([nullid]), True, remoteheads).

                 In the returned tuple, common holds the heads of the common set (as
                 externalized by the dag), anyincoming is a boolean and remoteheads is
                 the head list reported by remote ([] when both sides are empty).
                 '''
                 roundtrips = 0
                 cl = local.changelog
                 dag = dagutil.revlogdag(cl)
                 # early exit if we know all the specified remote heads already
                 ui.debug("query 1; heads\n")
                 roundtrips += 1
                 ownheads = dag.heads()
                 # the first sample is our own heads; yesno[i] is the remote's answer
                 # for sample[i]
                 sample = ownheads
                 if remote.local():
                     # stopgap until we have a proper localpeer that supports batch()
                     srvheadhashes = remote.heads()
                     yesno = remote.known(dag.externalizeall(sample))
                 elif remote.capable('batch'):
                     # ask for remote heads and known(own heads) in a single submit
                     batch = remote.batch()
                     srvheadhashesref = batch.heads()
                     yesnoref = batch.known(dag.externalizeall(sample))
                     batch.submit()
                     srvheadhashes = srvheadhashesref.value
                     yesno = yesnoref.value
                 else:
                     # compatibility with pre-batch, but post-known remotes during 1.9
                     # development
                     srvheadhashes = remote.heads()
                     # no known() answer here: an empty sample makes the checks below
                     # skip yesno, which is not set on this path
                     sample = []
                 if cl.tip() == nullid:
                     # local tip is nullid: anything but [nullid] on the remote side is
                     # reported as incoming
                     if srvheadhashes != [nullid]:
                         return [nullid], True, srvheadhashes
                     return [nullid], False, []
                 # start actual discovery (we note this before the next "if" for
                 # compatibility reasons)
                 ui.status(_("searching for changes\n"))
                 # filterunknown=True: presumably drops remote heads we do not have, so
                 # equal lengths mean every remote head exists locally
                 srvheads = dag.internalizeall(srvheadhashes, filterunknown=True)
                 if len(srvheads) == len(srvheadhashes):
                     ui.debug("all remote heads known locally\n")
                     return (srvheadhashes, False, srvheadhashes,)
                 if sample and util.all(yesno):
                     ui.note(_("all local heads known remotely\n"))
                     ownheadhashes = dag.externalizeall(ownheads)
                     return (ownheadhashes, True, srvheadhashes,)
                 # full blown discovery
                 # own nodes where I don't know if remote knows them
                 undecided = dag.nodeset()
                 # own nodes I know we both know
                 common = set()
                 # own nodes I know remote lacks
                 missing = set()
                 # treat remote heads (and maybe own heads) as a first implicit sample
                 # response
                 common.update(dag.ancestorset(srvheads))
                 undecided.difference_update(common)
                 # full becomes True after the first query issued from the loop
                 full = False
                 while undecided:
                     # fold the answer to the previous query into common/missing
                     if sample:
                         commoninsample = set(n for i, n in enumerate(sample) if yesno[i])
                         common.update(dag.ancestorset(commoninsample, common))
                         missinginsample = [n for i, n in enumerate(sample) if not yesno[i]]
                         missing.update(dag.descendantset(missinginsample, missing))
                         undecided.difference_update(missing)
                         undecided.difference_update(common)
                     if not undecided:
                         break
                     # choose the next sample to send
                     if full:
                         ui.note(_("sampling from both directions\n"))
                         sample = _takefullsample(dag, undecided, size=fullsamplesize)
                     elif common:
                         # use cheapish initial sample
                         ui.debug("taking initial sample\n")
                         sample = _takefullsample(dag, undecided, size=fullsamplesize)
                     else:
                         # use even cheaper initial sample
                         ui.debug("taking quick initial sample\n")
                         sample = _takequicksample(dag, undecided, size=initialsamplesize,
                                                   initial=True)
                     roundtrips += 1
                     ui.progress(_('searching'), roundtrips, unit=_('queries'))
                     ui.debug("query %i; still undecided: %i, sample size is: %i\n"
                              % (roundtrips, len(undecided), len(sample)))
                     # indices between sample and externalized version must match
                     sample = list(sample)
                     yesno = remote.known(dag.externalizeall(sample))
                     full = True
                 # the answer is the heads of everything classified as common
                 result = dag.headsetofconnecteds(common)
                 # NOTE(review): passing None presumably ends the progress display -
                 # confirm against ui.progress
                 ui.progress(_('searching'), None)
                 ui.debug("%d total queries\n" % roundtrips)
                 if not result and srvheadhashes != [nullid]:
                     # nothing in common although remote is not empty
                     if abortwhenunrelated:
                         raise util.Abort(_("repository is unrelated"))
                     else:
                         ui.warn(_("warning: repository is unrelated\n"))
                     return (set([nullid]), True, srvheadhashes,)
                 anyincoming = (srvheadhashes != [nullid])
                 return dag.externalizeall(result), anyincoming, srvheadhashes