##// END OF EJS Templates
pvec: introduce pvecs
Matt Mackall -
r16249:0d175ac5 default
parent child Browse files
Show More
@@ -0,0 +1,210 b''
1 # pvec.py - probabilistic vector clocks for Mercurial
2 #
3 # Copyright 2012 Matt Mackall <mpm@selenic.com>
4 #
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
7
8 '''
9 A "pvec" is a changeset property based on the theory of vector clocks
10 that can be compared to discover relatedness without consulting a
11 graph. This can be useful for tasks like determining how a
12 disconnected patch relates to a repository.
13
14 Currently a pvec consist of 448 bits, of which 24 are 'depth' and the
15 remainder are a bit vector. It is represented as a 70-character base85
16 string.
17
18 Construction:
19
20 - a root changeset has a depth of 0 and a bit vector based on its hash
21 - a normal commit has a changeset where depth is increased by one and
22 one bit vector bit is flipped based on its hash
23 - a merge changeset pvec is constructed by copying changes from one pvec into
24 the other to balance its depth
25
26 Properties:
27
28 - for linear changes, difference in depth is always <= hamming distance
29 - otherwise, changes are probably divergent
30 - when hamming distance is < 200, we can reliably detect when pvecs are near
31
32 Issues:
33
34 - hamming distance ceases to work over distances of ~ 200
35 - detecting divergence is less accurate when the common ancestor is very close
36 to either revision or total distance is high
37 - this could probably be improved by modeling the relation between
38 delta and hdist
39
40 Uses:
41
42 - a patch pvec can be used to locate the nearest available common ancestor for
43 resolving conflicts
44 - ordering of patches can be established without a DAG
45 - two head pvecs can be compared to determine whether push/pull/merge is needed
46 and approximately how many changesets are involved
47 - can be used to find a heuristic divergence measure between changesets on
48 different branches
49 '''
50
51 import base85, util
52 from node import nullrev
53
54 _size = 448 # 70 chars b85-encoded
55 _bytes = _size / 8
56 _depthbits = 24
57 _depthbytes = _depthbits / 8
58 _vecbytes = _bytes - _depthbytes
59 _vecbits = _vecbytes * 8
60 _radius = (_vecbits - 30) / 2 # high probability vecs are related
61
62 def _bin(bs):
63 '''convert a bytestring to a long'''
64 v = 0
65 for b in bs:
66 v = v * 256 + ord(b)
67 return v
68
69 def _str(v, l):
70 bs = ""
71 for p in xrange(l):
72 bs = chr(v & 255) + bs
73 v >>= 8
74 return bs
75
76 def _split(b):
77 '''depth and bitvec'''
78 return _bin(b[:_depthbytes]), _bin(b[_depthbytes:])
79
80 def _join(depth, bitvec):
81 return _str(depth, _depthbytes) + _str(bitvec, _vecbytes)
82
83 def _hweight(x):
84 c = 0
85 while x:
86 if x & 1:
87 c += 1
88 x >>= 1
89 return c
90 _htab = [_hweight(x) for x in xrange(256)]
91
92 def _hamming(a, b):
93 '''find the hamming distance between two longs'''
94 d = a ^ b
95 c = 0
96 while d:
97 c += _htab[d & 0xff]
98 d >>= 8
99 return c
100
101 def _mergevec(x, y, c):
102 # Ideally, this function would be x ^ y ^ ancestor, but finding
103 # ancestors is a nuisance. So instead we find the minimal number
104 # of changes to balance the depth and hamming distance
105
106 d1, v1 = x
107 d2, v2 = y
108 if d1 < d2:
109 d1, d2, v1, v2 = d2, d1, v2, v1
110
111 hdist = _hamming(v1, v2)
112 ddist = d1 - d2
113 v = v1
114 m = v1 ^ v2 # mask of different bits
115 i = 1
116
117 if hdist > ddist:
118 # if delta = 10 and hdist = 100, then we need to go up 55 steps
119 # to the ancestor and down 45
120 changes = (hdist - ddist + 1) / 2
121 else:
122 # must make at least one change
123 changes = 1
124 depth = d1 + changes
125
126 # copy changes from v2
127 if m:
128 while changes:
129 if m & i:
130 v ^= i
131 changes -= 1
132 i <<= 1
133 else:
134 v = _flipbit(v, c)
135
136 return depth, v
137
138 def _flipbit(v, node):
139 # converting bit strings to longs is slow
140 bit = (hash(node) & 0xffffffff) % _vecbits
141 return v ^ (1<<bit)
142
143 def ctxpvec(ctx):
144 '''construct a pvec for ctx while filling in the cache'''
145 r = ctx._repo
146 if not util.safehasattr(r, "_pveccache"):
147 r._pveccache = {}
148 pvc = r._pveccache
149 if ctx.rev() not in pvc:
150 cl = r.changelog
151 for n in xrange(ctx.rev() + 1):
152 if n not in pvc:
153 node = cl.node(n)
154 p1, p2 = cl.parentrevs(n)
155 if p1 == nullrev:
156 # start with a 'random' vector at root
157 pvc[n] = (0, _bin((node * 3)[:_vecbytes]))
158 elif p2 == nullrev:
159 d, v = pvc[p1]
160 pvc[n] = (d + 1, _flipbit(v, node))
161 else:
162 pvc[n] = _mergevec(pvc[p1], pvc[p2], node)
163 bs = _join(*pvc[ctx.rev()])
164 return pvec(base85.b85encode(bs))
165
166 class pvec(object):
167 def __init__(self, hashorctx):
168 if isinstance(hashorctx, str):
169 self._bs = hashorctx
170 self._depth, self._vec = _split(base85.b85decode(hashorctx))
171 else:
172 self._vec = ctxpvec(ctx)
173
174 def __str__(self):
175 return self._bs
176
177 def __eq__(self, b):
178 return self._vec == b._vec and self._depth == b._depth
179
180 def __lt__(self, b):
181 delta = b._depth - self._depth
182 if delta < 0:
183 return False # always correct
184 if _hamming(self._vec, b._vec) > delta:
185 return False
186 return True
187
188 def __gt__(self, b):
189 return b < self
190
191 def __or__(self, b):
192 delta = abs(b._depth - self._depth)
193 if _hamming(self._vec, b._vec) <= delta:
194 return False
195 return True
196
197 def __sub__(self, b):
198 if self | b:
199 raise ValueError("concurrent pvecs")
200 return self._depth - b._depth
201
202 def distance(self, b):
203 d = abs(b._depth - self._depth)
204 h = _hamming(self._vec, b._vec)
205 return max(d, h)
206
207 def near(self, b):
208 dist = abs(b.depth - self._depth)
209 if dist > _radius or _hamming(self._vec, b._vec) > _radius:
210 return False
@@ -169,7 +169,7 b' pypats = ['
169 "missing whitespace around operator"),
169 "missing whitespace around operator"),
170 (r'\s(\+=|-=|!=|<>|<=|>=|<<=|>>=)\S',
170 (r'\s(\+=|-=|!=|<>|<=|>=|<<=|>>=)\S',
171 "missing whitespace around operator"),
171 "missing whitespace around operator"),
172 (r'[^+=*/!<>&| -](\s=|=\s)[^= ]',
172 (r'[^^+=*/!<>&| -](\s=|=\s)[^= ]',
173 "wrong whitespace around ="),
173 "wrong whitespace around ="),
174 (r'raise Exception', "don't raise generic exceptions"),
174 (r'raise Exception', "don't raise generic exceptions"),
175 (r' is\s+(not\s+)?["\'0-9-]', "object comparison with literal"),
175 (r' is\s+(not\s+)?["\'0-9-]', "object comparison with literal"),
@@ -16,7 +16,7 b' import sshserver, hgweb, hgweb.server, c'
16 import merge as mergemod
16 import merge as mergemod
17 import minirst, revset, fileset
17 import minirst, revset, fileset
18 import dagparser, context, simplemerge
18 import dagparser, context, simplemerge
19 import random, setdiscovery, treediscovery, dagutil
19 import random, setdiscovery, treediscovery, dagutil, pvec
20 import phases
20 import phases
21
21
22 table = {}
22 table = {}
@@ -1963,6 +1963,27 b' def debugpushkey(ui, repopath, namespace'
1963 ui.write("%s\t%s\n" % (k.encode('string-escape'),
1963 ui.write("%s\t%s\n" % (k.encode('string-escape'),
1964 v.encode('string-escape')))
1964 v.encode('string-escape')))
1965
1965
1966 @command('debugpvec', [], _('A B'))
1967 def debugpvec(ui, repo, a, b=None):
1968 ca = scmutil.revsingle(repo, a)
1969 cb = scmutil.revsingle(repo, b)
1970 pa = pvec.ctxpvec(ca)
1971 pb = pvec.ctxpvec(cb)
1972 if pa == pb:
1973 rel = "="
1974 elif pa > pb:
1975 rel = ">"
1976 elif pa < pb:
1977 rel = "<"
1978 elif pa | pb:
1979 rel = "|"
1980 ui.write(_("a: %s\n") % pa)
1981 ui.write(_("b: %s\n") % pb)
1982 ui.write(_("depth(a): %d depth(b): %d\n") % (pa._depth, pb._depth))
1983 ui.write(_("delta: %d hdist: %d distance: %d relation: %s\n") %
1984 (abs(pa._depth - pb._depth), pvec._hamming(pa._vec, pb._vec),
1985 pa.distance(pb), rel))
1986
1966 @command('debugrebuildstate',
1987 @command('debugrebuildstate',
1967 [('r', 'rev', '', _('revision to rebuild to'), _('REV'))],
1988 [('r', 'rev', '', _('revision to rebuild to'), _('REV'))],
1968 _('[-r REV] [REV]'))
1989 _('[-r REV] [REV]'))
@@ -87,6 +87,7 b' Show debug commands if there are no othe'
87 debuginstall
87 debuginstall
88 debugknown
88 debugknown
89 debugpushkey
89 debugpushkey
90 debugpvec
90 debugrebuildstate
91 debugrebuildstate
91 debugrename
92 debugrename
92 debugrevlog
93 debugrevlog
@@ -236,6 +237,7 b' Show all commands + options'
236 debuginstall:
237 debuginstall:
237 debugknown:
238 debugknown:
238 debugpushkey:
239 debugpushkey:
240 debugpvec:
239 debugrebuildstate: rev
241 debugrebuildstate: rev
240 debugrename: rev
242 debugrename: rev
241 debugrevlog: changelog, manifest, dump
243 debugrevlog: changelog, manifest, dump
General Comments 0
You need to be logged in to leave comments. Login now