pvec: replace 'ctx._repo' with 'ctx.repo()'
Matt Harbison
r24339:bcc319d9 default
@@ -1,210 +1,210 @@
# pvec.py - probabilistic vector clocks for Mercurial
#
# Copyright 2012 Matt Mackall <mpm@selenic.com>
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.

'''
A "pvec" is a changeset property based on the theory of vector clocks
that can be compared to discover relatedness without consulting a
graph. This can be useful for tasks like determining how a
disconnected patch relates to a repository.

Currently a pvec consists of 448 bits, of which 24 are 'depth' and the
remainder are a bit vector. It is represented as a 70-character base85
string.

Construction:

- a root changeset has a depth of 0 and a bit vector based on its hash
- a normal commit has a pvec where depth is increased by one and
  one bit vector bit is flipped based on its hash
- a merge changeset pvec is constructed by copying changes from one pvec into
  the other to balance its depth

Properties:

- for linear changes, hamming distance is always <= difference in depth
- otherwise, changes are probably divergent
- when hamming distance is < 200, we can reliably detect when pvecs are near

Issues:

- hamming distance ceases to work over distances of ~ 200
- detecting divergence is less accurate when the common ancestor is very close
  to either revision or total distance is high
- this could probably be improved by modeling the relation between
  delta and hdist

Uses:

- a patch pvec can be used to locate the nearest available common ancestor for
  resolving conflicts
- ordering of patches can be established without a DAG
- two head pvecs can be compared to determine whether push/pull/merge is needed
  and approximately how many changesets are involved
- can be used to find a heuristic divergence measure between changesets on
  different branches
'''

import base85, util
from node import nullrev

_size = 448 # 70 chars b85-encoded
_bytes = _size / 8
_depthbits = 24
_depthbytes = _depthbits / 8
_vecbytes = _bytes - _depthbytes
_vecbits = _vecbytes * 8
_radius = (_vecbits - 30) / 2 # high probability vectors are related

def _bin(bs):
    '''convert a bytestring to a long'''
    v = 0
    for b in bs:
        v = v * 256 + ord(b)
    return v

def _str(v, l):
    bs = ""
    for p in xrange(l):
        bs = chr(v & 255) + bs
        v >>= 8
    return bs

def _split(b):
    '''depth and bitvec'''
    return _bin(b[:_depthbytes]), _bin(b[_depthbytes:])

def _join(depth, bitvec):
    return _str(depth, _depthbytes) + _str(bitvec, _vecbytes)

def _hweight(x):
    c = 0
    while x:
        if x & 1:
            c += 1
        x >>= 1
    return c
_htab = [_hweight(x) for x in xrange(256)]

def _hamming(a, b):
    '''find the hamming distance between two longs'''
    d = a ^ b
    c = 0
    while d:
        c += _htab[d & 0xff]
        d >>= 8
    return c

def _mergevec(x, y, c):
    # Ideally, this function would be x ^ y ^ ancestor, but finding
    # ancestors is a nuisance. So instead we find the minimal number
    # of changes to balance the depth and hamming distance

    d1, v1 = x
    d2, v2 = y
    if d1 < d2:
        d1, d2, v1, v2 = d2, d1, v2, v1

    hdist = _hamming(v1, v2)
    ddist = d1 - d2
    v = v1
    m = v1 ^ v2 # mask of different bits
    i = 1

    if hdist > ddist:
        # if delta = 10 and hdist = 100, then we need to go up 55 steps
        # to the ancestor and down 45
        changes = (hdist - ddist + 1) / 2
    else:
        # must make at least one change
        changes = 1
    depth = d1 + changes

    # copy changes from v2
    if m:
        while changes:
            if m & i:
                v ^= i
                changes -= 1
            i <<= 1
    else:
        v = _flipbit(v, c)

    return depth, v

def _flipbit(v, node):
    # converting bit strings to longs is slow
    bit = (hash(node) & 0xffffffff) % _vecbits
    return v ^ (1<<bit)

def ctxpvec(ctx):
    '''construct a pvec for ctx while filling in the cache'''
-    r = ctx._repo
+    r = ctx.repo()
    if not util.safehasattr(r, "_pveccache"):
        r._pveccache = {}
    pvc = r._pveccache
    if ctx.rev() not in pvc:
        cl = r.changelog
        for n in xrange(ctx.rev() + 1):
            if n not in pvc:
                node = cl.node(n)
                p1, p2 = cl.parentrevs(n)
                if p1 == nullrev:
                    # start with a 'random' vector at root
                    pvc[n] = (0, _bin((node * 3)[:_vecbytes]))
                elif p2 == nullrev:
                    d, v = pvc[p1]
                    pvc[n] = (d + 1, _flipbit(v, node))
                else:
                    pvc[n] = _mergevec(pvc[p1], pvc[p2], node)
    bs = _join(*pvc[ctx.rev()])
    return pvec(base85.b85encode(bs))

class pvec(object):
    def __init__(self, hashorctx):
        if isinstance(hashorctx, str):
            self._bs = hashorctx
            self._depth, self._vec = _split(base85.b85decode(hashorctx))
        else:
            self._vec = ctxpvec(hashorctx)

    def __str__(self):
        return self._bs

    def __eq__(self, b):
        return self._vec == b._vec and self._depth == b._depth

    def __lt__(self, b):
        delta = b._depth - self._depth
        if delta < 0:
            return False # always correct
        if _hamming(self._vec, b._vec) > delta:
            return False
        return True

    def __gt__(self, b):
        return b < self

    def __or__(self, b):
        delta = abs(b._depth - self._depth)
        if _hamming(self._vec, b._vec) <= delta:
            return False
        return True

    def __sub__(self, b):
        if self | b:
            raise ValueError("concurrent pvecs")
        return self._depth - b._depth

    def distance(self, b):
        d = abs(b._depth - self._depth)
        h = _hamming(self._vec, b._vec)
        return max(d, h)

    def near(self, b):
        dist = abs(b.depth - self._depth)
        if dist > _radius or _hamming(self._vec, b._vec) > _radius:
            return False
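
The sizing described in the module docstring can be checked with a little arithmetic: 448 bits split into 24 depth bits and 424 vector bits, 448 bits make 56 bytes, and base85 packs every 4 bytes into 5 characters, which yields the 70-character string; the "< 200" figure likewise matches the _radius constant. A minimal standalone check in Python 3 (pvec.py itself is Python 2, so this is not part of the module):

size = 448                   # total bits in a pvec
depthbits = 24               # bits reserved for the depth counter
nbytes = size // 8           # 56 bytes
vecbits = size - depthbits   # 424 bits left for the bit vector
b85chars = nbytes // 4 * 5   # base85 encodes 4 bytes as 5 characters
radius = (vecbits - 30) // 2 # 197, the "< 200" reliability limit

assert (nbytes, vecbits, b85chars, radius) == (56, 424, 70, 197)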
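
The construction rules in the docstring (a root vector seeded from the hash, each child incrementing depth by one and flipping a single hash-derived bit) are what make the linear-history property hold: along a chain of ordinary commits the hamming distance between two pvecs can never exceed their depth difference. A small standalone simulation in Python 3, using sha1 as a stand-in for Mercurial node hashes (hypothetical names, not part of pvec.py):

import hashlib

VECBITS = 424  # vector width derived above

def flipbit(vec, nodeid):
    # stand-in for _flipbit: choose one bit position from a hash of the node id
    bit = int.from_bytes(hashlib.sha1(nodeid).digest()[:4], 'big') % VECBITS
    return vec ^ (1 << bit)

# root changeset: depth 0, vector seeded from its (fake) hash
depth = 0
vec = int.from_bytes(hashlib.sha1(b'root').digest(), 'big') % (1 << VECBITS)
rootvec = vec

for i in range(10):  # ten linear descendants
    depth += 1
    vec = flipbit(vec, ('commit %d' % i).encode())

# hamming distance along a linear chain never exceeds the depth delta
assert bin(rootvec ^ vec).count('1') <= depth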
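
The comment inside _mergevec compresses a small calculation: with a depth difference of 10 and a hamming distance of 100, the implied common ancestor sits 55 steps above the deeper head and 45 above the shallower one, so 45 differing bits are copied across and the merged depth grows by 45. Worked through in standalone Python 3 (illustrative only):

hdist, ddist = 100, 10              # hamming distance and depth difference
changes = (hdist - ddist + 1) // 2  # bits copied from the other vector: 45
up = changes + ddist                # steps from the deeper head to the ancestor: 55

assert (changes, up) == (45, 55)
assert up + changes == hdist and up - changes == ddist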
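
The head-comparison scenario in the "Uses" section maps onto the operators the pvec class defines (==, <, >, |, - and distance). A hypothetical usage sketch, assuming an already-opened repo object and two revision identifiers rev1 and rev2 (names invented for illustration; error handling omitted):

from mercurial import pvec

pa = pvec.ctxpvec(repo[rev1])  # repo, rev1 and rev2 are assumed to exist
pb = pvec.ctxpvec(repo[rev2])

if pa == pb:
    print('same changeset')
elif pa < pb:
    print('rev1 is an ancestor, roughly %d changesets back' % (pb - pa))
elif pa > pb:
    print('rev1 is a descendant, roughly %d changesets ahead' % (pa - pb))
elif pa | pb:
    print('divergent heads; heuristic distance %d' % pa.distance(pb))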