##// END OF EJS Templates
pvec: add an explicit type hint to help pytype...
Augie Fackler -
r43780:8e89b6e1 default
parent child Browse files
Show More
@@ -1,227 +1,228 b''
1 # pvec.py - probabilistic vector clocks for Mercurial
1 # pvec.py - probabilistic vector clocks for Mercurial
2 #
2 #
3 # Copyright 2012 Matt Mackall <mpm@selenic.com>
3 # Copyright 2012 Matt Mackall <mpm@selenic.com>
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8 '''
8 '''
9 A "pvec" is a changeset property based on the theory of vector clocks
9 A "pvec" is a changeset property based on the theory of vector clocks
10 that can be compared to discover relatedness without consulting a
10 that can be compared to discover relatedness without consulting a
11 graph. This can be useful for tasks like determining how a
11 graph. This can be useful for tasks like determining how a
12 disconnected patch relates to a repository.
12 disconnected patch relates to a repository.
13
13
14 Currently a pvec consists of 448 bits, of which 24 are 'depth' and the
14 Currently a pvec consists of 448 bits, of which 24 are 'depth' and the
15 remainder are a bit vector. It is represented as a 70-character base85
15 remainder are a bit vector. It is represented as a 70-character base85
16 string.
16 string.
17
17
18 Construction:
18 Construction:
19
19
20 - a root changeset has a depth of 0 and a bit vector based on its hash
20 - a root changeset has a depth of 0 and a bit vector based on its hash
21 - a normal commit has a changeset where depth is increased by one and
21 - a normal commit has a changeset where depth is increased by one and
22 one bit vector bit is flipped based on its hash
22 one bit vector bit is flipped based on its hash
23 - a merge changeset pvec is constructed by copying changes from one pvec into
23 - a merge changeset pvec is constructed by copying changes from one pvec into
24 the other to balance its depth
24 the other to balance its depth
25
25
26 Properties:
26 Properties:
27
27
28 - for linear changes, difference in depth is always <= hamming distance
28 - for linear changes, difference in depth is always <= hamming distance
29 - otherwise, changes are probably divergent
29 - otherwise, changes are probably divergent
30 - when hamming distance is < 200, we can reliably detect when pvecs are near
30 - when hamming distance is < 200, we can reliably detect when pvecs are near
31
31
32 Issues:
32 Issues:
33
33
34 - hamming distance ceases to work over distances of ~ 200
34 - hamming distance ceases to work over distances of ~ 200
35 - detecting divergence is less accurate when the common ancestor is very close
35 - detecting divergence is less accurate when the common ancestor is very close
36 to either revision or total distance is high
36 to either revision or total distance is high
37 - this could probably be improved by modeling the relation between
37 - this could probably be improved by modeling the relation between
38 delta and hdist
38 delta and hdist
39
39
40 Uses:
40 Uses:
41
41
42 - a patch pvec can be used to locate the nearest available common ancestor for
42 - a patch pvec can be used to locate the nearest available common ancestor for
43 resolving conflicts
43 resolving conflicts
44 - ordering of patches can be established without a DAG
44 - ordering of patches can be established without a DAG
45 - two head pvecs can be compared to determine whether push/pull/merge is needed
45 - two head pvecs can be compared to determine whether push/pull/merge is needed
46 and approximately how many changesets are involved
46 and approximately how many changesets are involved
47 - can be used to find a heuristic divergence measure between changesets on
47 - can be used to find a heuristic divergence measure between changesets on
48 different branches
48 different branches
49 '''
49 '''
50
50
51 from __future__ import absolute_import, division
51 from __future__ import absolute_import, division
52
52
53 from .node import nullrev
53 from .node import nullrev
54 from . import (
54 from . import (
55 pycompat,
55 pycompat,
56 util,
56 util,
57 )
57 )
58
58
# Layout constants: a pvec packs a 24-bit depth counter followed by a
# 424-bit vector into 56 bytes (70 characters once base85-encoded).
_size = 448  # 70 chars b85-encoded
_bytes = _size // 8
_depthbits = 24
_depthbytes = _depthbits // 8
_vecbytes = _bytes - _depthbytes
_vecbits = _vecbytes * 8
_radius = (_vecbits - 30) // 2  # high probability vectors are related
66
66
67
67
68 def _bin(bs):
68 def _bin(bs):
69 '''convert a bytestring to a long'''
69 '''convert a bytestring to a long'''
70 v = 0
70 v = 0
71 for b in bs:
71 for b in bs:
72 v = v * 256 + ord(b)
72 v = v * 256 + ord(b)
73 return v
73 return v
74
74
75
75
76 def _str(v, l):
76 def _str(v, l):
77 # type: (int, int) -> bytes
77 bs = b""
78 bs = b""
78 for p in pycompat.xrange(l):
79 for p in pycompat.xrange(l):
79 bs = pycompat.bytechr(v & 255) + bs
80 bs = pycompat.bytechr(v & 255) + bs
80 v >>= 8
81 v >>= 8
81 return bs
82 return bs
82
83
83
84
def _split(b):
    '''split a pvec bytestring into its (depth, bitvec) integer pair'''
    depth_part = b[:_depthbytes]
    vec_part = b[_depthbytes:]
    return _bin(depth_part), _bin(vec_part)
87
88
88
89
def _join(depth, bitvec):
    '''serialize a (depth, bitvec) integer pair into a pvec bytestring'''
    encoded_depth = _str(depth, _depthbytes)
    encoded_vec = _str(bitvec, _vecbytes)
    return encoded_depth + encoded_vec
91
92
92
93
93 def _hweight(x):
94 def _hweight(x):
94 c = 0
95 c = 0
95 while x:
96 while x:
96 if x & 1:
97 if x & 1:
97 c += 1
98 c += 1
98 x >>= 1
99 x >>= 1
99 return c
100 return c
100
101
101
102
# Precomputed Hamming weight for every byte value (0-255); lets
# _hamming() consume eight bits per loop iteration via table lookup.
_htab = [_hweight(x) for x in pycompat.xrange(256)]
103
104
104
105
def _hamming(a, b):
    '''find the hamming distance between two longs'''
    # XOR leaves a 1 wherever the operands differ; tally the set bits
    # one byte at a time using the precomputed weight table.
    diff = a ^ b
    count = 0
    while diff:
        count += _htab[diff & 0xFF]
        diff >>= 8
    return count
113
114
114
115
def _mergevec(x, y, c):
    '''combine two parent (depth, bitvec) pairs for a merge changeset.

    x and y are the parents' pvec tuples and c is the merge node used
    only as a tiebreaker when the parent vectors are identical.
    Returns the child's (depth, bitvec) tuple.
    '''
    # Ideally, this function would be x ^ y ^ ancestor, but finding
    # ancestors is a nuisance. So instead we find the minimal number
    # of changes to balance the depth and hamming distance

    d1, v1 = x
    d2, v2 = y
    if d1 < d2:
        # make (d1, v1) the deeper parent
        d1, d2, v1, v2 = d2, d1, v2, v1

    hdist = _hamming(v1, v2)
    ddist = d1 - d2
    v = v1
    m = v1 ^ v2  # mask of different bits
    i = 1

    if hdist > ddist:
        # if delta = 10 and hdist = 100, then we need to go up 55 steps
        # to the ancestor and down 45
        changes = (hdist - ddist + 1) // 2
    else:
        # must make at least one change
        changes = 1
    depth = d1 + changes

    # copy changes from v2
    if m:
        # flip 'changes' of the differing bits, scanning from the low end
        while changes:
            if m & i:
                v ^= i
                changes -= 1
            i <<= 1
    else:
        # parents have identical vectors; flip one hash-derived bit instead
        v = _flipbit(v, c)

    return depth, v
151
152
152
153
def _flipbit(v, node):
    '''return v with a single node-selected bit inverted'''
    # converting bit strings to longs is slow, so select the bit with
    # the builtin hash of the node instead
    # NOTE(review): hash() of a bytestring is salted per-process on
    # Python 3 (PYTHONHASHSEED), so the chosen bit may differ between
    # runs — confirm whether cross-process stability matters here.
    which = (hash(node) & 0xFFFFFFFF) % _vecbits
    mask = 1 << which
    return v ^ mask
157
158
158
159
def ctxpvec(ctx):
    '''construct a pvec for ctx while filling in the cache'''
    r = ctx.repo()
    # lazily attach a {rev: (depth, bitvec)} cache to the repo object
    if not util.safehasattr(r, "_pveccache"):
        r._pveccache = {}
    pvc = r._pveccache
    if ctx.rev() not in pvc:
        cl = r.changelog
        # walk every revision up to ctx in topological (rev-number)
        # order so both parents are always cached before their child
        for n in pycompat.xrange(ctx.rev() + 1):
            if n not in pvc:
                node = cl.node(n)
                p1, p2 = cl.parentrevs(n)
                if p1 == nullrev:
                    # start with a 'random' vector at root
                    pvc[n] = (0, _bin((node * 3)[:_vecbytes]))
                elif p2 == nullrev:
                    # linear commit: bump depth, flip one hash-chosen bit
                    d, v = pvc[p1]
                    pvc[n] = (d + 1, _flipbit(v, node))
                else:
                    # merge: balance the two parent vectors
                    pvc[n] = _mergevec(pvc[p1], pvc[p2], node)
    bs = _join(*pvc[ctx.rev()])
    return pvec(util.b85encode(bs))
181
182
182
183
class pvec(object):
    '''a changeset property vector comparable for relatedness.

    Instances support < and > (probabilistic ancestry), | (divergence
    test), - (depth distance), distance() and near().
    '''

    def __init__(self, hashorctx):
        # Accept either a b85-encoded pvec bytestring or a changectx.
        # Check bytes (== str on Python 2) rather than str so encoded
        # pvecs are also recognized on Python 3, where ctxpvec()
        # produces bytes via util.b85encode.
        if isinstance(hashorctx, bytes):
            self._bs = hashorctx
            self._depth, self._vec = _split(util.b85decode(hashorctx))
        else:
            # Copy the computed pvec's state so _bs/_depth/_vec are
            # always populated. The previous code stored the whole
            # pvec instance in self._vec and left _depth/_bs unset,
            # making every subsequent comparison raise AttributeError.
            other = ctxpvec(hashorctx)
            self._bs = other._bs
            self._depth = other._depth
            self._vec = other._vec

    def __str__(self):
        # NOTE(review): _bs is a bytestring, so on Python 3 this
        # returns bytes from __str__ — confirm intended by callers.
        return self._bs

    # NOTE(review): defining __eq__ without __hash__ makes instances
    # unhashable on Python 3 — confirm instances are never dict keys.
    def __eq__(self, b):
        return self._vec == b._vec and self._depth == b._depth

    def __lt__(self, b):
        '''probabilistically, is self an ancestor of b?'''
        delta = b._depth - self._depth
        if delta < 0:
            return False  # always correct
        if _hamming(self._vec, b._vec) > delta:
            return False
        return True

    def __gt__(self, b):
        return b < self

    def __or__(self, b):
        '''have self and b diverged?'''
        delta = abs(b._depth - self._depth)
        if _hamming(self._vec, b._vec) <= delta:
            return False
        return True

    def __sub__(self, b):
        '''depth distance from b to self; raises on divergent pvecs'''
        if self | b:
            raise ValueError(b"concurrent pvecs")
        return self._depth - b._depth

    def distance(self, b):
        '''estimated number of changesets separating self and b'''
        d = abs(b._depth - self._depth)
        h = _hamming(self._vec, b._vec)
        return max(d, h)

    def near(self, b):
        '''are self and b close enough for reliable comparison?'''
        # was b.depth, which raised AttributeError (no such attribute)
        dist = abs(b._depth - self._depth)
        if dist > _radius or _hamming(self._vec, b._vec) > _radius:
            return False
        # the original fell off the end here, returning None
        return True
General Comments 0
You need to be logged in to leave comments. Login now