##// END OF EJS Templates
pvec: drop an unused `from __future__ import division`...
Martin von Zweigbergk -
r45093:a89aa2d7 default
parent child Browse files
Show More
@@ -1,228 +1,228 b''
1 # pvec.py - probabilistic vector clocks for Mercurial
1 # pvec.py - probabilistic vector clocks for Mercurial
2 #
2 #
3 # Copyright 2012 Matt Mackall <mpm@selenic.com>
3 # Copyright 2012 Matt Mackall <mpm@selenic.com>
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8 '''
8 '''
9 A "pvec" is a changeset property based on the theory of vector clocks
9 A "pvec" is a changeset property based on the theory of vector clocks
10 that can be compared to discover relatedness without consulting a
10 that can be compared to discover relatedness without consulting a
11 graph. This can be useful for tasks like determining how a
11 graph. This can be useful for tasks like determining how a
12 disconnected patch relates to a repository.
12 disconnected patch relates to a repository.
13
13
14 Currently a pvec consist of 448 bits, of which 24 are 'depth' and the
14 Currently a pvec consist of 448 bits, of which 24 are 'depth' and the
15 remainder are a bit vector. It is represented as a 70-character base85
15 remainder are a bit vector. It is represented as a 70-character base85
16 string.
16 string.
17
17
18 Construction:
18 Construction:
19
19
20 - a root changeset has a depth of 0 and a bit vector based on its hash
20 - a root changeset has a depth of 0 and a bit vector based on its hash
21 - a normal commit has a changeset where depth is increased by one and
21 - a normal commit has a changeset where depth is increased by one and
22 one bit vector bit is flipped based on its hash
22 one bit vector bit is flipped based on its hash
23 - a merge changeset pvec is constructed by copying changes from one pvec into
23 - a merge changeset pvec is constructed by copying changes from one pvec into
24 the other to balance its depth
24 the other to balance its depth
25
25
26 Properties:
26 Properties:
27
27
28 - for linear changes, difference in depth is always <= hamming distance
28 - for linear changes, difference in depth is always <= hamming distance
29 - otherwise, changes are probably divergent
29 - otherwise, changes are probably divergent
30 - when hamming distance is < 200, we can reliably detect when pvecs are near
30 - when hamming distance is < 200, we can reliably detect when pvecs are near
31
31
32 Issues:
32 Issues:
33
33
34 - hamming distance ceases to work over distances of ~ 200
34 - hamming distance ceases to work over distances of ~ 200
35 - detecting divergence is less accurate when the common ancestor is very close
35 - detecting divergence is less accurate when the common ancestor is very close
36 to either revision or total distance is high
36 to either revision or total distance is high
37 - this could probably be improved by modeling the relation between
37 - this could probably be improved by modeling the relation between
38 delta and hdist
38 delta and hdist
39
39
40 Uses:
40 Uses:
41
41
42 - a patch pvec can be used to locate the nearest available common ancestor for
42 - a patch pvec can be used to locate the nearest available common ancestor for
43 resolving conflicts
43 resolving conflicts
44 - ordering of patches can be established without a DAG
44 - ordering of patches can be established without a DAG
45 - two head pvecs can be compared to determine whether push/pull/merge is needed
45 - two head pvecs can be compared to determine whether push/pull/merge is needed
46 and approximately how many changesets are involved
46 and approximately how many changesets are involved
47 - can be used to find a heuristic divergence measure between changesets on
47 - can be used to find a heuristic divergence measure between changesets on
48 different branches
48 different branches
49 '''
49 '''
50
50
51 from __future__ import absolute_import, division
51 from __future__ import absolute_import
52
52
53 from .node import nullrev
53 from .node import nullrev
54 from . import (
54 from . import (
55 pycompat,
55 pycompat,
56 util,
56 util,
57 )
57 )
58
58
# pvec layout parameters: 448 bits total, of which the first 24 bits
# (3 bytes) encode the depth counter and the remaining 424 bits form
# the bit vector (see the module docstring for the underlying theory)
_size = 448  # 70 chars b85-encoded
_bytes = _size // 8
_depthbits = 24
_depthbytes = _depthbits // 8
_vecbytes = _bytes - _depthbytes
_vecbits = _vecbytes * 8
_radius = (_vecbits - 30) // 2  # high probability vectors are related
66
66
67
67
68 def _bin(bs):
68 def _bin(bs):
69 '''convert a bytestring to a long'''
69 '''convert a bytestring to a long'''
70 v = 0
70 v = 0
71 for b in bs:
71 for b in bs:
72 v = v * 256 + ord(b)
72 v = v * 256 + ord(b)
73 return v
73 return v
74
74
75
75
def _str(v, l):
    # type: (int, int) -> bytes
    '''encode the low l bytes of int v as a big-endian bytestring'''
    # collect bytes least-significant first, then reverse -- equivalent
    # to the prepend-in-a-loop formulation, without quadratic copying
    chunks = []
    for _ in pycompat.xrange(l):
        chunks.append(pycompat.bytechr(v & 255))
        v >>= 8
    chunks.reverse()
    return b"".join(chunks)
83
83
84
84
def _split(b):
    '''split an encoded pvec bytestring into (depth, bitvec) longs'''
    depth_part = b[:_depthbytes]
    vec_part = b[_depthbytes:]
    return _bin(depth_part), _bin(vec_part)
88
88
89
89
def _join(depth, bitvec):
    '''inverse of _split: encode depth and bitvec into one bytestring'''
    encoded_depth = _str(depth, _depthbytes)
    encoded_vec = _str(bitvec, _vecbytes)
    return encoded_depth + encoded_vec
92
92
93
93
94 def _hweight(x):
94 def _hweight(x):
95 c = 0
95 c = 0
96 while x:
96 while x:
97 if x & 1:
97 if x & 1:
98 c += 1
98 c += 1
99 x >>= 1
99 x >>= 1
100 return c
100 return c
101
101
102
102
# precomputed Hamming weight of every possible byte value; lets
# _hamming() consume eight bits per iteration with a table lookup
_htab = [_hweight(x) for x in pycompat.xrange(256)]
104
104
105
105
def _hamming(a, b):
    '''find the hamming distance between two longs'''
    # population count of the XOR: exactly the bits where a and b differ;
    # same result as the byte-table loop for the non-negative vectors
    # this module works with
    return bin(a ^ b).count('1')
114
114
115
115
def _mergevec(x, y, c):
    '''merge two (depth, bitvec) pvec tuples for a merge changeset

    x, y are the parents' pvecs; c is the merge node, used only to flip
    a hash-chosen bit when the parent vectors are identical.
    Returns the merged (depth, bitvec) tuple.
    '''
    # Ideally, this function would be x ^ y ^ ancestor, but finding
    # ancestors is a nuisance. So instead we find the minimal number
    # of changes to balance the depth and hamming distance

    d1, v1 = x
    d2, v2 = y
    if d1 < d2:
        # normalize so (d1, v1) is the deeper parent
        d1, d2, v1, v2 = d2, d1, v2, v1

    hdist = _hamming(v1, v2)
    ddist = d1 - d2
    v = v1
    m = v1 ^ v2  # mask of different bits
    i = 1

    if hdist > ddist:
        # if delta = 10 and hdist = 100, then we need to go up 55 steps
        # to the ancestor and down 45
        changes = (hdist - ddist + 1) // 2
    else:
        # must make at least one change
        changes = 1
    depth = d1 + changes

    # copy changes from v2
    if m:
        # flip the lowest `changes` bits that differ between the parents
        while changes:
            if m & i:
                v ^= i
                changes -= 1
            i <<= 1
    else:
        # identical parent vectors: flip one bit derived from the merge node
        v = _flipbit(v, c)

    return depth, v
152
152
153
153
def _flipbit(v, node):
    '''flip one bit of vector v, chosen deterministically from node'''
    # converting bit strings to longs is slow, so hash the node instead
    bitpos = (hash(node) & 0xFFFFFFFF) % _vecbits
    mask = 1 << bitpos
    return v ^ mask
158
158
159
159
def ctxpvec(ctx):
    '''construct a pvec for ctx while filling in the cache'''
    r = ctx.repo()
    # lazily attach a rev -> (depth, bitvec) cache to the repo object
    if not util.safehasattr(r, "_pveccache"):
        r._pveccache = {}
    pvc = r._pveccache
    if ctx.rev() not in pvc:
        cl = r.changelog
        # pvecs are derived from parent pvecs, so compute bottom-up for
        # every revision from 0 through ctx's rev, skipping cached ones
        for n in pycompat.xrange(ctx.rev() + 1):
            if n not in pvc:
                node = cl.node(n)
                p1, p2 = cl.parentrevs(n)
                if p1 == nullrev:
                    # start with a 'random' vector at root
                    pvc[n] = (0, _bin((node * 3)[:_vecbytes]))
                elif p2 == nullrev:
                    # linear commit: depth + 1, flip one hash-chosen bit
                    d, v = pvc[p1]
                    pvc[n] = (d + 1, _flipbit(v, node))
                else:
                    # merge: balance the two parent pvecs
                    pvc[n] = _mergevec(pvc[p1], pvc[p2], node)
    bs = _join(*pvc[ctx.rev()])
    return pvec(util.b85encode(bs))
182
182
183
183
class pvec(object):
    """A probabilistic vector clock: a (depth, bitvec) pair, base85-encoded
    as a 70-character string, supporting ancestry comparisons (see the
    module docstring for the underlying theory)."""

    def __init__(self, hashorctx):
        if isinstance(hashorctx, bytes):
            # an already-encoded base85 pvec string
            self._bs = hashorctx
            self._depth, self._vec = _split(util.b85decode(hashorctx))
        else:
            # a changectx: compute (and cache) its pvec, then copy the
            # fields.  Previously the pvec object returned by ctxpvec()
            # was assigned directly to self._vec, which left _bs and
            # _depth unset and broke every comparison method for
            # ctx-constructed instances.
            other = ctxpvec(hashorctx)
            self._bs = other._bs
            self._depth = other._depth
            self._vec = other._vec

    def __str__(self):
        # NOTE(review): returns bytes; under Python 3, str() on a pvec
        # would therefore raise TypeError -- confirm intended usage
        return self._bs

    def __eq__(self, b):
        return self._vec == b._vec and self._depth == b._depth

    def __lt__(self, b):
        '''true if self is plausibly an ancestor of b'''
        delta = b._depth - self._depth
        if delta < 0:
            return False  # always correct
        if _hamming(self._vec, b._vec) > delta:
            return False
        return True

    def __gt__(self, b):
        return b < self

    def __or__(self, b):
        '''true if self and b appear divergent (concurrent)'''
        delta = abs(b._depth - self._depth)
        if _hamming(self._vec, b._vec) <= delta:
            return False
        return True

    def __sub__(self, b):
        '''depth difference; only valid for non-divergent pvecs'''
        if self | b:
            raise ValueError(b"concurrent pvecs")
        return self._depth - b._depth

    def distance(self, b):
        '''max of depth distance and hamming distance between self and b'''
        d = abs(b._depth - self._depth)
        h = _hamming(self._vec, b._vec)
        return max(d, h)

    def near(self, b):
        '''true if self and b are close enough for reliable comparison'''
        # fix: was b.depth (no such attribute -> AttributeError); the
        # method also fell off the end, implicitly returning None (falsy)
        # even when the pvecs were near -- now returns True explicitly
        dist = abs(b._depth - self._depth)
        if dist > _radius or _hamming(self._vec, b._vec) > _radius:
            return False
        return True
General Comments 0
You need to be logged in to leave comments. Login now