##// END OF EJS Templates
pvec: fix overlooked chr() call...
Augie Fackler -
r43764:271af23d default
parent child Browse files
Show More
@@ -1,227 +1,227 b''
1 # pvec.py - probabilistic vector clocks for Mercurial
1 # pvec.py - probabilistic vector clocks for Mercurial
2 #
2 #
3 # Copyright 2012 Matt Mackall <mpm@selenic.com>
3 # Copyright 2012 Matt Mackall <mpm@selenic.com>
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8 '''
8 '''
9 A "pvec" is a changeset property based on the theory of vector clocks
9 A "pvec" is a changeset property based on the theory of vector clocks
10 that can be compared to discover relatedness without consulting a
10 that can be compared to discover relatedness without consulting a
11 graph. This can be useful for tasks like determining how a
11 graph. This can be useful for tasks like determining how a
12 disconnected patch relates to a repository.
12 disconnected patch relates to a repository.
13
13
14 Currently a pvec consist of 448 bits, of which 24 are 'depth' and the
14 Currently a pvec consist of 448 bits, of which 24 are 'depth' and the
15 remainder are a bit vector. It is represented as a 70-character base85
15 remainder are a bit vector. It is represented as a 70-character base85
16 string.
16 string.
17
17
18 Construction:
18 Construction:
19
19
20 - a root changeset has a depth of 0 and a bit vector based on its hash
20 - a root changeset has a depth of 0 and a bit vector based on its hash
21 - a normal commit has a changeset where depth is increased by one and
21 - a normal commit has a changeset where depth is increased by one and
22 one bit vector bit is flipped based on its hash
22 one bit vector bit is flipped based on its hash
23 - a merge changeset pvec is constructed by copying changes from one pvec into
23 - a merge changeset pvec is constructed by copying changes from one pvec into
24 the other to balance its depth
24 the other to balance its depth
25
25
26 Properties:
26 Properties:
27
27
28 - for linear changes, difference in depth is always <= hamming distance
28 - for linear changes, difference in depth is always <= hamming distance
29 - otherwise, changes are probably divergent
29 - otherwise, changes are probably divergent
30 - when hamming distance is < 200, we can reliably detect when pvecs are near
30 - when hamming distance is < 200, we can reliably detect when pvecs are near
31
31
32 Issues:
32 Issues:
33
33
34 - hamming distance ceases to work over distances of ~ 200
34 - hamming distance ceases to work over distances of ~ 200
35 - detecting divergence is less accurate when the common ancestor is very close
35 - detecting divergence is less accurate when the common ancestor is very close
36 to either revision or total distance is high
36 to either revision or total distance is high
37 - this could probably be improved by modeling the relation between
37 - this could probably be improved by modeling the relation between
38 delta and hdist
38 delta and hdist
39
39
40 Uses:
40 Uses:
41
41
42 - a patch pvec can be used to locate the nearest available common ancestor for
42 - a patch pvec can be used to locate the nearest available common ancestor for
43 resolving conflicts
43 resolving conflicts
44 - ordering of patches can be established without a DAG
44 - ordering of patches can be established without a DAG
45 - two head pvecs can be compared to determine whether push/pull/merge is needed
45 - two head pvecs can be compared to determine whether push/pull/merge is needed
46 and approximately how many changesets are involved
46 and approximately how many changesets are involved
47 - can be used to find a heuristic divergence measure between changesets on
47 - can be used to find a heuristic divergence measure between changesets on
48 different branches
48 different branches
49 '''
49 '''
50
50
51 from __future__ import absolute_import
51 from __future__ import absolute_import
52
52
53 from .node import nullrev
53 from .node import nullrev
54 from . import (
54 from . import (
55 pycompat,
55 pycompat,
56 util,
56 util,
57 )
57 )
58
58
59 _size = 448 # 70 chars b85-encoded
59 _size = 448 # 70 chars b85-encoded
60 _bytes = _size / 8
60 _bytes = _size / 8
61 _depthbits = 24
61 _depthbits = 24
62 _depthbytes = _depthbits / 8
62 _depthbytes = _depthbits / 8
63 _vecbytes = _bytes - _depthbytes
63 _vecbytes = _bytes - _depthbytes
64 _vecbits = _vecbytes * 8
64 _vecbits = _vecbytes * 8
65 _radius = (_vecbits - 30) / 2 # high probability vectors are related
65 _radius = (_vecbits - 30) / 2 # high probability vectors are related
66
66
67
67
68 def _bin(bs):
68 def _bin(bs):
69 '''convert a bytestring to a long'''
69 '''convert a bytestring to a long'''
70 v = 0
70 v = 0
71 for b in bs:
71 for b in bs:
72 v = v * 256 + ord(b)
72 v = v * 256 + ord(b)
73 return v
73 return v
74
74
75
75
76 def _str(v, l):
76 def _str(v, l):
77 bs = b""
77 bs = b""
78 for p in pycompat.xrange(l):
78 for p in pycompat.xrange(l):
79 bs = chr(v & 255) + bs
79 bs = pycompat.bytechr(v & 255) + bs
80 v >>= 8
80 v >>= 8
81 return bs
81 return bs
82
82
83
83
84 def _split(b):
84 def _split(b):
85 '''depth and bitvec'''
85 '''depth and bitvec'''
86 return _bin(b[:_depthbytes]), _bin(b[_depthbytes:])
86 return _bin(b[:_depthbytes]), _bin(b[_depthbytes:])
87
87
88
88
89 def _join(depth, bitvec):
89 def _join(depth, bitvec):
90 return _str(depth, _depthbytes) + _str(bitvec, _vecbytes)
90 return _str(depth, _depthbytes) + _str(bitvec, _vecbytes)
91
91
92
92
93 def _hweight(x):
93 def _hweight(x):
94 c = 0
94 c = 0
95 while x:
95 while x:
96 if x & 1:
96 if x & 1:
97 c += 1
97 c += 1
98 x >>= 1
98 x >>= 1
99 return c
99 return c
100
100
101
101
102 _htab = [_hweight(x) for x in pycompat.xrange(256)]
102 _htab = [_hweight(x) for x in pycompat.xrange(256)]
103
103
104
104
105 def _hamming(a, b):
105 def _hamming(a, b):
106 '''find the hamming distance between two longs'''
106 '''find the hamming distance between two longs'''
107 d = a ^ b
107 d = a ^ b
108 c = 0
108 c = 0
109 while d:
109 while d:
110 c += _htab[d & 0xFF]
110 c += _htab[d & 0xFF]
111 d >>= 8
111 d >>= 8
112 return c
112 return c
113
113
114
114
115 def _mergevec(x, y, c):
115 def _mergevec(x, y, c):
116 # Ideally, this function would be x ^ y ^ ancestor, but finding
116 # Ideally, this function would be x ^ y ^ ancestor, but finding
117 # ancestors is a nuisance. So instead we find the minimal number
117 # ancestors is a nuisance. So instead we find the minimal number
118 # of changes to balance the depth and hamming distance
118 # of changes to balance the depth and hamming distance
119
119
120 d1, v1 = x
120 d1, v1 = x
121 d2, v2 = y
121 d2, v2 = y
122 if d1 < d2:
122 if d1 < d2:
123 d1, d2, v1, v2 = d2, d1, v2, v1
123 d1, d2, v1, v2 = d2, d1, v2, v1
124
124
125 hdist = _hamming(v1, v2)
125 hdist = _hamming(v1, v2)
126 ddist = d1 - d2
126 ddist = d1 - d2
127 v = v1
127 v = v1
128 m = v1 ^ v2 # mask of different bits
128 m = v1 ^ v2 # mask of different bits
129 i = 1
129 i = 1
130
130
131 if hdist > ddist:
131 if hdist > ddist:
132 # if delta = 10 and hdist = 100, then we need to go up 55 steps
132 # if delta = 10 and hdist = 100, then we need to go up 55 steps
133 # to the ancestor and down 45
133 # to the ancestor and down 45
134 changes = (hdist - ddist + 1) / 2
134 changes = (hdist - ddist + 1) / 2
135 else:
135 else:
136 # must make at least one change
136 # must make at least one change
137 changes = 1
137 changes = 1
138 depth = d1 + changes
138 depth = d1 + changes
139
139
140 # copy changes from v2
140 # copy changes from v2
141 if m:
141 if m:
142 while changes:
142 while changes:
143 if m & i:
143 if m & i:
144 v ^= i
144 v ^= i
145 changes -= 1
145 changes -= 1
146 i <<= 1
146 i <<= 1
147 else:
147 else:
148 v = _flipbit(v, c)
148 v = _flipbit(v, c)
149
149
150 return depth, v
150 return depth, v
151
151
152
152
153 def _flipbit(v, node):
153 def _flipbit(v, node):
154 # converting bit strings to longs is slow
154 # converting bit strings to longs is slow
155 bit = (hash(node) & 0xFFFFFFFF) % _vecbits
155 bit = (hash(node) & 0xFFFFFFFF) % _vecbits
156 return v ^ (1 << bit)
156 return v ^ (1 << bit)
157
157
158
158
159 def ctxpvec(ctx):
159 def ctxpvec(ctx):
160 '''construct a pvec for ctx while filling in the cache'''
160 '''construct a pvec for ctx while filling in the cache'''
161 r = ctx.repo()
161 r = ctx.repo()
162 if not util.safehasattr(r, "_pveccache"):
162 if not util.safehasattr(r, "_pveccache"):
163 r._pveccache = {}
163 r._pveccache = {}
164 pvc = r._pveccache
164 pvc = r._pveccache
165 if ctx.rev() not in pvc:
165 if ctx.rev() not in pvc:
166 cl = r.changelog
166 cl = r.changelog
167 for n in pycompat.xrange(ctx.rev() + 1):
167 for n in pycompat.xrange(ctx.rev() + 1):
168 if n not in pvc:
168 if n not in pvc:
169 node = cl.node(n)
169 node = cl.node(n)
170 p1, p2 = cl.parentrevs(n)
170 p1, p2 = cl.parentrevs(n)
171 if p1 == nullrev:
171 if p1 == nullrev:
172 # start with a 'random' vector at root
172 # start with a 'random' vector at root
173 pvc[n] = (0, _bin((node * 3)[:_vecbytes]))
173 pvc[n] = (0, _bin((node * 3)[:_vecbytes]))
174 elif p2 == nullrev:
174 elif p2 == nullrev:
175 d, v = pvc[p1]
175 d, v = pvc[p1]
176 pvc[n] = (d + 1, _flipbit(v, node))
176 pvc[n] = (d + 1, _flipbit(v, node))
177 else:
177 else:
178 pvc[n] = _mergevec(pvc[p1], pvc[p2], node)
178 pvc[n] = _mergevec(pvc[p1], pvc[p2], node)
179 bs = _join(*pvc[ctx.rev()])
179 bs = _join(*pvc[ctx.rev()])
180 return pvec(util.b85encode(bs))
180 return pvec(util.b85encode(bs))
181
181
182
182
183 class pvec(object):
183 class pvec(object):
184 def __init__(self, hashorctx):
184 def __init__(self, hashorctx):
185 if isinstance(hashorctx, str):
185 if isinstance(hashorctx, str):
186 self._bs = hashorctx
186 self._bs = hashorctx
187 self._depth, self._vec = _split(util.b85decode(hashorctx))
187 self._depth, self._vec = _split(util.b85decode(hashorctx))
188 else:
188 else:
189 self._vec = ctxpvec(hashorctx)
189 self._vec = ctxpvec(hashorctx)
190
190
191 def __str__(self):
191 def __str__(self):
192 return self._bs
192 return self._bs
193
193
194 def __eq__(self, b):
194 def __eq__(self, b):
195 return self._vec == b._vec and self._depth == b._depth
195 return self._vec == b._vec and self._depth == b._depth
196
196
197 def __lt__(self, b):
197 def __lt__(self, b):
198 delta = b._depth - self._depth
198 delta = b._depth - self._depth
199 if delta < 0:
199 if delta < 0:
200 return False # always correct
200 return False # always correct
201 if _hamming(self._vec, b._vec) > delta:
201 if _hamming(self._vec, b._vec) > delta:
202 return False
202 return False
203 return True
203 return True
204
204
205 def __gt__(self, b):
205 def __gt__(self, b):
206 return b < self
206 return b < self
207
207
208 def __or__(self, b):
208 def __or__(self, b):
209 delta = abs(b._depth - self._depth)
209 delta = abs(b._depth - self._depth)
210 if _hamming(self._vec, b._vec) <= delta:
210 if _hamming(self._vec, b._vec) <= delta:
211 return False
211 return False
212 return True
212 return True
213
213
214 def __sub__(self, b):
214 def __sub__(self, b):
215 if self | b:
215 if self | b:
216 raise ValueError(b"concurrent pvecs")
216 raise ValueError(b"concurrent pvecs")
217 return self._depth - b._depth
217 return self._depth - b._depth
218
218
219 def distance(self, b):
219 def distance(self, b):
220 d = abs(b._depth - self._depth)
220 d = abs(b._depth - self._depth)
221 h = _hamming(self._vec, b._vec)
221 h = _hamming(self._vec, b._vec)
222 return max(d, h)
222 return max(d, h)
223
223
224 def near(self, b):
224 def near(self, b):
225 dist = abs(b.depth - self._depth)
225 dist = abs(b.depth - self._depth)
226 if dist > _radius or _hamming(self._vec, b._vec) > _radius:
226 if dist > _radius or _hamming(self._vec, b._vec) > _radius:
227 return False
227 return False
General Comments 0
You need to be logged in to leave comments. Login now