Add lazy{parser,index,map} to speed up processing of index files
mpm@selenic.com
r76:d993ebd6 default
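
The change below replaces the eager index parse in revlog.__init__ with three lazy wrappers: lazyparser keeps the raw index bytes and unpacks entries in blocks of 1000 on demand, while lazyindex and lazymap present those entries as the familiar list and node-to-rev mapping. A minimal, self-contained sketch of the idea follows; the record format and names are illustrative, not the actual revlog index layout.

import struct

RECORD = ">l20s"                     # illustrative: one int plus a 20-byte id
RECSIZE = struct.calcsize(RECORD)

class LazyIndex:
    """Unpack fixed-size records from raw bytes only when first read."""
    def __init__(self, data):
        self.data = data
        self.entries = [None] * (len(data) // RECSIZE)

    def __len__(self):
        return len(self.entries)

    def __getitem__(self, pos):
        if self.entries[pos] is None:
            raw = self.data[pos * RECSIZE:(pos + 1) * RECSIZE]
            self.entries[pos] = struct.unpack(RECORD, raw)
        return self.entries[pos]
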
@@ -1,421 +1,473 @@
1 1 # revlog.py - storage back-end for mercurial
2 2 #
3 3 # This provides efficient delta storage with O(1) retrieve and append
4 4 # and O(changes) merge between branches
5 5 #
6 6 # Copyright 2005 Matt Mackall <mpm@selenic.com>
7 7 #
8 8 # This software may be used and distributed according to the terms
9 9 # of the GNU General Public License, incorporated herein by reference.
10 10
11 11 import zlib, struct, sha, os, tempfile, binascii
12 12 from mercurial import mdiff
13 13
14 14 def hex(node): return binascii.hexlify(node)
15 15 def bin(node): return binascii.unhexlify(node)
16 16
17 17 def compress(text):
18 18 return zlib.compress(text)
19 19
20 20 def decompress(bin):
21 21 return zlib.decompress(bin)
22 22
23 23 def hash(text, p1, p2):
24 24 l = [p1, p2]
25 25 l.sort()
26 26 return sha.sha(l[0] + l[1] + text).digest()
27 27
28 28 nullid = "\0" * 20
29 29 indexformat = ">4l20s20s20s"
30 30
31 class lazyparser:
32 def __init__(self, data):
33 self.data = data
34 self.s = struct.calcsize(indexformat)
35 self.l = len(data)/self.s
36 self.index = [None] * self.l
37 self.map = {nullid: -1}
38
39 if 0:
40 n = 0
41 i = self.data
42 s = struct.calcsize(indexformat)
43 for f in xrange(0, len(i), s):
44 # offset, size, base, linkrev, p1, p2, nodeid
45 e = struct.unpack(indexformat, i[f:f + s])
46 self.map[e[6]] = n
47 self.index.append(e)
48 n += 1
49
50 def load(self, pos):
51 block = pos / 1000
52 i = block * 1000
53 end = min(self.l, i + 1000)
54 while i < end:
55 d = self.data[i * self.s: (i + 1) * self.s]
56 e = struct.unpack(indexformat, d)
57 self.index[i] = e
58 self.map[e[6]] = i
59 i += 1
60
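
# Sketch of the block rounding used by load() above: a lookup at position
# `pos` pulls in its whole 1000-entry block, so nearby lookups are already
# parsed. Names and block size mirror the code; nothing else is implied.
def block_range(pos, total, blocksize=1000):
    start = (pos // blocksize) * blocksize
    return start, min(total, start + blocksize)
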
61 class lazyindex:
62 def __init__(self, parser):
63 self.p = parser
64 def __len__(self):
65 return len(self.p.index)
66 def __getitem__(self, pos):
67 i = self.p.index[pos]
68 if not i:
69 self.p.load(pos)
70 return self.p.index[pos]
71 return i
72 def append(self, e):
73 self.p.index.append(e)
74
75 class lazymap:
76 def __init__(self, parser):
77 self.p = parser
78 def load(self, key):
79 n = self.p.data.find(key)
80 if n < 0: raise KeyError(key)
81 pos = n / self.p.s
82 self.p.load(pos)
83 def __contains__(self, key):
84 try:
85 self.p.map[key]
86 return True
87 except KeyError:
88 return False
89 def __getitem__(self, key):
90 try:
91 return self.p.map[key]
92 except KeyError:
93 self.load(key)
94 return self.p.map[key]
95 def __setitem__(self, key, val):
96 self.p.map[key] = val
97
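
# Rough illustration of lazymap.load() above: the node id is located by a raw
# byte search over the index data, and integer division by the record size
# recovers the record number, since the id lies inside its own record.
def find_record(data, node, recsize):
    offset = data.find(node)
    if offset < 0:
        raise KeyError(node)
    return offset // recsize
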
31 98 class revlog:
32 99 def __init__(self, opener, indexfile, datafile):
33 100 self.indexfile = indexfile
34 101 self.datafile = datafile
35 self.index = []
36 102 self.opener = opener
37 103 self.cache = None
38 104 # read the whole index for now, handle on-demand later
39 105 try:
40 n = 0
41 106 i = self.opener(self.indexfile).read()
42 s = struct.calcsize(indexformat)
43
44 # preallocate arrays
45 l = len(i)/s
46 self.index = [None] * l
47 m = [None] * l
48
49 for f in xrange(0, len(i), s):
50 # offset, size, base, linkrev, p1, p2, nodeid
51 e = struct.unpack(indexformat, i[f:f + s])
52 self.index[n] = e
53 m[n] = (e[6], n)
54 n += 1
55
56 self.nodemap = dict(m)
57 107 except IOError:
58 self.nodemap = {}
59 self.nodemap[nullid] = -1
108 i = ""
109 parser = lazyparser(i)
110 self.index = lazyindex(parser)
111 self.nodemap = lazymap(parser)
60 112
61 113 def tip(self): return self.node(len(self.index) - 1)
62 114 def count(self): return len(self.index)
63 115 def node(self, rev): return (rev < 0) and nullid or self.index[rev][6]
64 116 def rev(self, node): return self.nodemap[node]
65 117 def linkrev(self, node): return self.index[self.nodemap[node]][3]
66 118 def parents(self, node):
67 119 if node == nullid: return (nullid, nullid)
68 120 return self.index[self.nodemap[node]][4:6]
69 121
70 122 def start(self, rev): return self.index[rev][0]
71 123 def length(self, rev): return self.index[rev][1]
72 124 def end(self, rev): return self.start(rev) + self.length(rev)
73 125 def base(self, rev): return self.index[rev][2]
74 126
75 127 def lookup(self, id):
76 128 try:
77 129 rev = int(id)
78 130 return self.node(rev)
79 131 except ValueError:
80 132 c = []
81 133 for n in self.nodemap:
82 134 if id in hex(n):
83 135 c.append(n)
84 136 if len(c) > 1: raise KeyError("Ambiguous identifier")
85 137 if len(c) < 1: raise KeyError("No match found")
86 138 return c[0]
87 139
88 140 return None
89 141
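
# Sketch of the fragment matching in lookup() above: a hex fragment must
# match exactly one known node id (passed in here as hex strings).
def resolve_fragment(fragment, nodes_hex):
    hits = [n for n in nodes_hex if fragment in n]
    if len(hits) > 1:
        raise KeyError("Ambiguous identifier")
    if not hits:
        raise KeyError("No match found")
    return hits[0]
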
90 142 def revisions(self, list):
91 143 # this can be optimized to do spans, etc
92 144 # be stupid for now
93 145 for node in list:
94 146 yield self.revision(node)
95 147
96 148 def diff(self, a, b):
97 149 return mdiff.textdiff(a, b)
98 150
99 151 def patches(self, t, pl):
100 152 return mdiff.patches(t, pl)
101 153
102 154 def revision(self, node):
103 155 if node == nullid: return ""
104 156 if self.cache and self.cache[0] == node: return self.cache[2]
105 157
106 158 text = None
107 159 rev = self.rev(node)
108 160 base = self.base(rev)
109 161 start = self.start(base)
110 162 end = self.end(rev)
111 163
112 164 if self.cache and self.cache[1] >= base and self.cache[1] < rev:
113 165 base = self.cache[1]
114 166 start = self.start(base + 1)
115 167 text = self.cache[2]
116 168 last = 0
117 169
118 170 f = self.opener(self.datafile)
119 171 f.seek(start)
120 172 data = f.read(end - start)
121 173
122 174 if not text:
123 175 last = self.length(base)
124 176 text = decompress(data[:last])
125 177
126 178 bins = []
127 179 for r in xrange(base + 1, rev + 1):
128 180 s = self.length(r)
129 181 bins.append(decompress(data[last:last + s]))
130 182 last = last + s
131 183
132 184 text = mdiff.patches(text, bins)
133 185
134 186 (p1, p2) = self.parents(node)
135 187 if node != hash(text, p1, p2):
136 188 raise "integrity check failed on %s:%d" % (self.datafile, rev)
137 189
138 190 self.cache = (node, rev, text)
139 191 return text
140 192
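
# Outline of the reconstruction done by revision() above: decompress the
# chain's base as a full text, then apply every stored delta up to the
# requested revision. decompress/apply_patches stand in for zlib and mdiff.
def reconstruct(chunks, base, rev, decompress, apply_patches):
    text = decompress(chunks[base])
    deltas = [decompress(chunks[r]) for r in range(base + 1, rev + 1)]
    return apply_patches(text, deltas)
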
141 193 def addrevision(self, text, transaction, link, p1=None, p2=None):
142 194 if text is None: text = ""
143 195 if p1 is None: p1 = self.tip()
144 196 if p2 is None: p2 = nullid
145 197
146 198 node = hash(text, p1, p2)
147 199
148 200 n = self.count()
149 201 t = n - 1
150 202
151 203 if n:
152 204 base = self.base(t)
153 205 start = self.start(base)
154 206 end = self.end(t)
155 207 prev = self.revision(self.tip())
156 208 data = compress(self.diff(prev, text))
157 209 dist = end - start + len(data)
158 210
159 211 # full versions are inserted when the needed deltas
160 212 # become comparable to the uncompressed text
161 213 if not n or dist > len(text) * 2:
162 214 data = compress(text)
163 215 base = n
164 216 else:
165 217 base = self.base(t)
166 218
167 219 offset = 0
168 220 if t >= 0:
169 221 offset = self.end(t)
170 222
171 223 e = (offset, len(data), base, link, p1, p2, node)
172 224
173 225 self.index.append(e)
174 226 self.nodemap[node] = n
175 227 entry = struct.pack(indexformat, *e)
176 228
177 229 transaction.add(self.datafile, e[0])
178 230 self.opener(self.datafile, "a").write(data)
179 231 transaction.add(self.indexfile, n * len(entry))
180 232 self.opener(self.indexfile, "a").write(entry)
181 233
182 234 self.cache = (node, n, text)
183 235 return node
184 236
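
# The `dist > len(text) * 2` test above: store a full snapshot once the bytes
# needed to rebuild the revision (existing chain plus the new compressed
# delta) exceed twice the plain text. With a made-up 18,000-byte chain, a
# 500-byte delta and a 9,000-byte text, 18,500 > 18,000, so a snapshot wins.
def store_full_version(chain_span, compressed_delta_len, text_len):
    return chain_span + compressed_delta_len > 2 * text_len
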
185 237 def ancestor(self, a, b):
186 238 def expand(list, map):
187 239 a = []
188 240 while list:
189 241 n = list.pop(0)
190 242 map[n] = 1
191 243 yield n
192 244 for p in self.parents(n):
193 245 if p != nullid and p not in map:
194 246 list.append(p)
195 247 yield nullid
196 248
197 249 amap = {}
198 250 bmap = {}
199 251 ag = expand([a], amap)
200 252 bg = expand([b], bmap)
201 253 adone = bdone = 0
202 254
203 255 while not adone or not bdone:
204 256 if not adone:
205 257 an = ag.next()
206 258 if an == nullid:
207 259 adone = 1
208 260 elif an in bmap:
209 261 return an
210 262 if not bdone:
211 263 bn = bg.next()
212 264 if bn == nullid:
213 265 bdone = 1
214 266 elif bn in amap:
215 267 return bn
216 268
217 269 return nullid
218 270
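
# Simplified version of ancestor() above: expand the ancestors of a and b
# breadth-first in lockstep and return the first node seen from both sides.
# `parents(n)` yields the parent nodes; None plays the role of nullid.
def common_ancestor(parents, a, b):
    seen = ({a}, {b})
    queues = ([a], [b])
    while queues[0] or queues[1]:
        for side in (0, 1):
            if not queues[side]:
                continue
            n = queues[side].pop(0)
            if n in seen[1 - side]:
                return n
            for p in parents(n):
                if p is not None and p not in seen[side]:
                    seen[side].add(p)
                    queues[side].append(p)
    return None
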
219 271 def mergedag(self, other, transaction, linkseq, accumulate = None):
220 272 """combine the nodes from other's DAG into ours"""
221 273 old = self.tip()
222 274 i = self.count()
223 275 l = []
224 276
225 277 # merge the other revision log into our DAG
226 278 for r in range(other.count()):
227 279 id = other.node(r)
228 280 if id not in self.nodemap:
229 281 (xn, yn) = other.parents(id)
230 282 l.append((id, xn, yn))
231 283 self.nodemap[id] = i
232 284 i += 1
233 285
234 286 # merge node data for new nodes
235 287 r = other.revisions([e[0] for e in l])
236 288 for e in l:
237 289 t = r.next()
238 290 if accumulate: accumulate(t)
239 291 self.addrevision(t, transaction, linkseq.next(), e[1], e[2])
240 292
241 293 # return the unmerged heads for later resolving
242 294 return (old, self.tip())
243 295
244 296 def group(self, linkmap):
245 297 # given a list of changeset revs, return a set of deltas and
246 298 # metadata corresponding to those nodes. the first delta is
247 299 # parent(nodes[0]) -> nodes[0]; the receiver is guaranteed to
248 300 # have this parent as it has all history before these
249 301 # changesets. parent is parent[0]
250 302
251 303 revs = []
252 304 needed = {}
253 305
254 306 # find file nodes/revs that match changeset revs
255 307 for i in xrange(0, self.count()):
256 308 if self.index[i][3] in linkmap:
257 309 revs.append(i)
258 310 needed[i] = 1
259 311
260 312 # if we don't have any revisions touched by these changesets, bail
261 313 if not revs: return struct.pack(">l", 0)
262 314
263 315 # add the parent of the first rev
264 316 p = self.parents(self.node(revs[0]))[0]
265 317 revs.insert(0, self.rev(p))
266 318
267 319 # for each delta that isn't contiguous in the log, we need to
268 320 # reconstruct the base, reconstruct the result, and then
269 321 # calculate the delta. We also need to do this where we've
270 322 # stored a full version and not a delta
271 323 for i in xrange(0, len(revs) - 1):
272 324 a, b = revs[i], revs[i + 1]
273 325 if a + 1 != b or self.base(b) == b:
274 326 for j in xrange(self.base(a), a + 1):
275 327 needed[j] = 1
276 328 for j in xrange(self.base(b), b + 1):
277 329 needed[j] = 1
278 330
279 331 # calculate spans to retrieve from datafile
280 332 needed = needed.keys()
281 333 needed.sort()
282 334 spans = []
283 335 for n in needed:
284 336 if n < 0: continue
285 337 o = self.start(n)
286 338 l = self.length(n)
287 339 spans.append((o, l, [(n, l)]))
288 340
289 341 # merge spans
290 342 merge = [spans.pop(0)]
291 343 while spans:
292 344 e = spans.pop(0)
293 345 f = merge[-1]
294 346 if e[0] == f[0] + f[1]:
295 347 merge[-1] = (f[0], f[1] + e[1], f[2] + e[2])
296 348 else:
297 349 merge.append(e)
298 350
299 351 # read spans in, divide up chunks
300 352 chunks = {}
301 353 for span in merge:
302 354 # we reopen the file for each span to make http happy for now
303 355 f = self.opener(self.datafile)
304 356 f.seek(span[0])
305 357 data = f.read(span[1])
306 358
307 359 # divide up the span
308 360 pos = 0
309 361 for r, l in span[2]:
310 362 chunks[r] = data[pos: pos + l]
311 363 pos += l
312 364
313 365 # helper to reconstruct intermediate versions
314 366 def construct(text, base, rev):
315 367 bins = [decompress(chunks[r]) for r in xrange(base + 1, rev + 1)]
316 368 return mdiff.patches(text, bins)
317 369
318 370 # build deltas
319 371 deltas = []
320 372 for d in xrange(0, len(revs) - 1):
321 373 a, b = revs[d], revs[d + 1]
322 374 n = self.node(b)
323 375
324 376 if a + 1 != b or self.base(b) == b:
325 377 if a >= 0:
326 378 base = self.base(a)
327 379 ta = decompress(chunks[self.base(a)])
328 380 ta = construct(ta, base, a)
329 381 else:
330 382 ta = ""
331 383
332 384 base = self.base(b)
333 385 if a > base:
334 386 base = a
335 387 tb = ta
336 388 else:
337 389 tb = decompress(chunks[self.base(b)])
338 390 tb = construct(tb, base, b)
339 391 d = self.diff(ta, tb)
340 392 else:
341 393 d = decompress(chunks[b])
342 394
343 395 p = self.parents(n)
344 396 meta = n + p[0] + p[1] + linkmap[self.linkrev(n)]
345 397 l = struct.pack(">l", len(meta) + len(d) + 4)
346 398 deltas.append(l + meta + d)
347 399
348 400 l = struct.pack(">l", sum(map(len, deltas)) + 4)
349 401 deltas.insert(0, l)
350 402 return "".join(deltas)
351 403
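
# Byte layout produced by group() above, as a sketch: each chunk is a 4-byte
# big-endian length (counting itself), then node, p1, p2 and the linked
# changeset id (20 bytes each), then the delta; the whole stream is prefixed
# with its own overall length.
import struct

def pack_chunk(node, p1, p2, cs, delta):
    meta = node + p1 + p2 + cs
    return struct.pack(">l", 4 + len(meta) + len(delta)) + meta + delta

def pack_group(chunks):
    body = b"".join(chunks)
    return struct.pack(">l", len(body) + 4) + body
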
352 404 def addgroup(self, data, linkmapper, transaction):
353 405 # given a set of deltas, add them to the revision log. the
354 406 # first delta is against its parent, which should be in our
355 407 # log, the rest are against the previous delta.
356 408
357 409 if not data: return self.tip()
358 410
359 411 # retrieve the parent revision of the delta chain
360 412 chain = data[24:44]
361 413
362 414 # track the base of the current delta log
363 415 r = self.count()
364 416 t = r - 1
365 417
366 418 base = prev = -1
367 419 start = end = 0
368 420 if r:
369 421 start = self.start(self.base(t))
370 422 end = self.end(t)
371 423 measure = self.length(self.base(t))
372 424 base = self.base(t)
373 425 prev = self.tip()
374 426
375 427 transaction.add(self.datafile, end)
376 428 transaction.add(self.indexfile, r * struct.calcsize(indexformat))
377 429 dfh = self.opener(self.datafile, "a")
378 430 ifh = self.opener(self.indexfile, "a")
379 431
380 432 # loop through our set of deltas
381 433 pos = 0
382 434 while pos < len(data):
383 435 l, node, p1, p2, cs = struct.unpack(">l20s20s20s20s",
384 436 data[pos:pos+84])
385 437 link = linkmapper(cs)
386 438 delta = data[pos + 84:pos + l]
387 439 pos += l
388 440
389 441 # full versions are inserted when the needed deltas become
390 442 # comparable to the uncompressed text or when the previous
391 443 # version is not the one we have a delta against. We use
392 444 # the size of the previous full rev as a proxy for the
393 445 # current size.
394 446
395 447 if chain == prev:
396 448 cdelta = compress(delta)
397 449
398 450 if chain != prev or (end - start + len(cdelta)) > measure * 2:
399 451 # flush our writes here so we can read it in revision
400 452 dfh.flush()
401 453 ifh.flush()
402 454 text = self.revision(chain)
403 455 text = self.patches(text, [delta])
404 456 chk = self.addrevision(text, transaction, link, p1, p2)
405 457 if chk != node:
406 458 raise "consistency error adding group"
407 459 measure = len(text)
408 460 else:
409 461 e = (end, len(cdelta), self.base(t), link, p1, p2, node)
410 462 self.index.append(e)
411 463 self.nodemap[node] = r
412 464 dfh.write(cdelta)
413 465 ifh.write(struct.pack(indexformat, *e))
414 466
415 467 t, r, chain, prev = r, r + 1, node, node
416 468 start = self.start(self.base(t))
417 469 end = self.end(t)
418 470
419 471 dfh.close()
420 472 ifh.close()
421 473 return node
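
addgroup() walks the same stream from the other end: each iteration reads the 4-byte chunk length and the four 20-byte hashes, slices out the delta, and either appends the compressed delta directly or rebuilds a full text when the chain grows past twice the previous full version. A small sketch of just the stream walking, assuming the outer length prefix has already been stripped (addgroup reads the first chunk header at offset 0):

import struct

def iter_group(data):
    # Yield (node, p1, p2, cs, delta) for every chunk in a delta group.
    pos = 0
    while pos < len(data):
        length, node, p1, p2, cs = struct.unpack(">l20s20s20s20s",
                                                 data[pos:pos + 84])
        yield node, p1, p2, cs, data[pos + 84:pos + length]
        pos += length
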