Make lazyindex load slightly faster
mpm@selenic.com
r115:39b438ee default
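
The change below is confined to the lazyindex class: the old __getitem__ indexed self.p.index[pos], tested the result, and on a miss called self.p.load(pos) and indexed a second time, while the new code moves the miss handling into a small load() helper so the common already-parsed case collapses to a single expression. A minimal sketch of the pattern (illustrative names, not Mercurial's code), which relies on parsed entries always being truthy, true here since each entry is a non-empty 7-tuple:

    class LazyList:
        """Illustrative stand-in for lazyindex: slots parse on demand."""
        def __init__(self, n):
            self.index = [None] * n  # None marks a not-yet-parsed slot

        def load(self, pos):
            # slow path: parse the slot, then return the fresh entry
            self.index[pos] = ("entry", pos)  # stub for struct.unpack
            return self.index[pos]

        def __getitem__(self, pos):
            # fast path: one list access plus a truth test; the "or"
            # only falls through to load() while the slot is still None
            return self.index[pos] or self.load(pos)

On the hot path this drops a local assignment and a branch per lookup, which is presumably the "slightly faster" of the commit message.
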
@@ -1,474 +1,473 @@
1 1 # revlog.py - storage back-end for mercurial
2 2 #
3 3 # This provides efficient delta storage with O(1) retrieve and append
4 4 # and O(changes) merge between branches
5 5 #
6 6 # Copyright 2005 Matt Mackall <mpm@selenic.com>
7 7 #
8 8 # This software may be used and distributed according to the terms
9 9 # of the GNU General Public License, incorporated herein by reference.
10 10
11 11 import zlib, struct, sha, os, tempfile, binascii
12 12 from mercurial import mdiff
13 13
14 14 def hex(node): return binascii.hexlify(node)
15 15 def bin(node): return binascii.unhexlify(node)
16 16 def short(node): return hex(node[:4])
17 17
18 18 def compress(text):
19 19 if not text: return text
20 20 if len(text) < 44:
21 21 if text[0] == '\0': return text
22 22 return 'u' + text
23 23 bin = zlib.compress(text)
24 24 if len(bin) > len(text):
25 25 if text[0] == '\0': return text
26 26 return 'u' + text
27 27 return bin
28 28
29 29 def decompress(bin):
30 30 if not bin: return bin
31 31 t = bin[0]
32 32 if t == '\0': return bin
33 33 if t == 'x': return zlib.decompress(bin)
34 34 if t == 'u': return bin[1:]
35 35 raise "unknown compression type %s" % t
36 36
37 37 def hash(text, p1, p2):
38 38 l = [p1, p2]
39 39 l.sort()
40 40 return sha.sha(l[0] + l[1] + text).digest()
41 41
42 42 nullid = "\0" * 20
43 43 indexformat = ">4l20s20s20s"
44 44
45 45 class lazyparser:
46 46 def __init__(self, data):
47 47 self.data = data
48 48 self.s = struct.calcsize(indexformat)
49 49 self.l = len(data)/self.s
50 50 self.index = [None] * self.l
51 51 self.map = {nullid: -1}
52 52
53 53 if 0:
54 54 n = 0
55 55 i = self.data
56 56 s = struct.calcsize(indexformat)
57 57 for f in xrange(0, len(i), s):
58 58 # offset, size, base, linkrev, p1, p2, nodeid
59 59 e = struct.unpack(indexformat, i[f:f + s])
60 60 self.map[e[6]] = n
61 61 self.index.append(e)
62 62 n += 1
63 63
64 64 def load(self, pos):
65 65 block = pos / 1000
66 66 i = block * 1000
67 67 end = min(self.l, i + 1000)
68 68 while i < end:
69 69 d = self.data[i * self.s: (i + 1) * self.s]
70 70 e = struct.unpack(indexformat, d)
71 71 self.index[i] = e
72 72 self.map[e[6]] = i
73 73 i += 1
74 74
75 75 class lazyindex:
76 76 def __init__(self, parser):
77 77 self.p = parser
78 78 def __len__(self):
79 79 return len(self.p.index)
80 def load(self, pos):
81 self.p.load(pos)
82 return self.p.index[pos]
80 83 def __getitem__(self, pos):
81 i = self.p.index[pos]
82 if not i:
83 self.p.load(pos)
84 return self.p.index[pos]
85 return i
84 return self.p.index[pos] or self.load(pos)
86 85 def append(self, e):
87 86 self.p.index.append(e)
88 87
89 88 class lazymap:
90 89 def __init__(self, parser):
91 90 self.p = parser
92 91 def load(self, key):
93 92 n = self.p.data.find(key)
94 93 if n < 0: raise KeyError("node " + hex(key))
95 94 pos = n / self.p.s
96 95 self.p.load(pos)
97 96 def __contains__(self, key):
98 97 try:
99 98 self[key]
100 99 return True
101 100 except KeyError:
102 101 return False
103 102 def __iter__(self):
104 103 for i in xrange(self.p.l):
105 104 try:
106 105 yield self.p.index[i][6]
107 106 except:
108 107 self.p.load(i)
109 108 yield self.p.index[i][6]
110 109 def __getitem__(self, key):
111 110 try:
112 111 return self.p.map[key]
113 112 except KeyError:
114 113 try:
115 114 self.load(key)
116 115 return self.p.map[key]
117 116 except KeyError:
118 117 raise KeyError("node " + hex(key))
119 118 def __setitem__(self, key, val):
120 119 self.p.map[key] = val
121 120
122 121 class revlog:
123 122 def __init__(self, opener, indexfile, datafile):
124 123 self.indexfile = indexfile
125 124 self.datafile = datafile
126 125 self.opener = opener
127 126 self.cache = None
128 127 # read the whole index for now, handle on-demand later
129 128 try:
130 129 i = self.opener(self.indexfile).read()
131 130 except IOError:
132 131 i = ""
133 132 parser = lazyparser(i)
134 133 self.index = lazyindex(parser)
135 134 self.nodemap = lazymap(parser)
136 135
137 136 def tip(self): return self.node(len(self.index) - 1)
138 137 def count(self): return len(self.index)
139 138 def node(self, rev): return (rev < 0) and nullid or self.index[rev][6]
140 139 def rev(self, node): return self.nodemap[node]
141 140 def linkrev(self, node): return self.index[self.nodemap[node]][3]
142 141 def parents(self, node):
143 142 if node == nullid: return (nullid, nullid)
144 143 return self.index[self.nodemap[node]][4:6]
145 144
146 145 def start(self, rev): return self.index[rev][0]
147 146 def length(self, rev): return self.index[rev][1]
148 147 def end(self, rev): return self.start(rev) + self.length(rev)
149 148 def base(self, rev): return self.index[rev][2]
150 149
151 150 def lookup(self, id):
152 151 try:
153 152 rev = int(id)
154 153 return self.node(rev)
155 154 except ValueError:
156 155 c = []
157 156 for n in self.nodemap:
158 157 if id in hex(n):
159 158 c.append(n)
160 159 if len(c) > 1: raise KeyError("Ambiguous identifier")
161 160 if len(c) < 1: raise KeyError("No match found")
162 161 return c[0]
163 162
164 163 return None
165 164
166 165 def diff(self, a, b):
167 166 return mdiff.textdiff(a, b)
168 167
169 168 def patches(self, t, pl):
170 169 return mdiff.patches(t, pl)
171 170
172 171 def revision(self, node):
173 172 if node == nullid: return ""
174 173 if self.cache and self.cache[0] == node: return self.cache[2]
175 174
176 175 text = None
177 176 rev = self.rev(node)
178 177 base = self.base(rev)
179 178 start = self.start(base)
180 179 end = self.end(rev)
181 180
182 181 if self.cache and self.cache[1] >= base and self.cache[1] < rev:
183 182 base = self.cache[1]
184 183 start = self.start(base + 1)
185 184 text = self.cache[2]
186 185 last = 0
187 186
188 187 f = self.opener(self.datafile)
189 188 f.seek(start)
190 189 data = f.read(end - start)
191 190
192 191 if not text:
193 192 last = self.length(base)
194 193 text = decompress(data[:last])
195 194
196 195 bins = []
197 196 for r in xrange(base + 1, rev + 1):
198 197 s = self.length(r)
199 198 bins.append(decompress(data[last:last + s]))
200 199 last = last + s
201 200
202 201 text = mdiff.patches(text, bins)
203 202
204 203 (p1, p2) = self.parents(node)
205 204 if node != hash(text, p1, p2):
206 205 raise IOError("integrity check failed on %s:%d"
207 206 % (self.datafile, rev))
208 207
209 208 self.cache = (node, rev, text)
210 209 return text
211 210
212 211 def addrevision(self, text, transaction, link, p1=None, p2=None):
213 212 if text is None: text = ""
214 213 if p1 is None: p1 = self.tip()
215 214 if p2 is None: p2 = nullid
216 215
217 216 node = hash(text, p1, p2)
218 217
219 218 n = self.count()
220 219 t = n - 1
221 220
222 221 if n:
223 222 base = self.base(t)
224 223 start = self.start(base)
225 224 end = self.end(t)
226 225 prev = self.revision(self.tip())
227 226 d = self.diff(prev, text)
228 227 if self.patches(prev, [d]) != text:
229 228 raise AssertionError("diff failed")
230 229 data = compress(d)
231 230 dist = end - start + len(data)
232 231
233 232 # full versions are inserted when the needed deltas
234 233 # become comparable to the uncompressed text
235 234 if not n or dist > len(text) * 2:
236 235 data = compress(text)
237 236 base = n
238 237 else:
239 238 base = self.base(t)
240 239
241 240 offset = 0
242 241 if t >= 0:
243 242 offset = self.end(t)
244 243
245 244 e = (offset, len(data), base, link, p1, p2, node)
246 245
247 246 self.index.append(e)
248 247 self.nodemap[node] = n
249 248 entry = struct.pack(indexformat, *e)
250 249
251 250 transaction.add(self.datafile, e[0])
252 251 self.opener(self.datafile, "a").write(data)
253 252 transaction.add(self.indexfile, n * len(entry))
254 253 self.opener(self.indexfile, "a").write(entry)
255 254
256 255 self.cache = (node, n, text)
257 256 return node
258 257
259 258 def ancestor(self, a, b):
260 259 def expand(list, map):
261 260 a = []
262 261 while list:
263 262 n = list.pop(0)
264 263 map[n] = 1
265 264 yield n
266 265 for p in self.parents(n):
267 266 if p != nullid and p not in map:
268 267 list.append(p)
269 268 yield nullid
270 269
271 270 amap = {}
272 271 bmap = {}
273 272 ag = expand([a], amap)
274 273 bg = expand([b], bmap)
275 274 adone = bdone = 0
276 275
277 276 while not adone or not bdone:
278 277 if not adone:
279 278 an = ag.next()
280 279 if an == nullid:
281 280 adone = 1
282 281 elif an in bmap:
283 282 return an
284 283 if not bdone:
285 284 bn = bg.next()
286 285 if bn == nullid:
287 286 bdone = 1
288 287 elif bn in amap:
289 288 return bn
290 289
291 290 return nullid
292 291
293 292 def group(self, linkmap):
294 293 # given a list of changeset revs, return a set of deltas and
295 294 # metadata corresponding to nodes. the first delta is
296 295 # parent(nodes[0]) -> nodes[0] the receiver is guaranteed to
297 296 # have this parent as it has all history before these
298 297 # changesets. parent is parent[0]
299 298
300 299 revs = []
301 300 needed = {}
302 301
303 302 # find file nodes/revs that match changeset revs
304 303 for i in xrange(0, self.count()):
305 304 if self.index[i][3] in linkmap:
306 305 revs.append(i)
307 306 needed[i] = 1
308 307
309 308 # if we don't have any revisions touched by these changesets, bail
310 309 if not revs: return struct.pack(">l", 0)
311 310
312 311 # add the parent of the first rev
313 312 p = self.parents(self.node(revs[0]))[0]
314 313 revs.insert(0, self.rev(p))
315 314
316 315 # for each delta that isn't contiguous in the log, we need to
317 316 # reconstruct the base, reconstruct the result, and then
318 317 # calculate the delta. We also need to do this where we've
319 318 # stored a full version and not a delta
320 319 for i in xrange(0, len(revs) - 1):
321 320 a, b = revs[i], revs[i + 1]
322 321 if a + 1 != b or self.base(b) == b:
323 322 for j in xrange(self.base(a), a + 1):
324 323 needed[j] = 1
325 324 for j in xrange(self.base(b), b + 1):
326 325 needed[j] = 1
327 326
328 327 # calculate spans to retrieve from datafile
329 328 needed = needed.keys()
330 329 needed.sort()
331 330 spans = []
332 331 for n in needed:
333 332 if n < 0: continue
334 333 o = self.start(n)
335 334 l = self.length(n)
336 335 spans.append((o, l, [(n, l)]))
337 336
338 337 # merge spans
339 338 merge = [spans.pop(0)]
340 339 while spans:
341 340 e = spans.pop(0)
342 341 f = merge[-1]
343 342 if e[0] == f[0] + f[1]:
344 343 merge[-1] = (f[0], f[1] + e[1], f[2] + e[2])
345 344 else:
346 345 merge.append(e)
347 346
348 347 # read spans in, divide up chunks
349 348 chunks = {}
350 349 for span in merge:
351 350 # we reopen the file for each span to make http happy for now
352 351 f = self.opener(self.datafile)
353 352 f.seek(span[0])
354 353 data = f.read(span[1])
355 354
356 355 # divide up the span
357 356 pos = 0
358 357 for r, l in span[2]:
359 358 chunks[r] = data[pos: pos + l]
360 359 pos += l
361 360
362 361 # helper to reconstruct intermediate versions
363 362 def construct(text, base, rev):
364 363 bins = [decompress(chunks[r]) for r in xrange(base + 1, rev + 1)]
365 364 return mdiff.patches(text, bins)
366 365
367 366 # build deltas
368 367 deltas = []
369 368 for d in xrange(0, len(revs) - 1):
370 369 a, b = revs[d], revs[d + 1]
371 370 n = self.node(b)
372 371
373 372 if a + 1 != b or self.base(b) == b:
374 373 if a >= 0:
375 374 base = self.base(a)
376 375 ta = decompress(chunks[self.base(a)])
377 376 ta = construct(ta, base, a)
378 377 else:
379 378 ta = ""
380 379
381 380 base = self.base(b)
382 381 if a > base:
383 382 base = a
384 383 tb = ta
385 384 else:
386 385 tb = decompress(chunks[self.base(b)])
387 386 tb = construct(tb, base, b)
388 387 d = self.diff(ta, tb)
389 388 else:
390 389 d = decompress(chunks[b])
391 390
392 391 p = self.parents(n)
393 392 meta = n + p[0] + p[1] + linkmap[self.linkrev(n)]
394 393 l = struct.pack(">l", len(meta) + len(d) + 4)
395 394 deltas.append(l + meta + d)
396 395
397 396 l = struct.pack(">l", sum(map(len, deltas)) + 4)
398 397 deltas.insert(0, l)
399 398 return "".join(deltas)
400 399
401 400 def addgroup(self, data, linkmapper, transaction):
402 401 # given a set of deltas, add them to the revision log. the
403 402 # first delta is against its parent, which should be in our
404 403 # log, the rest are against the previous delta.
405 404
406 405 if not data: return self.tip()
407 406
408 407 # retrieve the parent revision of the delta chain
409 408 chain = data[24:44]
410 409 if not chain in self.nodemap:
411 410 raise "unknown base %s" % short(chain[:4])
412 411
413 412 # track the base of the current delta log
414 413 r = self.count()
415 414 t = r - 1
416 415
417 416 base = prev = -1
418 417 start = end = 0
419 418 if r:
420 419 start = self.start(self.base(t))
421 420 end = self.end(t)
422 421 measure = self.length(self.base(t))
423 422 base = self.base(t)
424 423 prev = self.tip()
425 424
426 425 transaction.add(self.datafile, end)
427 426 transaction.add(self.indexfile, r * struct.calcsize(indexformat))
428 427 dfh = self.opener(self.datafile, "a")
429 428 ifh = self.opener(self.indexfile, "a")
430 429
431 430 # loop through our set of deltas
432 431 pos = 0
433 432 while pos < len(data):
434 433 l, node, p1, p2, cs = struct.unpack(">l20s20s20s20s",
435 434 data[pos:pos+84])
436 435 link = linkmapper(cs)
437 436 if node in self.nodemap:
438 437 raise "already have %s" % hex(node[:4])
439 438 delta = data[pos + 84:pos + l]
440 439 pos += l
441 440
442 441 # full versions are inserted when the needed deltas become
443 442 # comparable to the uncompressed text or when the previous
444 443 # version is not the one we have a delta against. We use
445 444 # the size of the previous full rev as a proxy for the
446 445 # current size.
447 446
448 447 if chain == prev:
449 448 cdelta = compress(delta)
450 449
451 450 if chain != prev or (end - start + len(cdelta)) > measure * 2:
452 451 # flush our writes here so we can read it in revision
453 452 dfh.flush()
454 453 ifh.flush()
455 454 text = self.revision(chain)
456 455 text = self.patches(text, [delta])
457 456 chk = self.addrevision(text, transaction, link, p1, p2)
458 457 if chk != node:
459 458 raise "consistency error adding group"
460 459 measure = len(text)
461 460 else:
462 461 e = (end, len(cdelta), self.base(t), link, p1, p2, node)
463 462 self.index.append(e)
464 463 self.nodemap[node] = r
465 464 dfh.write(cdelta)
466 465 ifh.write(struct.pack(indexformat, *e))
467 466
468 467 t, r, chain, prev = r, r + 1, node, node
469 468 start = self.start(self.base(t))
470 469 end = self.end(t)
471 470
472 471 dfh.close()
473 472 ifh.close()
474 473 return node
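
For reference, each record the parser above unpacks is 76 bytes: indexformat = ">4l20s20s20s" is four big-endian 32-bit longs (offset, size, base, linkrev) followed by three 20-byte strings (p1, p2, nodeid). A quick stdlib-only sanity check of that layout (Python 2, matching the file above; the entry values are made up):

    import struct

    indexformat = ">4l20s20s20s"
    nullid = "\0" * 20

    e = (0, 17, 0, 0, nullid, nullid, "\x12" * 20)  # made-up entry
    record = struct.pack(indexformat, *e)
    assert struct.calcsize(indexformat) == 76
    assert struct.unpack(indexformat, record) == e
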
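lazyparser.load itself is untouched by this commit: it parses the index in blocks of 1000 entries around the requested position, so the one miss that reaches lazyindex.load() turns the neighbouring lookups into fast-path hits. A standalone sketch of that block strategy (hypothetical load_block helper, same 76-byte records as above):

    import struct

    indexformat = ">4l20s20s20s"

    def load_block(index, data, pos, blocksize=1000):
        # parse every record in the 1000-entry block containing pos
        s = struct.calcsize(indexformat)
        i = (pos // blocksize) * blocksize
        end = min(len(index), i + blocksize)
        while i < end:
            index[i] = struct.unpack(indexformat, data[i * s:(i + 1) * s])
            i += 1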