lazyindex fix, make load handle negative indexes properly.
Eric Hopper
r1403:bc3e66ed default
@@ -1,675 +1,677 @@
1 1 """
2 2 revlog.py - storage back-end for mercurial
3 3
4 4 This provides efficient delta storage with O(1) retrieve and append
5 5 and O(changes) merge between branches
6 6
7 7 Copyright 2005 Matt Mackall <mpm@selenic.com>
8 8
9 9 This software may be used and distributed according to the terms
10 10 of the GNU General Public License, incorporated herein by reference.
11 11 """
12 12
13 13 from node import *
14 14 from i18n import gettext as _
15 15 from demandload import demandload
16 16 demandload(globals(), "binascii errno heapq mdiff sha struct zlib")
17 17
18 18 def hash(text, p1, p2):
19 19 """generate a hash from the given text and its parent hashes
20 20
21 21 This hash combines both the current file contents and its history
22 22 in a manner that makes it easy to distinguish nodes with the same
23 23 content in the revision graph.
24 24 """
25 25 l = [p1, p2]
26 26 l.sort()
27 27 s = sha.new(l[0])
28 28 s.update(l[1])
29 29 s.update(text)
30 30 return s.digest()
31 31
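Sorting the parent hashes first makes the node id symmetric in its parents, so the same merge hashes identically whichever parent comes first. A minimal sketch of the scheme with hashlib (node_hash is my name; the file above uses the legacy sha module):

    import hashlib

    def node_hash(text, p1, p2):
        a, b = sorted([p1, p2])   # order-insensitive in the parents
        s = hashlib.sha1(a)
        s.update(b)
        s.update(text)
        return s.digest()

    null = b"\0" * 20
    n = node_hash(b"data", null, null)
    assert node_hash(b"data", n, null) == node_hash(b"data", null, n)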
32 32 def compress(text):
33 33 """ generate a possibly-compressed representation of text """
34 34 if not text: return text
35 35 if len(text) < 44:
36 36 if text[0] == '\0': return text
37 37 return 'u' + text
38 38 bin = zlib.compress(text)
39 39 if len(bin) > len(text):
40 40 if text[0] == '\0': return text
41 41 return 'u' + text
42 42 return bin
43 43
44 44 def decompress(bin):
45 45 """ decompress the given input """
46 46 if not bin: return bin
47 47 t = bin[0]
48 48 if t == '\0': return bin
49 49 if t == 'x': return zlib.decompress(bin)
50 50 if t == 'u': return bin[1:]
51 51 raise RevlogError(_("unknown compression type %s") % t)
52 52
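The first byte acts as a tag: zlib output always begins with 'x', literal text gets a 'u' prefix, and text that already starts with NUL is stored untouched, since NUL can't begin a zlib stream or a literal marker. A hedged round-trip sketch in modern bytes-literal Python, with decode standing in for decompress() above:

    import zlib

    def decode(bin):
        if not bin: return bin
        t = bin[:1]
        if t == b'\0': return bin                   # raw text starting with NUL
        if t == b'x': return zlib.decompress(bin)   # 'x' is the zlib header byte
        if t == b'u': return bin[1:]                # stored literal
        raise ValueError("unknown compression type %r" % t)

    assert decode(b'ushort text') == b'short text'
    assert decode(zlib.compress(b'a' * 100)) == b'a' * 100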
53 53 indexformat = ">4l20s20s20s"
54 54
55 55 class lazyparser:
56 56 """
57 57 this class avoids the need to parse the entirety of large indices
58 58
59 59 By default we parse and load 1000 entries at a time.
60 60
61 61 If no position is specified, we load the whole index, and replace
62 62 the lazy objects in revlog with the underlying objects for
63 63 efficiency in cases where we look at most of the nodes.
64 64 """
65 65 def __init__(self, data, revlog):
66 66 self.data = data
67 67 self.s = struct.calcsize(indexformat)
68 68 self.l = len(data)/self.s
69 69 self.index = [None] * self.l
70 70 self.map = {nullid: -1}
71 71 self.all = 0
72 72 self.revlog = revlog
73 73
74 74 def load(self, pos=None):
75 75 if self.all: return
76 76 if pos is not None:
77 77 block = pos / 1000
78 78 i = block * 1000
79 79 end = min(self.l, i + 1000)
80 80 else:
81 81 self.all = 1
82 82 i = 0
83 83 end = self.l
84 84 self.revlog.index = self.index
85 85 self.revlog.nodemap = self.map
86 86
87 87 while i < end:
88 88 d = self.data[i * self.s: (i + 1) * self.s]
89 89 e = struct.unpack(indexformat, d)
90 90 self.index[i] = e
91 91 self.map[e[6]] = i
92 92 i += 1
93 93
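The block arithmetic in load(), isolated: a position maps to its enclosing 1000-entry block, and one slice of the raw data covers the whole block (block_range is a hypothetical helper; Python 2's / above truncates the same way // does for non-negative values):

    BLOCK = 1000   # entries parsed per load() call

    def block_range(pos, total):
        start = (pos // BLOCK) * BLOCK
        return start, min(total, start + BLOCK)

    assert block_range(1234, 5000) == (1000, 2000)
    assert block_range(4990, 5000) == (4000, 5000)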
94 94 class lazyindex:
95 95 """a lazy version of the index array"""
96 96 def __init__(self, parser):
97 97 self.p = parser
98 98 def __len__(self):
99 99 return len(self.p.index)
100 100 def load(self, pos):
101 if pos < 0:
102 pos += len(self.p.index)
101 103 self.p.load(pos)
102 104 return self.p.index[pos]
103 105 def __getitem__(self, pos):
104 106 return self.p.index[pos] or self.load(pos)
105 107 def append(self, e):
106 108 self.p.index.append(e)
107 109
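The two lines added to load() above are the whole fix named in the commit message. load() derives a block number from pos, so a Python-style negative position such as -1 for the tip must first be normalized against the index length; without that, pos / 1000 selects a block that never contains the entry. The invariant in isolation (normalize is a hypothetical helper):

    def normalize(pos, n):
        # mirror list indexing: -1 is the last entry of an n-entry index
        if pos < 0:
            pos += n
        return pos

    assert normalize(-1, 677) == 676
    assert normalize(42, 677) == 42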
108 110 class lazymap:
109 111 """a lazy version of the node map"""
110 112 def __init__(self, parser):
111 113 self.p = parser
112 114 def load(self, key):
113 115 if self.p.all: return
114 116 n = self.p.data.find(key)
115 117 if n < 0:
116 118 raise KeyError(key)
117 119 pos = n / self.p.s
118 120 self.p.load(pos)
119 121 def __contains__(self, key):
120 122 self.p.load()
121 123 return key in self.p.map
122 124 def __iter__(self):
123 125 yield nullid
124 126 for i in xrange(self.p.l):
125 127 try:
126 128 yield self.p.index[i][6]
127 129 except:
128 130 self.p.load(i)
129 131 yield self.p.index[i][6]
130 132 def __getitem__(self, key):
131 133 try:
132 134 return self.p.map[key]
133 135 except KeyError:
134 136 try:
135 137 self.load(key)
136 138 return self.p.map[key]
137 139 except KeyError:
138 140 raise KeyError("node " + hex(key))
139 141 def __setitem__(self, key, val):
140 142 self.p.map[key] = val
141 143
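lazymap.load() converts a raw str.find() hit back into an entry number by integer division with the record size: every index record is a fixed 76 bytes, so any byte offset inside a record identifies that record. The arithmetic, assuming the indexformat defined above (entry_for_offset is my name):

    import struct

    entrysize = struct.calcsize(">4l20s20s20s")   # 76 bytes per record

    def entry_for_offset(n):
        return n // entrysize

    assert entry_for_offset(76 * 5 + 40) == 5     # a hit anywhere in record 5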
142 144 class RevlogError(Exception): pass
143 145
144 146 class revlog:
145 147 """
146 148 the underlying revision storage object
147 149
148 150 A revlog consists of two parts, an index and the revision data.
149 151
150 152 The index is a file with a fixed record size containing
151 153 information on each revision, including its nodeid (hash), the
152 154 nodeids of its parents, the position and offset of its data within
153 155 the data file, and the revision it's based on. Finally, each entry
154 156 contains a linkrev entry that can serve as a pointer to external
155 157 data.
156 158
157 159 The revision data itself is a linear collection of data chunks.
158 160 Each chunk represents a revision and is usually represented as a
159 161 delta against the previous chunk. To bound lookup time, runs of
160 162 deltas are limited to about 2 times the length of the original
161 163 version data. This makes retrieval of a version proportional to
162 164 its size, or O(1) relative to the number of revisions.
163 165
164 166 Both pieces of the revlog are written to in an append-only
165 167 fashion, which means we never need to rewrite a file to insert or
166 168 remove data, and can use some simple techniques to avoid the need
167 169 for locking while reading.
168 170 """
169 171 def __init__(self, opener, indexfile, datafile):
170 172 """
171 173 create a revlog object
172 174
173 175 opener is a function that abstracts the file opening operation
174 176 and can be used to implement COW semantics or the like.
175 177 """
176 178 self.indexfile = indexfile
177 179 self.datafile = datafile
178 180 self.opener = opener
179 181 self.cache = None
180 182
181 183 try:
182 184 i = self.opener(self.indexfile).read()
183 185 except IOError, inst:
184 186 if inst.errno != errno.ENOENT:
185 187 raise
186 188 i = ""
187 189
188 190 if len(i) > 10000:
189 191 # big index, let's parse it on demand
190 192 parser = lazyparser(i, self)
191 193 self.index = lazyindex(parser)
192 194 self.nodemap = lazymap(parser)
193 195 else:
194 196 s = struct.calcsize(indexformat)
195 197 l = len(i) / s
196 198 self.index = [None] * l
197 199 m = [None] * l
198 200
199 201 n = 0
200 202 for f in xrange(0, len(i), s):
201 203 # offset, size, base, linkrev, p1, p2, nodeid
202 204 e = struct.unpack(indexformat, i[f:f + s])
203 205 m[n] = (e[6], n)
204 206 self.index[n] = e
205 207 n += 1
206 208
207 209 self.nodemap = dict(m)
208 210 self.nodemap[nullid] = -1
209 211
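One index record round-tripped through indexformat, with the field order given in the comment above (offset, size, base, linkrev, p1, p2, nodeid); the values here are made up for illustration:

    import struct

    indexformat = ">4l20s20s20s"
    rec = struct.pack(indexformat, 0, 11, 0, 7, b"\1" * 20, b"\0" * 20, b"\2" * 20)
    offset, size, base, linkrev, p1, p2, nodeid = struct.unpack(indexformat, rec)
    assert (offset, size, base, linkrev) == (0, 11, 0, 7)
    assert len(rec) == 76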
210 212 def tip(self): return self.node(len(self.index) - 1)
211 213 def count(self): return len(self.index)
212 214 def node(self, rev): return (rev < 0) and nullid or self.index[rev][6]
213 215 def rev(self, node):
214 216 try:
215 217 return self.nodemap[node]
216 218 except KeyError:
217 219 raise RevlogError(_('%s: no node %s') % (self.indexfile, hex(node)))
218 220 def linkrev(self, node): return self.index[self.rev(node)][3]
219 221 def parents(self, node):
220 222 if node == nullid: return (nullid, nullid)
221 223 return self.index[self.rev(node)][4:6]
222 224
223 225 def start(self, rev): return self.index[rev][0]
224 226 def length(self, rev): return self.index[rev][1]
225 227 def end(self, rev): return self.start(rev) + self.length(rev)
226 228 def base(self, rev): return self.index[rev][2]
227 229
228 230 def reachable(self, rev, stop=None):
229 231 reachable = {}
230 232 visit = [rev]
231 233 reachable[rev] = 1
232 234 if stop:
233 235 stopn = self.rev(stop)
234 236 else:
235 237 stopn = 0
236 238 while visit:
237 239 n = visit.pop(0)
238 240 if n == stop:
239 241 continue
240 242 if n == nullid:
241 243 continue
242 244 for p in self.parents(n):
243 245 if self.rev(p) < stopn:
244 246 continue
245 247 if p not in reachable:
246 248 reachable[p] = 1
247 249 visit.append(p)
248 250 return reachable
249 251
250 252 def heads(self, stop=None):
251 253 """return the list of all nodes that have no children"""
252 254 p = {}
253 255 h = []
254 256 stoprev = 0
255 257 if stop and stop in self.nodemap:
256 258 stoprev = self.rev(stop)
257 259
258 260 for r in range(self.count() - 1, -1, -1):
259 261 n = self.node(r)
260 262 if n not in p:
261 263 h.append(n)
262 264 if n == stop:
263 265 break
264 266 if r < stoprev:
265 267 break
266 268 for pn in self.parents(n):
267 269 p[pn] = 1
268 270 return h
269 271
270 272 def children(self, node):
271 273 """find the children of a given node"""
272 274 c = []
273 275 p = self.rev(node)
274 276 for r in range(p + 1, self.count()):
275 277 n = self.node(r)
276 278 for pn in self.parents(n):
277 279 if pn == node:
278 280 c.append(n)
279 281 continue
280 282 elif pn == nullid:
281 283 continue
282 284 return c
283 285
284 286 def lookup(self, id):
285 287 """locate a node based on revision number or subset of hex nodeid"""
286 288 try:
287 289 rev = int(id)
288 290 if str(rev) != id: raise ValueError
289 291 if rev < 0: rev = self.count() + rev
290 292 if rev < 0 or rev >= self.count(): raise ValueError
291 293 return self.node(rev)
292 294 except (ValueError, OverflowError):
293 295 c = []
294 296 for n in self.nodemap:
295 297 if hex(n).startswith(id):
296 298 c.append(n)
297 299 if len(c) > 1: raise RevlogError(_("Ambiguous identifier"))
298 300 if len(c) < 1: raise RevlogError(_("No match found"))
299 301 return c[0]
300 302
301 303 return None
302 304
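The str(rev) != id guard in lookup() is what keeps all-digit nodeid prefixes from being misread as revision numbers: only the canonical decimal rendering counts as a rev, and everything else falls through to hex-prefix matching. A hypothetical distillation of that test:

    def is_revnum(id):
        try:
            return str(int(id)) == id    # canonical decimal form only
        except (ValueError, OverflowError):
            return False

    assert is_revnum("12") and is_revnum("-1")
    assert not is_revnum("0012")         # treated as a nodeid prefix instead
    assert not is_revnum("00af")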
303 305 def diff(self, a, b):
304 306 """return a delta between two revisions"""
305 307 return mdiff.textdiff(a, b)
306 308
307 309 def patches(self, t, pl):
308 310 """apply a list of patches to a string"""
309 311 return mdiff.patches(t, pl)
310 312
311 313 def delta(self, node):
312 314 """return or calculate a delta between a node and its predecessor"""
313 315 r = self.rev(node)
314 316 b = self.base(r)
315 317 if r == b:
316 318 return self.diff(self.revision(self.node(r - 1)),
317 319 self.revision(node))
318 320 else:
319 321 f = self.opener(self.datafile)
320 322 f.seek(self.start(r))
321 323 data = f.read(self.length(r))
322 324 return decompress(data)
323 325
324 326 def revision(self, node):
325 327 """return an uncompressed revision of a given"""
326 328 if node == nullid: return ""
327 329 if self.cache and self.cache[0] == node: return self.cache[2]
328 330
329 331 # look up what we need to read
330 332 text = None
331 333 rev = self.rev(node)
332 334 start, length, base, link, p1, p2, node = self.index[rev]
333 335 end = start + length
334 336 if base != rev: start = self.start(base)
335 337
336 338 # do we have useful data cached?
337 339 if self.cache and self.cache[1] >= base and self.cache[1] < rev:
338 340 base = self.cache[1]
339 341 start = self.start(base + 1)
340 342 text = self.cache[2]
341 343 last = 0
342 344
343 345 f = self.opener(self.datafile)
344 346 f.seek(start)
345 347 data = f.read(end - start)
346 348
347 349 if text is None:
348 350 last = self.length(base)
349 351 text = decompress(data[:last])
350 352
351 353 bins = []
352 354 for r in xrange(base + 1, rev + 1):
353 355 s = self.length(r)
354 356 bins.append(decompress(data[last:last + s]))
355 357 last = last + s
356 358
357 359 text = mdiff.patches(text, bins)
358 360
359 361 if node != hash(text, p1, p2):
360 362 raise RevlogError(_("integrity check failed on %s:%d")
361 363 % (self.datafile, rev))
362 364
363 365 self.cache = (node, rev, text)
364 366 return text
365 367
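revision() reads one contiguous span from the base full text through every delta up to rev, then patches forward. The chain walk in miniature, with a toy (start, end, replacement) delta standing in for mdiff's binary patch format; reconstruct and apply_delta are hypothetical names:

    def apply_delta(text, d):
        start, end, data = d
        return text[:start] + data + text[end:]

    def reconstruct(chunks, base, rev):
        text = chunks[base]                    # full text stored at base
        for r in range(base + 1, rev + 1):     # one delta per revision
            text = apply_delta(text, chunks[r])
        return text

    chunks = {0: b"abcdef", 1: (1, 3, b"XY"), 2: (0, 1, b"z")}
    assert reconstruct(chunks, 0, 2) == b"zXYdef"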
366 368 def addrevision(self, text, transaction, link, p1=None, p2=None, d=None):
367 369 """add a revision to the log
368 370
369 371 text - the revision data to add
370 372 transaction - the transaction object used for rollback
371 373 link - the linkrev data to add
372 374 p1, p2 - the parent nodeids of the revision
373 375 d - an optional precomputed delta
374 376 """
375 377 if text is None: text = ""
376 378 if p1 is None: p1 = self.tip()
377 379 if p2 is None: p2 = nullid
378 380
379 381 node = hash(text, p1, p2)
380 382
381 383 if node in self.nodemap:
382 384 return node
383 385
384 386 n = self.count()
385 387 t = n - 1
386 388
387 389 if n:
388 390 base = self.base(t)
389 391 start = self.start(base)
390 392 end = self.end(t)
391 393 if not d:
392 394 prev = self.revision(self.tip())
393 395 d = self.diff(prev, text)
394 396 data = compress(d)
395 397 dist = end - start + len(data)
396 398
397 399 # full versions are inserted when the needed deltas
398 400 # become comparable to the uncompressed text
399 401 if not n or dist > len(text) * 2:
400 402 data = compress(text)
401 403 base = n
402 404 else:
403 405 base = self.base(t)
404 406
405 407 offset = 0
406 408 if t >= 0:
407 409 offset = self.end(t)
408 410
409 411 e = (offset, len(data), base, link, p1, p2, node)
410 412
411 413 self.index.append(e)
412 414 self.nodemap[node] = n
413 415 entry = struct.pack(indexformat, *e)
414 416
415 417 transaction.add(self.datafile, e[0])
416 418 self.opener(self.datafile, "a").write(data)
417 419 transaction.add(self.indexfile, n * len(entry))
418 420 self.opener(self.indexfile, "a").write(entry)
419 421
420 422 self.cache = (node, n, text)
421 423 return node
422 424
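The 2x rule above is what keeps delta chains short: once the span from the base full version through the new delta exceeds twice the new text's length, a full version is stored and becomes the next base. A hypothetical restatement of the condition:

    def needs_snapshot(dist, textlen, nrevs):
        # dist spans from the base full text through the new delta
        return nrevs == 0 or dist > textlen * 2

    assert needs_snapshot(0, 10, 0)         # first revision: always full
    assert needs_snapshot(2100, 1000, 5)    # chain grew past 2x the text
    assert not needs_snapshot(1500, 1000, 5)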
423 425 def ancestor(self, a, b):
424 426 """calculate the least common ancestor of nodes a and b"""
425 427 # calculate the distance of every node from root
426 428 dist = {nullid: 0}
427 429 for i in xrange(self.count()):
428 430 n = self.node(i)
429 431 p1, p2 = self.parents(n)
430 432 dist[n] = max(dist[p1], dist[p2]) + 1
431 433
432 434 # traverse ancestors in order of decreasing distance from root
433 435 def ancestors(node):
434 436 # we store negative distances because heap returns smallest member
435 437 h = [(-dist[node], node)]
436 438 seen = {}
437 439 earliest = self.count()
438 440 while h:
439 441 d, n = heapq.heappop(h)
440 442 if n not in seen:
441 443 seen[n] = 1
442 444 r = self.rev(n)
443 445 yield (-d, n)
444 446 for p in self.parents(n):
445 447 heapq.heappush(h, (-dist[p], p))
446 448
447 449 def generations(node):
448 450 sg, s = None, {}
449 451 for g,n in ancestors(node):
450 452 if g != sg:
451 453 if sg:
452 454 yield sg, s
453 455 sg, s = g, {n:1}
454 456 else:
455 457 s[n] = 1
456 458 yield sg, s
457 459
458 460 x = generations(a)
459 461 y = generations(b)
460 462 gx = x.next()
461 463 gy = y.next()
462 464
463 465 # increment each ancestor list until it is closer to root than
464 466 # the other, or they match
465 467 while 1:
466 468 #print "ancestor gen %s %s" % (gx[0], gy[0])
467 469 if gx[0] == gy[0]:
468 470 # find the intersection
469 471 i = [ n for n in gx[1] if n in gy[1] ]
470 472 if i:
471 473 return i[0]
472 474 else:
473 475 #print "next"
474 476 gy = y.next()
475 477 gx = x.next()
476 478 elif gx[0] < gy[0]:
477 479 #print "next y"
478 480 gy = y.next()
479 481 else:
480 482 #print "next x"
481 483 gx = x.next()
482 484
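The distance table at the top of ancestor() needs only one in-order pass because a revlog stores parents before children. The same computation on a hand-made parent map (names are illustrative):

    parents = {"a": ("root", "root"), "b": ("a", "root"), "c": ("b", "a")}
    dist = {"root": 0}
    for n in ("a", "b", "c"):               # revision order: parents first
        p1, p2 = parents[n]
        dist[n] = max(dist[p1], dist[p2]) + 1
    assert dist == {"root": 0, "a": 1, "b": 2, "c": 3}

The two generators then walk each node's ancestors in order of decreasing distance, batch them by equal distance, and the first batch where the two sets intersect yields the common ancestor.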
483 485 def group(self, linkmap):
484 486 """calculate a delta group
485 487
486 488 Given a list of changeset revs, return a set of deltas and
487 489 metadata corresponding to nodes. the first delta is
488 490 parent(nodes[0]) -> nodes[0] the receiver is guaranteed to
489 491 have this parent as it has all history before these
490 492 changesets. parent is parent[0]
491 493 """
492 494 revs = []
493 495 needed = {}
494 496
495 497 # find file nodes/revs that match changeset revs
496 498 for i in xrange(0, self.count()):
497 499 if self.index[i][3] in linkmap:
498 500 revs.append(i)
499 501 needed[i] = 1
500 502
501 503 # if we don't have any revisions touched by these changesets, bail
502 504 if not revs:
503 505 yield struct.pack(">l", 0)
504 506 return
505 507
506 508 # add the parent of the first rev
507 509 p = self.parents(self.node(revs[0]))[0]
508 510 revs.insert(0, self.rev(p))
509 511
510 512 # for each delta that isn't contiguous in the log, we need to
511 513 # reconstruct the base, reconstruct the result, and then
512 514 # calculate the delta. We also need to do this where we've
513 515 # stored a full version and not a delta
514 516 for i in xrange(0, len(revs) - 1):
515 517 a, b = revs[i], revs[i + 1]
516 518 if a + 1 != b or self.base(b) == b:
517 519 for j in xrange(self.base(a), a + 1):
518 520 needed[j] = 1
519 521 for j in xrange(self.base(b), b + 1):
520 522 needed[j] = 1
521 523
522 524 # calculate spans to retrieve from datafile
523 525 needed = needed.keys()
524 526 needed.sort()
525 527 spans = []
526 528 oo = -1
527 529 ol = 0
528 530 for n in needed:
529 531 if n < 0: continue
530 532 o = self.start(n)
531 533 l = self.length(n)
532 534 if oo + ol == o: # can we merge with the previous?
533 535 nl = spans[-1][2]
534 536 nl.append((n, l))
535 537 ol += l
536 538 spans[-1] = (oo, ol, nl)
537 539 else:
538 540 oo = o
539 541 ol = l
540 542 spans.append((oo, ol, [(n, l)]))
541 543
542 544 # read spans in, divide up chunks
543 545 chunks = {}
544 546 for span in spans:
545 547 # we reopen the file for each span to make http happy for now
546 548 f = self.opener(self.datafile)
547 549 f.seek(span[0])
548 550 data = f.read(span[1])
549 551
550 552 # divide up the span
551 553 pos = 0
552 554 for r, l in span[2]:
553 555 chunks[r] = decompress(data[pos: pos + l])
554 556 pos += l
555 557
556 558 # helper to reconstruct intermediate versions
557 559 def construct(text, base, rev):
558 560 bins = [chunks[r] for r in xrange(base + 1, rev + 1)]
559 561 return mdiff.patches(text, bins)
560 562
561 563 # build deltas
562 564 deltas = []
563 565 for d in xrange(0, len(revs) - 1):
564 566 a, b = revs[d], revs[d + 1]
565 567 n = self.node(b)
566 568
567 569 # do we need to construct a new delta?
568 570 if a + 1 != b or self.base(b) == b:
569 571 if a >= 0:
570 572 base = self.base(a)
571 573 ta = chunks[self.base(a)]
572 574 ta = construct(ta, base, a)
573 575 else:
574 576 ta = ""
575 577
576 578 base = self.base(b)
577 579 if a > base:
578 580 base = a
579 581 tb = ta
580 582 else:
581 583 tb = chunks[self.base(b)]
582 584 tb = construct(tb, base, b)
583 585 d = self.diff(ta, tb)
584 586 else:
585 587 d = chunks[b]
586 588
587 589 p = self.parents(n)
588 590 meta = n + p[0] + p[1] + linkmap[self.linkrev(n)]
589 591 l = struct.pack(">l", len(meta) + len(d) + 4)
590 592 yield l
591 593 yield meta
592 594 yield d
593 595
594 596 yield struct.pack(">l", 0)
595 597
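The wire framing group() emits, reduced to its arithmetic: each chunk is a 4-byte big-endian length that counts itself, then 80 bytes of metadata (the node, both parents, and the changeset node from linkmap), then the delta; a bare zero length terminates the stream. frame is my name for the per-chunk step:

    import struct

    def frame(meta, delta):
        return struct.pack(">l", 4 + len(meta) + len(delta)) + meta + delta

    meta = b"N" * 20 + b"P" * 20 + b"Q" * 20 + b"L" * 20   # node, p1, p2, link
    stream = frame(meta, b"delta-bytes") + struct.pack(">l", 0)  # 0 = end marker
    assert struct.unpack(">l", stream[:4])[0] == 4 + 80 + 11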
596 598 def addgroup(self, revs, linkmapper, transaction, unique=0):
597 599 """
598 600 add a delta group
599 601
600 602 Given a set of deltas, add them to the revision log. The
601 603 first delta is against its parent, which should be in our
602 604 log; the rest are against the previous delta.
603 605 """
604 606
605 607 # track the base of the current delta log
606 608 r = self.count()
607 609 t = r - 1
608 610 node = nullid
609 611
610 612 base = prev = -1
611 613 start = end = measure = 0
612 614 if r:
613 615 start = self.start(self.base(t))
614 616 end = self.end(t)
615 617 measure = self.length(self.base(t))
616 618 base = self.base(t)
617 619 prev = self.tip()
618 620
619 621 transaction.add(self.datafile, end)
620 622 transaction.add(self.indexfile, r * struct.calcsize(indexformat))
621 623 dfh = self.opener(self.datafile, "a")
622 624 ifh = self.opener(self.indexfile, "a")
623 625
624 626 # loop through our set of deltas
625 627 chain = None
626 628 for chunk in revs:
627 629 node, p1, p2, cs = struct.unpack("20s20s20s20s", chunk[:80])
628 630 link = linkmapper(cs)
629 631 if node in self.nodemap:
630 632 # this can happen if two branches make the same change
631 633 # if unique:
632 634 # raise RevlogError(_("already have %s") % hex(node[:4]))
633 635 chain = node
634 636 continue
635 637 delta = chunk[80:]
636 638
637 639 if not chain:
638 640 # retrieve the parent revision of the delta chain
639 641 chain = p1
640 642 if not chain in self.nodemap:
641 643 raise RevlogError(_("unknown base %s") % short(chain[:4]))
642 644
643 645 # full versions are inserted when the needed deltas become
644 646 # comparable to the uncompressed text or when the previous
645 647 # version is not the one we have a delta against. We use
646 648 # the size of the previous full rev as a proxy for the
647 649 # current size.
648 650
649 651 if chain == prev:
650 652 cdelta = compress(delta)
651 653
652 654 if chain != prev or (end - start + len(cdelta)) > measure * 2:
653 655 # flush our writes here so we can read it in revision
654 656 dfh.flush()
655 657 ifh.flush()
656 658 text = self.revision(chain)
657 659 text = self.patches(text, [delta])
658 660 chk = self.addrevision(text, transaction, link, p1, p2)
659 661 if chk != node:
660 662 raise RevlogError(_("consistency error adding group"))
661 663 measure = len(text)
662 664 else:
663 665 e = (end, len(cdelta), self.base(t), link, p1, p2, node)
664 666 self.index.append(e)
665 667 self.nodemap[node] = r
666 668 dfh.write(cdelta)
667 669 ifh.write(struct.pack(indexformat, *e))
668 670
669 671 t, r, chain, prev = r, r + 1, node, node
670 672 start = self.start(self.base(t))
671 673 end = self.end(t)
672 674
673 675 dfh.close()
674 676 ifh.close()
675 677 return node
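addgroup() sees those chunks from the receiving side: the first 80 bytes split into four 20-byte nodeids and the remainder is the delta, mirroring the framing sketched after group(). The unpack in isolation, with made-up payload bytes:

    import struct

    chunk = b"N" * 20 + b"P" * 20 + b"Q" * 20 + b"C" * 20 + b"the-delta"
    node, p1, p2, cs = struct.unpack("20s20s20s20s", chunk[:80])
    delta = chunk[80:]
    assert node == b"N" * 20 and cs == b"C" * 20 and delta == b"the-delta"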