revlog: raise informative exception if file is missing.
Bryan O'Sullivan
r1201:59bfbdbc default
@@ -1,646 +1,650 @@
1 1 """
2 2 revlog.py - storage back-end for mercurial
3 3
4 4 This provides efficient delta storage with O(1) retrieve and append
5 5 and O(changes) merge between branches
6 6
7 7 Copyright 2005 Matt Mackall <mpm@selenic.com>
8 8
9 9 This software may be used and distributed according to the terms
10 10 of the GNU General Public License, incorporated herein by reference.
11 11 """
12 12
13 13 import zlib, struct, sha, binascii, heapq
14 14 import mdiff
15 15 from node import *
16 16
17 17 def hash(text, p1, p2):
18 18 """generate a hash from the given text and its parent hashes
19 19
20 20 This hash combines both the current file contents and its history
21 21 in a manner that makes it easy to distinguish nodes with the same
22 22 content in the revision graph.
23 23 """
24 24 l = [p1, p2]
25 25 l.sort()
26 26 s = sha.new(l[0])
27 27 s.update(l[1])
28 28 s.update(text)
29 29 return s.digest()
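# A minimal sketch of the property described above: identical text with
# different parents hashes differently, and parent order does not matter
# because the parents are sorted first. The parent ids here are made-up
# 20-byte strings, not real nodes.
def _hash_demo():
    p, q = '\x01' * 20, '\x02' * 20
    text = "identical contents"
    assert hash(text, p, p) != hash(text, q, q)
    assert hash(text, p, q) == hash(text, q, p)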
30 30
31 31 def compress(text):
32 32 """ generate a possibly-compressed representation of text """
33 33 if not text: return text
34 34 if len(text) < 44:
35 35 if text[0] == '\0': return text
36 36 return 'u' + text
37 37 bin = zlib.compress(text)
38 38 if len(bin) > len(text):
39 39 if text[0] == '\0': return text
40 40 return 'u' + text
41 41 return bin
42 42
43 43 def decompress(bin):
44 44 """ decompress the given input """
45 45 if not bin: return bin
46 46 t = bin[0]
47 47 if t == '\0': return bin
48 48 if t == 'x': return zlib.decompress(bin)
49 49 if t == 'u': return bin[1:]
50 50 raise RevlogError("unknown compression type %s" % t)
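# A quick sketch of the round-trip these two functions guarantee: short or
# incompressible text is stored behind a 'u' marker (or untouched when it
# starts with '\0'), longer text goes through zlib, and decompress()
# reverses either path.
def _compress_demo():
    for text in ("", "short text", "a" * 200):
        assert decompress(compress(text)) == text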
51 51
52 52 indexformat = ">4l20s20s20s"
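# For reference, indexformat packs, in order: offset, size, base, and
# linkrev as four big-endian 32-bit ints, then p1, p2, and nodeid as three
# 20-byte binary hashes, 76 bytes per index entry.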
53 53
54 54 class lazyparser:
55 55 """
56 56 this class avoids the need to parse the entirety of large indices
57 57
58 58 By default we parse and load 1000 entries at a time.
59 59
60 60 If no position is specified, we load the whole index, and replace
61 61 the lazy objects in revlog with the underlying objects for
62 62 efficiency in cases where we look at most of the nodes.
63 63 """
64 64 def __init__(self, data, revlog):
65 65 self.data = data
66 66 self.s = struct.calcsize(indexformat)
67 67 self.l = len(data)/self.s
68 68 self.index = [None] * self.l
69 69 self.map = {nullid: -1}
70 70 self.all = 0
71 71 self.revlog = revlog
72 72
73 73 def load(self, pos=None):
74 74 if self.all: return
75 75 if pos is not None:
76 76 block = pos / 1000
77 77 i = block * 1000
78 78 end = min(self.l, i + 1000)
79 79 else:
80 80 self.all = 1
81 81 i = 0
82 82 end = self.l
83 83 self.revlog.index = self.index
84 84 self.revlog.nodemap = self.map
85 85
86 86 while i < end:
87 87 d = self.data[i * self.s: (i + 1) * self.s]
88 88 e = struct.unpack(indexformat, d)
89 89 self.index[i] = e
90 90 self.map[e[6]] = i
91 91 i += 1
92 92
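# Worked example of the block math above: load(2500) parses only block 2,
# entries 2000 through min(self.l, 3000) - 1, and leaves the rest of the
# index as lazy placeholders.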
93 93 class lazyindex:
94 94 """a lazy version of the index array"""
95 95 def __init__(self, parser):
96 96 self.p = parser
97 97 def __len__(self):
98 98 return len(self.p.index)
99 99 def load(self, pos):
100 100 self.p.load(pos)
101 101 return self.p.index[pos]
102 102 def __getitem__(self, pos):
103 103 return self.p.index[pos] or self.load(pos)
104 104 def append(self, e):
105 105 self.p.index.append(e)
106 106
107 107 class lazymap:
108 108 """a lazy version of the node map"""
109 109 def __init__(self, parser):
110 110 self.p = parser
111 111 def load(self, key):
112 112 if self.p.all: return
113 113 n = self.p.data.find(key)
114 114 if n < 0: raise KeyError("node " + hex(key))
115 115 pos = n / self.p.s
116 116 self.p.load(pos)
117 117 def __contains__(self, key):
118 118 self.p.load()
119 119 return key in self.p.map
120 120 def __iter__(self):
121 121 yield nullid
122 122 for i in xrange(self.p.l):
123 123 try:
124 124 yield self.p.index[i][6]
125 125 except:
126 126 self.p.load(i)
127 127 yield self.p.index[i][6]
128 128 def __getitem__(self, key):
129 129 try:
130 130 return self.p.map[key]
131 131 except KeyError:
132 132 try:
133 133 self.load(key)
134 134 return self.p.map[key]
135 135 except KeyError:
136 136 raise KeyError("node " + hex(key))
137 137 def __setitem__(self, key, val):
138 138 self.p.map[key] = val
139 139
140 140 class RevlogError(Exception): pass
141 141
142 142 class revlog:
143 143 """
144 144 the underlying revision storage object
145 145
146 146 A revlog consists of two parts, an index and the revision data.
147 147
148 148 The index is a file with a fixed record size containing
149 149 information on each revision, including its nodeid (hash), the
150 150 nodeids of its parents, the position and offset of its data within
151 151 the data file, and the revision it's based on. Finally, each entry
152 152 contains a linkrev entry that can serve as a pointer to external
153 153 data.
154 154
155 155 The revision data itself is a linear collection of data chunks.
156 156 Each chunk represents a revision and is usually represented as a
157 157 delta against the previous chunk. To bound lookup time, runs of
158 158 deltas are limited to about 2 times the length of the original
159 159 version data. This makes retrieval of a version proportional to
160 160 its size, or O(1) relative to the number of revisions.
161 161
162 162 Both pieces of the revlog are written to in an append-only
163 163 fashion, which means we never need to rewrite a file to insert or
164 164 remove data, and can use some simple techniques to avoid the need
165 165 for locking while reading.
166 166 """
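# Worked example of the bound above: for a revision whose text is 1 KB,
# the delta run is cut off once the base-to-tip span would exceed about
# 2 KB and a full version is stored instead, so reconstructing any
# revision reads an amount of data proportional to its own size.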
167 167 def __init__(self, opener, indexfile, datafile):
168 168 """
169 169 create a revlog object
170 170
171 171 opener is a function that abstracts the file opening operation
172 172 and can be used to implement COW semantics or the like.
173 173 """
174 174 self.indexfile = indexfile
175 175 self.datafile = datafile
176 176 self.opener = opener
177 177 self.cache = None
178 178
179 179 try:
180 180 i = self.opener(self.indexfile).read()
181 181 except IOError:
182 182 i = ""
183 183
184 184 if len(i) > 10000:
185 185 # big index, let's parse it on demand
186 186 parser = lazyparser(i, self)
187 187 self.index = lazyindex(parser)
188 188 self.nodemap = lazymap(parser)
189 189 else:
190 190 s = struct.calcsize(indexformat)
191 191 l = len(i) / s
192 192 self.index = [None] * l
193 193 m = [None] * l
194 194
195 195 n = 0
196 196 for f in xrange(0, len(i), s):
197 197 # offset, size, base, linkrev, p1, p2, nodeid
198 198 e = struct.unpack(indexformat, i[f:f + s])
199 199 m[n] = (e[6], n)
200 200 self.index[n] = e
201 201 n += 1
202 202
203 203 self.nodemap = dict(m)
204 204 self.nodemap[nullid] = -1
205 205
206 206 def tip(self): return self.node(len(self.index) - 1)
207 207 def count(self): return len(self.index)
208 208 def node(self, rev): return (rev < 0) and nullid or self.index[rev][6]
209 def rev(self, node): return self.nodemap[node]
210 def linkrev(self, node): return self.index[self.nodemap[node]][3]
209 def rev(self, node):
210 try:
211 return self.nodemap[node]
212 except KeyError:
213 raise KeyError('%s: no node %s' % (self.indexfile, hex(node)))
214 def linkrev(self, node): return self.index[self.rev(node)][3]
211 215 def parents(self, node):
212 216 if node == nullid: return (nullid, nullid)
213 return self.index[self.nodemap[node]][4:6]
217 return self.index[self.rev(node)][4:6]
214 218
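# With the change above, a lookup of a node that is not in this revlog
# now raises KeyError('<indexfile>: no node <hex nodeid>'), naming the
# file being searched instead of surfacing a bare KeyError from the
# nodemap.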
215 219 def start(self, rev): return self.index[rev][0]
216 220 def length(self, rev): return self.index[rev][1]
217 221 def end(self, rev): return self.start(rev) + self.length(rev)
218 222 def base(self, rev): return self.index[rev][2]
219 223
220 224 def reachable(self, rev, stop=None):
221 225 reachable = {}
222 226 visit = [rev]
223 227 reachable[rev] = 1
224 228 if stop:
225 229 stopn = self.rev(stop)
226 230 else:
227 231 stopn = 0
228 232 while visit:
229 233 n = visit.pop(0)
230 234 if n == stop:
231 235 continue
232 236 if n == nullid:
233 237 continue
234 238 for p in self.parents(n):
235 239 if self.rev(p) < stopn:
236 240 continue
237 241 if p not in reachable:
238 242 reachable[p] = 1
239 243 visit.append(p)
240 244 return reachable
241 245
242 246 def heads(self, stop=None):
243 247 """return the list of all nodes that have no children"""
244 248 p = {}
245 249 h = []
246 250 stoprev = 0
247 251 if stop and stop in self.nodemap:
248 252 stoprev = self.rev(stop)
249 253
250 254 for r in range(self.count() - 1, -1, -1):
251 255 n = self.node(r)
252 256 if n not in p:
253 257 h.append(n)
254 258 if n == stop:
255 259 break
256 260 if r < stoprev:
257 261 break
258 262 for pn in self.parents(n):
259 263 p[pn] = 1
260 264 return h
261 265
262 266 def children(self, node):
263 267 """find the children of a given node"""
264 268 c = []
265 269 p = self.rev(node)
266 270 for r in range(p + 1, self.count()):
267 271 n = self.node(r)
268 272 for pn in self.parents(n):
269 273 if pn == node:
270 274 c.append(n)
271 275 continue
272 276 elif pn == nullid:
273 277 continue
274 278 return c
275 279
276 280 def lookup(self, id):
277 281 """locate a node based on revision number or subset of hex nodeid"""
278 282 try:
279 283 rev = int(id)
280 284 if str(rev) != id: raise ValueError
281 285 if rev < 0: rev = self.count() + rev
282 286 if rev < 0 or rev >= self.count(): raise ValueError
283 287 return self.node(rev)
284 288 except (ValueError, OverflowError):
285 289 c = []
286 290 for n in self.nodemap:
287 291 if hex(n).startswith(id):
288 292 c.append(n)
289 293 if len(c) > 1: raise KeyError("Ambiguous identifier")
290 294 if len(c) < 1: raise KeyError("No match found")
291 295 return c[0]
292 296
293 297 return None
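# For example, lookup("-1") resolves to the tip through negative
# indexing, lookup("0") to the first revision, and a hex prefix that
# matches exactly one nodeid resolves to that node; ambiguous or
# unmatched prefixes raise KeyError.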
294 298
295 299 def diff(self, a, b):
296 300 """return a delta between two revisions"""
297 301 return mdiff.textdiff(a, b)
298 302
299 303 def patches(self, t, pl):
300 304 """apply a list of patches to a string"""
301 305 return mdiff.patches(t, pl)
302 306
303 307 def delta(self, node):
304 308 """return or calculate a delta between a node and its predecessor"""
305 309 r = self.rev(node)
306 310 b = self.base(r)
307 311 if r == b:
308 312 return self.diff(self.revision(self.node(r - 1)),
309 313 self.revision(node))
310 314 else:
311 315 f = self.opener(self.datafile)
312 316 f.seek(self.start(r))
313 317 data = f.read(self.length(r))
314 318 return decompress(data)
315 319
316 320 def revision(self, node):
317 321 """return an uncompressed revision of a given node"""
318 322 if node == nullid: return ""
319 323 if self.cache and self.cache[0] == node: return self.cache[2]
320 324
321 325 # look up what we need to read
322 326 text = None
323 327 rev = self.rev(node)
324 328 start, length, base, link, p1, p2, node = self.index[rev]
325 329 end = start + length
326 330 if base != rev: start = self.start(base)
327 331
328 332 # do we have useful data cached?
329 333 if self.cache and self.cache[1] >= base and self.cache[1] < rev:
330 334 base = self.cache[1]
331 335 start = self.start(base + 1)
332 336 text = self.cache[2]
333 337 last = 0
334 338
335 339 f = self.opener(self.datafile)
336 340 f.seek(start)
337 341 data = f.read(end - start)
338 342
339 343 if text is None:
340 344 last = self.length(base)
341 345 text = decompress(data[:last])
342 346
343 347 bins = []
344 348 for r in xrange(base + 1, rev + 1):
345 349 s = self.length(r)
346 350 bins.append(decompress(data[last:last + s]))
347 351 last = last + s
348 352
349 353 text = mdiff.patches(text, bins)
350 354
351 355 if node != hash(text, p1, p2):
352 356 raise IOError("integrity check failed on %s:%d"
353 357 % (self.datafile, rev))
354 358
355 359 self.cache = (node, rev, text)
356 360 return text
357 361
358 362 def addrevision(self, text, transaction, link, p1=None, p2=None, d=None):
359 363 """add a revision to the log
360 364
361 365 text - the revision data to add
362 366 transaction - the transaction object used for rollback
363 367 link - the linkrev data to add
364 368 p1, p2 - the parent nodeids of the revision
365 369 d - an optional precomputed delta
366 370 """
367 371 if text is None: text = ""
368 372 if p1 is None: p1 = self.tip()
369 373 if p2 is None: p2 = nullid
370 374
371 375 node = hash(text, p1, p2)
372 376
373 377 if node in self.nodemap:
374 378 return node
375 379
376 380 n = self.count()
377 381 t = n - 1
378 382
379 383 if n:
380 384 base = self.base(t)
381 385 start = self.start(base)
382 386 end = self.end(t)
383 387 if not d:
384 388 prev = self.revision(self.tip())
385 389 d = self.diff(prev, text)
386 390 data = compress(d)
387 391 dist = end - start + len(data)
388 392
389 393 # full versions are inserted when the needed deltas
390 394 # become comparable to the uncompressed text
391 395 if not n or dist > len(text) * 2:
392 396 data = compress(text)
393 397 base = n
394 398 else:
395 399 base = self.base(t)
396 400
397 401 offset = 0
398 402 if t >= 0:
399 403 offset = self.end(t)
400 404
401 405 e = (offset, len(data), base, link, p1, p2, node)
402 406
403 407 self.index.append(e)
404 408 self.nodemap[node] = n
405 409 entry = struct.pack(indexformat, *e)
406 410
407 411 transaction.add(self.datafile, e[0])
408 412 self.opener(self.datafile, "a").write(data)
409 413 transaction.add(self.indexfile, n * len(entry))
410 414 self.opener(self.indexfile, "a").write(entry)
411 415
412 416 self.cache = (node, n, text)
413 417 return node
414 418
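# Worked example of the rule above: for a 100-byte text whose delta
# chain (existing chain data plus the new compressed delta) would span
# more than 200 bytes, the full text is stored and base resets to this
# revision; otherwise only the delta is appended to the chain.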
415 419 def ancestor(self, a, b):
416 420 """calculate the least common ancestor of nodes a and b"""
417 421 # calculate the distance of every node from root
418 422 dist = {nullid: 0}
419 423 for i in xrange(self.count()):
420 424 n = self.node(i)
421 425 p1, p2 = self.parents(n)
422 426 dist[n] = max(dist[p1], dist[p2]) + 1
423 427
424 428 # traverse ancestors in order of decreasing distance from root
425 429 def ancestors(node):
426 430 # we store negative distances because heap returns smallest member
427 431 h = [(-dist[node], node)]
428 432 seen = {}
429 433 earliest = self.count()
430 434 while h:
431 435 d, n = heapq.heappop(h)
432 436 if n not in seen:
433 437 seen[n] = 1
434 438 r = self.rev(n)
435 439 yield (-d, r, n)
436 440 for p in self.parents(n):
437 441 heapq.heappush(h, (-dist[p], p))
438 442
439 443 x = ancestors(a)
440 444 y = ancestors(b)
441 445 lx = x.next()
442 446 ly = y.next()
443 447
444 448 # increment each ancestor list until it is closer to root than
445 449 # the other, or they match
446 450 while 1:
447 451 if lx == ly:
448 452 return lx[2]
449 453 elif lx < ly:
450 454 ly = y.next()
451 455 elif lx > ly:
452 456 lx = x.next()
453 457
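# Worked example of the loop above: both generators yield ancestors in
# decreasing distance from the root, so whichever head is farther from
# the root is advanced until the two heads coincide; the first triple
# they agree on is the least common ancestor.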
454 458 def group(self, linkmap):
455 459 """calculate a delta group
456 460
457 461 Given a list of changeset revs, return a set of deltas and
458 462 metadata corresponding to nodes. the first delta is
459 463 parent(nodes[0]) -> nodes[0] the receiver is guaranteed to
460 464 have this parent as it has all history before these
461 465 changesets. parent is parent[0]
462 466 """
463 467 revs = []
464 468 needed = {}
465 469
466 470 # find file nodes/revs that match changeset revs
467 471 for i in xrange(0, self.count()):
468 472 if self.index[i][3] in linkmap:
469 473 revs.append(i)
470 474 needed[i] = 1
471 475
472 476 # if we don't have any revisions touched by these changesets, bail
473 477 if not revs:
474 478 yield struct.pack(">l", 0)
475 479 return
476 480
477 481 # add the parent of the first rev
478 482 p = self.parents(self.node(revs[0]))[0]
479 483 revs.insert(0, self.rev(p))
480 484
481 485 # for each delta that isn't contiguous in the log, we need to
482 486 # reconstruct the base, reconstruct the result, and then
483 487 # calculate the delta. We also need to do this where we've
484 488 # stored a full version and not a delta
485 489 for i in xrange(0, len(revs) - 1):
486 490 a, b = revs[i], revs[i + 1]
487 491 if a + 1 != b or self.base(b) == b:
488 492 for j in xrange(self.base(a), a + 1):
489 493 needed[j] = 1
490 494 for j in xrange(self.base(b), b + 1):
491 495 needed[j] = 1
492 496
493 497 # calculate spans to retrieve from datafile
494 498 needed = needed.keys()
495 499 needed.sort()
496 500 spans = []
497 501 oo = -1
498 502 ol = 0
499 503 for n in needed:
500 504 if n < 0: continue
501 505 o = self.start(n)
502 506 l = self.length(n)
503 507 if oo + ol == o: # can we merge with the previous?
504 508 nl = spans[-1][2]
505 509 nl.append((n, l))
506 510 ol += l
507 511 spans[-1] = (oo, ol, nl)
508 512 else:
509 513 oo = o
510 514 ol = l
511 515 spans.append((oo, ol, [(n, l)]))
512 516
513 517 # read spans in, divide up chunks
514 518 chunks = {}
515 519 for span in spans:
516 520 # we reopen the file for each span to make http happy for now
517 521 f = self.opener(self.datafile)
518 522 f.seek(span[0])
519 523 data = f.read(span[1])
520 524
521 525 # divide up the span
522 526 pos = 0
523 527 for r, l in span[2]:
524 528 chunks[r] = decompress(data[pos: pos + l])
525 529 pos += l
526 530
527 531 # helper to reconstruct intermediate versions
528 532 def construct(text, base, rev):
529 533 bins = [chunks[r] for r in xrange(base + 1, rev + 1)]
530 534 return mdiff.patches(text, bins)
531 535
532 536 # build deltas
533 537 deltas = []
534 538 for d in xrange(0, len(revs) - 1):
535 539 a, b = revs[d], revs[d + 1]
536 540 n = self.node(b)
537 541
538 542 # do we need to construct a new delta?
539 543 if a + 1 != b or self.base(b) == b:
540 544 if a >= 0:
541 545 base = self.base(a)
542 546 ta = chunks[self.base(a)]
543 547 ta = construct(ta, base, a)
544 548 else:
545 549 ta = ""
546 550
547 551 base = self.base(b)
548 552 if a > base:
549 553 base = a
550 554 tb = ta
551 555 else:
552 556 tb = chunks[self.base(b)]
553 557 tb = construct(tb, base, b)
554 558 d = self.diff(ta, tb)
555 559 else:
556 560 d = chunks[b]
557 561
558 562 p = self.parents(n)
559 563 meta = n + p[0] + p[1] + linkmap[self.linkrev(n)]
560 564 l = struct.pack(">l", len(meta) + len(d) + 4)
561 565 yield l
562 566 yield meta
563 567 yield d
564 568
565 569 yield struct.pack(">l", 0)
566 570
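# The stream yielded above is a sequence of length-prefixed chunks: a
# 4-byte big-endian length that counts itself, 80 bytes of metadata
# (node, p1, p2, and the linked changeset node, 20 bytes each), then
# the delta; a zero length ends the stream, and addgroup() below
# unpacks exactly this layout.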
567 571 def addgroup(self, revs, linkmapper, transaction, unique=0):
568 572 """
569 573 add a delta group
570 574
571 575 given a set of deltas, add them to the revision log. the
572 576 first delta is against its parent, which should be in our
573 577 log, the rest are against the previous delta.
574 578 """
575 579
576 580 # track the base of the current delta log
577 581 r = self.count()
578 582 t = r - 1
579 583 node = nullid
580 584
581 585 base = prev = -1
582 586 start = end = measure = 0
583 587 if r:
584 588 start = self.start(self.base(t))
585 589 end = self.end(t)
586 590 measure = self.length(self.base(t))
587 591 base = self.base(t)
588 592 prev = self.tip()
589 593
590 594 transaction.add(self.datafile, end)
591 595 transaction.add(self.indexfile, r * struct.calcsize(indexformat))
592 596 dfh = self.opener(self.datafile, "a")
593 597 ifh = self.opener(self.indexfile, "a")
594 598
595 599 # loop through our set of deltas
596 600 chain = None
597 601 for chunk in revs:
598 602 node, p1, p2, cs = struct.unpack("20s20s20s20s", chunk[:80])
599 603 link = linkmapper(cs)
600 604 if node in self.nodemap:
601 605 # this can happen if two branches make the same change
602 606 if unique:
603 607 raise RevlogError("already have %s" % hex(node[:4]))
604 608 chain = node
605 609 continue
606 610 delta = chunk[80:]
607 611
608 612 if not chain:
609 613 # retrieve the parent revision of the delta chain
610 614 chain = p1
611 615 if not chain in self.nodemap:
612 616 raise RevlogError("unknown base %s" % short(chain[:4]))
613 617
614 618 # full versions are inserted when the needed deltas become
615 619 # comparable to the uncompressed text or when the previous
616 620 # version is not the one we have a delta against. We use
617 621 # the size of the previous full rev as a proxy for the
618 622 # current size.
619 623
620 624 if chain == prev:
621 625 cdelta = compress(delta)
622 626
623 627 if chain != prev or (end - start + len(cdelta)) > measure * 2:
624 628 # flush our writes here so we can read it in revision
625 629 dfh.flush()
626 630 ifh.flush()
627 631 text = self.revision(chain)
628 632 text = self.patches(text, [delta])
629 633 chk = self.addrevision(text, transaction, link, p1, p2)
630 634 if chk != node:
631 635 raise RevlogError("consistency error adding group")
632 636 measure = len(text)
633 637 else:
634 638 e = (end, len(cdelta), self.base(t), link, p1, p2, node)
635 639 self.index.append(e)
636 640 self.nodemap[node] = r
637 641 dfh.write(cdelta)
638 642 ifh.write(struct.pack(indexformat, *e))
639 643
640 644 t, r, chain, prev = r, r + 1, node, node
641 645 start = self.start(self.base(t))
642 646 end = self.end(t)
643 647
644 648 dfh.close()
645 649 ifh.close()
646 650 return node
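# A minimal usage sketch with invented names: "opener" here is a plain
# filesystem opener and "faketransaction" a stub satisfying the
# add(file, offset) rollback interface that addrevision() expects; real
# callers pass Mercurial's own opener and transaction objects.
def _usage_demo(path="/tmp/revlog-demo"):
    import os
    if not os.path.isdir(path):
        os.makedirs(path)
    def opener(name, mode="r"):
        return open(os.path.join(path, name), mode)
    class faketransaction:
        def add(self, file, offset): pass
    r = revlog(opener, "demo.i", "demo.d")
    n = r.addrevision("hello\n", faketransaction(), 0)
    assert r.revision(n) == "hello\n"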