Make revlog constructor more discerning in its treatment of errors.
Bryan O'Sullivan
r1322:b3d44e9b default
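The substance of this change is the constructor's read of the index file: where any IOError was previously swallowed and treated as an empty index, the new code re-raises urllib2.URLError from a remote opener, re-raises any IOError whose errno is not ENOENT, and treats only a genuinely missing index file as a new, empty revlog. Below is a minimal sketch of that pattern lifted out of the constructor; the read_index helper name is hypothetical, while opener and indexfile behave as in revlog.__init__.

    import errno
    import urllib2

    def read_index(opener, indexfile):
        # hypothetical helper mirroring the constructor's new error handling
        try:
            return opener(indexfile).read()
        except urllib2.URLError:
            # a remote (e.g. http) opener failed outright; this is never
            # "no index yet", so let the error propagate
            raise
        except IOError, inst:
            if inst.errno != errno.ENOENT:
                # permission problems, disk faults, etc. must not be masked
                raise
            # a brand-new revlog simply has no index file; treat it as empty
            return ""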
@@ -1,651 +1,655 @@
1 1 """
2 2 revlog.py - storage back-end for mercurial
3 3
4 4 This provides efficient delta storage with O(1) retrieve and append
5 5 and O(changes) merge between branches
6 6
7 7 Copyright 2005 Matt Mackall <mpm@selenic.com>
8 8
9 9 This software may be used and distributed according to the terms
10 10 of the GNU General Public License, incorporated herein by reference.
11 11 """
12 12
13 import zlib, struct, sha, binascii, heapq
14 import mdiff
15 13 from node import *
14 from demandload import demandload
15 demandload(globals(), "binascii errno heapq mdiff sha struct urllib2 zlib")
16 16
17 17 def hash(text, p1, p2):
18 18 """generate a hash from the given text and its parent hashes
19 19
20 20 This hash combines both the current file contents and its history
21 21 in a manner that makes it easy to distinguish nodes with the same
22 22 content in the revision graph.
23 23 """
24 24 l = [p1, p2]
25 25 l.sort()
26 26 s = sha.new(l[0])
27 27 s.update(l[1])
28 28 s.update(text)
29 29 return s.digest()
30 30
31 31 def compress(text):
32 32 """ generate a possibly-compressed representation of text """
33 33 if not text: return text
34 34 if len(text) < 44:
35 35 if text[0] == '\0': return text
36 36 return 'u' + text
37 37 bin = zlib.compress(text)
38 38 if len(bin) > len(text):
39 39 if text[0] == '\0': return text
40 40 return 'u' + text
41 41 return bin
42 42
43 43 def decompress(bin):
44 44 """ decompress the given input """
45 45 if not bin: return bin
46 46 t = bin[0]
47 47 if t == '\0': return bin
48 48 if t == 'x': return zlib.decompress(bin)
49 49 if t == 'u': return bin[1:]
50 50 raise RevlogError("unknown compression type %s" % t)
51 51
52 52 indexformat = ">4l20s20s20s"
53 53
54 54 class lazyparser:
55 55 """
56 56 this class avoids the need to parse the entirety of large indices
57 57
58 58 By default we parse and load 1000 entries at a time.
59 59
60 60 If no position is specified, we load the whole index, and replace
61 61 the lazy objects in revlog with the underlying objects for
62 62 efficiency in cases where we look at most of the nodes.
63 63 """
64 64 def __init__(self, data, revlog):
65 65 self.data = data
66 66 self.s = struct.calcsize(indexformat)
67 67 self.l = len(data)/self.s
68 68 self.index = [None] * self.l
69 69 self.map = {nullid: -1}
70 70 self.all = 0
71 71 self.revlog = revlog
72 72
73 73 def load(self, pos=None):
74 74 if self.all: return
75 75 if pos is not None:
76 76 block = pos / 1000
77 77 i = block * 1000
78 78 end = min(self.l, i + 1000)
79 79 else:
80 80 self.all = 1
81 81 i = 0
82 82 end = self.l
83 83 self.revlog.index = self.index
84 84 self.revlog.nodemap = self.map
85 85
86 86 while i < end:
87 87 d = self.data[i * self.s: (i + 1) * self.s]
88 88 e = struct.unpack(indexformat, d)
89 89 self.index[i] = e
90 90 self.map[e[6]] = i
91 91 i += 1
92 92
93 93 class lazyindex:
94 94 """a lazy version of the index array"""
95 95 def __init__(self, parser):
96 96 self.p = parser
97 97 def __len__(self):
98 98 return len(self.p.index)
99 99 def load(self, pos):
100 100 self.p.load(pos)
101 101 return self.p.index[pos]
102 102 def __getitem__(self, pos):
103 103 return self.p.index[pos] or self.load(pos)
104 104 def append(self, e):
105 105 self.p.index.append(e)
106 106
107 107 class lazymap:
108 108 """a lazy version of the node map"""
109 109 def __init__(self, parser):
110 110 self.p = parser
111 111 def load(self, key):
112 112 if self.p.all: return
113 113 n = self.p.data.find(key)
114 114 if n < 0:
115 115 raise KeyError(key)
116 116 pos = n / self.p.s
117 117 self.p.load(pos)
118 118 def __contains__(self, key):
119 119 self.p.load()
120 120 return key in self.p.map
121 121 def __iter__(self):
122 122 yield nullid
123 123 for i in xrange(self.p.l):
124 124 try:
125 125 yield self.p.index[i][6]
126 126 except:
127 127 self.p.load(i)
128 128 yield self.p.index[i][6]
129 129 def __getitem__(self, key):
130 130 try:
131 131 return self.p.map[key]
132 132 except KeyError:
133 133 try:
134 134 self.load(key)
135 135 return self.p.map[key]
136 136 except KeyError:
137 137 raise KeyError("node " + hex(key))
138 138 def __setitem__(self, key, val):
139 139 self.p.map[key] = val
140 140
141 141 class RevlogError(Exception): pass
142 142
143 143 class revlog:
144 144 """
145 145 the underlying revision storage object
146 146
147 147 A revlog consists of two parts, an index and the revision data.
148 148
149 149 The index is a file with a fixed record size containing
150 150 information on each revision, including its nodeid (hash), the
151 151 nodeids of its parents, the position and offset of its data within
152 152 the data file, and the revision it's based on. Finally, each entry
153 153 contains a linkrev entry that can serve as a pointer to external
154 154 data.
155 155
156 156 The revision data itself is a linear collection of data chunks.
157 157 Each chunk represents a revision and is usually represented as a
158 158 delta against the previous chunk. To bound lookup time, runs of
159 159 deltas are limited to about 2 times the length of the original
160 160 version data. This makes retrieval of a version proportional to
161 161 its size, or O(1) relative to the number of revisions.
162 162
163 163 Both pieces of the revlog are written to in an append-only
164 164 fashion, which means we never need to rewrite a file to insert or
165 165 remove data, and can use some simple techniques to avoid the need
166 166 for locking while reading.
167 167 """
168 168 def __init__(self, opener, indexfile, datafile):
169 169 """
170 170 create a revlog object
171 171
172 172 opener is a function that abstracts the file opening operation
173 173 and can be used to implement COW semantics or the like.
174 174 """
175 175 self.indexfile = indexfile
176 176 self.datafile = datafile
177 177 self.opener = opener
178 178 self.cache = None
179 179
180 180 try:
181 181 i = self.opener(self.indexfile).read()
182 except IOError:
182 except urllib2.URLError:
183 raise
184 except IOError, inst:
185 if inst.errno != errno.ENOENT:
186 raise
183 187 i = ""
184 188
185 189 if len(i) > 10000:
186 190 # big index, let's parse it on demand
187 191 parser = lazyparser(i, self)
188 192 self.index = lazyindex(parser)
189 193 self.nodemap = lazymap(parser)
190 194 else:
191 195 s = struct.calcsize(indexformat)
192 196 l = len(i) / s
193 197 self.index = [None] * l
194 198 m = [None] * l
195 199
196 200 n = 0
197 201 for f in xrange(0, len(i), s):
198 202 # offset, size, base, linkrev, p1, p2, nodeid
199 203 e = struct.unpack(indexformat, i[f:f + s])
200 204 m[n] = (e[6], n)
201 205 self.index[n] = e
202 206 n += 1
203 207
204 208 self.nodemap = dict(m)
205 209 self.nodemap[nullid] = -1
206 210
207 211 def tip(self): return self.node(len(self.index) - 1)
208 212 def count(self): return len(self.index)
209 213 def node(self, rev): return (rev < 0) and nullid or self.index[rev][6]
210 214 def rev(self, node):
211 215 try:
212 216 return self.nodemap[node]
213 217 except KeyError:
214 218 raise RevlogError('%s: no node %s' % (self.indexfile, hex(node)))
215 219 def linkrev(self, node): return self.index[self.rev(node)][3]
216 220 def parents(self, node):
217 221 if node == nullid: return (nullid, nullid)
218 222 return self.index[self.rev(node)][4:6]
219 223
220 224 def start(self, rev): return self.index[rev][0]
221 225 def length(self, rev): return self.index[rev][1]
222 226 def end(self, rev): return self.start(rev) + self.length(rev)
223 227 def base(self, rev): return self.index[rev][2]
224 228
225 229 def reachable(self, rev, stop=None):
226 230 reachable = {}
227 231 visit = [rev]
228 232 reachable[rev] = 1
229 233 if stop:
230 234 stopn = self.rev(stop)
231 235 else:
232 236 stopn = 0
233 237 while visit:
234 238 n = visit.pop(0)
235 239 if n == stop:
236 240 continue
237 241 if n == nullid:
238 242 continue
239 243 for p in self.parents(n):
240 244 if self.rev(p) < stopn:
241 245 continue
242 246 if p not in reachable:
243 247 reachable[p] = 1
244 248 visit.append(p)
245 249 return reachable
246 250
247 251 def heads(self, stop=None):
248 252 """return the list of all nodes that have no children"""
249 253 p = {}
250 254 h = []
251 255 stoprev = 0
252 256 if stop and stop in self.nodemap:
253 257 stoprev = self.rev(stop)
254 258
255 259 for r in range(self.count() - 1, -1, -1):
256 260 n = self.node(r)
257 261 if n not in p:
258 262 h.append(n)
259 263 if n == stop:
260 264 break
261 265 if r < stoprev:
262 266 break
263 267 for pn in self.parents(n):
264 268 p[pn] = 1
265 269 return h
266 270
267 271 def children(self, node):
268 272 """find the children of a given node"""
269 273 c = []
270 274 p = self.rev(node)
271 275 for r in range(p + 1, self.count()):
272 276 n = self.node(r)
273 277 for pn in self.parents(n):
274 278 if pn == node:
275 279 c.append(n)
276 280 continue
277 281 elif pn == nullid:
278 282 continue
279 283 return c
280 284
281 285 def lookup(self, id):
282 286 """locate a node based on revision number or subset of hex nodeid"""
283 287 try:
284 288 rev = int(id)
285 289 if str(rev) != id: raise ValueError
286 290 if rev < 0: rev = self.count() + rev
287 291 if rev < 0 or rev >= self.count(): raise ValueError
288 292 return self.node(rev)
289 293 except (ValueError, OverflowError):
290 294 c = []
291 295 for n in self.nodemap:
292 296 if hex(n).startswith(id):
293 297 c.append(n)
294 298 if len(c) > 1: raise KeyError("Ambiguous identifier")
295 299 if len(c) < 1: raise KeyError("No match found")
296 300 return c[0]
297 301
298 302 return None
299 303
300 304 def diff(self, a, b):
301 305 """return a delta between two revisions"""
302 306 return mdiff.textdiff(a, b)
303 307
304 308 def patches(self, t, pl):
305 309 """apply a list of patches to a string"""
306 310 return mdiff.patches(t, pl)
307 311
308 312 def delta(self, node):
309 313 """return or calculate a delta between a node and its predecessor"""
310 314 r = self.rev(node)
311 315 b = self.base(r)
312 316 if r == b:
313 317 return self.diff(self.revision(self.node(r - 1)),
314 318 self.revision(node))
315 319 else:
316 320 f = self.opener(self.datafile)
317 321 f.seek(self.start(r))
318 322 data = f.read(self.length(r))
319 323 return decompress(data)
320 324
321 325 def revision(self, node):
322 326 """return an uncompressed revision of a given"""
323 327 if node == nullid: return ""
324 328 if self.cache and self.cache[0] == node: return self.cache[2]
325 329
326 330 # look up what we need to read
327 331 text = None
328 332 rev = self.rev(node)
329 333 start, length, base, link, p1, p2, node = self.index[rev]
330 334 end = start + length
331 335 if base != rev: start = self.start(base)
332 336
333 337 # do we have useful data cached?
334 338 if self.cache and self.cache[1] >= base and self.cache[1] < rev:
335 339 base = self.cache[1]
336 340 start = self.start(base + 1)
337 341 text = self.cache[2]
338 342 last = 0
339 343
340 344 f = self.opener(self.datafile)
341 345 f.seek(start)
342 346 data = f.read(end - start)
343 347
344 348 if text is None:
345 349 last = self.length(base)
346 350 text = decompress(data[:last])
347 351
348 352 bins = []
349 353 for r in xrange(base + 1, rev + 1):
350 354 s = self.length(r)
351 355 bins.append(decompress(data[last:last + s]))
352 356 last = last + s
353 357
354 358 text = mdiff.patches(text, bins)
355 359
356 360 if node != hash(text, p1, p2):
357 361 raise RevlogError("integrity check failed on %s:%d"
358 362 % (self.datafile, rev))
359 363
360 364 self.cache = (node, rev, text)
361 365 return text
362 366
363 367 def addrevision(self, text, transaction, link, p1=None, p2=None, d=None):
364 368 """add a revision to the log
365 369
366 370 text - the revision data to add
367 371 transaction - the transaction object used for rollback
368 372 link - the linkrev data to add
369 373 p1, p2 - the parent nodeids of the revision
370 374 d - an optional precomputed delta
371 375 """
372 376 if text is None: text = ""
373 377 if p1 is None: p1 = self.tip()
374 378 if p2 is None: p2 = nullid
375 379
376 380 node = hash(text, p1, p2)
377 381
378 382 if node in self.nodemap:
379 383 return node
380 384
381 385 n = self.count()
382 386 t = n - 1
383 387
384 388 if n:
385 389 base = self.base(t)
386 390 start = self.start(base)
387 391 end = self.end(t)
388 392 if not d:
389 393 prev = self.revision(self.tip())
390 394 d = self.diff(prev, text)
391 395 data = compress(d)
392 396 dist = end - start + len(data)
393 397
394 398 # full versions are inserted when the needed deltas
395 399 # become comparable to the uncompressed text
396 400 if not n or dist > len(text) * 2:
397 401 data = compress(text)
398 402 base = n
399 403 else:
400 404 base = self.base(t)
401 405
402 406 offset = 0
403 407 if t >= 0:
404 408 offset = self.end(t)
405 409
406 410 e = (offset, len(data), base, link, p1, p2, node)
407 411
408 412 self.index.append(e)
409 413 self.nodemap[node] = n
410 414 entry = struct.pack(indexformat, *e)
411 415
412 416 transaction.add(self.datafile, e[0])
413 417 self.opener(self.datafile, "a").write(data)
414 418 transaction.add(self.indexfile, n * len(entry))
415 419 self.opener(self.indexfile, "a").write(entry)
416 420
417 421 self.cache = (node, n, text)
418 422 return node
419 423
420 424 def ancestor(self, a, b):
421 425 """calculate the least common ancestor of nodes a and b"""
422 426 # calculate the distance of every node from root
423 427 dist = {nullid: 0}
424 428 for i in xrange(self.count()):
425 429 n = self.node(i)
426 430 p1, p2 = self.parents(n)
427 431 dist[n] = max(dist[p1], dist[p2]) + 1
428 432
429 433 # traverse ancestors in order of decreasing distance from root
430 434 def ancestors(node):
431 435 # we store negative distances because heap returns smallest member
432 436 h = [(-dist[node], node)]
433 437 seen = {}
434 438 earliest = self.count()
435 439 while h:
436 440 d, n = heapq.heappop(h)
437 441 if n not in seen:
438 442 seen[n] = 1
439 443 r = self.rev(n)
440 444 yield (-d, r, n)
441 445 for p in self.parents(n):
442 446 heapq.heappush(h, (-dist[p], p))
443 447
444 448 x = ancestors(a)
445 449 y = ancestors(b)
446 450 lx = x.next()
447 451 ly = y.next()
448 452
449 453 # increment each ancestor list until it is closer to root than
450 454 # the other, or they match
451 455 while 1:
452 456 if lx == ly:
453 457 return lx[2]
454 458 elif lx < ly:
455 459 ly = y.next()
456 460 elif lx > ly:
457 461 lx = x.next()
458 462
459 463 def group(self, linkmap):
460 464 """calculate a delta group
461 465
462 466 Given a list of changeset revs, return a set of deltas and
463 467 metadata corresponding to nodes. The first delta is
464 468 parent(nodes[0]) -> nodes[0]; the receiver is guaranteed to
465 469 have this parent, as it has all history before these
466 470 changesets. The parent used is the first parent, parent[0].
467 471 """
468 472 revs = []
469 473 needed = {}
470 474
471 475 # find file nodes/revs that match changeset revs
472 476 for i in xrange(0, self.count()):
473 477 if self.index[i][3] in linkmap:
474 478 revs.append(i)
475 479 needed[i] = 1
476 480
477 481 # if we don't have any revisions touched by these changesets, bail
478 482 if not revs:
479 483 yield struct.pack(">l", 0)
480 484 return
481 485
482 486 # add the parent of the first rev
483 487 p = self.parents(self.node(revs[0]))[0]
484 488 revs.insert(0, self.rev(p))
485 489
486 490 # for each delta that isn't contiguous in the log, we need to
487 491 # reconstruct the base, reconstruct the result, and then
488 492 # calculate the delta. We also need to do this where we've
489 493 # stored a full version and not a delta
490 494 for i in xrange(0, len(revs) - 1):
491 495 a, b = revs[i], revs[i + 1]
492 496 if a + 1 != b or self.base(b) == b:
493 497 for j in xrange(self.base(a), a + 1):
494 498 needed[j] = 1
495 499 for j in xrange(self.base(b), b + 1):
496 500 needed[j] = 1
497 501
498 502 # calculate spans to retrieve from datafile
499 503 needed = needed.keys()
500 504 needed.sort()
501 505 spans = []
502 506 oo = -1
503 507 ol = 0
504 508 for n in needed:
505 509 if n < 0: continue
506 510 o = self.start(n)
507 511 l = self.length(n)
508 512 if oo + ol == o: # can we merge with the previous?
509 513 nl = spans[-1][2]
510 514 nl.append((n, l))
511 515 ol += l
512 516 spans[-1] = (oo, ol, nl)
513 517 else:
514 518 oo = o
515 519 ol = l
516 520 spans.append((oo, ol, [(n, l)]))
517 521
518 522 # read spans in, divide up chunks
519 523 chunks = {}
520 524 for span in spans:
521 525 # we reopen the file for each span to make http happy for now
522 526 f = self.opener(self.datafile)
523 527 f.seek(span[0])
524 528 data = f.read(span[1])
525 529
526 530 # divide up the span
527 531 pos = 0
528 532 for r, l in span[2]:
529 533 chunks[r] = decompress(data[pos: pos + l])
530 534 pos += l
531 535
532 536 # helper to reconstruct intermediate versions
533 537 def construct(text, base, rev):
534 538 bins = [chunks[r] for r in xrange(base + 1, rev + 1)]
535 539 return mdiff.patches(text, bins)
536 540
537 541 # build deltas
538 542 deltas = []
539 543 for d in xrange(0, len(revs) - 1):
540 544 a, b = revs[d], revs[d + 1]
541 545 n = self.node(b)
542 546
543 547 # do we need to construct a new delta?
544 548 if a + 1 != b or self.base(b) == b:
545 549 if a >= 0:
546 550 base = self.base(a)
547 551 ta = chunks[self.base(a)]
548 552 ta = construct(ta, base, a)
549 553 else:
550 554 ta = ""
551 555
552 556 base = self.base(b)
553 557 if a > base:
554 558 base = a
555 559 tb = ta
556 560 else:
557 561 tb = chunks[self.base(b)]
558 562 tb = construct(tb, base, b)
559 563 d = self.diff(ta, tb)
560 564 else:
561 565 d = chunks[b]
562 566
563 567 p = self.parents(n)
564 568 meta = n + p[0] + p[1] + linkmap[self.linkrev(n)]
565 569 l = struct.pack(">l", len(meta) + len(d) + 4)
566 570 yield l
567 571 yield meta
568 572 yield d
569 573
570 574 yield struct.pack(">l", 0)
571 575
572 576 def addgroup(self, revs, linkmapper, transaction, unique=0):
573 577 """
574 578 add a delta group
575 579
576 580 Given a set of deltas, add them to the revision log. The
577 581 first delta is against its parent, which should be in our
578 582 log; the rest are against the previous delta.
579 583 """
580 584
581 585 # track the base of the current delta log
582 586 r = self.count()
583 587 t = r - 1
584 588 node = nullid
585 589
586 590 base = prev = -1
587 591 start = end = measure = 0
588 592 if r:
589 593 start = self.start(self.base(t))
590 594 end = self.end(t)
591 595 measure = self.length(self.base(t))
592 596 base = self.base(t)
593 597 prev = self.tip()
594 598
595 599 transaction.add(self.datafile, end)
596 600 transaction.add(self.indexfile, r * struct.calcsize(indexformat))
597 601 dfh = self.opener(self.datafile, "a")
598 602 ifh = self.opener(self.indexfile, "a")
599 603
600 604 # loop through our set of deltas
601 605 chain = None
602 606 for chunk in revs:
603 607 node, p1, p2, cs = struct.unpack("20s20s20s20s", chunk[:80])
604 608 link = linkmapper(cs)
605 609 if node in self.nodemap:
606 610 # this can happen if two branches make the same change
607 611 # if unique:
608 612 # raise RevlogError("already have %s" % hex(node[:4]))
609 613 chain = node
610 614 continue
611 615 delta = chunk[80:]
612 616
613 617 if not chain:
614 618 # retrieve the parent revision of the delta chain
615 619 chain = p1
616 620 if not chain in self.nodemap:
617 621 raise RevlogError("unknown base %s" % short(chain[:4]))
618 622
619 623 # full versions are inserted when the needed deltas become
620 624 # comparable to the uncompressed text or when the previous
621 625 # version is not the one we have a delta against. We use
622 626 # the size of the previous full rev as a proxy for the
623 627 # current size.
624 628
625 629 if chain == prev:
626 630 cdelta = compress(delta)
627 631
628 632 if chain != prev or (end - start + len(cdelta)) > measure * 2:
629 633 # flush our writes here so we can read it in revision
630 634 dfh.flush()
631 635 ifh.flush()
632 636 text = self.revision(chain)
633 637 text = self.patches(text, [delta])
634 638 chk = self.addrevision(text, transaction, link, p1, p2)
635 639 if chk != node:
636 640 raise RevlogError("consistency error adding group")
637 641 measure = len(text)
638 642 else:
639 643 e = (end, len(cdelta), self.base(t), link, p1, p2, node)
640 644 self.index.append(e)
641 645 self.nodemap[node] = r
642 646 dfh.write(cdelta)
643 647 ifh.write(struct.pack(indexformat, *e))
644 648
645 649 t, r, chain, prev = r, r + 1, node, node
646 650 start = self.start(self.base(t))
647 651 end = self.end(t)
648 652
649 653 dfh.close()
650 654 ifh.close()
651 655 return node
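The import change at the top of the file is the counterpart to the error handling above: the eager imports are replaced with Mercurial's demandload, so modules such as urllib2 are imported only when first touched. The sketch below illustrates the deferred-import idea in general terms; it is not Mercurial's actual demandload implementation.

    class _demandmod:
        """placeholder that does the real import on first attribute access"""
        def __init__(self, name, scope):
            self._name = name
            self._scope = scope
        def __getattr__(self, attr):
            mod = __import__(self._name)   # the real import happens here, once
            self._scope[self._name] = mod  # replace the placeholder for good
            return getattr(mod, attr)

    def demandload(scope, modules):
        # install a lazy placeholder for each space-separated module name
        for name in modules.split():
            scope[name] = _demandmod(name, scope)

With this in place, demandload(globals(), "errno urllib2") costs almost nothing at import time; urllib2 is only loaded if an exception handler actually needs urllib2.URLError, which keeps startup cheap for purely local repositories.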