##// END OF EJS Templates
Add some docstrings to revlog.py
mpm@selenic.com -
r1083:30974cf7 default
parent child Browse files
Show More
@@ -1,575 +1,650 b''
1 # revlog.py - storage back-end for mercurial
2 #
3 # This provides efficient delta storage with O(1) retrieve and append
4 # and O(changes) merge between branches
5 #
6 # Copyright 2005 Matt Mackall <mpm@selenic.com>
7 #
8 # This software may be used and distributed according to the terms
9 # of the GNU General Public License, incorporated herein by reference.
1 """
2 revlog.py - storage back-end for mercurial
3
4 This provides efficient delta storage with O(1) retrieve and append
5 and O(changes) merge between branches
6
7 Copyright 2005 Matt Mackall <mpm@selenic.com>
8
9 This software may be used and distributed according to the terms
10 of the GNU General Public License, incorporated herein by reference.
11 """
10 12
11 13 import zlib, struct, sha, binascii, heapq
12 14 from mercurial import mdiff
13 15
14 16 def hex(node): return binascii.hexlify(node)
15 17 def bin(node): return binascii.unhexlify(node)
16 18 def short(node): return hex(node[:6])
17 19
18 20 def compress(text):
21 """ generate a possibly-compressed representation of text """
19 22 if not text: return text
20 23 if len(text) < 44:
21 24 if text[0] == '\0': return text
22 25 return 'u' + text
23 26 bin = zlib.compress(text)
24 27 if len(bin) > len(text):
25 28 if text[0] == '\0': return text
26 29 return 'u' + text
27 30 return bin
28 31
29 32 def decompress(bin):
33 """ decompress the given input """
30 34 if not bin: return bin
31 35 t = bin[0]
32 36 if t == '\0': return bin
33 37 if t == 'x': return zlib.decompress(bin)
34 38 if t == 'u': return bin[1:]
35 39 raise RevlogError("unknown compression type %s" % t)
36 40
37 41 def hash(text, p1, p2):
42 """generate a hash from the given text and its parent hashes
43
44 This hash combines both the current file contents and its history
45 in a manner that makes it easy to distinguish nodes with the same
46 content in the revision graph.
47 """
38 48 l = [p1, p2]
39 49 l.sort()
40 50 s = sha.new(l[0])
41 51 s.update(l[1])
42 52 s.update(text)
43 53 return s.digest()
44 54
45 55 nullid = "\0" * 20
46 56 indexformat = ">4l20s20s20s"
47 57
48 58 class lazyparser:
59 """
60 this class avoids the need to parse the entirety of large indices
61
62 By default we parse and load 1000 entries at a time.
63
64 If no position is specified, we load the whole index, and replace
65 the lazy objects in revlog with the underlying objects for
66 efficiency in cases where we look at most of the nodes.
67 """
49 68 def __init__(self, data, revlog):
50 69 self.data = data
51 70 self.s = struct.calcsize(indexformat)
52 71 self.l = len(data)/self.s
53 72 self.index = [None] * self.l
54 73 self.map = {nullid: -1}
55 74 self.all = 0
56 75 self.revlog = revlog
57 76
58 77 def load(self, pos=None):
59 78 if self.all: return
60 79 if pos is not None:
61 80 block = pos / 1000
62 81 i = block * 1000
63 82 end = min(self.l, i + 1000)
64 83 else:
65 84 self.all = 1
66 85 i = 0
67 86 end = self.l
68 87 self.revlog.index = self.index
69 88 self.revlog.nodemap = self.map
70 89
71 90 while i < end:
72 91 d = self.data[i * self.s: (i + 1) * self.s]
73 92 e = struct.unpack(indexformat, d)
74 93 self.index[i] = e
75 94 self.map[e[6]] = i
76 95 i += 1
77 96
78 97 class lazyindex:
98 """a lazy version of the index array"""
79 99 def __init__(self, parser):
80 100 self.p = parser
81 101 def __len__(self):
82 102 return len(self.p.index)
83 103 def load(self, pos):
84 104 self.p.load(pos)
85 105 return self.p.index[pos]
86 106 def __getitem__(self, pos):
87 107 return self.p.index[pos] or self.load(pos)
88 108 def append(self, e):
89 109 self.p.index.append(e)
90 110
91 111 class lazymap:
112 """a lazy version of the node map"""
92 113 def __init__(self, parser):
93 114 self.p = parser
94 115 def load(self, key):
95 116 if self.p.all: return
96 117 n = self.p.data.find(key)
97 118 if n < 0: raise KeyError("node " + hex(key))
98 119 pos = n / self.p.s
99 120 self.p.load(pos)
100 121 def __contains__(self, key):
101 122 self.p.load()
102 123 return key in self.p.map
103 124 def __iter__(self):
104 125 yield nullid
105 126 for i in xrange(self.p.l):
106 127 try:
107 128 yield self.p.index[i][6]
108 129 except:
109 130 self.p.load(i)
110 131 yield self.p.index[i][6]
111 132 def __getitem__(self, key):
112 133 try:
113 134 return self.p.map[key]
114 135 except KeyError:
115 136 try:
116 137 self.load(key)
117 138 return self.p.map[key]
118 139 except KeyError:
119 140 raise KeyError("node " + hex(key))
120 141 def __setitem__(self, key, val):
121 142 self.p.map[key] = val
122 143
123 144 class RevlogError(Exception): pass
124 145
125 146 class revlog:
147 """
148 the underlying revision storage object
149
150 A revlog consists of two parts, an index and the revision data.
151
152 The index is a file with a fixed record size containing
153 information on each revision, includings its nodeid (hash), the
154 nodeids of its parents, the position and offset of its data within
155 the data file, and the revision it's based on. Finally, each entry
156 contains a linkrev entry that can serve as a pointer to external
157 data.
158
159 The revision data itself is a linear collection of data chunks.
160 Each chunk represents a revision and is usually represented as a
161 delta against the previous chunk. To bound lookup time, runs of
162 deltas are limited to about 2 times the length of the original
163 version data. This makes retrieval of a version proportional to
164 its size, or O(1) relative to the number of revisions.
165
166 Both pieces of the revlog are written to in an append-only
167 fashion, which means we never need to rewrite a file to insert or
168 remove data, and can use some simple techniques to avoid the need
169 for locking while reading.
170 """
126 171 def __init__(self, opener, indexfile, datafile):
172 """
173 create a revlog object
174
175 opener is a function that abstracts the file opening operation
176 and can be used to implement COW semantics or the like.
177 """
127 178 self.indexfile = indexfile
128 179 self.datafile = datafile
129 180 self.opener = opener
130 181 self.cache = None
131 182
132 183 try:
133 184 i = self.opener(self.indexfile).read()
134 185 except IOError:
135 186 i = ""
136 187
137 188 if len(i) > 10000:
138 189 # big index, let's parse it on demand
139 190 parser = lazyparser(i, self)
140 191 self.index = lazyindex(parser)
141 192 self.nodemap = lazymap(parser)
142 193 else:
143 194 s = struct.calcsize(indexformat)
144 195 l = len(i) / s
145 196 self.index = [None] * l
146 197 m = [None] * l
147 198
148 199 n = 0
149 200 for f in xrange(0, len(i), s):
150 201 # offset, size, base, linkrev, p1, p2, nodeid
151 202 e = struct.unpack(indexformat, i[f:f + s])
152 203 m[n] = (e[6], n)
153 204 self.index[n] = e
154 205 n += 1
155 206
156 207 self.nodemap = dict(m)
157 208 self.nodemap[nullid] = -1
158 209
159 210 def tip(self): return self.node(len(self.index) - 1)
160 211 def count(self): return len(self.index)
161 212 def node(self, rev): return (rev < 0) and nullid or self.index[rev][6]
162 213 def rev(self, node): return self.nodemap[node]
163 214 def linkrev(self, node): return self.index[self.nodemap[node]][3]
164 215 def parents(self, node):
165 216 if node == nullid: return (nullid, nullid)
166 217 return self.index[self.nodemap[node]][4:6]
167 218
168 219 def start(self, rev): return self.index[rev][0]
169 220 def length(self, rev): return self.index[rev][1]
170 221 def end(self, rev): return self.start(rev) + self.length(rev)
171 222 def base(self, rev): return self.index[rev][2]
172 223
173 224 def reachable(self, rev, stop=None):
174 225 reachable = {}
175 226 visit = [rev]
176 227 reachable[rev] = 1
177 228 if stop:
178 229 stopn = self.rev(stop)
179 230 else:
180 231 stopn = 0
181 232 while visit:
182 233 n = visit.pop(0)
183 234 if n == stop:
184 235 continue
185 236 if n == nullid:
186 237 continue
187 238 for p in self.parents(n):
188 239 if self.rev(p) < stopn:
189 240 continue
190 241 if p not in reachable:
191 242 reachable[p] = 1
192 243 visit.append(p)
193 244 return reachable
194 245
195 246 def heads(self, stop=None):
247 """return the list of all nodes that have no children"""
196 248 p = {}
197 249 h = []
198 250 stoprev = 0
199 251 if stop and stop in self.nodemap:
200 252 stoprev = self.rev(stop)
201
253
202 254 for r in range(self.count() - 1, -1, -1):
203 255 n = self.node(r)
204 256 if n not in p:
205 257 h.append(n)
206 258 if n == stop:
207 259 break
208 260 if r < stoprev:
209 261 break
210 262 for pn in self.parents(n):
211 263 p[pn] = 1
212 264 return h
213 265
214 266 def children(self, node):
267 """find the children of a given node"""
215 268 c = []
216 269 p = self.rev(node)
217 270 for r in range(p + 1, self.count()):
218 271 n = self.node(r)
219 272 for pn in self.parents(n):
220 273 if pn == node:
221 274 c.append(n)
222 275 continue
223 276 elif pn == nullid:
224 277 continue
225 278 return c
226 279
227 280 def lookup(self, id):
281 """locate a node based on revision number or subset of hex nodeid"""
228 282 try:
229 283 rev = int(id)
230 284 if str(rev) != id: raise ValueError
231 285 if rev < 0: rev = self.count() + rev
232 286 if rev < 0 or rev >= self.count(): raise ValueError
233 287 return self.node(rev)
234 288 except (ValueError, OverflowError):
235 289 c = []
236 290 for n in self.nodemap:
237 291 if hex(n).startswith(id):
238 292 c.append(n)
239 293 if len(c) > 1: raise KeyError("Ambiguous identifier")
240 294 if len(c) < 1: raise KeyError("No match found")
241 295 return c[0]
242 296
243 297 return None
244 298
245 299 def diff(self, a, b):
300 """return a delta between two revisions"""
246 301 return mdiff.textdiff(a, b)
247 302
248 303 def patches(self, t, pl):
304 """apply a list of patches to a string"""
249 305 return mdiff.patches(t, pl)
250 306
251 307 def delta(self, node):
308 """return or calculate a delta between a node and its predecessor"""
252 309 r = self.rev(node)
253 310 b = self.base(r)
254 311 if r == b:
255 312 return self.diff(self.revision(self.node(r - 1)),
256 313 self.revision(node))
257 314 else:
258 315 f = self.opener(self.datafile)
259 316 f.seek(self.start(r))
260 317 data = f.read(self.length(r))
261 318 return decompress(data)
262 319
263 320 def revision(self, node):
321 """return an uncompressed revision of a given"""
264 322 if node == nullid: return ""
265 323 if self.cache and self.cache[0] == node: return self.cache[2]
266 324
325 # look up what we need to read
267 326 text = None
268 327 rev = self.rev(node)
269 328 start, length, base, link, p1, p2, node = self.index[rev]
270 329 end = start + length
271 330 if base != rev: start = self.start(base)
272 331
332 # do we have useful data cached?
273 333 if self.cache and self.cache[1] >= base and self.cache[1] < rev:
274 334 base = self.cache[1]
275 335 start = self.start(base + 1)
276 336 text = self.cache[2]
277 337 last = 0
278 338
279 339 f = self.opener(self.datafile)
280 340 f.seek(start)
281 341 data = f.read(end - start)
282 342
283 343 if text is None:
284 344 last = self.length(base)
285 345 text = decompress(data[:last])
286 346
287 347 bins = []
288 348 for r in xrange(base + 1, rev + 1):
289 349 s = self.length(r)
290 350 bins.append(decompress(data[last:last + s]))
291 351 last = last + s
292 352
293 353 text = mdiff.patches(text, bins)
294 354
295 355 if node != hash(text, p1, p2):
296 356 raise IOError("integrity check failed on %s:%d"
297 357 % (self.datafile, rev))
298 358
299 359 self.cache = (node, rev, text)
300 360 return text
301 361
302 362 def addrevision(self, text, transaction, link, p1=None, p2=None, d=None):
363 """add a revision to the log
364
365 text - the revision data to add
366 transaction - the transaction object used for rollback
367 link - the linkrev data to add
368 p1, p2 - the parent nodeids of the revision
369 d - an optional precomputed delta
370 """
303 371 if text is None: text = ""
304 372 if p1 is None: p1 = self.tip()
305 373 if p2 is None: p2 = nullid
306 374
307 375 node = hash(text, p1, p2)
308 376
309 377 if node in self.nodemap:
310 378 return node
311 379
312 380 n = self.count()
313 381 t = n - 1
314 382
315 383 if n:
316 384 base = self.base(t)
317 385 start = self.start(base)
318 386 end = self.end(t)
319 387 if not d:
320 388 prev = self.revision(self.tip())
321 389 d = self.diff(prev, text)
322 390 data = compress(d)
323 391 dist = end - start + len(data)
324 392
325 393 # full versions are inserted when the needed deltas
326 394 # become comparable to the uncompressed text
327 395 if not n or dist > len(text) * 2:
328 396 data = compress(text)
329 397 base = n
330 398 else:
331 399 base = self.base(t)
332 400
333 401 offset = 0
334 402 if t >= 0:
335 403 offset = self.end(t)
336 404
337 405 e = (offset, len(data), base, link, p1, p2, node)
338 406
339 407 self.index.append(e)
340 408 self.nodemap[node] = n
341 409 entry = struct.pack(indexformat, *e)
342 410
343 411 transaction.add(self.datafile, e[0])
344 412 self.opener(self.datafile, "a").write(data)
345 413 transaction.add(self.indexfile, n * len(entry))
346 414 self.opener(self.indexfile, "a").write(entry)
347 415
348 416 self.cache = (node, n, text)
349 417 return node
350 418
351 419 def ancestor(self, a, b):
420 """calculate the least common ancestor of nodes a and b"""
352 421 # calculate the distance of every node from root
353 422 dist = {nullid: 0}
354 423 for i in xrange(self.count()):
355 424 n = self.node(i)
356 425 p1, p2 = self.parents(n)
357 426 dist[n] = max(dist[p1], dist[p2]) + 1
358 427
359 428 # traverse ancestors in order of decreasing distance from root
360 429 def ancestors(node):
361 430 # we store negative distances because heap returns smallest member
362 431 h = [(-dist[node], node)]
363 432 seen = {}
364 433 earliest = self.count()
365 434 while h:
366 435 d, n = heapq.heappop(h)
367 436 if n not in seen:
368 437 seen[n] = 1
369 438 r = self.rev(n)
370 439 yield (-d, r, n)
371 440 for p in self.parents(n):
372 441 heapq.heappush(h, (-dist[p], p))
373 442
374 443 x = ancestors(a)
375 444 y = ancestors(b)
376 445 lx = x.next()
377 446 ly = y.next()
378 447
379 448 # increment each ancestor list until it is closer to root than
380 449 # the other, or they match
381 450 while 1:
382 451 if lx == ly:
383 452 return lx[2]
384 453 elif lx < ly:
385 454 ly = y.next()
386 455 elif lx > ly:
387 456 lx = x.next()
388 457
389 458 def group(self, linkmap):
390 # given a list of changeset revs, return a set of deltas and
391 # metadata corresponding to nodes. the first delta is
392 # parent(nodes[0]) -> nodes[0] the receiver is guaranteed to
393 # have this parent as it has all history before these
394 # changesets. parent is parent[0]
459 """calculate a delta group
395 460
461 Given a list of changeset revs, return a set of deltas and
462 metadata corresponding to nodes. the first delta is
463 parent(nodes[0]) -> nodes[0] the receiver is guaranteed to
464 have this parent as it has all history before these
465 changesets. parent is parent[0]
466 """
396 467 revs = []
397 468 needed = {}
398 469
399 470 # find file nodes/revs that match changeset revs
400 471 for i in xrange(0, self.count()):
401 472 if self.index[i][3] in linkmap:
402 473 revs.append(i)
403 474 needed[i] = 1
404 475
405 476 # if we don't have any revisions touched by these changesets, bail
406 477 if not revs:
407 478 yield struct.pack(">l", 0)
408 479 return
409 480
410 481 # add the parent of the first rev
411 482 p = self.parents(self.node(revs[0]))[0]
412 483 revs.insert(0, self.rev(p))
413 484
414 485 # for each delta that isn't contiguous in the log, we need to
415 486 # reconstruct the base, reconstruct the result, and then
416 487 # calculate the delta. We also need to do this where we've
417 488 # stored a full version and not a delta
418 489 for i in xrange(0, len(revs) - 1):
419 490 a, b = revs[i], revs[i + 1]
420 491 if a + 1 != b or self.base(b) == b:
421 492 for j in xrange(self.base(a), a + 1):
422 493 needed[j] = 1
423 494 for j in xrange(self.base(b), b + 1):
424 495 needed[j] = 1
425 496
426 497 # calculate spans to retrieve from datafile
427 498 needed = needed.keys()
428 499 needed.sort()
429 500 spans = []
430 501 oo = -1
431 502 ol = 0
432 503 for n in needed:
433 504 if n < 0: continue
434 505 o = self.start(n)
435 506 l = self.length(n)
436 507 if oo + ol == o: # can we merge with the previous?
437 508 nl = spans[-1][2]
438 509 nl.append((n, l))
439 510 ol += l
440 511 spans[-1] = (oo, ol, nl)
441 512 else:
442 513 oo = o
443 514 ol = l
444 515 spans.append((oo, ol, [(n, l)]))
445 516
446 517 # read spans in, divide up chunks
447 518 chunks = {}
448 519 for span in spans:
449 520 # we reopen the file for each span to make http happy for now
450 521 f = self.opener(self.datafile)
451 522 f.seek(span[0])
452 523 data = f.read(span[1])
453 524
454 525 # divide up the span
455 526 pos = 0
456 527 for r, l in span[2]:
457 528 chunks[r] = decompress(data[pos: pos + l])
458 529 pos += l
459 530
460 531 # helper to reconstruct intermediate versions
461 532 def construct(text, base, rev):
462 533 bins = [chunks[r] for r in xrange(base + 1, rev + 1)]
463 534 return mdiff.patches(text, bins)
464 535
465 536 # build deltas
466 537 deltas = []
467 538 for d in xrange(0, len(revs) - 1):
468 539 a, b = revs[d], revs[d + 1]
469 540 n = self.node(b)
470 541
471 542 # do we need to construct a new delta?
472 543 if a + 1 != b or self.base(b) == b:
473 544 if a >= 0:
474 545 base = self.base(a)
475 546 ta = chunks[self.base(a)]
476 547 ta = construct(ta, base, a)
477 548 else:
478 549 ta = ""
479 550
480 551 base = self.base(b)
481 552 if a > base:
482 553 base = a
483 554 tb = ta
484 555 else:
485 556 tb = chunks[self.base(b)]
486 557 tb = construct(tb, base, b)
487 558 d = self.diff(ta, tb)
488 559 else:
489 560 d = chunks[b]
490 561
491 562 p = self.parents(n)
492 563 meta = n + p[0] + p[1] + linkmap[self.linkrev(n)]
493 564 l = struct.pack(">l", len(meta) + len(d) + 4)
494 565 yield l
495 566 yield meta
496 567 yield d
497 568
498 569 yield struct.pack(">l", 0)
499 570
500 571 def addgroup(self, revs, linkmapper, transaction, unique=0):
501 # given a set of deltas, add them to the revision log. the
502 # first delta is against its parent, which should be in our
503 # log, the rest are against the previous delta.
572 """
573 add a delta group
504 574
505 # track the base of the current delta log
575 given a set of deltas, add them to the revision log. the
576 first delta is against its parent, which should be in our
577 log, the rest are against the previous delta.
578 """
579
580 #track the base of the current delta log
506 581 r = self.count()
507 582 t = r - 1
508 583 node = nullid
509 584
510 585 base = prev = -1
511 586 start = end = measure = 0
512 587 if r:
513 588 start = self.start(self.base(t))
514 589 end = self.end(t)
515 590 measure = self.length(self.base(t))
516 591 base = self.base(t)
517 592 prev = self.tip()
518 593
519 594 transaction.add(self.datafile, end)
520 595 transaction.add(self.indexfile, r * struct.calcsize(indexformat))
521 596 dfh = self.opener(self.datafile, "a")
522 597 ifh = self.opener(self.indexfile, "a")
523 598
524 599 # loop through our set of deltas
525 600 chain = None
526 601 for chunk in revs:
527 602 node, p1, p2, cs = struct.unpack("20s20s20s20s", chunk[:80])
528 603 link = linkmapper(cs)
529 604 if node in self.nodemap:
530 605 # this can happen if two branches make the same change
531 606 if unique:
532 607 raise RevlogError("already have %s" % hex(node[:4]))
533 608 chain = node
534 609 continue
535 610 delta = chunk[80:]
536 611
537 612 if not chain:
538 613 # retrieve the parent revision of the delta chain
539 614 chain = p1
540 615 if not chain in self.nodemap:
541 616 raise RevlogError("unknown base %s" % short(chain[:4]))
542 617
543 618 # full versions are inserted when the needed deltas become
544 619 # comparable to the uncompressed text or when the previous
545 620 # version is not the one we have a delta against. We use
546 621 # the size of the previous full rev as a proxy for the
547 622 # current size.
548 623
549 624 if chain == prev:
550 625 cdelta = compress(delta)
551 626
552 627 if chain != prev or (end - start + len(cdelta)) > measure * 2:
553 628 # flush our writes here so we can read it in revision
554 629 dfh.flush()
555 630 ifh.flush()
556 631 text = self.revision(chain)
557 632 text = self.patches(text, [delta])
558 633 chk = self.addrevision(text, transaction, link, p1, p2)
559 634 if chk != node:
560 635 raise RevlogError("consistency error adding group")
561 636 measure = len(text)
562 637 else:
563 638 e = (end, len(cdelta), self.base(t), link, p1, p2, node)
564 639 self.index.append(e)
565 640 self.nodemap[node] = r
566 641 dfh.write(cdelta)
567 642 ifh.write(struct.pack(indexformat, *e))
568 643
569 644 t, r, chain, prev = r, r + 1, node, node
570 645 start = self.start(self.base(t))
571 646 end = self.end(t)
572 647
573 648 dfh.close()
574 649 ifh.close()
575 650 return node
General Comments 0
You need to be logged in to leave comments. Login now