revlog: break up compression of large deltas...
Matt Mackall
r5451:0a438756 default
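For reference, the heart of this changeset is the new large-text branch in compress() shown in the diff below: inputs over roughly 1 MB are fed to a zlib.compressobj() in 1 MB slices, so zlib never holds a second full copy of a big delta while compressing it. A minimal standalone sketch of the same idea (Python 2 style to match the file; chunked_compress and CHUNK are illustrative names, not part of revlog.py):

import zlib

CHUNK = 2 ** 20  # compress in 1 MB slices, mirroring the revlog change

def chunked_compress(text):
    """Compress text without forcing zlib to copy the whole buffer.

    zlib.compress(text) makes an internal copy of its input, roughly
    doubling peak memory for very large deltas.  Feeding the data to a
    compressobj() one slice at a time keeps only one slice plus the
    compressed output in flight.
    """
    z = zlib.compressobj()
    pieces = []
    pos = 0
    while pos < len(text):
        pieces.append(z.compress(text[pos:pos + CHUNK]))
        pos += CHUNK
    pieces.append(z.flush())
    out = "".join(pieces)
    # like the revlog code, fall back to the raw text if compression did not help
    if len(out) >= len(text):
        return text
    return out

if __name__ == "__main__":
    data = "a compressible delta " * (1 << 18)   # a few MB of repetitive input
    packed = chunked_compress(data)
    if packed is not data:
        assert zlib.decompress(packed) == data
    print("%d bytes -> %d bytes" % (len(data), len(packed)))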
@@ -1,1291 +1,1306 @@
1 1 """
2 2 revlog.py - storage back-end for mercurial
3 3
4 4 This provides efficient delta storage with O(1) retrieve and append
5 5 and O(changes) merge between branches
6 6
7 7 Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
8 8
9 9 This software may be used and distributed according to the terms
10 10 of the GNU General Public License, incorporated herein by reference.
11 11 """
12 12
13 13 from node import *
14 14 from i18n import _
15 15 import binascii, changegroup, errno, ancestor, mdiff, os
16 16 import sha, struct, util, zlib
17 17
18 18 _pack = struct.pack
19 19 _unpack = struct.unpack
20 20 _compress = zlib.compress
21 21 _decompress = zlib.decompress
22 22 _sha = sha.new
23 23
24 24 # revlog flags
25 25 REVLOGV0 = 0
26 26 REVLOGNG = 1
27 27 REVLOGNGINLINEDATA = (1 << 16)
28 28 REVLOG_DEFAULT_FLAGS = REVLOGNGINLINEDATA
29 29 REVLOG_DEFAULT_FORMAT = REVLOGNG
30 30 REVLOG_DEFAULT_VERSION = REVLOG_DEFAULT_FORMAT | REVLOG_DEFAULT_FLAGS
31 31
32 32 class RevlogError(Exception):
33 33 pass
34 34 class LookupError(RevlogError):
35 35 pass
36 36
37 37 def getoffset(q):
38 38 return int(q >> 16)
39 39
40 40 def gettype(q):
41 41 return int(q & 0xFFFF)
42 42
43 43 def offset_type(offset, type):
44 44 return long(long(offset) << 16 | type)
45 45
46 46 def hash(text, p1, p2):
47 47 """generate a hash from the given text and its parent hashes
48 48
49 49 This hash combines both the current file contents and its history
50 50 in a manner that makes it easy to distinguish nodes with the same
51 51 content in the revision graph.
52 52 """
53 53 l = [p1, p2]
54 54 l.sort()
55 55 s = _sha(l[0])
56 56 s.update(l[1])
57 57 s.update(text)
58 58 return s.digest()
59 59
60 60 def compress(text):
61 61 """ generate a possibly-compressed representation of text """
62 62 if not text:
63 63 return ("", text)
64 if len(text) < 44:
64 l = len(text)
65 if l < 44:
65 66 if text[0] == '\0':
66 67 return ("", text)
67 68 return ('u', text)
68 bin = _compress(text)
69 if len(bin) > len(text):
69 elif l > 1000000:
70 # zlib makes an internal copy, thus doubling memory usage for
71 # large files, so lets do this in pieces
72 z = zlib.compressobj()
73 p = []
74 pos = 0
75 while pos < l:
76 pos2 = pos + 2**20
77 p.append(z.compress(text[pos:pos2]))
78 pos = pos2
79 p.append(z.flush())
80 if sum(map(len, p)) < l:
81 bin = "".join(p)
82 else:
83 bin = _compress(text)
84 if len(bin) > l:
70 85 if text[0] == '\0':
71 86 return ("", text)
72 87 return ('u', text)
73 88 return ("", bin)
74 89
75 90 def decompress(bin):
76 91 """ decompress the given input """
77 92 if not bin:
78 93 return bin
79 94 t = bin[0]
80 95 if t == '\0':
81 96 return bin
82 97 if t == 'x':
83 98 return _decompress(bin)
84 99 if t == 'u':
85 100 return bin[1:]
86 101 raise RevlogError(_("unknown compression type %r") % t)
87 102
88 103 class lazyparser(object):
89 104 """
90 105 this class avoids the need to parse the entirety of large indices
91 106 """
92 107
93 108 # lazyparser is not safe to use on windows if win32 extensions not
94 109 # available. it keeps file handle open, which make it not possible
95 110 # to break hardlinks on local cloned repos.
96 111 safe_to_use = os.name != 'nt' or (not util.is_win_9x() and
97 112 hasattr(util, 'win32api'))
98 113
99 114 def __init__(self, dataf, size):
100 115 self.dataf = dataf
101 116 self.s = struct.calcsize(indexformatng)
102 117 self.datasize = size
103 118 self.l = size/self.s
104 119 self.index = [None] * self.l
105 120 self.map = {nullid: nullrev}
106 121 self.allmap = 0
107 122 self.all = 0
108 123 self.mapfind_count = 0
109 124
110 125 def loadmap(self):
111 126 """
112 127 during a commit, we need to make sure the rev being added is
113 128 not a duplicate. This requires loading the entire index,
114 129 which is fairly slow. loadmap can load up just the node map,
115 130 which takes much less time.
116 131 """
117 132 if self.allmap:
118 133 return
119 134 end = self.datasize
120 135 self.allmap = 1
121 136 cur = 0
122 137 count = 0
123 138 blocksize = self.s * 256
124 139 self.dataf.seek(0)
125 140 while cur < end:
126 141 data = self.dataf.read(blocksize)
127 142 off = 0
128 143 for x in xrange(256):
129 144 n = data[off + ngshaoffset:off + ngshaoffset + 20]
130 145 self.map[n] = count
131 146 count += 1
132 147 if count >= self.l:
133 148 break
134 149 off += self.s
135 150 cur += blocksize
136 151
137 152 def loadblock(self, blockstart, blocksize, data=None):
138 153 if self.all:
139 154 return
140 155 if data is None:
141 156 self.dataf.seek(blockstart)
142 157 if blockstart + blocksize > self.datasize:
143 158 # the revlog may have grown since we've started running,
144 159 # but we don't have space in self.index for more entries.
145 160 # limit blocksize so that we don't get too much data.
146 161 blocksize = max(self.datasize - blockstart, 0)
147 162 data = self.dataf.read(blocksize)
148 163 lend = len(data) / self.s
149 164 i = blockstart / self.s
150 165 off = 0
151 166 # lazyindex supports __delitem__
152 167 if lend > len(self.index) - i:
153 168 lend = len(self.index) - i
154 169 for x in xrange(lend):
155 170 if self.index[i + x] == None:
156 171 b = data[off : off + self.s]
157 172 self.index[i + x] = b
158 173 n = b[ngshaoffset:ngshaoffset + 20]
159 174 self.map[n] = i + x
160 175 off += self.s
161 176
162 177 def findnode(self, node):
163 178 """search backwards through the index file for a specific node"""
164 179 if self.allmap:
165 180 return None
166 181
167 182 # hg log will cause many many searches for the manifest
168 183 # nodes. After we get called a few times, just load the whole
169 184 # thing.
170 185 if self.mapfind_count > 8:
171 186 self.loadmap()
172 187 if node in self.map:
173 188 return node
174 189 return None
175 190 self.mapfind_count += 1
176 191 last = self.l - 1
177 192 while self.index[last] != None:
178 193 if last == 0:
179 194 self.all = 1
180 195 self.allmap = 1
181 196 return None
182 197 last -= 1
183 198 end = (last + 1) * self.s
184 199 blocksize = self.s * 256
185 200 while end >= 0:
186 201 start = max(end - blocksize, 0)
187 202 self.dataf.seek(start)
188 203 data = self.dataf.read(end - start)
189 204 findend = end - start
190 205 while True:
191 206 # we're searching backwards, so we have to make sure
192 207 # we don't find a changeset where this node is a parent
193 208 off = data.find(node, 0, findend)
194 209 findend = off
195 210 if off >= 0:
196 211 i = off / self.s
197 212 off = i * self.s
198 213 n = data[off + ngshaoffset:off + ngshaoffset + 20]
199 214 if n == node:
200 215 self.map[n] = i + start / self.s
201 216 return node
202 217 else:
203 218 break
204 219 end -= blocksize
205 220 return None
206 221
207 222 def loadindex(self, i=None, end=None):
208 223 if self.all:
209 224 return
210 225 all = False
211 226 if i == None:
212 227 blockstart = 0
213 228 blocksize = (65536 / self.s) * self.s
214 229 end = self.datasize
215 230 all = True
216 231 else:
217 232 if end:
218 233 blockstart = i * self.s
219 234 end = end * self.s
220 235 blocksize = end - blockstart
221 236 else:
222 237 blockstart = (i & ~1023) * self.s
223 238 blocksize = self.s * 1024
224 239 end = blockstart + blocksize
225 240 while blockstart < end:
226 241 self.loadblock(blockstart, blocksize)
227 242 blockstart += blocksize
228 243 if all:
229 244 self.all = True
230 245
231 246 class lazyindex(object):
232 247 """a lazy version of the index array"""
233 248 def __init__(self, parser):
234 249 self.p = parser
235 250 def __len__(self):
236 251 return len(self.p.index)
237 252 def load(self, pos):
238 253 if pos < 0:
239 254 pos += len(self.p.index)
240 255 self.p.loadindex(pos)
241 256 return self.p.index[pos]
242 257 def __getitem__(self, pos):
243 258 return _unpack(indexformatng, self.p.index[pos] or self.load(pos))
244 259 def __setitem__(self, pos, item):
245 260 self.p.index[pos] = _pack(indexformatng, *item)
246 261 def __delitem__(self, pos):
247 262 del self.p.index[pos]
248 263 def insert(self, pos, e):
249 264 self.p.index.insert(pos, _pack(indexformatng, *e))
250 265 def append(self, e):
251 266 self.p.index.append(_pack(indexformatng, *e))
252 267
253 268 class lazymap(object):
254 269 """a lazy version of the node map"""
255 270 def __init__(self, parser):
256 271 self.p = parser
257 272 def load(self, key):
258 273 n = self.p.findnode(key)
259 274 if n == None:
260 275 raise KeyError(key)
261 276 def __contains__(self, key):
262 277 if key in self.p.map:
263 278 return True
264 279 self.p.loadmap()
265 280 return key in self.p.map
266 281 def __iter__(self):
267 282 yield nullid
268 283 for i in xrange(self.p.l):
269 284 ret = self.p.index[i]
270 285 if not ret:
271 286 self.p.loadindex(i)
272 287 ret = self.p.index[i]
273 288 if isinstance(ret, str):
274 289 ret = _unpack(indexformatng, ret)
275 290 yield ret[7]
276 291 def __getitem__(self, key):
277 292 try:
278 293 return self.p.map[key]
279 294 except KeyError:
280 295 try:
281 296 self.load(key)
282 297 return self.p.map[key]
283 298 except KeyError:
284 299 raise KeyError("node " + hex(key))
285 300 def __setitem__(self, key, val):
286 301 self.p.map[key] = val
287 302 def __delitem__(self, key):
288 303 del self.p.map[key]
289 304
290 305 indexformatv0 = ">4l20s20s20s"
291 306 v0shaoffset = 56
292 307
293 308 class revlogoldio(object):
294 309 def __init__(self):
295 310 self.size = struct.calcsize(indexformatv0)
296 311
297 312 def parseindex(self, fp, inline):
298 313 s = self.size
299 314 index = []
300 315 nodemap = {nullid: nullrev}
301 316 n = off = 0
302 317 data = fp.read()
303 318 l = len(data)
304 319 while off + s <= l:
305 320 cur = data[off:off + s]
306 321 off += s
307 322 e = _unpack(indexformatv0, cur)
308 323 # transform to revlogv1 format
309 324 e2 = (offset_type(e[0], 0), e[1], -1, e[2], e[3],
310 325 nodemap[e[4]], nodemap[e[5]], e[6])
311 326 index.append(e2)
312 327 nodemap[e[6]] = n
313 328 n += 1
314 329
315 330 return index, nodemap, None
316 331
317 332 def packentry(self, entry, node, version, rev):
318 333 e2 = (getoffset(entry[0]), entry[1], entry[3], entry[4],
319 334 node(entry[5]), node(entry[6]), entry[7])
320 335 return _pack(indexformatv0, *e2)
321 336
322 337 # index ng:
323 338 # 6 bytes offset
324 339 # 2 bytes flags
325 340 # 4 bytes compressed length
326 341 # 4 bytes uncompressed length
327 342 # 4 bytes: base rev
328 343 # 4 bytes link rev
329 344 # 4 bytes parent 1 rev
330 345 # 4 bytes parent 2 rev
331 346 # 32 bytes: nodeid
332 347 indexformatng = ">Qiiiiii20s12x"
333 348 ngshaoffset = 32
334 349 versionformat = ">I"
335 350
336 351 class revlogio(object):
337 352 def __init__(self):
338 353 self.size = struct.calcsize(indexformatng)
339 354
340 355 def parseindex(self, fp, inline):
341 356 try:
342 357 size = util.fstat(fp).st_size
343 358 except AttributeError:
344 359 size = 0
345 360
346 361 if lazyparser.safe_to_use and not inline and size > 1000000:
347 362 # big index, let's parse it on demand
348 363 parser = lazyparser(fp, size)
349 364 index = lazyindex(parser)
350 365 nodemap = lazymap(parser)
351 366 e = list(index[0])
352 367 type = gettype(e[0])
353 368 e[0] = offset_type(0, type)
354 369 index[0] = e
355 370 return index, nodemap, None
356 371
357 372 s = self.size
358 373 cache = None
359 374 index = []
360 375 nodemap = {nullid: nullrev}
361 376 n = off = 0
362 377 # if we're not using lazymap, always read the whole index
363 378 data = fp.read()
364 379 l = len(data) - s
365 380 append = index.append
366 381 if inline:
367 382 cache = (0, data)
368 383 while off <= l:
369 384 e = _unpack(indexformatng, data[off:off + s])
370 385 nodemap[e[7]] = n
371 386 append(e)
372 387 n += 1
373 388 if e[1] < 0:
374 389 break
375 390 off += e[1] + s
376 391 else:
377 392 while off <= l:
378 393 e = _unpack(indexformatng, data[off:off + s])
379 394 nodemap[e[7]] = n
380 395 append(e)
381 396 n += 1
382 397 off += s
383 398
384 399 e = list(index[0])
385 400 type = gettype(e[0])
386 401 e[0] = offset_type(0, type)
387 402 index[0] = e
388 403
389 404 return index, nodemap, cache
390 405
391 406 def packentry(self, entry, node, version, rev):
392 407 p = _pack(indexformatng, *entry)
393 408 if rev == 0:
394 409 p = _pack(versionformat, version) + p[4:]
395 410 return p
396 411
397 412 class revlog(object):
398 413 """
399 414 the underlying revision storage object
400 415
401 416 A revlog consists of two parts, an index and the revision data.
402 417
403 418 The index is a file with a fixed record size containing
404 419 information on each revision, includings its nodeid (hash), the
405 420 nodeids of its parents, the position and offset of its data within
406 421 the data file, and the revision it's based on. Finally, each entry
407 422 contains a linkrev entry that can serve as a pointer to external
408 423 data.
409 424
410 425 The revision data itself is a linear collection of data chunks.
411 426 Each chunk represents a revision and is usually represented as a
412 427 delta against the previous chunk. To bound lookup time, runs of
413 428 deltas are limited to about 2 times the length of the original
414 429 version data. This makes retrieval of a version proportional to
415 430 its size, or O(1) relative to the number of revisions.
416 431
417 432 Both pieces of the revlog are written to in an append-only
418 433 fashion, which means we never need to rewrite a file to insert or
419 434 remove data, and can use some simple techniques to avoid the need
420 435 for locking while reading.
421 436 """
422 437 def __init__(self, opener, indexfile):
423 438 """
424 439 create a revlog object
425 440
426 441 opener is a function that abstracts the file opening operation
427 442 and can be used to implement COW semantics or the like.
428 443 """
429 444 self.indexfile = indexfile
430 445 self.datafile = indexfile[:-2] + ".d"
431 446 self.opener = opener
432 447 self._cache = None
433 448 self._chunkcache = None
434 449 self.nodemap = {nullid: nullrev}
435 450 self.index = []
436 451
437 452 v = REVLOG_DEFAULT_VERSION
438 453 if hasattr(opener, "defversion"):
439 454 v = opener.defversion
440 455 if v & REVLOGNG:
441 456 v |= REVLOGNGINLINEDATA
442 457
443 458 i = ""
444 459 try:
445 460 f = self.opener(self.indexfile)
446 461 i = f.read(4)
447 462 f.seek(0)
448 463 if len(i) > 0:
449 464 v = struct.unpack(versionformat, i)[0]
450 465 except IOError, inst:
451 466 if inst.errno != errno.ENOENT:
452 467 raise
453 468
454 469 self.version = v
455 470 self._inline = v & REVLOGNGINLINEDATA
456 471 flags = v & ~0xFFFF
457 472 fmt = v & 0xFFFF
458 473 if fmt == REVLOGV0 and flags:
459 474 raise RevlogError(_("index %s unknown flags %#04x for format v0")
460 475 % (self.indexfile, flags >> 16))
461 476 elif fmt == REVLOGNG and flags & ~REVLOGNGINLINEDATA:
462 477 raise RevlogError(_("index %s unknown flags %#04x for revlogng")
463 478 % (self.indexfile, flags >> 16))
464 479 elif fmt > REVLOGNG:
465 480 raise RevlogError(_("index %s unknown format %d")
466 481 % (self.indexfile, fmt))
467 482
468 483 self._io = revlogio()
469 484 if self.version == REVLOGV0:
470 485 self._io = revlogoldio()
471 486 if i:
472 487 d = self._io.parseindex(f, self._inline)
473 488 self.index, self.nodemap, self._chunkcache = d
474 489
475 490 # add the magic null revision at -1
476 491 self.index.append((0, 0, 0, -1, -1, -1, -1, nullid))
477 492
478 493 def _loadindex(self, start, end):
479 494 """load a block of indexes all at once from the lazy parser"""
480 495 if isinstance(self.index, lazyindex):
481 496 self.index.p.loadindex(start, end)
482 497
483 498 def _loadindexmap(self):
484 499 """loads both the map and the index from the lazy parser"""
485 500 if isinstance(self.index, lazyindex):
486 501 p = self.index.p
487 502 p.loadindex()
488 503 self.nodemap = p.map
489 504
490 505 def _loadmap(self):
491 506 """loads the map from the lazy parser"""
492 507 if isinstance(self.nodemap, lazymap):
493 508 self.nodemap.p.loadmap()
494 509 self.nodemap = self.nodemap.p.map
495 510
496 511 def tip(self):
497 512 return self.node(len(self.index) - 2)
498 513 def count(self):
499 514 return len(self.index) - 1
500 515
501 516 def rev(self, node):
502 517 try:
503 518 return self.nodemap[node]
504 519 except KeyError:
505 520 raise LookupError(_('%s: no node %s') % (self.indexfile, hex(node)))
506 521 def node(self, rev):
507 522 return self.index[rev][7]
508 523 def linkrev(self, node):
509 524 return self.index[self.rev(node)][4]
510 525 def parents(self, node):
511 526 d = self.index[self.rev(node)][5:7]
512 527 return (self.node(d[0]), self.node(d[1]))
513 528 def parentrevs(self, rev):
514 529 return self.index[rev][5:7]
515 530 def start(self, rev):
516 531 return int(self.index[rev][0] >> 16)
517 532 def end(self, rev):
518 533 return self.start(rev) + self.length(rev)
519 534 def length(self, rev):
520 535 return self.index[rev][1]
521 536 def base(self, rev):
522 537 return self.index[rev][3]
523 538
524 539 def size(self, rev):
525 540 """return the length of the uncompressed text for a given revision"""
526 541 l = self.index[rev][2]
527 542 if l >= 0:
528 543 return l
529 544
530 545 t = self.revision(self.node(rev))
531 546 return len(t)
532 547
533 548 # alternate implementation, The advantage to this code is it
534 549 # will be faster for a single revision. But, the results are not
535 550 # cached, so finding the size of every revision will be slower.
536 551 """
537 552 if self.cache and self.cache[1] == rev:
538 553 return len(self.cache[2])
539 554
540 555 base = self.base(rev)
541 556 if self.cache and self.cache[1] >= base and self.cache[1] < rev:
542 557 base = self.cache[1]
543 558 text = self.cache[2]
544 559 else:
545 560 text = self.revision(self.node(base))
546 561
547 562 l = len(text)
548 563 for x in xrange(base + 1, rev + 1):
549 564 l = mdiff.patchedsize(l, self.chunk(x))
550 565 return l
551 566 """
552 567
553 568 def reachable(self, node, stop=None):
554 569 """return a hash of all nodes ancestral to a given node, including
555 570 the node itself, stopping when stop is matched"""
556 571 reachable = {}
557 572 visit = [node]
558 573 reachable[node] = 1
559 574 if stop:
560 575 stopn = self.rev(stop)
561 576 else:
562 577 stopn = 0
563 578 while visit:
564 579 n = visit.pop(0)
565 580 if n == stop:
566 581 continue
567 582 if n == nullid:
568 583 continue
569 584 for p in self.parents(n):
570 585 if self.rev(p) < stopn:
571 586 continue
572 587 if p not in reachable:
573 588 reachable[p] = 1
574 589 visit.append(p)
575 590 return reachable
576 591
577 592 def nodesbetween(self, roots=None, heads=None):
578 593 """Return a tuple containing three elements. Elements 1 and 2 contain
579 594 a final list bases and heads after all the unreachable ones have been
580 595 pruned. Element 0 contains a topologically sorted list of all
581 596
582 597 nodes that satisfy these constraints:
583 598 1. All nodes must be descended from a node in roots (the nodes on
584 599 roots are considered descended from themselves).
585 600 2. All nodes must also be ancestors of a node in heads (the nodes in
586 601 heads are considered to be their own ancestors).
587 602
588 603 If roots is unspecified, nullid is assumed as the only root.
589 604 If heads is unspecified, it is taken to be the output of the
590 605 heads method (i.e. a list of all nodes in the repository that
591 606 have no children)."""
592 607 nonodes = ([], [], [])
593 608 if roots is not None:
594 609 roots = list(roots)
595 610 if not roots:
596 611 return nonodes
597 612 lowestrev = min([self.rev(n) for n in roots])
598 613 else:
599 614 roots = [nullid] # Everybody's a descendent of nullid
600 615 lowestrev = nullrev
601 616 if (lowestrev == nullrev) and (heads is None):
602 617 # We want _all_ the nodes!
603 618 return ([self.node(r) for r in xrange(0, self.count())],
604 619 [nullid], list(self.heads()))
605 620 if heads is None:
606 621 # All nodes are ancestors, so the latest ancestor is the last
607 622 # node.
608 623 highestrev = self.count() - 1
609 624 # Set ancestors to None to signal that every node is an ancestor.
610 625 ancestors = None
611 626 # Set heads to an empty dictionary for later discovery of heads
612 627 heads = {}
613 628 else:
614 629 heads = list(heads)
615 630 if not heads:
616 631 return nonodes
617 632 ancestors = {}
618 633 # Turn heads into a dictionary so we can remove 'fake' heads.
619 634 # Also, later we will be using it to filter out the heads we can't
620 635 # find from roots.
621 636 heads = dict.fromkeys(heads, 0)
622 637 # Start at the top and keep marking parents until we're done.
623 638 nodestotag = heads.keys()
624 639 # Remember where the top was so we can use it as a limit later.
625 640 highestrev = max([self.rev(n) for n in nodestotag])
626 641 while nodestotag:
627 642 # grab a node to tag
628 643 n = nodestotag.pop()
629 644 # Never tag nullid
630 645 if n == nullid:
631 646 continue
632 647 # A node's revision number represents its place in a
633 648 # topologically sorted list of nodes.
634 649 r = self.rev(n)
635 650 if r >= lowestrev:
636 651 if n not in ancestors:
637 652 # If we are possibly a descendent of one of the roots
638 653 # and we haven't already been marked as an ancestor
639 654 ancestors[n] = 1 # Mark as ancestor
640 655 # Add non-nullid parents to list of nodes to tag.
641 656 nodestotag.extend([p for p in self.parents(n) if
642 657 p != nullid])
643 658 elif n in heads: # We've seen it before, is it a fake head?
644 659 # So it is, real heads should not be the ancestors of
645 660 # any other heads.
646 661 heads.pop(n)
647 662 if not ancestors:
648 663 return nonodes
649 664 # Now that we have our set of ancestors, we want to remove any
650 665 # roots that are not ancestors.
651 666
652 667 # If one of the roots was nullid, everything is included anyway.
653 668 if lowestrev > nullrev:
654 669 # But, since we weren't, let's recompute the lowest rev to not
655 670 # include roots that aren't ancestors.
656 671
657 672 # Filter out roots that aren't ancestors of heads
658 673 roots = [n for n in roots if n in ancestors]
659 674 # Recompute the lowest revision
660 675 if roots:
661 676 lowestrev = min([self.rev(n) for n in roots])
662 677 else:
663 678 # No more roots? Return empty list
664 679 return nonodes
665 680 else:
666 681 # We are descending from nullid, and don't need to care about
667 682 # any other roots.
668 683 lowestrev = nullrev
669 684 roots = [nullid]
670 685 # Transform our roots list into a 'set' (i.e. a dictionary where the
671 686 # values don't matter.
672 687 descendents = dict.fromkeys(roots, 1)
673 688 # Also, keep the original roots so we can filter out roots that aren't
674 689 # 'real' roots (i.e. are descended from other roots).
675 690 roots = descendents.copy()
676 691 # Our topologically sorted list of output nodes.
677 692 orderedout = []
678 693 # Don't start at nullid since we don't want nullid in our output list,
679 694 # and if nullid shows up in descedents, empty parents will look like
680 695 # they're descendents.
681 696 for r in xrange(max(lowestrev, 0), highestrev + 1):
682 697 n = self.node(r)
683 698 isdescendent = False
684 699 if lowestrev == nullrev: # Everybody is a descendent of nullid
685 700 isdescendent = True
686 701 elif n in descendents:
687 702 # n is already a descendent
688 703 isdescendent = True
689 704 # This check only needs to be done here because all the roots
690 705 # will start being marked is descendents before the loop.
691 706 if n in roots:
692 707 # If n was a root, check if it's a 'real' root.
693 708 p = tuple(self.parents(n))
694 709 # If any of its parents are descendents, it's not a root.
695 710 if (p[0] in descendents) or (p[1] in descendents):
696 711 roots.pop(n)
697 712 else:
698 713 p = tuple(self.parents(n))
699 714 # A node is a descendent if either of its parents are
700 715 # descendents. (We seeded the dependents list with the roots
701 716 # up there, remember?)
702 717 if (p[0] in descendents) or (p[1] in descendents):
703 718 descendents[n] = 1
704 719 isdescendent = True
705 720 if isdescendent and ((ancestors is None) or (n in ancestors)):
706 721 # Only include nodes that are both descendents and ancestors.
707 722 orderedout.append(n)
708 723 if (ancestors is not None) and (n in heads):
709 724 # We're trying to figure out which heads are reachable
710 725 # from roots.
711 726 # Mark this head as having been reached
712 727 heads[n] = 1
713 728 elif ancestors is None:
714 729 # Otherwise, we're trying to discover the heads.
715 730 # Assume this is a head because if it isn't, the next step
716 731 # will eventually remove it.
717 732 heads[n] = 1
718 733 # But, obviously its parents aren't.
719 734 for p in self.parents(n):
720 735 heads.pop(p, None)
721 736 heads = [n for n in heads.iterkeys() if heads[n] != 0]
722 737 roots = roots.keys()
723 738 assert orderedout
724 739 assert roots
725 740 assert heads
726 741 return (orderedout, roots, heads)
727 742
728 743 def heads(self, start=None, stop=None):
729 744 """return the list of all nodes that have no children
730 745
731 746 if start is specified, only heads that are descendants of
732 747 start will be returned
733 748 if stop is specified, it will consider all the revs from stop
734 749 as if they had no children
735 750 """
736 751 if start is None and stop is None:
737 752 count = self.count()
738 753 if not count:
739 754 return [nullid]
740 755 ishead = [1] * (count + 1)
741 756 index = self.index
742 757 for r in xrange(count):
743 758 e = index[r]
744 759 ishead[e[5]] = ishead[e[6]] = 0
745 760 return [self.node(r) for r in xrange(count) if ishead[r]]
746 761
747 762 if start is None:
748 763 start = nullid
749 764 if stop is None:
750 765 stop = []
751 766 stoprevs = dict.fromkeys([self.rev(n) for n in stop])
752 767 startrev = self.rev(start)
753 768 reachable = {startrev: 1}
754 769 heads = {startrev: 1}
755 770
756 771 parentrevs = self.parentrevs
757 772 for r in xrange(startrev + 1, self.count()):
758 773 for p in parentrevs(r):
759 774 if p in reachable:
760 775 if r not in stoprevs:
761 776 reachable[r] = 1
762 777 heads[r] = 1
763 778 if p in heads and p not in stoprevs:
764 779 del heads[p]
765 780
766 781 return [self.node(r) for r in heads]
767 782
768 783 def children(self, node):
769 784 """find the children of a given node"""
770 785 c = []
771 786 p = self.rev(node)
772 787 for r in range(p + 1, self.count()):
773 788 prevs = [pr for pr in self.parentrevs(r) if pr != nullrev]
774 789 if prevs:
775 790 for pr in prevs:
776 791 if pr == p:
777 792 c.append(self.node(r))
778 793 elif p == nullrev:
779 794 c.append(self.node(r))
780 795 return c
781 796
782 797 def _match(self, id):
783 798 if isinstance(id, (long, int)):
784 799 # rev
785 800 return self.node(id)
786 801 if len(id) == 20:
787 802 # possibly a binary node
788 803 # odds of a binary node being all hex in ASCII are 1 in 10**25
789 804 try:
790 805 node = id
791 806 r = self.rev(node) # quick search the index
792 807 return node
793 808 except LookupError:
794 809 pass # may be partial hex id
795 810 try:
796 811 # str(rev)
797 812 rev = int(id)
798 813 if str(rev) != id:
799 814 raise ValueError
800 815 if rev < 0:
801 816 rev = self.count() + rev
802 817 if rev < 0 or rev >= self.count():
803 818 raise ValueError
804 819 return self.node(rev)
805 820 except (ValueError, OverflowError):
806 821 pass
807 822 if len(id) == 40:
808 823 try:
809 824 # a full hex nodeid?
810 825 node = bin(id)
811 826 r = self.rev(node)
812 827 return node
813 828 except TypeError:
814 829 pass
815 830
816 831 def _partialmatch(self, id):
817 832 if len(id) < 40:
818 833 try:
819 834 # hex(node)[:...]
820 835 bin_id = bin(id[:len(id) & ~1]) # grab an even number of digits
821 836 node = None
822 837 for n in self.nodemap:
823 838 if n.startswith(bin_id) and hex(n).startswith(id):
824 839 if node is not None:
825 840 raise LookupError(_("Ambiguous identifier"))
826 841 node = n
827 842 if node is not None:
828 843 return node
829 844 except TypeError:
830 845 pass
831 846
832 847 def lookup(self, id):
833 848 """locate a node based on:
834 849 - revision number or str(revision number)
835 850 - nodeid or subset of hex nodeid
836 851 """
837 852 n = self._match(id)
838 853 if n is not None:
839 854 return n
840 855 n = self._partialmatch(id)
841 856 if n:
842 857 return n
843 858
844 859 raise LookupError(_("No match found"))
845 860
846 861 def cmp(self, node, text):
847 862 """compare text with a given file revision"""
848 863 p1, p2 = self.parents(node)
849 864 return hash(text, p1, p2) != node
850 865
851 866 def chunk(self, rev, df=None):
852 867 def loadcache(df):
853 868 if not df:
854 869 if self._inline:
855 870 df = self.opener(self.indexfile)
856 871 else:
857 872 df = self.opener(self.datafile)
858 873 df.seek(start)
859 874 self._chunkcache = (start, df.read(cache_length))
860 875
861 876 start, length = self.start(rev), self.length(rev)
862 877 if self._inline:
863 878 start += (rev + 1) * self._io.size
864 879 end = start + length
865 880
866 881 offset = 0
867 882 if not self._chunkcache:
868 883 cache_length = max(65536, length)
869 884 loadcache(df)
870 885 else:
871 886 cache_start = self._chunkcache[0]
872 887 cache_length = len(self._chunkcache[1])
873 888 cache_end = cache_start + cache_length
874 889 if start >= cache_start and end <= cache_end:
875 890 # it is cached
876 891 offset = start - cache_start
877 892 else:
878 893 cache_length = max(65536, length)
879 894 loadcache(df)
880 895
881 896 # avoid copying large chunks
882 897 c = self._chunkcache[1]
883 898 if cache_length != length:
884 899 c = c[offset:offset + length]
885 900
886 901 return decompress(c)
887 902
888 903 def delta(self, node):
889 904 """return or calculate a delta between a node and its predecessor"""
890 905 r = self.rev(node)
891 906 return self.revdiff(r - 1, r)
892 907
893 908 def revdiff(self, rev1, rev2):
894 909 """return or calculate a delta between two revisions"""
895 910 if rev1 + 1 == rev2 and self.base(rev1) == self.base(rev2):
896 911 return self.chunk(rev2)
897 912
898 913 return mdiff.textdiff(self.revision(self.node(rev1)),
899 914 self.revision(self.node(rev2)))
900 915
901 916 def revision(self, node):
902 917 """return an uncompressed revision of a given"""
903 918 if node == nullid:
904 919 return ""
905 920 if self._cache and self._cache[0] == node:
906 921 return str(self._cache[2])
907 922
908 923 # look up what we need to read
909 924 text = None
910 925 rev = self.rev(node)
911 926 base = self.base(rev)
912 927
913 928 # check rev flags
914 929 if self.index[rev][0] & 0xFFFF:
915 930 raise RevlogError(_('incompatible revision flag %x') %
916 931 (self.index[rev][0] & 0xFFFF))
917 932
918 933 if self._inline:
919 934 # we probably have the whole chunk cached
920 935 df = None
921 936 else:
922 937 df = self.opener(self.datafile)
923 938
924 939 # do we have useful data cached?
925 940 if self._cache and self._cache[1] >= base and self._cache[1] < rev:
926 941 base = self._cache[1]
927 942 text = str(self._cache[2])
928 943 self._loadindex(base, rev + 1)
929 944 else:
930 945 self._loadindex(base, rev + 1)
931 946 text = self.chunk(base, df=df)
932 947
933 948 bins = [self.chunk(r, df) for r in xrange(base + 1, rev + 1)]
934 949 text = mdiff.patches(text, bins)
935 950 p1, p2 = self.parents(node)
936 951 if node != hash(text, p1, p2):
937 952 raise RevlogError(_("integrity check failed on %s:%d")
938 953 % (self.datafile, rev))
939 954
940 955 self._cache = (node, rev, text)
941 956 return text
942 957
943 958 def checkinlinesize(self, tr, fp=None):
944 959 if not self._inline:
945 960 return
946 961 if not fp:
947 962 fp = self.opener(self.indexfile, 'r')
948 963 fp.seek(0, 2)
949 964 size = fp.tell()
950 965 if size < 131072:
951 966 return
952 967 trinfo = tr.find(self.indexfile)
953 968 if trinfo == None:
954 969 raise RevlogError(_("%s not found in the transaction")
955 970 % self.indexfile)
956 971
957 972 trindex = trinfo[2]
958 973 dataoff = self.start(trindex)
959 974
960 975 tr.add(self.datafile, dataoff)
961 976 df = self.opener(self.datafile, 'w')
962 977 calc = self._io.size
963 978 for r in xrange(self.count()):
964 979 start = self.start(r) + (r + 1) * calc
965 980 length = self.length(r)
966 981 fp.seek(start)
967 982 d = fp.read(length)
968 983 df.write(d)
969 984 fp.close()
970 985 df.close()
971 986 fp = self.opener(self.indexfile, 'w', atomictemp=True)
972 987 self.version &= ~(REVLOGNGINLINEDATA)
973 988 self._inline = False
974 989 for i in xrange(self.count()):
975 990 e = self._io.packentry(self.index[i], self.node, self.version, i)
976 991 fp.write(e)
977 992
978 993 # if we don't call rename, the temp file will never replace the
979 994 # real index
980 995 fp.rename()
981 996
982 997 tr.replace(self.indexfile, trindex * calc)
983 998 self._chunkcache = None
984 999
985 1000 def addrevision(self, text, transaction, link, p1, p2, d=None):
986 1001 """add a revision to the log
987 1002
988 1003 text - the revision data to add
989 1004 transaction - the transaction object used for rollback
990 1005 link - the linkrev data to add
991 1006 p1, p2 - the parent nodeids of the revision
992 1007 d - an optional precomputed delta
993 1008 """
994 1009 dfh = None
995 1010 if not self._inline:
996 1011 dfh = self.opener(self.datafile, "a")
997 1012 ifh = self.opener(self.indexfile, "a+")
998 1013 return self._addrevision(text, transaction, link, p1, p2, d, ifh, dfh)
999 1014
1000 1015 def _addrevision(self, text, transaction, link, p1, p2, d, ifh, dfh):
1001 1016 node = hash(text, p1, p2)
1002 1017 if node in self.nodemap:
1003 1018 return node
1004 1019
1005 1020 curr = self.count()
1006 1021 prev = curr - 1
1007 1022 base = self.base(prev)
1008 1023 offset = self.end(prev)
1009 1024
1010 1025 if curr:
1011 1026 if not d:
1012 1027 ptext = self.revision(self.node(prev))
1013 1028 d = mdiff.textdiff(ptext, text)
1014 1029 data = compress(d)
1015 1030 l = len(data[1]) + len(data[0])
1016 1031 dist = l + offset - self.start(base)
1017 1032
1018 1033 # full versions are inserted when the needed deltas
1019 1034 # become comparable to the uncompressed text
1020 1035 if not curr or dist > len(text) * 2:
1021 1036 data = compress(text)
1022 1037 l = len(data[1]) + len(data[0])
1023 1038 base = curr
1024 1039
1025 1040 e = (offset_type(offset, 0), l, len(text),
1026 1041 base, link, self.rev(p1), self.rev(p2), node)
1027 1042 self.index.insert(-1, e)
1028 1043 self.nodemap[node] = curr
1029 1044
1030 1045 entry = self._io.packentry(e, self.node, self.version, curr)
1031 1046 if not self._inline:
1032 1047 transaction.add(self.datafile, offset)
1033 1048 transaction.add(self.indexfile, curr * len(entry))
1034 1049 if data[0]:
1035 1050 dfh.write(data[0])
1036 1051 dfh.write(data[1])
1037 1052 dfh.flush()
1038 1053 ifh.write(entry)
1039 1054 else:
1040 1055 offset += curr * self._io.size
1041 1056 transaction.add(self.indexfile, offset, curr)
1042 1057 ifh.write(entry)
1043 1058 ifh.write(data[0])
1044 1059 ifh.write(data[1])
1045 1060 self.checkinlinesize(transaction, ifh)
1046 1061
1047 1062 self._cache = (node, curr, text)
1048 1063 return node
1049 1064
1050 1065 def ancestor(self, a, b):
1051 1066 """calculate the least common ancestor of nodes a and b"""
1052 1067
1053 1068 def parents(rev):
1054 1069 return [p for p in self.parentrevs(rev) if p != nullrev]
1055 1070
1056 1071 c = ancestor.ancestor(self.rev(a), self.rev(b), parents)
1057 1072 if c is None:
1058 1073 return nullid
1059 1074
1060 1075 return self.node(c)
1061 1076
1062 1077 def group(self, nodelist, lookup, infocollect=None):
1063 1078 """calculate a delta group
1064 1079
1065 1080 Given a list of changeset revs, return a set of deltas and
1066 1081 metadata corresponding to nodes. the first delta is
1067 1082 parent(nodes[0]) -> nodes[0] the receiver is guaranteed to
1068 1083 have this parent as it has all history before these
1069 1084 changesets. parent is parent[0]
1070 1085 """
1071 1086 revs = [self.rev(n) for n in nodelist]
1072 1087
1073 1088 # if we don't have any revisions touched by these changesets, bail
1074 1089 if not revs:
1075 1090 yield changegroup.closechunk()
1076 1091 return
1077 1092
1078 1093 # add the parent of the first rev
1079 1094 p = self.parents(self.node(revs[0]))[0]
1080 1095 revs.insert(0, self.rev(p))
1081 1096
1082 1097 # build deltas
1083 1098 for d in xrange(0, len(revs) - 1):
1084 1099 a, b = revs[d], revs[d + 1]
1085 1100 nb = self.node(b)
1086 1101
1087 1102 if infocollect is not None:
1088 1103 infocollect(nb)
1089 1104
1090 1105 p = self.parents(nb)
1091 1106 meta = nb + p[0] + p[1] + lookup(nb)
1092 1107 if a == -1:
1093 1108 d = self.revision(nb)
1094 1109 meta += mdiff.trivialdiffheader(len(d))
1095 1110 else:
1096 1111 d = self.revdiff(a, b)
1097 1112 yield changegroup.chunkheader(len(meta) + len(d))
1098 1113 yield meta
1099 1114 if len(d) > 2**20:
1100 1115 pos = 0
1101 1116 while pos < len(d):
1102 1117 pos2 = pos + 2 ** 18
1103 1118 yield d[pos:pos2]
1104 1119 pos = pos2
1105 1120 else:
1106 1121 yield d
1107 1122
1108 1123 yield changegroup.closechunk()
1109 1124
1110 1125 def addgroup(self, revs, linkmapper, transaction, unique=0):
1111 1126 """
1112 1127 add a delta group
1113 1128
1114 1129 given a set of deltas, add them to the revision log. the
1115 1130 first delta is against its parent, which should be in our
1116 1131 log, the rest are against the previous delta.
1117 1132 """
1118 1133
1119 1134 #track the base of the current delta log
1120 1135 r = self.count()
1121 1136 t = r - 1
1122 1137 node = None
1123 1138
1124 1139 base = prev = nullrev
1125 1140 start = end = textlen = 0
1126 1141 if r:
1127 1142 end = self.end(t)
1128 1143
1129 1144 ifh = self.opener(self.indexfile, "a+")
1130 1145 isize = r * self._io.size
1131 1146 if self._inline:
1132 1147 transaction.add(self.indexfile, end + isize, r)
1133 1148 dfh = None
1134 1149 else:
1135 1150 transaction.add(self.indexfile, isize, r)
1136 1151 transaction.add(self.datafile, end)
1137 1152 dfh = self.opener(self.datafile, "a")
1138 1153
1139 1154 # loop through our set of deltas
1140 1155 chain = None
1141 1156 for chunk in revs:
1142 1157 node, p1, p2, cs = struct.unpack("20s20s20s20s", chunk[:80])
1143 1158 link = linkmapper(cs)
1144 1159 if node in self.nodemap:
1145 1160 # this can happen if two branches make the same change
1146 1161 # if unique:
1147 1162 # raise RevlogError(_("already have %s") % hex(node[:4]))
1148 1163 chain = node
1149 1164 continue
1150 1165 delta = buffer(chunk, 80)
1151 1166 del chunk
1152 1167
1153 1168 for p in (p1, p2):
1154 1169 if not p in self.nodemap:
1155 1170 raise LookupError(_("unknown parent %s") % short(p))
1156 1171
1157 1172 if not chain:
1158 1173 # retrieve the parent revision of the delta chain
1159 1174 chain = p1
1160 1175 if not chain in self.nodemap:
1161 1176 raise LookupError(_("unknown base %s") % short(chain[:4]))
1162 1177
1163 1178 # full versions are inserted when the needed deltas become
1164 1179 # comparable to the uncompressed text or when the previous
1165 1180 # version is not the one we have a delta against. We use
1166 1181 # the size of the previous full rev as a proxy for the
1167 1182 # current size.
1168 1183
1169 1184 if chain == prev:
1170 1185 cdelta = compress(delta)
1171 1186 cdeltalen = len(cdelta[0]) + len(cdelta[1])
1172 1187 textlen = mdiff.patchedsize(textlen, delta)
1173 1188
1174 1189 if chain != prev or (end - start + cdeltalen) > textlen * 2:
1175 1190 # flush our writes here so we can read it in revision
1176 1191 if dfh:
1177 1192 dfh.flush()
1178 1193 ifh.flush()
1179 1194 text = self.revision(chain)
1180 1195 if len(text) == 0:
1181 1196 # skip over trivial delta header
1182 1197 text = buffer(delta, 12)
1183 1198 else:
1184 1199 text = mdiff.patches(text, [delta])
1185 1200 del delta
1186 1201 chk = self._addrevision(text, transaction, link, p1, p2, None,
1187 1202 ifh, dfh)
1188 1203 if not dfh and not self._inline:
1189 1204 # addrevision switched from inline to conventional
1190 1205 # reopen the index
1191 1206 dfh = self.opener(self.datafile, "a")
1192 1207 ifh = self.opener(self.indexfile, "a")
1193 1208 if chk != node:
1194 1209 raise RevlogError(_("consistency error adding group"))
1195 1210 textlen = len(text)
1196 1211 else:
1197 1212 e = (offset_type(end, 0), cdeltalen, textlen, base,
1198 1213 link, self.rev(p1), self.rev(p2), node)
1199 1214 self.index.insert(-1, e)
1200 1215 self.nodemap[node] = r
1201 1216 entry = self._io.packentry(e, self.node, self.version, r)
1202 1217 if self._inline:
1203 1218 ifh.write(entry)
1204 1219 ifh.write(cdelta[0])
1205 1220 ifh.write(cdelta[1])
1206 1221 self.checkinlinesize(transaction, ifh)
1207 1222 if not self._inline:
1208 1223 dfh = self.opener(self.datafile, "a")
1209 1224 ifh = self.opener(self.indexfile, "a")
1210 1225 else:
1211 1226 dfh.write(cdelta[0])
1212 1227 dfh.write(cdelta[1])
1213 1228 ifh.write(entry)
1214 1229
1215 1230 t, r, chain, prev = r, r + 1, node, node
1216 1231 base = self.base(t)
1217 1232 start = self.start(base)
1218 1233 end = self.end(t)
1219 1234
1220 1235 return node
1221 1236
1222 1237 def strip(self, rev, minlink):
1223 1238 if self.count() == 0 or rev >= self.count():
1224 1239 return
1225 1240
1226 1241 if isinstance(self.index, lazyindex):
1227 1242 self._loadindexmap()
1228 1243
1229 1244 # When stripping away a revision, we need to make sure it
1230 1245 # does not actually belong to an older changeset.
1231 1246 # The minlink parameter defines the oldest revision
1232 1247 # we're allowed to strip away.
1233 1248 while minlink > self.index[rev][4]:
1234 1249 rev += 1
1235 1250 if rev >= self.count():
1236 1251 return
1237 1252
1238 1253 # first truncate the files on disk
1239 1254 end = self.start(rev)
1240 1255 if not self._inline:
1241 1256 df = self.opener(self.datafile, "a")
1242 1257 df.truncate(end)
1243 1258 end = rev * self._io.size
1244 1259 else:
1245 1260 end += rev * self._io.size
1246 1261
1247 1262 indexf = self.opener(self.indexfile, "a")
1248 1263 indexf.truncate(end)
1249 1264
1250 1265 # then reset internal state in memory to forget those revisions
1251 1266 self._cache = None
1252 1267 self._chunkcache = None
1253 1268 for x in xrange(rev, self.count()):
1254 1269 del self.nodemap[self.node(x)]
1255 1270
1256 1271 del self.index[rev:-1]
1257 1272
1258 1273 def checksize(self):
1259 1274 expected = 0
1260 1275 if self.count():
1261 1276 expected = max(0, self.end(self.count() - 1))
1262 1277
1263 1278 try:
1264 1279 f = self.opener(self.datafile)
1265 1280 f.seek(0, 2)
1266 1281 actual = f.tell()
1267 1282 dd = actual - expected
1268 1283 except IOError, inst:
1269 1284 if inst.errno != errno.ENOENT:
1270 1285 raise
1271 1286 dd = 0
1272 1287
1273 1288 try:
1274 1289 f = self.opener(self.indexfile)
1275 1290 f.seek(0, 2)
1276 1291 actual = f.tell()
1277 1292 s = self._io.size
1278 1293 i = max(0, actual / s)
1279 1294 di = actual - (i * s)
1280 1295 if self._inline:
1281 1296 databytes = 0
1282 1297 for r in xrange(self.count()):
1283 1298 databytes += max(0, self.length(r))
1284 1299 dd = 0
1285 1300 di = actual - self.count() * s - databytes
1286 1301 except IOError, inst:
1287 1302 if inst.errno != errno.ENOENT:
1288 1303 raise
1289 1304 di = 0
1290 1305
1291 1306 return (dd, di)
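As a usage note on the compress()/decompress() pair near the top of the file: compress() returns a (header, data) tuple that is written back to back, and decompress() dispatches on the first stored byte, '\0' for a NUL-prefixed literal, 'x' for a zlib stream (zlib output always starts with 'x'), and 'u' for an explicitly uncompressed literal. A small round-trip sketch of that convention, outside the revlog class (store and load are illustrative names; Python 2 style to match the file):

import zlib

def store(text):
    """Mimic revlog's compress(): return (header, data) written back to back."""
    if not text:
        return ("", text)
    bin = zlib.compress(text)
    if len(bin) > len(text):
        if text[0] == "\0":     # a leading NUL already marks a literal chunk
            return ("", text)
        return ("u", text)      # 'u' marks an uncompressed literal
    return ("", bin)            # zlib data starts with 'x', so no header is needed

def load(chunk):
    """Mimic revlog's decompress(): dispatch on the first stored byte."""
    if not chunk:
        return chunk
    t = chunk[0]
    if t == "\0":
        return chunk
    if t == "x":
        return zlib.decompress(chunk)
    if t == "u":
        return chunk[1:]
    raise ValueError("unknown compression type %r" % t)

for text in ("", "\0binary-ish", "tiny", "long delta " * 10000):
    header, data = store(text)
    assert load(header + data) == text

The sketch omits revlog's small-text shortcut (texts under 44 bytes are never worth compressing) and the chunked path shown above, but it follows the same header convention as the real compress() and decompress().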