##// END OF EJS Templates
Move hash function back to revlog from node
mpm@selenic.com -
r1091:d62130f9 default
parent child Browse files
Show More
@@ -1,36 +1,21 b''
1 1 """
2 2 node.py - basic nodeid manipulation for mercurial
3 3
4 4 Copyright 2005 Matt Mackall <mpm@selenic.com>
5 5
6 6 This software may be used and distributed according to the terms
7 7 of the GNU General Public License, incorporated herein by reference.
8 8 """
9 9
10 10 import sha, binascii
11 11
# the id of the null (empty) revision: 20 zero bytes
nullid = "\0" * 20
13 13
def hex(node):
    """return the hexadecimal representation of a binary node id"""
    # b2a_hex is the binascii-native name for hexlify
    return binascii.b2a_hex(node)
16 16
def bin(node):
    """convert a hex-encoded node id back to its binary form"""
    # a2b_hex is the binascii-native name for unhexlify
    return binascii.a2b_hex(node)
19 19
def short(node):
    """return an abbreviated hex form of a node id"""
    # 6 binary bytes give 12 hex digits, normally enough to be unique
    prefix = node[:6]
    return hex(prefix)
22
def hash(text, p1, p2):
    """generate a hash from the given text and its parent hashes

    This hash combines both the current file contents and its history
    in a manner that makes it easy to distinguish nodes with the same
    content in the revision graph.

    text - the revision contents
    p1, p2 - the 20-byte binary node ids of the parents

    Returns the 20-byte binary SHA-1 digest.
    """
    # hashlib.sha1 produces digests identical to the deprecated
    # sha.new (the sha module was removed in Python 3); imported
    # locally so the module's import line is unchanged
    import hashlib
    # sort the parents so the result is independent of parent order
    a, b = sorted([p1, p2])
    s = hashlib.sha1(a)
    s.update(b)
    s.update(text)
    return s.digest()
36
@@ -1,632 +1,646 b''
1 1 """
2 2 revlog.py - storage back-end for mercurial
3 3
4 4 This provides efficient delta storage with O(1) retrieve and append
5 5 and O(changes) merge between branches
6 6
7 7 Copyright 2005 Matt Mackall <mpm@selenic.com>
8 8
9 9 This software may be used and distributed according to the terms
10 10 of the GNU General Public License, incorporated herein by reference.
11 11 """
12 12
13 13 import zlib, struct, sha, binascii, heapq
14 14 from mercurial import mdiff
15 15 from node import *
16 16
def hash(text, p1, p2):
    """generate a hash from the given text and its parent hashes

    This hash combines both the current file contents and its history
    in a manner that makes it easy to distinguish nodes with the same
    content in the revision graph.

    text - the revision contents
    p1, p2 - the 20-byte binary node ids of the parents

    Returns the 20-byte binary SHA-1 digest.
    """
    # hashlib.sha1 produces digests identical to the deprecated
    # sha.new (the sha module was removed in Python 3); imported
    # locally so the module's import line is unchanged
    import hashlib
    # sort the parents so the result is independent of parent order
    a, b = sorted([p1, p2])
    s = hashlib.sha1(a)
    s.update(b)
    s.update(text)
    return s.digest()
30
def compress(text):
    """generate a possibly-compressed representation of text

    Returns the text unchanged when empty or when it already starts
    with the '\\0' marker, an 'u'-prefixed literal when compression
    would not help, or the zlib-compressed form otherwise.
    """
    if not text:
        return text
    # the uncompressed form: '\0'-led text is self-marking, anything
    # else needs the 'u' prefix so decompress() can recognize it
    literal = text if text[0] == '\0' else 'u' + text
    # tiny revisions never win from zlib; store them literally
    if len(text) < 44:
        return literal
    compressed = zlib.compress(text)
    if len(compressed) > len(text):
        return literal
    return compressed
28 42
def decompress(bin):
    """decompress the given input

    The first byte tags the encoding: '\\0' means stored verbatim,
    'x' marks zlib data, 'u' prefixes an uncompressed literal.
    """
    if not bin:
        return bin
    marker = bin[0]
    if marker == '\0':
        return bin
    if marker == 'x':
        return zlib.decompress(bin)
    if marker == 'u':
        return bin[1:]
    raise RevlogError("unknown compression type %s" % marker)
37 51
# on-disk index record: start offset, length, base rev, linkrev
# (four big-endian longs) followed by p1, p2 and this node's id
# (three 20-byte binary hashes)
indexformat = ">4l20s20s20s"
39 53
class lazyparser:
    """
    this class avoids the need to parse the entirety of large indices

    By default we parse and load 1000 entries at a time.

    If no position is specified, we load the whole index, and replace
    the lazy objects in revlog with the underlying objects for
    efficiency in cases where we look at most of the nodes.
    """
    def __init__(self, data, revlog):
        # raw index file contents, parsed incrementally by load()
        self.data = data
        # size in bytes of one index record
        self.s = struct.calcsize(indexformat)
        # total number of records in the index
        self.l = len(data)/self.s
        # entries are faulted in lazily; None marks an unparsed slot
        self.index = [None] * self.l
        # nodeid -> rev mapping, grown as entries are parsed
        self.map = {nullid: -1}
        # flag: set to 1 once the whole index has been parsed
        self.all = 0
        self.revlog = revlog

    def load(self, pos=None):
        """parse index records: the 1000-entry block around pos, or all"""
        if self.all: return
        if pos is not None:
            # parse only the 1000-entry block containing pos
            block = pos / 1000
            i = block * 1000
            end = min(self.l, i + 1000)
        else:
            self.all = 1
            i = 0
            end = self.l
            # everything will be parsed below, so give the owning
            # revlog the plain list/dict for faster direct access
            self.revlog.index = self.index
            self.revlog.nodemap = self.map

        while i < end:
            d = self.data[i * self.s: (i + 1) * self.s]
            e = struct.unpack(indexformat, d)
            self.index[i] = e
            # e[6] is the nodeid field of the record
            self.map[e[6]] = i
            i += 1
78 92
class lazyindex:
    """a lazy version of the index array"""
    def __init__(self, parser):
        self.p = parser

    def __len__(self):
        return len(self.p.index)

    def load(self, pos):
        # fault in the block containing pos, then return the entry
        self.p.load(pos)
        return self.p.index[pos]

    def __getitem__(self, pos):
        # an unparsed slot holds None, so fall through to load()
        entry = self.p.index[pos]
        if entry:
            return entry
        return self.load(pos)

    def append(self, e):
        self.p.index.append(e)
92 106
class lazymap:
    """a lazy version of the node map"""
    def __init__(self, parser):
        self.p = parser
    def load(self, key):
        # locate the raw 20-byte nodeid in the unparsed index data and
        # fault in the block that (probably) contains it.
        # NOTE(review): data.find can match at a non-record-aligned
        # offset; the caller's map lookup catches such false positives.
        if self.p.all: return
        n = self.p.data.find(key)
        if n < 0: raise KeyError("node " + hex(key))
        pos = n / self.p.s
        self.p.load(pos)
    def __contains__(self, key):
        # containment needs the complete map, so parse everything
        self.p.load()
        return key in self.p.map
    def __iter__(self):
        yield nullid
        for i in xrange(self.p.l):
            try:
                yield self.p.index[i][6]
            except:
                # slot still None (unparsed); fault it in and retry
                self.p.load(i)
                yield self.p.index[i][6]
    def __getitem__(self, key):
        try:
            return self.p.map[key]
        except KeyError:
            try:
                # not parsed yet; try to fault it in from the raw data
                self.load(key)
                return self.p.map[key]
            except KeyError:
                raise KeyError("node " + hex(key))
    def __setitem__(self, key, val):
        self.p.map[key] = val
125 139
class RevlogError(Exception):
    """raised when a revlog is corrupt or used inconsistently"""
127 141
class revlog:
    """
    the underlying revision storage object

    A revlog consists of two parts, an index and the revision data.

    The index is a file with a fixed record size containing
    information on each revision, includings its nodeid (hash), the
    nodeids of its parents, the position and offset of its data within
    the data file, and the revision it's based on. Finally, each entry
    contains a linkrev entry that can serve as a pointer to external
    data.

    The revision data itself is a linear collection of data chunks.
    Each chunk represents a revision and is usually represented as a
    delta against the previous chunk. To bound lookup time, runs of
    deltas are limited to about 2 times the length of the original
    version data. This makes retrieval of a version proportional to
    its size, or O(1) relative to the number of revisions.

    Both pieces of the revlog are written to in an append-only
    fashion, which means we never need to rewrite a file to insert or
    remove data, and can use some simple techniques to avoid the need
    for locking while reading.
    """
    def __init__(self, opener, indexfile, datafile):
        """
        create a revlog object

        opener is a function that abstracts the file opening operation
        and can be used to implement COW semantics or the like.
        """
        self.indexfile = indexfile
        self.datafile = datafile
        self.opener = opener
        # cache of the most recently read revision: (node, rev, text)
        self.cache = None

        try:
            i = self.opener(self.indexfile).read()
        except IOError:
            # no index yet: treat as an empty revlog
            i = ""

        if len(i) > 10000:
            # big index, let's parse it on demand
            parser = lazyparser(i, self)
            self.index = lazyindex(parser)
            self.nodemap = lazymap(parser)
        else:
            # small index: parse everything up front
            s = struct.calcsize(indexformat)
            l = len(i) / s
            self.index = [None] * l
            m = [None] * l

            n = 0
            for f in xrange(0, len(i), s):
                # offset, size, base, linkrev, p1, p2, nodeid
                e = struct.unpack(indexformat, i[f:f + s])
                m[n] = (e[6], n)
                self.index[n] = e
                n += 1

            self.nodemap = dict(m)
            self.nodemap[nullid] = -1

    # small accessors over the index tuple
    # fields: 0 offset, 1 length, 2 base rev, 3 linkrev, 4 p1, 5 p2, 6 nodeid
    def tip(self): return self.node(len(self.index) - 1)
    def count(self): return len(self.index)
    def node(self, rev): return (rev < 0) and nullid or self.index[rev][6]
    def rev(self, node): return self.nodemap[node]
    def linkrev(self, node): return self.index[self.nodemap[node]][3]
    def parents(self, node):
        if node == nullid: return (nullid, nullid)
        return self.index[self.nodemap[node]][4:6]

    def start(self, rev): return self.index[rev][0]
    def length(self, rev): return self.index[rev][1]
    def end(self, rev): return self.start(rev) + self.length(rev)
    def base(self, rev): return self.index[rev][2]

    def reachable(self, rev, stop=None):
        """return the set of nodes reachable from rev, not going past stop

        Breadth-first walk over parents, collected as dict keys.
        """
        reachable = {}
        visit = [rev]
        reachable[rev] = 1
        if stop:
            stopn = self.rev(stop)
        else:
            stopn = 0
        while visit:
            n = visit.pop(0)
            if n == stop:
                continue
            if n == nullid:
                continue
            for p in self.parents(n):
                # parents below the stop revision are out of range
                if self.rev(p) < stopn:
                    continue
                if p not in reachable:
                    reachable[p] = 1
                    visit.append(p)
        return reachable

    def heads(self, stop=None):
        """return the list of all nodes that have no children"""
        # p records every node seen as a parent; anything not in it
        # when we reach it (walking from tip downward) is a head
        p = {}
        h = []
        stoprev = 0
        if stop and stop in self.nodemap:
            stoprev = self.rev(stop)

        for r in range(self.count() - 1, -1, -1):
            n = self.node(r)
            if n not in p:
                h.append(n)
            if n == stop:
                break
            if r < stoprev:
                break
            for pn in self.parents(n):
                p[pn] = 1
        return h

    def children(self, node):
        """find the children of a given node"""
        c = []
        p = self.rev(node)
        # children can only appear after their parent in the log
        for r in range(p + 1, self.count()):
            n = self.node(r)
            for pn in self.parents(n):
                if pn == node:
                    c.append(n)
                    continue
                # NOTE(review): both branches just continue the loop,
                # so this elif has no effect on the result
                elif pn == nullid:
                    continue
        return c

    def lookup(self, id):
        """locate a node based on revision number or subset of hex nodeid"""
        try:
            # try a (possibly negative) integer revision number first
            rev = int(id)
            if str(rev) != id: raise ValueError
            if rev < 0: rev = self.count() + rev
            if rev < 0 or rev >= self.count(): raise ValueError
            return self.node(rev)
        except (ValueError, OverflowError):
            # otherwise treat id as a hex nodeid prefix
            c = []
            for n in self.nodemap:
                if hex(n).startswith(id):
                    c.append(n)
            if len(c) > 1: raise KeyError("Ambiguous identifier")
            if len(c) < 1: raise KeyError("No match found")
            return c[0]

        # NOTE(review): not reached; both paths above return or raise
        return None

    def diff(self, a, b):
        """return a delta between two revisions"""
        return mdiff.textdiff(a, b)

    def patches(self, t, pl):
        """apply a list of patches to a string"""
        return mdiff.patches(t, pl)

    def delta(self, node):
        """return or calculate a delta between a node and its predecessor"""
        r = self.rev(node)
        b = self.base(r)
        if r == b:
            # this rev is stored as a full version; compute the delta
            return self.diff(self.revision(self.node(r - 1)),
                             self.revision(node))
        else:
            # stored as a delta already; read it straight off disk
            f = self.opener(self.datafile)
            f.seek(self.start(r))
            data = f.read(self.length(r))
            return decompress(data)

    def revision(self, node):
        """return an uncompressed revision of a given"""
        if node == nullid: return ""
        if self.cache and self.cache[0] == node: return self.cache[2]

        # look up what we need to read
        text = None
        rev = self.rev(node)
        # NOTE: rebinds the node parameter with the (equal) index field
        start, length, base, link, p1, p2, node = self.index[rev]
        end = start + length
        if base != rev: start = self.start(base)

        # do we have useful data cached?
        if self.cache and self.cache[1] >= base and self.cache[1] < rev:
            # resume patching from the cached revision instead of base
            base = self.cache[1]
            start = self.start(base + 1)
            text = self.cache[2]
            last = 0

        f = self.opener(self.datafile)
        f.seek(start)
        data = f.read(end - start)

        if text is None:
            # first chunk is the full version at the base revision
            last = self.length(base)
            text = decompress(data[:last])

        # apply every delta from base+1 up to the requested revision
        bins = []
        for r in xrange(base + 1, rev + 1):
            s = self.length(r)
            bins.append(decompress(data[last:last + s]))
            last = last + s

        text = mdiff.patches(text, bins)

        # verify the reconstructed text against the stored nodeid
        if node != hash(text, p1, p2):
            raise IOError("integrity check failed on %s:%d"
                          % (self.datafile, rev))

        self.cache = (node, rev, text)
        return text

    def addrevision(self, text, transaction, link, p1=None, p2=None, d=None):
        """add a revision to the log

        text - the revision data to add
        transaction - the transaction object used for rollback
        link - the linkrev data to add
        p1, p2 - the parent nodeids of the revision
        d - an optional precomputed delta
        """
        if text is None: text = ""
        if p1 is None: p1 = self.tip()
        if p2 is None: p2 = nullid

        node = hash(text, p1, p2)

        # adding content we already have is a no-op
        if node in self.nodemap:
            return node

        n = self.count()
        t = n - 1

        if n:
            base = self.base(t)
            start = self.start(base)
            end = self.end(t)
            if not d:
                prev = self.revision(self.tip())
                d = self.diff(prev, text)
            data = compress(d)
            # total size of the delta chain if we append this delta
            dist = end - start + len(data)

        # full versions are inserted when the needed deltas
        # become comparable to the uncompressed text
        if not n or dist > len(text) * 2:
            data = compress(text)
            base = n
        else:
            base = self.base(t)

        offset = 0
        if t >= 0:
            offset = self.end(t)

        e = (offset, len(data), base, link, p1, p2, node)

        self.index.append(e)
        self.nodemap[node] = n
        entry = struct.pack(indexformat, *e)

        # register append positions with the transaction for rollback
        transaction.add(self.datafile, e[0])
        self.opener(self.datafile, "a").write(data)
        transaction.add(self.indexfile, n * len(entry))
        self.opener(self.indexfile, "a").write(entry)

        self.cache = (node, n, text)
        return node

    def ancestor(self, a, b):
        """calculate the least common ancestor of nodes a and b"""
        # calculate the distance of every node from root
        dist = {nullid: 0}
        for i in xrange(self.count()):
            n = self.node(i)
            p1, p2 = self.parents(n)
            dist[n] = max(dist[p1], dist[p2]) + 1

        # traverse ancestors in order of decreasing distance from root
        def ancestors(node):
            # we store negative distances because heap returns smallest member
            h = [(-dist[node], node)]
            seen = {}
            # NOTE(review): earliest is never used
            earliest = self.count()
            while h:
                d, n = heapq.heappop(h)
                if n not in seen:
                    seen[n] = 1
                    r = self.rev(n)
                    yield (-d, r, n)
                    for p in self.parents(n):
                        heapq.heappush(h, (-dist[p], p))

        x = ancestors(a)
        y = ancestors(b)
        lx = x.next()
        ly = y.next()

        # increment each ancestor list until it is closer to root than
        # the other, or they match
        while 1:
            if lx == ly:
                return lx[2]
            elif lx < ly:
                ly = y.next()
            elif lx > ly:
                lx = x.next()

    def group(self, linkmap):
        """calculate a delta group

        Given a list of changeset revs, return a set of deltas and
        metadata corresponding to nodes. the first delta is
        parent(nodes[0]) -> nodes[0] the receiver is guaranteed to
        have this parent as it has all history before these
        changesets. parent is parent[0]
        """
        revs = []
        needed = {}

        # find file nodes/revs that match changeset revs
        for i in xrange(0, self.count()):
            if self.index[i][3] in linkmap:
                revs.append(i)
                needed[i] = 1

        # if we don't have any revisions touched by these changesets, bail
        if not revs:
            yield struct.pack(">l", 0)
            return

        # add the parent of the first rev
        p = self.parents(self.node(revs[0]))[0]
        revs.insert(0, self.rev(p))

        # for each delta that isn't contiguous in the log, we need to
        # reconstruct the base, reconstruct the result, and then
        # calculate the delta. We also need to do this where we've
        # stored a full version and not a delta
        for i in xrange(0, len(revs) - 1):
            a, b = revs[i], revs[i + 1]
            if a + 1 != b or self.base(b) == b:
                for j in xrange(self.base(a), a + 1):
                    needed[j] = 1
                for j in xrange(self.base(b), b + 1):
                    needed[j] = 1

        # calculate spans to retrieve from datafile
        needed = needed.keys()
        needed.sort()
        spans = []
        # oo/ol track the offset and length of the span being built
        oo = -1
        ol = 0
        for n in needed:
            if n < 0: continue
            o = self.start(n)
            l = self.length(n)
            if oo + ol == o: # can we merge with the previous?
                nl = spans[-1][2]
                nl.append((n, l))
                ol += l
                spans[-1] = (oo, ol, nl)
            else:
                oo = o
                ol = l
                spans.append((oo, ol, [(n, l)]))

        # read spans in, divide up chunks
        chunks = {}
        for span in spans:
            # we reopen the file for each span to make http happy for now
            f = self.opener(self.datafile)
            f.seek(span[0])
            data = f.read(span[1])

            # divide up the span
            pos = 0
            for r, l in span[2]:
                chunks[r] = decompress(data[pos: pos + l])
                pos += l

        # helper to reconstruct intermediate versions
        def construct(text, base, rev):
            bins = [chunks[r] for r in xrange(base + 1, rev + 1)]
            return mdiff.patches(text, bins)

        # build deltas
        # NOTE(review): deltas is never appended to; results are yielded
        deltas = []
        for d in xrange(0, len(revs) - 1):
            a, b = revs[d], revs[d + 1]
            n = self.node(b)

            # do we need to construct a new delta?
            if a + 1 != b or self.base(b) == b:
                if a >= 0:
                    base = self.base(a)
                    ta = chunks[self.base(a)]
                    ta = construct(ta, base, a)
                else:
                    ta = ""

                base = self.base(b)
                if a > base:
                    base = a
                    tb = ta
                else:
                    tb = chunks[self.base(b)]
                tb = construct(tb, base, b)
                d = self.diff(ta, tb)
            else:
                # contiguous delta: reuse the stored chunk directly
                d = chunks[b]

            # header: nodeid, both parents and the changeset linkmap entry
            p = self.parents(n)
            meta = n + p[0] + p[1] + linkmap[self.linkrev(n)]
            l = struct.pack(">l", len(meta) + len(d) + 4)
            yield l
            yield meta
            yield d

        # zero-length record terminates the group
        yield struct.pack(">l", 0)

    def addgroup(self, revs, linkmapper, transaction, unique=0):
        """
        add a delta group

        given a set of deltas, add them to the revision log. the
        first delta is against its parent, which should be in our
        log, the rest are against the previous delta.
        """

        #track the base of the current delta log
        r = self.count()
        t = r - 1
        node = nullid

        base = prev = -1
        start = end = measure = 0
        if r:
            start = self.start(self.base(t))
            end = self.end(t)
            measure = self.length(self.base(t))
            base = self.base(t)
            prev = self.tip()

        transaction.add(self.datafile, end)
        transaction.add(self.indexfile, r * struct.calcsize(indexformat))
        dfh = self.opener(self.datafile, "a")
        ifh = self.opener(self.indexfile, "a")

        # loop through our set of deltas
        chain = None
        for chunk in revs:
            # chunk header: nodeid, p1, p2 and changeset id (80 bytes)
            node, p1, p2, cs = struct.unpack("20s20s20s20s", chunk[:80])
            link = linkmapper(cs)
            if node in self.nodemap:
                # this can happen if two branches make the same change
                if unique:
                    raise RevlogError("already have %s" % hex(node[:4]))
                chain = node
                continue
            delta = chunk[80:]

            if not chain:
                # retrieve the parent revision of the delta chain
                chain = p1
                if not chain in self.nodemap:
                    raise RevlogError("unknown base %s" % short(chain[:4]))

            # full versions are inserted when the needed deltas become
            # comparable to the uncompressed text or when the previous
            # version is not the one we have a delta against. We use
            # the size of the previous full rev as a proxy for the
            # current size.

            if chain == prev:
                cdelta = compress(delta)

            # NOTE: short-circuit guarantees cdelta is only evaluated
            # when chain == prev, i.e. after the assignment above
            if chain != prev or (end - start + len(cdelta)) > measure * 2:
                # flush our writes here so we can read it in revision
                dfh.flush()
                ifh.flush()
                text = self.revision(chain)
                text = self.patches(text, [delta])
                chk = self.addrevision(text, transaction, link, p1, p2)
                if chk != node:
                    raise RevlogError("consistency error adding group")
                measure = len(text)
            else:
                # append the compressed delta directly
                e = (end, len(cdelta), self.base(t), link, p1, p2, node)
                self.index.append(e)
                self.nodemap[node] = r
                dfh.write(cdelta)
                ifh.write(struct.pack(indexformat, *e))

            t, r, chain, prev = r, r + 1, node, node
            start = self.start(self.base(t))
            end = self.end(t)

        dfh.close()
        ifh.close()
        return node
General Comments 0
You need to be logged in to leave comments. Login now