Hashing speed-up...
mpm@selenic.com
r531:0e9234a1 default
@@ -1,538 +1,541 @@
1 1 # revlog.py - storage back-end for mercurial
2 2 #
3 3 # This provides efficient delta storage with O(1) retrieve and append
4 4 # and O(changes) merge between branches
5 5 #
6 6 # Copyright 2005 Matt Mackall <mpm@selenic.com>
7 7 #
8 8 # This software may be used and distributed according to the terms
9 9 # of the GNU General Public License, incorporated herein by reference.
10 10
11 11 import zlib, struct, sha, binascii, heapq
12 12 from mercurial import mdiff
13 13
14 14 def hex(node): return binascii.hexlify(node)
15 15 def bin(node): return binascii.unhexlify(node)
16 16 def short(node): return hex(node[:6])
17 17
18 18 def compress(text):
19 19 if not text: return text
20 20 if len(text) < 44:
21 21 if text[0] == '\0': return text
22 22 return 'u' + text
23 23 bin = zlib.compress(text)
24 24 if len(bin) > len(text):
25 25 if text[0] == '\0': return text
26 26 return 'u' + text
27 27 return bin
28 28
29 29 def decompress(bin):
30 30 if not bin: return bin
31 31 t = bin[0]
32 32 if t == '\0': return bin
33 33 if t == 'x': return zlib.decompress(bin)
34 34 if t == 'u': return bin[1:]
35 35 raise "unknown compression type %s" % t
36 36
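
For reference, the one-byte tagging scheme used by compress/decompress above, restated as a minimal Python 3 sketch (the original is Python 2, where str is a byte string; the names here are illustrative, not Mercurial's):

import zlib

def compress_chunk(text: bytes) -> bytes:
    if not text:
        return text
    if len(text) < 44:                 # zlib rarely wins on tiny chunks
        if text[:1] == b"\0":
            return text                # a leading NUL is its own tag
        return b"u" + text             # 'u' marks stored-uncompressed
    compressed = zlib.compress(text)
    if len(compressed) > len(text):    # incompressible: store literally
        if text[:1] == b"\0":
            return text
        return b"u" + text
    return compressed                  # zlib output begins with 'x' (0x78)

def decompress_chunk(data: bytes) -> bytes:
    if not data:
        return data
    t = data[:1]
    if t == b"\0":
        return data
    if t == b"x":                      # zlib stream, detected by its header
        return zlib.decompress(data)
    if t == b"u":
        return data[1:]
    raise ValueError("unknown compression type %r" % t)

assert decompress_chunk(compress_chunk(b"na" * 100)) == b"na" * 100
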
37 37 def hash(text, p1, p2):
38 38 l = [p1, p2]
39 39 l.sort()
40 return sha.sha(l[0] + l[1] + text).digest()
40 s = sha.new(l[0])
41 s.update(l[1])
42 s.update(text)
43 return s.digest()
41 44
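
The hunk above is the whole change in this commit: instead of concatenating both parents and the full text into one throwaway string and hashing that, the node hash is now built by streaming the three pieces into the SHA-1 object, so no buffer of len(p1) + len(p2) + len(text) is ever allocated. A minimal Python 3 illustration of the two forms (using hashlib rather than the long-deprecated sha module; both produce the same digest):

import hashlib

def hash_concat(text: bytes, p1: bytes, p2: bytes) -> bytes:
    # old form: builds a temporary copy of everything before hashing
    l = sorted([p1, p2])
    return hashlib.sha1(l[0] + l[1] + text).digest()

def hash_incremental(text: bytes, p1: bytes, p2: bytes) -> bytes:
    # new form: feeds the hasher piece by piece, no large copy
    l = sorted([p1, p2])
    s = hashlib.sha1(l[0])
    s.update(l[1])
    s.update(text)
    return s.digest()

nullid = b"\0" * 20
assert hash_concat(b"x" * 10000, nullid, nullid) == \
       hash_incremental(b"x" * 10000, nullid, nullid)
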
42 45 nullid = "\0" * 20
43 46 indexformat = ">4l20s20s20s"
44 47
45 48 class lazyparser:
46 49 def __init__(self, data, revlog):
47 50 self.data = data
48 51 self.s = struct.calcsize(indexformat)
49 52 self.l = len(data)/self.s
50 53 self.index = [None] * self.l
51 54 self.map = {nullid: -1}
52 55 self.all = 0
53 56 self.revlog = revlog
54 57
55 58 def load(self, pos=None):
56 59 if self.all: return
57 60 if pos is not None:
58 61 block = pos / 1000
59 62 i = block * 1000
60 63 end = min(self.l, i + 1000)
61 64 else:
62 65 self.all = 1
63 66 i = 0
64 67 end = self.l
65 68 self.revlog.index = self.index
66 69 self.revlog.nodemap = self.map
67 70
68 71 while i < end:
69 72 d = self.data[i * self.s: (i + 1) * self.s]
70 73 e = struct.unpack(indexformat, d)
71 74 self.index[i] = e
72 75 self.map[e[6]] = i
73 76 i += 1
74 77
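
Each entry the parser unpacks is one fixed-size record in the ">4l20s20s20s" layout: four big-endian 32-bit integers followed by three 20-byte binary ids, 76 bytes in all, which is why lazyparser can seek straight to entry i at offset i * s and decode blocks of 1000 entries on demand. A round-trip sketch with made-up values:

import struct

indexformat = ">4l20s20s20s"              # offset, size, base, linkrev,
entrysize = struct.calcsize(indexformat)  # p1, p2, nodeid -- 76 bytes

entry = struct.pack(indexformat, 0, 42, 0, 7,
                    b"\0" * 20, b"\0" * 20, b"\x01" * 20)
e = struct.unpack(indexformat, entry)
assert entrysize == 76 and e[6] == b"\x01" * 20   # e[6] keys the nodemap
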
75 78 class lazyindex:
76 79 def __init__(self, parser):
77 80 self.p = parser
78 81 def __len__(self):
79 82 return len(self.p.index)
80 83 def load(self, pos):
81 84 self.p.load(pos)
82 85 return self.p.index[pos]
83 86 def __getitem__(self, pos):
84 87 return self.p.index[pos] or self.load(pos)
85 88 def append(self, e):
86 89 self.p.index.append(e)
87 90
88 91 class lazymap:
89 92 def __init__(self, parser):
90 93 self.p = parser
91 94 def load(self, key):
92 95 if self.p.all: return
93 96 n = self.p.data.find(key)
94 97 if n < 0: raise KeyError("node " + hex(key))
95 98 pos = n / self.p.s
96 99 self.p.load(pos)
97 100 def __contains__(self, key):
98 101 self.p.load()
99 102 return key in self.p.map
100 103 def __iter__(self):
101 104 yield nullid
102 105 for i in xrange(self.p.l):
103 106 try:
104 107 yield self.p.index[i][6]
105 108 except:
106 109 self.p.load(i)
107 110 yield self.p.index[i][6]
108 111 def __getitem__(self, key):
109 112 try:
110 113 return self.p.map[key]
111 114 except KeyError:
112 115 try:
113 116 self.load(key)
114 117 return self.p.map[key]
115 118 except KeyError:
116 119 raise KeyError("node " + hex(key))
117 120 def __setitem__(self, key, val):
118 121 self.p.map[key] = val
119 122
120 123 class revlog:
121 124 def __init__(self, opener, indexfile, datafile):
122 125 self.indexfile = indexfile
123 126 self.datafile = datafile
124 127 self.opener = opener
125 128 self.cache = None
126 129
127 130 try:
128 131 i = self.opener(self.indexfile).read()
129 132 except IOError:
130 133 i = ""
131 134
132 135 if len(i) > 10000:
133 136 # big index, let's parse it on demand
134 137 parser = lazyparser(i, self)
135 138 self.index = lazyindex(parser)
136 139 self.nodemap = lazymap(parser)
137 140 else:
138 141 s = struct.calcsize(indexformat)
139 142 l = len(i) / s
140 143 self.index = [None] * l
141 144 m = [None] * l
142 145
143 146 n = 0
144 147 for f in xrange(0, len(i), s):
145 148 # offset, size, base, linkrev, p1, p2, nodeid
146 149 e = struct.unpack(indexformat, i[f:f + s])
147 150 m[n] = (e[6], n)
148 151 self.index[n] = e
149 152 n += 1
150 153
151 154 self.nodemap = dict(m)
152 155 self.nodemap[nullid] = -1
153 156
154 157 def tip(self): return self.node(len(self.index) - 1)
155 158 def count(self): return len(self.index)
156 159 def node(self, rev): return (rev < 0) and nullid or self.index[rev][6]
157 160 def rev(self, node): return self.nodemap[node]
158 161 def linkrev(self, node): return self.index[self.nodemap[node]][3]
159 162 def parents(self, node):
160 163 if node == nullid: return (nullid, nullid)
161 164 return self.index[self.nodemap[node]][4:6]
162 165
163 166 def start(self, rev): return self.index[rev][0]
164 167 def length(self, rev): return self.index[rev][1]
165 168 def end(self, rev): return self.start(rev) + self.length(rev)
166 169 def base(self, rev): return self.index[rev][2]
167 170
168 171 def heads(self):
169 172 p = {}
170 173 h = []
171 174 for r in range(self.count() - 1, -1, -1):
172 175 n = self.node(r)
173 176 if n not in p:
174 177 h.append(n)
175 178 for pn in self.parents(n):
176 179 p[pn] = 1
177 180 return h
178 181
179 182 def children(self, node):
180 183 c = []
181 184 p = self.rev(node)
182 185 for r in range(p + 1, self.count()):
183 186 n = self.node(r)
184 187 for pn in self.parents(n):
185 188 if pn == p:
186 189 c.append(p)
187 190 continue
188 191 elif pn == nullid:
189 192 continue
190 193 return c
191 194
192 195 def lookup(self, id):
193 196 try:
194 197 rev = int(id)
195 198 if str(rev) != id: raise ValueError
196 199 if rev < 0: rev = self.count() + rev
197 200 if rev < 0 or rev >= self.count(): raise ValueError
198 201 return self.node(rev)
199 202 except (ValueError, OverflowError):
200 203 c = []
201 204 for n in self.nodemap:
202 205 if hex(n).startswith(id):
203 206 c.append(n)
204 207 if len(c) > 1: raise KeyError("Ambiguous identifier")
205 208 if len(c) < 1: raise KeyError("No match found")
206 209 return c[0]
207 210
208 211 return None
209 212
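
lookup() accepts either a revision number (negative values count back from the tip) or an unambiguous hex prefix of a node id. A standalone Python 3 sketch of the same resolution rules, where hexnodes is a hypothetical stand-in for the hex forms of the nodemap keys:

def lookup(id: str, hexnodes: list, count: int):
    try:
        rev = int(id)
        if str(rev) != id:
            raise ValueError          # e.g. "007" must not match rev 7
        if rev < 0:
            rev = count + rev         # negative revs count from the tip
        if rev < 0 or rev >= count:
            raise ValueError
        return rev
    except (ValueError, OverflowError):
        matches = [n for n in hexnodes if n.startswith(id)]
        if len(matches) > 1:
            raise KeyError("Ambiguous identifier")
        if not matches:
            raise KeyError("No match found")
        return matches[0]
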
210 213 def diff(self, a, b):
211 214 return mdiff.textdiff(a, b)
212 215
213 216 def patches(self, t, pl):
214 217 return mdiff.patches(t, pl)
215 218
216 219 def delta(self, node):
217 220 r = self.rev(node)
218 221 b = self.base(r)
219 222 if r == b:
220 223 return self.diff(self.revision(self.node(r - 1)),
221 224 self.revision(node))
222 225 else:
223 226 f = self.opener(self.datafile)
224 227 f.seek(self.start(r))
225 228 data = f.read(self.length(r))
226 229 return decompress(data)
227 230
228 231 def revision(self, node):
229 232 if node == nullid: return ""
230 233 if self.cache and self.cache[0] == node: return self.cache[2]
231 234
232 235 text = None
233 236 rev = self.rev(node)
234 237 start, length, base, link, p1, p2, node = self.index[rev]
235 238 end = start + length
236 239 if base != rev: start = self.start(base)
237 240
238 241 if self.cache and self.cache[1] >= base and self.cache[1] < rev:
239 242 base = self.cache[1]
240 243 start = self.start(base + 1)
241 244 text = self.cache[2]
242 245 last = 0
243 246
244 247 f = self.opener(self.datafile)
245 248 f.seek(start)
246 249 data = f.read(end - start)
247 250
248 251 if not text:
249 252 last = self.length(base)
250 253 text = decompress(data[:last])
251 254
252 255 bins = []
253 256 for r in xrange(base + 1, rev + 1):
254 257 s = self.length(r)
255 258 bins.append(decompress(data[last:last + s]))
256 259 last = last + s
257 260
258 261 text = mdiff.patches(text, bins)
259 262
260 263 if node != hash(text, p1, p2):
261 264 raise IOError("integrity check failed on %s:%d"
262 265 % (self.datafile, rev))
263 266
264 267 self.cache = (node, rev, text)
265 268 return text
266 269
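
revision() pays one seek and one contiguous read for the whole delta chain: the span from the chain base through the requested revision comes back in a single read(), then gets sliced apart by each revision's stored length. A toy restatement of that access pattern (decompress_chunk is the sketch given earlier; apply_patches stands in for mercurial.mdiff.patches and is purely hypothetical):

def reconstruct(f, index, rev, apply_patches):
    # index[r] = (offset, size, base, linkrev, p1, p2, nodeid)
    start, length, base = index[rev][0], index[rev][1], index[rev][2]
    end = start + length
    if base != rev:
        start = index[base][0]            # rewind to the chain's full text

    f.seek(start)
    data = f.read(end - start)            # one read covers base..rev

    last = index[base][1]                 # stored length of the base text
    text = decompress_chunk(data[:last])
    bins = []
    for r in range(base + 1, rev + 1):
        s = index[r][1]
        bins.append(decompress_chunk(data[last:last + s]))
        last += s
    return apply_patches(text, bins)      # fold every delta onto the base
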
267 270 def addrevision(self, text, transaction, link, p1=None, p2=None):
268 271 if text is None: text = ""
269 272 if p1 is None: p1 = self.tip()
270 273 if p2 is None: p2 = nullid
271 274
272 275 node = hash(text, p1, p2)
273 276
274 277 if node in self.nodemap:
275 278 return node
276 279
277 280 n = self.count()
278 281 t = n - 1
279 282
280 283 if n:
281 284 base = self.base(t)
282 285 start = self.start(base)
283 286 end = self.end(t)
284 287 prev = self.revision(self.tip())
285 288 d = self.diff(prev, text)
286 289 data = compress(d)
287 290 dist = end - start + len(data)
288 291
289 292 # full versions are inserted when the needed deltas
290 293 # become comparable to the uncompressed text
291 294 if not n or dist > len(text) * 2:
292 295 data = compress(text)
293 296 base = n
294 297 else:
295 298 base = self.base(t)
296 299
297 300 offset = 0
298 301 if t >= 0:
299 302 offset = self.end(t)
300 303
301 304 e = (offset, len(data), base, link, p1, p2, node)
302 305
303 306 self.index.append(e)
304 307 self.nodemap[node] = n
305 308 entry = struct.pack(indexformat, *e)
306 309
307 310 transaction.add(self.datafile, e[0])
308 311 self.opener(self.datafile, "a").write(data)
309 312 transaction.add(self.indexfile, n * len(entry))
310 313 self.opener(self.indexfile, "a").write(entry)
311 314
312 315 self.cache = (node, n, text)
313 316 return node
314 317
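
The comment in addrevision states the key storage invariant: a delta is only appended while the chain stays cheap to replay. Isolated as a predicate (the names are illustrative, not from the source):

def should_snapshot(chain_start: int, chain_end: int,
                    new_delta_len: int, text_len: int) -> bool:
    # bytes that must be read and patched to reproduce this revision:
    # the full text at the chain base, every delta since, plus ours
    dist = chain_end - chain_start + new_delta_len
    # once that exceeds twice the plain text, a fresh full version is
    # cheaper overall, and reconstruction cost stays O(len(text))
    return dist > text_len * 2
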
315 318 def ancestor(self, a, b):
316 319 # calculate the distance of every node from root
317 320 dist = {nullid: 0}
318 321 for i in xrange(self.count()):
319 322 n = self.node(i)
320 323 p1, p2 = self.parents(n)
321 324 dist[n] = max(dist[p1], dist[p2]) + 1
322 325
323 326 # traverse ancestors in order of decreasing distance from root
324 327 def ancestors(node):
325 328 # we store negative distances because heap returns smallest member
326 329 h = [(-dist[node], node)]
327 330 seen = {}
328 331 earliest = self.count()
329 332 while h:
330 333 d, n = heapq.heappop(h)
331 334 if n not in seen:
332 335 seen[n] = 1
333 336 r = self.rev(n)
334 337 yield (-d, r, n)
335 338 for p in self.parents(n):
336 339 heapq.heappush(h, (-dist[p], p))
337 340
338 341 x = ancestors(a)
339 342 y = ancestors(b)
340 343 lx = x.next()
341 344 ly = y.next()
342 345
343 346 # increment each ancestor list until it is closer to root than
344 347 # the other, or they match
345 348 while 1:
346 349 if lx == ly:
347 350 return lx[2]
348 351 elif lx < ly:
349 352 ly = y.next()
350 353 elif lx > ly:
351 354 lx = x.next()
352 355
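
ancestor() restated on a toy DAG: precompute every node's distance from the root, then walk both ancestor sets farthest-first using a heap of negated distances, always advancing whichever walk is deeper until the two meet. The graph below is hypothetical; the original additionally breaks distance ties by revision number rather than by name:

import heapq

parents = {"root": (), "a": ("root",), "b": ("root",),
           "c": ("a", "b"), "d": ("a",)}

dist = {"root": 0}
for n in ("a", "b", "c", "d"):                 # topological order
    dist[n] = max(dist[p] for p in parents[n]) + 1

def ancestors(node):
    h, seen = [(-dist[node], node)], set()
    while h:
        d, n = heapq.heappop(h)                # most distant first
        if n not in seen:
            seen.add(n)
            yield -d, n
            for p in parents[n]:
                heapq.heappush(h, (-dist[p], p))

def common_ancestor(a, b):
    x, y = ancestors(a), ancestors(b)
    lx, ly = next(x), next(y)
    while lx != ly:
        if lx < ly:                            # y is deeper, advance it
            ly = next(y)
        else:
            lx = next(x)
    return lx[1]

assert common_ancestor("c", "d") == "a"
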
353 356 def group(self, linkmap):
354 357 # given a list of changeset revs, return a set of deltas and
355 358 # metadata corresponding to nodes. the first delta is
356 359 # parent(nodes[0]) -> nodes[0] the receiver is guaranteed to
357 360 # have this parent as it has all history before these
358 361 # changesets. parent is parent[0]
359 362
360 363 revs = []
361 364 needed = {}
362 365
363 366 # find file nodes/revs that match changeset revs
364 367 for i in xrange(0, self.count()):
365 368 if self.index[i][3] in linkmap:
366 369 revs.append(i)
367 370 needed[i] = 1
368 371
369 372 # if we don't have any revisions touched by these changesets, bail
370 373 if not revs:
371 374 yield struct.pack(">l", 0)
372 375 return
373 376
374 377 # add the parent of the first rev
375 378 p = self.parents(self.node(revs[0]))[0]
376 379 revs.insert(0, self.rev(p))
377 380
378 381 # for each delta that isn't contiguous in the log, we need to
379 382 # reconstruct the base, reconstruct the result, and then
380 383 # calculate the delta. We also need to do this where we've
381 384 # stored a full version and not a delta
382 385 for i in xrange(0, len(revs) - 1):
383 386 a, b = revs[i], revs[i + 1]
384 387 if a + 1 != b or self.base(b) == b:
385 388 for j in xrange(self.base(a), a + 1):
386 389 needed[j] = 1
387 390 for j in xrange(self.base(b), b + 1):
388 391 needed[j] = 1
389 392
390 393 # calculate spans to retrieve from datafile
391 394 needed = needed.keys()
392 395 needed.sort()
393 396 spans = []
394 397 oo = -1
395 398 ol = 0
396 399 for n in needed:
397 400 if n < 0: continue
398 401 o = self.start(n)
399 402 l = self.length(n)
400 403 if oo + ol == o: # can we merge with the previous?
401 404 nl = spans[-1][2]
402 405 nl.append((n, l))
403 406 ol += l
404 407 spans[-1] = (oo, ol, nl)
405 408 else:
406 409 oo = o
407 410 ol = l
408 411 spans.append((oo, ol, [(n, l)]))
409 412
410 413 # read spans in, divide up chunks
411 414 chunks = {}
412 415 for span in spans:
413 416 # we reopen the file for each span to make http happy for now
414 417 f = self.opener(self.datafile)
415 418 f.seek(span[0])
416 419 data = f.read(span[1])
417 420
418 421 # divide up the span
419 422 pos = 0
420 423 for r, l in span[2]:
421 424 chunks[r] = decompress(data[pos: pos + l])
422 425 pos += l
423 426
424 427 # helper to reconstruct intermediate versions
425 428 def construct(text, base, rev):
426 429 bins = [chunks[r] for r in xrange(base + 1, rev + 1)]
427 430 return mdiff.patches(text, bins)
428 431
429 432 # build deltas
430 433 deltas = []
431 434 for d in xrange(0, len(revs) - 1):
432 435 a, b = revs[d], revs[d + 1]
433 436 n = self.node(b)
434 437
435 438 # do we need to construct a new delta?
436 439 if a + 1 != b or self.base(b) == b:
437 440 if a >= 0:
438 441 base = self.base(a)
439 442 ta = chunks[self.base(a)]
440 443 ta = construct(ta, base, a)
441 444 else:
442 445 ta = ""
443 446
444 447 base = self.base(b)
445 448 if a > base:
446 449 base = a
447 450 tb = ta
448 451 else:
449 452 tb = chunks[self.base(b)]
450 453 tb = construct(tb, base, b)
451 454 d = self.diff(ta, tb)
452 455 else:
453 456 d = chunks[b]
454 457
455 458 p = self.parents(n)
456 459 meta = n + p[0] + p[1] + linkmap[self.linkrev(n)]
457 460 l = struct.pack(">l", len(meta) + len(d) + 4)
458 461 yield l
459 462 yield meta
460 463 yield d
461 464
462 465 yield struct.pack(">l", 0)
463 466
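
On the wire, group() frames each record as a 4-byte big-endian length (which counts itself), an 80-byte header of node, p1, p2 and changeset ids, then the delta, with a zero length closing the stream; this is exactly what addgroup() below unpacks. A hypothetical consumer, not a Mercurial API:

import struct

def read_chunks(stream):
    while True:
        l = struct.unpack(">l", stream.read(4))[0]
        if l == 0:                       # zero length terminates the group
            return
        chunk = stream.read(l - 4)       # l counts its own 4 bytes
        node, p1, p2, cs = struct.unpack("20s20s20s20s", chunk[:80])
        yield node, p1, p2, cs, chunk[80:]   # remainder is the delta
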
464 467 def addgroup(self, revs, linkmapper, transaction, unique = 0):
465 468 # given a set of deltas, add them to the revision log. the
466 469 # first delta is against its parent, which should be in our
467 470 # log, the rest are against the previous delta.
468 471
469 472 # track the base of the current delta log
470 473 r = self.count()
471 474 t = r - 1
472 475 node = nullid
473 476
474 477 base = prev = -1
475 478 start = end = 0
476 479 if r:
477 480 start = self.start(self.base(t))
478 481 end = self.end(t)
479 482 measure = self.length(self.base(t))
480 483 base = self.base(t)
481 484 prev = self.tip()
482 485
483 486 transaction.add(self.datafile, end)
484 487 transaction.add(self.indexfile, r * struct.calcsize(indexformat))
485 488 dfh = self.opener(self.datafile, "a")
486 489 ifh = self.opener(self.indexfile, "a")
487 490
488 491 # loop through our set of deltas
489 492 chain = None
490 493 for chunk in revs:
491 494 node, p1, p2, cs = struct.unpack("20s20s20s20s", chunk[:80])
492 495 link = linkmapper(cs)
493 496 if node in self.nodemap:
494 497 # this can happen if two branches make the same change
495 498 if unique:
496 499 raise "already have %s" % hex(node[:4])
497 500 continue
498 501 delta = chunk[80:]
499 502
500 503 if not chain:
501 504 # retrieve the parent revision of the delta chain
502 505 chain = p1
503 506 if not chain in self.nodemap:
504 507 raise "unknown base %s" % short(chain[:4])
505 508
506 509 # full versions are inserted when the needed deltas become
507 510 # comparable to the uncompressed text or when the previous
508 511 # version is not the one we have a delta against. We use
509 512 # the size of the previous full rev as a proxy for the
510 513 # current size.
511 514
512 515 if chain == prev:
513 516 cdelta = compress(delta)
514 517
515 518 if chain != prev or (end - start + len(cdelta)) > measure * 2:
516 519 # flush our writes here so we can read it in revision
517 520 dfh.flush()
518 521 ifh.flush()
519 522 text = self.revision(chain)
520 523 text = self.patches(text, [delta])
521 524 chk = self.addrevision(text, transaction, link, p1, p2)
522 525 if chk != node:
523 526 raise "consistency error adding group"
524 527 measure = len(text)
525 528 else:
526 529 e = (end, len(cdelta), self.base(t), link, p1, p2, node)
527 530 self.index.append(e)
528 531 self.nodemap[node] = r
529 532 dfh.write(cdelta)
530 533 ifh.write(struct.pack(indexformat, *e))
531 534
532 535 t, r, chain, prev = r, r + 1, node, node
533 536 start = self.start(self.base(t))
534 537 end = self.end(t)
535 538
536 539 dfh.close()
537 540 ifh.close()
538 541 return node