lazyparser speed ups...
mpm@selenic.com
r323:c6f0673a default
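This revision speeds up revlog's lazy index parser. `lazyparser` now keeps a back-reference to its owning `revlog` and an `all` flag: calling `load()` with no position parses the whole index in one pass and installs the finished `index` list and `nodemap` dict directly on the revlog, so later accesses bypass the `lazyindex`/`lazymap` proxies entirely. `lazymap.__contains__` changes from an exception-driven probe into a full parse followed by a plain dict lookup, and `lazymap.load` returns immediately once everything has been parsed. Below is a minimal sketch of the pattern (Python 3, with a hypothetical two-field entry format in place of the real `indexformat`); it illustrates the idea and is not the shipped code.

import struct

FMT = ">l20s"                  # hypothetical entry: one int field + 20-byte node
SIZE = struct.calcsize(FMT)

class LazyParser:
    """Parse fixed-size index entries on demand, in 1000-entry blocks."""
    def __init__(self, data, owner):
        self.data = data
        self.l = len(data) // SIZE
        self.index = [None] * self.l
        self.map = {}
        self.all = False
        self.owner = owner     # back-reference: the new part of this commit

    def load(self, pos=None):
        if self.all:
            return             # everything already parsed; nothing to do
        if pos is not None:    # demand-load only the block containing pos
            i = (pos // 1000) * 1000
            end = min(self.l, i + 1000)
        else:                  # full parse, e.g. for a membership test
            self.all = True
            i, end = 0, self.l
            # hand the finished structures to the owner so later accesses
            # skip the lazy wrappers entirely
            self.owner.index = self.index
            self.owner.nodemap = self.map
        while i < end:
            e = struct.unpack(FMT, self.data[i * SIZE:(i + 1) * SIZE])
            self.index[i] = e
            self.map[e[1]] = i
            i += 1

class FakeRevlog:
    """Stand-in for revlog; the real class wraps the parser in
    lazyindex/lazymap proxies instead of exposing it directly."""
    def __init__(self, data):
        self.parser = LazyParser(data, self)
        self.index = self.parser.index
        self.nodemap = self.parser.map

# tiny three-entry index to exercise the full-parse path
data = b"".join(struct.pack(FMT, n, bytes([n]) * 20) for n in range(3))
log = FakeRevlog(data)
log.parser.load()              # full parse installs plain list/dict on log
assert log.nodemap[bytes([2]) * 20] == 2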
@@ -1,513 +1,522 @@
1 1 # revlog.py - storage back-end for mercurial
2 2 #
3 3 # This provides efficient delta storage with O(1) retrieve and append
4 4 # and O(changes) merge between branches
5 5 #
6 6 # Copyright 2005 Matt Mackall <mpm@selenic.com>
7 7 #
8 8 # This software may be used and distributed according to the terms
9 9 # of the GNU General Public License, incorporated herein by reference.
10 10
11 11 import zlib, struct, sha, binascii, heapq
12 12 from mercurial import mdiff
13 13
14 14 def hex(node): return binascii.hexlify(node)
15 15 def bin(node): return binascii.unhexlify(node)
16 16 def short(node): return hex(node[:4])
17 17
18 18 def compress(text):
19 19 if not text: return text
20 20 if len(text) < 44:
21 21 if text[0] == '\0': return text
22 22 return 'u' + text
23 23 bin = zlib.compress(text)
24 24 if len(bin) > len(text):
25 25 if text[0] == '\0': return text
26 26 return 'u' + text
27 27 return bin
28 28
29 29 def decompress(bin):
30 30 if not bin: return bin
31 31 t = bin[0]
32 32 if t == '\0': return bin
33 33 if t == 'x': return zlib.decompress(bin)
34 34 if t == 'u': return bin[1:]
35 35 raise "unknown compression type %s" % t
36 36
37 37 def hash(text, p1, p2):
38 38 l = [p1, p2]
39 39 l.sort()
40 40 return sha.sha(l[0] + l[1] + text).digest()
41 41
42 42 nullid = "\0" * 20
43 43 indexformat = ">4l20s20s20s"
44 44
45 45 class lazyparser:
46 def __init__(self, data):
46 def __init__(self, data, revlog):
47 47 self.data = data
48 48 self.s = struct.calcsize(indexformat)
49 49 self.l = len(data)/self.s
50 50 self.index = [None] * self.l
51 51 self.map = {nullid: -1}
52 self.all = 0
53 self.revlog = revlog
52 54
53 def load(self, pos):
54 block = pos / 1000
55 i = block * 1000
56 end = min(self.l, i + 1000)
55 def load(self, pos=None):
56 if self.all: return
57 if pos is not None:
58 block = pos / 1000
59 i = block * 1000
60 end = min(self.l, i + 1000)
61 else:
62 self.all = 1
63 i = 0
64 end = self.l
65 self.revlog.index = self.index
66 self.revlog.nodemap = self.map
67
57 68 while i < end:
58 69 d = self.data[i * self.s: (i + 1) * self.s]
59 70 e = struct.unpack(indexformat, d)
60 71 self.index[i] = e
61 72 self.map[e[6]] = i
62 73 i += 1
63 74
64 75 class lazyindex:
65 76 def __init__(self, parser):
66 77 self.p = parser
67 78 def __len__(self):
68 79 return len(self.p.index)
69 80 def load(self, pos):
70 81 self.p.load(pos)
71 82 return self.p.index[pos]
72 83 def __getitem__(self, pos):
73 84 return self.p.index[pos] or self.load(pos)
74 85 def append(self, e):
75 86 self.p.index.append(e)
76 87
77 88 class lazymap:
78 89 def __init__(self, parser):
79 90 self.p = parser
80 91 def load(self, key):
92 if self.p.all: return
81 93 n = self.p.data.find(key)
82 94 if n < 0: raise KeyError("node " + hex(key))
83 95 pos = n / self.p.s
84 96 self.p.load(pos)
85 97 def __contains__(self, key):
86 try:
87 self[key]
88 return True
89 except KeyError:
90 return False
98 self.p.load()
99 return key in self.p.map
91 100 def __iter__(self):
92 101 for i in xrange(self.p.l):
93 102 try:
94 103 yield self.p.index[i][6]
95 104 except:
96 105 self.p.load(i)
97 106 yield self.p.index[i][6]
98 107 def __getitem__(self, key):
99 108 try:
100 109 return self.p.map[key]
101 110 except KeyError:
102 111 try:
103 112 self.load(key)
104 113 return self.p.map[key]
105 114 except KeyError:
106 115 raise KeyError("node " + hex(key))
107 116 def __setitem__(self, key, val):
108 117 self.p.map[key] = val
109 118
110 119 class revlog:
111 120 def __init__(self, opener, indexfile, datafile):
112 121 self.indexfile = indexfile
113 122 self.datafile = datafile
114 123 self.opener = opener
115 124 self.cache = None
116 125
117 126 try:
118 127 i = self.opener(self.indexfile).read()
119 128 except IOError:
120 129 i = ""
121 130
122 131 if len(i) > 10000:
123 132 # big index, let's parse it on demand
124 parser = lazyparser(i)
133 parser = lazyparser(i, self)
125 134 self.index = lazyindex(parser)
126 135 self.nodemap = lazymap(parser)
127 136 else:
128 137 s = struct.calcsize(indexformat)
129 138 l = len(i) / s
130 139 self.index = [None] * l
131 140 m = [None] * l
132 141
133 142 n = 0
134 143 for f in xrange(0, len(i), s):
135 144 # offset, size, base, linkrev, p1, p2, nodeid
136 145 e = struct.unpack(indexformat, i[f:f + s])
137 146 m[n] = (e[6], n)
138 147 self.index[n] = e
139 148 n += 1
140 149
141 150 self.nodemap = dict(m)
142 151 self.nodemap[nullid] = -1
143 152
144 153
145 154 def tip(self): return self.node(len(self.index) - 1)
146 155 def count(self): return len(self.index)
147 156 def node(self, rev): return (rev < 0) and nullid or self.index[rev][6]
148 157 def rev(self, node): return self.nodemap[node]
149 158 def linkrev(self, node): return self.index[self.nodemap[node]][3]
150 159 def parents(self, node):
151 160 if node == nullid: return (nullid, nullid)
152 161 return self.index[self.nodemap[node]][4:6]
153 162
154 163 def start(self, rev): return self.index[rev][0]
155 164 def length(self, rev): return self.index[rev][1]
156 165 def end(self, rev): return self.start(rev) + self.length(rev)
157 166 def base(self, rev): return self.index[rev][2]
158 167
159 168 def heads(self):
160 169 p = {}
161 170 h = []
162 171 for r in range(self.count() - 1, -1, -1):
163 172 n = self.node(r)
164 173 if n not in p:
165 174 h.append(n)
166 175 for pn in self.parents(n):
167 176 p[pn] = 1
168 177 return h
169 178
170 179 def lookup(self, id):
171 180 try:
172 181 rev = int(id)
173 182 return self.node(rev)
174 183 except ValueError:
175 184 c = []
176 185 for n in self.nodemap:
177 186 if id in hex(n):
178 187 c.append(n)
179 188 if len(c) > 1: raise KeyError("Ambiguous identifier")
180 189 if len(c) < 1: raise KeyError("No match found")
181 190 return c[0]
182 191
183 192 return None
184 193
185 194 def diff(self, a, b):
186 195 return mdiff.textdiff(a, b)
187 196
188 197 def patches(self, t, pl):
189 198 return mdiff.patches(t, pl)
190 199
191 200 def delta(self, node):
192 201 r = self.rev(node)
193 202 b = self.base(r)
194 203 if r == b:
195 204 return self.diff(self.revision(self.node(r - 1)),
196 205 self.revision(node))
197 206 else:
198 207 f = self.opener(self.datafile)
199 208 f.seek(self.start(r))
200 209 data = f.read(self.length(r))
201 210 return decompress(data)
202 211
203 212 def revision(self, node):
204 213 if node == nullid: return ""
205 214 if self.cache and self.cache[0] == node: return self.cache[2]
206 215
207 216 text = None
208 217 rev = self.rev(node)
209 218 start, length, base, link, p1, p2, node = self.index[rev]
210 219 end = start + length
211 220 if base != rev: start = self.start(base)
212 221
213 222 if self.cache and self.cache[1] >= base and self.cache[1] < rev:
214 223 base = self.cache[1]
215 224 start = self.start(base + 1)
216 225 text = self.cache[2]
217 226 last = 0
218 227
219 228 f = self.opener(self.datafile)
220 229 f.seek(start)
221 230 data = f.read(end - start)
222 231
223 232 if not text:
224 233 last = self.length(base)
225 234 text = decompress(data[:last])
226 235
227 236 bins = []
228 237 for r in xrange(base + 1, rev + 1):
229 238 s = self.length(r)
230 239 bins.append(decompress(data[last:last + s]))
231 240 last = last + s
232 241
233 242 text = mdiff.patches(text, bins)
234 243
235 244 if node != hash(text, p1, p2):
236 245 raise IOError("integrity check failed on %s:%d"
237 246 % (self.datafile, rev))
238 247
239 248 self.cache = (node, rev, text)
240 249 return text
241 250
242 251 def addrevision(self, text, transaction, link, p1=None, p2=None):
243 252 if text is None: text = ""
244 253 if p1 is None: p1 = self.tip()
245 254 if p2 is None: p2 = nullid
246 255
247 256 node = hash(text, p1, p2)
248 257
249 258 if node in self.nodemap:
250 259 return node
251 260
252 261 n = self.count()
253 262 t = n - 1
254 263
255 264 if n:
256 265 base = self.base(t)
257 266 start = self.start(base)
258 267 end = self.end(t)
259 268 prev = self.revision(self.tip())
260 269 d = self.diff(prev, text)
261 270 data = compress(d)
262 271 dist = end - start + len(data)
263 272
264 273 # full versions are inserted when the needed deltas
265 274 # become comparable to the uncompressed text
266 275 if not n or dist > len(text) * 2:
267 276 data = compress(text)
268 277 base = n
269 278 else:
270 279 base = self.base(t)
271 280
272 281 offset = 0
273 282 if t >= 0:
274 283 offset = self.end(t)
275 284
276 285 e = (offset, len(data), base, link, p1, p2, node)
277 286
278 287 self.index.append(e)
279 288 self.nodemap[node] = n
280 289 entry = struct.pack(indexformat, *e)
281 290
282 291 transaction.add(self.datafile, e[0])
283 292 self.opener(self.datafile, "a").write(data)
284 293 transaction.add(self.indexfile, n * len(entry))
285 294 self.opener(self.indexfile, "a").write(entry)
286 295
287 296 self.cache = (node, n, text)
288 297 return node
289 298
290 299 def ancestor(self, a, b):
291 300 # calculate the distance of every node from root
292 301 dist = {nullid: 0}
293 302 for i in xrange(self.count()):
294 303 n = self.node(i)
295 304 p1, p2 = self.parents(n)
296 305 dist[n] = max(dist[p1], dist[p2]) + 1
297 306
298 307 # traverse ancestors in order of decreasing distance from root
299 308 def ancestors(node):
300 309 # we store negative distances because heap returns smallest member
301 310 h = [(-dist[node], node)]
302 311 seen = {}
303 312 earliest = self.count()
304 313 while h:
305 314 d, n = heapq.heappop(h)
306 315 r = self.rev(n)
307 316 if n not in seen:
308 317 seen[n] = 1
309 318 yield (-d, n)
310 319 for p in self.parents(n):
311 320 heapq.heappush(h, (-dist[p], p))
312 321
313 322 x = ancestors(a)
314 323 y = ancestors(b)
315 324 lx = x.next()
316 325 ly = y.next()
317 326
318 327 # increment each ancestor list until it is closer to root than
319 328 # the other, or they match
320 329 while 1:
321 330 if lx == ly:
322 331 return lx[1]
323 332 elif lx < ly:
324 333 ly = y.next()
325 334 elif lx > ly:
326 335 lx = x.next()
327 336
328 337 def group(self, linkmap):
329 338 # given a list of changeset revs, return a set of deltas and
330 339 # metadata corresponding to nodes. the first delta is
331 340 # parent(nodes[0]) -> nodes[0] the receiver is guaranteed to
332 341 # have this parent as it has all history before these
333 342 # changesets. parent is parent[0]
334 343
335 344 revs = []
336 345 needed = {}
337 346
338 347 # find file nodes/revs that match changeset revs
339 348 for i in xrange(0, self.count()):
340 349 if self.index[i][3] in linkmap:
341 350 revs.append(i)
342 351 needed[i] = 1
343 352
344 353 # if we don't have any revisions touched by these changesets, bail
345 354 if not revs:
346 355 yield struct.pack(">l", 0)
347 356 return
348 357
349 358 # add the parent of the first rev
350 359 p = self.parents(self.node(revs[0]))[0]
351 360 revs.insert(0, self.rev(p))
352 361
353 362 # for each delta that isn't contiguous in the log, we need to
354 363 # reconstruct the base, reconstruct the result, and then
355 364 # calculate the delta. We also need to do this where we've
356 365 # stored a full version and not a delta
357 366 for i in xrange(0, len(revs) - 1):
358 367 a, b = revs[i], revs[i + 1]
359 368 if a + 1 != b or self.base(b) == b:
360 369 for j in xrange(self.base(a), a + 1):
361 370 needed[j] = 1
362 371 for j in xrange(self.base(b), b + 1):
363 372 needed[j] = 1
364 373
365 374 # calculate spans to retrieve from datafile
366 375 needed = needed.keys()
367 376 needed.sort()
368 377 spans = []
369 378 oo = -1
370 379 ol = 0
371 380 for n in needed:
372 381 if n < 0: continue
373 382 o = self.start(n)
374 383 l = self.length(n)
375 384 if oo + ol == o: # can we merge with the previous?
376 385 nl = spans[-1][2]
377 386 nl.append((n, l))
378 387 ol += l
379 388 spans[-1] = (oo, ol, nl)
380 389 else:
381 390 oo = o
382 391 ol = l
383 392 spans.append((oo, ol, [(n, l)]))
384 393
385 394 # read spans in, divide up chunks
386 395 chunks = {}
387 396 for span in spans:
388 397 # we reopen the file for each span to make http happy for now
389 398 f = self.opener(self.datafile)
390 399 f.seek(span[0])
391 400 data = f.read(span[1])
392 401
393 402 # divide up the span
394 403 pos = 0
395 404 for r, l in span[2]:
396 405 chunks[r] = decompress(data[pos: pos + l])
397 406 pos += l
398 407
399 408 # helper to reconstruct intermediate versions
400 409 def construct(text, base, rev):
401 410 bins = [chunks[r] for r in xrange(base + 1, rev + 1)]
402 411 return mdiff.patches(text, bins)
403 412
404 413 # build deltas
405 414 deltas = []
406 415 for d in xrange(0, len(revs) - 1):
407 416 a, b = revs[d], revs[d + 1]
408 417 n = self.node(b)
409 418
410 419 # do we need to construct a new delta?
411 420 if a + 1 != b or self.base(b) == b:
412 421 if a >= 0:
413 422 base = self.base(a)
414 423 ta = chunks[self.base(a)]
415 424 ta = construct(ta, base, a)
416 425 else:
417 426 ta = ""
418 427
419 428 base = self.base(b)
420 429 if a > base:
421 430 base = a
422 431 tb = ta
423 432 else:
424 433 tb = chunks[self.base(b)]
425 434 tb = construct(tb, base, b)
426 435 d = self.diff(ta, tb)
427 436 else:
428 437 d = chunks[b]
429 438
430 439 p = self.parents(n)
431 440 meta = n + p[0] + p[1] + linkmap[self.linkrev(n)]
432 441 l = struct.pack(">l", len(meta) + len(d) + 4)
433 442 yield l
434 443 yield meta
435 444 yield d
436 445
437 446 yield struct.pack(">l", 0)
438 447
439 448 def addgroup(self, revs, linkmapper, transaction, unique = 0):
440 449 # given a set of deltas, add them to the revision log. the
441 450 # first delta is against its parent, which should be in our
442 451 # log, the rest are against the previous delta.
443 452
444 453 # track the base of the current delta log
445 454 r = self.count()
446 455 t = r - 1
447 456 node = nullid
448 457
449 458 base = prev = -1
450 459 start = end = 0
451 460 if r:
452 461 start = self.start(self.base(t))
453 462 end = self.end(t)
454 463 measure = self.length(self.base(t))
455 464 base = self.base(t)
456 465 prev = self.tip()
457 466
458 467 transaction.add(self.datafile, end)
459 468 transaction.add(self.indexfile, r * struct.calcsize(indexformat))
460 469 dfh = self.opener(self.datafile, "a")
461 470 ifh = self.opener(self.indexfile, "a")
462 471
463 472 # loop through our set of deltas
464 473 chain = None
465 474 for chunk in revs:
466 475 node, p1, p2, cs = struct.unpack("20s20s20s20s", chunk[:80])
467 476 link = linkmapper(cs)
468 477 if node in self.nodemap:
469 478 # this can happen if two branches make the same change
470 479 if unique:
471 480 raise "already have %s" % hex(node[:4])
472 481 continue
473 482 delta = chunk[80:]
474 483
475 484 if not chain:
476 485 # retrieve the parent revision of the delta chain
477 486 chain = p1
478 487 if not chain in self.nodemap:
479 488 raise "unknown base %s" % short(chain[:4])
480 489
481 490 # full versions are inserted when the needed deltas become
482 491 # comparable to the uncompressed text or when the previous
483 492 # version is not the one we have a delta against. We use
484 493 # the size of the previous full rev as a proxy for the
485 494 # current size.
486 495
487 496 if chain == prev:
488 497 cdelta = compress(delta)
489 498
490 499 if chain != prev or (end - start + len(cdelta)) > measure * 2:
491 500 # flush our writes here so we can read it in revision
492 501 dfh.flush()
493 502 ifh.flush()
494 503 text = self.revision(chain)
495 504 text = self.patches(text, [delta])
496 505 chk = self.addrevision(text, transaction, link, p1, p2)
497 506 if chk != node:
498 507 raise "consistency error adding group"
499 508 measure = len(text)
500 509 else:
501 510 e = (end, len(cdelta), self.base(t), link, p1, p2, node)
502 511 self.index.append(e)
503 512 self.nodemap[node] = r
504 513 dfh.write(cdelta)
505 514 ifh.write(struct.pack(indexformat, *e))
506 515
507 516 t, r, chain, prev = r, r + 1, node, node
508 517 start = self.start(self.base(t))
509 518 end = self.end(t)
510 519
511 520 dfh.close()
512 521 ifh.close()
513 522 return node
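Design note: before this change, every `lazymap.__contains__` call went through `__getitem__`, and a miss could trigger a `str.find` scan over the raw index data; afterwards, the first membership test pays for one full parse and every later lookup is a plain dict hit, which is the better trade once callers iterate over or probe many nodes. A hedged reconstruction of just that before/after, following the diff above:

def contains_old(lazymap, key):
    # exception-driven probe; a miss may rescan the raw data each time
    try:
        lazymap[key]
        return True
    except KeyError:
        return False

def contains_new(lazymap, key):
    lazymap.p.load()        # parse everything once; a no-op afterwards
    return key in lazymap.p.map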