Only use lazy indexing for big indices and avoid the overhead of the...
mpm@selenic.com
r116:e484cd5e default
@@ -1,473 +1,493 @@
1 1 # revlog.py - storage back-end for mercurial
2 2 #
3 3 # This provides efficient delta storage with O(1) retrieve and append
4 4 # and O(changes) merge between branches
5 5 #
6 6 # Copyright 2005 Matt Mackall <mpm@selenic.com>
7 7 #
8 8 # This software may be used and distributed according to the terms
9 9 # of the GNU General Public License, incorporated herein by reference.
10 10
11 11 import zlib, struct, sha, os, tempfile, binascii
12 12 from mercurial import mdiff
13 13
14 14 def hex(node): return binascii.hexlify(node)
15 15 def bin(node): return binascii.unhexlify(node)
16 16 def short(node): return hex(node[:4])
17 17
18 18 def compress(text):
19 19 if not text: return text
20 20 if len(text) < 44:
21 21 if text[0] == '\0': return text
22 22 return 'u' + text
23 23 bin = zlib.compress(text)
24 24 if len(bin) > len(text):
25 25 if text[0] == '\0': return text
26 26 return 'u' + text
27 27 return bin
28 28
29 29 def decompress(bin):
30 30 if not bin: return bin
31 31 t = bin[0]
32 32 if t == '\0': return bin
33 33 if t == 'x': return zlib.decompress(bin)
34 34 if t == 'u': return bin[1:]
35 35 raise "unknown compression type %s" % t
36 36
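The pair above implements a small compression envelope: empty text and text beginning with a NUL byte pass through untouched, short or incompressible text is stored behind a 'u' marker, and everything else is raw zlib output, which always begins with 'x'. A minimal Python 3 sketch of the same scheme for illustration (bytes replace Python 2 strings, and ValueError replaces the historical string exception, which later Pythons reject); this is not Mercurial's current API:

```python
import zlib

def compress(text: bytes) -> bytes:
    """Envelope as in the patch: pass-through, 'u'-prefixed, or zlib."""
    if not text:
        return text
    if len(text) < 44:
        if text[:1] == b'\0':      # a leading NUL already means "literal"
            return text
        return b'u' + text         # too short to be worth deflating
    bin = zlib.compress(text)
    if len(bin) > len(text):       # incompressible: store verbatim
        if text[:1] == b'\0':
            return text
        return b'u' + text
    return bin                     # zlib output starts with b'x'

def decompress(bin: bytes) -> bytes:
    if not bin:
        return bin
    t = bin[:1]
    if t == b'\0':
        return bin
    if t == b'x':
        return zlib.decompress(bin)
    if t == b'u':
        return bin[1:]
    raise ValueError("unknown compression type %r" % t)

assert decompress(compress(b"hello")) == b"hello"
assert decompress(compress(b"a" * 1000)) == b"a" * 1000
```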
37 37 def hash(text, p1, p2):
38 38 l = [p1, p2]
39 39 l.sort()
40 40 return sha.sha(l[0] + l[1] + text).digest()
41 41
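hash() above defines the node id: SHA-1 over the two parent ids in sorted order followed by the text, so the id commits to both content and history while staying independent of parent order. A Python 3 sketch using hashlib (the `sha` module the patch imports no longer exists); `hash_node` is a made-up name for illustration:

```python
import hashlib

nullid = b"\0" * 20

def hash_node(text: bytes, p1: bytes, p2: bytes) -> bytes:
    # sort the parents so (p1, p2) and (p2, p1) give the same id
    a, b = sorted([p1, p2])
    return hashlib.sha1(a + b + text).digest()

# swapping the parents does not change the node id
assert hash_node(b"data", nullid, b"\1" * 20) == \
       hash_node(b"data", b"\1" * 20, nullid)
```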
42 42 nullid = "\0" * 20
43 43 indexformat = ">4l20s20s20s"
44 44
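indexformat fixes every index entry at 76 bytes: four big-endian 32-bit ints (offset, size, base, linkrev) followed by three 20-byte SHA-1 ids (p1, p2, nodeid). A quick round-trip with struct, using made-up field values:

```python
import struct

indexformat = ">4l20s20s20s"
nullid = b"\0" * 20

assert struct.calcsize(indexformat) == 76  # 4*4 + 3*20

# offset, size, base, linkrev, p1, p2, nodeid (example values only)
e = (0, 120, 0, 0, nullid, nullid, b"\x12" * 20)
entry = struct.pack(indexformat, *e)
assert struct.unpack(indexformat, entry) == e
```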
45 45 class lazyparser:
46 46 def __init__(self, data):
47 47 self.data = data
48 48 self.s = struct.calcsize(indexformat)
49 49 self.l = len(data)/self.s
50 50 self.index = [None] * self.l
51 51 self.map = {nullid: -1}
52 52
53 53 if 0:
54 54 n = 0
55 55 i = self.data
56 56 s = struct.calcsize(indexformat)
57 57 for f in xrange(0, len(i), s):
58 58 # offset, size, base, linkrev, p1, p2, nodeid
59 59 e = struct.unpack(indexformat, i[f:f + s])
60 60 self.map[e[6]] = n
61 61 self.index.append(e)
62 62 n += 1
63 63
64 64 def load(self, pos):
65 65 block = pos / 1000
66 66 i = block * 1000
67 67 end = min(self.l, i + 1000)
68 68 while i < end:
69 69 d = self.data[i * self.s: (i + 1) * self.s]
70 70 e = struct.unpack(indexformat, d)
71 71 self.index[i] = e
72 72 self.map[e[6]] = i
73 73 i += 1
74 74
75 75 class lazyindex:
76 76 def __init__(self, parser):
77 77 self.p = parser
78 78 def __len__(self):
79 79 return len(self.p.index)
80 80 def load(self, pos):
81 81 self.p.load(pos)
82 82 return self.p.index[pos]
83 83 def __getitem__(self, pos):
84 84 return self.p.index[pos] or self.load(pos)
85 85 def append(self, e):
86 86 self.p.index.append(e)
87 87
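lazyparser and lazyindex together defer parsing: entries start as None, and load() fills in the whole 1000-entry block around the requested position, so one access pays for its neighbors. The `or self.load(pos)` in __getitem__ works because a parsed entry is a non-empty tuple, hence truthy. A self-contained Python 3 sketch of the same pattern (the class and constant names are mine, not the patch's; note the `//` where Python 2 relied on integer `/`):

```python
import struct

class LazyParser:
    """On-demand index parsing, as in the patch's lazyparser/lazyindex."""
    BLOCK = 1000  # entries parsed per on-demand load

    def __init__(self, data: bytes, fmt: str = ">4l20s20s20s"):
        self.data = data
        self.fmt = fmt
        self.s = struct.calcsize(fmt)
        self.l = len(data) // self.s
        self.index = [None] * self.l   # filled block by block

    def load(self, pos: int):
        """Parse the whole block containing entry `pos`."""
        i = (pos // self.BLOCK) * self.BLOCK
        end = min(self.l, i + self.BLOCK)
        while i < end:
            d = self.data[i * self.s:(i + 1) * self.s]
            self.index[i] = struct.unpack(self.fmt, d)
            i += 1
        return self.index[pos]

    def __getitem__(self, pos: int):
        # a parsed entry is a non-empty tuple, hence truthy
        return self.index[pos] or self.load(pos)
```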
88 88 class lazymap:
89 89 def __init__(self, parser):
90 90 self.p = parser
91 91 def load(self, key):
92 92 n = self.p.data.find(key)
93 93 if n < 0: raise KeyError("node " + hex(key))
94 94 pos = n / self.p.s
95 95 self.p.load(pos)
96 96 def __contains__(self, key):
97 97 try:
98 98 self[key]
99 99 return True
100 100 except KeyError:
101 101 return False
102 102 def __iter__(self):
103 103 for i in xrange(self.p.l):
104 104 try:
105 105 yield self.p.index[i][6]
106 106 except:
107 107 self.p.load(i)
108 108 yield self.p.index[i][6]
109 109 def __getitem__(self, key):
110 110 try:
111 111 return self.p.map[key]
112 112 except KeyError:
113 113 try:
114 114 self.load(key)
115 115 return self.p.map[key]
116 116 except KeyError:
117 117 raise KeyError("node " + hex(key))
118 118 def __setitem__(self, key, val):
119 119 self.p.map[key] = val
120 120
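lazymap resolves a node id without parsing the whole index: data.find(key) locates the 20-byte id in the raw bytes, and because the nodeid is the last field of a fixed 76-byte record (it starts 56 bytes in, after four ints and two parent ids), floor-dividing the match offset by the record size recovers the entry number, whose block is then loaded. The arithmetic in isolation, with `entry_for_match` as a hypothetical helper:

```python
import struct

s = struct.calcsize(">4l20s20s20s")   # 76 bytes per entry

def entry_for_match(offset: int) -> int:
    # an id matched inside record f lies in [f*s, (f+1)*s),
    # so floor division recovers f
    return offset // s

# the nodeid of entry 3 starts 56 bytes into that record
assert entry_for_match(3 * s + 56) == 3
```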
121 121 class revlog:
122 122 def __init__(self, opener, indexfile, datafile):
123 123 self.indexfile = indexfile
124 124 self.datafile = datafile
125 125 self.opener = opener
126 126 self.cache = None
127 # read the whole index for now, handle on-demand later
127
128 128 try:
129 129 i = self.opener(self.indexfile).read()
130 130 except IOError:
131 131 i = ""
132 parser = lazyparser(i)
133 self.index = lazyindex(parser)
134 self.nodemap = lazymap(parser)
132
133 if len(i) > 10000:
134 # big index, let's parse it on demand
135 parser = lazyparser(i)
136 self.index = lazyindex(parser)
137 self.nodemap = lazymap(parser)
138 else:
139 s = struct.calcsize(indexformat)
140 l = len(i) / s
141 self.index = [None] * l
142 m = [None] * l
143
144 n = 0
145 for f in xrange(0, len(i), s):
146 # offset, size, base, linkrev, p1, p2, nodeid
147 e = struct.unpack(indexformat, i[f:f + s])
148 m[n] = (e[6], n)
149 self.index[n] = e
150 n += 1
151
152 self.nodemap = dict(m)
153 self.nodemap[nullid] = -1
154
135 155
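This hunk is the heart of the commit: small indices are now parsed eagerly into a plain list and dict, which are cheaper per access, and the lazy machinery is reserved for indices over 10000 bytes (roughly 131 entries at 76 bytes each). A sketch of the decision as a standalone function; `parse_index` is a made-up name, and the lazy branch is elided in favor of the LazyParser sketch above:

```python
import struct

indexformat = ">4l20s20s20s"
nullid = b"\0" * 20

def parse_index(data: bytes, threshold: int = 10000):
    """Eagerly parse a small index into (index, nodemap)."""
    if len(data) > threshold:
        raise NotImplementedError("big index: use the lazy path")
    s = struct.calcsize(indexformat)
    index, nodemap = [], {nullid: -1}
    for f in range(0, len(data), s):
        # offset, size, base, linkrev, p1, p2, nodeid
        e = struct.unpack(indexformat, data[f:f + s])
        nodemap[e[6]] = len(index)   # nodeid -> rev
        index.append(e)
    return index, nodemap
```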
136 156 def tip(self): return self.node(len(self.index) - 1)
137 157 def count(self): return len(self.index)
138 158 def node(self, rev): return (rev < 0) and nullid or self.index[rev][6]
139 159 def rev(self, node): return self.nodemap[node]
140 160 def linkrev(self, node): return self.index[self.nodemap[node]][3]
141 161 def parents(self, node):
142 162 if node == nullid: return (nullid, nullid)
143 163 return self.index[self.nodemap[node]][4:6]
144 164
145 165 def start(self, rev): return self.index[rev][0]
146 166 def length(self, rev): return self.index[rev][1]
147 167 def end(self, rev): return self.start(rev) + self.length(rev)
148 168 def base(self, rev): return self.index[rev][2]
149 169
150 170 def lookup(self, id):
151 171 try:
152 172 rev = int(id)
153 173 return self.node(rev)
154 174 except ValueError:
155 175 c = []
156 176 for n in self.nodemap:
157 177 if id in hex(n):
158 178 c.append(n)
159 179 if len(c) > 1: raise KeyError("Ambiguous identifier")
160 180 if len(c) < 1: raise KeyError("No match found")
161 181 return c[0]
162 182
163 183 return None
164 184
165 185 def diff(self, a, b):
166 186 return mdiff.textdiff(a, b)
167 187
168 188 def patches(self, t, pl):
169 189 return mdiff.patches(t, pl)
170 190
171 191 def revision(self, node):
172 192 if node == nullid: return ""
173 193 if self.cache and self.cache[0] == node: return self.cache[2]
174 194
175 195 text = None
176 196 rev = self.rev(node)
177 197 base = self.base(rev)
178 198 start = self.start(base)
179 199 end = self.end(rev)
180 200
181 201 if self.cache and self.cache[1] >= base and self.cache[1] < rev:
182 202 base = self.cache[1]
183 203 start = self.start(base + 1)
184 204 text = self.cache[2]
185 205 last = 0
186 206
187 207 f = self.opener(self.datafile)
188 208 f.seek(start)
189 209 data = f.read(end - start)
190 210
191 211 if not text:
192 212 last = self.length(base)
193 213 text = decompress(data[:last])
194 214
195 215 bins = []
196 216 for r in xrange(base + 1, rev + 1):
197 217 s = self.length(r)
198 218 bins.append(decompress(data[last:last + s]))
199 219 last = last + s
200 220
201 221 text = mdiff.patches(text, bins)
202 222
203 223 (p1, p2) = self.parents(node)
204 224 if node != hash(text, p1, p2):
205 225 raise IOError("integrity check failed on %s:%d"
206 226 % (self.datafile, rev))
207 227
208 228 self.cache = (node, rev, text)
209 229 return text
210 230
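revision() reconstructs a text with a single contiguous read: the full version stored at base(rev) plus every delta up to rev, applied in order; the (node, rev, text) cache lets a later request resume partway down the same chain instead of starting from the base. The chain walk in isolation, with read_chunk and apply_patches as stand-ins for the real decompression and mdiff.patches calls:

```python
def reconstruct(read_chunk, apply_patches, base: int, rev: int) -> bytes:
    """Rebuild revision `rev` whose full snapshot lives at `base`."""
    text = read_chunk(base)                          # full version
    deltas = [read_chunk(r) for r in range(base + 1, rev + 1)]
    return apply_patches(text, deltas) if deltas else text

# toy use: a patcher that simply returns the last delta
chunks = {0: b"base", 1: b"d1", 2: b"d2"}
assert reconstruct(chunks.__getitem__, lambda t, ds: ds[-1], 0, 2) == b"d2"
```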
211 231 def addrevision(self, text, transaction, link, p1=None, p2=None):
212 232 if text is None: text = ""
213 233 if p1 is None: p1 = self.tip()
214 234 if p2 is None: p2 = nullid
215 235
216 236 node = hash(text, p1, p2)
217 237
218 238 n = self.count()
219 239 t = n - 1
220 240
221 241 if n:
222 242 base = self.base(t)
223 243 start = self.start(base)
224 244 end = self.end(t)
225 245 prev = self.revision(self.tip())
226 246 d = self.diff(prev, text)
227 247 if self.patches(prev, [d]) != text:
228 248 raise AssertionError("diff failed")
229 249 data = compress(d)
230 250 dist = end - start + len(data)
231 251
232 252 # full versions are inserted when the needed deltas
233 253 # become comparable to the uncompressed text
234 254 if not n or dist > len(text) * 2:
235 255 data = compress(text)
236 256 base = n
237 257 else:
238 258 base = self.base(t)
239 259
240 260 offset = 0
241 261 if t >= 0:
242 262 offset = self.end(t)
243 263
244 264 e = (offset, len(data), base, link, p1, p2, node)
245 265
246 266 self.index.append(e)
247 267 self.nodemap[node] = n
248 268 entry = struct.pack(indexformat, *e)
249 269
250 270 transaction.add(self.datafile, e[0])
251 271 self.opener(self.datafile, "a").write(data)
252 272 transaction.add(self.indexfile, n * len(entry))
253 273 self.opener(self.indexfile, "a").write(entry)
254 274
255 275 self.cache = (node, n, text)
256 276 return node
257 277
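The snapshot heuristic above bounds reconstruction cost: a new revision is stored as a delta against tip unless the bytes spanned by the resulting chain (end - start + len(data)) would exceed twice the uncompressed text, in which case a fresh full version is written and the chain restarts. The decision in isolation, as a hypothetical helper:

```python
def choose_storage(n: int, dist: int, textlen: int) -> str:
    """n: current revision count; dist: bytes the delta chain would span."""
    # an empty log, or a chain costlier than ~2x the plain text,
    # forces a full snapshot
    if not n or dist > textlen * 2:
        return "full"
    return "delta"

assert choose_storage(0, 0, 100) == "full"      # first revision
assert choose_storage(5, 150, 100) == "delta"   # chain still cheap
assert choose_storage(5, 300, 100) == "full"    # chain too long
```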
258 278 def ancestor(self, a, b):
259 279 def expand(list, map):
260 280 a = []
261 281 while list:
262 282 n = list.pop(0)
263 283 map[n] = 1
264 284 yield n
265 285 for p in self.parents(n):
266 286 if p != nullid and p not in map:
267 287 list.append(p)
268 288 yield nullid
269 289
270 290 amap = {}
271 291 bmap = {}
272 292 ag = expand([a], amap)
273 293 bg = expand([b], bmap)
274 294 adone = bdone = 0
275 295
276 296 while not adone or not bdone:
277 297 if not adone:
278 298 an = ag.next()
279 299 if an == nullid:
280 300 adone = 1
281 301 elif an in bmap:
282 302 return an
283 303 if not bdone:
284 304 bn = bg.next()
285 305 if bn == nullid:
286 306 bdone = 1
287 307 elif bn in amap:
288 308 return bn
289 309
290 310 return nullid
291 311
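ancestor() runs a breadth-first walk up each history in lockstep, marking nodes as seen per side; the first node one walk yields that the other side has already seen is a common ancestor. A self-contained Python 3 rendering over a plain parents dict (the dict and None stand in for self.parents and nullid):

```python
from collections import deque

def ancestor(a, b, parents):
    """parents maps node -> (p1, p2); None plays the role of nullid."""
    def walk(start, seen):
        q = deque([start])
        while q:
            n = q.popleft()
            seen.add(n)
            yield n
            for p in parents.get(n, (None, None)):
                if p is not None and p not in seen:
                    q.append(p)

    aseen, bseen = set(), set()
    ag, bg = walk(a, aseen), walk(b, bseen)
    adone = bdone = False
    while not (adone and bdone):
        if not adone:
            an = next(ag, None)
            if an is None:
                adone = True
            elif an in bseen:
                return an
        if not bdone:
            bn = next(bg, None)
            if bn is None:
                bdone = True
            elif bn in aseen:
                return bn
    return None

# tiny history: revisions 3 and 4 both descend from 1
parents = {1: (None, None), 2: (1, None), 3: (2, None), 4: (1, None)}
assert ancestor(3, 4, parents) == 1
```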
292 312 def group(self, linkmap):
293 313 # given a list of changeset revs, return a set of deltas and
294 314 # metadata corresponding to nodes. the first delta is
295 315 # parent(nodes[0]) -> nodes[0] the receiver is guaranteed to
296 316 # have this parent as it has all history before these
297 317 # changesets. parent is parent[0]
298 318
299 319 revs = []
300 320 needed = {}
301 321
302 322 # find file nodes/revs that match changeset revs
303 323 for i in xrange(0, self.count()):
304 324 if self.index[i][3] in linkmap:
305 325 revs.append(i)
306 326 needed[i] = 1
307 327
308 328 # if we don't have any revisions touched by these changesets, bail
309 329 if not revs: return struct.pack(">l", 0)
310 330
311 331 # add the parent of the first rev
312 332 p = self.parents(self.node(revs[0]))[0]
313 333 revs.insert(0, self.rev(p))
314 334
315 335 # for each delta that isn't contiguous in the log, we need to
316 336 # reconstruct the base, reconstruct the result, and then
317 337 # calculate the delta. We also need to do this where we've
318 338 # stored a full version and not a delta
319 339 for i in xrange(0, len(revs) - 1):
320 340 a, b = revs[i], revs[i + 1]
321 341 if a + 1 != b or self.base(b) == b:
322 342 for j in xrange(self.base(a), a + 1):
323 343 needed[j] = 1
324 344 for j in xrange(self.base(b), b + 1):
325 345 needed[j] = 1
326 346
327 347 # calculate spans to retrieve from datafile
328 348 needed = needed.keys()
329 349 needed.sort()
330 350 spans = []
331 351 for n in needed:
332 352 if n < 0: continue
333 353 o = self.start(n)
334 354 l = self.length(n)
335 355 spans.append((o, l, [(n, l)]))
336 356
337 357 # merge spans
338 358 merge = [spans.pop(0)]
339 359 while spans:
340 360 e = spans.pop(0)
341 361 f = merge[-1]
342 362 if e[0] == f[0] + f[1]:
343 363 merge[-1] = (f[0], f[1] + e[1], f[2] + e[2])
344 364 else:
345 365 merge.append(e)
346 366
347 367 # read spans in, divide up chunks
348 368 chunks = {}
349 369 for span in merge:
350 370 # we reopen the file for each span to make http happy for now
351 371 f = self.opener(self.datafile)
352 372 f.seek(span[0])
353 373 data = f.read(span[1])
354 374
355 375 # divide up the span
356 376 pos = 0
357 377 for r, l in span[2]:
358 378 chunks[r] = data[pos: pos + l]
359 379 pos += l
360 380
361 381 # helper to reconstruct intermediate versions
362 382 def construct(text, base, rev):
363 383 bins = [decompress(chunks[r]) for r in xrange(base + 1, rev + 1)]
364 384 return mdiff.patches(text, bins)
365 385
366 386 # build deltas
367 387 deltas = []
368 388 for d in xrange(0, len(revs) - 1):
369 389 a, b = revs[d], revs[d + 1]
370 390 n = self.node(b)
371 391
372 392 if a + 1 != b or self.base(b) == b:
373 393 if a >= 0:
374 394 base = self.base(a)
375 395 ta = decompress(chunks[self.base(a)])
376 396 ta = construct(ta, base, a)
377 397 else:
378 398 ta = ""
379 399
380 400 base = self.base(b)
381 401 if a > base:
382 402 base = a
383 403 tb = ta
384 404 else:
385 405 tb = decompress(chunks[self.base(b)])
386 406 tb = construct(tb, base, b)
387 407 d = self.diff(ta, tb)
388 408 else:
389 409 d = decompress(chunks[b])
390 410
391 411 p = self.parents(n)
392 412 meta = n + p[0] + p[1] + linkmap[self.linkrev(n)]
393 413 l = struct.pack(">l", len(meta) + len(d) + 4)
394 414 deltas.append(l + meta + d)
395 415
396 416 l = struct.pack(">l", sum(map(len, deltas)) + 4)
397 417 deltas.insert(0, l)
398 418 return "".join(deltas)
399 419
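group() frames each delta as a 4-byte big-endian length, 80 bytes of metadata (node, p1, p2, and the changeset the receiver should link it to), and the delta body, then prefixes the whole stream with its own total length; addgroup below parses chunks back with the matching ">l20s20s20s20s" unpack. A round-trip sketch of one chunk (frame/parse are made-up names):

```python
import struct

def frame(node: bytes, p1: bytes, p2: bytes, cs: bytes,
          delta: bytes) -> bytes:
    """One group chunk: length prefix + 80-byte header + delta body."""
    meta = node + p1 + p2 + cs                       # four 20-byte ids
    return struct.pack(">l", len(meta) + len(delta) + 4) + meta + delta

def parse(data: bytes, pos: int = 0):
    """Inverse of frame(); returns the fields and the next position."""
    l, node, p1, p2, cs = struct.unpack(">l20s20s20s20s",
                                        data[pos:pos + 84])
    delta = data[pos + 84:pos + l]
    return (node, p1, p2, cs, delta), pos + l

n, c = b"\1" * 20, b"\2" * 20
chunk = frame(n, n, n, c, b"some delta")
fields, nxt = parse(chunk)
assert fields == (n, n, n, c, b"some delta") and nxt == len(chunk)
```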
400 420 def addgroup(self, data, linkmapper, transaction):
401 421 # given a set of deltas, add them to the revision log. the
402 422 # first delta is against its parent, which should be in our
403 423 # log, the rest are against the previous delta.
404 424
405 425 if not data: return self.tip()
406 426
407 427 # retrieve the parent revision of the delta chain
408 428 chain = data[24:44]
409 429 if not chain in self.nodemap:
410 430 raise "unknown base %s" % short(chain[:4])
411 431
412 432 # track the base of the current delta log
413 433 r = self.count()
414 434 t = r - 1
415 435
416 436 base = prev = -1
417 437 start = end = 0
418 438 if r:
419 439 start = self.start(self.base(t))
420 440 end = self.end(t)
421 441 measure = self.length(self.base(t))
422 442 base = self.base(t)
423 443 prev = self.tip()
424 444
425 445 transaction.add(self.datafile, end)
426 446 transaction.add(self.indexfile, r * struct.calcsize(indexformat))
427 447 dfh = self.opener(self.datafile, "a")
428 448 ifh = self.opener(self.indexfile, "a")
429 449
430 450 # loop through our set of deltas
431 451 pos = 0
432 452 while pos < len(data):
433 453 l, node, p1, p2, cs = struct.unpack(">l20s20s20s20s",
434 454 data[pos:pos+84])
435 455 link = linkmapper(cs)
436 456 if node in self.nodemap:
437 457 raise "already have %s" % hex(node[:4])
438 458 delta = data[pos + 84:pos + l]
439 459 pos += l
440 460
441 461 # full versions are inserted when the needed deltas become
442 462 # comparable to the uncompressed text or when the previous
443 463 # version is not the one we have a delta against. We use
444 464 # the size of the previous full rev as a proxy for the
445 465 # current size.
446 466
447 467 if chain == prev:
448 468 cdelta = compress(delta)
449 469
450 470 if chain != prev or (end - start + len(cdelta)) > measure * 2:
451 471 # flush our writes here so we can read it in revision
452 472 dfh.flush()
453 473 ifh.flush()
454 474 text = self.revision(chain)
455 475 text = self.patches(text, [delta])
456 476 chk = self.addrevision(text, transaction, link, p1, p2)
457 477 if chk != node:
458 478 raise "consistency error adding group"
459 479 measure = len(text)
460 480 else:
461 481 e = (end, len(cdelta), self.base(t), link, p1, p2, node)
462 482 self.index.append(e)
463 483 self.nodemap[node] = r
464 484 dfh.write(cdelta)
465 485 ifh.write(struct.pack(indexformat, *e))
466 486
467 487 t, r, chain, prev = r, r + 1, node, node
468 488 start = self.start(self.base(t))
469 489 end = self.end(t)
470 490
471 491 dfh.close()
472 492 ifh.close()
473 493 return node
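A closing note on addgroup: it mirrors addrevision's heuristic, but since the incoming text's size is unknown until the delta is applied, it uses the length of the last full version (measure) as a proxy. Note also that cdelta is only computed when chain == prev, which is safe because the slow path is always taken when chain != prev. The fast-path guard in isolation (a hypothetical helper, names mirroring the patch):

```python
def fast_path(chain: bytes, prev: bytes, start: int, end: int,
              cdelta_len: int, measure: int) -> bool:
    """True when the compressed delta may be appended as-is: it chains
    off the last node written and keeps the chain under 2x the size
    of the last full version."""
    return chain == prev and (end - start + cdelta_len) <= measure * 2
```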