Add code to retrieve or construct a revlog delta
mpm@selenic.com
r119:c7a66f97 default
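The new delta(node) method added in this commit either constructs a delta on the fly or retrieves a stored one: when a revision is the base of its own delta chain it is stored as a full snapshot, so the delta against its predecessor has to be computed with mdiff; otherwise the compressed delta already sits in the data file and can be read back directly. A minimal standalone sketch of that decision (get_delta and its index/helper arguments are illustrative stand-ins, not the real revlog internals):

    def get_delta(index, datafile, rev, reconstruct, diff, decompress):
        # index maps rev -> (start, length, base); reconstruct, diff and
        # decompress stand in for revision(), mdiff.textdiff and the
        # module-level decompress() in the code below
        start, length, base = index[rev]
        if base == rev:
            # stored as a full version: derive the delta from the texts
            return diff(reconstruct(rev - 1), reconstruct(rev))
        # stored as a delta: read it straight off disk
        datafile.seek(start)
        return decompress(datafile.read(length))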
@@ -1,492 +1,504 @@
1 1 # revlog.py - storage back-end for mercurial
2 2 #
3 3 # This provides efficient delta storage with O(1) retrieve and append
4 4 # and O(changes) merge between branches
5 5 #
6 6 # Copyright 2005 Matt Mackall <mpm@selenic.com>
7 7 #
8 8 # This software may be used and distributed according to the terms
9 9 # of the GNU General Public License, incorporated herein by reference.
10 10
11 11 import zlib, struct, sha, os, tempfile, binascii
12 12 from mercurial import mdiff
13 13
14 14 def hex(node): return binascii.hexlify(node)
15 15 def bin(node): return binascii.unhexlify(node)
16 16 def short(node): return hex(node[:4])
17 17
18 18 def compress(text):
19 19 if not text: return text
20 20 if len(text) < 44:
21 21 if text[0] == '\0': return text
22 22 return 'u' + text
23 23 bin = zlib.compress(text)
24 24 if len(bin) > len(text):
25 25 if text[0] == '\0': return text
26 26 return 'u' + text
27 27 return bin
28 28
29 29 def decompress(bin):
30 30 if not bin: return bin
31 31 t = bin[0]
32 32 if t == '\0': return bin
33 33 if t == 'x': return zlib.decompress(bin)
34 34 if t == 'u': return bin[1:]
35 35 raise "unknown compression type %s" % t
36 36
37 37 def hash(text, p1, p2):
38 38 l = [p1, p2]
39 39 l.sort()
40 40 return sha.sha(l[0] + l[1] + text).digest()
41 41
42 42 nullid = "\0" * 20
43 43 indexformat = ">4l20s20s20s"
44 44
45 45 class lazyparser:
46 46 def __init__(self, data):
47 47 self.data = data
48 48 self.s = struct.calcsize(indexformat)
49 49 self.l = len(data)/self.s
50 50 self.index = [None] * self.l
51 51 self.map = {nullid: -1}
52 52
53 53 if 0:
54 54 n = 0
55 55 i = self.data
56 56 s = struct.calcsize(indexformat)
57 57 for f in xrange(0, len(i), s):
58 58 # offset, size, base, linkrev, p1, p2, nodeid
59 59 e = struct.unpack(indexformat, i[f:f + s])
60 60 self.map[e[6]] = n
61 61 self.index.append(e)
62 62 n += 1
63 63
64 64 def load(self, pos):
65 65 block = pos / 1000
66 66 i = block * 1000
67 67 end = min(self.l, i + 1000)
68 68 while i < end:
69 69 d = self.data[i * self.s: (i + 1) * self.s]
70 70 e = struct.unpack(indexformat, d)
71 71 self.index[i] = e
72 72 self.map[e[6]] = i
73 73 i += 1
74 74
75 75 class lazyindex:
76 76 def __init__(self, parser):
77 77 self.p = parser
78 78 def __len__(self):
79 79 return len(self.p.index)
80 80 def load(self, pos):
81 81 self.p.load(pos)
82 82 return self.p.index[pos]
83 83 def __getitem__(self, pos):
84 84 return self.p.index[pos] or self.load(pos)
85 85 def append(self, e):
86 86 self.p.index.append(e)
87 87
88 88 class lazymap:
89 89 def __init__(self, parser):
90 90 self.p = parser
91 91 def load(self, key):
92 92 n = self.p.data.find(key)
93 93 if n < 0: raise KeyError("node " + hex(key))
94 94 pos = n / self.p.s
95 95 self.p.load(pos)
96 96 def __contains__(self, key):
97 97 try:
98 98 self[key]
99 99 return True
100 100 except KeyError:
101 101 return False
102 102 def __iter__(self):
103 103 for i in xrange(self.p.l):
104 104 try:
105 105 yield self.p.index[i][6]
106 106 except:
107 107 self.p.load(i)
108 108 yield self.p.index[i][6]
109 109 def __getitem__(self, key):
110 110 try:
111 111 return self.p.map[key]
112 112 except KeyError:
113 113 try:
114 114 self.load(key)
115 115 return self.p.map[key]
116 116 except KeyError:
117 117 raise KeyError("node " + hex(key))
118 118 def __setitem__(self, key, val):
119 119 self.p.map[key] = val
120 120
121 121 class revlog:
122 122 def __init__(self, opener, indexfile, datafile):
123 123 self.indexfile = indexfile
124 124 self.datafile = datafile
125 125 self.opener = opener
126 126 self.cache = None
127 127
128 128 try:
129 129 i = self.opener(self.indexfile).read()
130 130 except IOError:
131 131 i = ""
132 132
133 133 if len(i) > 10000:
134 134 # big index, let's parse it on demand
135 135 parser = lazyparser(i)
136 136 self.index = lazyindex(parser)
137 137 self.nodemap = lazymap(parser)
138 138 else:
139 139 s = struct.calcsize(indexformat)
140 140 l = len(i) / s
141 141 self.index = [None] * l
142 142 m = [None] * l
143 143
144 144 n = 0
145 145 for f in xrange(0, len(i), s):
146 146 # offset, size, base, linkrev, p1, p2, nodeid
147 147 e = struct.unpack(indexformat, i[f:f + s])
148 148 m[n] = (e[6], n)
149 149 self.index[n] = e
150 150 n += 1
151 151
152 152 self.nodemap = dict(m)
153 153 self.nodemap[nullid] = -1
154 154
155 155
156 156 def tip(self): return self.node(len(self.index) - 1)
157 157 def count(self): return len(self.index)
158 158 def node(self, rev): return (rev < 0) and nullid or self.index[rev][6]
159 159 def rev(self, node): return self.nodemap[node]
160 160 def linkrev(self, node): return self.index[self.nodemap[node]][3]
161 161 def parents(self, node):
162 162 if node == nullid: return (nullid, nullid)
163 163 return self.index[self.nodemap[node]][4:6]
164 164
165 165 def start(self, rev): return self.index[rev][0]
166 166 def length(self, rev): return self.index[rev][1]
167 167 def end(self, rev): return self.start(rev) + self.length(rev)
168 168 def base(self, rev): return self.index[rev][2]
169 169
170 170 def lookup(self, id):
171 171 try:
172 172 rev = int(id)
173 173 return self.node(rev)
174 174 except ValueError:
175 175 c = []
176 176 for n in self.nodemap:
177 177 if id in hex(n):
178 178 c.append(n)
179 179 if len(c) > 1: raise KeyError("Ambiguous identifier")
180 180 if len(c) < 1: raise KeyError("No match found")
181 181 return c[0]
182 182
183 183 return None
184 184
185 185 def diff(self, a, b):
186 186 return mdiff.textdiff(a, b)
187 187
188 188 def patches(self, t, pl):
189 189 return mdiff.patches(t, pl)
190 190
191 def delta(self, node):
192 r = self.rev(node)
193 b = self.base(r)
194 if r == b:
195 return self.diff(self.revision(self.node(r - 1)),
196 self.revision(node))
197 else:
198 f = self.opener(self.datafile)
199 f.seek(self.start(r))
200 data = f.read(self.length(r))
201 return decompress(data)
202
191 203 def revision(self, node):
192 204 if node == nullid: return ""
193 205 if self.cache and self.cache[0] == node: return self.cache[2]
194 206
195 207 text = None
196 208 rev = self.rev(node)
197 209 start, length, base, link, p1, p2, node = self.index[rev]
198 210 end = start + length
199 211 if base != rev: start = self.start(base)
200 212
201 213 if self.cache and self.cache[1] >= base and self.cache[1] < rev:
202 214 base = self.cache[1]
203 215 start = self.start(base + 1)
204 216 text = self.cache[2]
205 217 last = 0
206 218
207 219 f = self.opener(self.datafile)
208 220 f.seek(start)
209 221 data = f.read(end - start)
210 222
211 223 if not text:
212 224 last = self.length(base)
213 225 text = decompress(data[:last])
214 226
215 227 bins = []
216 228 for r in xrange(base + 1, rev + 1):
217 229 s = self.length(r)
218 230 bins.append(decompress(data[last:last + s]))
219 231 last = last + s
220 232
221 233 text = mdiff.patches(text, bins)
222 234
223 235 if node != hash(text, p1, p2):
224 236 raise IOError("integrity check failed on %s:%d"
225 237 % (self.datafile, rev))
226 238
227 239 self.cache = (node, rev, text)
228 240 return text
229 241
230 242 def addrevision(self, text, transaction, link, p1=None, p2=None):
231 243 if text is None: text = ""
232 244 if p1 is None: p1 = self.tip()
233 245 if p2 is None: p2 = nullid
234 246
235 247 node = hash(text, p1, p2)
236 248
237 249 n = self.count()
238 250 t = n - 1
239 251
240 252 if n:
241 253 base = self.base(t)
242 254 start = self.start(base)
243 255 end = self.end(t)
244 256 prev = self.revision(self.tip())
245 257 d = self.diff(prev, text)
246 258 if self.patches(prev, [d]) != text:
247 259 raise AssertionError("diff failed")
248 260 data = compress(d)
249 261 dist = end - start + len(data)
250 262
251 263 # full versions are inserted when the needed deltas
252 264 # become comparable to the uncompressed text
253 265 if not n or dist > len(text) * 2:
254 266 data = compress(text)
255 267 base = n
256 268 else:
257 269 base = self.base(t)
258 270
259 271 offset = 0
260 272 if t >= 0:
261 273 offset = self.end(t)
262 274
263 275 e = (offset, len(data), base, link, p1, p2, node)
264 276
265 277 self.index.append(e)
266 278 self.nodemap[node] = n
267 279 entry = struct.pack(indexformat, *e)
268 280
269 281 transaction.add(self.datafile, e[0])
270 282 self.opener(self.datafile, "a").write(data)
271 283 transaction.add(self.indexfile, n * len(entry))
272 284 self.opener(self.indexfile, "a").write(entry)
273 285
274 286 self.cache = (node, n, text)
275 287 return node
276 288
277 289 def ancestor(self, a, b):
278 290 def expand(list, map):
279 291 a = []
280 292 while list:
281 293 n = list.pop(0)
282 294 map[n] = 1
283 295 yield n
284 296 for p in self.parents(n):
285 297 if p != nullid and p not in map:
286 298 list.append(p)
287 299 yield nullid
288 300
289 301 amap = {}
290 302 bmap = {}
291 303 ag = expand([a], amap)
292 304 bg = expand([b], bmap)
293 305 adone = bdone = 0
294 306
295 307 while not adone or not bdone:
296 308 if not adone:
297 309 an = ag.next()
298 310 if an == nullid:
299 311 adone = 1
300 312 elif an in bmap:
301 313 return an
302 314 if not bdone:
303 315 bn = bg.next()
304 316 if bn == nullid:
305 317 bdone = 1
306 318 elif bn in amap:
307 319 return bn
308 320
309 321 return nullid
310 322
311 323 def group(self, linkmap):
312 324 # given a list of changeset revs, return a set of deltas and
313 325 # metadata corresponding to nodes. the first delta is
314 326 # parent(nodes[0]) -> nodes[0] the receiver is guaranteed to
315 327 # have this parent as it has all history before these
316 328 # changesets. parent is parent[0]
317 329
318 330 revs = []
319 331 needed = {}
320 332
321 333 # find file nodes/revs that match changeset revs
322 334 for i in xrange(0, self.count()):
323 335 if self.index[i][3] in linkmap:
324 336 revs.append(i)
325 337 needed[i] = 1
326 338
327 339 # if we don't have any revisions touched by these changesets, bail
328 340 if not revs: return struct.pack(">l", 0)
329 341
330 342 # add the parent of the first rev
331 343 p = self.parents(self.node(revs[0]))[0]
332 344 revs.insert(0, self.rev(p))
333 345
334 346 # for each delta that isn't contiguous in the log, we need to
335 347 # reconstruct the base, reconstruct the result, and then
336 348 # calculate the delta. We also need to do this where we've
337 349 # stored a full version and not a delta
338 350 for i in xrange(0, len(revs) - 1):
339 351 a, b = revs[i], revs[i + 1]
340 352 if a + 1 != b or self.base(b) == b:
341 353 for j in xrange(self.base(a), a + 1):
342 354 needed[j] = 1
343 355 for j in xrange(self.base(b), b + 1):
344 356 needed[j] = 1
345 357
346 358 # calculate spans to retrieve from datafile
347 359 needed = needed.keys()
348 360 needed.sort()
349 361 spans = []
350 362 for n in needed:
351 363 if n < 0: continue
352 364 o = self.start(n)
353 365 l = self.length(n)
354 366 spans.append((o, l, [(n, l)]))
355 367
356 368 # merge spans
357 369 merge = [spans.pop(0)]
358 370 while spans:
359 371 e = spans.pop(0)
360 372 f = merge[-1]
361 373 if e[0] == f[0] + f[1]:
362 374 merge[-1] = (f[0], f[1] + e[1], f[2] + e[2])
363 375 else:
364 376 merge.append(e)
365 377
366 378 # read spans in, divide up chunks
367 379 chunks = {}
368 380 for span in merge:
369 381 # we reopen the file for each span to make http happy for now
370 382 f = self.opener(self.datafile)
371 383 f.seek(span[0])
372 384 data = f.read(span[1])
373 385
374 386 # divide up the span
375 387 pos = 0
376 388 for r, l in span[2]:
377 389 chunks[r] = data[pos: pos + l]
378 390 pos += l
379 391
380 392 # helper to reconstruct intermediate versions
381 393 def construct(text, base, rev):
382 394 bins = [decompress(chunks[r]) for r in xrange(base + 1, rev + 1)]
383 395 return mdiff.patches(text, bins)
384 396
385 397 # build deltas
386 398 deltas = []
387 399 for d in xrange(0, len(revs) - 1):
388 400 a, b = revs[d], revs[d + 1]
389 401 n = self.node(b)
390 402
391 403 if a + 1 != b or self.base(b) == b:
392 404 if a >= 0:
393 405 base = self.base(a)
394 406 ta = decompress(chunks[self.base(a)])
395 407 ta = construct(ta, base, a)
396 408 else:
397 409 ta = ""
398 410
399 411 base = self.base(b)
400 412 if a > base:
401 413 base = a
402 414 tb = ta
403 415 else:
404 416 tb = decompress(chunks[self.base(b)])
405 417 tb = construct(tb, base, b)
406 418 d = self.diff(ta, tb)
407 419 else:
408 420 d = decompress(chunks[b])
409 421
410 422 p = self.parents(n)
411 423 meta = n + p[0] + p[1] + linkmap[self.linkrev(n)]
412 424 l = struct.pack(">l", len(meta) + len(d) + 4)
413 425 deltas.append(l + meta + d)
414 426
415 427 l = struct.pack(">l", sum(map(len, deltas)) + 4)
416 428 deltas.insert(0, l)
417 429 return "".join(deltas)
418 430
419 431 def addgroup(self, data, linkmapper, transaction):
420 432 # given a set of deltas, add them to the revision log. the
421 433 # first delta is against its parent, which should be in our
422 434 # log, the rest are against the previous delta.
423 435
424 436 if not data: return self.tip()
425 437
426 438 # retrieve the parent revision of the delta chain
427 439 chain = data[24:44]
428 440 if not chain in self.nodemap:
429 441 raise "unknown base %s" % short(chain[:4])
430 442
431 443 # track the base of the current delta log
432 444 r = self.count()
433 445 t = r - 1
434 446
435 447 base = prev = -1
436 448 start = end = 0
437 449 if r:
438 450 start = self.start(self.base(t))
439 451 end = self.end(t)
440 452 measure = self.length(self.base(t))
441 453 base = self.base(t)
442 454 prev = self.tip()
443 455
444 456 transaction.add(self.datafile, end)
445 457 transaction.add(self.indexfile, r * struct.calcsize(indexformat))
446 458 dfh = self.opener(self.datafile, "a")
447 459 ifh = self.opener(self.indexfile, "a")
448 460
449 461 # loop through our set of deltas
450 462 pos = 0
451 463 while pos < len(data):
452 464 l, node, p1, p2, cs = struct.unpack(">l20s20s20s20s",
453 465 data[pos:pos+84])
454 466 link = linkmapper(cs)
455 467 if node in self.nodemap:
456 468 raise "already have %s" % hex(node[:4])
457 469 delta = data[pos + 84:pos + l]
458 470 pos += l
459 471
460 472 # full versions are inserted when the needed deltas become
461 473 # comparable to the uncompressed text or when the previous
462 474 # version is not the one we have a delta against. We use
463 475 # the size of the previous full rev as a proxy for the
464 476 # current size.
465 477
466 478 if chain == prev:
467 479 cdelta = compress(delta)
468 480
469 481 if chain != prev or (end - start + len(cdelta)) > measure * 2:
470 482 # flush our writes here so we can read it in revision
471 483 dfh.flush()
472 484 ifh.flush()
473 485 text = self.revision(chain)
474 486 text = self.patches(text, [delta])
475 487 chk = self.addrevision(text, transaction, link, p1, p2)
476 488 if chk != node:
477 489 raise "consistency error adding group"
478 490 measure = len(text)
479 491 else:
480 492 e = (end, len(cdelta), self.base(t), link, p1, p2, node)
481 493 self.index.append(e)
482 494 self.nodemap[node] = r
483 495 dfh.write(cdelta)
484 496 ifh.write(struct.pack(indexformat, *e))
485 497
486 498 t, r, chain, prev = r, r + 1, node, node
487 499 start = self.start(self.base(t))
488 500 end = self.end(t)
489 501
490 502 dfh.close()
491 503 ifh.close()
492 504 return node
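For context, the byte stream that group() emits and addgroup() consumes is a sequence of length-prefixed chunks: a 4-byte big-endian length that counts itself, four 20-byte nodes (the revision, its two parents, and the linked changeset node), then the delta data. A hedged parsing sketch (iterchunks and its names are descriptive, not from the source; it assumes the group-level length prefix has already been stripped, which is how addgroup() reads its input):

    import struct

    def iterchunks(data):
        # walk the concatenated chunks produced by group()
        pos = 0
        while pos < len(data):
            (l,) = struct.unpack(">l", data[pos:pos + 4])
            node, p1, p2, cs = struct.unpack(">20s20s20s20s",
                                             data[pos + 4:pos + 84])
            yield node, p1, p2, cs, data[pos + 84:pos + l]
            pos += l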
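Both addrevision() and addgroup() use the same heuristic for when to break a delta chain: once the distance from the chain's base through the new compressed delta grows past twice the size of the text, a full version is stored instead (addrevision() measures the uncompressed text directly; addgroup() uses the stored size of the previous full revision as a proxy). A toy illustration with assumed numbers:

    def should_snapshot(chain_start, chain_end, compressed_len, text_len):
        # mirrors the "dist > len(text) * 2" test in addrevision()
        dist = chain_end - chain_start + compressed_len
        return dist > 2 * text_len

    should_snapshot(0, 9000, 1500, 5000)   # True: chain cost exceeds 2x text
    should_snapshot(0, 3000, 500, 5000)    # False: keep appending deltas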