Make compression more intelligent:...
mpm@selenic.com
r112:aea6562a default
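The diff below replaces unconditional zlib in compress()/decompress() with a tagged storage format: empty chunks pass through untouched, chunks shorter than 44 bytes or chunks that zlib would actually grow are stored literally behind a 'u' marker, zlib output is recognized by its leading 'x' byte, and a leading NUL already marks a chunk that can be stored as-is. A minimal Python 3 sketch of the same scheme, assuming only the behavior visible in the diff (compress_rev/decompress_rev are illustrative names, not Mercurial's API):

    import zlib

    def compress_rev(text: bytes) -> bytes:
        # Empty chunks are stored as-is.
        if not text:
            return text
        # Below ~44 bytes zlib's header/checksum overhead can't pay off:
        # store literally, tagging with 'u' unless a leading NUL already
        # distinguishes the chunk from zlib output.
        if len(text) < 44:
            if text[:1] == b'\0':
                return text
            return b'u' + text
        compressed = zlib.compress(text)
        # If zlib made the chunk bigger, fall back to literal storage.
        if len(compressed) > len(text):
            if text[:1] == b'\0':
                return text
            return b'u' + text
        return compressed  # a zlib stream begins with 'x' at default settings

    def decompress_rev(data: bytes) -> bytes:
        # Dispatch on the tag byte written by compress_rev.
        if not data:
            return data
        tag = data[:1]
        if tag == b'\0':
            return data
        if tag == b'x':
            return zlib.decompress(data)
        if tag == b'u':
            return data[1:]
        raise ValueError("unknown compression type %r" % tag)

    # Round-trip the interesting cases.
    for sample in (b"", b"short text", b"\0binary", b"abc" * 200):
        assert decompress_rev(compress_rev(sample)) == sample
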
@@ -1,461 +1,474 @@
1 1 # revlog.py - storage back-end for mercurial
2 2 #
3 3 # This provides efficient delta storage with O(1) retrieve and append
4 4 # and O(changes) merge between branches
5 5 #
6 6 # Copyright 2005 Matt Mackall <mpm@selenic.com>
7 7 #
8 8 # This software may be used and distributed according to the terms
9 9 # of the GNU General Public License, incorporated herein by reference.
10 10
11 11 import zlib, struct, sha, os, tempfile, binascii
12 12 from mercurial import mdiff
13 13
14 14 def hex(node): return binascii.hexlify(node)
15 15 def bin(node): return binascii.unhexlify(node)
16 16 def short(node): return hex(node[:4])
17 17
18 18 def compress(text):
19 return zlib.compress(text)
19 if not text: return text
20 if len(text) < 44:
21 if text[0] == '\0': return text
22 return 'u' + text
23 bin = zlib.compress(text)
24 if len(bin) > len(text):
25 if text[0] == '\0': return text
26 return 'u' + text
27 return bin
20 28
21 29 def decompress(bin):
22 return zlib.decompress(bin)
30 if not bin: return bin
31 t = bin[0]
32 if t == '\0': return bin
33 if t == 'x': return zlib.decompress(bin)
34 if t == 'u': return bin[1:]
35 raise "unknown compression type %s" % t
23 36
24 37 def hash(text, p1, p2):
25 38 l = [p1, p2]
26 39 l.sort()
27 40 return sha.sha(l[0] + l[1] + text).digest()
28 41
29 42 nullid = "\0" * 20
30 43 indexformat = ">4l20s20s20s"
31 44
32 45 class lazyparser:
33 46 def __init__(self, data):
34 47 self.data = data
35 48 self.s = struct.calcsize(indexformat)
36 49 self.l = len(data)/self.s
37 50 self.index = [None] * self.l
38 51 self.map = {nullid: -1}
39 52
40 53 if 0:
41 54 n = 0
42 55 i = self.data
43 56 s = struct.calcsize(indexformat)
44 57 for f in xrange(0, len(i), s):
45 58 # offset, size, base, linkrev, p1, p2, nodeid
46 59 e = struct.unpack(indexformat, i[f:f + s])
47 60 self.map[e[6]] = n
48 61 self.index.append(e)
49 62 n += 1
50 63
51 64 def load(self, pos):
52 65 block = pos / 1000
53 66 i = block * 1000
54 67 end = min(self.l, i + 1000)
55 68 while i < end:
56 69 d = self.data[i * self.s: (i + 1) * self.s]
57 70 e = struct.unpack(indexformat, d)
58 71 self.index[i] = e
59 72 self.map[e[6]] = i
60 73 i += 1
61 74
62 75 class lazyindex:
63 76 def __init__(self, parser):
64 77 self.p = parser
65 78 def __len__(self):
66 79 return len(self.p.index)
67 80 def __getitem__(self, pos):
68 81 i = self.p.index[pos]
69 82 if not i:
70 83 self.p.load(pos)
71 84 return self.p.index[pos]
72 85 return i
73 86 def append(self, e):
74 87 self.p.index.append(e)
75 88
76 89 class lazymap:
77 90 def __init__(self, parser):
78 91 self.p = parser
79 92 def load(self, key):
80 93 n = self.p.data.find(key)
81 94 if n < 0: raise KeyError("node " + hex(key))
82 95 pos = n / self.p.s
83 96 self.p.load(pos)
84 97 def __contains__(self, key):
85 98 try:
86 99 self[key]
87 100 return True
88 101 except KeyError:
89 102 return False
90 103 def __iter__(self):
91 104 for i in xrange(self.p.l):
92 105 try:
93 106 yield self.p.index[i][6]
94 107 except:
95 108 self.p.load(i)
96 109 yield self.p.index[i][6]
97 110 def __getitem__(self, key):
98 111 try:
99 112 return self.p.map[key]
100 113 except KeyError:
101 114 try:
102 115 self.load(key)
103 116 return self.p.map[key]
104 117 except KeyError:
105 118 raise KeyError("node " + hex(key))
106 119 def __setitem__(self, key, val):
107 120 self.p.map[key] = val
108 121
109 122 class revlog:
110 123 def __init__(self, opener, indexfile, datafile):
111 124 self.indexfile = indexfile
112 125 self.datafile = datafile
113 126 self.opener = opener
114 127 self.cache = None
115 128 # read the whole index for now, handle on-demand later
116 129 try:
117 130 i = self.opener(self.indexfile).read()
118 131 except IOError:
119 132 i = ""
120 133 parser = lazyparser(i)
121 134 self.index = lazyindex(parser)
122 135 self.nodemap = lazymap(parser)
123 136
124 137 def tip(self): return self.node(len(self.index) - 1)
125 138 def count(self): return len(self.index)
126 139 def node(self, rev): return (rev < 0) and nullid or self.index[rev][6]
127 140 def rev(self, node): return self.nodemap[node]
128 141 def linkrev(self, node): return self.index[self.nodemap[node]][3]
129 142 def parents(self, node):
130 143 if node == nullid: return (nullid, nullid)
131 144 return self.index[self.nodemap[node]][4:6]
132 145
133 146 def start(self, rev): return self.index[rev][0]
134 147 def length(self, rev): return self.index[rev][1]
135 148 def end(self, rev): return self.start(rev) + self.length(rev)
136 149 def base(self, rev): return self.index[rev][2]
137 150
138 151 def lookup(self, id):
139 152 try:
140 153 rev = int(id)
141 154 return self.node(rev)
142 155 except ValueError:
143 156 c = []
144 157 for n in self.nodemap:
145 158 if id in hex(n):
146 159 c.append(n)
147 160 if len(c) > 1: raise KeyError("Ambiguous identifier")
148 161 if len(c) < 1: raise KeyError("No match found")
149 162 return c[0]
150 163
151 164 return None
152 165
153 166 def diff(self, a, b):
154 167 return mdiff.textdiff(a, b)
155 168
156 169 def patches(self, t, pl):
157 170 return mdiff.patches(t, pl)
158 171
159 172 def revision(self, node):
160 173 if node == nullid: return ""
161 174 if self.cache and self.cache[0] == node: return self.cache[2]
162 175
163 176 text = None
164 177 rev = self.rev(node)
165 178 base = self.base(rev)
166 179 start = self.start(base)
167 180 end = self.end(rev)
168 181
169 182 if self.cache and self.cache[1] >= base and self.cache[1] < rev:
170 183 base = self.cache[1]
171 184 start = self.start(base + 1)
172 185 text = self.cache[2]
173 186 last = 0
174 187
175 188 f = self.opener(self.datafile)
176 189 f.seek(start)
177 190 data = f.read(end - start)
178 191
179 192 if not text:
180 193 last = self.length(base)
181 194 text = decompress(data[:last])
182 195
183 196 bins = []
184 197 for r in xrange(base + 1, rev + 1):
185 198 s = self.length(r)
186 199 bins.append(decompress(data[last:last + s]))
187 200 last = last + s
188 201
189 202 text = mdiff.patches(text, bins)
190 203
191 204 (p1, p2) = self.parents(node)
192 205 if node != hash(text, p1, p2):
193 206 raise IOError("integrity check failed on %s:%d"
194 207 % (self.datafile, rev))
195 208
196 209 self.cache = (node, rev, text)
197 210 return text
198 211
199 212 def addrevision(self, text, transaction, link, p1=None, p2=None):
200 213 if text is None: text = ""
201 214 if p1 is None: p1 = self.tip()
202 215 if p2 is None: p2 = nullid
203 216
204 217 node = hash(text, p1, p2)
205 218
206 219 n = self.count()
207 220 t = n - 1
208 221
209 222 if n:
210 223 base = self.base(t)
211 224 start = self.start(base)
212 225 end = self.end(t)
213 226 prev = self.revision(self.tip())
214 227 d = self.diff(prev, text)
215 228 if self.patches(prev, [d]) != text:
216 229 raise AssertionError("diff failed")
217 230 data = compress(d)
218 231 dist = end - start + len(data)
219 232
220 233 # full versions are inserted when the needed deltas
221 234 # become comparable to the uncompressed text
222 235 if not n or dist > len(text) * 2:
223 236 data = compress(text)
224 237 base = n
225 238 else:
226 239 base = self.base(t)
227 240
228 241 offset = 0
229 242 if t >= 0:
230 243 offset = self.end(t)
231 244
232 245 e = (offset, len(data), base, link, p1, p2, node)
233 246
234 247 self.index.append(e)
235 248 self.nodemap[node] = n
236 249 entry = struct.pack(indexformat, *e)
237 250
238 251 transaction.add(self.datafile, e[0])
239 252 self.opener(self.datafile, "a").write(data)
240 253 transaction.add(self.indexfile, n * len(entry))
241 254 self.opener(self.indexfile, "a").write(entry)
242 255
243 256 self.cache = (node, n, text)
244 257 return node
245 258
246 259 def ancestor(self, a, b):
247 260 def expand(list, map):
248 261 a = []
249 262 while list:
250 263 n = list.pop(0)
251 264 map[n] = 1
252 265 yield n
253 266 for p in self.parents(n):
254 267 if p != nullid and p not in map:
255 268 list.append(p)
256 269 yield nullid
257 270
258 271 amap = {}
259 272 bmap = {}
260 273 ag = expand([a], amap)
261 274 bg = expand([b], bmap)
262 275 adone = bdone = 0
263 276
264 277 while not adone or not bdone:
265 278 if not adone:
266 279 an = ag.next()
267 280 if an == nullid:
268 281 adone = 1
269 282 elif an in bmap:
270 283 return an
271 284 if not bdone:
272 285 bn = bg.next()
273 286 if bn == nullid:
274 287 bdone = 1
275 288 elif bn in amap:
276 289 return bn
277 290
278 291 return nullid
279 292
280 293 def group(self, linkmap):
281 294 # given a list of changeset revs, return a set of deltas and
282 295 # metadata corresponding to nodes. the first delta is
283 296 # parent(nodes[0]) -> nodes[0] the receiver is guaranteed to
284 297 # have this parent as it has all history before these
285 298 # changesets. parent is parent[0]
286 299
287 300 revs = []
288 301 needed = {}
289 302
290 303 # find file nodes/revs that match changeset revs
291 304 for i in xrange(0, self.count()):
292 305 if self.index[i][3] in linkmap:
293 306 revs.append(i)
294 307 needed[i] = 1
295 308
296 309 # if we don't have any revisions touched by these changesets, bail
297 310 if not revs: return struct.pack(">l", 0)
298 311
299 312 # add the parent of the first rev
300 313 p = self.parents(self.node(revs[0]))[0]
301 314 revs.insert(0, self.rev(p))
302 315
303 316 # for each delta that isn't contiguous in the log, we need to
304 317 # reconstruct the base, reconstruct the result, and then
305 318 # calculate the delta. We also need to do this where we've
306 319 # stored a full version and not a delta
307 320 for i in xrange(0, len(revs) - 1):
308 321 a, b = revs[i], revs[i + 1]
309 322 if a + 1 != b or self.base(b) == b:
310 323 for j in xrange(self.base(a), a + 1):
311 324 needed[j] = 1
312 325 for j in xrange(self.base(b), b + 1):
313 326 needed[j] = 1
314 327
315 328 # calculate spans to retrieve from datafile
316 329 needed = needed.keys()
317 330 needed.sort()
318 331 spans = []
319 332 for n in needed:
320 333 if n < 0: continue
321 334 o = self.start(n)
322 335 l = self.length(n)
323 336 spans.append((o, l, [(n, l)]))
324 337
325 338 # merge spans
326 339 merge = [spans.pop(0)]
327 340 while spans:
328 341 e = spans.pop(0)
329 342 f = merge[-1]
330 343 if e[0] == f[0] + f[1]:
331 344 merge[-1] = (f[0], f[1] + e[1], f[2] + e[2])
332 345 else:
333 346 merge.append(e)
334 347
335 348 # read spans in, divide up chunks
336 349 chunks = {}
337 350 for span in merge:
338 351 # we reopen the file for each span to make http happy for now
339 352 f = self.opener(self.datafile)
340 353 f.seek(span[0])
341 354 data = f.read(span[1])
342 355
343 356 # divide up the span
344 357 pos = 0
345 358 for r, l in span[2]:
346 359 chunks[r] = data[pos: pos + l]
347 360 pos += l
348 361
349 362 # helper to reconstruct intermediate versions
350 363 def construct(text, base, rev):
351 364 bins = [decompress(chunks[r]) for r in xrange(base + 1, rev + 1)]
352 365 return mdiff.patches(text, bins)
353 366
354 367 # build deltas
355 368 deltas = []
356 369 for d in xrange(0, len(revs) - 1):
357 370 a, b = revs[d], revs[d + 1]
358 371 n = self.node(b)
359 372
360 373 if a + 1 != b or self.base(b) == b:
361 374 if a >= 0:
362 375 base = self.base(a)
363 376 ta = decompress(chunks[self.base(a)])
364 377 ta = construct(ta, base, a)
365 378 else:
366 379 ta = ""
367 380
368 381 base = self.base(b)
369 382 if a > base:
370 383 base = a
371 384 tb = ta
372 385 else:
373 386 tb = decompress(chunks[self.base(b)])
374 387 tb = construct(tb, base, b)
375 388 d = self.diff(ta, tb)
376 389 else:
377 390 d = decompress(chunks[b])
378 391
379 392 p = self.parents(n)
380 393 meta = n + p[0] + p[1] + linkmap[self.linkrev(n)]
381 394 l = struct.pack(">l", len(meta) + len(d) + 4)
382 395 deltas.append(l + meta + d)
383 396
384 397 l = struct.pack(">l", sum(map(len, deltas)) + 4)
385 398 deltas.insert(0, l)
386 399 return "".join(deltas)
387 400
388 401 def addgroup(self, data, linkmapper, transaction):
389 402 # given a set of deltas, add them to the revision log. the
390 403 # first delta is against its parent, which should be in our
391 404 # log, the rest are against the previous delta.
392 405
393 406 if not data: return self.tip()
394 407
395 408 # retrieve the parent revision of the delta chain
396 409 chain = data[24:44]
397 410 if not chain in self.nodemap:
398 411 raise "unknown base %s" % short(chain[:4])
399 412
400 413 # track the base of the current delta log
401 414 r = self.count()
402 415 t = r - 1
403 416
404 417 base = prev = -1
405 418 start = end = 0
406 419 if r:
407 420 start = self.start(self.base(t))
408 421 end = self.end(t)
409 422 measure = self.length(self.base(t))
410 423 base = self.base(t)
411 424 prev = self.tip()
412 425
413 426 transaction.add(self.datafile, end)
414 427 transaction.add(self.indexfile, r * struct.calcsize(indexformat))
415 428 dfh = self.opener(self.datafile, "a")
416 429 ifh = self.opener(self.indexfile, "a")
417 430
418 431 # loop through our set of deltas
419 432 pos = 0
420 433 while pos < len(data):
421 434 l, node, p1, p2, cs = struct.unpack(">l20s20s20s20s",
422 435 data[pos:pos+84])
423 436 link = linkmapper(cs)
424 437 if node in self.nodemap:
425 438 raise "already have %s" % hex(node[:4])
426 439 delta = data[pos + 84:pos + l]
427 440 pos += l
428 441
429 442 # full versions are inserted when the needed deltas become
430 443 # comparable to the uncompressed text or when the previous
431 444 # version is not the one we have a delta against. We use
432 445 # the size of the previous full rev as a proxy for the
433 446 # current size.
434 447
435 448 if chain == prev:
436 449 cdelta = compress(delta)
437 450
438 451 if chain != prev or (end - start + len(cdelta)) > measure * 2:
439 452 # flush our writes here so we can read it in revision
440 453 dfh.flush()
441 454 ifh.flush()
442 455 text = self.revision(chain)
443 456 text = self.patches(text, [delta])
444 457 chk = self.addrevision(text, transaction, link, p1, p2)
445 458 if chk != node:
446 459 raise "consistency error adding group"
447 460 measure = len(text)
448 461 else:
449 462 e = (end, len(cdelta), self.base(t), link, p1, p2, node)
450 463 self.index.append(e)
451 464 self.nodemap[node] = r
452 465 dfh.write(cdelta)
453 466 ifh.write(struct.pack(indexformat, *e))
454 467
455 468 t, r, chain, prev = r, r + 1, node, node
456 469 start = self.start(self.base(t))
457 470 end = self.end(t)
458 471
459 472 dfh.close()
460 473 ifh.close()
461 474 return node
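
For reference, the delta-group stream built by group() and consumed by addgroup() frames each entry as a 4-byte big-endian length that counts itself, an 80-byte header (node, p1, p2, and the linked changeset node), and the delta payload; the first delta applies against a parent the receiver already has, each later one against the previous entry. A hedged reader sketch for that framing (parse_group is an illustrative name; data is assumed to start at the first chunk, as addgroup() receives it):

    import struct

    def parse_group(data: bytes):
        # ">l20s20s20s20s" is the same 84-byte chunk header layout
        # that addgroup() unpacks above.
        pos = 0
        while pos < len(data):
            l, node, p1, p2, cs = struct.unpack(">l20s20s20s20s",
                                                data[pos:pos + 84])
            # The length field covers itself and the header, so the
            # delta is whatever remains of the chunk.
            yield node, p1, p2, cs, data[pos + 84:pos + l]
            pos += l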