revlog: allow duplicates...
mpm@selenic.com
r301:5add718d default
@@ -1,510 +1,513 @@
# revlog.py - storage back-end for mercurial
#
# This provides efficient delta storage with O(1) retrieve and append
# and O(changes) merge between branches
#
# Copyright 2005 Matt Mackall <mpm@selenic.com>
#
# This software may be used and distributed according to the terms
# of the GNU General Public License, incorporated herein by reference.

import zlib, struct, sha, binascii, heapq
from mercurial import mdiff

def hex(node): return binascii.hexlify(node)
def bin(node): return binascii.unhexlify(node)
def short(node): return hex(node[:4])

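# revision data is zlib-compressed unless that would not help: short or
# incompressible text is stored raw, flagged by a leading 'u', and text
# that already starts with a NUL byte is passed through unchanged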
def compress(text):
    if not text: return text
    if len(text) < 44:
        if text[0] == '\0': return text
        return 'u' + text
    bin = zlib.compress(text)
    if len(bin) > len(text):
        if text[0] == '\0': return text
        return 'u' + text
    return bin

def decompress(bin):
    if not bin: return bin
    t = bin[0]
    if t == '\0': return bin
    if t == 'x': return zlib.decompress(bin)
    if t == 'u': return bin[1:]
    raise ValueError("unknown compression type %s" % t)

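# a node id is the SHA-1 of both parents (sorted, so parent order does
# not matter) followed by the revision text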
def hash(text, p1, p2):
    l = [p1, p2]
    l.sort()
    return sha.sha(l[0] + l[1] + text).digest()

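# each index entry is fixed-size: four 32-bit big-endian integers
# (offset, size, base, linkrev) followed by three 20-byte hashes
# (p1, p2, nodeid)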
nullid = "\0" * 20
indexformat = ">4l20s20s20s"

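# the lazy classes below let a big index be decoded on demand, in
# 1000-entry blocks, rather than parsing every entry at open time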
class lazyparser:
    def __init__(self, data):
        self.data = data
        self.s = struct.calcsize(indexformat)
        self.l = len(data)/self.s
        self.index = [None] * self.l
        self.map = {nullid: -1}

    def load(self, pos):
        block = pos / 1000
        i = block * 1000
        end = min(self.l, i + 1000)
        while i < end:
            d = self.data[i * self.s: (i + 1) * self.s]
            e = struct.unpack(indexformat, d)
            self.index[i] = e
            self.map[e[6]] = i
            i += 1

class lazyindex:
    def __init__(self, parser):
        self.p = parser
    def __len__(self):
        return len(self.p.index)
    def load(self, pos):
        self.p.load(pos)
        return self.p.index[pos]
    def __getitem__(self, pos):
        return self.p.index[pos] or self.load(pos)
    def append(self, e):
        self.p.index.append(e)

class lazymap:
    def __init__(self, parser):
        self.p = parser
    def load(self, key):
        n = self.p.data.find(key)
        if n < 0: raise KeyError("node " + hex(key))
        pos = n / self.p.s
        self.p.load(pos)
    def __contains__(self, key):
        try:
            self[key]
            return True
        except KeyError:
            return False
    def __iter__(self):
        for i in xrange(self.p.l):
            try:
                yield self.p.index[i][6]
            except TypeError: # entry not parsed yet
                self.p.load(i)
                yield self.p.index[i][6]
    def __getitem__(self, key):
        try:
            return self.p.map[key]
        except KeyError:
            try:
                self.load(key)
                return self.p.map[key]
            except KeyError:
                raise KeyError("node " + hex(key))
    def __setitem__(self, key, val):
        self.p.map[key] = val

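# the revlog itself: an append-only data file holding chains of
# compressed deltas, plus an index file of fixed-size entries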
class revlog:
    def __init__(self, opener, indexfile, datafile):
        self.indexfile = indexfile
        self.datafile = datafile
        self.opener = opener
        self.cache = None

        try:
            i = self.opener(self.indexfile).read()
        except IOError:
            i = ""

        if len(i) > 10000:
            # big index, let's parse it on demand
            parser = lazyparser(i)
            self.index = lazyindex(parser)
            self.nodemap = lazymap(parser)
        else:
            s = struct.calcsize(indexformat)
            l = len(i) / s
            self.index = [None] * l
            m = [None] * l

            n = 0
            for f in xrange(0, len(i), s):
                # offset, size, base, linkrev, p1, p2, nodeid
                e = struct.unpack(indexformat, i[f:f + s])
                m[n] = (e[6], n)
                self.index[n] = e
                n += 1

            self.nodemap = dict(m)
            self.nodemap[nullid] = -1

    def tip(self): return self.node(len(self.index) - 1)
    def count(self): return len(self.index)
    def node(self, rev): return (rev < 0) and nullid or self.index[rev][6]
    def rev(self, node): return self.nodemap[node]
    def linkrev(self, node): return self.index[self.nodemap[node]][3]
    def parents(self, node):
        if node == nullid: return (nullid, nullid)
        return self.index[self.nodemap[node]][4:6]

    def start(self, rev): return self.index[rev][0]
    def length(self, rev): return self.index[rev][1]
    def end(self, rev): return self.start(rev) + self.length(rev)
    def base(self, rev): return self.index[rev][2]

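    # a head is any node that is not the parent of some other node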
    def heads(self):
        p = {}
        h = []
        for r in range(self.count() - 1, -1, -1):
            n = self.node(r)
            if n not in p:
                h.append(n)
            for pn in self.parents(n):
                p[pn] = 1
        return h

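    # resolve an identifier: either a revision number or a substring
    # that matches exactly one node's hex form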
    def lookup(self, id):
        try:
            rev = int(id)
            return self.node(rev)
        except ValueError:
            c = []
            for n in self.nodemap:
                if id in hex(n):
                    c.append(n)
            if len(c) > 1: raise KeyError("Ambiguous identifier")
            if len(c) < 1: raise KeyError("No match found")
            return c[0]

        return None

    def diff(self, a, b):
        return mdiff.textdiff(a, b)

    def patches(self, t, pl):
        return mdiff.patches(t, pl)

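    # return the delta that reconstructs a node from its predecessor:
    # the stored delta if there is one, or a freshly computed diff when
    # the node is stored as a full version (its own base)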
    def delta(self, node):
        r = self.rev(node)
        b = self.base(r)
        if r == b:
            return self.diff(self.revision(self.node(r - 1)),
                             self.revision(node))
        else:
            f = self.opener(self.datafile)
            f.seek(self.start(r))
            data = f.read(self.length(r))
            return decompress(data)

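    # reconstruct a revision's text: read the base version and its delta
    # chain in one sweep, apply the patches, then verify the result
    # against the stored hash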
    def revision(self, node):
        if node == nullid: return ""
        if self.cache and self.cache[0] == node: return self.cache[2]

        text = None
        rev = self.rev(node)
        start, length, base, link, p1, p2, node = self.index[rev]
        end = start + length
        if base != rev: start = self.start(base)

        if self.cache and self.cache[1] >= base and self.cache[1] < rev:
            base = self.cache[1]
            start = self.start(base + 1)
            text = self.cache[2]
            last = 0

        f = self.opener(self.datafile)
        f.seek(start)
        data = f.read(end - start)

        if not text:
            last = self.length(base)
            text = decompress(data[:last])

        bins = []
        for r in xrange(base + 1, rev + 1):
            s = self.length(r)
            bins.append(decompress(data[last:last + s]))
            last = last + s

        text = mdiff.patches(text, bins)

        if node != hash(text, p1, p2):
            raise IOError("integrity check failed on %s:%d"
                          % (self.datafile, rev))

        self.cache = (node, rev, text)
        return text

    def addrevision(self, text, transaction, link, p1=None, p2=None):
        if text is None: text = ""
        if p1 is None: p1 = self.tip()
        if p2 is None: p2 = nullid

        node = hash(text, p1, p2)

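        # the change in this revision: a node we already have may be
        # offered again (e.g. the same change made on two branches), so
        # treat a duplicate add as a no-op and return the existing node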
        if node in self.nodemap:
            return node

        n = self.count()
        t = n - 1

        if n:
            base = self.base(t)
            start = self.start(base)
            end = self.end(t)
            prev = self.revision(self.tip())
            d = self.diff(prev, text)
            data = compress(d)
            dist = end - start + len(data)

        # full versions are inserted when the needed deltas
        # become comparable to the uncompressed text
        if not n or dist > len(text) * 2:
            data = compress(text)
            base = n
        else:
            base = self.base(t)

        offset = 0
        if t >= 0:
            offset = self.end(t)

        e = (offset, len(data), base, link, p1, p2, node)

        self.index.append(e)
        self.nodemap[node] = n
        entry = struct.pack(indexformat, *e)

        transaction.add(self.datafile, e[0])
        self.opener(self.datafile, "a").write(data)
        transaction.add(self.indexfile, n * len(entry))
        self.opener(self.indexfile, "a").write(entry)

        self.cache = (node, n, text)
        return node

    def ancestor(self, a, b):
        # calculate the distance of every node from root
        dist = {nullid: 0}
        for i in xrange(self.count()):
            n = self.node(i)
            p1, p2 = self.parents(n)
            dist[n] = max(dist[p1], dist[p2]) + 1

        # traverse ancestors in order of decreasing distance from root
        def ancestors(node):
            # we store negative distances because heap returns smallest member
            h = [(-dist[node], node)]
            seen = {}
            earliest = self.count()
            while h:
                d, n = heapq.heappop(h)
                r = self.rev(n)
                if n not in seen:
                    seen[n] = 1
                    yield (-d, n)
                    for p in self.parents(n):
                        heapq.heappush(h, (-dist[p], p))

        x = ancestors(a)
        y = ancestors(b)
        lx = x.next()
        ly = y.next()

        # increment each ancestor list until it is closer to root than
        # the other, or they match
        while 1:
            if lx == ly:
                return lx[1]
            elif lx < ly:
                ly = y.next()
            elif lx > ly:
                lx = x.next()

    def group(self, linkmap):
        # given a list of changeset revs, return a set of deltas and
        # metadata corresponding to nodes. the first delta is
        # parent(nodes[0]) -> nodes[0] the receiver is guaranteed to
        # have this parent as it has all history before these
        # changesets. parent is parent[0]

        revs = []
        needed = {}

        # find file nodes/revs that match changeset revs
        for i in xrange(0, self.count()):
            if self.index[i][3] in linkmap:
                revs.append(i)
                needed[i] = 1

        # if we don't have any revisions touched by these changesets, bail
        if not revs:
            yield struct.pack(">l", 0)
            return

        # add the parent of the first rev
        p = self.parents(self.node(revs[0]))[0]
        revs.insert(0, self.rev(p))

        # for each delta that isn't contiguous in the log, we need to
        # reconstruct the base, reconstruct the result, and then
        # calculate the delta. We also need to do this where we've
        # stored a full version and not a delta
        for i in xrange(0, len(revs) - 1):
            a, b = revs[i], revs[i + 1]
            if a + 1 != b or self.base(b) == b:
                for j in xrange(self.base(a), a + 1):
                    needed[j] = 1
                for j in xrange(self.base(b), b + 1):
                    needed[j] = 1

        # calculate spans to retrieve from datafile
        needed = needed.keys()
        needed.sort()
        spans = []
        oo = -1
        ol = 0
        for n in needed:
            if n < 0: continue
            o = self.start(n)
            l = self.length(n)
            if oo + ol == o: # can we merge with the previous?
                nl = spans[-1][2]
                nl.append((n, l))
                ol += l
                spans[-1] = (oo, ol, nl)
            else:
                oo = o
                ol = l
                spans.append((oo, ol, [(n, l)]))

        # read spans in, divide up chunks
        chunks = {}
        for span in spans:
            # we reopen the file for each span to make http happy for now
            f = self.opener(self.datafile)
            f.seek(span[0])
            data = f.read(span[1])

            # divide up the span
            pos = 0
            for r, l in span[2]:
                chunks[r] = decompress(data[pos: pos + l])
                pos += l

        # helper to reconstruct intermediate versions
        def construct(text, base, rev):
            bins = [chunks[r] for r in xrange(base + 1, rev + 1)]
            return mdiff.patches(text, bins)

        # build deltas
        deltas = []
        for d in xrange(0, len(revs) - 1):
            a, b = revs[d], revs[d + 1]
            n = self.node(b)

            # do we need to construct a new delta?
            if a + 1 != b or self.base(b) == b:
                if a >= 0:
                    base = self.base(a)
                    ta = chunks[self.base(a)]
                    ta = construct(ta, base, a)
                else:
                    ta = ""

                base = self.base(b)
                if a > base:
                    base = a
                    tb = ta
                else:
                    tb = chunks[self.base(b)]
                tb = construct(tb, base, b)
                d = self.diff(ta, tb)
            else:
                d = chunks[b]

            p = self.parents(n)
            meta = n + p[0] + p[1] + linkmap[self.linkrev(n)]
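            # the length prefix counts itself, hence the extra 4 bytes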
            l = struct.pack(">l", len(meta) + len(d) + 4)
            yield l
            yield meta
            yield d

        yield struct.pack(">l", 0)

    def addgroup(self, revs, linkmapper, transaction, unique=0):
        # given a set of deltas, add them to the revision log. the
        # first delta is against its parent, which should be in our
        # log, the rest are against the previous delta.

        # track the base of the current delta log
        r = self.count()
        t = r - 1
        node = nullid

        base = prev = -1
        start = end = 0
        if r:
            start = self.start(self.base(t))
            end = self.end(t)
            measure = self.length(self.base(t))
            base = self.base(t)
            prev = self.tip()

        transaction.add(self.datafile, end)
        transaction.add(self.indexfile, r * struct.calcsize(indexformat))
        dfh = self.opener(self.datafile, "a")
        ifh = self.opener(self.indexfile, "a")

        # loop through our set of deltas
        chain = None
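        # each chunk carries 80 bytes of metadata (node, p1, p2,
        # changeset link) followed by the delta payload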
        for chunk in revs:
            node, p1, p2, cs = struct.unpack("20s20s20s20s", chunk[:80])
            link = linkmapper(cs)
            if node in self.nodemap:
                # this can happen if two branches make the same change
                if unique:
                    raise KeyError("already have %s" % short(node))
                continue
            delta = chunk[80:]

            if not chain:
                # retrieve the parent revision of the delta chain
                chain = p1
                if chain not in self.nodemap:
                    raise KeyError("unknown base %s" % short(chain))

            # full versions are inserted when the needed deltas become
            # comparable to the uncompressed text or when the previous
            # version is not the one we have a delta against. We use
            # the size of the previous full rev as a proxy for the
            # current size.

            if chain == prev:
                cdelta = compress(delta)

            if chain != prev or (end - start + len(cdelta)) > measure * 2:
                # flush our writes here so we can read it in revision
                dfh.flush()
                ifh.flush()
                text = self.revision(chain)
                text = self.patches(text, [delta])
                chk = self.addrevision(text, transaction, link, p1, p2)
                if chk != node:
                    raise IOError("consistency error adding group")
                measure = len(text)
            else:
                e = (end, len(cdelta), self.base(t), link, p1, p2, node)
                self.index.append(e)
                self.nodemap[node] = r
                dfh.write(cdelta)
                ifh.write(struct.pack(indexformat, *e))

            t, r, chain, prev = r, r + 1, node, node
            start = self.start(self.base(t))
            end = self.end(t)

        dfh.close()
        ifh.close()
        return node
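
A minimal sketch of the behavior this change enables. The `opener` and
`tx` names below are hypothetical stand-ins for a file opener and an
open transaction, not part of this file: adding a revision whose
(text, p1, p2) hash already exists is now a no-op that returns the
existing node instead of appending a second index entry.

    # hypothetical usage sketch; 'opener' and 'tx' are assumed stand-ins
    log = revlog(opener, "foo.i", "foo.d")
    n1 = log.addrevision("hello\n", tx, 0)   # p1 defaults to tip(), nullid when empty
    n2 = log.addrevision("hello\n", tx, 0, p1=nullid, p2=nullid)
    assert n1 == n2   # the duplicate add returns the node we already have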