Add mdiff.patches to speed up applying thousands of patches to the manifest
mpm@selenic.com
r71:47c9a869 default
mdiff.py
@@ -1,75 +1,144 @@
1 1 #!/usr/bin/python
2 import difflib, struct
3 from cStringIO import StringIO
2 import difflib, struct, mmap
3
4 devzero = file("/dev/zero")
4 5
5 6 def unidiff(a, ad, b, bd, fn):
6 7 if not a and not b: return ""
7 8 a = a.splitlines(1)
8 9 b = b.splitlines(1)
9 10 l = list(difflib.unified_diff(a, b, "a/" + fn, "b/" + fn, ad, bd))
10 11 return "".join(l)
11 12
12 13 def textdiff(a, b):
13 14 return diff(a.splitlines(1), b.splitlines(1))
14 15
15 16 def sortdiff(a, b):
16 17 la = lb = 0
17 18
18 19 while 1:
19 20 if la >= len(a) or lb >= len(b): break
20 21 if b[lb] < a[la]:
21 22 si = lb
22 23 while lb < len(b) and b[lb] < a[la] : lb += 1
23 24 yield "insert", la, la, si, lb
24 25 elif a[la] < b[lb]:
25 26 si = la
26 27 while la < len(a) and a[la] < b[lb]: la += 1
27 28 yield "delete", si, la, lb, lb
28 29 else:
29 30 la += 1
30 31 lb += 1
31 32
32 33 if lb < len(b):
33 34 yield "insert", la, la, lb, len(b)
34 35
35 36 if la < len(a):
36 37 yield "delete", la, len(a), lb, lb
37 38
38 39 def diff(a, b, sorted=0):
39 40 bin = []
40 41 p = [0]
41 42 for i in a: p.append(p[-1] + len(i))
42 43
43 44 if sorted:
44 45 d = sortdiff(a, b)
45 46 else:
46 47 d = difflib.SequenceMatcher(None, a, b).get_opcodes()
47 48
48 49 for o, m, n, s, t in d:
49 50 if o == 'equal': continue
50 51 s = "".join(b[s:t])
51 52 bin.append(struct.pack(">lll", p[m], p[n], len(s)) + s)
52 53
53 54 return "".join(bin)
54 55
56 # This attempts to apply a series of patches in time proportional to
57 # the total size of the patches, rather than patches * len(text). This
58 # means rather than shuffling strings around, we shuffle around
59 # pointers to fragments with fragment lists.
60 #
61 # When the fragment lists get too long, we collapse them. To do this
62 # efficiently, we do all our operations inside a buffer created by
63 # mmap and simply use memmove. This avoids creating a bunch of large
64 # temporary string buffers.
65
66 def patches(a, bins):
67 if not bins: return a
68
69 plens = [len(x) for x in bins]
70 pl = sum(plens)
71 bl = len(a) + pl
72 tl = bl + bl + pl # enough for the patches and two working texts
73 b1, b2 = 0, bl
74 m = mmap.mmap(devzero.fileno(), tl, mmap.MAP_PRIVATE)
75
76 # load our original text
77 m.write(a)
78 frags = [(len(a), b1)]
79
80 # copy all the patches into our segment so we can memmove from them
81 pos = b2 + bl
82 m.seek(pos)
83 for p in bins: m.write(p)
84
85 def pull(dst, src, l): # pull l bytes from src
86 while l:
87 f = src.pop(0)
88 if f[0] > l: # do we need to split?
89 src.insert(0, (f[0] - l, f[1] + l))
90 dst.append((l, f[1]))
91 return
92 dst.append(f)
93 l -= f[0]
94
95 def collect(buf, list):
96 start = buf
97 for l, p in list:
98 m.move(buf, p, l)
99 buf += l
100 return (buf - start, start)
101
102 for plen in plens:
103 # if our list gets too long, execute it
104 if len(frags) > 128:
105 b2, b1 = b1, b2
106 frags = [collect(b1, frags)]
107
108 new = []
109 end = pos + plen
110 last = 0
111 while pos < end:
112 p1, p2, l = struct.unpack(">lll", m[pos:pos + 12])
113 pull(new, frags, p1 - last) # what didn't change
114 pull([], frags, p2 - p1) # what got deleted
115 new.append((l, pos + 12)) # what got added
116 pos += l + 12
117 last = p2
118 frags = new + frags # what was left at the end
119
120 t = collect(b2, frags)
121
122 return m[t[1]:t[1] + t[0]]
123
55 124 def patch(a, bin):
56 125 last = pos = 0
57 126 r = []
58 127
59 128 c = 0
60 129 while pos < len(bin):
61 130 p1, p2, l = struct.unpack(">lll", bin[pos:pos + 12])
62 131 pos += 12
63 132 r.append(a[last:p1])
64 133 r.append(bin[pos:pos + l])
65 134 pos += l
66 135 last = p2
67 136 c += 1
68 137 r.append(a[last:])
69 138
70 139 return "".join(r)
71 140
72 141
73 142
74 143
75 144
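The hunk above is the heart of the change. A delta produced by diff() is a flat sequence of records, each a big-endian (start, end, length) header packed with ">lll" followed by `length` bytes of replacement data, meaning text[start:end] is replaced by that data. patch() applies one such delta by rebuilding the whole string; the new patches() applies a whole chain while only shuffling (length, offset) fragment pairs, collapsing the fragment list through an mmap'd scratch buffer once it grows past 128 entries. Below is a minimal, self-contained sketch of the same fragment-list idea using ordinary Python lists in place of the mmap buffer; apply_patches is a hypothetical stand-in for mdiff.patches and trades the memmove trick for readability.

import struct

def apply_patches(text, bins):
    # represent the working text as (source, offset, length) fragments
    frags = [(text, 0, len(text))]

    def pull(dst, src, l):
        # move l bytes worth of fragments from the head of src to dst
        while l:
            s, o, fl = src.pop(0)
            if fl > l:  # fragment is bigger than we need: split it
                src.insert(0, (s, o + l, fl - l))
                dst.append((s, o, l))
                return
            dst.append((s, o, fl))
            l -= fl

    for bin in bins:
        new, pos, last = [], 0, 0
        while pos < len(bin):
            p1, p2, l = struct.unpack(">lll", bin[pos:pos + 12])
            pull(new, frags, p1 - last)     # keep what didn't change
            pull([], frags, p2 - p1)        # drop what got deleted
            new.append((bin, pos + 12, l))  # splice in what got added
            pos += 12 + l
            last = p2
        frags = new + frags                 # whatever is left is the tail

    return "".join(s[o:o + fl] for s, o, fl in frags)

# a delta replacing text[2:4] with "XYZ", then one deleting the
# first two bytes of the result:
d1 = struct.pack(">lll", 2, 4, 3) + "XYZ"
d2 = struct.pack(">lll", 0, 2, 0)
assert apply_patches("abcdefghij", [d1, d2]) == "XYZefghij"

Note how applying each delta touches only the fragments in the changed regions; the full text is materialized exactly once, at the end, which is what makes a chain of thousands of tiny manifest patches cheap.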
revlog.py
@@ -1,412 +1,409 @@
1 1 # revlog.py - storage back-end for mercurial
2 2 #
3 3 # This provides efficient delta storage with O(1) retrieve and append
4 4 # and O(changes) merge between branches
5 5 #
6 6 # Copyright 2005 Matt Mackall <mpm@selenic.com>
7 7 #
8 8 # This software may be used and distributed according to the terms
9 9 # of the GNU General Public License, incorporated herein by reference.
10 10
11 11 import zlib, struct, sha, os, tempfile, binascii
12 12 from mercurial import mdiff
13 13
14 14 def hex(node): return binascii.hexlify(node)
15 15 def bin(node): return binascii.unhexlify(node)
16 16
17 17 def compress(text):
18 18 return zlib.compress(text)
19 19
20 20 def decompress(bin):
21 21 return zlib.decompress(bin)
22 22
23 23 def hash(text, p1, p2):
24 24 l = [p1, p2]
25 25 l.sort()
26 26 return sha.sha(l[0] + l[1] + text).digest()
27 27
28 28 nullid = "\0" * 20
29 29 indexformat = ">4l20s20s20s"
30 30
31 31 class revlog:
32 32 def __init__(self, opener, indexfile, datafile):
33 33 self.indexfile = indexfile
34 34 self.datafile = datafile
35 35 self.index = []
36 36 self.opener = opener
37 37 self.cache = None
38 38 self.nodemap = {nullid: -1}
39 39 # read the whole index for now, handle on-demand later
40 40 try:
41 41 n = 0
42 42 i = self.opener(self.indexfile).read()
43 43 s = struct.calcsize(indexformat)
44 for f in range(0, len(i), s):
44 for f in xrange(0, len(i), s):
45 45 # offset, size, base, linkrev, p1, p2, nodeid
46 46 e = struct.unpack(indexformat, i[f:f + s])
47 47 self.nodemap[e[6]] = n
48 48 self.index.append(e)
49 49 n += 1
50 50 except IOError: pass
51 51
52 52 def tip(self): return self.node(len(self.index) - 1)
53 53 def count(self): return len(self.index)
54 54 def node(self, rev): return (rev < 0) and nullid or self.index[rev][6]
55 55 def rev(self, node): return self.nodemap[node]
56 56 def linkrev(self, node): return self.index[self.nodemap[node]][3]
57 57 def parents(self, node):
58 58 if node == nullid: return (nullid, nullid)
59 59 return self.index[self.nodemap[node]][4:6]
60 60
61 61 def start(self, rev): return self.index[rev][0]
62 62 def length(self, rev): return self.index[rev][1]
63 63 def end(self, rev): return self.start(rev) + self.length(rev)
64 64 def base(self, rev): return self.index[rev][2]
65 65
66 66 def lookup(self, id):
67 67 try:
68 68 rev = int(id)
69 69 return self.node(rev)
70 70 except ValueError:
71 71 c = []
72 72 for n in self.nodemap:
73 73 if id in hex(n):
74 74 c.append(n)
75 75 if len(c) > 1: raise KeyError("Ambiguous identifier")
76 76 if len(c) < 1: raise KeyError("No match found")
77 77 return c[0]
78 78
79 79 return None
80 80
81 81 def revisions(self, list):
82 82 # this can be optimized to do spans, etc
83 83 # be stupid for now
84 84 for node in list:
85 85 yield self.revision(node)
86 86
87 87 def diff(self, a, b):
88 88 return mdiff.textdiff(a, b)
89 89
90 def patch(self, text, patch):
91 return mdiff.patch(text, patch)
92
93 90 def revision(self, node):
94 91 if node == nullid: return ""
95 92 if self.cache and self.cache[0] == node: return self.cache[2]
96 93
97 94 text = None
98 95 rev = self.rev(node)
99 96 base = self.base(rev)
100 97 start = self.start(base)
101 98 end = self.end(rev)
102 99
103 100 if self.cache and self.cache[1] >= base and self.cache[1] < rev:
104 101 base = self.cache[1]
105 102 start = self.start(base + 1)
106 103 text = self.cache[2]
107 104 last = 0
108 105
109 106 f = self.opener(self.datafile)
110 107 f.seek(start)
111 108 data = f.read(end - start)
112 109
113 110 if not text:
114 111 last = self.length(base)
115 112 text = decompress(data[:last])
116 113
114 bins = []
117 115 for r in xrange(base + 1, rev + 1):
118 116 s = self.length(r)
119 b = decompress(data[last:last + s])
120 text = self.patch(text, b)
117 bins.append(decompress(data[last:last + s]))
121 118 last = last + s
122 119
120 text = mdiff.patches(text, bins)
121
123 122 (p1, p2) = self.parents(node)
124 123 if node != hash(text, p1, p2):
125 124 raise "integrity check failed on %s:%d" % (self.datafile, rev)
126 125
127 126 self.cache = (node, rev, text)
128 127 return text
129 128
130 129 def addrevision(self, text, transaction, link, p1=None, p2=None):
131 130 if text is None: text = ""
132 131 if p1 is None: p1 = self.tip()
133 132 if p2 is None: p2 = nullid
134 133
135 134 node = hash(text, p1, p2)
136 135
137 136 n = self.count()
138 137 t = n - 1
139 138
140 139 if n:
141 140 base = self.base(t)
142 141 start = self.start(base)
143 142 end = self.end(t)
144 143 prev = self.revision(self.tip())
145 144 data = compress(self.diff(prev, text))
146 145 dist = end - start + len(data)
147 146
148 147 # full versions are inserted when the needed deltas
149 148 # become comparable to the uncompressed text
150 149 if not n or dist > len(text) * 2:
151 150 data = compress(text)
152 151 base = n
153 152 else:
154 153 base = self.base(t)
155 154
156 155 offset = 0
157 156 if t >= 0:
158 157 offset = self.end(t)
159 158
160 159 e = (offset, len(data), base, link, p1, p2, node)
161 160
162 161 self.index.append(e)
163 162 self.nodemap[node] = n
164 163 entry = struct.pack(indexformat, *e)
165 164
166 165 transaction.add(self.datafile, e[0])
167 166 self.opener(self.datafile, "a").write(data)
168 167 transaction.add(self.indexfile, n * len(entry))
169 168 self.opener(self.indexfile, "a").write(entry)
170 169
171 170 self.cache = (node, n, text)
172 171 return node
173 172
174 173 def ancestor(self, a, b):
175 174 def expand(list, map):
176 175 a = []
177 176 while list:
178 177 n = list.pop(0)
179 178 map[n] = 1
180 179 yield n
181 180 for p in self.parents(n):
182 181 if p != nullid and p not in map:
183 182 list.append(p)
184 183 yield nullid
185 184
186 185 amap = {}
187 186 bmap = {}
188 187 ag = expand([a], amap)
189 188 bg = expand([b], bmap)
190 189 adone = bdone = 0
191 190
192 191 while not adone or not bdone:
193 192 if not adone:
194 193 an = ag.next()
195 194 if an == nullid:
196 195 adone = 1
197 196 elif an in bmap:
198 197 return an
199 198 if not bdone:
200 199 bn = bg.next()
201 200 if bn == nullid:
202 201 bdone = 1
203 202 elif bn in amap:
204 203 return bn
205 204
206 205 return nullid
207 206
208 207 def mergedag(self, other, transaction, linkseq, accumulate = None):
209 208 """combine the nodes from other's DAG into ours"""
210 209 old = self.tip()
211 210 i = self.count()
212 211 l = []
213 212
214 213 # merge the other revision log into our DAG
215 214 for r in range(other.count()):
216 215 id = other.node(r)
217 216 if id not in self.nodemap:
218 217 (xn, yn) = other.parents(id)
219 218 l.append((id, xn, yn))
220 219 self.nodemap[id] = i
221 220 i += 1
222 221
223 222 # merge node data for new nodes
224 223 r = other.revisions([e[0] for e in l])
225 224 for e in l:
226 225 t = r.next()
227 226 if accumulate: accumulate(t)
228 227 self.addrevision(t, transaction, linkseq.next(), e[1], e[2])
229 228
230 229 # return the unmerged heads for later resolving
231 230 return (old, self.tip())
232 231
233 232 def group(self, linkmap):
234 233 # given a list of changeset revs, return a set of deltas and
235 234 # metadata corresponding to nodes. the first delta is
236 235 # parent(nodes[0]) -> nodes[0]; the receiver is guaranteed to
237 236 # have this parent as it has all history before these
238 237 # changesets. parent is parent[0]
239 238
240 239 revs = []
241 240 needed = {}
242 241
243 242 # find file nodes/revs that match changeset revs
244 243 for i in xrange(0, self.count()):
245 244 if self.index[i][3] in linkmap:
246 245 revs.append(i)
247 246 needed[i] = 1
248 247
249 248 # if we don't have any revisions touched by these changesets, bail
250 249 if not revs: return struct.pack(">l", 0)
251 250
252 251 # add the parent of the first rev
253 252 p = self.parents(self.node(revs[0]))[0]
254 253 revs.insert(0, self.rev(p))
255 254
256 255 # for each delta that isn't contiguous in the log, we need to
257 256 # reconstruct the base, reconstruct the result, and then
258 257 # calculate the delta. We also need to do this where we've
259 258 # stored a full version and not a delta
260 259 for i in xrange(0, len(revs) - 1):
261 260 a, b = revs[i], revs[i + 1]
262 261 if a + 1 != b or self.base(b) == b:
263 262 for j in xrange(self.base(a), a + 1):
264 263 needed[j] = 1
265 264 for j in xrange(self.base(b), b + 1):
266 265 needed[j] = 1
267 266
268 267 # calculate spans to retrieve from datafile
269 268 needed = needed.keys()
270 269 needed.sort()
271 270 spans = []
272 271 for n in needed:
273 272 if n < 0: continue
274 273 o = self.start(n)
275 274 l = self.length(n)
276 275 spans.append((o, l, [(n, l)]))
277 276
278 277 # merge spans
279 278 merge = [spans.pop(0)]
280 279 while spans:
281 280 e = spans.pop(0)
282 281 f = merge[-1]
283 282 if e[0] == f[0] + f[1]:
284 283 merge[-1] = (f[0], f[1] + e[1], f[2] + e[2])
285 284 else:
286 285 merge.append(e)
287 286
288 287 # read spans in, divide up chunks
289 288 chunks = {}
290 289 for span in merge:
291 290 # we reopen the file for each span to make http happy for now
292 291 f = self.opener(self.datafile)
293 292 f.seek(span[0])
294 293 data = f.read(span[1])
295 294
296 295 # divide up the span
297 296 pos = 0
298 297 for r, l in span[2]:
299 298 chunks[r] = data[pos: pos + l]
300 299 pos += l
301 300
302 301 # helper to reconstruct intermediate versions
303 302 def construct(text, base, rev):
304 for r in range(base + 1, rev + 1):
305 b = decompress(chunks[r])
306 text = self.patch(text, b)
307 return text
303 bins = [decompress(chunks[r]) for r in xrange(base + 1, rev + 1)]
304 return mdiff.patches(text, bins)
308 305
309 306 # build deltas
310 307 deltas = []
311 for d in range(0, len(revs) - 1):
308 for d in xrange(0, len(revs) - 1):
312 309 a, b = revs[d], revs[d + 1]
313 310 n = self.node(b)
314 311
315 312 if a + 1 != b or self.base(b) == b:
316 313 if a >= 0:
317 314 base = self.base(a)
318 315 ta = decompress(chunks[self.base(a)])
319 316 ta = construct(ta, base, a)
320 317 else:
321 318 ta = ""
322 319
323 320 base = self.base(b)
324 321 if a > base:
325 322 base = a
326 323 tb = ta
327 324 else:
328 325 tb = decompress(chunks[self.base(b)])
329 326 tb = construct(tb, base, b)
330 327 d = self.diff(ta, tb)
331 328 else:
332 329 d = decompress(chunks[b])
333 330
334 331 p = self.parents(n)
335 332 meta = n + p[0] + p[1] + linkmap[self.linkrev(n)]
336 333 l = struct.pack(">l", len(meta) + len(d) + 4)
337 334 deltas.append(l + meta + d)
338 335
339 336 l = struct.pack(">l", sum(map(len, deltas)) + 4)
340 337 deltas.insert(0, l)
341 338 return "".join(deltas)
342 339
343 340 def addgroup(self, data, linkmapper, transaction):
344 341 # given a set of deltas, add them to the revision log. the
345 342 # first delta is against its parent, which should be in our
346 343 # log, the rest are against the previous delta.
347 344
348 345 if not data: return self.tip()
349 346
350 347 # retrieve the parent revision of the delta chain
351 348 chain = data[24:44]
352 349
353 350 # track the base of the current delta log
354 351 r = self.count()
355 352 t = r - 1
356 353
357 354 base = prev = -1
358 355 start = end = 0
359 356 if r:
360 357 start = self.start(self.base(t))
361 358 end = self.end(t)
362 359 measure = self.length(self.base(t))
363 360 base = self.base(t)
364 361 prev = self.tip()
365 362
366 363 transaction.add(self.datafile, end)
367 364 transaction.add(self.indexfile, r * struct.calcsize(indexformat))
368 365 dfh = self.opener(self.datafile, "a")
369 366 ifh = self.opener(self.indexfile, "a")
370 367
371 368 # loop through our set of deltas
372 369 pos = 0
373 370 while pos < len(data):
374 371 l, node, p1, p2, cs = struct.unpack(">l20s20s20s20s",
375 372 data[pos:pos+84])
376 373 link = linkmapper(cs)
377 374 delta = data[pos + 84:pos + l]
378 375 pos += l
379 376
380 377 # full versions are inserted when the needed deltas become
381 378 # comparable to the uncompressed text or when the previous
382 379 # version is not the one we have a delta against. We use
383 380 # the size of the previous full rev as a proxy for the
384 381 # current size.
385 382
386 383 if chain == prev:
387 384 cdelta = compress(delta)
388 385
389 386 if chain != prev or (end - start + len(cdelta)) > measure * 2:
390 387 # flush our writes here so we can read it in revision
391 388 dfh.flush()
392 389 ifh.flush()
393 390 text = self.revision(chain)
394 391 text = self.patch(text, delta)
395 392 chk = self.addrevision(text, transaction, link, p1, p2)
396 393 if chk != node:
397 394 raise "consistency error adding group"
398 395 measure = len(text)
399 396 else:
400 397 e = (end, len(cdelta), self.base(t), link, p1, p2, node)
401 398 self.index.append(e)
402 399 self.nodemap[node] = r
403 400 dfh.write(cdelta)
404 401 ifh.write(struct.pack(indexformat, *e))
405 402
406 403 t, r, chain, prev = r, r + 1, node, node
407 404 start = self.start(self.base(t))
408 405 end = self.end(t)
409 406
410 407 dfh.close()
411 408 ifh.close()
412 409 return node
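In revlog.revision() above, the win comes from batching: the old loop called self.patch(text, b) once per delta, and since mdiff.patch rebuilds the entire string on every call, reconstructing a revision behind k deltas cost O(k * len(text)). The new code decompresses the whole chain into bins and makes a single mdiff.patches(text, bins) call, whose cost is roughly proportional to the total size of the deltas. A side-by-side sketch (patch is copied from the mdiff hunk above; revision_slow and revision_fast are illustrative names, not functions in this commit):

import struct
from mercurial import mdiff  # the module patched above; assumes it is importable

def patch(a, bin):
    # mdiff.patch: splice one binary delta into a, building a
    # brand-new string each time it is called
    last = pos = 0
    r = []
    while pos < len(bin):
        p1, p2, l = struct.unpack(">lll", bin[pos:pos + 12])
        pos += 12
        r.append(a[last:p1])
        r.append(bin[pos:pos + l])
        pos += l
        last = p2
    r.append(a[last:])
    return "".join(r)

def revision_slow(text, bins):   # O(len(bins) * len(text)): the removed loop
    for b in bins:
        text = patch(text, b)
    return text

def revision_fast(text, bins):   # roughly O(total size of bins): the new path
    return mdiff.patches(text, bins)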
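A second thing worth noting in both addrevision() and addgroup() above: the delta chain is cut off with a full compressed snapshot once reconstructing a revision would mean reading more than about twice the text's size, which is what keeps the chains that revision() and mdiff.patches walk short. A sketch of just that decision, with hypothetical names:

def should_store_snapshot(start, end, cdelta_len, text_len):
    # dist is everything that must be read and patched to rebuild
    # this revision: the existing chain from its base (end - start)
    # plus the new compressed delta
    dist = end - start + cdelta_len
    # a full version costs about len(text) to store and no patching
    # to read back, so cut the chain once it costs more than twice that
    return dist > text_len * 2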
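Finally, the group()/addgroup() pair above frames each delta for the wire as a 4-byte big-endian total length, 80 bytes of metadata (the node, its two parents, and the linked changeset id, 20 bytes each), then the raw delta, with one extra length word prefixed to the whole stream. A sketch of that framing only, with hypothetical helper names:

import struct

def frame(node, p1, p2, cs, delta):
    # the 4-byte length counts itself, the 80 metadata bytes and the delta
    meta = node + p1 + p2 + cs
    return struct.pack(">l", len(meta) + len(delta) + 4) + meta + delta

def unframe(data, pos):
    # the inverse walk, as performed by addgroup() above
    l, node, p1, p2, cs = struct.unpack(">l20s20s20s20s", data[pos:pos + 84])
    delta = data[pos + 84:pos + l]
    return pos + l, node, p1, p2, cs, delta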