##// END OF EJS Templates
treemanifest: make diff() faster...
Martin von Zweigbergk -
r24404:96cccf1e default
parent child Browse files
Show More
@@ -1,643 +1,651 b''
1 1 # manifest.py - manifest revision class for mercurial
2 2 #
3 3 # Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 from i18n import _
9 9 import mdiff, parsers, error, revlog, util, scmutil
10 10 import array, struct
11 11
12 12 propertycache = util.propertycache
13 13
14 14 class _lazymanifest(dict):
15 15 """This is the pure implementation of lazymanifest.
16 16
17 17 It has not been optimized *at all* and is not lazy.
18 18 """
19 19
20 20 def __init__(self, data):
21 21 # This init method does a little bit of excessive-looking
22 22 # precondition checking. This is so that the behavior of this
23 23 # class exactly matches its C counterpart to try and help
24 24 # prevent surprise breakage for anyone that develops against
25 25 # the pure version.
26 26 if data and data[-1] != '\n':
27 27 raise ValueError('Manifest did not end in a newline.')
28 28 dict.__init__(self)
29 29 prev = None
30 30 for l in data.splitlines():
31 31 if prev is not None and prev > l:
32 32 raise ValueError('Manifest lines not in sorted order.')
33 33 prev = l
34 34 f, n = l.split('\0')
35 35 if len(n) > 40:
36 36 self[f] = revlog.bin(n[:40]), n[40:]
37 37 else:
38 38 self[f] = revlog.bin(n), ''
39 39
40 40 def __setitem__(self, k, v):
41 41 node, flag = v
42 42 assert node is not None
43 43 if len(node) > 21:
44 44 node = node[:21] # match c implementation behavior
45 45 dict.__setitem__(self, k, (node, flag))
46 46
47 47 def __iter__(self):
48 48 return iter(sorted(dict.keys(self)))
49 49
50 50 def iterkeys(self):
51 51 return iter(sorted(dict.keys(self)))
52 52
53 53 def iterentries(self):
54 54 return ((f, e[0], e[1]) for f, e in sorted(self.iteritems()))
55 55
56 56 def copy(self):
57 57 c = _lazymanifest('')
58 58 c.update(self)
59 59 return c
60 60
61 61 def diff(self, m2, clean=False):
62 62 '''Finds changes between the current manifest and m2.'''
63 63 diff = {}
64 64
65 65 for fn, e1 in self.iteritems():
66 66 if fn not in m2:
67 67 diff[fn] = e1, (None, '')
68 68 else:
69 69 e2 = m2[fn]
70 70 if e1 != e2:
71 71 diff[fn] = e1, e2
72 72 elif clean:
73 73 diff[fn] = None
74 74
75 75 for fn, e2 in m2.iteritems():
76 76 if fn not in self:
77 77 diff[fn] = (None, ''), e2
78 78
79 79 return diff
80 80
81 81 def filtercopy(self, filterfn):
82 82 c = _lazymanifest('')
83 83 for f, n, fl in self.iterentries():
84 84 if filterfn(f):
85 85 c[f] = n, fl
86 86 return c
87 87
88 88 def text(self):
89 89 """Get the full data of this manifest as a bytestring."""
90 90 fl = sorted(self.iterentries())
91 91
92 92 _hex = revlog.hex
93 93 # if this is changed to support newlines in filenames,
94 94 # be sure to check the templates/ dir again (especially *-raw.tmpl)
95 95 return ''.join("%s\0%s%s\n" % (
96 96 f, _hex(n[:20]), flag) for f, n, flag in fl)
97 97
98 98 try:
99 99 _lazymanifest = parsers.lazymanifest
100 100 except AttributeError:
101 101 pass
102 102
103 103 class manifestdict(object):
104 104 def __init__(self, data=''):
105 105 self._lm = _lazymanifest(data)
106 106
107 107 def __getitem__(self, key):
108 108 return self._lm[key][0]
109 109
110 110 def find(self, key):
111 111 return self._lm[key]
112 112
113 113 def __len__(self):
114 114 return len(self._lm)
115 115
116 116 def __setitem__(self, key, node):
117 117 self._lm[key] = node, self.flags(key, '')
118 118
119 119 def __contains__(self, key):
120 120 return key in self._lm
121 121
122 122 def __delitem__(self, key):
123 123 del self._lm[key]
124 124
125 125 def __iter__(self):
126 126 return self._lm.__iter__()
127 127
128 128 def iterkeys(self):
129 129 return self._lm.iterkeys()
130 130
131 131 def keys(self):
132 132 return list(self.iterkeys())
133 133
134 134 def intersectfiles(self, files):
135 135 '''make a new lazymanifest with the intersection of self with files
136 136
137 137 The algorithm assumes that files is much smaller than self.'''
138 138 ret = manifestdict()
139 139 lm = self._lm
140 140 for fn in files:
141 141 if fn in lm:
142 142 ret._lm[fn] = self._lm[fn]
143 143 return ret
144 144
145 145 def filesnotin(self, m2):
146 146 '''Set of files in this manifest that are not in the other'''
147 147 files = set(self)
148 148 files.difference_update(m2)
149 149 return files
150 150
151 151 @propertycache
152 152 def _dirs(self):
153 153 return scmutil.dirs(self)
154 154
155 155 def dirs(self):
156 156 return self._dirs
157 157
158 158 def hasdir(self, dir):
159 159 return dir in self._dirs
160 160
161 161 def matches(self, match):
162 162 '''generate a new manifest filtered by the match argument'''
163 163 if match.always():
164 164 return self.copy()
165 165
166 166 files = match.files()
167 167 if (len(files) < 100 and (match.matchfn == match.exact or
168 168 (not match.anypats() and util.all(fn in self for fn in files)))):
169 169 return self.intersectfiles(files)
170 170
171 171 lm = manifestdict('')
172 172 lm._lm = self._lm.filtercopy(match)
173 173 return lm
174 174
175 175 def diff(self, m2, clean=False):
176 176 '''Finds changes between the current manifest and m2.
177 177
178 178 Args:
179 179 m2: the manifest to which this manifest should be compared.
180 180 clean: if true, include files unchanged between these manifests
181 181 with a None value in the returned dictionary.
182 182
183 183 The result is returned as a dict with filename as key and
184 184 values of the form ((n1,fl1),(n2,fl2)), where n1/n2 is the
185 185 nodeid in the current/other manifest and fl1/fl2 is the flag
186 186 in the current/other manifest. Where the file does not exist,
187 187 the nodeid will be None and the flags will be the empty
188 188 string.
189 189 '''
190 190 return self._lm.diff(m2._lm, clean)
191 191
192 192 def setflag(self, key, flag):
193 193 self._lm[key] = self[key], flag
194 194
195 195 def get(self, key, default=None):
196 196 try:
197 197 return self._lm[key][0]
198 198 except KeyError:
199 199 return default
200 200
201 201 def flags(self, key, default=''):
202 202 try:
203 203 return self._lm[key][1]
204 204 except KeyError:
205 205 return default
206 206
207 207 def copy(self):
208 208 c = manifestdict('')
209 209 c._lm = self._lm.copy()
210 210 return c
211 211
212 212 def iteritems(self):
213 213 return (x[:2] for x in self._lm.iterentries())
214 214
215 215 def text(self):
216 216 return self._lm.text()
217 217
218 218 def fastdelta(self, base, changes):
219 219 """Given a base manifest text as an array.array and a list of changes
220 220 relative to that text, compute a delta that can be used by revlog.
221 221 """
222 222 delta = []
223 223 dstart = None
224 224 dend = None
225 225 dline = [""]
226 226 start = 0
227 227 # zero copy representation of base as a buffer
228 228 addbuf = util.buffer(base)
229 229
230 230 # start with a readonly loop that finds the offset of
231 231 # each line and creates the deltas
232 232 for f, todelete in changes:
233 233 # bs will either be the index of the item or the insert point
234 234 start, end = _msearch(addbuf, f, start)
235 235 if not todelete:
236 236 h, fl = self._lm[f]
237 237 l = "%s\0%s%s\n" % (f, revlog.hex(h), fl)
238 238 else:
239 239 if start == end:
240 240 # item we want to delete was not found, error out
241 241 raise AssertionError(
242 242 _("failed to remove %s from manifest") % f)
243 243 l = ""
244 244 if dstart is not None and dstart <= start and dend >= start:
245 245 if dend < end:
246 246 dend = end
247 247 if l:
248 248 dline.append(l)
249 249 else:
250 250 if dstart is not None:
251 251 delta.append([dstart, dend, "".join(dline)])
252 252 dstart = start
253 253 dend = end
254 254 dline = [l]
255 255
256 256 if dstart is not None:
257 257 delta.append([dstart, dend, "".join(dline)])
258 258 # apply the delta to the base, and get a delta for addrevision
259 259 deltatext, arraytext = _addlistdelta(base, delta)
260 260 return arraytext, deltatext
261 261
262 262 def _msearch(m, s, lo=0, hi=None):
263 263 '''return a tuple (start, end) that says where to find s within m.
264 264
265 265 If the string is found m[start:end] are the line containing
266 266 that string. If start == end the string was not found and
267 267 they indicate the proper sorted insertion point.
268 268
269 269 m should be a buffer or a string
270 270 s is a string'''
271 271 def advance(i, c):
272 272 while i < lenm and m[i] != c:
273 273 i += 1
274 274 return i
275 275 if not s:
276 276 return (lo, lo)
277 277 lenm = len(m)
278 278 if not hi:
279 279 hi = lenm
280 280 while lo < hi:
281 281 mid = (lo + hi) // 2
282 282 start = mid
283 283 while start > 0 and m[start - 1] != '\n':
284 284 start -= 1
285 285 end = advance(start, '\0')
286 286 if m[start:end] < s:
287 287 # we know that after the null there are 40 bytes of sha1
288 288 # this translates to the bisect lo = mid + 1
289 289 lo = advance(end + 40, '\n') + 1
290 290 else:
291 291 # this translates to the bisect hi = mid
292 292 hi = start
293 293 end = advance(lo, '\0')
294 294 found = m[lo:end]
295 295 if s == found:
296 296 # we know that after the null there are 40 bytes of sha1
297 297 end = advance(end + 40, '\n')
298 298 return (lo, end + 1)
299 299 else:
300 300 return (lo, lo)
301 301
302 302 def _checkforbidden(l):
303 303 """Check filenames for illegal characters."""
304 304 for f in l:
305 305 if '\n' in f or '\r' in f:
306 306 raise error.RevlogError(
307 307 _("'\\n' and '\\r' disallowed in filenames: %r") % f)
308 308
309 309
310 310 # apply the changes collected during the bisect loop to our addlist
311 311 # return a delta suitable for addrevision
312 312 def _addlistdelta(addlist, x):
313 313 # for large addlist arrays, building a new array is cheaper
314 314 # than repeatedly modifying the existing one
315 315 currentposition = 0
316 316 newaddlist = array.array('c')
317 317
318 318 for start, end, content in x:
319 319 newaddlist += addlist[currentposition:start]
320 320 if content:
321 321 newaddlist += array.array('c', content)
322 322
323 323 currentposition = end
324 324
325 325 newaddlist += addlist[currentposition:]
326 326
327 327 deltatext = "".join(struct.pack(">lll", start, end, len(content))
328 328 + content for start, end, content in x)
329 329 return deltatext, newaddlist
330 330
331 331 def _splittopdir(f):
332 332 if '/' in f:
333 333 dir, subpath = f.split('/', 1)
334 334 return dir + '/', subpath
335 335 else:
336 336 return '', f
337 337
338 338 class treemanifest(object):
339 339 def __init__(self, dir='', text=''):
340 340 self._dir = dir
341 341 self._dirs = {}
342 342 # Using _lazymanifest here is a little slower than plain old dicts
343 343 self._files = {}
344 344 self._flags = {}
345 345 lm = _lazymanifest(text)
346 346 for f, n, fl in lm.iterentries():
347 347 self[f] = n
348 348 if fl:
349 349 self.setflag(f, fl)
350 350
351 351 def _subpath(self, path):
352 352 return self._dir + path
353 353
354 354 def __len__(self):
355 355 size = len(self._files)
356 356 for m in self._dirs.values():
357 357 size += m.__len__()
358 358 return size
359 359
360 360 def __str__(self):
361 361 return '<treemanifest dir=%s>' % self._dir
362 362
363 363 def iteritems(self):
364 364 for p, n in sorted(self._dirs.items() + self._files.items()):
365 365 if p in self._files:
366 366 yield self._subpath(p), n
367 367 else:
368 368 for f, sn in n.iteritems():
369 369 yield f, sn
370 370
371 371 def iterkeys(self):
372 372 for p in sorted(self._dirs.keys() + self._files.keys()):
373 373 if p in self._files:
374 374 yield self._subpath(p)
375 375 else:
376 376 for f in self._dirs[p].iterkeys():
377 377 yield f
378 378
379 379 def keys(self):
380 380 return list(self.iterkeys())
381 381
382 382 def __iter__(self):
383 383 return self.iterkeys()
384 384
385 385 def __contains__(self, f):
386 386 if f is None:
387 387 return False
388 388 dir, subpath = _splittopdir(f)
389 389 if dir:
390 390 if dir not in self._dirs:
391 391 return False
392 392 return self._dirs[dir].__contains__(subpath)
393 393 else:
394 394 return f in self._files
395 395
396 396 def get(self, f, default=None):
397 397 dir, subpath = _splittopdir(f)
398 398 if dir:
399 399 if dir not in self._dirs:
400 400 return default
401 401 return self._dirs[dir].get(subpath, default)
402 402 else:
403 403 return self._files.get(f, default)
404 404
405 405 def __getitem__(self, f):
406 406 dir, subpath = _splittopdir(f)
407 407 if dir:
408 408 return self._dirs[dir].__getitem__(subpath)
409 409 else:
410 410 return self._files[f]
411 411
412 412 def flags(self, f):
413 413 dir, subpath = _splittopdir(f)
414 414 if dir:
415 415 if dir not in self._dirs:
416 416 return ''
417 417 return self._dirs[dir].flags(subpath)
418 418 else:
419 419 if f in self._dirs:
420 420 return ''
421 421 return self._flags.get(f, '')
422 422
423 423 def find(self, f):
424 424 dir, subpath = _splittopdir(f)
425 425 if dir:
426 426 return self._dirs[dir].find(subpath)
427 427 else:
428 428 return self._files[f], self._flags.get(f, '')
429 429
430 430 def __delitem__(self, f):
431 431 dir, subpath = _splittopdir(f)
432 432 if dir:
433 433 self._dirs[dir].__delitem__(subpath)
434 434 # If the directory is now empty, remove it
435 435 if not self._dirs[dir]._dirs and not self._dirs[dir]._files:
436 436 del self._dirs[dir]
437 437 else:
438 438 del self._files[f]
439 439 if f in self._flags:
440 440 del self._flags[f]
441 441
442 442 def __setitem__(self, f, n):
443 443 assert n is not None
444 444 dir, subpath = _splittopdir(f)
445 445 if dir:
446 446 if dir not in self._dirs:
447 447 self._dirs[dir] = treemanifest(self._subpath(dir))
448 448 self._dirs[dir].__setitem__(subpath, n)
449 449 else:
450 450 self._files[f] = n
451 451
452 452 def setflag(self, f, flags):
453 453 """Set the flags (symlink, executable) for path f."""
454 454 dir, subpath = _splittopdir(f)
455 455 if dir:
456 456 if dir not in self._dirs:
457 457 self._dirs[dir] = treemanifest(self._subpath(dir))
458 458 self._dirs[dir].setflag(subpath, flags)
459 459 else:
460 460 self._flags[f] = flags
461 461
462 462 def copy(self):
463 463 copy = treemanifest(self._dir)
464 464 for d in self._dirs:
465 465 copy._dirs[d] = self._dirs[d].copy()
466 466 copy._files = dict.copy(self._files)
467 467 copy._flags = dict.copy(self._flags)
468 468 return copy
469 469
470 470 def intersectfiles(self, files):
471 471 '''make a new treemanifest with the intersection of self with files
472 472
473 473 The algorithm assumes that files is much smaller than self.'''
474 474 ret = treemanifest()
475 475 for fn in files:
476 476 if fn in self:
477 477 ret[fn] = self[fn]
478 478 flags = self.flags(fn)
479 479 if flags:
480 480 ret.setflag(fn, flags)
481 481 return ret
482 482
483 483 def filesnotin(self, m2):
484 484 '''Set of files in this manifest that are not in the other'''
485 485 files = set(self.iterkeys())
486 486 files.difference_update(m2.iterkeys())
487 487 return files
488 488
489 489 @propertycache
490 490 def _alldirs(self):
491 491 return scmutil.dirs(self)
492 492
493 493 def dirs(self):
494 494 return self._alldirs
495 495
496 496 def hasdir(self, dir):
497 497 return dir in self._alldirs
498 498
499 499 def matches(self, match):
500 500 '''generate a new manifest filtered by the match argument'''
501 501 if match.always():
502 502 return self.copy()
503 503
504 504 files = match.files()
505 505 if (match.matchfn == match.exact or
506 506 (not match.anypats() and util.all(fn in self for fn in files))):
507 507 return self.intersectfiles(files)
508 508
509 509 m = self.copy()
510 510 for fn in m.keys():
511 511 if not match(fn):
512 512 del m[fn]
513 513 return m
514 514
515 515 def diff(self, m2, clean=False):
516 516 '''Finds changes between the current manifest and m2.
517 517
518 518 Args:
519 519 m2: the manifest to which this manifest should be compared.
520 520 clean: if true, include files unchanged between these manifests
521 521 with a None value in the returned dictionary.
522 522
523 523 The result is returned as a dict with filename as key and
524 524 values of the form ((n1,fl1),(n2,fl2)), where n1/n2 is the
525 525 nodeid in the current/other manifest and fl1/fl2 is the flag
526 526 in the current/other manifest. Where the file does not exist,
527 527 the nodeid will be None and the flags will be the empty
528 528 string.
529 529 '''
530 diff = {}
530 result = {}
531 emptytree = treemanifest()
532 def _diff(t1, t2):
533 for d, m1 in t1._dirs.iteritems():
534 m2 = t2._dirs.get(d, emptytree)
535 _diff(m1, m2)
536
537 for d, m2 in t2._dirs.iteritems():
538 if d not in t1._dirs:
539 _diff(emptytree, m2)
531 540
532 for fn, n1 in self.iteritems():
533 fl1 = self.flags(fn)
534 n2 = m2.get(fn, None)
535 fl2 = m2.flags(fn)
536 if n2 is None:
537 fl2 = ''
538 if n1 != n2 or fl1 != fl2:
539 diff[fn] = ((n1, fl1), (n2, fl2))
540 elif clean:
541 diff[fn] = None
541 for fn, n1 in t1._files.iteritems():
542 fl1 = t1._flags.get(fn, '')
543 n2 = t2._files.get(fn, None)
544 fl2 = t2._flags.get(fn, '')
545 if n1 != n2 or fl1 != fl2:
546 result[t1._subpath(fn)] = ((n1, fl1), (n2, fl2))
547 elif clean:
548 result[t1._subpath(fn)] = None
542 549
543 for fn, n2 in m2.iteritems():
544 if fn not in self:
545 fl2 = m2.flags(fn)
546 diff[fn] = ((None, ''), (n2, fl2))
550 for fn, n2 in t2._files.iteritems():
551 if fn not in t1._files:
552 fl2 = t2._flags.get(fn, '')
553 result[t2._subpath(fn)] = ((None, ''), (n2, fl2))
547 554
548 return diff
555 _diff(self, m2)
556 return result
549 557
550 558 def text(self):
551 559 """Get the full data of this manifest as a bytestring."""
552 560 fl = self.keys()
553 561 _checkforbidden(fl)
554 562
555 563 hex, flags = revlog.hex, self.flags
556 564 # if this is changed to support newlines in filenames,
557 565 # be sure to check the templates/ dir again (especially *-raw.tmpl)
558 566 return ''.join("%s\0%s%s\n" % (f, hex(self[f]), flags(f)) for f in fl)
559 567
560 568 class manifest(revlog.revlog):
561 569 def __init__(self, opener):
562 570 # During normal operations, we expect to deal with not more than four
563 571 # revs at a time (such as during commit --amend). When rebasing large
564 572 # stacks of commits, the number can go up, hence the config knob below.
565 573 cachesize = 4
566 574 usetreemanifest = False
567 575 opts = getattr(opener, 'options', None)
568 576 if opts is not None:
569 577 cachesize = opts.get('manifestcachesize', cachesize)
570 578 usetreemanifest = opts.get('usetreemanifest', usetreemanifest)
571 579 self._mancache = util.lrucachedict(cachesize)
572 580 revlog.revlog.__init__(self, opener, "00manifest.i")
573 581 self._usetreemanifest = usetreemanifest
574 582
575 583 def _newmanifest(self, data=''):
576 584 if self._usetreemanifest:
577 585 return treemanifest('', data)
578 586 return manifestdict(data)
579 587
580 588 def readdelta(self, node):
581 589 r = self.rev(node)
582 590 d = mdiff.patchtext(self.revdiff(self.deltaparent(r), r))
583 591 return self._newmanifest(d)
584 592
585 593 def readfast(self, node):
586 594 '''use the faster of readdelta or read'''
587 595 r = self.rev(node)
588 596 deltaparent = self.deltaparent(r)
589 597 if deltaparent != revlog.nullrev and deltaparent in self.parentrevs(r):
590 598 return self.readdelta(node)
591 599 return self.read(node)
592 600
593 601 def read(self, node):
594 602 if node == revlog.nullid:
595 603 return self._newmanifest() # don't upset local cache
596 604 if node in self._mancache:
597 605 return self._mancache[node][0]
598 606 text = self.revision(node)
599 607 arraytext = array.array('c', text)
600 608 m = self._newmanifest(text)
601 609 self._mancache[node] = (m, arraytext)
602 610 return m
603 611
604 612 def find(self, node, f):
605 613 '''look up entry for a single file efficiently.
606 614 return (node, flags) pair if found, (None, None) if not.'''
607 615 m = self.read(node)
608 616 try:
609 617 return m.find(f)
610 618 except KeyError:
611 619 return None, None
612 620
613 621 def add(self, m, transaction, link, p1, p2, added, removed):
614 622 if p1 in self._mancache and not self._usetreemanifest:
615 623 # If our first parent is in the manifest cache, we can
616 624 # compute a delta here using properties we know about the
617 625 # manifest up-front, which may save time later for the
618 626 # revlog layer.
619 627
620 628 _checkforbidden(added)
621 629 # combine the changed lists into one list for sorting
622 630 work = [(x, False) for x in added]
623 631 work.extend((x, True) for x in removed)
624 632 # this could use heapq.merge() (from Python 2.6+) or equivalent
625 633 # since the lists are already sorted
626 634 work.sort()
627 635
628 636 arraytext, deltatext = m.fastdelta(self._mancache[p1][1], work)
629 637 cachedelta = self.rev(p1), deltatext
630 638 text = util.buffer(arraytext)
631 639 else:
632 640 # The first parent manifest isn't already loaded, so we'll
633 641 # just encode a fulltext of the manifest and pass that
634 642 # through to the revlog layer, and let it handle the delta
635 643 # process.
636 644 text = m.text()
637 645 arraytext = array.array('c', text)
638 646 cachedelta = None
639 647
640 648 n = self.addrevision(text, transaction, link, p1, p2, cachedelta)
641 649 self._mancache[n] = (m, arraytext)
642 650
643 651 return n
General Comments 0
You need to be logged in to leave comments. Login now