##// END OF EJS Templates
bdiff: improve worst case behavior by 100x....
bdiff: improve worst case behavior by 100x. on 5.8MB (244.000 lines) text file with similar lines, hash before this change made diff against empty file take 75 seconds. this change improves performance to 0.6 seconds. result is that clone of smallish repo (137MB) with some files like this takes 1 minute instead of 10 minutes. common case of diff is 10% slower now, probably because of worse cache locality. but diff does not affect overall performance in common case (less than 1% of runtime is in diff when it is working ok), so this tradeoff looks good.

File last commit:

r2470:fe168927 default
r2577:fa76c5d6 default
Show More
manifest.py
188 lines | 6.9 KiB | text/x-python | PythonLexer
mpm@selenic.com
Break apart hg.py...
r1089 # manifest.py - manifest revision class for mercurial
#
# Copyright 2005 Matt Mackall <mpm@selenic.com>
#
# This software may be used and distributed according to the terms
# of the GNU General Public License, incorporated herein by reference.
from revlog import *
Benoit Boissinot
i18n first part: make '_' available for files who need it
r1400 from i18n import gettext as _
mpm@selenic.com
Break apart hg.py...
r1089 from demandload import *
Vadim Gelfer
use demandload more.
r2470 demandload(globals(), "array bisect struct")
mpm@selenic.com
Break apart hg.py...
r1089
class manifest(revlog):
Thomas Arendsen Hein
Replaced 0 with REVLOGV0 where this meaning is used.
r2142 def __init__(self, opener, defversion=REVLOGV0):
mpm@selenic.com
Break apart hg.py...
r1089 self.mapcache = None
self.listcache = None
mason@suse.com
Implement revlogng....
r2072 revlog.__init__(self, opener, "00manifest.i", "00manifest.d",
defversion)
mpm@selenic.com
Break apart hg.py...
r1089
def read(self, node):
if node == nullid: return {} # don't upset local cache
if self.mapcache and self.mapcache[0] == node:
return self.mapcache[1]
text = self.revision(node)
map = {}
flag = {}
mason@suse.com
Optimize manifest.add...
r1534 self.listcache = array.array('c', text)
lines = text.splitlines(1)
for l in lines:
mpm@selenic.com
Break apart hg.py...
r1089 (f, n) = l.split('\0')
map[f] = bin(n[:40])
flag[f] = (n[40:-1] == "x")
self.mapcache = (node, map, flag)
return map
def readflags(self, node):
if node == nullid: return {} # don't upset local cache
if not self.mapcache or self.mapcache[0] != node:
self.read(node)
return self.mapcache[2]
mason@suse.com
Optimize manifest.add...
r1534 def diff(self, a, b):
return mdiff.textdiff(str(a), str(b))
Vadim Gelfer
fix parsing of tags. make parse errors useful. add new tag tests....
r2320 def _search(self, m, s, lo=0, hi=None):
'''return a tuple (start, end) that says where to find s within m.
If the string is found m[start:end] are the line containing
that string. If start == end the string was not found and
they indicate the proper sorted insertion point. This was
taken from bisect_left, and modified to find line start/end as
it goes along.
m should be a buffer or a string
s is a string'''
def advance(i, c):
while i < lenm and m[i] != c:
i += 1
return i
lenm = len(m)
if not hi:
hi = lenm
while lo < hi:
mid = (lo + hi) // 2
start = mid
while start > 0 and m[start-1] != '\n':
start -= 1
end = advance(start, '\0')
if m[start:end] < s:
# we know that after the null there are 40 bytes of sha1
# this translates to the bisect lo = mid + 1
lo = advance(end + 40, '\n') + 1
else:
# this translates to the bisect hi = mid
hi = start
end = advance(lo, '\0')
found = m[lo:end]
if cmp(s, found) == 0:
# we know that after the null there are 40 bytes of sha1
end = advance(end + 40, '\n')
return (lo, end+1)
else:
return (lo, lo)
def find(self, node, f):
'''look up entry for a single file efficiently.
return (node, flag) pair if found, (None, None) if not.'''
if self.mapcache and node == self.mapcache[0]:
return self.mapcache[1].get(f), self.mapcache[2].get(f)
text = self.revision(node)
start, end = self._search(text, f)
if start == end:
return None, None
l = text[start:end]
f, n = l.split('\0')
return bin(n[:40]), n[40:-1] == 'x'
mpm@selenic.com
Break apart hg.py...
r1089 def add(self, map, flags, transaction, link, p1=None, p2=None,
changed=None):
# apply the changes collected during the bisect loop to our addlist
mason@suse.com
Optimize manifest.add...
r1534 # return a delta suitable for addrevision
def addlistdelta(addlist, x):
# start from the bottom up
mpm@selenic.com
Break apart hg.py...
r1089 # so changes to the offsets don't mess things up.
mason@suse.com
Optimize manifest.add...
r1534 i = len(x)
mpm@selenic.com
Break apart hg.py...
r1089 while i > 0:
i -= 1
mason@suse.com
Optimize manifest.add...
r1534 start = x[i][0]
end = x[i][1]
if x[i][2]:
addlist[start:end] = array.array('c', x[i][2])
mpm@selenic.com
Break apart hg.py...
r1089 else:
del addlist[start:end]
mason@suse.com
Optimize manifest.add...
r1534 return "".join([struct.pack(">lll", d[0], d[1], len(d[2])) + d[2] \
for d in x ])
mpm@selenic.com
Break apart hg.py...
r1089
# if we're using the listcache, make sure it is valid and
# parented by the same node we're diffing against
if not changed or not self.listcache or not p1 or \
self.mapcache[0] != p1:
files = map.keys()
files.sort()
Matt Mackall
Fix comment syntax
r1651 # if this is changed to support newlines in filenames,
# be sure to check the templates/ dir again (especially *-raw.tmpl)
mason@suse.com
Optimize manifest.add...
r1534 text = ["%s\000%s%s\n" %
mpm@selenic.com
Break apart hg.py...
r1089 (f, hex(map[f]), flags[f] and "x" or '')
for f in files]
mason@suse.com
Optimize manifest.add...
r1534 self.listcache = array.array('c', "".join(text))
mpm@selenic.com
Break apart hg.py...
r1089 cachedelta = None
else:
mason@suse.com
Optimize manifest.add...
r1534 addlist = self.listcache
mpm@selenic.com
Break apart hg.py...
r1089
# combine the changed lists into one list for sorting
work = [[x, 0] for x in changed[0]]
work[len(work):] = [[x, 1] for x in changed[1]]
work.sort()
delta = []
mason@suse.com
Optimize manifest.add...
r1534 dstart = None
dend = None
dline = [""]
start = 0
# zero copy representation of addlist as a buffer
addbuf = buffer(addlist)
mpm@selenic.com
Break apart hg.py...
r1089
mason@suse.com
Optimize manifest.add...
r1534 # start with a readonly loop that finds the offset of
# each line and creates the deltas
mpm@selenic.com
Break apart hg.py...
r1089 for w in work:
f = w[0]
# bs will either be the index of the item or the insert point
Vadim Gelfer
fix parsing of tags. make parse errors useful. add new tag tests....
r2320 start, end = self._search(addbuf, f, start)
mpm@selenic.com
Break apart hg.py...
r1089 if w[1] == 0:
l = "%s\000%s%s\n" % (f, hex(map[f]),
flags[f] and "x" or '')
else:
mason@suse.com
Optimize manifest.add...
r1534 l = ""
if start == end and w[1] == 1:
# item we want to delete was not found, error out
raise AssertionError(
Benoit Boissinot
i18n part2: use '_' for all strings who are part of the user interface
r1402 _("failed to remove %s from manifest\n") % f)
mason@suse.com
Optimize manifest.add...
r1534 if dstart != None and dstart <= start and dend >= start:
if dend < end:
dend = end
if l:
dline.append(l)
mpm@selenic.com
Break apart hg.py...
r1089 else:
mason@suse.com
Optimize manifest.add...
r1534 if dstart != None:
delta.append([dstart, dend, "".join(dline)])
dstart = start
dend = end
dline = [l]
mpm@selenic.com
Break apart hg.py...
r1089
mason@suse.com
Optimize manifest.add...
r1534 if dstart != None:
delta.append([dstart, dend, "".join(dline)])
# apply the delta to the addlist, and get a delta for addrevision
cachedelta = addlistdelta(addlist, delta)
mpm@selenic.com
Break apart hg.py...
r1089
mason@suse.com
Optimize manifest.add...
r1534 # the delta is only valid if we've been processing the tip revision
if self.mapcache[0] != self.tip():
cachedelta = None
self.listcache = addlist
n = self.addrevision(buffer(self.listcache), transaction, link, p1, \
p2, cachedelta)
mpm@selenic.com
Break apart hg.py...
r1089 self.mapcache = (n, map, flags)
return n