upstream/mercurial-mirror Files · mercurial/manifest.py

mdiff: speed up showfunc for large diffs...

mdiff: speed up showfunc for large diffs This addresses the following issues with showfunc: - Silly usage of regular expressions. - Doing str.rstrip() needlessly in an inner loop. - Doing catastrophic backtracking when trying to find a function line. Finding function text is now at worst O(n lines in the old file), and at best close to O(n hunks). Given a diff like this[1]: src/main/antlr3/uk/ac/cam/ch/wwmm/pregenerated/ChemicalChunker.g | 4 +- src/main/java/uk/ac/cam/ch/wwmm/pregenerated/ChemicalChunkerLexer.java | 2 +- src/main/java/uk/ac/cam/ch/wwmm/pregenerated/ChemicalChunkerParser.java | 29189 +++++---- 3 files changed, 14741 insertions(+), 14454 deletions(-) [1]: https://bitbucket.org/wwmm/chemicaltagger/changeset/d2bfbaecd4fc/raw Without this change, hg log --stat --config diff.showfunc=1 takes an absurdly long time to complete: CallCount Recursive Total(ms) Inline(ms) module:lineno(function) 32813 0 80.3546 40.6086 mercurial.mdiff:160(yieldhunk) +65062746 0 25.7227 25.7227 +<method 'match' of '_sre.SRE_Pattern' objects> +65062746 0 14.0221 14.0221 +<method 'rstrip' of 'str' objects> +1809 0 0.0009 0.0009 +mercurial.mdiff:148(contextend) +1809 0 0.0003 0.0003 +<len> 65062746 0 25.7227 25.7227 <method 'match' of '_sre.SRE_Pattern' objects> 65062763 0 14.0221 14.0221 <method 'rstrip' of 'str' objects> 543 0 0.1631 0.1631 <zlib.decompress> 3 0 0.0505 0.0505 <mercurial.bdiff.blocks> 31007 0 80.4564 0.0477 mercurial.mdiff:147(_unidiff) +32813 0 80.3546 40.6086 +mercurial.mdiff:160(yieldhunk) +3 0 0.0505 0.0505 +<mercurial.bdiff.blocks> +3618 0 0.0022 0.0022 +mercurial.mdiff:154(contextstart) +5427 0 0.0013 0.0013 +<len> +3 0 0.0001 0.0000 +re:188(compile) 1 0 80.8381 0.0322 mercurial.patch:1777(diffstatdata) +107499 0 0.0235 0.0235 +<method 'startswith' of 'str' objects> +31014 0 80.7820 0.0071 +mercurial.util:1284(iterlines) +3 0 0.0000 0.0000 +<method 'search' of '_sre.SRE_Pattern' objects> +4 0 0.0000 0.0000 +mercurial.patch:1783(addresult) +3 0 0.0000 0.0000 
+<method 'group' of '_sre.SRE_Match' objects> 6 0 0.0444 0.0283 mercurial.mdiff:12(splitnewlines) +6 0 0.0160 0.0160 +<method 'split' of 'str' objects> 32 0 0.0246 0.0246 <method 'update' of '_hashlib.HASH' objects> 11 0 0.0236 0.0236 <method 'read' of 'file' objects> Time: real 80.880 secs (user 80.200+0.000 sys 0.380+0.000) With this change, it's almost as fast as not using showfunc at all: CallCount Recursive Total(ms) Inline(ms) module:lineno(function) 543 0 0.1699 0.1699 <zlib.decompress> 3 0 0.0501 0.0501 <mercurial.bdiff.blocks> 32813 0 0.0415 0.0348 mercurial.mdiff:161(yieldhunk) +70837 0 0.0058 0.0058 +<method 'isalnum' of 'str' objects> +1809 0 0.0006 0.0006 +mercurial.mdiff:148(contextend) +1809 0 0.0002 0.0002 +<len> 1 0 0.4879 0.0310 mercurial.patch:1777(diffstatdata) +107499 0 0.0230 0.0230 +<method 'startswith' of 'str' objects> +31014 0 0.4335 0.0065 +mercurial.util:1284(iterlines) +3 0 0.0000 0.0000 +<method 'search' of '_sre.SRE_Pattern' objects> +4 0 0.0000 0.0000 +mercurial.patch:1783(addresult) +1 0 0.0004 0.0000 +re:188(compile) 32 0 0.0293 0.0293 <method 'update' of '_hashlib.HASH' objects> 6 0 0.0427 0.0279 mercurial.mdiff:12(splitnewlines) +6 0 0.0147 0.0147 +<method 'split' of 'str' objects> 31007 0 0.1169 0.0235 mercurial.mdiff:147(_unidiff) +3 0 0.0501 0.0501 +<mercurial.bdiff.blocks> +32813 0 0.0415 0.0348 +mercurial.mdiff:161(yieldhunk) +3618 0 0.0012 0.0012 +mercurial.mdiff:154(contextstart) +5427 0 0.0006 0.0006 +<len> 107597 0 0.0230 0.0230 <method 'startswith' of 'str' objects> 16 0 0.0213 0.0213 <mercurial.mpatch.patches> 194 0 0.0149 0.0149 <method 'split' of 'str' objects> Time: real 0.530 secs (user 0.450+0.000 sys 0.070+0.000)

Martin Geisler - - Load All Authors

File last commit:

r14632:4819241e default


                r15141:16dc9a32

default

Download file

             manifest.py
        
                    204 lines
            
             | 7.6 KiB
            
                | text/x-python
            
             |
                PythonLexer
            
             / mercurial / manifest.py
          
                    History
                
                 |
                  Annotation
                 | Raw
                 |Copy content
                 |Copy permalink

      # manifest.py - manifest revision class for mercurial

      #

      # Copyright 2005-2007 Matt Mackall <mpm@selenic.com>

      #

      # This software may be used and distributed according to the terms of the

      # GNU General Public License version 2 or any later version.

      from i18n import _

      import mdiff, parsers, error, revlog

      import array, struct

      class manifestdict(dict):
          '''dict mapping filenames to nodes, with per-file flags kept aside

          The flags live in a separate dictionary (self._flags) keyed by
          filename; a file without an entry there has the empty flag string.'''

          def __init__(self, mapping=None, flags=None):
              # create fresh containers per instance instead of sharing
              # mutable default arguments; a flags dict that is passed in is
              # stored by reference, not copied
              if flags is None:
                  flags = {}
              if mapping is None:
                  mapping = {}
              self._flags = flags
              super(manifestdict, self).__init__(mapping)

          def flags(self, f):
              '''return the flag string recorded for f, or "" if there is none'''
              return self._flags.get(f, "")

          def set(self, f, flags):
              '''record flags as the flag string of f'''
              self._flags[f] = flags

          def copy(self):
              '''return a new manifestdict with the same entries and its own
              copy of the flags dictionary'''
              flagscopy = dict.copy(self._flags)
              return manifestdict(self, flagscopy)

      class manifest(revlog.revlog):
          '''revlog holding manifest revisions, stored in "00manifest.i"

          A manifest text is a sequence of lines, each made of a filename, a
          NUL byte, the 40-character hex node, an optional flag string and a
          newline.  add() writes the lines sorted by filename, which is what
          allows _search() to bisect the raw text.

          self._mancache is None or a (node, manifestdict, array) tuple for
          the revision that was most recently read or added.'''

          def __init__(self, opener):
              # nothing cached yet
              self._mancache = None
              revlog.revlog.__init__(self, opener, "00manifest.i")

          def parse(self, lines):
              '''return a new manifestdict parsed from the manifest text lines

              The parsing itself is delegated to parsers.parse_manifest(),
              which is handed the mapping and its flags dictionary to fill.'''
              mfdict = manifestdict()
              parsers.parse_manifest(mfdict, mfdict._flags, lines)
              return mfdict

          def readdelta(self, node):
              '''return the entries found in the delta that produced node

              Only the patch text between node and its delta parent is
              parsed, so this is presumably not the full manifest.'''
              r = self.rev(node)
              return self.parse(mdiff.patchtext(self.revdiff(self.deltaparent(r), r)))

          def readfast(self, node):
              '''use the faster of readdelta or read'''
              r = self.rev(node)
              deltaparent = self.deltaparent(r)
              # the delta is only used when it was computed against one of
              # the revision's parents
              if deltaparent != revlog.nullrev and deltaparent in self.parentrevs(r):
                  return self.readdelta(node)
              return self.read(node)

          def read(self, node):
              '''return the manifestdict for node, caching the result

              On a cache hit the cached mapping itself is returned, not a
              copy.'''
              if node == revlog.nullid:
                  return manifestdict() # don't upset local cache
              if self._mancache and self._mancache[0] == node:
                  return self._mancache[1]
              text = self.revision(node)
              # keep the raw text as a mutable array so that add() can patch it
              arraytext = array.array('c', text)
              mapping = self.parse(text)
              self._mancache = (node, mapping, arraytext)
              return mapping

          def _search(self, m, s, lo=0, hi=None):
              '''return a tuple (start, end) that says where to find s within m.

              If the string is found m[start:end] are the line containing
              that string.  If start == end the string was not found and
              they indicate the proper sorted insertion point.  This was
              taken from bisect_left, and modified to find line start/end as
              it goes along.

              m should be a buffer or a string
              s is a string
              lo and hi bound the bisection like in bisect_left; hi defaults
              to len(m).  An explicit hi of 0 is honored rather than being
              treated as unset.'''
              def advance(i, c):
                  # index of the first c at or after i; stops at len(m) when
                  # there is none (i is returned unchanged if already past it)
                  while i < lenm and m[i] != c:
                      i += 1
                  return i
              if not s:
                  return (lo, lo)
              lenm = len(m)
              if hi is None:
                  hi = lenm
              while lo < hi:
                  mid = (lo + hi) // 2
                  # back up to the beginning of the line containing mid
                  start = mid
                  while start > 0 and m[start - 1] != '\n':
                      start -= 1
                  end = advance(start, '\0')
                  if m[start:end] < s:
                      # we know that after the null there are 40 bytes of sha1
                      # this translates to the bisect lo = mid + 1
                      lo = advance(end + 40, '\n') + 1
                  else:
                      # this translates to the bisect hi = mid
                      hi = start
              end = advance(lo, '\0')
              found = m[lo:end]
              if s == found:
                  # we know that after the null there are 40 bytes of sha1
                  end = advance(end + 40, '\n')
                  return (lo, end + 1)
              else:
                  return (lo, lo)

          def find(self, node, f):
              '''look up entry for a single file efficiently.
              return (node, flags) pair if found, (None, None) if not.

              NOTE: when node is the cached revision, a missing file yields
              (None, "") because the flags come from manifestdict.flags().'''
              if self._mancache and self._mancache[0] == node:
                  return self._mancache[1].get(f), self._mancache[1].flags(f)
              text = self.revision(node)
              start, end = self._search(text, f)
              if start == end:
                  return None, None
              l = text[start:end]
              # the line is: name, NUL, 40 hex characters, flags, newline
              name, n = l.split('\0')
              return revlog.bin(n[:40]), n[40:-1]

          def add(self, map, transaction, link, p1=None, p2=None,
                  changed=None):
              '''add a new manifest revision built from map and return its node

              map must provide sorted iteration over filenames, map[f] (the
              node, hex-encoded here with revlog.hex()) and map.flags(f), as
              manifestdict does.  transaction, link, p1 and p2 are passed on
              to addrevision().  changed, if given, is an (added, removed)
              pair of filename lists.

              When changed is given and the cache holds p1, the cached text is
              patched in place and the resulting delta is handed to
              addrevision(); otherwise the full text is rebuilt from map.

              Raises error.RevlogError if a checked filename contains a
              newline or carriage return (only the added names are checked on
              the cached path), and AssertionError if a file to remove is
              missing from the cached text.'''
              # apply the changes collected during the bisect loop to our addlist
              # return a delta suitable for addrevision
              def addlistdelta(addlist, x):
                  # start from the bottom up
                  # so changes to the offsets don't mess things up.
                  for start, end, content in reversed(x):
                      if content:
                          addlist[start:end] = array.array('c', content)
                      else:
                          del addlist[start:end]
                  # every chunk is a big-endian (start, end, length) header
                  # followed by the replacement text
                  return "".join(struct.pack(">lll", start, end, len(content)) + content
                                 for start, end, content in x)

              def checkforbidden(l):
                  for f in l:
                      if '\n' in f or '\r' in f:
                          raise error.RevlogError(
                              _("'\\n' and '\\r' disallowed in filenames: %r") % f)

              # if we're using the cache, make sure it is valid and
              # parented by the same node we're diffing against
              if not (changed and self._mancache and p1 and self._mancache[0] == p1):
                  files = sorted(map)
                  checkforbidden(files)

                  # if this is changed to support newlines in filenames,
                  # be sure to check the templates/ dir again (especially *-raw.tmpl)
                  # (local aliases are named so they do not shadow the builtin
                  # hex() or the flags method name)
                  tohex, getflags = revlog.hex, map.flags
                  text = ''.join("%s\0%s%s\n" % (f, tohex(map[f]), getflags(f))
                                 for f in files)
                  arraytext = array.array('c', text)
                  cachedelta = None
              else:
                  added, removed = changed
                  addlist = self._mancache[2]

                  checkforbidden(added)
                  # combine the changed lists into one list for sorting
                  work = [(x, False) for x in added]
                  work.extend((x, True) for x in removed)
                  # this could use heapq.merge() (from python2.6+) or equivalent
                  # since the lists are already sorted
                  work.sort()

                  delta = []
                  dstart = None
                  dend = None
                  dline = [""]
                  start = 0
                  # zero copy representation of addlist as a buffer
                  addbuf = buffer(addlist)

                  # start with a readonly loop that finds the offset of
                  # each line and creates the deltas
                  for f, todelete in work:
                      # start will either be the index of the item or the
                      # insert point; searching resumes from the previous
                      # position since work is sorted
                      start, end = self._search(addbuf, f, start)
                      if not todelete:
                          l = "%s\0%s%s\n" % (f, revlog.hex(map[f]), map.flags(f))
                      else:
                          if start == end:
                              # item we want to delete was not found, error out
                              raise AssertionError(
                                      _("failed to remove %s from manifest") % f)
                          l = ""
                      if dstart is not None and dstart <= start and dend >= start:
                          # touches the pending chunk: grow it
                          if dend < end:
                              dend = end
                          if l:
                              dline.append(l)
                      else:
                          # flush the pending chunk, if any, and start a new one
                          if dstart is not None:
                              delta.append([dstart, dend, "".join(dline)])
                          dstart = start
                          dend = end
                          dline = [l]

                  if dstart is not None:
                      delta.append([dstart, dend, "".join(dline)])
                  # apply the delta to the addlist, and get a delta for addrevision
                  cachedelta = (self.rev(p1), addlistdelta(addlist, delta))
                  arraytext = addlist
                  text = buffer(arraytext)

              n = self.addrevision(text, transaction, link, p1, p2, cachedelta)
              # remember the revision just written so that a following add()
              # or read() can reuse it
              self._mancache = (n, map, arraytext)

              return n

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

				# manifest.py - manifest revision class for mercurial
				#
				# Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
				#
				# This software may be used and distributed according to the terms of the
				# GNU General Public License version 2 or any later version.

				from i18n import _
				import mdiff, parsers, error, revlog
				import array, struct

				class manifestdict(dict):
				    '''dict mapping filenames to nodes, with per-file flags kept aside

				    The flags live in a separate dictionary (self._flags) keyed by
				    filename; a file without an entry there has the empty flag string.'''

				    def __init__(self, mapping=None, flags=None):
				        # create fresh containers per instance instead of sharing
				        # mutable default arguments; a flags dict that is passed in is
				        # stored by reference, not copied
				        if mapping is None:
				            mapping = {}
				        if flags is None:
				            flags = {}
				        dict.__init__(self, mapping)
				        self._flags = flags

				    def flags(self, f):
				        '''return the flag string recorded for f, or "" if there is none'''
				        return self._flags.get(f, "")

				    def set(self, f, flags):
				        '''record flags as the flag string of f'''
				        self._flags[f] = flags

				    def copy(self):
				        '''return a new manifestdict with the same entries and its own
				        copy of the flags dictionary'''
				        return manifestdict(self, dict.copy(self._flags))

				class manifest(revlog.revlog):
				    '''revlog holding manifest revisions, stored in "00manifest.i"

				    A manifest text is a sequence of lines, each made of a filename, a
				    NUL byte, the 40-character hex node, an optional flag string and a
				    newline.  add() writes the lines sorted by filename, which is what
				    allows _search() to bisect the raw text.

				    self._mancache is None or a (node, manifestdict, array) tuple for
				    the revision that was most recently read or added.'''

				    def __init__(self, opener):
				        # nothing cached yet
				        self._mancache = None
				        revlog.revlog.__init__(self, opener, "00manifest.i")

				    def parse(self, lines):
				        '''return a new manifestdict parsed from the manifest text lines

				        The parsing itself is delegated to parsers.parse_manifest(),
				        which is handed the mapping and its flags dictionary to fill.'''
				        mfdict = manifestdict()
				        parsers.parse_manifest(mfdict, mfdict._flags, lines)
				        return mfdict

				    def readdelta(self, node):
				        '''return the entries found in the delta that produced node

				        Only the patch text between node and its delta parent is
				        parsed, so this is presumably not the full manifest.'''
				        r = self.rev(node)
				        return self.parse(mdiff.patchtext(self.revdiff(self.deltaparent(r), r)))

				    def readfast(self, node):
				        '''use the faster of readdelta or read'''
				        r = self.rev(node)
				        deltaparent = self.deltaparent(r)
				        # the delta is only used when it was computed against one of
				        # the revision's parents
				        if deltaparent != revlog.nullrev and deltaparent in self.parentrevs(r):
				            return self.readdelta(node)
				        return self.read(node)

				    def read(self, node):
				        '''return the manifestdict for node, caching the result

				        On a cache hit the cached mapping itself is returned, not a
				        copy.'''
				        if node == revlog.nullid:
				            return manifestdict() # don't upset local cache
				        if self._mancache and self._mancache[0] == node:
				            return self._mancache[1]
				        text = self.revision(node)
				        # keep the raw text as a mutable array so that add() can patch it
				        arraytext = array.array('c', text)
				        mapping = self.parse(text)
				        self._mancache = (node, mapping, arraytext)
				        return mapping

				    def _search(self, m, s, lo=0, hi=None):
				        '''return a tuple (start, end) that says where to find s within m.

				        If the string is found m[start:end] are the line containing
				        that string.  If start == end the string was not found and
				        they indicate the proper sorted insertion point.  This was
				        taken from bisect_left, and modified to find line start/end as
				        it goes along.

				        m should be a buffer or a string
				        s is a string
				        lo and hi bound the bisection like in bisect_left; hi defaults
				        to len(m).  An explicit hi of 0 is honored rather than being
				        treated as unset.'''
				        def advance(i, c):
				            # index of the first c at or after i; stops at len(m) when
				            # there is none (i is returned unchanged if already past it)
				            while i < lenm and m[i] != c:
				                i += 1
				            return i
				        if not s:
				            return (lo, lo)
				        lenm = len(m)
				        if hi is None:
				            hi = lenm
				        while lo < hi:
				            mid = (lo + hi) // 2
				            # back up to the beginning of the line containing mid
				            start = mid
				            while start > 0 and m[start - 1] != '\n':
				                start -= 1
				            end = advance(start, '\0')
				            if m[start:end] < s:
				                # we know that after the null there are 40 bytes of sha1
				                # this translates to the bisect lo = mid + 1
				                lo = advance(end + 40, '\n') + 1
				            else:
				                # this translates to the bisect hi = mid
				                hi = start
				        end = advance(lo, '\0')
				        found = m[lo:end]
				        if s == found:
				            # we know that after the null there are 40 bytes of sha1
				            end = advance(end + 40, '\n')
				            return (lo, end + 1)
				        else:
				            return (lo, lo)

				    def find(self, node, f):
				        '''look up entry for a single file efficiently.
				        return (node, flags) pair if found, (None, None) if not.

				        NOTE: when node is the cached revision, a missing file yields
				        (None, "") because the flags come from manifestdict.flags().'''
				        if self._mancache and self._mancache[0] == node:
				            return self._mancache[1].get(f), self._mancache[1].flags(f)
				        text = self.revision(node)
				        start, end = self._search(text, f)
				        if start == end:
				            return None, None
				        l = text[start:end]
				        # the line is: name, NUL, 40 hex characters, flags, newline
				        name, n = l.split('\0')
				        return revlog.bin(n[:40]), n[40:-1]

				    def add(self, map, transaction, link, p1=None, p2=None,
				            changed=None):
				        '''add a new manifest revision built from map and return its node

				        map must provide sorted iteration over filenames, map[f] (the
				        node, hex-encoded here with revlog.hex()) and map.flags(f), as
				        manifestdict does.  transaction, link, p1 and p2 are passed on
				        to addrevision().  changed, if given, is an (added, removed)
				        pair of filename lists.

				        When changed is given and the cache holds p1, the cached text is
				        patched in place and the resulting delta is handed to
				        addrevision(); otherwise the full text is rebuilt from map.

				        Raises error.RevlogError if a checked filename contains a
				        newline or carriage return (only the added names are checked on
				        the cached path), and AssertionError if a file to remove is
				        missing from the cached text.'''
				        # apply the changes collected during the bisect loop to our addlist
				        # return a delta suitable for addrevision
				        def addlistdelta(addlist, x):
				            # start from the bottom up
				            # so changes to the offsets don't mess things up.
				            for start, end, content in reversed(x):
				                if content:
				                    addlist[start:end] = array.array('c', content)
				                else:
				                    del addlist[start:end]
				            # every chunk is a big-endian (start, end, length) header
				            # followed by the replacement text
				            return "".join(struct.pack(">lll", start, end, len(content)) + content
				                           for start, end, content in x)

				        def checkforbidden(l):
				            for f in l:
				                if '\n' in f or '\r' in f:
				                    raise error.RevlogError(
				                        _("'\\n' and '\\r' disallowed in filenames: %r") % f)

				        # if we're using the cache, make sure it is valid and
				        # parented by the same node we're diffing against
				        if not (changed and self._mancache and p1 and self._mancache[0] == p1):
				            files = sorted(map)
				            checkforbidden(files)

				            # if this is changed to support newlines in filenames,
				            # be sure to check the templates/ dir again (especially *-raw.tmpl)
				            # (local aliases are named so they do not shadow the builtin
				            # hex() or the flags method name)
				            tohex, getflags = revlog.hex, map.flags
				            text = ''.join("%s\0%s%s\n" % (f, tohex(map[f]), getflags(f))
				                           for f in files)
				            arraytext = array.array('c', text)
				            cachedelta = None
				        else:
				            added, removed = changed
				            addlist = self._mancache[2]

				            checkforbidden(added)
				            # combine the changed lists into one list for sorting
				            work = [(x, False) for x in added]
				            work.extend((x, True) for x in removed)
				            # this could use heapq.merge() (from python2.6+) or equivalent
				            # since the lists are already sorted
				            work.sort()

				            delta = []
				            dstart = None
				            dend = None
				            dline = [""]
				            start = 0
				            # zero copy representation of addlist as a buffer
				            addbuf = buffer(addlist)

				            # start with a readonly loop that finds the offset of
				            # each line and creates the deltas
				            for f, todelete in work:
				                # start will either be the index of the item or the
				                # insert point; searching resumes from the previous
				                # position since work is sorted
				                start, end = self._search(addbuf, f, start)
				                if not todelete:
				                    l = "%s\0%s%s\n" % (f, revlog.hex(map[f]), map.flags(f))
				                else:
				                    if start == end:
				                        # item we want to delete was not found, error out
				                        raise AssertionError(
				                                _("failed to remove %s from manifest") % f)
				                    l = ""
				                if dstart is not None and dstart <= start and dend >= start:
				                    # touches the pending chunk: grow it
				                    if dend < end:
				                        dend = end
				                    if l:
				                        dline.append(l)
				                else:
				                    # flush the pending chunk, if any, and start a new one
				                    if dstart is not None:
				                        delta.append([dstart, dend, "".join(dline)])
				                    dstart = start
				                    dend = end
				                    dline = [l]

				            if dstart is not None:
				                delta.append([dstart, dend, "".join(dline)])
				            # apply the delta to the addlist, and get a delta for addrevision
				            cachedelta = (self.rev(p1), addlistdelta(addlist, delta))
				            arraytext = addlist
				            text = buffer(arraytext)

				        n = self.addrevision(text, transaction, link, p1, p2, cachedelta)
				        # remember the revision just written so that a following add()
				        # or read() can reuse it
				        self._mancache = (n, map, arraytext)

				        return n