upstream/mercurial-mirror Files · mercurial/similar.py

pure: write a really lazy version of pure indexObject...

pure: write a really lazy version of pure indexObject. On PyPy this version performs reasonably well compared to the C version. An example command is "hg id", which gets faster depending on the details of your operating system and hard drive (it is mostly bottlenecked on stat). There is potential for improvements by storing extra as a condensed struct too.

av6 - - Load All Authors

File last commit:

r28468:0d6b3630 default


                r29133:25527471

default

Download file

             similar.py
        
                    109 lines
            
             | 3.7 KiB
            
                | text/x-python
            
             |
                PythonLexer
            
             / mercurial / similar.py
          
                    History
                
                 |
                  Source
                 | Raw
                 |Copy content
                 |Copy permalink

        David Greenaway
    
Move 'findrenames' code into its own file....

              r11059
            
      # similar.py - mechanisms for finding similar files

      #

      # Copyright 2005-2007 Matt Mackall <mpm@selenic.com>

      #

      # This software may be used and distributed according to the terms of the

      # GNU General Public License version 2 or any later version.

        Gregory Szorc
    
similar: use absolute_import

              r27359
            
      from __future__ import absolute_import

      from .i18n import _

      from . import (

          bdiff,

          mdiff,

          util,

      )

        David Greenaway
    
Move 'findrenames' code into its own file....

              r11059
            
        David Greenaway
    
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes....

              r11060
            
      def _findexactmatches(repo, added, removed):
          '''find files that were renamed without any content change

          Takes a list of new filectxs and a list of removed filectxs, and yields
          (before, after) tuples for every added file whose content has the same
          SHA-1 digest as a removed file.
          '''
          nremoved = len(removed)
          total = len(added) + nremoved
          topic = _('searching for exact renames')
          unit = _('files')

          # Index the removed files by the digest of their content. When several
          # removed files share a digest, the one seen last is kept.
          bydigest = {}
          for pos, old in enumerate(removed):
              repo.ui.progress(topic, pos, total=total, unit=unit)
              bydigest[util.sha1(old.data()).digest()] = old

          # Look every added file up in that index; progress continues counting
          # from where the removed files left off.
          for pos, new in enumerate(added):
              repo.ui.progress(topic, nremoved + pos, total=total, unit=unit)
              digest = util.sha1(new.data()).digest()
              if digest in bydigest:
                  yield (bydigest[digest], new)

          # Close the progress topic.
          repo.ui.progress(topic, None)

      def _findsimilarmatches(repo, added, removed, threshold):
          '''find potentially renamed files based on similar file content

          Takes a list of new filectxs and a list of removed filectxs, and yields
          (before, after, score) tuples of partial matches.

          Every removed file is scored against every added file. The score is
          twice the number of bytes in the lines that bdiff.blocks() reports as
          matching, divided by the combined length of both texts. For each added
          file only the best-scoring removed file is kept, and only when its
          score is at least threshold; on a tie the removed file seen later wins.
          '''
          # Use one topic for both updating and closing the progress bar, so the
          # topic opened in the loop is the one that gets closed afterwards.
          topic = _('searching for similar files')
          copies = {}
          for i, r in enumerate(removed):
              repo.ui.progress(topic, i, total=len(removed), unit=_('files'))

              # lazily load text: nothing is read from r until the first call
              # (util.cachefunc presumably memoizes the result for later calls)
              @util.cachefunc
              def data():
                  orig = r.data()
                  return orig, mdiff.splitnewlines(orig)

              def score(text):
                  orig, lines = data()
                  # bdiff.blocks() returns blocks of matching lines
                  # count the number of bytes in each
                  equal = 0
                  for x1, x2, y1, y2 in bdiff.blocks(text, orig):
                      # y1:y2 indexes the lines of orig (second argument)
                      equal += sum(len(line) for line in lines[y1:y2])

                  # NOTE: this divides by zero when both texts are empty;
                  # findrenames() filters out zero-length files beforehand.
                  lengths = len(text) + len(orig)
                  return equal * 2.0 / lengths

              for a in added:
                  # the best score so far for a, or threshold if a is unmatched
                  bestscore = copies.get(a, (None, threshold))[1]
                  myscore = score(a.data())
                  if myscore >= bestscore:
                      copies[a] = (r, myscore)
          repo.ui.progress(topic, None)

          for dest, v in copies.iteritems():
              source, score = v
              yield source, dest, score

        David Greenaway
    
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes....

              r11060
            
      def findrenames(repo, added, removed, threshold):
          '''find renamed files -- yields (before, after, score) tuples

          added and removed are iterables of file paths. Exact content matches
          are yielded first with a score of 1.0. When threshold is below 1.0,
          the remaining added files are then compared against the removed files
          for partial matches.
          '''
          parentctx = repo['.']
          workingctx = repo[None]

          # Empty files are usually unrelated to one another, so recording a
          # rename between them would probably do more harm than good. Drop them
          # up front so that they can never be matched below.
          addedfiles = set()
          for fp in added:
              if workingctx[fp].size() > 0:
                  addedfiles.add(workingctx[fp])
          removedfiles = set()
          for fp in removed:
              if fp in parentctx and parentctx[fp].size() > 0:
                  removedfiles.add(parentctx[fp])

          # Exact matches first. A matched added file is taken out of the
          # candidate set so the similarity pass does not consider it again.
          exact = _findexactmatches(repo, sorted(addedfiles), sorted(removedfiles))
          for before, after in exact:
              addedfiles.remove(after)
              yield (before.path(), after.path(), 1.0)

          # Partial matches are only searched for when the caller asked for
          # less than 100% similarity.
          if threshold < 1.0:
              similar = _findsimilarmatches(repo, sorted(addedfiles),
                                            sorted(removedfiles), threshold)
              for before, after, score in similar:
                  yield (before.path(), after.path(), score)

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

David Greenaway Move 'findrenames' code into its own file....	r11059	# similar.py - mechanisms for finding similar files
		#
		# Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
		#
		# This software may be used and distributed according to the terms of the
		# GNU General Public License version 2 or any later version.

Gregory Szorc similar: use absolute_import	r27359	from __future__ import absolute_import

		from .i18n import _
		from . import (
		bdiff,
		mdiff,
		util,
		)
David Greenaway Move 'findrenames' code into its own file....	r11059
def _findexactmatches(repo, added, removed):
    '''find files that were renamed without any content change

    Takes a list of new filectxs and a list of removed filectxs, and yields
    (before, after) tuples for every added file whose content has the same
    SHA-1 digest as a removed file.
    '''
    nremoved = len(removed)
    total = len(added) + nremoved
    topic = _('searching for exact renames')
    unit = _('files')

    # Index the removed files by the digest of their content. When several
    # removed files share a digest, the one seen last is kept.
    bydigest = {}
    for pos, old in enumerate(removed):
        repo.ui.progress(topic, pos, total=total, unit=unit)
        bydigest[util.sha1(old.data()).digest()] = old

    # Look every added file up in that index; progress continues counting
    # from where the removed files left off.
    for pos, new in enumerate(added):
        repo.ui.progress(topic, nremoved + pos, total=total, unit=unit)
        digest = util.sha1(new.data()).digest()
        if digest in bydigest:
            yield (bydigest[digest], new)

    # Close the progress topic.
    repo.ui.progress(topic, None)

		def _findsimilarmatches(repo, added, removed, threshold):
		    '''find potentially renamed files based on similar file content

		    Takes a list of new filectxs and a list of removed filectxs, and yields
		    (before, after, score) tuples of partial matches.

		    Every removed file is scored against every added file. The score is
		    twice the number of bytes in the lines that bdiff.blocks() reports as
		    matching, divided by the combined length of both texts. For each added
		    file only the best-scoring removed file is kept, and only when its
		    score is at least threshold; on a tie the removed file seen later wins.
		    '''
		    # Use one topic for both updating and closing the progress bar, so the
		    # topic opened in the loop is the one that gets closed afterwards.
		    topic = _('searching for similar files')
		    copies = {}
		    for i, r in enumerate(removed):
		        repo.ui.progress(topic, i, total=len(removed), unit=_('files'))

		        # lazily load text: nothing is read from r until the first call
		        # (util.cachefunc presumably memoizes the result for later calls)
		        @util.cachefunc
		        def data():
		            orig = r.data()
		            return orig, mdiff.splitnewlines(orig)

		        def score(text):
		            orig, lines = data()
		            # bdiff.blocks() returns blocks of matching lines
		            # count the number of bytes in each
		            equal = 0
		            for x1, x2, y1, y2 in bdiff.blocks(text, orig):
		                # y1:y2 indexes the lines of orig (second argument)
		                equal += sum(len(line) for line in lines[y1:y2])

		            # NOTE: this divides by zero when both texts are empty;
		            # findrenames() filters out zero-length files beforehand.
		            lengths = len(text) + len(orig)
		            return equal * 2.0 / lengths

		        for a in added:
		            # the best score so far for a, or threshold if a is unmatched
		            bestscore = copies.get(a, (None, threshold))[1]
		            myscore = score(a.data())
		            if myscore >= bestscore:
		                copies[a] = (r, myscore)
		    repo.ui.progress(topic, None)

		    for dest, v in copies.iteritems():
		        source, score = v
		        yield source, dest, score

def findrenames(repo, added, removed, threshold):
    '''find renamed files -- yields (before, after, score) tuples

    added and removed are iterables of file paths. Exact content matches
    are yielded first with a score of 1.0. When threshold is below 1.0,
    the remaining added files are then compared against the removed files
    for partial matches.
    '''
    parentctx = repo['.']
    workingctx = repo[None]

    # Empty files are usually unrelated to one another, so recording a
    # rename between them would probably do more harm than good. Drop them
    # up front so that they can never be matched below.
    addedfiles = set()
    for fp in added:
        if workingctx[fp].size() > 0:
            addedfiles.add(workingctx[fp])
    removedfiles = set()
    for fp in removed:
        if fp in parentctx and parentctx[fp].size() > 0:
            removedfiles.add(parentctx[fp])

    # Exact matches first. A matched added file is taken out of the
    # candidate set so the similarity pass does not consider it again.
    exact = _findexactmatches(repo, sorted(addedfiles), sorted(removedfiles))
    for before, after in exact:
        addedfiles.remove(after)
        yield (before.path(), after.path(), 1.0)

    # Partial matches are only searched for when the caller asked for
    # less than 100% similarity.
    if threshold < 1.0:
        similar = _findsimilarmatches(repo, sorted(addedfiles),
                                      sorted(removedfiles), threshold)
        for before, after, score in similar:
            yield (before.path(), after.path(), score)