upstream/mercurial-mirror Files · mercurial/similar.py

revset: abuse x:y syntax to specify line range of followlines()...

revset: abuse x:y syntax to specify line range of followlines() This slightly complicates the parsing (see the previous patch), but the overall result seems not bad. I keep x:, :y and : for future extension.

Sean Farley - - Load All Authors

File last commit:

r30791:ada160a8 default


                r30804:4227f80f

default

Download file

             similar.py
        
                    110 lines
            
             | 3.7 KiB
            
                | text/x-python
            
             |
                PythonLexer
            
             / mercurial / similar.py
          
                    History
                
                 |
                  Source
                 | Raw
                 |Copy content
                 |Copy permalink

        David Greenaway
    
Move 'findrenames' code into its own file....

              r11059
            
      # similar.py - mechanisms for finding similar files

      #

      # Copyright 2005-2007 Matt Mackall <mpm@selenic.com>

      #

      # This software may be used and distributed according to the terms of the

      # GNU General Public License version 2 or any later version.

        Gregory Szorc
    
similar: use absolute_import

              r27359
            
      from __future__ import absolute_import

        Augie Fackler
    
cleanup: replace uses of util.(md5|sha1|sha256|sha512) with hashlib.\1...

              r29341
            
      import hashlib

        Gregory Szorc
    
similar: use absolute_import

              r27359
            
      from .i18n import _

      from . import (

          bdiff,

          mdiff,

          util,

      )

        David Greenaway
    
Move 'findrenames' code into its own file....

              r11059
            
        David Greenaway
    
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes....

              r11060
            
      def _findexactmatches(repo, added, removed):
          '''find renamed files that have no changes

          Takes a list of new filectxs and a list of removed filectxs, and yields
          (before, after) tuples of exact matches.

          Two files match when the SHA-1 digests of their data are equal. When
          several removed files share a digest, the last one in ``removed`` is
          the "before" side for every added file carrying that digest.
          '''
          nremoved = len(removed)
          total = nremoved + len(added)
          topic = _('searching for exact renames')
          unit = _('files')
          progress = repo.ui.progress

          # Index the removed files by the digest of their content.
          removedbyhash = {}
          for pos, oldfctx in enumerate(removed):
              progress(topic, pos, total=total, unit=unit)
              removedbyhash[hashlib.sha1(oldfctx.data()).digest()] = oldfctx

          # Look every added file up in that index; progress positions carry
          # on from where the removed files stopped.
          for pos, newfctx in enumerate(added, nremoved):
              progress(topic, pos, total=total, unit=unit)
              digest = hashlib.sha1(newfctx.data()).digest()
              if digest in removedbyhash:
                  yield removedbyhash[digest], newfctx

          # A position of None closes the progress topic.
          progress(topic, None)

      def _findsimilarmatches(repo, added, removed, threshold):
          '''find potentially renamed files based on similar file content

          Takes a list of new filectxs and a list of removed filectxs, and yields
          (before, after, score) tuples of partial matches.

          Each added file is reported at most once, paired with the removed file
          that scored highest against it, provided that score is at least
          ``threshold``. On equal scores the removed file seen later wins.
          '''
          # One topic for both the updates and the final close; closing under a
          # different name would leave this topic open in the ui.
          topic = _('searching for similar files')
          copies = {}
          for i, r in enumerate(removed):
              repo.ui.progress(topic, i, total=len(removed), unit=_('files'))

              # lazily load text
              @util.cachefunc
              def data():
                  orig = r.data()
                  return orig, mdiff.splitnewlines(orig)

              def score(text):
                  '''return twice the matching bytes over the combined size'''
                  orig, lines = data()
                  # bdiff.blocks() returns blocks of matching lines
                  # count the number of bytes in each
                  equal = 0
                  matches = bdiff.blocks(text, orig)
                  for x1, x2, y1, y2 in matches:
                      for line in lines[y1:y2]:
                          equal += len(line)

                  # findrenames() strips zero-length files before calling in
                  # here, so the divisor is non-zero for its inputs.
                  lengths = len(text) + len(orig)
                  return equal * 2.0 / lengths

              for a in added:
                  # The bar to beat is the best score recorded so far for this
                  # added file, or the threshold when nothing qualified yet.
                  bestscore = copies.get(a, (None, threshold))[1]
                  myscore = score(a.data())
                  if myscore >= bestscore:
                      copies[a] = (r, myscore)
          repo.ui.progress(topic, None)

          # items() rather than iteritems(): available on Python 2 and 3 alike.
          for dest, v in copies.items():
              source, bscore = v
              yield source, dest, bscore

        David Greenaway
    
Move 'findrenames' code into its own file....

              r11059
            
        David Greenaway
    
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes....

              r11060
            
      def findrenames(repo, added, removed, threshold):
          '''find renamed files -- yields (before, after, score) tuples

          ``added`` names are looked up in repo[None] and ``removed`` names in
          repo['.']; removed names missing from the latter are skipped. Exact
          content matches come first with a score of 1.0. When ``threshold`` is
          below 1.0, the added files still unmatched are then compared with
          the removed files by similarity. The yielded before/after values are
          the paths of the matched file contexts.
          '''
          parentctx = repo['.']
          workingctx = repo[None]

          # Zero length files will be frequently unrelated to each other, and
          # tracking the deletion/addition of such a file will probably cause more
          # harm than good. We strip them out here to avoid matching them later on.
          addedfiles = set(workingctx[name] for name in added
                           if workingctx[name].size() > 0)
          removedfiles = set(parentctx[name] for name in removed
                             if name in parentctx and parentctx[name].size() > 0)

          # Exact matches first. Only the added side is dropped afterwards;
          # removed files remain candidates for the similarity pass.
          exact = _findexactmatches(repo, sorted(addedfiles),
                                    sorted(removedfiles))
          for before, after in exact:
              addedfiles.remove(after)
              yield (before.path(), after.path(), 1.0)

          # Similarity matching is only wanted for thresholds below 1.0.
          if not threshold < 1.0:
              return

          similar = _findsimilarmatches(repo, sorted(addedfiles),
                                        sorted(removedfiles), threshold)
          for before, after, score in similar:
              yield (before.path(), after.path(), score)

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

David Greenaway Move 'findrenames' code into its own file....	r11059	# similar.py - mechanisms for finding similar files
		#
		# Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
		#
		# This software may be used and distributed according to the terms of the
		# GNU General Public License version 2 or any later version.

Gregory Szorc similar: use absolute_import	r27359	from __future__ import absolute_import

Augie Fackler cleanup: replace uses of util.(md5\|sha1\|sha256\|sha512) with hashlib.\1...	r29341	import hashlib

Gregory Szorc similar: use absolute_import	r27359	from .i18n import _
		from . import (
		bdiff,
		mdiff,
		util,
		)
David Greenaway Move 'findrenames' code into its own file....	r11059
def _findexactmatches(repo, added, removed):
    '''find renamed files that have no changes

    Takes a list of new filectxs and a list of removed filectxs, and yields
    (before, after) tuples of exact matches.

    Two files match when the SHA-1 digests of their data are equal. When
    several removed files share a digest, the last one in ``removed`` is
    the "before" side for every added file carrying that digest.
    '''
    nremoved = len(removed)
    total = nremoved + len(added)
    topic = _('searching for exact renames')
    unit = _('files')
    progress = repo.ui.progress

    # Index the removed files by the digest of their content.
    removedbyhash = {}
    for pos, oldfctx in enumerate(removed):
        progress(topic, pos, total=total, unit=unit)
        removedbyhash[hashlib.sha1(oldfctx.data()).digest()] = oldfctx

    # Look every added file up in that index; progress positions carry
    # on from where the removed files stopped.
    for pos, newfctx in enumerate(added, nremoved):
        progress(topic, pos, total=total, unit=unit)
        digest = hashlib.sha1(newfctx.data()).digest()
        if digest in removedbyhash:
            yield removedbyhash[digest], newfctx

    # A position of None closes the progress topic.
    progress(topic, None)

		def _findsimilarmatches(repo, added, removed, threshold):
		    '''find potentially renamed files based on similar file content

		    Takes a list of new filectxs and a list of removed filectxs, and yields
		    (before, after, score) tuples of partial matches.

		    Each added file is reported at most once, paired with the removed file
		    that scored highest against it, provided that score is at least
		    ``threshold``. On equal scores the removed file seen later wins.
		    '''
		    # One topic for both the updates and the final close; closing under a
		    # different name would leave this topic open in the ui.
		    topic = _('searching for similar files')
		    copies = {}
		    for i, r in enumerate(removed):
		        repo.ui.progress(topic, i, total=len(removed), unit=_('files'))

		        # lazily load text
		        @util.cachefunc
		        def data():
		            orig = r.data()
		            return orig, mdiff.splitnewlines(orig)

		        def score(text):
		            '''return twice the matching bytes over the combined size'''
		            orig, lines = data()
		            # bdiff.blocks() returns blocks of matching lines
		            # count the number of bytes in each
		            equal = 0
		            matches = bdiff.blocks(text, orig)
		            for x1, x2, y1, y2 in matches:
		                for line in lines[y1:y2]:
		                    equal += len(line)

		            # findrenames() strips zero-length files before calling in
		            # here, so the divisor is non-zero for its inputs.
		            lengths = len(text) + len(orig)
		            return equal * 2.0 / lengths

		        for a in added:
		            # The bar to beat is the best score recorded so far for this
		            # added file, or the threshold when nothing qualified yet.
		            bestscore = copies.get(a, (None, threshold))[1]
		            myscore = score(a.data())
		            if myscore >= bestscore:
		                copies[a] = (r, myscore)
		    repo.ui.progress(topic, None)

		    # items() rather than iteritems(): available on Python 2 and 3 alike.
		    for dest, v in copies.items():
		        source, bscore = v
		        yield source, dest, bscore
David Greenaway Move 'findrenames' code into its own file....	r11059
def findrenames(repo, added, removed, threshold):
    '''find renamed files -- yields (before, after, score) tuples

    ``added`` names are looked up in repo[None] and ``removed`` names in
    repo['.']; removed names missing from the latter are skipped. Exact
    content matches come first with a score of 1.0. When ``threshold`` is
    below 1.0, the added files still unmatched are then compared with
    the removed files by similarity. The yielded before/after values are
    the paths of the matched file contexts.
    '''
    parentctx = repo['.']
    workingctx = repo[None]

    # Zero length files will be frequently unrelated to each other, and
    # tracking the deletion/addition of such a file will probably cause more
    # harm than good. We strip them out here to avoid matching them later on.
    addedfiles = set(workingctx[name] for name in added
                     if workingctx[name].size() > 0)
    removedfiles = set(parentctx[name] for name in removed
                       if name in parentctx and parentctx[name].size() > 0)

    # Exact matches first. Only the added side is dropped afterwards;
    # removed files remain candidates for the similarity pass.
    exact = _findexactmatches(repo, sorted(addedfiles),
                              sorted(removedfiles))
    for before, after in exact:
        addedfiles.remove(after)
        yield (before.path(), after.path(), 1.0)

    # Similarity matching is only wanted for thresholds below 1.0.
    if not threshold < 1.0:
        return

    similar = _findsimilarmatches(repo, sorted(addedfiles),
                                  sorted(removedfiles), threshold)
    for before, after, score in similar:
        yield (before.path(), after.path(), score)