upstream/mercurial-mirror Files · mercurial/similar.py

py3: use r'' instead of sysstr('') to get around code transformer...

py3: use r'' instead of sysstr('') to get around code transformer. Fewer function calls should be better.

Yuya Nishihara - - Load All Authors

File last commit:

r32201:ded48ad5 default


                r36853:5bc7ff10

default

Download file

             similar.py
        
                    122 lines
            
             | 4.1 KiB
            
                | text/x-python
            
             |
                PythonLexer
            
             / mercurial / similar.py
          
                    History
                
                 |
                  Source
                 | Raw
                 |Copy content
                 |Copy permalink

        David Greenaway
    
Move 'findrenames' code into its own file....

              r11059
            
      # similar.py - mechanisms for finding similar files

      #

      # Copyright 2005-2007 Matt Mackall <mpm@selenic.com>

      #

      # This software may be used and distributed according to the terms of the

      # GNU General Public License version 2 or any later version.

        Gregory Szorc
    
similar: use absolute_import

              r27359
            
      from __future__ import absolute_import

      from .i18n import _

      from . import (

          mdiff,

      )

        David Greenaway
    
Move 'findrenames' code into its own file....

              r11059
            
        David Greenaway
    
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes....

              r11060
            
      def _findexactmatches(repo, added, removed):
          '''find renamed files that have no changes

          Takes a list of new filectxs and a list of removed filectxs, and yields
          (before, after) tuples of exact matches.

          Each added file is paired with at most one removed file: the first
          removed file, in input order, whose data compares equal. Progress is
          reported through repo.ui.progress() while both lists are scanned.
          '''
          numfiles = len(added) + len(removed)

          # Group the removed files by content hash: {hash(data): [fctx, ...]}.
          # Only the hash is stored, so the file data itself can be discarded
          # from memory right away.
          byhash = {}
          for pos, oldfctx in enumerate(removed):
              repo.ui.progress(_('searching for exact renames'), pos,
                               total=numfiles, unit=_('files'))
              byhash.setdefault(hash(oldfctx.data()), []).append(oldfctx)

          # Look every added file up in that table; the progress position
          # carries on from where the loop over the removed files stopped.
          for pos, newfctx in enumerate(added, len(removed)):
              repo.ui.progress(_('searching for exact renames'), pos,
                               total=numfiles, unit=_('files'))
              newdata = newfctx.data()
              # Equal hashes do not prove equal contents, so compare the actual
              # file data and keep the first candidate that really matches.
              candidates = byhash.get(hash(newdata), ())
              source = next((c for c in candidates if newdata == c.data()), None)
              if source is not None:
                  yield (source, newfctx)

          # Done
          repo.ui.progress(_('searching for exact renames'), None)

        Sean Farley
    
similar: move score function to module level...

              r30805
            
      def _ctxdata(fctx):
          '''return (data, lines) for fctx

          data is what fctx.data() returns and lines is that same data passed
          through mdiff.splitnewlines(). The pair is the "otherdata" argument
          that _score() expects.
          '''
          # the file text is only read here, i.e. when a caller really needs it
          text = fctx.data()
          lines = mdiff.splitnewlines(text)
          return text, lines

        Pierre-Yves David
    
similar: remove caching from the module level...

              r30809
            
      def _score(fctx, otherdata):
          '''return the similarity between fctx and already loaded file data

          otherdata is an (orig, lines) pair as built by _ctxdata(). The result
          is twice the number of bytes in the lines of orig that mdiff.blocks()
          reports as matching, divided by the combined length of both texts.

          The divisor is zero when both texts are empty, which raises
          ZeroDivisionError; findrenames() drops empty files before scoring.
          '''
          orig, lines = otherdata
          text = fctx.data()
          # mdiff.blocks() returns blocks of matching lines; y1:y2 selects the
          # lines of orig in each block, and we count the number of bytes in
          # those lines
          matched = 0
          for _x1, _x2, y1, y2 in mdiff.blocks(text, orig):
              matched += sum(len(line) for line in lines[y1:y2])
          return matched * 2.0 / (len(text) + len(orig))

        Pierre-Yves David
    
similar: remove caching from the module level...

              r30809
            
      def score(fctx1, fctx2):
          '''return the similarity score of two filectxs

          Loads the data of fctx2 with _ctxdata() and scores fctx1 against it
          with _score().
          '''
          otherdata = _ctxdata(fctx2)
          return _score(fctx1, otherdata)

        David Greenaway
    
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes....

              r11060
            
      def _findsimilarmatches(repo, added, removed, threshold):
          '''find potentially renamed files based on similar file content

          Takes a list of new filectxs and a list of removed filectxs, and yields
          (before, after, score) tuples of partial matches.

          Every added file is matched with the removed file it scores highest
          against, provided that score is strictly greater than threshold; on
          equal scores the removed file seen first is kept. Results are only
          yielded once all removed files have been examined.
          '''
          # Use a single topic for every progress call, so that the closing call
          # below ends the very topic that the loop opened.
          topic = _('searching for similar files')
          copies = {}
          for i, r in enumerate(removed):
              repo.ui.progress(topic, i, total=len(removed), unit=_('files'))

              # lazily load the removed file: nothing is read when there is no
              # added file to compare it with
              data = None
              for a in added:
                  # best score so far for this added file, or the threshold if
                  # it has no candidate yet
                  bestscore = copies.get(a, (None, threshold))[1]
                  if data is None:
                      data = _ctxdata(r)
                  myscore = _score(a, data)
                  # strict comparison: take the first match instead of the last
                  if myscore > bestscore:
                      copies[a] = (r, myscore)
          repo.ui.progress(topic, None)

          for dest, v in copies.iteritems():
              source, bscore = v
              yield source, dest, bscore

        David Greenaway
    
Move 'findrenames' code into its own file....

              r11059
            
        Yuya Nishihara
    
similar: do not look up and create filectx more than once...

              r31582
            
      def _dropempty(fctxs):
          '''return a list of the given filectxs whose size() is greater than 0

          fctxs may be any iterable; the input order is preserved.
          '''
          nonempty = []
          for fctx in fctxs:
              if fctx.size() > 0:
                  nonempty.append(fctx)
          return nonempty

        David Greenaway
    
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes....

              r11060
            
      def findrenames(repo, added, removed, threshold):
          '''find renamed files -- yields (before, after, score) tuples

          added and removed are iterables of file paths. Added paths are looked
          up in the working context (repo[None]) and removed paths in its first
          parent; removed paths that the parent does not contain are skipped.
          Exact matches are yielded first with a score of 1.0. Similarity
          matching of the remaining added files only runs when threshold is
          below 1.0.
          '''
          wctx = repo[None]
          pctx = wctx.p1()

          # Zero length files will be frequently unrelated to each other, and
          # tracking the deletion/addition of such a file will probably cause more
          # harm than good. We strip them out here to avoid matching them later on.
          newfiles = _dropempty(wctx[path] for path in sorted(added))
          oldfiles = _dropempty(pctx[path] for path in sorted(removed)
                                if path in pctx)

          # Find exact matches, remembering which added files got one.
          exact = set()
          for before, after in _findexactmatches(repo, newfiles, oldfiles):
              exact.add(after)
              yield (before.path(), after.path(), 1.0)

          # Similar files are only searched for if the user requested it.
          if not threshold < 1.0:
              return

          remaining = [fctx for fctx in newfiles if fctx not in exact]
          for before, after, similarity in _findsimilarmatches(repo, remaining,
                                                               oldfiles,
                                                               threshold):
              yield (before.path(), after.path(), similarity)

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

David Greenaway Move 'findrenames' code into its own file....	r11059	# similar.py - mechanisms for finding similar files
		#
		# Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
		#
		# This software may be used and distributed according to the terms of the
		# GNU General Public License version 2 or any later version.

Gregory Szorc similar: use absolute_import	r27359	from __future__ import absolute_import

		from .i18n import _
		from . import (
		mdiff,
		)
David Greenaway Move 'findrenames' code into its own file....	r11059
David Greenaway findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes....	r11060	def _findexactmatches(repo, added, removed):
		'''find renamed files that have no changes

		Takes a list of new filectxs and a list of removed filectxs, and yields
		(before, after) tuples of exact matches.
		'''
		numfiles = len(added) + len(removed)

Yuya Nishihara similar: use cheaper hash() function to test exact matches...	r31584	# Build table of removed files: {hash(fctx.data()): [fctx, ...]}.
		# We use hash() to discard fctx.data() from memory.
David Greenaway findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes....	r11060	hashes = {}
Yuya Nishihara similar: use cheaper hash() function to test exact matches...	r31584	for i, fctx in enumerate(removed):
av6 similar: specify unit for ui.progress when operating on files	r28468	repo.ui.progress(_('searching for exact renames'), i, total=numfiles,
		unit=_('files'))
Yuya Nishihara similar: use cheaper hash() function to test exact matches...	r31584	h = hash(fctx.data())
		if h not in hashes:
		hashes[h] = [fctx]
		else:
		hashes[h].append(fctx)
David Greenaway findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes....	r11060
		# For each added file, see if it corresponds to a removed file.
		for i, fctx in enumerate(added):
		repo.ui.progress(_('searching for exact renames'), i + len(removed),
av6 similar: specify unit for ui.progress when operating on files	r28468	total=numfiles, unit=_('files'))
FUJIWARA Katsunori similar: compare between actual file contents for exact identity...	r31210	adata = fctx.data()
Yuya Nishihara similar: use cheaper hash() function to test exact matches...	r31584	h = hash(adata)
		for rfctx in hashes.get(h, []):
FUJIWARA Katsunori similar: compare between actual file contents for exact identity...	r31210	# compare between actual file contents for exact identity
		if adata == rfctx.data():
		yield (rfctx, fctx)
Yuya Nishihara similar: use cheaper hash() function to test exact matches...	r31584	break
David Greenaway findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes....	r11060
		# Done
		repo.ui.progress(_('searching for exact renames'), None)

Sean Farley similar: move score function to module level...	r30805	def _ctxdata(fctx):
		# lazily load text
		orig = fctx.data()
		return orig, mdiff.splitnewlines(orig)

Pierre-Yves David similar: remove caching from the module level...	r30809	def _score(fctx, otherdata):
		orig, lines = otherdata
		text = fctx.data()
Yuya Nishihara bdiff: proxy through mdiff module...	r32201	# mdiff.blocks() returns blocks of matching lines
Sean Farley similar: move score function to module level...	r30805	# count the number of bytes in each
		equal = 0
Yuya Nishihara bdiff: proxy through mdiff module...	r32201	matches = mdiff.blocks(text, orig)
Sean Farley similar: move score function to module level...	r30805	for x1, x2, y1, y2 in matches:
		for line in lines[y1:y2]:
		equal += len(line)

		lengths = len(text) + len(orig)
		return equal * 2.0 / lengths

Pierre-Yves David similar: remove caching from the module level...	r30809	def score(fctx1, fctx2):
		return _score(fctx1, _ctxdata(fctx2))

David Greenaway findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes....	r11060	def _findsimilarmatches(repo, added, removed, threshold):
		'''find potentially renamed files based on similar file content

		Takes a list of new filectxs and a list of removed filectxs, and yields
		(before, after, score) tuples of partial matches.
		'''
David Greenaway Move 'findrenames' code into its own file....	r11059	copies = {}
		for i, r in enumerate(removed):
Brodie Rao cleanup: eradicate long lines	r16683	repo.ui.progress(_('searching for similar files'), i,
av6 similar: specify unit for ui.progress when operating on files	r28468	total=len(removed), unit=_('files'))
David Greenaway Move 'findrenames' code into its own file....	r11059
Pierre-Yves David similar: remove caching from the module level...	r30809	data = None
David Greenaway Move 'findrenames' code into its own file....	r11059	for a in added:
		bestscore = copies.get(a, (None, threshold))[1]
Pierre-Yves David similar: remove caching from the module level...	r30809	if data is None:
		data = _ctxdata(r)
		myscore = _score(a, data)
Yuya Nishihara similar: take the first match instead of the last...	r31583	if myscore > bestscore:
David Greenaway Move 'findrenames' code into its own file....	r11059	copies[a] = (r, myscore)
		repo.ui.progress(_('searching'), None)

		for dest, v in copies.iteritems():
Sean Farley similar: rename local variable to not collide with previous...	r30791	source, bscore = v
		yield source, dest, bscore
David Greenaway Move 'findrenames' code into its own file....	r11059
Yuya Nishihara similar: do not look up and create filectx more than once...	r31582	def _dropempty(fctxs):
		return [x for x in fctxs if x.size() > 0]

David Greenaway findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes....	r11060	def findrenames(repo, added, removed, threshold):
		'''find renamed files -- yields (before, after, score) tuples'''
Yuya Nishihara similar: use common names for changectx variables...	r31581	wctx = repo[None]
		pctx = wctx.p1()
David Greenaway Move 'findrenames' code into its own file....	r11059
David Greenaway findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes....	r11060	# Zero length files will be frequently unrelated to each other, and
		# tracking the deletion/addition of such a file will probably cause more
		# harm than good. We strip them out here to avoid matching them later on.
Yuya Nishihara similar: do not look up and create filectx more than once...	r31582	addedfiles = _dropempty(wctx[fp] for fp in sorted(added))
		removedfiles = _dropempty(pctx[fp] for fp in sorted(removed) if fp in pctx)
David Greenaway findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes....	r11060
		# Find exact matches.
Yuya Nishihara similar: get rid of quadratic addedfiles.remove()...	r31580	matchedfiles = set()
		for (a, b) in _findexactmatches(repo, addedfiles, removedfiles):
		matchedfiles.add(b)
David Greenaway findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes....	r11060	yield (a.path(), b.path(), 1.0)

		# If the user requested similar files to be matched, search for them also.
		if threshold < 1.0:
Yuya Nishihara similar: get rid of quadratic addedfiles.remove()...	r31580	addedfiles = [x for x in addedfiles if x not in matchedfiles]
Yuya Nishihara similar: sort files not by object id but by path for stable result...	r31579	for (a, b, score) in _findsimilarmatches(repo, addedfiles,
		removedfiles, threshold):
David Greenaway findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes....	r11060	yield (a.path(), b.path(), score)