upstream/mercurial-mirror Commit - r11060:e6df0177

findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes....

David Greenaway -

r11060:e6df0177 default

parent child

mercurial/similar.py

0 +59 -15

              # similar.py - mechanisms for finding similar files
              #
              # Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
              #
              # This software may be used and distributed according to the terms of the
              # GNU General Public License version 2 or any later version.
              from i18n import _
              import util
              import mdiff
              import bdiff
-             def findrenames(repo, added, removed, threshold):
-                 '''find renamed files -- yields (before, after, score) tuples'''
+             def _findexactmatches(repo, added, removed):
+                 '''find renamed files that have no changes
+                 Takes a list of new filectxs and a list of removed filectxs, and yields
+                 (before, after) tuples of exact matches.
+                 '''
+                 numfiles = len(added) + len(removed)
+                 # Get hashes of removed files.
+                 hashes = {}
+                 for i, fctx in enumerate(removed):
+                     repo.ui.progress(_('searching for exact renames'), i, total=numfiles)
+                     h = util.sha1(fctx.data()).digest()
+                     hashes[h] = fctx
+                 # For each added file, see if it corresponds to a removed file.
+                 for i, fctx in enumerate(added):
+                     repo.ui.progress(_('searching for exact renames'), i + len(removed),
+                             total=numfiles)
+                     h = util.sha1(fctx.data()).digest()
+                     if h in hashes:
+                         yield (hashes[h], fctx)
+                 # Done
+                 repo.ui.progress(_('searching for exact renames'), None)
+             def _findsimilarmatches(repo, added, removed, threshold):
+                 '''find potentially renamed files based on similar file content
+                 Takes a list of new filectxs and a list of removed filectxs, and yields
+                 (before, after, score) tuples of partial matches.
+                 '''
                  copies = {}
-                 ctx = repo['.']
                  for i, r in enumerate(removed):
-                     repo.ui.progress(_('searching'), i, total=len(removed))
-                     if r not in ctx:
-                         continue
-                     fctx = ctx.filectx(r)
+                     repo.ui.progress(_('searching for similar files'), i, total=len(removed))
                      # lazily load text
                      @util.cachefunc
                      def data():
-                         orig = fctx.data()
+                         orig = r.data()
                          return orig, mdiff.splitnewlines(orig)
                      def score(text):
-                         if not len(text):
-                             return 0.0
-                         if not fctx.cmp(text):
-                             return 1.0
-                         if threshold == 1.0:
-                             return 0.0
                          orig, lines = data()
                          # bdiff.blocks() returns blocks of matching lines
                          # count the number of bytes in each
                          equal = 0
                          matches = bdiff.blocks(text, orig)
                          for x1, x2, y1, y2 in matches:
                              for line in lines[y1:y2]:
                                  equal += len(line)
                          lengths = len(text) + len(orig)
                          return equal * 2.0 / lengths
                      for a in added:
                          bestscore = copies.get(a, (None, threshold))[1]
-                         myscore = score(repo.wread(a))
+                         myscore = score(a.data())
                          if myscore >= bestscore:
                              copies[a] = (r, myscore)
                  repo.ui.progress(_('searching'), None)
                  for dest, v in copies.iteritems():
                      source, score = v
                      yield source, dest, score
+             def findrenames(repo, added, removed, threshold):
+                 '''find renamed files -- yields (before, after, score) tuples'''
+                 parentctx = repo['.']
+                 workingctx = repo[None]
+                 # Zero length files will be frequently unrelated to each other, and
+                 # tracking the deletion/addition of such a file will probably cause more
+                 # harm than good. We strip them out here to avoid matching them later on.
+                 addedfiles = set([workingctx[fp] for fp in added
+                         if workingctx[fp].size() > 0])
+                 removedfiles = set([parentctx[fp] for fp in removed
+                         if fp in parentctx and parentctx[fp].size() > 0])
+                 # Find exact matches.
+                 for (a, b) in _findexactmatches(repo,
+                         sorted(addedfiles),sorted( removedfiles)):
+                     addedfiles.remove(b)
+                     yield (a.path(), b.path(), 1.0)
+                 # If the user requested similar files to be matched, search for them also.
+                 if threshold < 1.0:
+                     for (a, b, score) in _findsimilarmatches(repo,
+                             sorted(addedfiles), sorted(removedfiles), threshold):
+                         yield (a.path(), b.path(), score)

General Comments 0

Write
Preview

You need to be logged in to leave comments. Login now

No TODOs yet

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages