upstream/mercurial-mirror Commit - r31583:2efd9771

similar: take the first match instead of the last...

Yuya Nishihara -

r31583:2efd9771 default

parent child

mercurial/similar.py

0 +2 -2

              # similar.py - mechanisms for finding similar files
              #
              # Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
              #
              # This software may be used and distributed according to the terms of the
              # GNU General Public License version 2 or any later version.
              from __future__ import absolute_import
              import hashlib
              from .i18n import _
              from . import (
                  bdiff,
                  mdiff,
              )
              def _findexactmatches(repo, added, removed):
                  '''find renamed files that have no changes
                  Takes a list of new filectxs and a list of removed filectxs, and yields
                  (before, after) tuples of exact matches.
                  '''
                  # Both loops below report into the same progress topic, so the total
                  # is the combined number of removed and added files.
                  numfiles = len(added) + len(removed)
                  # Get hashes of removed files.
                  hashes = {}
                  # Each digest keeps only the fctx stored last (see the assignment
                  # to hashes[h] below). Walking removed in reverse (the '+' side of
                  # this hunk) therefore leaves the earliest entry of removed in the
                  # table when several removed files share the same content.
-                 for i, fctx in enumerate(removed):
+                 for i, fctx in enumerate(reversed(removed)):
                      repo.ui.progress(_('searching for exact renames'), i, total=numfiles,
                                       unit=_('files'))
                      h = hashlib.sha1(fctx.data()).digest()
                      # a later fctx with the same digest replaces the earlier one
                      hashes[h] = fctx
                  # For each added file, see if it corresponds to a removed file.
                  for i, fctx in enumerate(added):
                      # position is offset by len(removed) so that it continues from
                      # where the first loop stopped
                      repo.ui.progress(_('searching for exact renames'), i + len(removed),
                              total=numfiles, unit=_('files'))
                      adata = fctx.data()
                      h = hashlib.sha1(adata).digest()
                      if h in hashes:
                          rfctx = hashes[h]
                          # compare between actual file contents for exact identity
                          if adata == rfctx.data():
                              yield (rfctx, fctx)
                  # Done
                  repo.ui.progress(_('searching for exact renames'), None)
              def _ctxdata(fctx):
                  '''return a (data, split) pair for the given filectx

                  data is the result of fctx.data() and split is what
                  mdiff.splitnewlines() returns for it.
                  '''
                  # the content is read once and reused for the split
                  raw = fctx.data()
                  pieces = mdiff.splitnewlines(raw)
                  return raw, pieces
              def _score(fctx, otherdata):
                  '''return the similarity ratio between fctx and already loaded data

                  otherdata is an (orig, lines) pair, as produced by _ctxdata(). The
                  result is twice the number of bytes in the matching lines divided by
                  the combined length of both texts. Raises ZeroDivisionError when both
                  texts are empty.
                  '''
                  orig, lines = otherdata
                  text = fctx.data()
                  # bdiff.blocks() returns blocks of matching lines; each block is
                  # unpacked as (x1, x2, y1, y2) and only the y range is used, to pick
                  # the entries of lines whose byte lengths are summed
                  matched = sum(len(line)
                                for _x1, _x2, y1, y2 in bdiff.blocks(text, orig)
                                for line in lines[y1:y2])
                  total = len(text) + len(orig)
                  return matched * 2.0 / total
              def score(fctx1, fctx2):
                  '''return the similarity score of fctx1 compared against fctx2'''
                  # fctx2 is loaded and split first, then fctx1 is scored against it
                  prepared = _ctxdata(fctx2)
                  return _score(fctx1, prepared)
              def _findsimilarmatches(repo, added, removed, threshold):
                  '''find potentially renamed files based on similar file content
                  Takes a list of new filectxs and a list of removed filectxs, and yields
                  (before, after, score) tuples of partial matches.
                  '''
                  # Maps each added fctx to the best (removed fctx, score) pair seen so
                  # far; an added fctx without an entry has no candidate yet.
                  copies = {}
                  for i, r in enumerate(removed):
                      repo.ui.progress(_('searching for similar files'), i,
                                       total=len(removed), unit=_('files'))
                      # the data of r is only loaded once there is an added file to
                      # compare it with, and is then reused for the remaining ones
                      data = None
                      for a in added:
                          # with no candidate recorded for a, the score to beat is the
                          # threshold itself
                          bestscore = copies.get(a, (None, threshold))[1]
                          if data is None:
                              data = _ctxdata(r)
                          myscore = _score(a, data)
                          # On a tie, '>=' (the '-' side) lets a later removed file
                          # replace the current candidate, while '>' (the '+' side)
                          # keeps the one found first. With '>' a score exactly equal
                          # to threshold is not recorded either.
-                         if myscore >= bestscore:
+                         if myscore > bestscore:
                              copies[a] = (r, myscore)
                  # NOTE(review): this call uses the topic 'searching' while the loop
                  # above reports under 'searching for similar files' -- confirm that
                  # this is intended and actually ends that progress topic.
                  repo.ui.progress(_('searching'), None)
                  # each entry is yielded as (removed fctx, added fctx, score)
                  for dest, v in copies.iteritems():
                      source, bscore = v
                      yield source, dest, bscore
              def _dropempty(fctxs):
                  '''return a list of the given filectxs whose size() is greater than zero'''
                  kept = []
                  for fctx in fctxs:
                      if fctx.size() > 0:
                          kept.append(fctx)
                  return kept
              def findrenames(repo, added, removed, threshold):
                  '''find renamed files -- yields (before, after, score) tuples

                  added and removed are iterables of file names. Exact matches are
                  yielded first with a score of 1.0; similar matches follow only when
                  threshold is below 1.0.
                  '''
                  workingctx = repo[None]
                  parentctx = workingctx.p1()
                  # Zero length files will be frequently unrelated to each other, and
                  # tracking the deletion/addition of such a file will probably cause more
                  # harm than good, so they are stripped from both lists before matching.
                  newfctxs = _dropempty(workingctx[name] for name in sorted(added))
                  oldfctxs = _dropempty(parentctx[name] for name in sorted(removed)
                                        if name in parentctx)
                  # Exact matches come first; remember which added files they consumed.
                  exactdests = set()
                  for old, new in _findexactmatches(repo, newfctxs, oldfctxs):
                      exactdests.add(new)
                      yield (old.path(), new.path(), 1.0)
                  # Similar files are only searched for if the user asked for it.
                  if not threshold < 1.0:
                      return
                  remaining = [fctx for fctx in newfctxs if fctx not in exactdests]
                  for old, new, ratio in _findsimilarmatches(repo, remaining,
                                                             oldfctxs, threshold):
                      yield (old.path(), new.path(), ratio)

tests/test-addremove-similar.t

0 +2 -2

                $ hg init rep; cd rep
                $ touch empty-file
                $ $PYTHON -c 'for x in range(10000): print(x)' > large-file
                $ hg addremove
                adding empty-file
                adding large-file
                $ hg commit -m A
                $ rm large-file empty-file
                $ $PYTHON -c 'for x in range(10,10000): print(x)' > another-file
                $ hg addremove -s50
                adding another-file
                removing empty-file
                removing large-file
                recording removal of large-file as rename to another-file (99% similar)
                $ hg commit -m B
              comparing two empty files caused ZeroDivisionError in the past
                $ hg update -C 0
                2 files updated, 0 files merged, 1 files removed, 0 files unresolved
                $ rm empty-file
                $ touch another-empty-file
                $ hg addremove -s50
                adding another-empty-file
                removing empty-file
                $ cd ..
                $ hg init rep2; cd rep2
                $ $PYTHON -c 'for x in range(10000): print(x)' > large-file
                $ $PYTHON -c 'for x in range(50): print(x)' > tiny-file
                $ hg addremove
                adding large-file
                adding tiny-file
                $ hg commit -m A
                $ $PYTHON -c 'for x in range(70): print(x)' > small-file
                $ rm tiny-file
                $ rm large-file
                $ hg addremove -s50
                removing large-file
                adding small-file
                removing tiny-file
                recording removal of tiny-file as rename to small-file (82% similar)
                $ hg commit -m B
              should be sorted by path for stable result
                $ for i in `python $TESTDIR/seq.py 0 9`; do
                >     cp small-file $i
                > done
                $ rm small-file
                $ hg addremove
                adding 0
                adding 1
                adding 2
                adding 3
                adding 4
                adding 5
                adding 6
                adding 7
                adding 8
                adding 9
                removing small-file
                recording removal of small-file as rename to 0 (100% similar)
                recording removal of small-file as rename to 1 (100% similar)
                recording removal of small-file as rename to 2 (100% similar)
                recording removal of small-file as rename to 3 (100% similar)
                recording removal of small-file as rename to 4 (100% similar)
                recording removal of small-file as rename to 5 (100% similar)
                recording removal of small-file as rename to 6 (100% similar)
                recording removal of small-file as rename to 7 (100% similar)
                recording removal of small-file as rename to 8 (100% similar)
                recording removal of small-file as rename to 9 (100% similar)
                $ hg commit -m '10 same files'
              pick one from many identical files
                $ cp 0 a
                $ rm `python $TESTDIR/seq.py 0 9`
                $ hg addremove
                removing 0
                removing 1
                removing 2
                removing 3
                removing 4
                removing 5
                removing 6
                removing 7
                removing 8
                removing 9
                adding a
-               recording removal of 9 as rename to a (100% similar)
+               recording removal of 0 as rename to a (100% similar)
                $ hg revert -aq
              pick one from many similar files
                $ cp 0 a
                $ for i in `python $TESTDIR/seq.py 0 9`; do
                >     echo $i >> $i
                > done
                $ hg commit -m 'make them slightly different'
                $ rm `python $TESTDIR/seq.py 0 9`
                $ hg addremove -s50
                removing 0
                removing 1
                removing 2
                removing 3
                removing 4
                removing 5
                removing 6
                removing 7
                removing 8
                removing 9
                adding a
-               recording removal of 9 as rename to a (99% similar)
+               recording removal of 0 as rename to a (99% similar)
                $ hg commit -m 'always the same file should be selected'
              should all fail
                $ hg addremove -s foo
                abort: similarity must be a number
                [255]
                $ hg addremove -s -1
                abort: similarity must be between 0 and 100
                [255]
                $ hg addremove -s 1e6
                abort: similarity must be between 0 and 100
                [255]
                $ cd ..
              Issue1527: repeated addremove causes Abort
                $ hg init rep3; cd rep3
                $ mkdir d
                $ echo a > d/a
                $ hg add d/a
                $ hg commit -m 1
                $ mv d/a d/b
                $ hg addremove -s80
                removing d/a
                adding d/b
                recording removal of d/a as rename to d/b (100% similar) (glob)
                $ hg debugstate
                r   0          0 1970-01-01 00:00:00 d/a
                a   0         -1 unset               d/b
                copy: d/a -> d/b
                $ mv d/b c
              no copies found here (since the target isn't in d)
                $ hg addremove -s80 d
                removing d/b (glob)
              copies here
                $ hg addremove -s80
                adding c
                recording removal of d/a as rename to c (100% similar) (glob)
                $ cd ..

General Comments 0

Write
Preview

You need to be logged in to leave comments. Login now

No TODOs yet

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages