# HG changeset patch # User Yuya Nishihara # Date 2015-03-15 09:58:56 # Node ID 3a383caa97f49efd7b4a68b287966b2bb8084f65 # Parent 6262e30b0c09903279bc0215af8c622a557d393d similar: sort files not by object id but by path for stable result Perhaps the original implementation would want to sort added/removed files alphabetically, but actually it did sort fctx objects by memory location. This patch removes the use of set()s in order to preserve the order of added/removed files. addedfiles.remove() becomes quadratic, but its cost appears not dominant. Anyway, the quadratic behavior will be eliminated by the next patch. Benchmark with 50k added/removed files, on tmpfs: $ mkdir src $ for n in `seq 0 49`; do > mkdir `printf src/%02d $n` > done $ for n in `seq 0 49999`; do > f=`printf src/%02d/%05d $(($n/1000)) $n` > dd if=/dev/urandom of=$f bs=8k count=1 status=none > done $ hg ci -qAm 'add 50k files of random content' $ mv src dest $ hg addremove --dry-run --time -q original: real 16.550 secs (user 15.000+0.000 sys 1.540+0.000) this patch: real 16.730 secs (user 15.280+0.000 sys 1.440+0.000) diff --git a/mercurial/similar.py b/mercurial/similar.py --- a/mercurial/similar.py +++ b/mercurial/similar.py @@ -101,19 +101,18 @@ def findrenames(repo, added, removed, th # Zero length files will be frequently unrelated to each other, and # tracking the deletion/addition of such a file will probably cause more # harm than good. We strip them out here to avoid matching them later on. - addedfiles = set([workingctx[fp] for fp in added - if workingctx[fp].size() > 0]) - removedfiles = set([parentctx[fp] for fp in removed - if fp in parentctx and parentctx[fp].size() > 0]) + addedfiles = [workingctx[fp] for fp in sorted(added) + if workingctx[fp].size() > 0] + removedfiles = [parentctx[fp] for fp in sorted(removed) + if fp in parentctx and parentctx[fp].size() > 0] # Find exact matches. - for (a, b) in _findexactmatches(repo, - sorted(addedfiles), sorted(removedfiles)): + for (a, b) in _findexactmatches(repo, addedfiles[:], removedfiles): addedfiles.remove(b) yield (a.path(), b.path(), 1.0) # If the user requested similar files to be matched, search for them also. if threshold < 1.0: - for (a, b, score) in _findsimilarmatches(repo, - sorted(addedfiles), sorted(removedfiles), threshold): + for (a, b, score) in _findsimilarmatches(repo, addedfiles, + removedfiles, threshold): yield (a.path(), b.path(), score) diff --git a/tests/test-addremove-similar.t b/tests/test-addremove-similar.t --- a/tests/test-addremove-similar.t +++ b/tests/test-addremove-similar.t @@ -55,6 +55,78 @@ comparing two empty files caused ZeroDiv $ hg commit -m B +should be sorted by path for stable result + + $ for i in `python $TESTDIR/seq.py 0 9`; do + > cp small-file $i + > done + $ rm small-file + $ hg addremove + adding 0 + adding 1 + adding 2 + adding 3 + adding 4 + adding 5 + adding 6 + adding 7 + adding 8 + adding 9 + removing small-file + recording removal of small-file as rename to 0 (100% similar) + recording removal of small-file as rename to 1 (100% similar) + recording removal of small-file as rename to 2 (100% similar) + recording removal of small-file as rename to 3 (100% similar) + recording removal of small-file as rename to 4 (100% similar) + recording removal of small-file as rename to 5 (100% similar) + recording removal of small-file as rename to 6 (100% similar) + recording removal of small-file as rename to 7 (100% similar) + recording removal of small-file as rename to 8 (100% similar) + recording removal of small-file as rename to 9 (100% similar) + $ hg commit -m '10 same files' + +pick one from many identical files + + $ cp 0 a + $ rm `python $TESTDIR/seq.py 0 9` + $ hg addremove + removing 0 + removing 1 + removing 2 + removing 3 + removing 4 + removing 5 + removing 6 + removing 7 + removing 8 + removing 9 + adding a + recording removal of 9 as rename to a (100% similar) + $ hg revert -aq + +pick one from many similar files + + $ cp 0 a + $ for i in `python $TESTDIR/seq.py 0 9`; do + > echo $i >> $i + > done + $ hg commit -m 'make them slightly different' + $ rm `python $TESTDIR/seq.py 0 9` + $ hg addremove -s50 + removing 0 + removing 1 + removing 2 + removing 3 + removing 4 + removing 5 + removing 6 + removing 7 + removing 8 + removing 9 + adding a + recording removal of 9 as rename to a (99% similar) + $ hg commit -m 'always the same file should be selected' + should all fail $ hg addremove -s foo