##// END OF EJS Templates
similar: take the first match instead of the last...
Yuya Nishihara -
r31583:2efd9771 default
parent child Browse files
Show More
@@ -1,121 +1,121
1 # similar.py - mechanisms for finding similar files
1 # similar.py - mechanisms for finding similar files
2 #
2 #
3 # Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
3 # Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8 from __future__ import absolute_import
8 from __future__ import absolute_import
9
9
10 import hashlib
10 import hashlib
11
11
12 from .i18n import _
12 from .i18n import _
13 from . import (
13 from . import (
14 bdiff,
14 bdiff,
15 mdiff,
15 mdiff,
16 )
16 )
17
17
18 def _findexactmatches(repo, added, removed):
18 def _findexactmatches(repo, added, removed):
19 '''find renamed files that have no changes
19 '''find renamed files that have no changes
20
20
21 Takes a list of new filectxs and a list of removed filectxs, and yields
21 Takes a list of new filectxs and a list of removed filectxs, and yields
22 (before, after) tuples of exact matches.
22 (before, after) tuples of exact matches.
23 '''
23 '''
24 numfiles = len(added) + len(removed)
24 numfiles = len(added) + len(removed)
25
25
26 # Get hashes of removed files.
26 # Get hashes of removed files.
27 hashes = {}
27 hashes = {}
28 for i, fctx in enumerate(removed):
28 for i, fctx in enumerate(reversed(removed)):
29 repo.ui.progress(_('searching for exact renames'), i, total=numfiles,
29 repo.ui.progress(_('searching for exact renames'), i, total=numfiles,
30 unit=_('files'))
30 unit=_('files'))
31 h = hashlib.sha1(fctx.data()).digest()
31 h = hashlib.sha1(fctx.data()).digest()
32 hashes[h] = fctx
32 hashes[h] = fctx
33
33
34 # For each added file, see if it corresponds to a removed file.
34 # For each added file, see if it corresponds to a removed file.
35 for i, fctx in enumerate(added):
35 for i, fctx in enumerate(added):
36 repo.ui.progress(_('searching for exact renames'), i + len(removed),
36 repo.ui.progress(_('searching for exact renames'), i + len(removed),
37 total=numfiles, unit=_('files'))
37 total=numfiles, unit=_('files'))
38 adata = fctx.data()
38 adata = fctx.data()
39 h = hashlib.sha1(adata).digest()
39 h = hashlib.sha1(adata).digest()
40 if h in hashes:
40 if h in hashes:
41 rfctx = hashes[h]
41 rfctx = hashes[h]
42 # compare between actual file contents for exact identity
42 # compare between actual file contents for exact identity
43 if adata == rfctx.data():
43 if adata == rfctx.data():
44 yield (rfctx, fctx)
44 yield (rfctx, fctx)
45
45
46 # Done
46 # Done
47 repo.ui.progress(_('searching for exact renames'), None)
47 repo.ui.progress(_('searching for exact renames'), None)
48
48
49 def _ctxdata(fctx):
49 def _ctxdata(fctx):
50 # lazily load text
50 # lazily load text
51 orig = fctx.data()
51 orig = fctx.data()
52 return orig, mdiff.splitnewlines(orig)
52 return orig, mdiff.splitnewlines(orig)
53
53
54 def _score(fctx, otherdata):
54 def _score(fctx, otherdata):
55 orig, lines = otherdata
55 orig, lines = otherdata
56 text = fctx.data()
56 text = fctx.data()
57 # bdiff.blocks() returns blocks of matching lines
57 # bdiff.blocks() returns blocks of matching lines
58 # count the number of bytes in each
58 # count the number of bytes in each
59 equal = 0
59 equal = 0
60 matches = bdiff.blocks(text, orig)
60 matches = bdiff.blocks(text, orig)
61 for x1, x2, y1, y2 in matches:
61 for x1, x2, y1, y2 in matches:
62 for line in lines[y1:y2]:
62 for line in lines[y1:y2]:
63 equal += len(line)
63 equal += len(line)
64
64
65 lengths = len(text) + len(orig)
65 lengths = len(text) + len(orig)
66 return equal * 2.0 / lengths
66 return equal * 2.0 / lengths
67
67
68 def score(fctx1, fctx2):
68 def score(fctx1, fctx2):
69 return _score(fctx1, _ctxdata(fctx2))
69 return _score(fctx1, _ctxdata(fctx2))
70
70
71 def _findsimilarmatches(repo, added, removed, threshold):
71 def _findsimilarmatches(repo, added, removed, threshold):
72 '''find potentially renamed files based on similar file content
72 '''find potentially renamed files based on similar file content
73
73
74 Takes a list of new filectxs and a list of removed filectxs, and yields
74 Takes a list of new filectxs and a list of removed filectxs, and yields
75 (before, after, score) tuples of partial matches.
75 (before, after, score) tuples of partial matches.
76 '''
76 '''
77 copies = {}
77 copies = {}
78 for i, r in enumerate(removed):
78 for i, r in enumerate(removed):
79 repo.ui.progress(_('searching for similar files'), i,
79 repo.ui.progress(_('searching for similar files'), i,
80 total=len(removed), unit=_('files'))
80 total=len(removed), unit=_('files'))
81
81
82 data = None
82 data = None
83 for a in added:
83 for a in added:
84 bestscore = copies.get(a, (None, threshold))[1]
84 bestscore = copies.get(a, (None, threshold))[1]
85 if data is None:
85 if data is None:
86 data = _ctxdata(r)
86 data = _ctxdata(r)
87 myscore = _score(a, data)
87 myscore = _score(a, data)
88 if myscore >= bestscore:
88 if myscore > bestscore:
89 copies[a] = (r, myscore)
89 copies[a] = (r, myscore)
90 repo.ui.progress(_('searching'), None)
90 repo.ui.progress(_('searching'), None)
91
91
92 for dest, v in copies.iteritems():
92 for dest, v in copies.iteritems():
93 source, bscore = v
93 source, bscore = v
94 yield source, dest, bscore
94 yield source, dest, bscore
95
95
96 def _dropempty(fctxs):
96 def _dropempty(fctxs):
97 return [x for x in fctxs if x.size() > 0]
97 return [x for x in fctxs if x.size() > 0]
98
98
99 def findrenames(repo, added, removed, threshold):
99 def findrenames(repo, added, removed, threshold):
100 '''find renamed files -- yields (before, after, score) tuples'''
100 '''find renamed files -- yields (before, after, score) tuples'''
101 wctx = repo[None]
101 wctx = repo[None]
102 pctx = wctx.p1()
102 pctx = wctx.p1()
103
103
104 # Zero length files will be frequently unrelated to each other, and
104 # Zero length files will be frequently unrelated to each other, and
105 # tracking the deletion/addition of such a file will probably cause more
105 # tracking the deletion/addition of such a file will probably cause more
106 # harm than good. We strip them out here to avoid matching them later on.
106 # harm than good. We strip them out here to avoid matching them later on.
107 addedfiles = _dropempty(wctx[fp] for fp in sorted(added))
107 addedfiles = _dropempty(wctx[fp] for fp in sorted(added))
108 removedfiles = _dropempty(pctx[fp] for fp in sorted(removed) if fp in pctx)
108 removedfiles = _dropempty(pctx[fp] for fp in sorted(removed) if fp in pctx)
109
109
110 # Find exact matches.
110 # Find exact matches.
111 matchedfiles = set()
111 matchedfiles = set()
112 for (a, b) in _findexactmatches(repo, addedfiles, removedfiles):
112 for (a, b) in _findexactmatches(repo, addedfiles, removedfiles):
113 matchedfiles.add(b)
113 matchedfiles.add(b)
114 yield (a.path(), b.path(), 1.0)
114 yield (a.path(), b.path(), 1.0)
115
115
116 # If the user requested similar files to be matched, search for them also.
116 # If the user requested similar files to be matched, search for them also.
117 if threshold < 1.0:
117 if threshold < 1.0:
118 addedfiles = [x for x in addedfiles if x not in matchedfiles]
118 addedfiles = [x for x in addedfiles if x not in matchedfiles]
119 for (a, b, score) in _findsimilarmatches(repo, addedfiles,
119 for (a, b, score) in _findsimilarmatches(repo, addedfiles,
120 removedfiles, threshold):
120 removedfiles, threshold):
121 yield (a.path(), b.path(), score)
121 yield (a.path(), b.path(), score)
@@ -1,174 +1,174
1 $ hg init rep; cd rep
1 $ hg init rep; cd rep
2
2
3 $ touch empty-file
3 $ touch empty-file
4 $ $PYTHON -c 'for x in range(10000): print(x)' > large-file
4 $ $PYTHON -c 'for x in range(10000): print(x)' > large-file
5
5
6 $ hg addremove
6 $ hg addremove
7 adding empty-file
7 adding empty-file
8 adding large-file
8 adding large-file
9
9
10 $ hg commit -m A
10 $ hg commit -m A
11
11
12 $ rm large-file empty-file
12 $ rm large-file empty-file
13 $ $PYTHON -c 'for x in range(10,10000): print(x)' > another-file
13 $ $PYTHON -c 'for x in range(10,10000): print(x)' > another-file
14
14
15 $ hg addremove -s50
15 $ hg addremove -s50
16 adding another-file
16 adding another-file
17 removing empty-file
17 removing empty-file
18 removing large-file
18 removing large-file
19 recording removal of large-file as rename to another-file (99% similar)
19 recording removal of large-file as rename to another-file (99% similar)
20
20
21 $ hg commit -m B
21 $ hg commit -m B
22
22
23 comparing two empty files caused ZeroDivisionError in the past
23 comparing two empty files caused ZeroDivisionError in the past
24
24
25 $ hg update -C 0
25 $ hg update -C 0
26 2 files updated, 0 files merged, 1 files removed, 0 files unresolved
26 2 files updated, 0 files merged, 1 files removed, 0 files unresolved
27 $ rm empty-file
27 $ rm empty-file
28 $ touch another-empty-file
28 $ touch another-empty-file
29 $ hg addremove -s50
29 $ hg addremove -s50
30 adding another-empty-file
30 adding another-empty-file
31 removing empty-file
31 removing empty-file
32
32
33 $ cd ..
33 $ cd ..
34
34
35 $ hg init rep2; cd rep2
35 $ hg init rep2; cd rep2
36
36
37 $ $PYTHON -c 'for x in range(10000): print(x)' > large-file
37 $ $PYTHON -c 'for x in range(10000): print(x)' > large-file
38 $ $PYTHON -c 'for x in range(50): print(x)' > tiny-file
38 $ $PYTHON -c 'for x in range(50): print(x)' > tiny-file
39
39
40 $ hg addremove
40 $ hg addremove
41 adding large-file
41 adding large-file
42 adding tiny-file
42 adding tiny-file
43
43
44 $ hg commit -m A
44 $ hg commit -m A
45
45
46 $ $PYTHON -c 'for x in range(70): print(x)' > small-file
46 $ $PYTHON -c 'for x in range(70): print(x)' > small-file
47 $ rm tiny-file
47 $ rm tiny-file
48 $ rm large-file
48 $ rm large-file
49
49
50 $ hg addremove -s50
50 $ hg addremove -s50
51 removing large-file
51 removing large-file
52 adding small-file
52 adding small-file
53 removing tiny-file
53 removing tiny-file
54 recording removal of tiny-file as rename to small-file (82% similar)
54 recording removal of tiny-file as rename to small-file (82% similar)
55
55
56 $ hg commit -m B
56 $ hg commit -m B
57
57
58 should be sorted by path for stable result
58 should be sorted by path for stable result
59
59
60 $ for i in `python $TESTDIR/seq.py 0 9`; do
60 $ for i in `python $TESTDIR/seq.py 0 9`; do
61 > cp small-file $i
61 > cp small-file $i
62 > done
62 > done
63 $ rm small-file
63 $ rm small-file
64 $ hg addremove
64 $ hg addremove
65 adding 0
65 adding 0
66 adding 1
66 adding 1
67 adding 2
67 adding 2
68 adding 3
68 adding 3
69 adding 4
69 adding 4
70 adding 5
70 adding 5
71 adding 6
71 adding 6
72 adding 7
72 adding 7
73 adding 8
73 adding 8
74 adding 9
74 adding 9
75 removing small-file
75 removing small-file
76 recording removal of small-file as rename to 0 (100% similar)
76 recording removal of small-file as rename to 0 (100% similar)
77 recording removal of small-file as rename to 1 (100% similar)
77 recording removal of small-file as rename to 1 (100% similar)
78 recording removal of small-file as rename to 2 (100% similar)
78 recording removal of small-file as rename to 2 (100% similar)
79 recording removal of small-file as rename to 3 (100% similar)
79 recording removal of small-file as rename to 3 (100% similar)
80 recording removal of small-file as rename to 4 (100% similar)
80 recording removal of small-file as rename to 4 (100% similar)
81 recording removal of small-file as rename to 5 (100% similar)
81 recording removal of small-file as rename to 5 (100% similar)
82 recording removal of small-file as rename to 6 (100% similar)
82 recording removal of small-file as rename to 6 (100% similar)
83 recording removal of small-file as rename to 7 (100% similar)
83 recording removal of small-file as rename to 7 (100% similar)
84 recording removal of small-file as rename to 8 (100% similar)
84 recording removal of small-file as rename to 8 (100% similar)
85 recording removal of small-file as rename to 9 (100% similar)
85 recording removal of small-file as rename to 9 (100% similar)
86 $ hg commit -m '10 same files'
86 $ hg commit -m '10 same files'
87
87
88 pick one from many identical files
88 pick one from many identical files
89
89
90 $ cp 0 a
90 $ cp 0 a
91 $ rm `python $TESTDIR/seq.py 0 9`
91 $ rm `python $TESTDIR/seq.py 0 9`
92 $ hg addremove
92 $ hg addremove
93 removing 0
93 removing 0
94 removing 1
94 removing 1
95 removing 2
95 removing 2
96 removing 3
96 removing 3
97 removing 4
97 removing 4
98 removing 5
98 removing 5
99 removing 6
99 removing 6
100 removing 7
100 removing 7
101 removing 8
101 removing 8
102 removing 9
102 removing 9
103 adding a
103 adding a
104 recording removal of 9 as rename to a (100% similar)
104 recording removal of 0 as rename to a (100% similar)
105 $ hg revert -aq
105 $ hg revert -aq
106
106
107 pick one from many similar files
107 pick one from many similar files
108
108
109 $ cp 0 a
109 $ cp 0 a
110 $ for i in `python $TESTDIR/seq.py 0 9`; do
110 $ for i in `python $TESTDIR/seq.py 0 9`; do
111 > echo $i >> $i
111 > echo $i >> $i
112 > done
112 > done
113 $ hg commit -m 'make them slightly different'
113 $ hg commit -m 'make them slightly different'
114 $ rm `python $TESTDIR/seq.py 0 9`
114 $ rm `python $TESTDIR/seq.py 0 9`
115 $ hg addremove -s50
115 $ hg addremove -s50
116 removing 0
116 removing 0
117 removing 1
117 removing 1
118 removing 2
118 removing 2
119 removing 3
119 removing 3
120 removing 4
120 removing 4
121 removing 5
121 removing 5
122 removing 6
122 removing 6
123 removing 7
123 removing 7
124 removing 8
124 removing 8
125 removing 9
125 removing 9
126 adding a
126 adding a
127 recording removal of 9 as rename to a (99% similar)
127 recording removal of 0 as rename to a (99% similar)
128 $ hg commit -m 'always the same file should be selected'
128 $ hg commit -m 'always the same file should be selected'
129
129
130 should all fail
130 should all fail
131
131
132 $ hg addremove -s foo
132 $ hg addremove -s foo
133 abort: similarity must be a number
133 abort: similarity must be a number
134 [255]
134 [255]
135 $ hg addremove -s -1
135 $ hg addremove -s -1
136 abort: similarity must be between 0 and 100
136 abort: similarity must be between 0 and 100
137 [255]
137 [255]
138 $ hg addremove -s 1e6
138 $ hg addremove -s 1e6
139 abort: similarity must be between 0 and 100
139 abort: similarity must be between 0 and 100
140 [255]
140 [255]
141
141
142 $ cd ..
142 $ cd ..
143
143
144 Issue1527: repeated addremove causes Abort
144 Issue1527: repeated addremove causes Abort
145
145
146 $ hg init rep3; cd rep3
146 $ hg init rep3; cd rep3
147 $ mkdir d
147 $ mkdir d
148 $ echo a > d/a
148 $ echo a > d/a
149 $ hg add d/a
149 $ hg add d/a
150 $ hg commit -m 1
150 $ hg commit -m 1
151
151
152 $ mv d/a d/b
152 $ mv d/a d/b
153 $ hg addremove -s80
153 $ hg addremove -s80
154 removing d/a
154 removing d/a
155 adding d/b
155 adding d/b
156 recording removal of d/a as rename to d/b (100% similar) (glob)
156 recording removal of d/a as rename to d/b (100% similar) (glob)
157 $ hg debugstate
157 $ hg debugstate
158 r 0 0 1970-01-01 00:00:00 d/a
158 r 0 0 1970-01-01 00:00:00 d/a
159 a 0 -1 unset d/b
159 a 0 -1 unset d/b
160 copy: d/a -> d/b
160 copy: d/a -> d/b
161 $ mv d/b c
161 $ mv d/b c
162
162
163 no copies found here (since the target isn't in d
163 no copies found here (since the target isn't in d
164
164
165 $ hg addremove -s80 d
165 $ hg addremove -s80 d
166 removing d/b (glob)
166 removing d/b (glob)
167
167
168 copies here
168 copies here
169
169
170 $ hg addremove -s80
170 $ hg addremove -s80
171 adding c
171 adding c
172 recording removal of d/a as rename to c (100% similar) (glob)
172 recording removal of d/a as rename to c (100% similar) (glob)
173
173
174 $ cd ..
174 $ cd ..
General Comments 0
You need to be logged in to leave comments. Login now