Show More
@@ -1,119 +1,118 b'' | |||||
1 | # similar.py - mechanisms for finding similar files |
|
1 | # similar.py - mechanisms for finding similar files | |
2 | # |
|
2 | # | |
3 | # Copyright 2005-2007 Matt Mackall <mpm@selenic.com> |
|
3 | # Copyright 2005-2007 Matt Mackall <mpm@selenic.com> | |
4 | # |
|
4 | # | |
5 | # This software may be used and distributed according to the terms of the |
|
5 | # This software may be used and distributed according to the terms of the | |
6 | # GNU General Public License version 2 or any later version. |
|
6 | # GNU General Public License version 2 or any later version. | |
7 |
|
7 | |||
8 | from __future__ import absolute_import |
|
8 | from __future__ import absolute_import | |
9 |
|
9 | |||
10 | import hashlib |
|
10 | import hashlib | |
11 |
|
11 | |||
12 | from .i18n import _ |
|
12 | from .i18n import _ | |
13 | from . import ( |
|
13 | from . import ( | |
14 | bdiff, |
|
14 | bdiff, | |
15 | mdiff, |
|
15 | mdiff, | |
16 | ) |
|
16 | ) | |
17 |
|
17 | |||
18 | def _findexactmatches(repo, added, removed): |
|
18 | def _findexactmatches(repo, added, removed): | |
19 | '''find renamed files that have no changes |
|
19 | '''find renamed files that have no changes | |
20 |
|
20 | |||
21 | Takes a list of new filectxs and a list of removed filectxs, and yields |
|
21 | Takes a list of new filectxs and a list of removed filectxs, and yields | |
22 | (before, after) tuples of exact matches. |
|
22 | (before, after) tuples of exact matches. | |
23 | ''' |
|
23 | ''' | |
24 | numfiles = len(added) + len(removed) |
|
24 | numfiles = len(added) + len(removed) | |
25 |
|
25 | |||
26 | # Get hashes of removed files. |
|
26 | # Get hashes of removed files. | |
27 | hashes = {} |
|
27 | hashes = {} | |
28 | for i, fctx in enumerate(removed): |
|
28 | for i, fctx in enumerate(removed): | |
29 | repo.ui.progress(_('searching for exact renames'), i, total=numfiles, |
|
29 | repo.ui.progress(_('searching for exact renames'), i, total=numfiles, | |
30 | unit=_('files')) |
|
30 | unit=_('files')) | |
31 | h = hashlib.sha1(fctx.data()).digest() |
|
31 | h = hashlib.sha1(fctx.data()).digest() | |
32 | hashes[h] = fctx |
|
32 | hashes[h] = fctx | |
33 |
|
33 | |||
34 | # For each added file, see if it corresponds to a removed file. |
|
34 | # For each added file, see if it corresponds to a removed file. | |
35 | for i, fctx in enumerate(added): |
|
35 | for i, fctx in enumerate(added): | |
36 | repo.ui.progress(_('searching for exact renames'), i + len(removed), |
|
36 | repo.ui.progress(_('searching for exact renames'), i + len(removed), | |
37 | total=numfiles, unit=_('files')) |
|
37 | total=numfiles, unit=_('files')) | |
38 | adata = fctx.data() |
|
38 | adata = fctx.data() | |
39 | h = hashlib.sha1(adata).digest() |
|
39 | h = hashlib.sha1(adata).digest() | |
40 | if h in hashes: |
|
40 | if h in hashes: | |
41 | rfctx = hashes[h] |
|
41 | rfctx = hashes[h] | |
42 | # compare between actual file contents for exact identity |
|
42 | # compare between actual file contents for exact identity | |
43 | if adata == rfctx.data(): |
|
43 | if adata == rfctx.data(): | |
44 | yield (rfctx, fctx) |
|
44 | yield (rfctx, fctx) | |
45 |
|
45 | |||
46 | # Done |
|
46 | # Done | |
47 | repo.ui.progress(_('searching for exact renames'), None) |
|
47 | repo.ui.progress(_('searching for exact renames'), None) | |
48 |
|
48 | |||
49 | def _ctxdata(fctx): |
|
49 | def _ctxdata(fctx): | |
50 | # lazily load text |
|
50 | # lazily load text | |
51 | orig = fctx.data() |
|
51 | orig = fctx.data() | |
52 | return orig, mdiff.splitnewlines(orig) |
|
52 | return orig, mdiff.splitnewlines(orig) | |
53 |
|
53 | |||
54 | def _score(fctx, otherdata): |
|
54 | def _score(fctx, otherdata): | |
55 | orig, lines = otherdata |
|
55 | orig, lines = otherdata | |
56 | text = fctx.data() |
|
56 | text = fctx.data() | |
57 | # bdiff.blocks() returns blocks of matching lines |
|
57 | # bdiff.blocks() returns blocks of matching lines | |
58 | # count the number of bytes in each |
|
58 | # count the number of bytes in each | |
59 | equal = 0 |
|
59 | equal = 0 | |
60 | matches = bdiff.blocks(text, orig) |
|
60 | matches = bdiff.blocks(text, orig) | |
61 | for x1, x2, y1, y2 in matches: |
|
61 | for x1, x2, y1, y2 in matches: | |
62 | for line in lines[y1:y2]: |
|
62 | for line in lines[y1:y2]: | |
63 | equal += len(line) |
|
63 | equal += len(line) | |
64 |
|
64 | |||
65 | lengths = len(text) + len(orig) |
|
65 | lengths = len(text) + len(orig) | |
66 | return equal * 2.0 / lengths |
|
66 | return equal * 2.0 / lengths | |
67 |
|
67 | |||
68 | def score(fctx1, fctx2): |
|
68 | def score(fctx1, fctx2): | |
69 | return _score(fctx1, _ctxdata(fctx2)) |
|
69 | return _score(fctx1, _ctxdata(fctx2)) | |
70 |
|
70 | |||
71 | def _findsimilarmatches(repo, added, removed, threshold): |
|
71 | def _findsimilarmatches(repo, added, removed, threshold): | |
72 | '''find potentially renamed files based on similar file content |
|
72 | '''find potentially renamed files based on similar file content | |
73 |
|
73 | |||
74 | Takes a list of new filectxs and a list of removed filectxs, and yields |
|
74 | Takes a list of new filectxs and a list of removed filectxs, and yields | |
75 | (before, after, score) tuples of partial matches. |
|
75 | (before, after, score) tuples of partial matches. | |
76 | ''' |
|
76 | ''' | |
77 | copies = {} |
|
77 | copies = {} | |
78 | for i, r in enumerate(removed): |
|
78 | for i, r in enumerate(removed): | |
79 | repo.ui.progress(_('searching for similar files'), i, |
|
79 | repo.ui.progress(_('searching for similar files'), i, | |
80 | total=len(removed), unit=_('files')) |
|
80 | total=len(removed), unit=_('files')) | |
81 |
|
81 | |||
82 | data = None |
|
82 | data = None | |
83 | for a in added: |
|
83 | for a in added: | |
84 | bestscore = copies.get(a, (None, threshold))[1] |
|
84 | bestscore = copies.get(a, (None, threshold))[1] | |
85 | if data is None: |
|
85 | if data is None: | |
86 | data = _ctxdata(r) |
|
86 | data = _ctxdata(r) | |
87 | myscore = _score(a, data) |
|
87 | myscore = _score(a, data) | |
88 | if myscore >= bestscore: |
|
88 | if myscore >= bestscore: | |
89 | copies[a] = (r, myscore) |
|
89 | copies[a] = (r, myscore) | |
90 | repo.ui.progress(_('searching'), None) |
|
90 | repo.ui.progress(_('searching'), None) | |
91 |
|
91 | |||
92 | for dest, v in copies.iteritems(): |
|
92 | for dest, v in copies.iteritems(): | |
93 | source, bscore = v |
|
93 | source, bscore = v | |
94 | yield source, dest, bscore |
|
94 | yield source, dest, bscore | |
95 |
|
95 | |||
96 | def findrenames(repo, added, removed, threshold): |
|
96 | def findrenames(repo, added, removed, threshold): | |
97 | '''find renamed files -- yields (before, after, score) tuples''' |
|
97 | '''find renamed files -- yields (before, after, score) tuples''' | |
98 | parentctx = repo['.'] |
|
98 | parentctx = repo['.'] | |
99 | workingctx = repo[None] |
|
99 | workingctx = repo[None] | |
100 |
|
100 | |||
101 | # Zero length files will be frequently unrelated to each other, and |
|
101 | # Zero length files will be frequently unrelated to each other, and | |
102 | # tracking the deletion/addition of such a file will probably cause more |
|
102 | # tracking the deletion/addition of such a file will probably cause more | |
103 | # harm than good. We strip them out here to avoid matching them later on. |
|
103 | # harm than good. We strip them out here to avoid matching them later on. | |
104 |
addedfiles = |
|
104 | addedfiles = [workingctx[fp] for fp in sorted(added) | |
105 |
if workingctx[fp].size() > 0] |
|
105 | if workingctx[fp].size() > 0] | |
106 |
removedfiles = |
|
106 | removedfiles = [parentctx[fp] for fp in sorted(removed) | |
107 |
if fp in parentctx and parentctx[fp].size() > 0] |
|
107 | if fp in parentctx and parentctx[fp].size() > 0] | |
108 |
|
108 | |||
109 | # Find exact matches. |
|
109 | # Find exact matches. | |
110 | for (a, b) in _findexactmatches(repo, |
|
110 | for (a, b) in _findexactmatches(repo, addedfiles[:], removedfiles): | |
111 | sorted(addedfiles), sorted(removedfiles)): |
|
|||
112 | addedfiles.remove(b) |
|
111 | addedfiles.remove(b) | |
113 | yield (a.path(), b.path(), 1.0) |
|
112 | yield (a.path(), b.path(), 1.0) | |
114 |
|
113 | |||
115 | # If the user requested similar files to be matched, search for them also. |
|
114 | # If the user requested similar files to be matched, search for them also. | |
116 | if threshold < 1.0: |
|
115 | if threshold < 1.0: | |
117 | for (a, b, score) in _findsimilarmatches(repo, |
|
116 | for (a, b, score) in _findsimilarmatches(repo, addedfiles, | |
118 | sorted(addedfiles), sorted(removedfiles), threshold): |
|
117 | removedfiles, threshold): | |
119 | yield (a.path(), b.path(), score) |
|
118 | yield (a.path(), b.path(), score) |
@@ -1,102 +1,174 b'' | |||||
1 | $ hg init rep; cd rep |
|
1 | $ hg init rep; cd rep | |
2 |
|
2 | |||
3 | $ touch empty-file |
|
3 | $ touch empty-file | |
4 | $ $PYTHON -c 'for x in range(10000): print(x)' > large-file |
|
4 | $ $PYTHON -c 'for x in range(10000): print(x)' > large-file | |
5 |
|
5 | |||
6 | $ hg addremove |
|
6 | $ hg addremove | |
7 | adding empty-file |
|
7 | adding empty-file | |
8 | adding large-file |
|
8 | adding large-file | |
9 |
|
9 | |||
10 | $ hg commit -m A |
|
10 | $ hg commit -m A | |
11 |
|
11 | |||
12 | $ rm large-file empty-file |
|
12 | $ rm large-file empty-file | |
13 | $ $PYTHON -c 'for x in range(10,10000): print(x)' > another-file |
|
13 | $ $PYTHON -c 'for x in range(10,10000): print(x)' > another-file | |
14 |
|
14 | |||
15 | $ hg addremove -s50 |
|
15 | $ hg addremove -s50 | |
16 | adding another-file |
|
16 | adding another-file | |
17 | removing empty-file |
|
17 | removing empty-file | |
18 | removing large-file |
|
18 | removing large-file | |
19 | recording removal of large-file as rename to another-file (99% similar) |
|
19 | recording removal of large-file as rename to another-file (99% similar) | |
20 |
|
20 | |||
21 | $ hg commit -m B |
|
21 | $ hg commit -m B | |
22 |
|
22 | |||
23 | comparing two empty files caused ZeroDivisionError in the past |
|
23 | comparing two empty files caused ZeroDivisionError in the past | |
24 |
|
24 | |||
25 | $ hg update -C 0 |
|
25 | $ hg update -C 0 | |
26 | 2 files updated, 0 files merged, 1 files removed, 0 files unresolved |
|
26 | 2 files updated, 0 files merged, 1 files removed, 0 files unresolved | |
27 | $ rm empty-file |
|
27 | $ rm empty-file | |
28 | $ touch another-empty-file |
|
28 | $ touch another-empty-file | |
29 | $ hg addremove -s50 |
|
29 | $ hg addremove -s50 | |
30 | adding another-empty-file |
|
30 | adding another-empty-file | |
31 | removing empty-file |
|
31 | removing empty-file | |
32 |
|
32 | |||
33 | $ cd .. |
|
33 | $ cd .. | |
34 |
|
34 | |||
35 | $ hg init rep2; cd rep2 |
|
35 | $ hg init rep2; cd rep2 | |
36 |
|
36 | |||
37 | $ $PYTHON -c 'for x in range(10000): print(x)' > large-file |
|
37 | $ $PYTHON -c 'for x in range(10000): print(x)' > large-file | |
38 | $ $PYTHON -c 'for x in range(50): print(x)' > tiny-file |
|
38 | $ $PYTHON -c 'for x in range(50): print(x)' > tiny-file | |
39 |
|
39 | |||
40 | $ hg addremove |
|
40 | $ hg addremove | |
41 | adding large-file |
|
41 | adding large-file | |
42 | adding tiny-file |
|
42 | adding tiny-file | |
43 |
|
43 | |||
44 | $ hg commit -m A |
|
44 | $ hg commit -m A | |
45 |
|
45 | |||
46 | $ $PYTHON -c 'for x in range(70): print(x)' > small-file |
|
46 | $ $PYTHON -c 'for x in range(70): print(x)' > small-file | |
47 | $ rm tiny-file |
|
47 | $ rm tiny-file | |
48 | $ rm large-file |
|
48 | $ rm large-file | |
49 |
|
49 | |||
50 | $ hg addremove -s50 |
|
50 | $ hg addremove -s50 | |
51 | removing large-file |
|
51 | removing large-file | |
52 | adding small-file |
|
52 | adding small-file | |
53 | removing tiny-file |
|
53 | removing tiny-file | |
54 | recording removal of tiny-file as rename to small-file (82% similar) |
|
54 | recording removal of tiny-file as rename to small-file (82% similar) | |
55 |
|
55 | |||
56 | $ hg commit -m B |
|
56 | $ hg commit -m B | |
57 |
|
57 | |||
|
58 | should be sorted by path for stable result | |||
|
59 | ||||
|
60 | $ for i in `python $TESTDIR/seq.py 0 9`; do | |||
|
61 | > cp small-file $i | |||
|
62 | > done | |||
|
63 | $ rm small-file | |||
|
64 | $ hg addremove | |||
|
65 | adding 0 | |||
|
66 | adding 1 | |||
|
67 | adding 2 | |||
|
68 | adding 3 | |||
|
69 | adding 4 | |||
|
70 | adding 5 | |||
|
71 | adding 6 | |||
|
72 | adding 7 | |||
|
73 | adding 8 | |||
|
74 | adding 9 | |||
|
75 | removing small-file | |||
|
76 | recording removal of small-file as rename to 0 (100% similar) | |||
|
77 | recording removal of small-file as rename to 1 (100% similar) | |||
|
78 | recording removal of small-file as rename to 2 (100% similar) | |||
|
79 | recording removal of small-file as rename to 3 (100% similar) | |||
|
80 | recording removal of small-file as rename to 4 (100% similar) | |||
|
81 | recording removal of small-file as rename to 5 (100% similar) | |||
|
82 | recording removal of small-file as rename to 6 (100% similar) | |||
|
83 | recording removal of small-file as rename to 7 (100% similar) | |||
|
84 | recording removal of small-file as rename to 8 (100% similar) | |||
|
85 | recording removal of small-file as rename to 9 (100% similar) | |||
|
86 | $ hg commit -m '10 same files' | |||
|
87 | ||||
|
88 | pick one from many identical files | |||
|
89 | ||||
|
90 | $ cp 0 a | |||
|
91 | $ rm `python $TESTDIR/seq.py 0 9` | |||
|
92 | $ hg addremove | |||
|
93 | removing 0 | |||
|
94 | removing 1 | |||
|
95 | removing 2 | |||
|
96 | removing 3 | |||
|
97 | removing 4 | |||
|
98 | removing 5 | |||
|
99 | removing 6 | |||
|
100 | removing 7 | |||
|
101 | removing 8 | |||
|
102 | removing 9 | |||
|
103 | adding a | |||
|
104 | recording removal of 9 as rename to a (100% similar) | |||
|
105 | $ hg revert -aq | |||
|
106 | ||||
|
107 | pick one from many similar files | |||
|
108 | ||||
|
109 | $ cp 0 a | |||
|
110 | $ for i in `python $TESTDIR/seq.py 0 9`; do | |||
|
111 | > echo $i >> $i | |||
|
112 | > done | |||
|
113 | $ hg commit -m 'make them slightly different' | |||
|
114 | $ rm `python $TESTDIR/seq.py 0 9` | |||
|
115 | $ hg addremove -s50 | |||
|
116 | removing 0 | |||
|
117 | removing 1 | |||
|
118 | removing 2 | |||
|
119 | removing 3 | |||
|
120 | removing 4 | |||
|
121 | removing 5 | |||
|
122 | removing 6 | |||
|
123 | removing 7 | |||
|
124 | removing 8 | |||
|
125 | removing 9 | |||
|
126 | adding a | |||
|
127 | recording removal of 9 as rename to a (99% similar) | |||
|
128 | $ hg commit -m 'always the same file should be selected' | |||
|
129 | ||||
58 | should all fail |
|
130 | should all fail | |
59 |
|
131 | |||
60 | $ hg addremove -s foo |
|
132 | $ hg addremove -s foo | |
61 | abort: similarity must be a number |
|
133 | abort: similarity must be a number | |
62 | [255] |
|
134 | [255] | |
63 | $ hg addremove -s -1 |
|
135 | $ hg addremove -s -1 | |
64 | abort: similarity must be between 0 and 100 |
|
136 | abort: similarity must be between 0 and 100 | |
65 | [255] |
|
137 | [255] | |
66 | $ hg addremove -s 1e6 |
|
138 | $ hg addremove -s 1e6 | |
67 | abort: similarity must be between 0 and 100 |
|
139 | abort: similarity must be between 0 and 100 | |
68 | [255] |
|
140 | [255] | |
69 |
|
141 | |||
70 | $ cd .. |
|
142 | $ cd .. | |
71 |
|
143 | |||
72 | Issue1527: repeated addremove causes Abort |
|
144 | Issue1527: repeated addremove causes Abort | |
73 |
|
145 | |||
74 | $ hg init rep3; cd rep3 |
|
146 | $ hg init rep3; cd rep3 | |
75 | $ mkdir d |
|
147 | $ mkdir d | |
76 | $ echo a > d/a |
|
148 | $ echo a > d/a | |
77 | $ hg add d/a |
|
149 | $ hg add d/a | |
78 | $ hg commit -m 1 |
|
150 | $ hg commit -m 1 | |
79 |
|
151 | |||
80 | $ mv d/a d/b |
|
152 | $ mv d/a d/b | |
81 | $ hg addremove -s80 |
|
153 | $ hg addremove -s80 | |
82 | removing d/a |
|
154 | removing d/a | |
83 | adding d/b |
|
155 | adding d/b | |
84 | recording removal of d/a as rename to d/b (100% similar) (glob) |
|
156 | recording removal of d/a as rename to d/b (100% similar) (glob) | |
85 | $ hg debugstate |
|
157 | $ hg debugstate | |
86 | r 0 0 1970-01-01 00:00:00 d/a |
|
158 | r 0 0 1970-01-01 00:00:00 d/a | |
87 | a 0 -1 unset d/b |
|
159 | a 0 -1 unset d/b | |
88 | copy: d/a -> d/b |
|
160 | copy: d/a -> d/b | |
89 | $ mv d/b c |
|
161 | $ mv d/b c | |
90 |
|
162 | |||
91 | no copies found here (since the target isn't in d |
|
163 | no copies found here (since the target isn't in d | |
92 |
|
164 | |||
93 | $ hg addremove -s80 d |
|
165 | $ hg addremove -s80 d | |
94 | removing d/b (glob) |
|
166 | removing d/b (glob) | |
95 |
|
167 | |||
96 | copies here |
|
168 | copies here | |
97 |
|
169 | |||
98 | $ hg addremove -s80 |
|
170 | $ hg addremove -s80 | |
99 | adding c |
|
171 | adding c | |
100 | recording removal of d/a as rename to c (100% similar) (glob) |
|
172 | recording removal of d/a as rename to c (100% similar) (glob) | |
101 |
|
173 | |||
102 | $ cd .. |
|
174 | $ cd .. |
General Comments 0
You need to be logged in to leave comments.
Login now