Show More
@@ -1,104 +1,108 | |||
|
1 | 1 | # similar.py - mechanisms for finding similar files |
|
2 | 2 | # |
|
3 | 3 | # Copyright 2005-2007 Matt Mackall <mpm@selenic.com> |
|
4 | 4 | # |
|
5 | 5 | # This software may be used and distributed according to the terms of the |
|
6 | 6 | # GNU General Public License version 2 or any later version. |
|
7 | 7 | |
|
8 | from i18n import _ | |
|
9 | import util | |
|
10 | import mdiff | |
|
11 | import bdiff | |
|
8 | from __future__ import absolute_import | |
|
9 | ||
|
10 | from .i18n import _ | |
|
11 | from . import ( | |
|
12 | bdiff, | |
|
13 | mdiff, | |
|
14 | util, | |
|
15 | ) | |
|
12 | 16 | |
|
def _findexactmatches(repo, added, removed):
    '''yield (before, after) pairs of files whose contents are identical

    ``added`` and ``removed`` are lists of filectxs. Every removed file is
    indexed by the SHA-1 digest of its data; each added file whose digest
    is present in that index is yielded together with the removed filectx
    stored under it. When several removed files share a digest, the one
    indexed last is the one reported.

    Progress is reported on ``repo.ui`` across both passes and the topic
    is closed once all added files have been examined.
    '''
    topic = _('searching for exact renames')
    total = len(added) + len(removed)

    # First pass: digest -> removed filectx (a later duplicate overwrites).
    removedbydigest = {}
    for pos, oldfctx in enumerate(removed):
        repo.ui.progress(topic, pos, total=total)
        removedbydigest[util.sha1(oldfctx.data()).digest()] = oldfctx

    # Second pass: look every added file up in the index. Positions keep
    # counting from where the first pass stopped.
    for pos, newfctx in enumerate(added):
        repo.ui.progress(topic, pos + len(removed), total=total)
        digest = util.sha1(newfctx.data()).digest()
        if digest in removedbydigest:
            yield (removedbydigest[digest], newfctx)

    # A position of None closes the progress topic.
    repo.ui.progress(topic, None)
|
38 | 42 | |
|
def _findsimilarmatches(repo, added, removed, threshold):
    '''find potentially renamed files based on similar file content

    Takes a list of new filectxs and a list of removed filectxs, and yields
    (before, after, score) tuples of partial matches.

    Each added file is reported at most once, paired with the removed file
    that scored highest against it. A candidate must score at least
    ``threshold``; on equal scores the removed file examined last wins.
    The score is twice the number of bytes in the matching line blocks
    divided by the combined length of both texts.
    '''
    # Use one topic for both the updates and the final close, so that the
    # topic that was opened is the one that gets completed.
    topic = _('searching for similar files')

    copies = {}
    for i, r in enumerate(removed):
        repo.ui.progress(topic, i, total=len(removed))

        # lazily load text: the removed file is only read (and split into
        # lines) the first time it is actually scored against something
        @util.cachefunc
        def data():
            orig = r.data()
            return orig, mdiff.splitnewlines(orig)

        def similarity(text):
            orig, lines = data()
            lengths = len(text) + len(orig)
            if not lengths:
                # Both texts are empty: there is nothing to compare, so
                # report no similarity instead of dividing by zero.
                return 0.0
            # bdiff.blocks() returns blocks of matching lines
            # count the number of bytes in each
            equal = 0
            for x1, x2, y1, y2 in bdiff.blocks(text, orig):
                equal += sum(len(line) for line in lines[y1:y2])
            return equal * 2.0 / lengths

        for a in added:
            # The best score so far for this added file, or the threshold
            # if no removed file has qualified yet.
            bestscore = copies.get(a, (None, threshold))[1]
            myscore = similarity(a.data())
            if myscore >= bestscore:
                copies[a] = (r, myscore)
    repo.ui.progress(topic, None)

    for dest, (source, bestscore) in copies.iteritems():
        yield source, dest, bestscore
|
79 | 83 | |
|
def findrenames(repo, added, removed, threshold):
    '''find renamed files -- yields (before, after, score) tuples

    ``added`` paths are looked up in the working context (``repo[None]``)
    and ``removed`` paths in its parent (``repo['.']``). Exact content
    matches are yielded first with a score of 1.0. If ``threshold`` is
    below 1.0, the added files that had no exact match are then compared
    against the removed files for partial similarity. The yielded
    ``before`` and ``after`` values are file paths.
    '''
    pctx = repo['.']
    wctx = repo[None]

    # Empty files are usually unrelated to one another, and recording a
    # rename between them is more likely to mislead than to help, so they
    # are left out of both candidate sets up front.
    newfctxs = set()
    for fp in added:
        if wctx[fp].size() > 0:
            newfctxs.add(wctx[fp])

    oldfctxs = set()
    for fp in removed:
        # A removed path that is absent from the parent is skipped.
        if fp not in pctx:
            continue
        if pctx[fp].size() > 0:
            oldfctxs.add(pctx[fp])

    # Exact matches. A matched added file is dropped from the candidate
    # set so it is not considered again by the similarity search below.
    exact = _findexactmatches(repo, sorted(newfctxs), sorted(oldfctxs))
    for old, new in exact:
        newfctxs.remove(new)
        yield (old.path(), new.path(), 1.0)

    # Partial matches are only searched for when the caller asked for a
    # threshold below a perfect score.
    if not threshold < 1.0:
        return
    similar = _findsimilarmatches(repo, sorted(newfctxs), sorted(oldfctxs),
                                  threshold)
    for old, new, score in similar:
        yield (old.path(), new.path(), score)
|
104 | 108 |
@@ -1,210 +1,209 | |||
|
1 | 1 | #require test-repo |
|
2 | 2 | |
|
3 | 3 | $ cd "$TESTDIR"/.. |
|
4 | 4 | |
|
5 | 5 | $ hg files 'set:(**.py)' | xargs python contrib/check-py3-compat.py |
|
6 | 6 | contrib/casesmash.py not using absolute_import |
|
7 | 7 | contrib/check-code.py not using absolute_import |
|
8 | 8 | contrib/check-code.py requires print_function |
|
9 | 9 | contrib/check-config.py not using absolute_import |
|
10 | 10 | contrib/check-config.py requires print_function |
|
11 | 11 | contrib/debugcmdserver.py not using absolute_import |
|
12 | 12 | contrib/debugcmdserver.py requires print_function |
|
13 | 13 | contrib/debugshell.py not using absolute_import |
|
14 | 14 | contrib/fixpax.py not using absolute_import |
|
15 | 15 | contrib/fixpax.py requires print_function |
|
16 | 16 | contrib/hgclient.py not using absolute_import |
|
17 | 17 | contrib/hgclient.py requires print_function |
|
18 | 18 | contrib/hgfixes/fix_bytes.py not using absolute_import |
|
19 | 19 | contrib/hgfixes/fix_bytesmod.py not using absolute_import |
|
20 | 20 | contrib/hgfixes/fix_leftover_imports.py not using absolute_import |
|
21 | 21 | contrib/import-checker.py not using absolute_import |
|
22 | 22 | contrib/import-checker.py requires print_function |
|
23 | 23 | contrib/memory.py not using absolute_import |
|
24 | 24 | contrib/perf.py not using absolute_import |
|
25 | 25 | contrib/python-hook-examples.py not using absolute_import |
|
26 | 26 | contrib/revsetbenchmarks.py not using absolute_import |
|
27 | 27 | contrib/revsetbenchmarks.py requires print_function |
|
28 | 28 | contrib/showstack.py not using absolute_import |
|
29 | 29 | contrib/synthrepo.py not using absolute_import |
|
30 | 30 | contrib/win32/hgwebdir_wsgi.py not using absolute_import |
|
31 | 31 | doc/check-seclevel.py not using absolute_import |
|
32 | 32 | doc/gendoc.py not using absolute_import |
|
33 | 33 | doc/hgmanpage.py not using absolute_import |
|
34 | 34 | hgext/__init__.py not using absolute_import |
|
35 | 35 | hgext/acl.py not using absolute_import |
|
36 | 36 | hgext/blackbox.py not using absolute_import |
|
37 | 37 | hgext/bugzilla.py not using absolute_import |
|
38 | 38 | hgext/censor.py not using absolute_import |
|
39 | 39 | hgext/children.py not using absolute_import |
|
40 | 40 | hgext/churn.py not using absolute_import |
|
41 | 41 | hgext/clonebundles.py not using absolute_import |
|
42 | 42 | hgext/color.py not using absolute_import |
|
43 | 43 | hgext/convert/__init__.py not using absolute_import |
|
44 | 44 | hgext/convert/bzr.py not using absolute_import |
|
45 | 45 | hgext/convert/common.py not using absolute_import |
|
46 | 46 | hgext/convert/convcmd.py not using absolute_import |
|
47 | 47 | hgext/convert/cvs.py not using absolute_import |
|
48 | 48 | hgext/convert/cvsps.py not using absolute_import |
|
49 | 49 | hgext/convert/darcs.py not using absolute_import |
|
50 | 50 | hgext/convert/filemap.py not using absolute_import |
|
51 | 51 | hgext/convert/git.py not using absolute_import |
|
52 | 52 | hgext/convert/gnuarch.py not using absolute_import |
|
53 | 53 | hgext/convert/hg.py not using absolute_import |
|
54 | 54 | hgext/convert/monotone.py not using absolute_import |
|
55 | 55 | hgext/convert/p4.py not using absolute_import |
|
56 | 56 | hgext/convert/subversion.py not using absolute_import |
|
57 | 57 | hgext/convert/transport.py not using absolute_import |
|
58 | 58 | hgext/eol.py not using absolute_import |
|
59 | 59 | hgext/extdiff.py not using absolute_import |
|
60 | 60 | hgext/factotum.py not using absolute_import |
|
61 | 61 | hgext/fetch.py not using absolute_import |
|
62 | 62 | hgext/gpg.py not using absolute_import |
|
63 | 63 | hgext/graphlog.py not using absolute_import |
|
64 | 64 | hgext/hgcia.py not using absolute_import |
|
65 | 65 | hgext/hgk.py not using absolute_import |
|
66 | 66 | hgext/highlight/__init__.py not using absolute_import |
|
67 | 67 | hgext/highlight/highlight.py not using absolute_import |
|
68 | 68 | hgext/histedit.py not using absolute_import |
|
69 | 69 | hgext/keyword.py not using absolute_import |
|
70 | 70 | hgext/largefiles/__init__.py not using absolute_import |
|
71 | 71 | hgext/largefiles/basestore.py not using absolute_import |
|
72 | 72 | hgext/largefiles/lfcommands.py not using absolute_import |
|
73 | 73 | hgext/largefiles/lfutil.py not using absolute_import |
|
74 | 74 | hgext/largefiles/localstore.py not using absolute_import |
|
75 | 75 | hgext/largefiles/overrides.py not using absolute_import |
|
76 | 76 | hgext/largefiles/proto.py not using absolute_import |
|
77 | 77 | hgext/largefiles/remotestore.py not using absolute_import |
|
78 | 78 | hgext/largefiles/reposetup.py not using absolute_import |
|
79 | 79 | hgext/largefiles/uisetup.py not using absolute_import |
|
80 | 80 | hgext/largefiles/wirestore.py not using absolute_import |
|
81 | 81 | hgext/mq.py not using absolute_import |
|
82 | 82 | hgext/notify.py not using absolute_import |
|
83 | 83 | hgext/pager.py not using absolute_import |
|
84 | 84 | hgext/patchbomb.py not using absolute_import |
|
85 | 85 | hgext/purge.py not using absolute_import |
|
86 | 86 | hgext/rebase.py not using absolute_import |
|
87 | 87 | hgext/record.py not using absolute_import |
|
88 | 88 | hgext/relink.py not using absolute_import |
|
89 | 89 | hgext/schemes.py not using absolute_import |
|
90 | 90 | hgext/share.py not using absolute_import |
|
91 | 91 | hgext/shelve.py not using absolute_import |
|
92 | 92 | hgext/strip.py not using absolute_import |
|
93 | 93 | hgext/transplant.py not using absolute_import |
|
94 | 94 | hgext/win32mbcs.py not using absolute_import |
|
95 | 95 | hgext/win32text.py not using absolute_import |
|
96 | 96 | hgext/zeroconf/Zeroconf.py not using absolute_import |
|
97 | 97 | hgext/zeroconf/Zeroconf.py requires print_function |
|
98 | 98 | hgext/zeroconf/__init__.py not using absolute_import |
|
99 | 99 | i18n/check-translation.py not using absolute_import |
|
100 | 100 | i18n/polib.py not using absolute_import |
|
101 | 101 | mercurial/byterange.py not using absolute_import |
|
102 | 102 | mercurial/cmdutil.py not using absolute_import |
|
103 | 103 | mercurial/commands.py not using absolute_import |
|
104 | 104 | mercurial/context.py not using absolute_import |
|
105 | 105 | mercurial/dirstate.py not using absolute_import |
|
106 | 106 | mercurial/dispatch.py requires print_function |
|
107 | 107 | mercurial/exchange.py not using absolute_import |
|
108 | 108 | mercurial/help.py not using absolute_import |
|
109 | 109 | mercurial/httpclient/__init__.py not using absolute_import |
|
110 | 110 | mercurial/httpclient/_readers.py not using absolute_import |
|
111 | 111 | mercurial/httpclient/socketutil.py not using absolute_import |
|
112 | 112 | mercurial/httpconnection.py not using absolute_import |
|
113 | 113 | mercurial/keepalive.py not using absolute_import |
|
114 | 114 | mercurial/keepalive.py requires print_function |
|
115 | 115 | mercurial/localrepo.py not using absolute_import |
|
116 | 116 | mercurial/lsprof.py requires print_function |
|
117 | 117 | mercurial/lsprofcalltree.py not using absolute_import |
|
118 | 118 | mercurial/lsprofcalltree.py requires print_function |
|
119 | 119 | mercurial/mail.py requires print_function |
|
120 | 120 | mercurial/manifest.py not using absolute_import |
|
121 | 121 | mercurial/mdiff.py not using absolute_import |
|
122 | 122 | mercurial/patch.py not using absolute_import |
|
123 | 123 | mercurial/pvec.py not using absolute_import |
|
124 | 124 | mercurial/py3kcompat.py not using absolute_import |
|
125 | 125 | mercurial/revlog.py not using absolute_import |
|
126 | 126 | mercurial/scmposix.py not using absolute_import |
|
127 | 127 | mercurial/scmutil.py not using absolute_import |
|
128 | 128 | mercurial/scmwindows.py not using absolute_import |
|
129 | mercurial/similar.py not using absolute_import | |
|
130 | 129 | mercurial/store.py not using absolute_import |
|
131 | 130 | mercurial/windows.py not using absolute_import |
|
132 | 131 | setup.py not using absolute_import |
|
133 | 132 | tests/filterpyflakes.py requires print_function |
|
134 | 133 | tests/generate-working-copy-states.py requires print_function |
|
135 | 134 | tests/get-with-headers.py requires print_function |
|
136 | 135 | tests/heredoctest.py requires print_function |
|
137 | 136 | tests/hypothesishelpers.py not using absolute_import |
|
138 | 137 | tests/hypothesishelpers.py requires print_function |
|
139 | 138 | tests/killdaemons.py not using absolute_import |
|
140 | 139 | tests/md5sum.py not using absolute_import |
|
141 | 140 | tests/mockblackbox.py not using absolute_import |
|
142 | 141 | tests/printenv.py not using absolute_import |
|
143 | 142 | tests/readlink.py not using absolute_import |
|
144 | 143 | tests/readlink.py requires print_function |
|
145 | 144 | tests/revlog-formatv0.py not using absolute_import |
|
146 | 145 | tests/run-tests.py not using absolute_import |
|
147 | 146 | tests/seq.py not using absolute_import |
|
148 | 147 | tests/seq.py requires print_function |
|
149 | 148 | tests/silenttestrunner.py not using absolute_import |
|
150 | 149 | tests/silenttestrunner.py requires print_function |
|
151 | 150 | tests/sitecustomize.py not using absolute_import |
|
152 | 151 | tests/svn-safe-append.py not using absolute_import |
|
153 | 152 | tests/svnxml.py not using absolute_import |
|
154 | 153 | tests/test-ancestor.py requires print_function |
|
155 | 154 | tests/test-atomictempfile.py not using absolute_import |
|
156 | 155 | tests/test-batching.py not using absolute_import |
|
157 | 156 | tests/test-batching.py requires print_function |
|
158 | 157 | tests/test-bdiff.py not using absolute_import |
|
159 | 158 | tests/test-bdiff.py requires print_function |
|
160 | 159 | tests/test-context.py not using absolute_import |
|
161 | 160 | tests/test-context.py requires print_function |
|
162 | 161 | tests/test-demandimport.py not using absolute_import |
|
163 | 162 | tests/test-demandimport.py requires print_function |
|
164 | 163 | tests/test-dispatch.py not using absolute_import |
|
165 | 164 | tests/test-dispatch.py requires print_function |
|
166 | 165 | tests/test-doctest.py not using absolute_import |
|
167 | 166 | tests/test-duplicateoptions.py not using absolute_import |
|
168 | 167 | tests/test-duplicateoptions.py requires print_function |
|
169 | 168 | tests/test-filecache.py not using absolute_import |
|
170 | 169 | tests/test-filecache.py requires print_function |
|
171 | 170 | tests/test-filelog.py not using absolute_import |
|
172 | 171 | tests/test-filelog.py requires print_function |
|
173 | 172 | tests/test-hg-parseurl.py not using absolute_import |
|
174 | 173 | tests/test-hg-parseurl.py requires print_function |
|
175 | 174 | tests/test-hgweb-auth.py not using absolute_import |
|
176 | 175 | tests/test-hgweb-auth.py requires print_function |
|
177 | 176 | tests/test-hgwebdir-paths.py not using absolute_import |
|
178 | 177 | tests/test-hybridencode.py not using absolute_import |
|
179 | 178 | tests/test-hybridencode.py requires print_function |
|
180 | 179 | tests/test-lrucachedict.py not using absolute_import |
|
181 | 180 | tests/test-lrucachedict.py requires print_function |
|
182 | 181 | tests/test-manifest.py not using absolute_import |
|
183 | 182 | tests/test-minirst.py not using absolute_import |
|
184 | 183 | tests/test-minirst.py requires print_function |
|
185 | 184 | tests/test-parseindex2.py not using absolute_import |
|
186 | 185 | tests/test-parseindex2.py requires print_function |
|
187 | 186 | tests/test-pathencode.py not using absolute_import |
|
188 | 187 | tests/test-pathencode.py requires print_function |
|
189 | 188 | tests/test-propertycache.py not using absolute_import |
|
190 | 189 | tests/test-propertycache.py requires print_function |
|
191 | 190 | tests/test-revlog-ancestry.py not using absolute_import |
|
192 | 191 | tests/test-revlog-ancestry.py requires print_function |
|
193 | 192 | tests/test-run-tests.py not using absolute_import |
|
194 | 193 | tests/test-simplemerge.py not using absolute_import |
|
195 | 194 | tests/test-status-inprocess.py not using absolute_import |
|
196 | 195 | tests/test-status-inprocess.py requires print_function |
|
197 | 196 | tests/test-symlink-os-yes-fs-no.py not using absolute_import |
|
198 | 197 | tests/test-trusted.py not using absolute_import |
|
199 | 198 | tests/test-trusted.py requires print_function |
|
200 | 199 | tests/test-ui-color.py not using absolute_import |
|
201 | 200 | tests/test-ui-color.py requires print_function |
|
202 | 201 | tests/test-ui-config.py not using absolute_import |
|
203 | 202 | tests/test-ui-config.py requires print_function |
|
204 | 203 | tests/test-ui-verbosity.py not using absolute_import |
|
205 | 204 | tests/test-ui-verbosity.py requires print_function |
|
206 | 205 | tests/test-url.py not using absolute_import |
|
207 | 206 | tests/test-url.py requires print_function |
|
208 | 207 | tests/test-walkrepo.py requires print_function |
|
209 | 208 | tests/test-wireproto.py requires print_function |
|
210 | 209 | tests/tinyproxy.py requires print_function |
General Comments 0
You need to be logged in to leave comments.
Login now