##// END OF EJS Templates
similar: move score function to module level...
Sean Farley -
r30805:0ae287eb default
parent child Browse files
Show More
@@ -1,110 +1,112
1 # similar.py - mechanisms for finding similar files
1 # similar.py - mechanisms for finding similar files
2 #
2 #
3 # Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
3 # Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8 from __future__ import absolute_import
8 from __future__ import absolute_import
9
9
10 import hashlib
10 import hashlib
11
11
12 from .i18n import _
12 from .i18n import _
13 from . import (
13 from . import (
14 bdiff,
14 bdiff,
15 mdiff,
15 mdiff,
16 util,
16 util,
17 )
17 )
18
18
19 def _findexactmatches(repo, added, removed):
19 def _findexactmatches(repo, added, removed):
20 '''find renamed files that have no changes
20 '''find renamed files that have no changes
21
21
22 Takes a list of new filectxs and a list of removed filectxs, and yields
22 Takes a list of new filectxs and a list of removed filectxs, and yields
23 (before, after) tuples of exact matches.
23 (before, after) tuples of exact matches.
24 '''
24 '''
25 numfiles = len(added) + len(removed)
25 numfiles = len(added) + len(removed)
26
26
27 # Get hashes of removed files.
27 # Get hashes of removed files.
28 hashes = {}
28 hashes = {}
29 for i, fctx in enumerate(removed):
29 for i, fctx in enumerate(removed):
30 repo.ui.progress(_('searching for exact renames'), i, total=numfiles,
30 repo.ui.progress(_('searching for exact renames'), i, total=numfiles,
31 unit=_('files'))
31 unit=_('files'))
32 h = hashlib.sha1(fctx.data()).digest()
32 h = hashlib.sha1(fctx.data()).digest()
33 hashes[h] = fctx
33 hashes[h] = fctx
34
34
35 # For each added file, see if it corresponds to a removed file.
35 # For each added file, see if it corresponds to a removed file.
36 for i, fctx in enumerate(added):
36 for i, fctx in enumerate(added):
37 repo.ui.progress(_('searching for exact renames'), i + len(removed),
37 repo.ui.progress(_('searching for exact renames'), i + len(removed),
38 total=numfiles, unit=_('files'))
38 total=numfiles, unit=_('files'))
39 h = hashlib.sha1(fctx.data()).digest()
39 h = hashlib.sha1(fctx.data()).digest()
40 if h in hashes:
40 if h in hashes:
41 yield (hashes[h], fctx)
41 yield (hashes[h], fctx)
42
42
43 # Done
43 # Done
44 repo.ui.progress(_('searching for exact renames'), None)
44 repo.ui.progress(_('searching for exact renames'), None)
45
45
46 @util.cachefunc
47 def _ctxdata(fctx):
48 # lazily load text
49 orig = fctx.data()
50 return orig, mdiff.splitnewlines(orig)
51
52 @util.cachefunc
53 def score(fctx1, fctx2):
54 text = fctx1.data()
55 orig, lines = _ctxdata(fctx2)
56 # bdiff.blocks() returns blocks of matching lines
57 # count the number of bytes in each
58 equal = 0
59 matches = bdiff.blocks(text, orig)
60 for x1, x2, y1, y2 in matches:
61 for line in lines[y1:y2]:
62 equal += len(line)
63
64 lengths = len(text) + len(orig)
65 return equal * 2.0 / lengths
66
46 def _findsimilarmatches(repo, added, removed, threshold):
67 def _findsimilarmatches(repo, added, removed, threshold):
47 '''find potentially renamed files based on similar file content
68 '''find potentially renamed files based on similar file content
48
69
49 Takes a list of new filectxs and a list of removed filectxs, and yields
70 Takes a list of new filectxs and a list of removed filectxs, and yields
50 (before, after, score) tuples of partial matches.
71 (before, after, score) tuples of partial matches.
51 '''
72 '''
52 copies = {}
73 copies = {}
53 for i, r in enumerate(removed):
74 for i, r in enumerate(removed):
54 repo.ui.progress(_('searching for similar files'), i,
75 repo.ui.progress(_('searching for similar files'), i,
55 total=len(removed), unit=_('files'))
76 total=len(removed), unit=_('files'))
56
77
57 # lazily load text
58 @util.cachefunc
59 def data():
60 orig = r.data()
61 return orig, mdiff.splitnewlines(orig)
62
63 def score(text):
64 orig, lines = data()
65 # bdiff.blocks() returns blocks of matching lines
66 # count the number of bytes in each
67 equal = 0
68 matches = bdiff.blocks(text, orig)
69 for x1, x2, y1, y2 in matches:
70 for line in lines[y1:y2]:
71 equal += len(line)
72
73 lengths = len(text) + len(orig)
74 return equal * 2.0 / lengths
75
76 for a in added:
78 for a in added:
77 bestscore = copies.get(a, (None, threshold))[1]
79 bestscore = copies.get(a, (None, threshold))[1]
78 myscore = score(a.data())
80 myscore = score(a, r)
79 if myscore >= bestscore:
81 if myscore >= bestscore:
80 copies[a] = (r, myscore)
82 copies[a] = (r, myscore)
81 repo.ui.progress(_('searching'), None)
83 repo.ui.progress(_('searching'), None)
82
84
83 for dest, v in copies.iteritems():
85 for dest, v in copies.iteritems():
84 source, bscore = v
86 source, bscore = v
85 yield source, dest, bscore
87 yield source, dest, bscore
86
88
87 def findrenames(repo, added, removed, threshold):
89 def findrenames(repo, added, removed, threshold):
88 '''find renamed files -- yields (before, after, score) tuples'''
90 '''find renamed files -- yields (before, after, score) tuples'''
89 parentctx = repo['.']
91 parentctx = repo['.']
90 workingctx = repo[None]
92 workingctx = repo[None]
91
93
92 # Zero length files will be frequently unrelated to each other, and
94 # Zero length files will be frequently unrelated to each other, and
93 # tracking the deletion/addition of such a file will probably cause more
95 # tracking the deletion/addition of such a file will probably cause more
94 # harm than good. We strip them out here to avoid matching them later on.
96 # harm than good. We strip them out here to avoid matching them later on.
95 addedfiles = set([workingctx[fp] for fp in added
97 addedfiles = set([workingctx[fp] for fp in added
96 if workingctx[fp].size() > 0])
98 if workingctx[fp].size() > 0])
97 removedfiles = set([parentctx[fp] for fp in removed
99 removedfiles = set([parentctx[fp] for fp in removed
98 if fp in parentctx and parentctx[fp].size() > 0])
100 if fp in parentctx and parentctx[fp].size() > 0])
99
101
100 # Find exact matches.
102 # Find exact matches.
101 for (a, b) in _findexactmatches(repo,
103 for (a, b) in _findexactmatches(repo,
102 sorted(addedfiles), sorted(removedfiles)):
104 sorted(addedfiles), sorted(removedfiles)):
103 addedfiles.remove(b)
105 addedfiles.remove(b)
104 yield (a.path(), b.path(), 1.0)
106 yield (a.path(), b.path(), 1.0)
105
107
106 # If the user requested similar files to be matched, search for them also.
108 # If the user requested similar files to be matched, search for them also.
107 if threshold < 1.0:
109 if threshold < 1.0:
108 for (a, b, score) in _findsimilarmatches(repo,
110 for (a, b, score) in _findsimilarmatches(repo,
109 sorted(addedfiles), sorted(removedfiles), threshold):
111 sorted(addedfiles), sorted(removedfiles), threshold):
110 yield (a.path(), b.path(), score)
112 yield (a.path(), b.path(), score)
General Comments 0
You need to be logged in to leave comments. Login now