##// END OF EJS Templates
similar: use progress helper...
Martin von Zweigbergk -
r38414:59c9d3cc default
parent child Browse files
Show More
@@ -1,121 +1,121 b''
1 # similar.py - mechanisms for finding similar files
1 # similar.py - mechanisms for finding similar files
2 #
2 #
3 # Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
3 # Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8 from __future__ import absolute_import
8 from __future__ import absolute_import
9
9
10 from .i18n import _
10 from .i18n import _
11 from . import (
11 from . import (
12 mdiff,
12 mdiff,
13 )
13 )
14
14
def _findexactmatches(repo, added, removed):
    '''find renamed files that have no changes

    Takes a list of new filectxs and a list of removed filectxs, and yields
    (before, after) tuples of exact matches.
    '''
    # Build table of removed files: {hash(fctx.data()): [fctx, ...]}.
    # We use hash() to discard fctx.data() from memory.
    hashes = {}
    progress = repo.ui.makeprogress(_('searching for exact renames'),
                                    total=(len(added) + len(removed)),
                                    unit=_('files'))
    for fctx in removed:
        progress.increment()
        h = hash(fctx.data())
        # setdefault() replaces the separate membership test + if/else
        # the original used for the same insert-or-append.
        hashes.setdefault(h, []).append(fctx)

    # For each added file, see if it corresponds to a removed file.
    for fctx in added:
        progress.increment()
        adata = fctx.data()
        h = hash(adata)
        for rfctx in hashes.get(h, []):
            # hash() can collide, so compare the actual file contents
            # for exact identity before declaring a rename
            if adata == rfctx.data():
                yield (rfctx, fctx)
                break

    # Done
    progress.complete()
48
48
def _ctxdata(fctx):
    """Load fctx's text (lazily, only when called) and return it together
    with its newline-split line list."""
    text = fctx.data()
    lines = mdiff.splitnewlines(text)
    return text, lines
53
53
def _score(fctx, otherdata):
    """Return the similarity ratio (0.0 .. 1.0) of fctx against otherdata.

    otherdata is an (original text, split lines) pair as produced by
    _ctxdata().
    """
    orig, lines = otherdata
    text = fctx.data()
    # mdiff.blocks() yields (x1, x2, y1, y2) ranges of matching lines;
    # total up the bytes those ranges cover on the original side.
    equal = sum(len(line)
                for x1, x2, y1, y2 in mdiff.blocks(text, orig)
                for line in lines[y1:y2])
    lengths = len(text) + len(orig)
    return equal * 2.0 / lengths
67
67
def score(fctx1, fctx2):
    """Return the similarity ratio of two file contexts (see _score)."""
    otherdata = _ctxdata(fctx2)
    return _score(fctx1, otherdata)
70
70
def _findsimilarmatches(repo, added, removed, threshold):
    '''find potentially renamed files based on similar file content

    Takes a list of new filectxs and a list of removed filectxs, and yields
    (before, after, score) tuples of partial matches.
    '''
    copies = {}
    progress = repo.ui.makeprogress(_('searching for similar files'),
                                    unit=_('files'), total=len(removed))
    for r in removed:
        progress.increment()
        data = None
        for a in added:
            # the score any candidate must beat: the best one seen so far
            # for this added file, or the caller's threshold
            bestscore = copies.get(a, (None, threshold))[1]
            if data is None:
                # load r's text lazily -- only once, and only if there is
                # at least one added file to compare against
                data = _ctxdata(r)
            myscore = _score(a, data)
            if myscore > bestscore:
                copies[a] = (r, myscore)
    progress.complete()

    # items() rather than the py2-only iteritems(): equivalent on py2
    # (iteration order and contents are the same) and also valid on py3
    for dest, (source, bscore) in copies.items():
        yield source, dest, bscore
95
95
def _dropempty(fctxs):
    """Return only the file contexts in fctxs whose size is non-zero."""
    return [fctx for fctx in fctxs if fctx.size() > 0]
98
98
def findrenames(repo, added, removed, threshold):
    '''find renamed files -- yields (before, after, score) tuples'''
    wctx = repo[None]
    pctx = wctx.p1()

    # Zero length files will be frequently unrelated to each other, and
    # tracking the deletion/addition of such a file will probably cause more
    # harm than good. We strip them out here to avoid matching them later on.
    addedfiles = _dropempty(wctx[fp] for fp in sorted(added))
    removedfiles = _dropempty(pctx[fp] for fp in sorted(removed) if fp in pctx)

    # Exact (content-identical) renames first; remember which added files
    # they consumed so the fuzzy pass below skips them.
    exactmatched = set()
    for before, after in _findexactmatches(repo, addedfiles, removedfiles):
        exactmatched.add(after)
        yield (before.path(), after.path(), 1.0)

    # If the user requested similar files to be matched, search for them also.
    if threshold < 1.0:
        remaining = [fctx for fctx in addedfiles if fctx not in exactmatched]
        for before, after, simscore in _findsimilarmatches(
                repo, remaining, removedfiles, threshold):
            yield (before.path(), after.path(), simscore)
General Comments 0
You need to be logged in to leave comments. Login now