##// END OF EJS Templates
similar: compare between actual file contents for exact identity...
FUJIWARA Katsunori -
r31210:e1d03590 default
parent child Browse files
Show More
@@ -1,115 +1,119 b''
1 # similar.py - mechanisms for finding similar files
1 # similar.py - mechanisms for finding similar files
2 #
2 #
3 # Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
3 # Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8 from __future__ import absolute_import
8 from __future__ import absolute_import
9
9
10 import hashlib
10 import hashlib
11
11
12 from .i18n import _
12 from .i18n import _
13 from . import (
13 from . import (
14 bdiff,
14 bdiff,
15 mdiff,
15 mdiff,
16 )
16 )
17
17
18 def _findexactmatches(repo, added, removed):
18 def _findexactmatches(repo, added, removed):
19 '''find renamed files that have no changes
19 '''find renamed files that have no changes
20
20
21 Takes a list of new filectxs and a list of removed filectxs, and yields
21 Takes a list of new filectxs and a list of removed filectxs, and yields
22 (before, after) tuples of exact matches.
22 (before, after) tuples of exact matches.
23 '''
23 '''
24 numfiles = len(added) + len(removed)
24 numfiles = len(added) + len(removed)
25
25
26 # Get hashes of removed files.
26 # Get hashes of removed files.
27 hashes = {}
27 hashes = {}
28 for i, fctx in enumerate(removed):
28 for i, fctx in enumerate(removed):
29 repo.ui.progress(_('searching for exact renames'), i, total=numfiles,
29 repo.ui.progress(_('searching for exact renames'), i, total=numfiles,
30 unit=_('files'))
30 unit=_('files'))
31 h = hashlib.sha1(fctx.data()).digest()
31 h = hashlib.sha1(fctx.data()).digest()
32 hashes[h] = fctx
32 hashes[h] = fctx
33
33
34 # For each added file, see if it corresponds to a removed file.
34 # For each added file, see if it corresponds to a removed file.
35 for i, fctx in enumerate(added):
35 for i, fctx in enumerate(added):
36 repo.ui.progress(_('searching for exact renames'), i + len(removed),
36 repo.ui.progress(_('searching for exact renames'), i + len(removed),
37 total=numfiles, unit=_('files'))
37 total=numfiles, unit=_('files'))
38 h = hashlib.sha1(fctx.data()).digest()
38 adata = fctx.data()
39 h = hashlib.sha1(adata).digest()
39 if h in hashes:
40 if h in hashes:
40 yield (hashes[h], fctx)
41 rfctx = hashes[h]
42 # compare between actual file contents for exact identity
43 if adata == rfctx.data():
44 yield (rfctx, fctx)
41
45
42 # Done
46 # Done
43 repo.ui.progress(_('searching for exact renames'), None)
47 repo.ui.progress(_('searching for exact renames'), None)
44
48
45 def _ctxdata(fctx):
49 def _ctxdata(fctx):
46 # lazily load text
50 # lazily load text
47 orig = fctx.data()
51 orig = fctx.data()
48 return orig, mdiff.splitnewlines(orig)
52 return orig, mdiff.splitnewlines(orig)
49
53
50 def _score(fctx, otherdata):
54 def _score(fctx, otherdata):
51 orig, lines = otherdata
55 orig, lines = otherdata
52 text = fctx.data()
56 text = fctx.data()
53 # bdiff.blocks() returns blocks of matching lines
57 # bdiff.blocks() returns blocks of matching lines
54 # count the number of bytes in each
58 # count the number of bytes in each
55 equal = 0
59 equal = 0
56 matches = bdiff.blocks(text, orig)
60 matches = bdiff.blocks(text, orig)
57 for x1, x2, y1, y2 in matches:
61 for x1, x2, y1, y2 in matches:
58 for line in lines[y1:y2]:
62 for line in lines[y1:y2]:
59 equal += len(line)
63 equal += len(line)
60
64
61 lengths = len(text) + len(orig)
65 lengths = len(text) + len(orig)
62 return equal * 2.0 / lengths
66 return equal * 2.0 / lengths
63
67
64 def score(fctx1, fctx2):
68 def score(fctx1, fctx2):
65 return _score(fctx1, _ctxdata(fctx2))
69 return _score(fctx1, _ctxdata(fctx2))
66
70
67 def _findsimilarmatches(repo, added, removed, threshold):
71 def _findsimilarmatches(repo, added, removed, threshold):
68 '''find potentially renamed files based on similar file content
72 '''find potentially renamed files based on similar file content
69
73
70 Takes a list of new filectxs and a list of removed filectxs, and yields
74 Takes a list of new filectxs and a list of removed filectxs, and yields
71 (before, after, score) tuples of partial matches.
75 (before, after, score) tuples of partial matches.
72 '''
76 '''
73 copies = {}
77 copies = {}
74 for i, r in enumerate(removed):
78 for i, r in enumerate(removed):
75 repo.ui.progress(_('searching for similar files'), i,
79 repo.ui.progress(_('searching for similar files'), i,
76 total=len(removed), unit=_('files'))
80 total=len(removed), unit=_('files'))
77
81
78 data = None
82 data = None
79 for a in added:
83 for a in added:
80 bestscore = copies.get(a, (None, threshold))[1]
84 bestscore = copies.get(a, (None, threshold))[1]
81 if data is None:
85 if data is None:
82 data = _ctxdata(r)
86 data = _ctxdata(r)
83 myscore = _score(a, data)
87 myscore = _score(a, data)
84 if myscore >= bestscore:
88 if myscore >= bestscore:
85 copies[a] = (r, myscore)
89 copies[a] = (r, myscore)
86 repo.ui.progress(_('searching'), None)
90 repo.ui.progress(_('searching'), None)
87
91
88 for dest, v in copies.iteritems():
92 for dest, v in copies.iteritems():
89 source, bscore = v
93 source, bscore = v
90 yield source, dest, bscore
94 yield source, dest, bscore
91
95
92 def findrenames(repo, added, removed, threshold):
96 def findrenames(repo, added, removed, threshold):
93 '''find renamed files -- yields (before, after, score) tuples'''
97 '''find renamed files -- yields (before, after, score) tuples'''
94 parentctx = repo['.']
98 parentctx = repo['.']
95 workingctx = repo[None]
99 workingctx = repo[None]
96
100
97 # Zero length files will be frequently unrelated to each other, and
101 # Zero length files will be frequently unrelated to each other, and
98 # tracking the deletion/addition of such a file will probably cause more
102 # tracking the deletion/addition of such a file will probably cause more
99 # harm than good. We strip them out here to avoid matching them later on.
103 # harm than good. We strip them out here to avoid matching them later on.
100 addedfiles = set([workingctx[fp] for fp in added
104 addedfiles = set([workingctx[fp] for fp in added
101 if workingctx[fp].size() > 0])
105 if workingctx[fp].size() > 0])
102 removedfiles = set([parentctx[fp] for fp in removed
106 removedfiles = set([parentctx[fp] for fp in removed
103 if fp in parentctx and parentctx[fp].size() > 0])
107 if fp in parentctx and parentctx[fp].size() > 0])
104
108
105 # Find exact matches.
109 # Find exact matches.
106 for (a, b) in _findexactmatches(repo,
110 for (a, b) in _findexactmatches(repo,
107 sorted(addedfiles), sorted(removedfiles)):
111 sorted(addedfiles), sorted(removedfiles)):
108 addedfiles.remove(b)
112 addedfiles.remove(b)
109 yield (a.path(), b.path(), 1.0)
113 yield (a.path(), b.path(), 1.0)
110
114
111 # If the user requested similar files to be matched, search for them also.
115 # If the user requested similar files to be matched, search for them also.
112 if threshold < 1.0:
116 if threshold < 1.0:
113 for (a, b, score) in _findsimilarmatches(repo,
117 for (a, b, score) in _findsimilarmatches(repo,
114 sorted(addedfiles), sorted(removedfiles), threshold):
118 sorted(addedfiles), sorted(removedfiles), threshold):
115 yield (a.path(), b.path(), score)
119 yield (a.path(), b.path(), score)
General Comments 0
You need to be logged in to leave comments. Login now