##// END OF EJS Templates
similar: remove caching from the module level...
Pierre-Yves David -
r30809:86145461 default
parent child Browse files
Show More
@@ -1,112 +1,115 b''
1 1 # similar.py - mechanisms for finding similar files
2 2 #
3 3 # Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 from __future__ import absolute_import
9 9
10 10 import hashlib
11 11
12 12 from .i18n import _
13 13 from . import (
14 14 bdiff,
15 15 mdiff,
16 util,
17 16 )
18 17
19 18 def _findexactmatches(repo, added, removed):
20 19 '''find renamed files that have no changes
21 20
22 21 Takes a list of new filectxs and a list of removed filectxs, and yields
23 22 (before, after) tuples of exact matches.
24 23 '''
25 24 numfiles = len(added) + len(removed)
26 25
27 26 # Get hashes of removed files.
28 27 hashes = {}
29 28 for i, fctx in enumerate(removed):
30 29 repo.ui.progress(_('searching for exact renames'), i, total=numfiles,
31 30 unit=_('files'))
32 31 h = hashlib.sha1(fctx.data()).digest()
33 32 hashes[h] = fctx
34 33
35 34 # For each added file, see if it corresponds to a removed file.
36 35 for i, fctx in enumerate(added):
37 36 repo.ui.progress(_('searching for exact renames'), i + len(removed),
38 37 total=numfiles, unit=_('files'))
39 38 h = hashlib.sha1(fctx.data()).digest()
40 39 if h in hashes:
41 40 yield (hashes[h], fctx)
42 41
43 42 # Done
44 43 repo.ui.progress(_('searching for exact renames'), None)
45 44
46 @util.cachefunc
47 45 def _ctxdata(fctx):
48 46 # lazily load text
49 47 orig = fctx.data()
50 48 return orig, mdiff.splitnewlines(orig)
51 49
52 @util.cachefunc
53 def score(fctx1, fctx2):
54 text = fctx1.data()
55 orig, lines = _ctxdata(fctx2)
50 def _score(fctx, otherdata):
51 orig, lines = otherdata
52 text = fctx.data()
56 53 # bdiff.blocks() returns blocks of matching lines
57 54 # count the number of bytes in each
58 55 equal = 0
59 56 matches = bdiff.blocks(text, orig)
60 57 for x1, x2, y1, y2 in matches:
61 58 for line in lines[y1:y2]:
62 59 equal += len(line)
63 60
64 61 lengths = len(text) + len(orig)
65 62 return equal * 2.0 / lengths
66 63
64 def score(fctx1, fctx2):
65 return _score(fctx1, _ctxdata(fctx2))
66
67 67 def _findsimilarmatches(repo, added, removed, threshold):
68 68 '''find potentially renamed files based on similar file content
69 69
70 70 Takes a list of new filectxs and a list of removed filectxs, and yields
71 71 (before, after, score) tuples of partial matches.
72 72 '''
73 73 copies = {}
74 74 for i, r in enumerate(removed):
75 75 repo.ui.progress(_('searching for similar files'), i,
76 76 total=len(removed), unit=_('files'))
77 77
78 data = None
78 79 for a in added:
79 80 bestscore = copies.get(a, (None, threshold))[1]
80 myscore = score(a, r)
81 if data is None:
82 data = _ctxdata(r)
83 myscore = _score(a, data)
81 84 if myscore >= bestscore:
82 85 copies[a] = (r, myscore)
83 86 repo.ui.progress(_('searching'), None)
84 87
85 88 for dest, v in copies.iteritems():
86 89 source, bscore = v
87 90 yield source, dest, bscore
88 91
89 92 def findrenames(repo, added, removed, threshold):
90 93 '''find renamed files -- yields (before, after, score) tuples'''
91 94 parentctx = repo['.']
92 95 workingctx = repo[None]
93 96
94 97 # Zero length files will be frequently unrelated to each other, and
95 98 # tracking the deletion/addition of such a file will probably cause more
96 99 # harm than good. We strip them out here to avoid matching them later on.
97 100 addedfiles = set([workingctx[fp] for fp in added
98 101 if workingctx[fp].size() > 0])
99 102 removedfiles = set([parentctx[fp] for fp in removed
100 103 if fp in parentctx and parentctx[fp].size() > 0])
101 104
102 105 # Find exact matches.
103 106 for (a, b) in _findexactmatches(repo,
104 107 sorted(addedfiles), sorted(removedfiles)):
105 108 addedfiles.remove(b)
106 109 yield (a.path(), b.path(), 1.0)
107 110
108 111 # If the user requested similar files to be matched, search for them also.
109 112 if threshold < 1.0:
110 113 for (a, b, score) in _findsimilarmatches(repo,
111 114 sorted(addedfiles), sorted(removedfiles), threshold):
112 115 yield (a.path(), b.path(), score)
General Comments 0
You need to be logged in to leave comments. Login now