Show More
@@ -0,0 +1,59 b'' | |||||
|
1 | # similar.py - mechanisms for finding similar files | |||
|
2 | # | |||
|
3 | # Copyright 2005-2007 Matt Mackall <mpm@selenic.com> | |||
|
4 | # | |||
|
5 | # This software may be used and distributed according to the terms of the | |||
|
6 | # GNU General Public License version 2 or any later version. | |||
|
7 | ||||
|
8 | from i18n import _ | |||
|
9 | import util | |||
|
10 | import mdiff | |||
|
11 | import bdiff | |||
|
12 | ||||
|
def findrenames(repo, added, removed, threshold):
    '''pair removed files with added files whose content is similar

    Generator yielding (source, dest, score) tuples, where source is a
    name taken from removed, dest is a name taken from added and score
    is the float similarity computed for that pair.

    Each added file is read through repo.wread() and compared with
    every removed file that is present in repo['.'].  A pairing is only
    recorded when its score is at least threshold; for each added file
    the best-scoring removed file is kept, and on a tie the removed
    file examined last wins.
    '''
    renames = {}
    parent = repo['.']
    for pos, oldname in enumerate(removed):
        repo.ui.progress(_('searching'), pos, total=len(removed))
        if oldname not in parent:
            continue
        oldfctx = parent.filectx(oldname)

        # the stored text is only fetched and split into lines when a
        # real diff is needed; util.cachefunc presumably memoizes the
        # result for the remaining candidates -- confirm in util
        @util.cachefunc
        def olddata():
            stored = oldfctx.data()
            return stored, mdiff.splitnewlines(stored)

        def similarity(text):
            # an empty candidate never counts as a rename
            if len(text) == 0:
                return 0.0
            # a false result from cmp() is scored as a perfect match
            if not oldfctx.cmp(text):
                return 1.0
            # anything short of a perfect match cannot reach a
            # threshold of 1.0, so skip the diff entirely
            if threshold == 1.0:
                return 0.0
            stored, storedlines = olddata()
            # bdiff.blocks() returns blocks of matching lines; add up
            # the bytes of the stored lines those blocks cover
            matched = sum(len(line)
                          for _a1, _a2, b1, b2 in bdiff.blocks(text, stored)
                          for line in storedlines[b1:b2])
            return matched * 2.0 / (len(text) + len(stored))

        for candidate in added:
            # the score to beat is the best one recorded so far for this
            # candidate, or the caller's threshold if there is none yet
            if candidate in renames:
                floor = renames[candidate][1]
            else:
                floor = threshold
            rating = similarity(repo.wread(candidate))
            if rating >= floor:
                renames[candidate] = (oldname, rating)
    repo.ui.progress(_('searching'), None)

    for dest, (source, rating) in renames.iteritems():
        yield source, dest, rating
|
58 | ||||
|
59 |
@@ -10,6 +10,7 b' from i18n import _' | |||||
10 | import os, sys, errno, re, glob, tempfile |
|
10 | import os, sys, errno, re, glob, tempfile | |
11 | import mdiff, bdiff, util, templater, patch, error, encoding, templatekw |
|
11 | import mdiff, bdiff, util, templater, patch, error, encoding, templatekw | |
12 | import match as _match |
|
12 | import match as _match | |
|
13 | import similar | |||
13 |
|
14 | |||
14 | revrangesep = ':' |
|
15 | revrangesep = ':' | |
15 |
|
16 | |||
@@ -286,52 +287,6 b' def matchall(repo):' | |||||
def matchfiles(repo, files):
    '''return _match.exact() for the given files

    The matcher is created from the repository root, the current
    working directory reported by repo.getcwd() and the files list.
    '''
    root = repo.root
    cwd = repo.getcwd()
    return _match.exact(root, cwd, files)
288 |
|
289 | |||
def findrenames(repo, added, removed, threshold):
    '''pair removed files with added files whose content is similar

    Generator yielding (source, dest, score) tuples, where source is a
    name taken from removed, dest is a name taken from added and score
    is the float similarity computed for that pair.

    Each added file is read through repo.wread() and compared with
    every removed file that is present in repo['.'].  A pairing is only
    recorded when its score is at least threshold; for each added file
    the best-scoring removed file is kept, and on a tie the removed
    file examined last wins.
    '''
    renames = {}
    parent = repo['.']
    for pos, oldname in enumerate(removed):
        repo.ui.progress(_('searching'), pos, total=len(removed))
        if oldname not in parent:
            continue
        oldfctx = parent.filectx(oldname)

        # the stored text is only fetched and split into lines when a
        # real diff is needed; util.cachefunc presumably memoizes the
        # result for the remaining candidates -- confirm in util
        @util.cachefunc
        def olddata():
            stored = oldfctx.data()
            return stored, mdiff.splitnewlines(stored)

        def similarity(text):
            # an empty candidate never counts as a rename
            if len(text) == 0:
                return 0.0
            # a false result from cmp() is scored as a perfect match
            if not oldfctx.cmp(text):
                return 1.0
            # anything short of a perfect match cannot reach a
            # threshold of 1.0, so skip the diff entirely
            if threshold == 1.0:
                return 0.0
            stored, storedlines = olddata()
            # bdiff.blocks() returns blocks of matching lines; add up
            # the bytes of the stored lines those blocks cover
            matched = sum(len(line)
                          for _a1, _a2, b1, b2 in bdiff.blocks(text, stored)
                          for line in storedlines[b1:b2])
            return matched * 2.0 / (len(text) + len(stored))

        for candidate in added:
            # the score to beat is the best one recorded so far for this
            # candidate, or the caller's threshold if there is none yet
            if candidate in renames:
                floor = renames[candidate][1]
            else:
                floor = threshold
            rating = similarity(repo.wread(candidate))
            if rating >= floor:
                renames[candidate] = (oldname, rating)
    repo.ui.progress(_('searching'), None)

    for dest, (source, rating) in renames.iteritems():
        yield source, dest, rating
|
|||
334 |
|
||||
335 | def addremove(repo, pats=[], opts={}, dry_run=None, similarity=None): |
|
290 | def addremove(repo, pats=[], opts={}, dry_run=None, similarity=None): | |
336 | if dry_run is None: |
|
291 | if dry_run is None: | |
337 | dry_run = opts.get('dry_run') |
|
292 | dry_run = opts.get('dry_run') | |
@@ -366,8 +321,8 b' def addremove(repo, pats=[], opts={}, dr' | |||||
366 | added.append(abs) |
|
321 | added.append(abs) | |
367 | copies = {} |
|
322 | copies = {} | |
368 | if similarity > 0: |
|
323 | if similarity > 0: | |
369 |
for old, new, score in findrenames(repo, |
|
324 | for old, new, score in similar.findrenames(repo, | |
370 |
|
|
325 | added + unknown, removed + deleted, similarity): | |
371 | if repo.ui.verbose or not m.exact(old) or not m.exact(new): |
|
326 | if repo.ui.verbose or not m.exact(old) or not m.exact(new): | |
372 | repo.ui.status(_('recording removal of %s as rename to %s ' |
|
327 | repo.ui.status(_('recording removal of %s as rename to %s ' | |
373 | '(%d%% similar)\n') % |
|
328 | '(%d%% similar)\n') % |
General Comments 0
You need to be logged in to leave comments.
Login now