similar.py
121 lines
| 4.0 KiB
| text/x-python
|
PythonLexer
/ mercurial / similar.py
David Greenaway
|
r11059 | # similar.py - mechanisms for finding similar files | ||
# | ||||
# Copyright 2005-2007 Matt Mackall <mpm@selenic.com> | ||||
# | ||||
# This software may be used and distributed according to the terms of the | ||||
# GNU General Public License version 2 or any later version. | ||||
Gregory Szorc
|
r27359 | from __future__ import absolute_import | ||
from .i18n import _ | ||||
from . import ( | ||||
mdiff, | ||||
) | ||||
David Greenaway
|
r11059 | |||
David Greenaway
|
r11060 | def _findexactmatches(repo, added, removed): | ||
'''find renamed files that have no changes | ||||
Takes a list of new filectxs and a list of removed filectxs, and yields | ||||
(before, after) tuples of exact matches. | ||||
''' | ||||
Yuya Nishihara
|
r31584 | # Build table of removed files: {hash(fctx.data()): [fctx, ...]}. | ||
# We use hash() to discard fctx.data() from memory. | ||||
David Greenaway
|
r11060 | hashes = {} | ||
Martin von Zweigbergk
|
r38367 | progress = repo.ui.makeprogress(_('searching for exact renames'), | ||
total=(len(added) + len(removed)), | ||||
unit=_('files')) | ||||
for fctx in removed: | ||||
progress.increment() | ||||
Yuya Nishihara
|
r31584 | h = hash(fctx.data()) | ||
if h not in hashes: | ||||
hashes[h] = [fctx] | ||||
else: | ||||
hashes[h].append(fctx) | ||||
David Greenaway
|
r11060 | |||
# For each added file, see if it corresponds to a removed file. | ||||
Martin von Zweigbergk
|
r38367 | for fctx in added: | ||
progress.increment() | ||||
FUJIWARA Katsunori
|
r31210 | adata = fctx.data() | ||
Yuya Nishihara
|
r31584 | h = hash(adata) | ||
for rfctx in hashes.get(h, []): | ||||
FUJIWARA Katsunori
|
r31210 | # compare between actual file contents for exact identity | ||
if adata == rfctx.data(): | ||||
yield (rfctx, fctx) | ||||
Yuya Nishihara
|
r31584 | break | ||
David Greenaway
|
r11060 | |||
# Done | ||||
Martin von Zweigbergk
|
r38367 | progress.update(None) | ||
David Greenaway
|
r11060 | |||
Sean Farley
|
r30805 | def _ctxdata(fctx): | ||
# lazily load text | ||||
orig = fctx.data() | ||||
return orig, mdiff.splitnewlines(orig) | ||||
Pierre-Yves David
|
r30809 | def _score(fctx, otherdata): | ||
orig, lines = otherdata | ||||
text = fctx.data() | ||||
Yuya Nishihara
|
r32201 | # mdiff.blocks() returns blocks of matching lines | ||
Sean Farley
|
r30805 | # count the number of bytes in each | ||
equal = 0 | ||||
Yuya Nishihara
|
r32201 | matches = mdiff.blocks(text, orig) | ||
Sean Farley
|
r30805 | for x1, x2, y1, y2 in matches: | ||
for line in lines[y1:y2]: | ||||
equal += len(line) | ||||
lengths = len(text) + len(orig) | ||||
return equal * 2.0 / lengths | ||||
Pierre-Yves David
|
r30809 | def score(fctx1, fctx2): | ||
return _score(fctx1, _ctxdata(fctx2)) | ||||
David Greenaway
|
r11060 | def _findsimilarmatches(repo, added, removed, threshold): | ||
'''find potentially renamed files based on similar file content | ||||
Takes a list of new filectxs and a list of removed filectxs, and yields | ||||
(before, after, score) tuples of partial matches. | ||||
''' | ||||
David Greenaway
|
r11059 | copies = {} | ||
for i, r in enumerate(removed): | ||||
Brodie Rao
|
r16683 | repo.ui.progress(_('searching for similar files'), i, | ||
r28468 | total=len(removed), unit=_('files')) | |||
David Greenaway
|
r11059 | |||
Pierre-Yves David
|
r30809 | data = None | ||
David Greenaway
|
r11059 | for a in added: | ||
bestscore = copies.get(a, (None, threshold))[1] | ||||
Pierre-Yves David
|
r30809 | if data is None: | ||
data = _ctxdata(r) | ||||
myscore = _score(a, data) | ||||
Yuya Nishihara
|
r31583 | if myscore > bestscore: | ||
David Greenaway
|
r11059 | copies[a] = (r, myscore) | ||
repo.ui.progress(_('searching'), None) | ||||
for dest, v in copies.iteritems(): | ||||
Sean Farley
|
r30791 | source, bscore = v | ||
yield source, dest, bscore | ||||
David Greenaway
|
r11059 | |||
Yuya Nishihara
|
r31582 | def _dropempty(fctxs): | ||
return [x for x in fctxs if x.size() > 0] | ||||
David Greenaway
|
r11060 | def findrenames(repo, added, removed, threshold): | ||
'''find renamed files -- yields (before, after, score) tuples''' | ||||
Yuya Nishihara
|
r31581 | wctx = repo[None] | ||
pctx = wctx.p1() | ||||
David Greenaway
|
r11059 | |||
David Greenaway
|
r11060 | # Zero length files will be frequently unrelated to each other, and | ||
# tracking the deletion/addition of such a file will probably cause more | ||||
# harm than good. We strip them out here to avoid matching them later on. | ||||
Yuya Nishihara
|
r31582 | addedfiles = _dropempty(wctx[fp] for fp in sorted(added)) | ||
removedfiles = _dropempty(pctx[fp] for fp in sorted(removed) if fp in pctx) | ||||
David Greenaway
|
r11060 | |||
# Find exact matches. | ||||
Yuya Nishihara
|
r31580 | matchedfiles = set() | ||
for (a, b) in _findexactmatches(repo, addedfiles, removedfiles): | ||||
matchedfiles.add(b) | ||||
David Greenaway
|
r11060 | yield (a.path(), b.path(), 1.0) | ||
# If the user requested similar files to be matched, search for them also. | ||||
if threshold < 1.0: | ||||
Yuya Nishihara
|
r31580 | addedfiles = [x for x in addedfiles if x not in matchedfiles] | ||
Yuya Nishihara
|
r31579 | for (a, b, score) in _findsimilarmatches(repo, addedfiles, | ||
removedfiles, threshold): | ||||
David Greenaway
|
r11060 | yield (a.path(), b.path(), score) | ||