similar.py
119 lines
| 3.9 KiB
| text/x-python
|
PythonLexer
/ mercurial / similar.py
David Greenaway
|
r11059 | # similar.py - mechanisms for finding similar files | ||
# | ||||
# Copyright 2005-2007 Matt Mackall <mpm@selenic.com> | ||||
# | ||||
# This software may be used and distributed according to the terms of the | ||||
# GNU General Public License version 2 or any later version. | ||||
Gregory Szorc
|
r27359 | from __future__ import absolute_import | ||
Augie Fackler
|
r29341 | import hashlib | ||
Gregory Szorc
|
r27359 | from .i18n import _ | ||
from . import ( | ||||
bdiff, | ||||
mdiff, | ||||
) | ||||
David Greenaway
|
r11059 | |||
David Greenaway
|
r11060 | def _findexactmatches(repo, added, removed): | ||
'''find renamed files that have no changes | ||||
Takes a list of new filectxs and a list of removed filectxs, and yields | ||||
(before, after) tuples of exact matches. | ||||
''' | ||||
numfiles = len(added) + len(removed) | ||||
# Get hashes of removed files. | ||||
hashes = {} | ||||
for i, fctx in enumerate(removed): | ||||
r28468 | repo.ui.progress(_('searching for exact renames'), i, total=numfiles, | |||
unit=_('files')) | ||||
Augie Fackler
|
r29341 | h = hashlib.sha1(fctx.data()).digest() | ||
David Greenaway
|
r11060 | hashes[h] = fctx | ||
# For each added file, see if it corresponds to a removed file. | ||||
for i, fctx in enumerate(added): | ||||
repo.ui.progress(_('searching for exact renames'), i + len(removed), | ||||
r28468 | total=numfiles, unit=_('files')) | |||
FUJIWARA Katsunori
|
r31210 | adata = fctx.data() | ||
h = hashlib.sha1(adata).digest() | ||||
David Greenaway
|
r11060 | if h in hashes: | ||
FUJIWARA Katsunori
|
r31210 | rfctx = hashes[h] | ||
# compare between actual file contents for exact identity | ||||
if adata == rfctx.data(): | ||||
yield (rfctx, fctx) | ||||
David Greenaway
|
r11060 | |||
# Done | ||||
repo.ui.progress(_('searching for exact renames'), None) | ||||
Sean Farley
|
r30805 | def _ctxdata(fctx): | ||
# lazily load text | ||||
orig = fctx.data() | ||||
return orig, mdiff.splitnewlines(orig) | ||||
Pierre-Yves David
|
r30809 | def _score(fctx, otherdata): | ||
orig, lines = otherdata | ||||
text = fctx.data() | ||||
Sean Farley
|
r30805 | # bdiff.blocks() returns blocks of matching lines | ||
# count the number of bytes in each | ||||
equal = 0 | ||||
matches = bdiff.blocks(text, orig) | ||||
for x1, x2, y1, y2 in matches: | ||||
for line in lines[y1:y2]: | ||||
equal += len(line) | ||||
lengths = len(text) + len(orig) | ||||
return equal * 2.0 / lengths | ||||
Pierre-Yves David
|
r30809 | def score(fctx1, fctx2): | ||
return _score(fctx1, _ctxdata(fctx2)) | ||||
David Greenaway
|
r11060 | def _findsimilarmatches(repo, added, removed, threshold): | ||
'''find potentially renamed files based on similar file content | ||||
Takes a list of new filectxs and a list of removed filectxs, and yields | ||||
(before, after, score) tuples of partial matches. | ||||
''' | ||||
David Greenaway
|
r11059 | copies = {} | ||
for i, r in enumerate(removed): | ||||
Brodie Rao
|
r16683 | repo.ui.progress(_('searching for similar files'), i, | ||
r28468 | total=len(removed), unit=_('files')) | |||
David Greenaway
|
r11059 | |||
Pierre-Yves David
|
r30809 | data = None | ||
David Greenaway
|
r11059 | for a in added: | ||
bestscore = copies.get(a, (None, threshold))[1] | ||||
Pierre-Yves David
|
r30809 | if data is None: | ||
data = _ctxdata(r) | ||||
myscore = _score(a, data) | ||||
David Greenaway
|
r11059 | if myscore >= bestscore: | ||
copies[a] = (r, myscore) | ||||
repo.ui.progress(_('searching'), None) | ||||
for dest, v in copies.iteritems(): | ||||
Sean Farley
|
r30791 | source, bscore = v | ||
yield source, dest, bscore | ||||
David Greenaway
|
r11059 | |||
David Greenaway
|
r11060 | def findrenames(repo, added, removed, threshold): | ||
'''find renamed files -- yields (before, after, score) tuples''' | ||||
parentctx = repo['.'] | ||||
workingctx = repo[None] | ||||
David Greenaway
|
r11059 | |||
David Greenaway
|
r11060 | # Zero length files will be frequently unrelated to each other, and | ||
# tracking the deletion/addition of such a file will probably cause more | ||||
# harm than good. We strip them out here to avoid matching them later on. | ||||
addedfiles = set([workingctx[fp] for fp in added | ||||
if workingctx[fp].size() > 0]) | ||||
removedfiles = set([parentctx[fp] for fp in removed | ||||
if fp in parentctx and parentctx[fp].size() > 0]) | ||||
# Find exact matches. | ||||
for (a, b) in _findexactmatches(repo, | ||||
Benoit Boissinot
|
r11085 | sorted(addedfiles), sorted(removedfiles)): | ||
David Greenaway
|
r11060 | addedfiles.remove(b) | ||
yield (a.path(), b.path(), 1.0) | ||||
# If the user requested similar files to be matched, search for them also. | ||||
if threshold < 1.0: | ||||
for (a, b, score) in _findsimilarmatches(repo, | ||||
sorted(addedfiles), sorted(removedfiles), threshold): | ||||
yield (a.path(), b.path(), score) | ||||