similar.py
131 lines
| 3.9 KiB
| text/x-python
|
PythonLexer
/ mercurial / similar.py
David Greenaway
|
r11059 | # similar.py - mechanisms for finding similar files | ||
# | ||||
Raphaël Gomès
|
r47575 | # Copyright 2005-2007 Olivia Mackall <olivia@selenic.com> | ||
David Greenaway
|
r11059 | # | ||
# This software may be used and distributed according to the terms of the | ||||
# GNU General Public License version 2 or any later version. | ||||
Gregory Szorc
|
r27359 | |||
from .i18n import _ | ||||
Gregory Szorc
|
r43376 | from . import ( | ||
mdiff, | ||||
) | ||||
Augie Fackler
|
r43346 | |||
David Greenaway
|
r11059 | |||
David Greenaway
|
r11060 | def _findexactmatches(repo, added, removed): | ||
Augie Fackler
|
r46554 | """find renamed files that have no changes | ||
David Greenaway
|
r11060 | |||
Takes a list of new filectxs and a list of removed filectxs, and yields | ||||
(before, after) tuples of exact matches. | ||||
Augie Fackler
|
r46554 | """ | ||
Yuya Nishihara
|
r31584 | # Build table of removed files: {hash(fctx.data()): [fctx, ...]}. | ||
# We use hash() to discard fctx.data() from memory. | ||||
David Greenaway
|
r11060 | hashes = {} | ||
Augie Fackler
|
r43346 | progress = repo.ui.makeprogress( | ||
Augie Fackler
|
r43347 | _(b'searching for exact renames'), | ||
Augie Fackler
|
r43346 | total=(len(added) + len(removed)), | ||
Augie Fackler
|
r43347 | unit=_(b'files'), | ||
Augie Fackler
|
r43346 | ) | ||
Martin von Zweigbergk
|
r38367 | for fctx in removed: | ||
progress.increment() | ||||
Yuya Nishihara
|
r31584 | h = hash(fctx.data()) | ||
if h not in hashes: | ||||
hashes[h] = [fctx] | ||||
else: | ||||
hashes[h].append(fctx) | ||||
David Greenaway
|
r11060 | |||
# For each added file, see if it corresponds to a removed file. | ||||
Martin von Zweigbergk
|
r38367 | for fctx in added: | ||
progress.increment() | ||||
FUJIWARA Katsunori
|
r31210 | adata = fctx.data() | ||
Yuya Nishihara
|
r31584 | h = hash(adata) | ||
for rfctx in hashes.get(h, []): | ||||
FUJIWARA Katsunori
|
r31210 | # compare between actual file contents for exact identity | ||
if adata == rfctx.data(): | ||||
yield (rfctx, fctx) | ||||
Yuya Nishihara
|
r31584 | break | ||
David Greenaway
|
r11060 | |||
# Done | ||||
Martin von Zweigbergk
|
r38392 | progress.complete() | ||
David Greenaway
|
r11060 | |||
Augie Fackler
|
r43346 | |||
Sean Farley
|
r30805 | def _ctxdata(fctx): | ||
# lazily load text | ||||
orig = fctx.data() | ||||
return orig, mdiff.splitnewlines(orig) | ||||
Augie Fackler
|
r43346 | |||
Pierre-Yves David
|
r30809 | def _score(fctx, otherdata): | ||
orig, lines = otherdata | ||||
text = fctx.data() | ||||
Yuya Nishihara
|
r32201 | # mdiff.blocks() returns blocks of matching lines | ||
Sean Farley
|
r30805 | # count the number of bytes in each | ||
equal = 0 | ||||
Yuya Nishihara
|
r32201 | matches = mdiff.blocks(text, orig) | ||
Sean Farley
|
r30805 | for x1, x2, y1, y2 in matches: | ||
for line in lines[y1:y2]: | ||||
equal += len(line) | ||||
lengths = len(text) + len(orig) | ||||
return equal * 2.0 / lengths | ||||
Augie Fackler
|
r43346 | |||
Pierre-Yves David
|
r30809 | def score(fctx1, fctx2): | ||
return _score(fctx1, _ctxdata(fctx2)) | ||||
Augie Fackler
|
r43346 | |||
David Greenaway
|
r11060 | def _findsimilarmatches(repo, added, removed, threshold): | ||
Augie Fackler
|
r46554 | """find potentially renamed files based on similar file content | ||
David Greenaway
|
r11060 | |||
Takes a list of new filectxs and a list of removed filectxs, and yields | ||||
(before, after, score) tuples of partial matches. | ||||
Augie Fackler
|
r46554 | """ | ||
David Greenaway
|
r11059 | copies = {} | ||
Augie Fackler
|
r43346 | progress = repo.ui.makeprogress( | ||
Augie Fackler
|
r43347 | _(b'searching for similar files'), unit=_(b'files'), total=len(removed) | ||
Augie Fackler
|
r43346 | ) | ||
Martin von Zweigbergk
|
r38414 | for r in removed: | ||
progress.increment() | ||||
Pierre-Yves David
|
r30809 | data = None | ||
David Greenaway
|
r11059 | for a in added: | ||
bestscore = copies.get(a, (None, threshold))[1] | ||||
Pierre-Yves David
|
r30809 | if data is None: | ||
data = _ctxdata(r) | ||||
myscore = _score(a, data) | ||||
Yuya Nishihara
|
r31583 | if myscore > bestscore: | ||
David Greenaway
|
r11059 | copies[a] = (r, myscore) | ||
Martin von Zweigbergk
|
r38414 | progress.complete() | ||
David Greenaway
|
r11059 | |||
Gregory Szorc
|
r49768 | for dest, v in copies.items(): | ||
Sean Farley
|
r30791 | source, bscore = v | ||
yield source, dest, bscore | ||||
David Greenaway
|
r11059 | |||
Augie Fackler
|
r43346 | |||
Yuya Nishihara
|
r31582 | def _dropempty(fctxs): | ||
return [x for x in fctxs if x.size() > 0] | ||||
Augie Fackler
|
r43346 | |||
David Greenaway
|
r11060 | def findrenames(repo, added, removed, threshold): | ||
'''find renamed files -- yields (before, after, score) tuples''' | ||||
Yuya Nishihara
|
r31581 | wctx = repo[None] | ||
pctx = wctx.p1() | ||||
David Greenaway
|
r11059 | |||
David Greenaway
|
r11060 | # Zero length files will be frequently unrelated to each other, and | ||
# tracking the deletion/addition of such a file will probably cause more | ||||
# harm than good. We strip them out here to avoid matching them later on. | ||||
Yuya Nishihara
|
r31582 | addedfiles = _dropempty(wctx[fp] for fp in sorted(added)) | ||
removedfiles = _dropempty(pctx[fp] for fp in sorted(removed) if fp in pctx) | ||||
David Greenaway
|
r11060 | |||
# Find exact matches. | ||||
Yuya Nishihara
|
r31580 | matchedfiles = set() | ||
Raphaël Gomès
|
r52596 | for a, b in _findexactmatches(repo, addedfiles, removedfiles): | ||
Yuya Nishihara
|
r31580 | matchedfiles.add(b) | ||
David Greenaway
|
r11060 | yield (a.path(), b.path(), 1.0) | ||
# If the user requested similar files to be matched, search for them also. | ||||
if threshold < 1.0: | ||||
Yuya Nishihara
|
r31580 | addedfiles = [x for x in addedfiles if x not in matchedfiles] | ||
Raphaël Gomès
|
r52596 | for a, b, score in _findsimilarmatches( | ||
Augie Fackler
|
r43346 | repo, addedfiles, removedfiles, threshold | ||
): | ||||
David Greenaway
|
r11060 | yield (a.path(), b.path(), score) | ||