##// END OF EJS Templates
py3: use r'' instead of sysstr('') to get around code transformer...
py3: use r'' instead of sysstr('') to get around code transformer Fewer function calls should be better.

File last commit:

r32201:ded48ad5 default
r36853:5bc7ff10 default
Show More
similar.py
122 lines | 4.1 KiB | text/x-python | PythonLexer
David Greenaway
Move 'findrenames' code into its own file....
r11059 # similar.py - mechanisms for finding similar files
#
# Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.
Gregory Szorc
similar: use absolute_import
r27359 from __future__ import absolute_import
from .i18n import _
from . import (
mdiff,
)
David Greenaway
Move 'findrenames' code into its own file....
r11059
David Greenaway
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes....
def _findexactmatches(repo, added, removed):
    '''find renamed files whose content did not change at all

    Takes a list of new filectxs and a list of removed filectxs, and yields
    (before, after) tuples of exact matches. Each added file is paired with
    at most one removed file: the first one found with identical data.
    '''
    nremoved = len(removed)
    total = len(added) + nremoved

    # Index the removed files as {hash(fctx.data()): [fctx, ...]}. Only the
    # hash is kept, so the file data itself need not stay in memory.
    byhash = {}
    for pos, oldfctx in enumerate(removed):
        repo.ui.progress(_('searching for exact renames'), pos, total=total,
                         unit=_('files'))
        byhash.setdefault(hash(oldfctx.data()), []).append(oldfctx)

    # Look every added file up in the index; progress positions continue
    # after those already reported for the removed files.
    for pos, newfctx in enumerate(added, nremoved):
        repo.ui.progress(_('searching for exact renames'), pos,
                         total=total, unit=_('files'))
        newdata = newfctx.data()
        for oldfctx in byhash.get(hash(newdata), ()):
            # equal hashes are only a hint: confirm with the real contents
            if newdata == oldfctx.data():
                yield (oldfctx, newfctx)
                break

    # Close the progress topic.
    repo.ui.progress(_('searching for exact renames'), None)
Sean Farley
similar: move score function to module level...
def _ctxdata(fctx):
    '''return an (orig, lines) pair for the given filectx

    ``orig`` is the result of fctx.data() and ``lines`` is that same content
    passed through mdiff.splitnewlines().
    '''
    content = fctx.data()
    lines = mdiff.splitnewlines(content)
    return content, lines
Pierre-Yves David
similar: remove caching from the module level...
def _score(fctx, otherdata):
    '''return the similarity score between fctx and pre-loaded file data

    ``otherdata`` is an (orig, lines) pair in the form built by _ctxdata().
    The score is twice the number of bytes found on matching lines divided
    by the combined length of both contents.

    When both contents are empty there is nothing to divide by; 1.0 is
    returned in that case instead of raising ZeroDivisionError.
    '''
    orig, lines = otherdata
    text = fctx.data()

    lengths = len(text) + len(orig)
    if not lengths:
        # Two empty contents are byte-for-byte identical.
        return 1.0

    # mdiff.blocks() returns blocks of matching lines
    # count the number of bytes in each
    equal = 0
    for x1, x2, y1, y2 in mdiff.blocks(text, orig):
        # y1:y2 indexes the lines of ``orig`` covered by this block
        equal += sum(len(line) for line in lines[y1:y2])

    return equal * 2.0 / lengths
Pierre-Yves David
similar: remove caching from the module level...
def score(fctx1, fctx2):
    '''return the similarity score of fctx1 measured against fctx2

    The data of fctx2 is loaded through _ctxdata() and then handed to
    _score() together with fctx1.
    '''
    otherdata = _ctxdata(fctx2)
    return _score(fctx1, otherdata)
David Greenaway
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes....
def _findsimilarmatches(repo, added, removed, threshold):
    '''find potentially renamed files based on similar file content

    Takes a list of new filectxs and a list of removed filectxs, and yields
    (before, after, score) tuples of partial matches.

    For every added file the removed file with the highest score is kept.
    A candidate must score strictly above ``threshold`` (and strictly above
    the best score seen so far), so on a tie the earlier removed file wins.
    '''
    # {added fctx: (best matching removed fctx, its score)}
    copies = {}
    for i, r in enumerate(removed):
        repo.ui.progress(_('searching for similar files'), i,
                         total=len(removed), unit=_('files'))

        # Data of the removed file, loaded only once an added file actually
        # needs to be compared against it.
        data = None
        for a in added:
            bestscore = copies.get(a, (None, threshold))[1]
            if data is None:
                data = _ctxdata(r)
            myscore = _score(a, data)
            if myscore > bestscore:
                copies[a] = (r, myscore)

    # Close the progress report under the same topic it was opened with.
    repo.ui.progress(_('searching for similar files'), None)

    # items() is available on both Python 2 and Python 3 dictionaries.
    for dest, (source, bscore) in copies.items():
        yield source, dest, bscore
David Greenaway
Move 'findrenames' code into its own file....
r11059
Yuya Nishihara
similar: do not look up and create filectx more than once...
def _dropempty(fctxs):
    '''return a list of the given filectxs whose size() is greater than 0

    The input may be any iterable; the original order is preserved.
    '''
    nonempty = []
    for fctx in fctxs:
        if fctx.size() > 0:
            nonempty.append(fctx)
    return nonempty
David Greenaway
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes....
def findrenames(repo, added, removed, threshold):
    '''find renamed files -- yields (before, after, score) tuples

    ``added`` and ``removed`` are file names; they are looked up in the
    working context and in its first parent respectively. Exact matches are
    yielded first with a score of 1.0. Similar (non-identical) matches are
    only searched for when ``threshold`` is below 1.0.
    '''
    wctx = repo[None]
    pctx = wctx.p1()

    # Zero length files will be frequently unrelated to each other, and
    # tracking the deletion/addition of such a file will probably cause more
    # harm than good. We strip them out here to avoid matching them later on.
    # Names are sorted so that the candidates are visited in path order.
    addedfiles = _dropempty(wctx[name] for name in sorted(added))
    removedfiles = _dropempty(pctx[name] for name in sorted(removed)
                              if name in pctx)

    # Exact matches first; remember which added files are already paired.
    paired = set()
    for (before, after) in _findexactmatches(repo, addedfiles, removedfiles):
        paired.add(after)
        yield (before.path(), after.path(), 1.0)

    # Only look for similar files if the user asked for them.
    if not threshold < 1.0:
        return

    remaining = [fctx for fctx in addedfiles if fctx not in paired]
    similar = _findsimilarmatches(repo, remaining, removedfiles, threshold)
    for (before, after, ratio) in similar:
        yield (before.path(), after.path(), ratio)