upstream/mercurial-mirror Commit - r11060:e6df0177

findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes....

David Greenaway -

r11060:e6df0177 default

parent child

mercurial/similar.py

0 +59 -15

@@ -1,59 +1,103
1	# similar.py - mechanisms for finding similar files	1	# similar.py - mechanisms for finding similar files
2	#	2	#
3	# Copyright 2005-2007 Matt Mackall <mpm@selenic.com>	3	# Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
4	#	4	#
5	# This software may be used and distributed according to the terms of the	5	# This software may be used and distributed according to the terms of the
6	# GNU General Public License version 2 or any later version.	6	# GNU General Public License version 2 or any later version.
7		7
8	from i18n import _	8	from i18n import _
9	import util	9	import util
10	import mdiff	10	import mdiff
11	import bdiff	11	import bdiff
12		12
13	def findrenames(repo, added, removed, threshold):	13	def _findexactmatches(repo, added, removed):
14	'''find renamed files -- yields (before, after, score) tuples'''	14	'''find renamed files that have no changes
		15
		16	Takes a list of new filectxs and a list of removed filectxs, and yields
		17	(before, after) tuples of exact matches.
		18	'''
		19	numfiles = len(added) + len(removed)
		20
		21	# Get hashes of removed files.
		22	hashes = {}
		23	for i, fctx in enumerate(removed):
		24	repo.ui.progress(_('searching for exact renames'), i, total=numfiles)
		25	h = util.sha1(fctx.data()).digest()
		26	hashes[h] = fctx
		27
		28	# For each added file, see if it corresponds to a removed file.
		29	for i, fctx in enumerate(added):
		30	repo.ui.progress(_('searching for exact renames'), i + len(removed),
		31	total=numfiles)
		32	h = util.sha1(fctx.data()).digest()
		33	if h in hashes:
		34	yield (hashes[h], fctx)
		35
		36	# Done
		37	repo.ui.progress(_('searching for exact renames'), None)
		38
		39	def _findsimilarmatches(repo, added, removed, threshold):
		40	'''find potentially renamed files based on similar file content
		41
		42	Takes a list of new filectxs and a list of removed filectxs, and yields
		43	(before, after, score) tuples of partial matches.
		44	'''
15	copies = {}	45	copies = {}
16	ctx = repo['.']
17	for i, r in enumerate(removed):	46	for i, r in enumerate(removed):
18	repo.ui.progress(_('searching'), i, total=len(removed))	47	repo.ui.progress(_('searching for similar files'), i, total=len(removed))
19	if r not in ctx:
20	continue
21	fctx = ctx.filectx(r)
22		48
23	# lazily load text	49	# lazily load text
24	@util.cachefunc	50	@util.cachefunc
25	def data():	51	def data():
26	orig = fctx.data()	52	orig = r.data()
27	return orig, mdiff.splitnewlines(orig)	53	return orig, mdiff.splitnewlines(orig)
28		54
29	def score(text):	55	def score(text):
30	if not len(text):
31	return 0.0
32	if not fctx.cmp(text):
33	return 1.0
34	if threshold == 1.0:
35	return 0.0
36	orig, lines = data()	56	orig, lines = data()
37	# bdiff.blocks() returns blocks of matching lines	57	# bdiff.blocks() returns blocks of matching lines
38	# count the number of bytes in each	58	# count the number of bytes in each
39	equal = 0	59	equal = 0
40	matches = bdiff.blocks(text, orig)	60	matches = bdiff.blocks(text, orig)
41	for x1, x2, y1, y2 in matches:	61	for x1, x2, y1, y2 in matches:
42	for line in lines[y1:y2]:	62	for line in lines[y1:y2]:
43	equal += len(line)	63	equal += len(line)
44		64
45	lengths = len(text) + len(orig)	65	lengths = len(text) + len(orig)
46	return equal * 2.0 / lengths	66	return equal * 2.0 / lengths
47		67
48	for a in added:	68	for a in added:
49	bestscore = copies.get(a, (None, threshold))[1]	69	bestscore = copies.get(a, (None, threshold))[1]
50	myscore = score(repo.wread(a))	70	myscore = score(a.data())
51	if myscore >= bestscore:	71	if myscore >= bestscore:
52	copies[a] = (r, myscore)	72	copies[a] = (r, myscore)
53	repo.ui.progress(_('searching'), None)	73	repo.ui.progress(_('searching'), None)
54		74
55	for dest, v in copies.iteritems():	75	for dest, v in copies.iteritems():
56	source, score = v	76	source, score = v
57	yield source, dest, score	77	yield source, dest, score
58		78
		79	def findrenames(repo, added, removed, threshold):
		80	'''find renamed files -- yields (before, after, score) tuples'''
		81	parentctx = repo['.']
		82	workingctx = repo[None]
59		83
		84	# Zero length files will be frequently unrelated to each other, and
		85	# tracking the deletion/addition of such a file will probably cause more
		86	# harm than good. We strip them out here to avoid matching them later on.
		87	addedfiles = set([workingctx[fp] for fp in added
		88	if workingctx[fp].size() > 0])
		89	removedfiles = set([parentctx[fp] for fp in removed
		90	if fp in parentctx and parentctx[fp].size() > 0])
		91
		92	# Find exact matches.
		93	for (a, b) in _findexactmatches(repo,
		94	sorted(addedfiles),sorted( removedfiles)):
		95	addedfiles.remove(b)
		96	yield (a.path(), b.path(), 1.0)
		97
		98	# If the user requested similar files to be matched, search for them also.
		99	if threshold < 1.0:
		100	for (a, b, score) in _findsimilarmatches(repo,
		101	sorted(addedfiles), sorted(removedfiles), threshold):
		102	yield (a.path(), b.path(), score)
		103

General Comments 0

Write
Preview

You need to be logged in to leave comments. Login now

No TODOs yet

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages