##// END OF EJS Templates
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes....
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes. We speed up 'findrenames' for the usecase when a user specifies they want a similarity of 100% by matching files by their exact SHA1 hash value. This reduces the number of comparisons required to find exact matches from O(n^2) to O(n). While it would be nice if we could just use mercurial's pre-calculated SHA1 hash for existing files, this hash includes the file's ancestor information making it unsuitable for our purposes. Instead, we calculate the hash of old content from scratch. The following benchmarks were taken on the current head of crew: addremove 100% similarity: rm -rf *; hg up -C; mv tests tests.new hg --time addremove -s100 --dry-run before: real 176.350 secs (user 128.890+0.000 sys 47.430+0.000) after: real 2.130 secs (user 1.890+0.000 sys 0.240+0.000) addremove 75% similarity: rm -rf *; hg up -C; mv tests tests.new; \ for i in tests.new/*; do echo x >> $i; done hg --time addremove -s75 --dry-run before: real 264.560 secs (user 215.130+0.000 sys 49.410+0.000) after: real 218.710 secs (user 172.790+0.000 sys 45.870+0.000)

File last commit:

r10282:08a0f04b default
r11060:e6df0177 default
Show More
match.py
250 lines | 7.5 KiB | text/x-python | PythonLexer
timeless
Generally replace "file name" with "filename" in help and comments.
r8761 # match.py - filename matching
Martin Geisler
match: add copyright and license header
r8231 #
# Copyright 2008, 2009 Matt Mackall <mpm@selenic.com> and others
#
# This software may be used and distributed according to the terms of the
Matt Mackall
Update license to GPLv2+
r10263 # GNU General Public License version 2 or any later version.
Martin Geisler
match: add copyright and license header
r8231
Alejandro Santos
split local and stdlib module imports (eases migration issues)
r9036 import re
import util
Matt Mackall
walk: introduce match objects
r6576
Matt Mackall
match: fold match into _match base class
r8587 class match(object):
Matt Mackall
match: add some default args
r8567 def __init__(self, root, cwd, patterns, include=[], exclude=[],
Matt Mackall
match: add exact flag to match() to unify all match forms
r8586 default='glob', exact=False):
Matt Mackall
match: fold _matcher into match.__init__
r8581 """build an object to match a set of file patterns
arguments:
root - the canonical root of the tree you're matching against
cwd - the current working directory, if relevant
patterns - patterns to find
include - patterns to include
exclude - patterns to exclude
default - if a pattern in names has no explicit type, assume this one
Matt Mackall
match: add exact flag to match() to unify all match forms
r8586 exact - patterns are actually literals
Matt Mackall
match: fold _matcher into match.__init__
r8581
a pattern is one of:
'glob:<glob>' - a glob relative to cwd
're:<regexp>' - a regular expression
'path:<path>' - a path relative to canonroot
'relglob:<glob>' - an unrooted glob (*.c matches C files in all dirs)
'relpath:<path>' - a path relative to cwd
Matt Mackall
match: fold match into _match base class
r8587 'relre:<regexp>' - a regexp that needn't match the start of a name
'<something>' - a pattern of the specified default type
Matt Mackall
match: fold _matcher into match.__init__
r8581 """
Matt Mackall
match: fold match into _match base class
r8587 self._root = root
self._cwd = cwd
self._files = []
self._anypats = bool(include or exclude)
Matt Mackall
match: fold _matcher into match.__init__
r8581
Matt Mackall
match: add exact flag to match() to unify all match forms
r8586 if include:
im = _buildmatch(_normalize(include, 'glob', root, cwd), '(?:/|$)')
if exclude:
em = _buildmatch(_normalize(exclude, 'glob', root, cwd), '(?:/|$)')
if exact:
Matt Mackall
match: fold match into _match base class
r8587 self._files = patterns
Matt Mackall
match: add exact flag to match() to unify all match forms
r8586 pm = self.exact
elif patterns:
Matt Mackall
match: fold _matcher into match.__init__
r8581 pats = _normalize(patterns, default, root, cwd)
Matt Mackall
match: fold match into _match base class
r8587 self._files = _roots(pats)
self._anypats = self._anypats or _anypats(pats)
Matt Mackall
match: fold _matcher into match.__init__
r8581 pm = _buildmatch(pats, '$')
Matt Mackall
match: add exact flag to match() to unify all match forms
r8586 if patterns or exact:
Matt Mackall
match: fold _matcher into match.__init__
r8581 if include:
if exclude:
m = lambda f: im(f) and not em(f) and pm(f)
else:
m = lambda f: im(f) and pm(f)
else:
if exclude:
m = lambda f: not em(f) and pm(f)
else:
m = pm
else:
if include:
if exclude:
m = lambda f: im(f) and not em(f)
else:
m = im
else:
if exclude:
m = lambda f: not em(f)
else:
m = lambda f: True
Matt Mackall
match: fold match into _match base class
r8587 self.matchfn = m
self._fmap = set(self._files)
def __call__(self, fn):
return self.matchfn(fn)
def __iter__(self):
for f in self._files:
yield f
def bad(self, f, msg):
Matt Mackall
match: document bad callback semantics
r8678 '''callback for each explicit file that can't be
found/accessed, with an error message
'''
Matt Mackall
match: ignore return of match.bad...
r8680 pass
Matt Mackall
match: fold match into _match base class
r8587 def dir(self, f):
pass
def missing(self, f):
pass
def exact(self, f):
return f in self._fmap
def rel(self, f):
return util.pathto(self._root, self._cwd, f)
def files(self):
return self._files
def anypats(self):
return self._anypats
Matt Mackall
match: refactor patkind...
r8568
Matt Mackall
match: add exact flag to match() to unify all match forms
r8586 class exact(match):
Matt Mackall
match: redefine always and never in terms of match and exact
r8585 def __init__(self, root, cwd, files):
Matt Mackall
match: add exact flag to match() to unify all match forms
r8586 match.__init__(self, root, cwd, files, exact = True)
Matt Mackall
match: redefine always and never in terms of match and exact
r8585
class always(match):
def __init__(self, root, cwd):
match.__init__(self, root, cwd, [])
Matt Mackall
match: refactor patkind...
r8568 def patkind(pat):
Matt Mackall
match: move util match functions over
r8570 return _patsplit(pat, None)[0]
def _patsplit(pat, default):
"""Split a string into an optional pattern kind prefix and the
actual pattern."""
Matt Mackall
match: optimize _patsplit
r8579 if ':' in pat:
Matt Mackall
match: fix _patsplit breakage with drive letters
r8613 kind, val = pat.split(':', 1)
if kind in ('re', 'glob', 'path', 'relglob', 'relpath', 'relre'):
return kind, val
Matt Mackall
match: move util match functions over
r8570 return default, pat
Matt Mackall
match: remove head and tail args from _globre
r8582 def _globre(pat):
Matt Mackall
match: move util match functions over
r8570 "convert a glob pattern into a regexp"
i, n = 0, len(pat)
res = ''
group = 0
Matt Mackall
match: optimize escaping in _globre...
r8583 escape = re.escape
Matt Mackall
many, many trivial check-code fixups
r10282 def peek():
return i < n and pat[i]
Matt Mackall
match: move util match functions over
r8570 while i < n:
c = pat[i]
Matt Mackall
many, many trivial check-code fixups
r10282 i += 1
Matt Mackall
match: optimize escaping in _globre...
r8583 if c not in '*?[{},\\':
res += escape(c)
elif c == '*':
Matt Mackall
match: move util match functions over
r8570 if peek() == '*':
i += 1
res += '.*'
else:
res += '[^/]*'
elif c == '?':
res += '.'
elif c == '[':
j = i
if j < n and pat[j] in '!]':
j += 1
while j < n and pat[j] != ']':
j += 1
if j >= n:
res += '\\['
else:
stuff = pat[i:j].replace('\\','\\\\')
i = j + 1
if stuff[0] == '!':
stuff = '^' + stuff[1:]
elif stuff[0] == '^':
stuff = '\\' + stuff
res = '%s[%s]' % (res, stuff)
elif c == '{':
group += 1
res += '(?:'
elif c == '}' and group:
res += ')'
group -= 1
elif c == ',' and group:
res += '|'
elif c == '\\':
p = peek()
if p:
i += 1
Matt Mackall
match: optimize escaping in _globre...
r8583 res += escape(p)
Matt Mackall
match: move util match functions over
r8570 else:
Matt Mackall
match: optimize escaping in _globre...
r8583 res += escape(c)
Matt Mackall
match: move util match functions over
r8570 else:
Matt Mackall
match: optimize escaping in _globre...
r8583 res += escape(c)
Matt Mackall
match: remove head and tail args from _globre
r8582 return res
Matt Mackall
match: move util match functions over
r8570
Matt Mackall
match: unnest functions in _matcher
r8574 def _regex(kind, name, tail):
'''convert a pattern into a regular expression'''
if not name:
return ''
if kind == 're':
return name
elif kind == 'path':
return '^' + re.escape(name) + '(?:/|$)'
elif kind == 'relglob':
Matt Mackall
match: remove head and tail args from _globre
r8582 return '(?:|.*/)' + _globre(name) + tail
Matt Mackall
match: unnest functions in _matcher
r8574 elif kind == 'relpath':
return re.escape(name) + '(?:/|$)'
elif kind == 'relre':
if name.startswith('^'):
return name
return '.*' + name
Matt Mackall
match: remove head and tail args from _globre
r8582 return _globre(name) + tail
Matt Mackall
match: unnest functions in _matcher
r8574
Matt Mackall
match: rename _matchfn to _buildmatch
r8580 def _buildmatch(pats, tail):
Matt Mackall
match: unnest functions in _matcher
r8574 """build a matching function from a set of patterns"""
try:
pat = '(?:%s)' % '|'.join([_regex(k, p, tail) for (k, p) in pats])
if len(pat) > 20000:
raise OverflowError()
return re.compile(pat).match
except OverflowError:
# We're using a Python with a tiny regex engine and we
# made it explode, so we'll divide the pattern list in two
# until it works
l = len(pats)
if l < 2:
raise
Matt Mackall
match: rename _matchfn to _buildmatch
r8580 a, b = _buildmatch(pats[:l//2], tail), _buildmatch(pats[l//2:], tail)
Matt Mackall
match: unnest functions in _matcher
r8574 return lambda s: a(s) or b(s)
except re.error:
for k, p in pats:
try:
re.compile('(?:%s)' % _regex(k, p, tail))
except re.error:
raise util.Abort("invalid pattern (%s): %s" % (k, p))
raise util.Abort("invalid pattern")
Matt Mackall
match: tweak some names
r8578 def _normalize(names, default, root, cwd):
Matt Mackall
match: unnest functions in _matcher
r8574 pats = []
for kind, name in [_patsplit(p, default) for p in names]:
if kind in ('glob', 'relpath'):
Matt Mackall
match: tweak some names
r8578 name = util.canonpath(root, cwd, name)
Matt Mackall
match: unnest functions in _matcher
r8574 elif kind in ('relglob', 'path'):
name = util.normpath(name)
pats.append((kind, name))
Matt Mackall
match: split up _normalizepats
r8576 return pats
Matt Mackall
match: unnest functions in _matcher
r8574
Matt Mackall
match: split up _normalizepats
r8576 def _roots(patterns):
r = []
for kind, name in patterns:
Matt Mackall
match: fold _globprefix into _roots
r8584 if kind == 'glob': # find the non-glob prefix
root = []
for p in name.split('/'):
if '[' in p or '{' in p or '*' in p or '?' in p:
break
root.append(p)
r.append('/'.join(root) or '.')
Matt Mackall
match: unnest functions in _matcher
r8574 elif kind in ('relpath', 'path'):
Matt Mackall
match: split up _normalizepats
r8576 r.append(name or '.')
Matt Mackall
match: unnest functions in _matcher
r8574 elif kind == 'relglob':
Matt Mackall
match: split up _normalizepats
r8576 r.append('.')
return r
def _anypats(patterns):
for kind, name in patterns:
if kind in ('glob', 're', 'relglob', 'relre'):
return True