##// END OF EJS Templates
match: match explicit file using a set...
match: match explicit file using a set The matcher has all the logic to do quick comparisons against explicit patterns, however the pattern matcher was shadowing the code using that set and used the compiled regex pattern in all cases, which is quite slow. We restore the usage of the set based matching to boost performance. Building the regexp is still consuming a large amount of time (actually, the majority of the time), which is still silly. Maybe using re2 would help that, but this is a quest for another adventure. Another path to improve this is to have a pattern type dedicated to match the exact path to a file only (not a directory). This pattern could use the set matching only and be skipped in the regex altogether. Benchmarks ========== In the following benchmark we are comparing the `hg cat` and `hg files` run time when matching against all files in the repository. They are run: - without the rust extensions - with the standard python engine (so without re2) Performance improvement in this series -------------------------------------- ###### hg files ############################################################### ### mercurial-2018-08-01-zstd-sparse-revlog ### sorted base-changeset: 0.230092 seconds prev-changeset: 0.230069 seconds this-changeset: 0.211425 seconds (-8.36%) ### mercurial-2018-08-01-zstd-sparse-revlog ### shuffled base-changeset: 0.234235 seconds prev-changeset: 0.231165 seconds (-1.38%) this-changeset: 0.212300 seconds (-9.43%) ### pypy-2018-08-01-zstd-sparse-revlog ### sorted base-changeset: 0.613567 seconds prev-changeset: 0.616799 seconds this-changeset: 0.510852 seconds (-16.82%) ### pypy-2018-08-01-zstd-sparse-revlog ### shuffled base-changeset: 0.801880 seconds prev-changeset: 0.616393 seconds (-23.22%) this-changeset: 0.511903 seconds (-36.23%) ### netbeans-2018-08-01-zstd-sparse-revlog ### sorted base-changeset: 21.541828 seconds prev-changeset: 21.586773 seconds this-changeset: 13.648347 seconds (-36.76%) ### 
netbeans-2018-08-01-zstd-sparse-revlog ### shuffled base-changeset: 172.759857 seconds prev-changeset: 21.908197 seconds (-87.32%) this-changeset: 13.945110 seconds (-91.93%) ### mozilla-central-2018-08-01-zstd-sparse-revlog ### sorted base-changeset: 62.474221 seconds prev-changeset: 61.279490 seconds (-1.22%) this-changeset: 29.529469 seconds (-52.40%) ### mozilla-central-2018-08-01-zstd-sparse-revlog ### shuffled base-changeset: 1364.180218 seconds prev-changeset: 62.473549 seconds (-95.40%) this-changeset: 30.625249 seconds (-97.75%) ###### hg cat ################################################################# ### mercurial-2018-08-01-zstd-sparse-revlog ### sorted base-changeset: 0.764407 seconds prev-changeset: 0.763883 seconds this-changeset: 0.737326 seconds (-3.68%) ### mercurial-2018-08-01-zstd-sparse-revlog ### shuffled base-changeset: 0.768924 seconds prev-changeset: 0.765848 seconds this-changeset: 0.174d0b seconds (-4.44%) ### pypy-2018-08-01-zstd-sparse-revlog ### sorted base-changeset: 2.065220 seconds prev-changeset: 2.070498 seconds this-changeset: 1.939482 seconds (-6.08%) ### pypy-2018-08-01-zstd-sparse-revlog ### shuffled base-changeset: 2.276388 seconds prev-changeset: 2.069197 seconds (-9.15%) this-changeset: 1.931746 seconds (-15.19%) ### netbeans-2018-08-01-zstd-sparse-revlog ### sorted base-changeset: 40.967983 seconds prev-changeset: 41.392423 seconds this-changeset: 32.181681 seconds (-22.20%) ### netbeans-2018-08-01-zstd-sparse-revlog ### shuffled base-changeset: 216.388709 seconds prev-changeset: 41.648689 seconds (-80.88%) this-changeset: 32.580817 seconds (-85.04%) ### mozilla-central-2018-08-01-zstd-sparse-revlog ### sorted base-changeset: 105.228510 seconds prev-changeset: 103.315670 seconds (-1.23%) this-changeset: 69.416118 seconds (-33.64%) ### mozilla-central-2018-08-01-zstd-sparse-revlog ### shuffled base-changeset: 1448.722784 seconds prev-changeset: 104.369358 seconds (-92.80%) this-changeset: 70.554789 seconds (-95.13%) 
Different way to list the same data with this revision ------------------------------------------------------ ###### hg files ############################################################### ### mercurial-2018-08-01-zstd-sparse-revlog root: 0.119182 seconds glob: 0.120697 seconds (+1.27%) sorted: 0.211425 seconds (+77.40%) shuffled: 0.212300 seconds (+78.13%) ### pypy-2018-08-01-zstd-sparse-revlog root: 0.121986 seconds glob: 0.124822 seconds (+2.32%) sorted: 0.510852 seconds (+318.78%) shuffled: 0.511903 seconds (+319.64%) ### netbeans-2018-08-01-zstd-sparse-revlog root: 0.173984 seconds glob: 0.227203 seconds (+30.59%) sorted: 13.648347 seconds (+7744.59%) shuffled: 13.945110 seconds (+7915.16%) ### mozilla-central-2018-08-01-zstd-sparse-revlog root: 0.366463 seconds glob: 0.491030 seconds (+33.99%) sorted: 29.529469 seconds (+7957.96%) shuffled: 30.625249 seconds (+8256.97%) ###### hg cat ################################################################# ### mercurial-2018-08-01-zstd-sparse-revlog glob: 0.647471 seconds root: 0.643120 seconds shuffled: 0.174d0b seconds (+13.92%) sorted: 0.737326 seconds (+13.88%) ### mozilla-central-2018-08-01-zstd-sparse-revlog glob: 40.596983 seconds root: 40.129136 seconds shuffled: 70.554789 seconds (+73.79%) sorted: 69.416118 seconds (+70.99%) ### netbeans-2018-08-01-zstd-sparse-revlog glob: 18.777924 seconds root: 18.613905 seconds shuffled: 32.580817 seconds (+73.51%) sorted: 32.181681 seconds (+71.38%) ### pypy-2018-08-01-zstd-sparse-revlog glob: 1.555319 seconds root: 1.536534 seconds shuffled: 1.931746 seconds (+24.20%) sorted: 1.939482 seconds (+24.70%)

File last commit:

r50201:2e726c93 default
r51286:81c7d04f stable
Show More
grep.py
221 lines | 7.1 KiB | text/x-python | PythonLexer
# grep.py - logic for history walk and grep
#
# Copyright 2005-2007 Olivia Mackall <olivia@selenic.com>
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.
import difflib
from .i18n import _
from . import (
error,
match as matchmod,
pycompat,
scmutil,
util,
)
def matchlines(body, regexp):
    """Yield one entry per line of ``body`` on which ``regexp`` matches.

    ``body`` is a bytes blob and ``regexp`` a compiled pattern exposing
    ``search(body, pos)``.  Each yielded item is a tuple
    ``(linenum, colstart, colend, line)``:

    - ``linenum`` is the 1-based number of the line where the match starts,
    - ``colstart``/``colend`` are the match offsets relative to the start of
      that line,
    - ``line`` is the text from the start of that line up to (not including)
      the first newline found at or after the end of the match.

    After each hit the search resumes just past that newline, so at most one
    match is reported per line.
    """
    size = len(body)
    pos = 0
    lineno = 0
    while pos < size:
        found = regexp.search(body, pos)
        if not found:
            return
        mstart, mend = found.span()

        # Every newline skipped since the previous resume point is one more
        # line; the extra 1 accounts for the line holding the match itself.
        lineno += body.count(b'\n', pos, mstart) + 1

        # The line starts right after the closest preceding newline, or at
        # the resume point when no newline lies in between.
        nl_before = body.rfind(b'\n', pos, mstart)
        line_start = pos if nl_before == -1 else nl_before + 1

        # The line ends at the next newline, or at the end of the body when
        # the last line is not newline-terminated.
        nl_after = body.find(b'\n', mend)
        line_end = size if nl_after == -1 else nl_after
        pos = line_end + 1

        yield (
            lineno,
            mstart - line_start,
            mend - line_start,
            body[line_start:line_end],
        )
class linestate:
    """A line that matched the search pattern, with its first match span.

    Attributes:
    - line: content of the matching line
    - linenum: line number of the match
    - colstart, colend: start/end offsets of the first match within ``line``

    Equality and hashing only consider ``line``, so two states holding the
    same text compare equal regardless of their line number or columns.
    """

    def __init__(self, line, linenum, colstart, colend):
        self.line = line
        self.linenum = linenum
        self.colstart = colstart
        self.colend = colend

    def __hash__(self):
        # Must stay consistent with __eq__, which compares ``line`` only.
        return hash(self.line)

    def __eq__(self, other):
        # Let Python fall back to its default handling for foreign types
        # instead of failing with AttributeError on ``other.line``.
        if not isinstance(other, linestate):
            return NotImplemented
        return self.line == other.line

    def findpos(self, regexp):
        """Iterate all (start, end) indices of matches

        The span recorded at construction time is yielded first; the rest of
        the line is then scanned with ``regexp`` starting at ``colend``.
        """
        yield self.colstart, self.colend
        p = self.colend
        while p < len(self.line):
            m = regexp.search(self.line, p)
            if not m:
                break
            if m.end() == p:
                # Empty match at the current position: step one character
                # forward so the scan always makes progress.
                p += 1
            else:
                yield m.span()
                p = m.end()
def difflinestates(a, b):
    """Yield ``(sign, item)`` pairs describing how to turn ``a`` into ``b``.

    ``a`` and ``b`` are sequences of hashable items compared with
    ``difflib.SequenceMatcher``.  Items only present in ``a`` are yielded
    with ``b'-'``, items only present in ``b`` with ``b'+'``; unchanged
    items are not reported.  For a replaced range, all removals are yielded
    before the additions.
    """
    matcher = difflib.SequenceMatcher(None, a, b)
    for op, a_lo, a_hi, b_lo, b_hi in matcher.get_opcodes():
        if op in ('delete', 'replace'):
            for removed in a[a_lo:a_hi]:
                yield (b'-', removed)
        if op in ('insert', 'replace'):
            for added in b[b_lo:b_hi]:
                yield (b'+', added)
class grepsearcher:
    """Search files and revisions for lines matching the given pattern

    Options:
    - all_files: search every file matched at a revision, not only the
      files the revision touched.
    - diff: also search the first parent's version of each file so the two
      sets of matches can be compared.
    - follow: record copy/rename sources so that skipping a file also skips
      the file it was copied from.
    """

    def __init__(
        self, ui, repo, regexp, all_files=False, diff=False, follow=False
    ):
        # Collaborators and options.
        self._ui = ui
        self._repo = repo
        self._regexp = regexp
        self._all_files = all_files
        self._diff = diff
        self._follow = follow

        # Cached accessors built on top of the repository.
        self._getfile = util.lrucachefunc(repo.file)
        self._getrenamed = scmutil.getrenamedfn(repo)

        # Search state, filled in by _prep() and consumed by searchfiles():
        #   _matches:  {rev: {filename: [linestate, ...]}}
        #   _copies:   {rev: {filename: copy source}}
        #   _skip:     filenames excluded from further searching
        #   _revfiles: {rev: [filename, ...]} files to report for a revision
        self._matches = {}
        self._copies = {}
        self._skip = set()
        self._revfiles = {}

    def skipfile(self, fn, rev):
        """Stop searching ``fn`` from now on.

        When a copy source was recorded for ``fn`` at ``rev``, that source
        is excluded as well.
        """
        self._skip.add(fn)
        source = self._copies.get(rev, {}).get(fn)
        if source:
            self._skip.add(source)

    def searchfiles(self, revs, makefilematcher):
        """Walk files and revisions to yield (fn, ctx, pstates, states)
        matches

        ``states`` holds the linestate objects found in ``fn`` at ``ctx``;
        ``pstates`` holds those found in the first parent (looked up under
        the copy source when one was recorded) and may be empty unless diff
        is True.  Entries where both lists are empty are not yielded.
        """
        walker = scmutil.walkchangerevs(
            self._repo, revs, makefilematcher, self._prep
        )
        for ctx in walker:
            rev = ctx.rev()
            parentrev = ctx.p1().rev()
            for fn in sorted(self._revfiles.get(rev, [])):
                states = self._matches[rev][fn]
                source = self._copies.get(rev, {}).get(fn)
                if fn in self._skip:
                    # Keep the exclusion flowing to the copy source.
                    if source:
                        self._skip.add(source)
                    continue
                parentmatches = self._matches.get(parentrev, {})
                pstates = parentmatches.get(source or fn, [])
                if pstates or states:
                    yield fn, ctx, pstates, states
            del self._revfiles[rev]
            # Matches are retained while revisions are still pending;
            # once none are left the whole cache can be dropped.
            if not self._revfiles:
                self._matches.clear()

    def _grepbody(self, fn, rev, body):
        """Record the matching lines of ``body`` for ``fn`` at ``rev``.

        An entry (possibly an empty list) is always created for ``fn``; a
        ``body`` of None leaves that entry untouched.
        """
        found = self._matches[rev].setdefault(fn, [])
        if body is None:
            return
        found.extend(
            linestate(line, lnum, cstart, cend)
            for lnum, cstart, cend, line in matchlines(body, self._regexp)
        )

    def _readfile(self, ctx, fn):
        """Return the content of ``fn`` in ``ctx``, or None if unreadable.

        None is returned when the file is missing from a context without a
        revision number, or when its stored content is censored (a warning
        is emitted in the latter case).
        """
        rev = ctx.rev()
        if rev is None:
            # No revision number (presumably the working directory): read
            # through the file context; the file may have disappeared.
            fctx = ctx[fn]
            try:
                return fctx.data()
            except FileNotFoundError:
                return None

        flog = self._getfile(fn)
        fnode = ctx.filenode(fn)
        try:
            return flog.read(fnode)
        except error.CensoredNodeError:
            self._ui.warn(
                _(
                    b'cannot search in censored file: '
                    b'%(filename)s:%(revnum)s\n'
                )
                % {b'filename': fn, b'revnum': pycompat.bytestr(rev)}
            )
        return None

    def _prep(self, ctx, fmatch):
        """Collect the matches for every relevant file of ``ctx``.

        Passed as the preparation callback to ``scmutil.walkchangerevs``.
        Fills ``_revfiles``, ``_matches`` and (with follow) ``_copies`` for
        the revision, and for its first parent when diff is enabled.
        """
        rev = ctx.rev()
        pctx = ctx.p1()
        self._matches.setdefault(rev, {})
        if self._diff:
            parent = pctx.rev()
            self._matches.setdefault(parent, {})
        revfiles = self._revfiles.setdefault(rev, [])

        # in `hg grep pattern`, 2/3 of the time is spent in pathauditor
        # checks without this in mozilla-central
        auditscope = (
            self._repo.wvfs.audit.cached
            if rev is None
            else util.nullcontextmanager
        )
        with auditscope():
            # TODO: maybe better to warn missing files?
            if self._all_files:
                fmatch = matchmod.badmatch(fmatch, lambda f, msg: None)
                candidates = ctx.matches(fmatch)
            else:
                candidates = (f for f in ctx.files() if fmatch(f))

            for fn in candidates:
                # fn might not exist in the revision (could be a file
                # removed by the revision). We could check `fn not in ctx`
                # even when rev is None, but it's less racy to protect
                # against that in readfile.
                if rev is not None and fn not in ctx:
                    continue

                copy = self._getrenamed(fn, rev) if self._follow else None
                if copy:
                    self._copies.setdefault(rev, {})[fn] = copy
                    if fn in self._skip:
                        self._skip.add(copy)
                if fn in self._skip:
                    continue

                revfiles.append(fn)
                if fn not in self._matches[rev]:
                    self._grepbody(fn, rev, self._readfile(ctx, fn))

                if not self._diff:
                    continue
                # The parent side is searched under the pre-copy name.
                pfn = copy or fn
                if pfn not in self._matches[parent] and pfn in pctx:
                    self._grepbody(pfn, parent, self._readfile(pctx, pfn))