# testparseutil.py - utilities to parse test script for check tools # # Copyright 2018 FUJIWARA Katsunori and others # # This software may be used and distributed according to the terms of the # GNU General Public License version 2 or any later version. from __future__ import absolute_import, print_function import abc import re import sys #################### # for Python3 compatibility (almost comes from mercurial/pycompat.py) ispy3 = (sys.version_info[0] >= 3) def identity(a): return a def _rapply(f, xs): if xs is None: # assume None means non-value of optional data return xs if isinstance(xs, (list, set, tuple)): return type(xs)(_rapply(f, x) for x in xs) if isinstance(xs, dict): return type(xs)((_rapply(f, k), _rapply(f, v)) for k, v in xs.items()) return f(xs) def rapply(f, xs): if f is identity: # fast path mainly for py2 return xs return _rapply(f, xs) if ispy3: import builtins # TODO: .buffer might not exist if std streams were replaced; we'll need # a silly wrapper to make a bytes stream backed by a unicode one. stdin = sys.stdin.buffer stdout = sys.stdout.buffer stderr = sys.stderr.buffer def bytestr(s): # tiny version of pycompat.bytestr return s.encode('latin1') def sysstr(s): if isinstance(s, builtins.str): return s return s.decode(u'latin-1') def opentext(f): return open(f, 'rb') else: stdin = sys.stdin stdout = sys.stdout stderr = sys.stderr bytestr = str sysstr = identity opentext = open def b2s(x): # convert BYTES elements in "x" to SYSSTR recursively return rapply(sysstr, x) def writeout(data): # write "data" in BYTES into stdout stdout.write(data) def writeerr(data): # write "data" in BYTES into stderr stderr.write(data) #################### class embeddedmatcher(object): """Base class to detect embedded code fragments in *.t test script """ __metaclass__ = abc.ABCMeta def __init__(self, desc): self.desc = desc @abc.abstractmethod def startsat(self, line): """Examine whether embedded code starts at line This can return arbitrary object, and it is used as 'ctx' for subsequent method invocations. """ @abc.abstractmethod def endsat(self, ctx, line): """Examine whether embedded code ends at line""" @abc.abstractmethod def isinside(self, ctx, line): """Examine whether line is inside embedded code, if not yet endsat """ @abc.abstractmethod def ignores(self, ctx): """Examine whether detected embedded code should be ignored""" @abc.abstractmethod def filename(self, ctx): """Return filename of embedded code If filename isn't specified for embedded code explicitly, this returns None. """ @abc.abstractmethod def codeatstart(self, ctx, line): """Return actual code at the start line of embedded code This might return None, if the start line doesn't contain actual code. """ @abc.abstractmethod def codeatend(self, ctx, line): """Return actual code at the end line of embedded code This might return None, if the end line doesn't contain actual code. """ @abc.abstractmethod def codeinside(self, ctx, line): """Return actual code at line inside embedded code""" def embedded(basefile, lines, errors, matchers): """pick embedded code fragments up from given lines This is common parsing logic, which examines specified matchers on given lines. :basefile: a name of a file, from which lines to be parsed come. :lines: to be parsed (might be a value returned by "open(basefile)") :errors: an array, into which messages for detected error are stored :matchers: an array of embeddedmatcher objects This function yields '(filename, starts, ends, code)' tuple. :filename: a name of embedded code, if it is explicitly specified (e.g. "foobar" of "cat >> foobar <>> class ambigmatcher(object): ... # mock matcher class to examine implementation of ... # "ambiguous matching" corner case ... def __init__(self, desc, matchfunc): ... self.desc = desc ... self.matchfunc = matchfunc ... def startsat(self, line): ... return self.matchfunc(line) >>> ambig1 = ambigmatcher(b'ambiguous #1', ... lambda l: l.startswith(b' $ cat ')) >>> ambig2 = ambigmatcher(b'ambiguous #2', ... lambda l: l.endswith(b'<< EOF\\n')) >>> lines = [b' $ cat > foo.py << EOF\\n'] >>> errors = [] >>> matchers = [ambig1, ambig2] >>> list(t for t in embedded(b'', lines, errors, matchers)) [] >>> b2s(errors) [':1: ambiguous line for "ambiguous #1", "ambiguous #2"'] """ matcher = None ctx = filename = code = startline = None # for pyflakes for lineno, line in enumerate(lines, 1): if not line.endswith(b'\n'): line += b'\n' # to normalize EOF line if matcher: # now, inside embedded code if matcher.endsat(ctx, line): codeatend = matcher.codeatend(ctx, line) if codeatend is not None: code.append(codeatend) if not matcher.ignores(ctx): yield (filename, startline, lineno, b''.join(code)) matcher = None # DO NOT "continue", because line might start next fragment elif not matcher.isinside(ctx, line): # this is an error of basefile # (if matchers are implemented correctly) errors.append(b'%s:%d: unexpected line for "%s"' % (basefile, lineno, matcher.desc)) # stop extracting embedded code by current 'matcher', # because appearance of unexpected line might mean # that expected end-of-embedded-code line might never # appear matcher = None # DO NOT "continue", because line might start next fragment else: code.append(matcher.codeinside(ctx, line)) continue # examine whether current line starts embedded code or not assert not matcher matched = [] for m in matchers: ctx = m.startsat(line) if ctx: matched.append((m, ctx)) if matched: if len(matched) > 1: # this is an error of matchers, maybe errors.append(b'%s:%d: ambiguous line for %s' % (basefile, lineno, b', '.join([b'"%s"' % m.desc for m, c in matched]))) # omit extracting embedded code, because choosing # arbitrary matcher from matched ones might fail to # detect the end of embedded code as expected. continue matcher, ctx = matched[0] filename = matcher.filename(ctx) code = [] codeatstart = matcher.codeatstart(ctx, line) if codeatstart is not None: code.append(codeatstart) startline = lineno else: startline = lineno + 1 if matcher: # examine whether EOF ends embedded code, because embedded # code isn't yet ended explicitly if matcher.endsat(ctx, b'\n'): codeatend = matcher.codeatend(ctx, b'\n') if codeatend is not None: code.append(codeatend) if not matcher.ignores(ctx): yield (filename, startline, lineno + 1, b''.join(code)) else: # this is an error of basefile # (if matchers are implemented correctly) errors.append(b'%s:%d: unexpected end of file for "%s"' % (basefile, lineno, matcher.desc)) # heredoc limit mark to ignore embedded code at check-code.py or so heredocignorelimit = b'NO_CHECK_EOF' # the pattern to match against cases below, and to return a limit mark # string as 'lname' group # # - << LIMITMARK # - << "LIMITMARK" # - << 'LIMITMARK' heredoclimitpat = br'\s*<<\s*(?P["\']?)(?P\w+)(?P=lquote)' class fileheredocmatcher(embeddedmatcher): """Detect "cat > FILE << LIMIT" style embedded code >>> matcher = fileheredocmatcher(b'heredoc .py file', br'[^<]+\.py') >>> b2s(matcher.startsat(b' $ cat > file.py << EOF\\n')) ('file.py', ' > EOF\\n') >>> b2s(matcher.startsat(b' $ cat >>file.py < EOF\\n') >>> b2s(matcher.startsat(b' $ cat> \\x27any file.py\\x27<< "EOF"\\n')) ('any file.py', ' > EOF\\n') >>> b2s(matcher.startsat(b" $ cat > file.py << 'ANYLIMIT'\\n")) ('file.py', ' > ANYLIMIT\\n') >>> b2s(matcher.startsat(b' $ cat<"file.py"\\n')) ('file.py', ' > ANYLIMIT\\n') >>> start = b' $ cat > file.py << EOF\\n' >>> ctx = matcher.startsat(start) >>> matcher.codeatstart(ctx, start) >>> b2s(matcher.filename(ctx)) 'file.py' >>> matcher.ignores(ctx) False >>> inside = b' > foo = 1\\n' >>> matcher.endsat(ctx, inside) False >>> matcher.isinside(ctx, inside) True >>> b2s(matcher.codeinside(ctx, inside)) 'foo = 1\\n' >>> end = b' > EOF\\n' >>> matcher.endsat(ctx, end) True >>> matcher.codeatend(ctx, end) >>> matcher.endsat(ctx, b' > EOFEOF\\n') False >>> ctx = matcher.startsat(b' $ cat > file.py << NO_CHECK_EOF\\n') >>> matcher.ignores(ctx) True """ _prefix = b' > ' def __init__(self, desc, namepat): super(fileheredocmatcher, self).__init__(desc) # build the pattern to match against cases below (and ">>" # variants), and to return a target filename string as 'name' # group # # - > NAMEPAT # - > "NAMEPAT" # - > 'NAMEPAT' namepat = (br'\s*>>?\s*(?P["\']?)(?P%s)(?P=nquote)' % namepat) self._fileres = [ # "cat > NAME << LIMIT" case re.compile(br' \$ \s*cat' + namepat + heredoclimitpat), # "cat << LIMIT > NAME" case re.compile(br' \$ \s*cat' + heredoclimitpat + namepat), ] def startsat(self, line): # ctx is (filename, END-LINE-OF-EMBEDDED-CODE) tuple for filere in self._fileres: matched = filere.match(line) if matched: return (matched.group('name'), b' > %s\n' % matched.group('limit')) def endsat(self, ctx, line): return ctx[1] == line def isinside(self, ctx, line): return line.startswith(self._prefix) def ignores(self, ctx): return b' > %s\n' % heredocignorelimit == ctx[1] def filename(self, ctx): return ctx[0] def codeatstart(self, ctx, line): return None # no embedded code at start line def codeatend(self, ctx, line): return None # no embedded code at end line def codeinside(self, ctx, line): return line[len(self._prefix):] # strip prefix #### # for embedded python script class pydoctestmatcher(embeddedmatcher): """Detect ">>> code" style embedded python code >>> matcher = pydoctestmatcher() >>> startline = b' >>> foo = 1\\n' >>> matcher.startsat(startline) True >>> matcher.startsat(b' ... foo = 1\\n') False >>> ctx = matcher.startsat(startline) >>> matcher.filename(ctx) >>> matcher.ignores(ctx) False >>> b2s(matcher.codeatstart(ctx, startline)) 'foo = 1\\n' >>> inside = b' >>> foo = 1\\n' >>> matcher.endsat(ctx, inside) False >>> matcher.isinside(ctx, inside) True >>> b2s(matcher.codeinside(ctx, inside)) 'foo = 1\\n' >>> inside = b' ... foo = 1\\n' >>> matcher.endsat(ctx, inside) False >>> matcher.isinside(ctx, inside) True >>> b2s(matcher.codeinside(ctx, inside)) 'foo = 1\\n' >>> inside = b' expected output\\n' >>> matcher.endsat(ctx, inside) False >>> matcher.isinside(ctx, inside) True >>> b2s(matcher.codeinside(ctx, inside)) '\\n' >>> inside = b' \\n' >>> matcher.endsat(ctx, inside) False >>> matcher.isinside(ctx, inside) True >>> b2s(matcher.codeinside(ctx, inside)) '\\n' >>> end = b' $ foo bar\\n' >>> matcher.endsat(ctx, end) True >>> matcher.codeatend(ctx, end) >>> end = b'\\n' >>> matcher.endsat(ctx, end) True >>> matcher.codeatend(ctx, end) """ _prefix = b' >>> ' _prefixre = re.compile(br' (>>>|\.\.\.) ') # If a line matches against not _prefixre but _outputre, that line # is "an expected output line" (= not a part of code fragment). # # Strictly speaking, a line matching against "(#if|#else|#endif)" # is also treated similarly in "inline python code" semantics by # run-tests.py. But "directive line inside inline python code" # should be rejected by Mercurial reviewers. Therefore, this # regexp does not matche against such directive lines. _outputre = re.compile(br' $| [^$]') def __init__(self): super(pydoctestmatcher, self).__init__(b"doctest style python code") def startsat(self, line): # ctx is "True" return line.startswith(self._prefix) def endsat(self, ctx, line): return not (self._prefixre.match(line) or self._outputre.match(line)) def isinside(self, ctx, line): return True # always true, if not yet ended def ignores(self, ctx): return False # should be checked always def filename(self, ctx): return None # no filename def codeatstart(self, ctx, line): return line[len(self._prefix):] # strip prefix ' >>> '/' ... ' def codeatend(self, ctx, line): return None # no embedded code at end line def codeinside(self, ctx, line): if self._prefixre.match(line): return line[len(self._prefix):] # strip prefix ' >>> '/' ... ' return b'\n' # an expected output line is treated as an empty line class pyheredocmatcher(embeddedmatcher): """Detect "python << LIMIT" style embedded python code >>> matcher = pyheredocmatcher() >>> b2s(matcher.startsat(b' $ python << EOF\\n')) ' > EOF\\n' >>> b2s(matcher.startsat(b' $ $PYTHON < EOF\\n' >>> b2s(matcher.startsat(b' $ "$PYTHON"<< "EOF"\\n')) ' > EOF\\n' >>> b2s(matcher.startsat(b" $ $PYTHON << 'ANYLIMIT'\\n")) ' > ANYLIMIT\\n' >>> matcher.startsat(b' $ "$PYTHON" < EOF\\n') >>> start = b' $ python << EOF\\n' >>> ctx = matcher.startsat(start) >>> matcher.codeatstart(ctx, start) >>> matcher.filename(ctx) >>> matcher.ignores(ctx) False >>> inside = b' > foo = 1\\n' >>> matcher.endsat(ctx, inside) False >>> matcher.isinside(ctx, inside) True >>> b2s(matcher.codeinside(ctx, inside)) 'foo = 1\\n' >>> end = b' > EOF\\n' >>> matcher.endsat(ctx, end) True >>> matcher.codeatend(ctx, end) >>> matcher.endsat(ctx, b' > EOFEOF\\n') False >>> ctx = matcher.startsat(b' $ python << NO_CHECK_EOF\\n') >>> matcher.ignores(ctx) True """ _prefix = b' > ' _startre = re.compile(br' \$ (\$PYTHON|"\$PYTHON"|python).*' + heredoclimitpat) def __init__(self): super(pyheredocmatcher, self).__init__(b"heredoc python invocation") def startsat(self, line): # ctx is END-LINE-OF-EMBEDDED-CODE matched = self._startre.match(line) if matched: return b' > %s\n' % matched.group('limit') def endsat(self, ctx, line): return ctx == line def isinside(self, ctx, line): return line.startswith(self._prefix) def ignores(self, ctx): return b' > %s\n' % heredocignorelimit == ctx def filename(self, ctx): return None # no filename def codeatstart(self, ctx, line): return None # no embedded code at start line def codeatend(self, ctx, line): return None # no embedded code at end line def codeinside(self, ctx, line): return line[len(self._prefix):] # strip prefix _pymatchers = [ pydoctestmatcher(), pyheredocmatcher(), # use '[^<]+' instead of '\S+', in order to match against # paths including whitespaces fileheredocmatcher(b'heredoc .py file', br'[^<]+\.py'), ] def pyembedded(basefile, lines, errors): return embedded(basefile, lines, errors, _pymatchers) #### # for embedded shell script _shmatchers = [ # use '[^<]+' instead of '\S+', in order to match against # paths including whitespaces fileheredocmatcher(b'heredoc .sh file', br'[^<]+\.sh'), ] def shembedded(basefile, lines, errors): return embedded(basefile, lines, errors, _shmatchers) #### # for embedded hgrc configuration _hgrcmatchers = [ # use '[^<]+' instead of '\S+', in order to match against # paths including whitespaces fileheredocmatcher(b'heredoc hgrc file', br'(([^/<]+/)+hgrc|\$HGRCPATH|\${HGRCPATH})'), ] def hgrcembedded(basefile, lines, errors): return embedded(basefile, lines, errors, _hgrcmatchers) #### if __name__ == "__main__": import optparse import sys def showembedded(basefile, lines, embeddedfunc, opts): errors = [] for name, starts, ends, code in embeddedfunc(basefile, lines, errors): if not name: name = b'' writeout(b"%s:%d: %s starts\n" % (basefile, starts, name)) if opts.verbose and code: writeout(b" |%s\n" % b"\n |".join(l for l in code.splitlines())) writeout(b"%s:%d: %s ends\n" % (basefile, ends, name)) for e in errors: writeerr(b"%s\n" % e) return len(errors) def applyembedded(args, embeddedfunc, opts): ret = 0 if args: for f in args: with opentext(f) as fp: if showembedded(bytestr(f), fp, embeddedfunc, opts): ret = 1 else: lines = [l for l in stdin.readlines()] if showembedded(b'', lines, embeddedfunc, opts): ret = 1 return ret commands = {} def command(name, desc): def wrap(func): commands[name] = (desc, func) return wrap @command("pyembedded", "detect embedded python script") def pyembeddedcmd(args, opts): return applyembedded(args, pyembedded, opts) @command("shembedded", "detect embedded shell script") def shembeddedcmd(args, opts): return applyembedded(args, shembedded, opts) @command("hgrcembedded", "detect embedded hgrc configuration") def hgrcembeddedcmd(args, opts): return applyembedded(args, hgrcembedded, opts) availablecommands = "\n".join([" - %s: %s" % (key, value[0]) for key, value in commands.items()]) parser = optparse.OptionParser("""%prog COMMAND [file ...] Pick up embedded code fragments from given file(s) or stdin, and list up start/end lines of them in standard compiler format ("FILENAME:LINENO:"). Available commands are: """ + availablecommands + """ """) parser.add_option("-v", "--verbose", help="enable additional output (e.g. actual code)", action="store_true") (opts, args) = parser.parse_args() if not args or args[0] not in commands: parser.print_help() sys.exit(255) sys.exit(commands[args[0]][1](args[1:], opts))