lexers.py
502 lines
| 17.6 KiB
| text/x-python
|
PythonLexer
chebee7i
|
r13629 | # -*- coding: utf-8 -*- | ||
""" | ||||
Defines a variety of Pygments lexers for highlighting IPython code. | ||||
This includes: | ||||
Thomas Kluyver
|
r14809 | IPythonLexer, IPython3Lexer | ||
chebee7i
|
r13629 | Lexers for pure IPython (python + magic/shell commands) | ||
Thomas Kluyver
|
r14809 | IPythonPartialTracebackLexer, IPythonTracebackLexer | ||
chebee7i
|
r13629 | Supports 2.x and 3.x via keyword `python3`. The partial traceback | ||
lexer reads everything but the Python code appearing in a traceback. | ||||
The full lexer combines the partial lexer with an IPython lexer. | ||||
IPythonConsoleLexer | ||||
A lexer for IPython console sessions, with support for tracebacks. | ||||
IPyLexer | ||||
A friendly lexer which examines the first line of text and from it, | ||||
decides whether to use an IPython lexer or an IPython console lexer. | ||||
This is probably the only lexer that needs to be explicitly added | ||||
to Pygments. | ||||
David Warde-Farley
|
r8789 | |||
""" | ||||
chebee7i
|
r13650 | #----------------------------------------------------------------------------- | ||
# Copyright (c) 2013, the IPython Development Team. | ||||
# | ||||
# Distributed under the terms of the Modified BSD License. | ||||
# | ||||
# The full license is in the file COPYING.txt, distributed with this software. | ||||
#----------------------------------------------------------------------------- | ||||
chebee7i
|
r13629 | |||
# Standard library | ||||
import re | ||||
# Third party | ||||
from pygments.lexers import BashLexer, PythonLexer, Python3Lexer | ||||
from pygments.lexer import ( | ||||
Lexer, DelegatingLexer, RegexLexer, do_insertions, bygroups, using, | ||||
) | ||||
from pygments.token import ( | ||||
Comment, Generic, Keyword, Literal, Name, Operator, Other, Text, Error, | ||||
) | ||||
from pygments.util import get_bool_opt | ||||
chebee7i
|
r13634 | # Local | ||
from IPython.testing.skipdoctest import skip_doctest | ||||
chebee7i
|
r13629 | |||
# Matches one complete line of text, including its terminating newline.
line_re = re.compile('.*?\n')

# IPython-specific rules.  `build_ipy_lexer` places them ahead of the stock
# Python 'root' rules, so they are tried first.
#
# Every character a rule consumes must sit inside a capturing group:
# `bygroups` only emits the text of groups, so anything matched outside of a
# group silently disappears from the token stream.
ipython_tokens = [
    # (magic sigil)(magic name)(whitespace)(argument)(newline)
    # The whitespace is captured and emitted as Text; previously it was
    # matched by a bare `\s+` and therefore dropped from the output.
    # NOTE(review): `(\.*)` matches only literal dots (e.g. `%cd ..`).  If
    # arbitrary arguments were intended this should probably be `(.*)`, but
    # that would change which lines get Bash-lexed -- confirm before widening.
    (r'(\%+)(\w+)(\s+)(\.*)(\n)', bygroups(Operator, Keyword, Text,
                                           using(BashLexer), Text)),
    # (magic sigil)(magic name); the remainder of the line falls through to
    # the Python rules.
    (r'(\%+)(\w+)\b', bygroups(Operator, Keyword)),
    # Shell escape: (!)(command)(newline), with the command lexed as Bash.
    (r'^(!)(.+)(\n)', bygroups(Operator, using(BashLexer), Text)),
]
def build_ipy_lexer(python3):
    """Builds IPython lexers depending on the value of `python3`.

    The lexer inherits from an appropriate Python lexer and then adds
    information about IPython specific keywords (i.e. magic commands,
    shell commands, etc.)

    Parameters
    ----------
    python3 : bool
        If `True`, then build an IPython lexer from a Python 3 lexer.
        Otherwise build it from the Python 2 lexer.

    Returns
    -------
    type
        A new subclass of the selected Python lexer whose 'root' state
        starts with `ipython_tokens`.

    """
    # It would be nice to have a single IPython lexer class which takes
    # a boolean `python3`.  But since there are two Python lexer classes,
    # we will also have two IPython lexer classes.
    if python3:
        PyLexer = Python3Lexer
        clsname = 'IPython3Lexer'
        name = 'IPython3'
        aliases = ['ipython3']
        doc = """IPython3 Lexer"""
    else:
        PyLexer = PythonLexer
        clsname = 'IPythonLexer'
        name = 'IPython'
        aliases = ['ipython2', 'ipython']
        doc = """IPython Lexer"""

    # Shallow-copy the state table and rebind 'root' to a *new* list so the
    # parent lexer's own token definitions are never mutated.
    tokens = PyLexer.tokens.copy()
    tokens['root'] = ipython_tokens + tokens['root']

    attrs = {'name': name, 'aliases': aliases,
             '__doc__': doc, 'tokens': tokens}

    # Use `clsname` (not the human-readable `name`) so that the generated
    # class's __name__ matches the module-level name it is bound to below.
    return type(clsname, (PyLexer,), attrs)


IPython3Lexer = build_ipy_lexer(python3=True)
IPythonLexer = build_ipy_lexer(python3=False)
class IPythonPartialTracebackLexer(RegexLexer):
    """
    Partial lexer for IPython tracebacks.

    Handles all the non-python output. This works for both Python 2.x and 3.x.

    Lines (or parts of lines) that hold Python source are tagged with the
    `Other` token rather than being lexed here, so that a delegating lexer
    can hand them to a Python-aware lexer afterwards.

    """
    name = 'IPython Partial Traceback'

    # Rules are tried in order and the first match wins, so the more specific
    # patterns must stay ahead of the catch-all at the bottom.
    tokens = {
        'root': [
            # Tracebacks for syntax errors have a different style.
            # For both types of tracebacks, we mark the first line with
            # Generic.Traceback.  For syntax errors, we mark the filename
            # as we mark the filenames for non-syntax tracebacks.
            #
            # These two regexps define how IPythonConsoleLexer finds a
            # traceback.
            #
            ## Non-syntax traceback: optional ^C followed by a line of hyphens
            (r'^(\^C)?(-+\n)', bygroups(Error, Generic.Traceback)),
            ## Syntax traceback: ( File)(filename)(, line )(number)
            # NOTE(review): standard Python tracebacks indent "File" by two
            # spaces; confirm the single leading space here is intended.
            (r'^( File)(.*)(, line )(\d+\n)',
             bygroups(Generic.Traceback, Name.Namespace,
                      Generic.Traceback, Literal.Number.Integer)),

            # (Exception Identifier)(Whitespace)(Traceback Message)
            (r'(?u)(^[^\d\W]\w*)(\s*)(Traceback.*?\n)',
             bygroups(Name.Exception, Generic.Whitespace, Text)),
            # (Module/Filename)(Text)(Callee)(Function Signature)
            # Better options for callee and function signature?
            (r'(.*)( in )(.*)(\(.*\)\n)',
             bygroups(Name.Namespace, Text, Name.Entity, Name.Tag)),
            # Regular line: (Whitespace)(Line Number)(Python Code)
            (r'(\s*?)(\d+)(.*?\n)',
             bygroups(Generic.Whitespace, Literal.Number.Integer, Other)),
            # Emphasized line: (Arrow)(Line Number)(Python Code)
            # Using Exception token so arrow color matches the Exception.
            (r'(-*>?\s?)(\d+)(.*?\n)',
             bygroups(Name.Exception, Literal.Number.Integer, Other)),
            # (Exception Identifier)(Message)
            (r'(?u)(^[^\d\W]\w*)(:.*?\n)',
             bygroups(Name.Exception, Text)),
            # Tag everything else as Other, will be handled later.
            (r'.*\n', Other),
        ],
    }
class IPythonTracebackLexer(DelegatingLexer):
    """
    Lexer for complete IPython tracebacks.

    In doctests a traceback may be abbreviated freely, provided the line that
    marks its start is kept.  For ordinary (non-syntax-error) tracebacks that
    marker is the line of hyphens; for syntax-error tracebacks it is the line
    giving the File and line number.

    """
    # Two-pass design inherited from DelegatingLexer: the text is first run
    # through IPythonPartialTracebackLexer, and whatever that pass tags as
    # `Other` is then handed to the "root" lexer -- the IPython lexer matching
    # the boolean `python3` option.
    name = 'IPython Traceback'
    aliases = ['ipythontb']

    def __init__(self, **options):
        use_py3 = get_bool_opt(options, 'python3', False)
        self.python3 = use_py3

        # Pick the per-instance aliases and the root lexer in one place.
        if use_py3:
            self.aliases = ['ipython3tb']
            root_lexer = IPython3Lexer
        else:
            self.aliases = ['ipython2tb', 'ipythontb']
            root_lexer = IPythonLexer

        DelegatingLexer.__init__(self, root_lexer,
                                 IPythonPartialTracebackLexer, **options)
chebee7i
|
@skip_doctest
class IPythonConsoleLexer(Lexer):
    """
    An IPython console lexer for IPython code-blocks and doctests, such as:

    .. code-block:: rst

        .. code-block:: ipythonconsole

            In [1]: a = 'foo'

            In [2]: a
            Out[2]: 'foo'

            In [3]: print a
            foo

            In [4]: 1 / 0


    Support is also provided for IPython exceptions:

    .. code-block:: rst

        .. code-block:: ipythonconsole

            In [1]: raise Exception

            ---------------------------------------------------------------------------
            Exception Traceback (most recent call last)
            <ipython-input-1-fca2ab0ca76b> in <module>()
            ----> 1 raise Exception

            Exception:

    """
    name = 'IPython console session'
    aliases = ['ipythonconsole']
    mimetypes = ['text/x-ipython-console']

    # The regexps used to determine what is input and what is output.
    # The default prompts for IPython are:
    #
    #    c.PromptManager.in_template  = 'In [\#]: '
    #    c.PromptManager.in2_template = ' .\D.: '
    #    c.PromptManager.out_template = 'Out[\#]: '
    #
    # NOTE(review): the continuation prompt is normally padded so its dots
    # line up under the `In [N]: ` prompt; confirm that the single leading
    # space in `in2_regex` is what is intended.
    in1_regex = r'In \[[0-9]+\]: '
    in2_regex = r' \.\.+\.: '
    out_regex = r'Out\[[0-9]+\]: '

    #: The regex to determine when a traceback starts.
    ipytb_start = re.compile(r'^(\^C)?(-+\n)|^( File)(.*)(, line )(\d+\n)')

    @staticmethod
    def _pattern_and_flags(regex):
        """Return ``(pattern_text, flags)`` for a pattern string or a compiled regex."""
        if hasattr(regex, 'pattern'):
            return regex.pattern, regex.flags
        return regex, 0

    def __init__(self, **options):
        """Initialize the IPython console lexer.

        Parameters
        ----------
        python3 : bool
            If `True`, then the console inputs are parsed using a Python 3
            lexer. Otherwise, they are parsed using a Python 2 lexer.
        in1_regex : str or compiled regular expression
            The regular expression used to detect the start of inputs.
            Trailing whitespace belonging to the prompt may be included; a
            right-stripped variant is derived automatically.  If `None` or
            omitted, the default input prompt is assumed.
        in2_regex : str or compiled regular expression
            The regular expression used to detect the continuation of
            inputs.  Handled like `in1_regex`.  If `None` or omitted, the
            default continuation prompt is assumed.
        out_regex : str or compiled regular expression
            The regular expression used to detect outputs.  Handled like
            `in1_regex`.  If `None` or omitted, the default output prompt is
            assumed.

        """
        self.python3 = get_bool_opt(options, 'python3', False)
        if self.python3:
            self.aliases = ['ipython3console']
        else:
            self.aliases = ['ipython2console', 'ipythonconsole']

        # So that we can work with input and output prompts which have been
        # rstrip'd (possibly by editors) we also need rstrip'd variants. If
        # we do not do this, then such prompts will be tagged as 'output'.
        # The reason we can't just use the rstrip'd variants instead is that
        # we want any whitespace associated with the prompt to be inserted
        # with the token. This allows formatted code to be modified so as to
        # hide the appearance of prompts, with the whitespace included. One
        # example use of this is in copybutton.js from the standard lib
        # Python docs.
        for prompt in ('in1_regex', 'in2_regex', 'out_regex'):
            regex = options.get(prompt)
            if regex is None:
                # Falls back to the class-level default pattern string.
                regex = getattr(self, prompt)
            # Work on the pattern text so that compiled regexes are accepted
            # as well as plain strings (a compiled pattern has no rstrip()).
            pattern, flags = self._pattern_and_flags(regex)
            setattr(self, prompt, re.compile(pattern, flags))
            setattr(self, prompt + '_rstrip',
                    re.compile(pattern.rstrip() + '\n', flags))

        Lexer.__init__(self, **options)

        # The traceback lexer picks its own Python flavour from `options`.
        pylexer = IPython3Lexer if self.python3 else IPythonLexer
        self.pylexer = pylexer(**options)
        self.tblexer = IPythonTracebackLexer(**options)

        self.reset()

    def reset(self):
        """Return to the initial state: 'output' mode with nothing buffered."""
        self.mode = 'output'
        self.index = 0
        self.buffer = u''
        self.insertions = []

    def buffered_tokens(self):
        """
        Generator of unprocessed tokens after doing insertions and before
        changing to a new state.

        The buffer is lexed according to the current mode, the pending
        insertions (prompts) are merged in, and the buffer and insertions
        are then cleared.

        """
        if self.mode == 'output':
            tokens = [(0, Generic.Output, self.buffer)]
        elif self.mode == 'input':
            tokens = self.pylexer.get_tokens_unprocessed(self.buffer)
        else:  # traceback
            tokens = self.tblexer.get_tokens_unprocessed(self.buffer)

        for i, t, v in do_insertions(self.insertions, tokens):
            # All token indexes are relative to the buffer.
            yield self.index + i, t, v

        # Clear it all
        self.index += len(self.buffer)
        self.buffer = u''
        self.insertions = []

    def get_mci(self, line):
        """
        Parses the line and returns a 3-tuple: (mode, code, insertion).

        `mode` is the next mode (or state) of the lexer, and is always equal
        to 'input', 'output', or 'tb'.

        `code` is a portion of the line that should be added to the buffer
        corresponding to the next mode and eventually lexed by another lexer.
        For example, `code` could be Python code if `mode` were 'input'.

        `insertion` is a 3-tuple (index, token, text) representing an
        unprocessed "token" that will be inserted into the stream of tokens
        that are created from the buffer once we change modes. This is usually
        the input or output prompt.  It is `None` when the line carries no
        prompt.

        In general, the next mode depends on current mode and on the contents
        of `line`.

        """
        # To reduce the number of regex match checks, we have multiple
        # 'if' blocks instead of 'if-elif' blocks.
        in_tb = self.mode == 'tb'

        # Check for possible end of input
        in2_match = self.in2_regex.match(line)
        in2_match_rstrip = self.in2_regex_rstrip.match(line)
        end_input = bool(
            (in2_match and in2_match.group().rstrip() == line.rstrip())
            or in2_match_rstrip
        )
        if end_input and not in_tb:
            # Only look for an end of input when not in tb mode.
            # An ellipsis could appear within the traceback.
            return 'output', u'', (0, Generic.Prompt, line)

        # Check for output prompt
        out_match = self.out_regex.match(line) or \
            self.out_regex_rstrip.match(line)
        if out_match:
            idx = out_match.end()
            # Use the 'heading' token for output.  We cannot use Generic.Error
            # since it would conflict with exceptions.
            return 'output', line[idx:], (0, Generic.Heading, line[:idx])

        # Check for input or continuation prompt (non stripped version)
        in1_match = self.in1_regex.match(line)
        if in1_match or (in2_match and not in_tb):
            # New input or when not in tb, continued input.
            # We do not check for continued input when in tb since it is
            # allowable to replace a long stack with an ellipsis.
            idx = (in1_match or in2_match).end()
            return 'input', line[idx:], (0, Generic.Prompt, line[:idx])

        # Check for input or continuation prompt (stripped version)
        in1_match_rstrip = self.in1_regex_rstrip.match(line)
        if in1_match_rstrip or (in2_match_rstrip and not in_tb):
            # Same reasoning as above, for prompts whose trailing whitespace
            # has been stripped.
            idx = (in1_match_rstrip or in2_match_rstrip).end()
            return 'input', line[idx:], (0, Generic.Prompt, line[:idx])

        # Check for traceback
        if self.ipytb_start.match(line):
            return 'tb', line, None

        # All other stuff...
        if self.mode in ('input', 'output'):
            # We assume all other text is output. Multiline input that
            # does not use the continuation marker cannot be detected.
            # For example, the 3 in the following is clearly output:
            #
            #    In [1]: print 3
            #    3
            #
            # But the following second line is part of the input:
            #
            #    In [2]: while True:
            #        print True
            #
            # In both cases, the 2nd line will be 'output'.
            #
            mode = 'output'
        else:
            mode = 'tb'
        return mode, line, None

    def get_tokens_unprocessed(self, text):
        """
        Yield ``(index, token, value)`` tuples for `text`.

        The text is consumed one newline-terminated line at a time; lines
        are buffered while the mode is unchanged and the buffer is flushed
        whenever the mode changes, plus once at the end.

        """
        self.reset()
        for match in line_re.finditer(text):
            line = match.group()
            mode, code, insertion = self.get_mci(line)

            if mode != self.mode:
                # Yield buffered tokens before transitioning to new mode.
                for token in self.buffered_tokens():
                    yield token
                self.mode = mode

            if insertion:
                self.insertions.append((len(self.buffer), [insertion]))
            self.buffer += code

        # Flush whatever is still buffered for the final mode.
        for token in self.buffered_tokens():
            yield token
class IPyLexer(Lexer):
    r"""
    Primary lexer for all IPython-like code.

    This is a simple helper lexer.  If an input prompt of the form
    "In \[[0-9]+\]:" appears anywhere in the text, then the entire text is
    parsed with an IPython console lexer.  If not, then the entire text is
    parsed with an IPython lexer.

    The goal is to reduce the number of lexers that are registered
    with Pygments.

    """
    name = 'IPy session'
    aliases = ['ipy']

    def __init__(self, **options):
        """Create the helper lexer.

        Parameters
        ----------
        python3 : bool
            If `True`, plain code and console input are lexed as Python 3;
            otherwise as Python 2.  All options are forwarded unchanged to
            the wrapped lexers.

        """
        self.python3 = get_bool_opt(options, 'python3', False)
        if self.python3:
            self.aliases = ['ipy3']
            code_lexer = IPython3Lexer
        else:
            self.aliases = ['ipy2', 'ipy']
            code_lexer = IPythonLexer

        Lexer.__init__(self, **options)

        # Honour `python3` for plain code too; the console lexer reads the
        # same option from `options` itself.
        self.IPythonLexer = code_lexer(**options)
        self.IPythonConsoleLexer = IPythonConsoleLexer(**options)

    def get_tokens_unprocessed(self, text):
        """Yield the tokens of `text` from whichever wrapped lexer applies."""
        # Search for the input prompt anywhere...this allows code blocks to
        # begin with comments as well.
        if re.search(r'In \[[0-9]+\]:', text):
            lex = self.IPythonConsoleLexer
        else:
            lex = self.IPythonLexer
        for token in lex.get_tokens_unprocessed(text):
            yield token