From e35dc3b48e5ec2a7e08d19cd0f62529a106bffde 2013-11-16 03:27:59
From: chebee7i
Date: 2013-11-16 03:27:59
Subject: [PATCH] Update IPython Pygments lexers.

---
diff --git a/IPython/nbconvert/utils/lexers.py b/IPython/nbconvert/utils/lexers.py
index 9a9092e..065b1d3 100644
--- a/IPython/nbconvert/utils/lexers.py
+++ b/IPython/nbconvert/utils/lexers.py
@@ -1,46 +1,473 @@
-"""A custom pygments lexer for IPython code cells.
+# -*- coding: utf-8 -*-
+"""
+Defines a variety of Pygments lexers for highlighting IPython code.
+
+This includes:
+
+    IPythonLexer
+    IPython3Lexer
+        Lexers for pure IPython (python + magic/shell commands)
+
+    IPythonPartialTracebackLexer
+    IPythonTracebackLexer
+        Supports 2.x and 3.x via the keyword `python3`. The partial traceback
+        lexer reads everything but the Python code appearing in a traceback.
+        The full lexer combines the partial lexer with an IPython lexer.
+
+    IPythonConsoleLexer
+        A lexer for IPython console sessions, with support for tracebacks.
+
+    IPyLexer
+        A friendly lexer which examines the first line of text and, from it,
+        decides whether to use an IPython lexer or an IPython console lexer.
+        This is probably the only lexer that needs to be explicitly added
+        to Pygments.
 
-Informs The pygments highlighting library of the quirks of IPython's superset
-of Python -- magic commands, !shell commands, etc.
 """
-#-----------------------------------------------------------------------------
-# Copyright (c) 2013, the IPython Development Team.
-#
-# Distributed under the terms of the Modified BSD License.
-#
-# The full license is in the file COPYING.txt, distributed with this software.
-#-----------------------------------------------------------------------------
-
-#-----------------------------------------------------------------------------
-# Imports
-#-----------------------------------------------------------------------------
-
-# Third-party imports
-from pygments.lexers import PythonLexer, BashLexer
-from pygments.lexer import bygroups, using
-from pygments.token import Keyword, Operator, Text
-
-#-----------------------------------------------------------------------------
-# Class declarations
-#-----------------------------------------------------------------------------
-
-class IPythonLexer(PythonLexer):
-    """
-    Pygments Lexer for use with IPython code. Inherits from
-    PythonLexer and adds information about IPython specific
-    keywords (i.e. magic commands, shell commands, etc.)
- """ - - #Basic properties - name = 'IPython' - aliases = ['ip', 'ipython'] - filenames = ['*.ipy'] - - #Highlighting information - tokens = PythonLexer.tokens.copy() - tokens['root'] = [ - (r'(\%+)(\w+)\s+(\.*)(\n)', bygroups(Operator, Keyword, - using(BashLexer), Text)), - (r'(\%+)(\w+)\b', bygroups(Operator, Keyword)), - (r'^(!)(.+)(\n)', bygroups(Operator, using(BashLexer), Text)), - ] + tokens['root'] + +# Standard library +import re + +# Third party +from pygments.lexers import BashLexer, PythonLexer, Python3Lexer +from pygments.lexer import ( + Lexer, DelegatingLexer, RegexLexer, do_insertions, bygroups, using, +) +from pygments.token import ( + Comment, Generic, Keyword, Literal, Name, Operator, Other, Text, Error, +) +from pygments.util import get_bool_opt + + + +line_re = re.compile('.*?\n') + +ipython_tokens = [ + (r'(\%+)(\w+)\s+(\.*)(\n)', bygroups(Operator, Keyword, + using(BashLexer), Text)), + (r'(\%+)(\w+)\b', bygroups(Operator, Keyword)), + (r'^(!)(.+)(\n)', bygroups(Operator, using(BashLexer), Text)), +] + +def build_ipy_lexer(python3): + """Builds IPython lexers depending on the value of `python3`. + + The lexer inherits from an appropriate Python lexer and then adds + information about IPython specific keywords (i.e. magic commands, + shell commands, etc.) + + Parameters + ---------- + python3 : bool + If `True`, then build an IPython lexer from a Python 3 lexer. + + """ + # It would be nice to have a single IPython lexer class which takes + # a boolean `python3`. But since there are two Python lexer classes, + # we will also have two IPython lexer classes. + if python3: + PyLexer = Python3Lexer + clsname = 'IPython3Lexer' + name = 'IPython3' + aliases = ['ipython3'] + doc = """IPython3 Lexer""" + else: + PyLexer = PythonLexer + clsname = 'IPythonLexer' + name = 'IPython' + aliases = ['ipython'] + doc = """IPython Lexer""" + + tokens = PyLexer.tokens.copy() + tokens['root'] = ipython_tokens + tokens['root'] + + attrs = {'name': name, 'aliases': aliases, + '__doc__': doc, 'tokens': tokens} + + return type(name, (PyLexer,), attrs) + + +IPython3Lexer = build_ipy_lexer(python3=True) +IPythonLexer = build_ipy_lexer(python3=False) + + +class IPythonPartialTracebackLexer(RegexLexer): + """ + Partial lexer for IPython tracebacks. + + Handles all the non-python output. This works for both Python 2.x and 3.x. + + """ + name = 'IPython Partial Traceback' + + tokens = { + 'root': [ + # Tracebacks for syntax errors have a different style. + # For both types of tracebacks, we mark the first line with + # Generic.Traceback. For syntax errors, we mark the filename + # as we mark the filenames for non-syntax tracebacks. + # + # These two regexps define how IPythonConsoleLexer finds a + # traceback. + # + ## Non-syntax traceback + (r'^(\^C)?(-+\n)', bygroups(Error, Generic.Traceback)), + ## Syntax traceback + (r'^( File)(.*)(, line )(\d+\n)', + bygroups(Generic.Traceback, Name.Namespace, + Generic.Traceback, Literal.Number.Integer)), + + # (Exception Identifier)(Whitespace)(Traceback Message) + (r'(?u)(^[^\d\W]\w*)(\s*)(Traceback.*?\n)', + bygroups(Name.Exception, Generic.Whitespace, Text)), + # (Module/Filename)(Text)(Callee)(Function Signature) + # Better options for callee and function signature? 
+            (r'(.*)( in )(.*)(\(.*\)\n)',
+             bygroups(Name.Namespace, Text, Name.Entity, Name.Tag)),
+            # Regular line: (Whitespace)(Line Number)(Python Code)
+            (r'(\s*?)(\d+)(.*?\n)',
+             bygroups(Generic.Whitespace, Literal.Number.Integer, Other)),
+            # Emphasized line: (Arrow)(Line Number)(Python Code)
+            # Using Exception token so arrow color matches the Exception.
+            (r'(-*>?\s?)(\d+)(.*?\n)',
+             bygroups(Name.Exception, Literal.Number.Integer, Other)),
+            # (Exception Identifier)(Message)
+            (r'(?u)(^[^\d\W]\w*)(:.*?\n)',
+             bygroups(Name.Exception, Text)),
+            # Tag everything else as Other; it will be handled later.
+            (r'.*\n', Other),
+        ],
+    }
+
+
+class IPythonTracebackLexer(DelegatingLexer):
+    """
+    IPython traceback lexer.
+
+    For doctests, the tracebacks can be snipped as much as desired, with the
+    exception of the lines that designate a traceback. For non-syntax error
+    tracebacks, this is the line of hyphens. For syntax error tracebacks,
+    this is the line which lists the File and line number.
+
+    """
+    # The lexer inherits from DelegatingLexer. The "root" lexer is an
+    # appropriate IPython lexer, which depends on the value of the boolean
+    # `python3`. First, we parse with the partial IPython traceback lexer.
+    # Then, any code marked with the "Other" token is delegated to the root
+    # lexer.
+    #
+    name = 'IPython Traceback'
+    aliases = ['ipythontb']
+
+    def __init__(self, **options):
+        self.python3 = get_bool_opt(options, 'python3', False)
+
+        if self.python3:
+            IPyLexer = IPython3Lexer
+        else:
+            IPyLexer = IPythonLexer
+
+        DelegatingLexer.__init__(self, IPyLexer,
+                                 IPythonPartialTracebackLexer, **options)
+
+
+class IPythonConsoleLexer(Lexer):
+    """
+    An IPython console lexer for IPython code-blocks and doctests, such as:
+
+    .. sourcecode:: ipythoncon
+
+        In [1]: a = 'foo'
+
+        In [2]: a
+        Out[2]: 'foo'
+
+        In [3]: print a
+        foo
+
+        In [4]: 1 / 0
+
+    Support is also provided for IPython exceptions:
+
+    .. sourcecode:: ipythoncon
+
+        In [1]: raise Exception
+        ---------------------------------------------------------------------------
+        Exception                                 Traceback (most recent call last)
+        <ipython-input-1-fca2ab0ca76b> in <module>()
+        ----> 1 raise Exception
+
+        Exception:
+
+    """
+    name = 'IPython console session'
+    aliases = ['ipythoncon']
+    mimetypes = ['text/x-ipython-console']
+
+    # The regexps used to determine what is input and what is output. The
+    # input regexes should be consistent with the values of the `in_template`
+    # and `in2_template` prompts. For example, the default prompts are:
+    #
+    #    c.PromptManager.in_template = 'In [\#]: '
+    #    c.PromptManager.in2_template = '   .\D.: '
+    #    c.PromptManager.out_template = 'Out[\#]: '
+    #
+    # Note, we do not include the trailing whitespace in the regex since
+    # we want to allow blank prompts (and editors often remove trailing
+    # whitespace).
+    #
+    in1_regex = r'In \[[0-9]+\]: '
+    in2_regex = r'   \.\.+\.: '
+    out_regex = r'Out\[[0-9]+\]: '
+
+    #: The regex to determine when a traceback starts.
+    ipytb_start = re.compile(r'^(\^C)?(-+\n)|^(  File)(.*)(, line )(\d+\n)')
+
+    def __init__(self, **options):
+        """Initialize the IPython console lexer.
+
+        Parameters
+        ----------
+        python3 : bool
+            If `True`, then the console inputs are parsed using a Python 3
+            lexer. Otherwise, they are parsed using a Python 2 lexer.
+        in1_regex : RegexObject
+            The compiled regular expression used to detect the start
+            of inputs. Although the IPython configuration setting may have a
+            trailing whitespace, do not include it in the regex. If `None`,
+            then the default input prompt is assumed.
+        in2_regex : RegexObject
+            The compiled regular expression used to detect the continuation
+            of inputs. Although the IPython configuration setting may have a
+            trailing whitespace, do not include it in the regex. If `None`,
+            then the default input prompt is assumed.
+        out_regex : RegexObject
+            The compiled regular expression used to detect outputs. If `None`,
+            then the default output prompt is assumed.
+
+        """
+        self.python3 = get_bool_opt(options, 'python3', False)
+
+        in1_regex = options.get('in1_regex', self.in1_regex)
+        in2_regex = options.get('in2_regex', self.in2_regex)
+        out_regex = options.get('out_regex', self.out_regex)
+
+        # So that we can work with input and output prompts which have been
+        # rstrip'd (possibly by editors), we also need rstrip'd variants. If
+        # we do not do this, then such prompts will be tagged as 'output'.
+        # The reason we can't just use the rstrip'd variants instead is that
+        # we want any whitespace associated with the prompt to be inserted
+        # with the token. This allows formatted code to be modified so as to
+        # hide the appearance of prompts. For example, see copybutton.js.
+        in1_regex_rstrip = in1_regex.rstrip() + '\n'
+        in2_regex_rstrip = in2_regex.rstrip() + '\n'
+        out_regex_rstrip = out_regex.rstrip() + '\n'
+
+        # Compile and save them all.
+        attrs = ['in1_regex', 'in2_regex', 'out_regex',
+                 'in1_regex_rstrip', 'in2_regex_rstrip', 'out_regex_rstrip']
+        for attr in attrs:
+            setattr(self, attr, re.compile(locals()[attr]))
+
+        Lexer.__init__(self, **options)
+
+        if self.python3:
+            pylexer = IPython3Lexer
+        else:
+            pylexer = IPythonLexer
+        # A single traceback lexer serves both versions; it picks up the
+        # `python3` option from `options` on its own.
+        tblexer = IPythonTracebackLexer
+
+        self.pylexer = pylexer(**options)
+        self.tblexer = tblexer(**options)
+
+        self.reset()
+
+    def reset(self):
+        self.mode = 'output'
+        self.index = 0
+        self.buffer = u''
+        self.insertions = []
+
+    def buffered_tokens(self):
+        """
+        Generator of unprocessed tokens after doing insertions and before
+        changing to a new state.
+
+        """
+        if self.mode == 'output':
+            tokens = [(0, Generic.Output, self.buffer)]
+        elif self.mode == 'input':
+            tokens = self.pylexer.get_tokens_unprocessed(self.buffer)
+        else: # traceback
+            tokens = self.tblexer.get_tokens_unprocessed(self.buffer)
+
+        for i, t, v in do_insertions(self.insertions, tokens):
+            # All token indexes are relative to the buffer.
+            yield self.index + i, t, v
+
+        # Clear it all
+        self.index += len(self.buffer)
+        self.buffer = u''
+        self.insertions = []
+
+    def get_modecode(self, line):
+        """
+        Returns the next mode and code to be added to the next mode's buffer.
+
+        The next mode depends on the current mode and the contents of the
+        line.
+
+        """
+        # To reduce the number of regex match checks, we have multiple
+        # 'if' blocks instead of 'if-elif' blocks.
+
+        ### Check for possible end of input
+        ###
+        in2_match = self.in2_regex.match(line)
+        in2_match_rstrip = self.in2_regex_rstrip.match(line)
+        if (in2_match and in2_match.group().rstrip() == line.rstrip()) or \
+           in2_match_rstrip:
+            end_input = True
+        else:
+            end_input = False
+        if end_input and self.mode != 'tb':
+            # Only look for an end of input when not in tb mode.
+            # An ellipsis could appear within the traceback.
+            mode = 'output'
+            code = u''
+            insertion = (0, Generic.Prompt, line)
+            return mode, code, insertion
+
+        ### Check for output prompt
+        ###
+        out_match = self.out_regex.match(line)
+        out_match_rstrip = self.out_regex_rstrip.match(line)
+        if out_match or out_match_rstrip:
+            mode = 'output'
+            if out_match:
+                idx = out_match.end()
+            else:
+                idx = out_match_rstrip.end()
+            code = line[idx:]
+            # Use the 'heading' token for output. We cannot use Generic.Error
+            # since it would conflict with exceptions.
+            insertion = (0, Generic.Heading, line[:idx])
+            return mode, code, insertion
+
+        ### Check for input or continuation prompt (non-stripped version)
+        ###
+        in1_match = self.in1_regex.match(line)
+        if in1_match or (in2_match and self.mode != 'tb'):
+            # New input, or (when not in tb mode) continued input.
+            # We do not check for continued input when in tb since it is
+            # allowable to replace a long stack with an ellipsis.
+            mode = 'input'
+            if in1_match:
+                idx = in1_match.end()
+            else: # in2_match
+                idx = in2_match.end()
+            code = line[idx:]
+            insertion = (0, Generic.Prompt, line[:idx])
+            return mode, code, insertion
+
+        ### Check for input or continuation prompt (stripped version)
+        ###
+        in1_match_rstrip = self.in1_regex_rstrip.match(line)
+        if in1_match_rstrip or (in2_match_rstrip and self.mode != 'tb'):
+            # New input, or (when not in tb mode) continued input.
+            # We do not check for continued input when in tb since it is
+            # allowable to replace a long stack with an ellipsis.
+            mode = 'input'
+            if in1_match_rstrip:
+                idx = in1_match_rstrip.end()
+            else: # in2_match_rstrip
+                idx = in2_match_rstrip.end()
+            code = line[idx:]
+            insertion = (0, Generic.Prompt, line[:idx])
+            return mode, code, insertion
+
+        ### Check for traceback
+        ###
+        if self.ipytb_start.match(line):
+            mode = 'tb'
+            code = line
+            insertion = None
+            return mode, code, insertion
+
+        ### All other stuff...
+        ###
+        if self.mode in ('input', 'output'):
+            # We assume all other text is output. Multiline input that
+            # does not use the continuation marker cannot be detected.
+            # For example, the 3 in the following is clearly output:
+            #
+            #    In [1]: print 3
+            #    3
+            #
+            # But the following second line is part of the input:
+            #
+            #    In [2]: while True:
+            #        print True
+            #
+            # In both cases, the 2nd line will be tagged as 'output'.
+            #
+            mode = 'output'
+        else:
+            mode = 'tb'
+
+        code = line
+        insertion = None
+
+        return mode, code, insertion
+
+    def get_tokens_unprocessed(self, text):
+        self.reset()
+        for match in line_re.finditer(text):
+            line = match.group()
+            mode, code, insertion = self.get_modecode(line)
+
+            if mode != self.mode:
+                # Yield buffered tokens before transitioning to new mode.
+                for token in self.buffered_tokens():
+                    yield token
+                self.mode = mode
+
+            if insertion:
+                self.insertions.append((len(self.buffer), [insertion]))
+            self.buffer += code
+        else:
+            # The for-else runs once the loop is exhausted: flush whatever
+            # is still sitting in the buffer.
+            for token in self.buffered_tokens():
+                yield token
+
+
+class IPyLexer(Lexer):
+    """
+    Primary lexer for all IPython-like code.
+
+    This is a simple helper lexer. If the first line of the text begins with
+    "In \[[0-9]+\]:", then the entire text is parsed with an IPython console
+    lexer. If not, then the entire text is parsed with an IPython lexer.
+
+    The goal is to reduce the number of lexers that are registered
+    with Pygments.
+
+    """
+    name = 'IPy session'
+    aliases = ['ipy']
+
+    def __init__(self, **options):
+        self.python3 = get_bool_opt(options, 'python3', False)
+        Lexer.__init__(self, **options)
+
+        self.IPythonLexer = IPythonLexer(**options)
+        self.IPythonConsoleLexer = IPythonConsoleLexer(**options)
+
+    def get_tokens_unprocessed(self, text):
+        # Dispatch on the first line: console sessions start with an input
+        # prompt; everything else is treated as plain IPython source.
+        if re.match(r'(In \[[0-9]+\]:)', text.strip()):
+            lex = self.IPythonConsoleLexer
+        else:
+            lex = self.IPythonLexer
+        for token in lex.get_tokens_unprocessed(text):
+            yield token
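
For anyone applying this patch, a minimal smoke test along the following lines may help confirm the first-line dispatch described in the IPyLexer docstring. This is only a sketch: it assumes Pygments is installed and the patched module is importable; the sample texts and variable names are illustrative, not part of the patch.

# Minimal smoke test for the new lexers (not part of the patch).
from pygments import highlight
from pygments.formatters import HtmlFormatter

from IPython.nbconvert.utils.lexers import IPyLexer

# First line carries an input prompt, so IPyLexer should delegate to the
# console lexer; prompts, input, and output each get their own tokens.
console_text = (
    "In [1]: a = 'foo'\n"
    "\n"
    "In [2]: a\n"
    "Out[2]: 'foo'\n"
)

# No prompt on the first line, so the plain IPython lexer handles the
# magic and shell syntax instead.
script_text = "%timeit x = 1\n!ls\n"

lexer = IPyLexer(python3=False)  # python3=True selects the Python 3 lexers
for text in (console_text, script_text):
    print(highlight(text, lexer, HtmlFormatter()))

Note that instantiating the lexers directly sidesteps registration; making the `ipy`, `ipythoncon`, and `ipythontb` aliases available through `pygments.lexers.get_lexer_by_name` still requires explicitly adding the lexers to Pygments, as the module docstring points out.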