From e35dc3b48e5ec2a7e08d19cd0f62529a106bffde 2013-11-16 03:27:59
From: chebee7i
Date: 2013-11-16 03:27:59
Subject: [PATCH] Update IPython Pygments lexers.

---
diff --git a/IPython/nbconvert/utils/lexers.py b/IPython/nbconvert/utils/lexers.py
index 9a9092e..065b1d3 100644
--- a/IPython/nbconvert/utils/lexers.py
+++ b/IPython/nbconvert/utils/lexers.py
@@ -1,46 +1,473 @@
-"""A custom pygments lexer for IPython code cells.
+# -*- coding: utf-8 -*-
+"""
+Defines a variety of Pygments lexers for highlighting IPython code.
+
+This includes:
+
+    IPythonLexer
+    IPython3Lexer
+        Lexers for pure IPython (python + magic/shell commands)
+
+    IPythonPartialTracebackLexer
+    IPythonTracebackLexer
+        Supports 2.x and 3.x via the keyword `python3`. The partial traceback
+        lexer reads everything but the Python code appearing in a traceback.
+        The full lexer combines the partial lexer with an IPython lexer.
+
+    IPythonConsoleLexer
+        A lexer for IPython console sessions, with support for tracebacks.
+
+    IPyLexer
+        A friendly lexer which examines the first line of text and, from it,
+        decides whether to use an IPython lexer or an IPython console lexer.
+        This is probably the only lexer that needs to be explicitly added
+        to Pygments.
 
-Informs The pygments highlighting library of the quirks of IPython's superset
-of Python -- magic commands, !shell commands, etc.
 """
-#-----------------------------------------------------------------------------
-# Copyright (c) 2013, the IPython Development Team.
-#
-# Distributed under the terms of the Modified BSD License.
-#
-# The full license is in the file COPYING.txt, distributed with this software.
-#-----------------------------------------------------------------------------
-
-#-----------------------------------------------------------------------------
-# Imports
-#-----------------------------------------------------------------------------
-
-# Third-party imports
-from pygments.lexers import PythonLexer, BashLexer
-from pygments.lexer import bygroups, using
-from pygments.token import Keyword, Operator, Text
-
-#-----------------------------------------------------------------------------
-# Class declarations
-#-----------------------------------------------------------------------------
-
-class IPythonLexer(PythonLexer):
-    """
-    Pygments Lexer for use with IPython code. Inherits from
-    PythonLexer and adds information about IPython specific
-    keywords (i.e. magic commands, shell commands, etc.)
- """ - - #Basic properties - name = 'IPython' - aliases = ['ip', 'ipython'] - filenames = ['*.ipy'] - - #Highlighting information - tokens = PythonLexer.tokens.copy() - tokens['root'] = [ - (r'(\%+)(\w+)\s+(\.*)(\n)', bygroups(Operator, Keyword, - using(BashLexer), Text)), - (r'(\%+)(\w+)\b', bygroups(Operator, Keyword)), - (r'^(!)(.+)(\n)', bygroups(Operator, using(BashLexer), Text)), - ] + tokens['root'] + +# Standard library +import re + +# Third party +from pygments.lexers import BashLexer, PythonLexer, Python3Lexer +from pygments.lexer import ( + Lexer, DelegatingLexer, RegexLexer, do_insertions, bygroups, using, +) +from pygments.token import ( + Comment, Generic, Keyword, Literal, Name, Operator, Other, Text, Error, +) +from pygments.util import get_bool_opt + + + +line_re = re.compile('.*?\n') + +ipython_tokens = [ + (r'(\%+)(\w+)\s+(\.*)(\n)', bygroups(Operator, Keyword, + using(BashLexer), Text)), + (r'(\%+)(\w+)\b', bygroups(Operator, Keyword)), + (r'^(!)(.+)(\n)', bygroups(Operator, using(BashLexer), Text)), +] + +def build_ipy_lexer(python3): + """Builds IPython lexers depending on the value of `python3`. + + The lexer inherits from an appropriate Python lexer and then adds + information about IPython specific keywords (i.e. magic commands, + shell commands, etc.) + + Parameters + ---------- + python3 : bool + If `True`, then build an IPython lexer from a Python 3 lexer. + + """ + # It would be nice to have a single IPython lexer class which takes + # a boolean `python3`. But since there are two Python lexer classes, + # we will also have two IPython lexer classes. + if python3: + PyLexer = Python3Lexer + clsname = 'IPython3Lexer' + name = 'IPython3' + aliases = ['ipython3'] + doc = """IPython3 Lexer""" + else: + PyLexer = PythonLexer + clsname = 'IPythonLexer' + name = 'IPython' + aliases = ['ipython'] + doc = """IPython Lexer""" + + tokens = PyLexer.tokens.copy() + tokens['root'] = ipython_tokens + tokens['root'] + + attrs = {'name': name, 'aliases': aliases, + '__doc__': doc, 'tokens': tokens} + + return type(name, (PyLexer,), attrs) + + +IPython3Lexer = build_ipy_lexer(python3=True) +IPythonLexer = build_ipy_lexer(python3=False) + + +class IPythonPartialTracebackLexer(RegexLexer): + """ + Partial lexer for IPython tracebacks. + + Handles all the non-python output. This works for both Python 2.x and 3.x. + + """ + name = 'IPython Partial Traceback' + + tokens = { + 'root': [ + # Tracebacks for syntax errors have a different style. + # For both types of tracebacks, we mark the first line with + # Generic.Traceback. For syntax errors, we mark the filename + # as we mark the filenames for non-syntax tracebacks. + # + # These two regexps define how IPythonConsoleLexer finds a + # traceback. + # + ## Non-syntax traceback + (r'^(\^C)?(-+\n)', bygroups(Error, Generic.Traceback)), + ## Syntax traceback + (r'^( File)(.*)(, line )(\d+\n)', + bygroups(Generic.Traceback, Name.Namespace, + Generic.Traceback, Literal.Number.Integer)), + + # (Exception Identifier)(Whitespace)(Traceback Message) + (r'(?u)(^[^\d\W]\w*)(\s*)(Traceback.*?\n)', + bygroups(Name.Exception, Generic.Whitespace, Text)), + # (Module/Filename)(Text)(Callee)(Function Signature) + # Better options for callee and function signature? 
+            (r'(.*)( in )(.*)(\(.*\)\n)',
+             bygroups(Name.Namespace, Text, Name.Entity, Name.Tag)),
+            # Regular line: (Whitespace)(Line Number)(Python Code)
+            (r'(\s*?)(\d+)(.*?\n)',
+             bygroups(Generic.Whitespace, Literal.Number.Integer, Other)),
+            # Emphasized line: (Arrow)(Line Number)(Python Code)
+            # Using Exception token so arrow color matches the Exception.
+            (r'(-*>?\s?)(\d+)(.*?\n)',
+             bygroups(Name.Exception, Literal.Number.Integer, Other)),
+            # (Exception Identifier)(Message)
+            (r'(?u)(^[^\d\W]\w*)(:.*?\n)',
+             bygroups(Name.Exception, Text)),
+            # Tag everything else as Other; it will be handled later.
+            (r'.*\n', Other),
+        ],
+    }
+
+
+class IPythonTracebackLexer(DelegatingLexer):
+    """
+    IPython traceback lexer.
+
+    For doctests, the tracebacks can be snipped as much as desired, with the
+    exception of the lines that designate a traceback. For non-syntax error
+    tracebacks, this is the line of hyphens. For syntax error tracebacks,
+    this is the line which lists the File and line number.
+
+    """
+    # The lexer inherits from DelegatingLexer. The "root" lexer is an
+    # appropriate IPython lexer, which depends on the value of the boolean
+    # `python3`. First, we parse with the partial IPython traceback lexer.
+    # Then, any code marked with the "Other" token is delegated to the root
+    # lexer.
+    #
+    name = 'IPython Traceback'
+    aliases = ['ipythontb']
+
+    def __init__(self, **options):
+        self.python3 = get_bool_opt(options, 'python3', False)
+
+        if self.python3:
+            IPyLexer = IPython3Lexer
+        else:
+            IPyLexer = IPythonLexer
+
+        DelegatingLexer.__init__(self, IPyLexer,
+                                 IPythonPartialTracebackLexer, **options)
+
+
+class IPythonConsoleLexer(Lexer):
+    """
+    An IPython console lexer for IPython code-blocks and doctests, such as:
+
+    .. sourcecode:: ipythoncon
+
+        In [1]: a = 'foo'
+
+        In [2]: a
+        Out[2]: 'foo'
+
+        In [3]: print a
+        foo
+
+        In [4]: 1 / 0
+
+    Support is also provided for IPython exceptions:
+
+    .. sourcecode:: ipythoncon
+
+        In [1]: raise Exception
+        ---------------------------------------------------------------------------
+        Exception                                 Traceback (most recent call last)
+        <ipython-input-1-fca2ab0ca76b> in <module>()
+        ----> 1 raise Exception
+
+        Exception:
+
+    """
+    name = 'IPython console session'
+    aliases = ['ipythoncon']
+    mimetypes = ['text/x-ipython-console']
+
+    # The regexps used to determine what is input and what is output. The
+    # input regexes should be consistent with the values of the `in_template`
+    # and `in2_template` prompts. For example, the default prompts are:
+    #
+    #    c.PromptManager.in_template = 'In [\#]: '
+    #    c.PromptManager.in2_template = '   .\D.: '
+    #    c.PromptManager.out_template = 'Out[\#]: '
+    #
+    # Note, we do not include the trailing whitespace in the regex since
+    # we want to allow blank prompts (and editors often remove trailing
+    # whitespace).
+    #
+    in1_regex = r'In \[[0-9]+\]: '
+    in2_regex = r'   \.\.+\.: '
+    out_regex = r'Out\[[0-9]+\]: '
+
+    #: The regex to determine when a traceback starts.
+    ipytb_start = re.compile(r'^(\^C)?(-+\n)|^(  File)(.*)(, line )(\d+\n)')
+
+    def __init__(self, **options):
+        """Initialize the IPython console lexer.
+
+        Parameters
+        ----------
+        python3 : bool
+            If `True`, then the console inputs are parsed using a Python 3
+            lexer. Otherwise, they are parsed using a Python 2 lexer.
+        in1_regex : RegexObject
+            The compiled regular expression used to detect the start
+            of inputs. Although the IPython configuration setting may have a
+            trailing whitespace, do not include it in the regex. If `None`,
+            then the default input prompt is assumed.
+        in2_regex : RegexObject
+            The compiled regular expression used to detect the continuation
+            of inputs. Although the IPython configuration setting may have a
+            trailing whitespace, do not include it in the regex. If `None`,
+            then the default input prompt is assumed.
+        out_regex : RegexObject
+            The compiled regular expression used to detect outputs. If `None`,
+            then the default output prompt is assumed.
+
+        """
+        self.python3 = get_bool_opt(options, 'python3', False)
+
+        in1_regex = options.get('in1_regex', self.in1_regex)
+        in2_regex = options.get('in2_regex', self.in2_regex)
+        out_regex = options.get('out_regex', self.out_regex)
+
+        # So that we can work with input and output prompts which have been
+        # rstrip'd (possibly by editors), we also need rstrip'd variants. If
+        # we do not do this, then such prompts will be tagged as 'output'.
+        # The reason we can't just use the rstrip'd variants instead is that
+        # we want any whitespace associated with the prompt to be inserted
+        # with the token. This allows formatted code to be modified so as to
+        # hide the appearance of prompts. For example, see copybutton.js.
+        in1_regex_rstrip = in1_regex.rstrip() + '\n'
+        in2_regex_rstrip = in2_regex.rstrip() + '\n'
+        out_regex_rstrip = out_regex.rstrip() + '\n'
+
+        # Compile and save them all.
+        attrs = ['in1_regex', 'in2_regex', 'out_regex',
+                 'in1_regex_rstrip', 'in2_regex_rstrip', 'out_regex_rstrip']
+        for attr in attrs:
+            setattr(self, attr, re.compile(locals()[attr]))
+
+        Lexer.__init__(self, **options)
+
+        if self.python3:
+            pylexer = IPython3Lexer
+        else:
+            pylexer = IPythonLexer
+        # A single traceback lexer serves both versions; it picks up the
+        # `python3` option from `options` on its own.
+        tblexer = IPythonTracebackLexer
+
+        self.pylexer = pylexer(**options)
+        self.tblexer = tblexer(**options)
+
+        self.reset()
+
+    def reset(self):
+        self.mode = 'output'
+        self.index = 0
+        self.buffer = u''
+        self.insertions = []
+
+    def buffered_tokens(self):
+        """
+        Generator of unprocessed tokens after doing insertions and before
+        changing to a new state.
+
+        """
+        if self.mode == 'output':
+            tokens = [(0, Generic.Output, self.buffer)]
+        elif self.mode == 'input':
+            tokens = self.pylexer.get_tokens_unprocessed(self.buffer)
+        else: # traceback
+            tokens = self.tblexer.get_tokens_unprocessed(self.buffer)
+
+        for i, t, v in do_insertions(self.insertions, tokens):
+            # All token indexes are relative to the buffer.
+            yield self.index + i, t, v
+
+        # Clear it all
+        self.index += len(self.buffer)
+        self.buffer = u''
+        self.insertions = []
+
+    def get_modecode(self, line):
+        """
+        Returns the next mode and code to be added to the next mode's buffer.
+
+        The next mode depends on the current mode and the contents of the
+        line.
+
+        """
+        # To reduce the number of regex match checks, we have multiple
+        # 'if' blocks instead of 'if-elif' blocks.
+
+        ### Check for possible end of input
+        ###
+        in2_match = self.in2_regex.match(line)
+        in2_match_rstrip = self.in2_regex_rstrip.match(line)
+        if (in2_match and in2_match.group().rstrip() == line.rstrip()) or \
+           in2_match_rstrip:
+            end_input = True
+        else:
+            end_input = False
+        if end_input and self.mode != 'tb':
+            # Only look for an end of input when not in tb mode.
+            # An ellipsis could appear within the traceback.
+            mode = 'output'
+            code = u''
+            insertion = (0, Generic.Prompt, line)
+            return mode, code, insertion
+
+        ### Check for output prompt
+        ###
+        out_match = self.out_regex.match(line)
+        out_match_rstrip = self.out_regex_rstrip.match(line)
+        if out_match or out_match_rstrip:
+            mode = 'output'
+            if out_match:
+                idx = out_match.end()
+            else:
+                idx = out_match_rstrip.end()
+            code = line[idx:]
+            # Use the 'heading' token for output. We cannot use Generic.Error
+            # since it would conflict with exceptions.
+            insertion = (0, Generic.Heading, line[:idx])
+            return mode, code, insertion
+
+        ### Check for input or continuation prompt (non-stripped version)
+        ###
+        in1_match = self.in1_regex.match(line)
+        if in1_match or (in2_match and self.mode != 'tb'):
+            # New input, or (when not in tb mode) continued input.
+            # We do not check for continued input when in tb since it is
+            # allowable to replace a long stack with an ellipsis.
+            mode = 'input'
+            if in1_match:
+                idx = in1_match.end()
+            else: # in2_match
+                idx = in2_match.end()
+            code = line[idx:]
+            insertion = (0, Generic.Prompt, line[:idx])
+            return mode, code, insertion
+
+        ### Check for input or continuation prompt (stripped version)
+        ###
+        in1_match_rstrip = self.in1_regex_rstrip.match(line)
+        if in1_match_rstrip or (in2_match_rstrip and self.mode != 'tb'):
+            # New input, or (when not in tb mode) continued input.
+            # We do not check for continued input when in tb since it is
+            # allowable to replace a long stack with an ellipsis.
+            mode = 'input'
+            if in1_match_rstrip:
+                idx = in1_match_rstrip.end()
+            else: # in2_match_rstrip
+                idx = in2_match_rstrip.end()
+            code = line[idx:]
+            insertion = (0, Generic.Prompt, line[:idx])
+            return mode, code, insertion
+
+        ### Check for traceback
+        ###
+        if self.ipytb_start.match(line):
+            mode = 'tb'
+            code = line
+            insertion = None
+            return mode, code, insertion
+
+        ### All other stuff...
+        ###
+        if self.mode in ('input', 'output'):
+            # We assume all other text is output. Multiline input that
+            # does not use the continuation marker cannot be detected.
+            # For example, the 3 in the following is clearly output:
+            #
+            #    In [1]: print 3
+            #    3
+            #
+            # But the following second line is part of the input:
+            #
+            #    In [2]: while True:
+            #        print True
+            #
+            # In both cases, the 2nd line will be tagged as 'output'.
+            #
+            mode = 'output'
+        else:
+            mode = 'tb'
+
+        code = line
+        insertion = None
+
+        return mode, code, insertion
+
+    def get_tokens_unprocessed(self, text):
+        self.reset()
+        for match in line_re.finditer(text):
+            line = match.group()
+            mode, code, insertion = self.get_modecode(line)
+
+            if mode != self.mode:
+                # Yield buffered tokens before transitioning to new mode.
+                for token in self.buffered_tokens():
+                    yield token
+                self.mode = mode
+
+            if insertion:
+                self.insertions.append((len(self.buffer), [insertion]))
+            self.buffer += code
+        else:
+            # The for-else runs once the loop is exhausted: flush whatever
+            # is still sitting in the buffer.
+            for token in self.buffered_tokens():
+                yield token
+
+
+class IPyLexer(Lexer):
+    """
+    Primary lexer for all IPython-like code.
+
+    This is a simple helper lexer. If the first line of the text begins with
+    "In \[[0-9]+\]:", then the entire text is parsed with an IPython console
+    lexer. If not, then the entire text is parsed with an IPython lexer.
+
+    The goal is to reduce the number of lexers that are registered
+    with Pygments.
+
+    """
+    name = 'IPy session'
+    aliases = ['ipy']
+
+    def __init__(self, **options):
+        self.python3 = get_bool_opt(options, 'python3', False)
+        Lexer.__init__(self, **options)
+
+        self.IPythonLexer = IPythonLexer(**options)
+        self.IPythonConsoleLexer = IPythonConsoleLexer(**options)
+
+    def get_tokens_unprocessed(self, text):
+        # Dispatch on the first line: console sessions start with an input
+        # prompt; everything else is treated as plain IPython source.
+        if re.match(r'(In \[[0-9]+\]:)', text.strip()):
+            lex = self.IPythonConsoleLexer
+        else:
+            lex = self.IPythonLexer
+        for token in lex.get_tokens_unprocessed(text):
+            yield token
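
For anyone applying this patch, a minimal smoke test along the following lines may help confirm the first-line dispatch described in the IPyLexer docstring. This is only a sketch: it assumes Pygments is installed and the patched module is importable; the sample texts and variable names are illustrative, not part of the patch.

# Minimal smoke test for the new lexers (not part of the patch).
from pygments import highlight
from pygments.formatters import HtmlFormatter

from IPython.nbconvert.utils.lexers import IPyLexer

# First line carries an input prompt, so IPyLexer should delegate to the
# console lexer; prompts, input, and output each get their own tokens.
console_text = (
    "In [1]: a = 'foo'\n"
    "\n"
    "In [2]: a\n"
    "Out[2]: 'foo'\n"
)

# No prompt on the first line, so the plain IPython lexer handles the
# magic and shell syntax instead.
script_text = "%timeit x = 1\n!ls\n"

lexer = IPyLexer(python3=False)  # python3=True selects the Python 3 lexers
for text in (console_text, script_text):
    print(highlight(text, lexer, HtmlFormatter()))

Note that instantiating the lexers directly sidesteps registration; making the `ipy`, `ipythoncon`, and `ipythontb` aliases available through `pygments.lexers.get_lexer_by_name` still requires explicitly adding the lexers to Pygments, as the module docstring points out.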