Update IPython Pygments lexers.
chebee7i
@@ -1,46 +1,473 @@
1 """A custom pygments lexer for IPython code cells.
1 # -*- coding: utf-8 -*-
2 """
3 Defines a variety of Pygments lexers for highlighting IPython code.
4
5 This includes:
6
7 IPythonLexer
8 IPython3Lexer
9 Lexers for pure IPython (python + magic/shell commands)
10
11 IPythonPartialTracebackLexer
12 IPythonTracebackLexer
13 Supports 2.x and 3.x via keyword `python3`. The partial traceback
14 lexer reads everything but the Python code appearing in a traceback.
15 The full lexer combines the partial lexer with an IPython lexer.
16
17 IPythonConsoleLexer
18 A lexer for IPython console sessions, with support for tracebacks.
19
20 IPyLexer
21 A friendly lexer which examines the first line of text and from it,
22 decides whether to use an IPython lexer or an IPython console lexer.
23 This is probably the only lexer that needs to be explicitly added
24 to Pygments.
2
25
3 Informs The pygments highlighting library of the quirks of IPython's superset
4 of Python -- magic commands, !shell commands, etc.
5 """
26 """
6 #-----------------------------------------------------------------------------
27
7 # Copyright (c) 2013, the IPython Development Team.
28 # Standard library
8 #
29 import re
9 # Distributed under the terms of the Modified BSD License.
30
10 #
31 # Third party
11 # The full license is in the file COPYING.txt, distributed with this software.
32 from pygments.lexers import BashLexer, PythonLexer, Python3Lexer
12 #-----------------------------------------------------------------------------
33 from pygments.lexer import (
13
34 Lexer, DelegatingLexer, RegexLexer, do_insertions, bygroups, using,
14 #-----------------------------------------------------------------------------
35 )
15 # Imports
36 from pygments.token import (
16 #-----------------------------------------------------------------------------
37 Comment, Generic, Keyword, Literal, Name, Operator, Other, Text, Error,
17
38 )
18 # Third-party imports
39 from pygments.util import get_bool_opt
19 from pygments.lexers import PythonLexer, BashLexer
40
20 from pygments.lexer import bygroups, using
41
21 from pygments.token import Keyword, Operator, Text
42
22
43 line_re = re.compile('.*?\n')
23 #-----------------------------------------------------------------------------
44
24 # Class declarations
45 ipython_tokens = [
25 #-----------------------------------------------------------------------------
46 (r'(\%+)(\w+)\s+(\.*)(\n)', bygroups(Operator, Keyword,
26
47 using(BashLexer), Text)),
27 class IPythonLexer(PythonLexer):
48 (r'(\%+)(\w+)\b', bygroups(Operator, Keyword)),
28 """
49 (r'^(!)(.+)(\n)', bygroups(Operator, using(BashLexer), Text)),
29 Pygments Lexer for use with IPython code. Inherits from
50 ]
30 PythonLexer and adds information about IPython specific
51
31 keywords (i.e. magic commands, shell commands, etc.)
52 def build_ipy_lexer(python3):
32 """
53 """Builds IPython lexers depending on the value of `python3`.
33
54
34 #Basic properties
55 The lexer inherits from an appropriate Python lexer and then adds
35 name = 'IPython'
56 information about IPython specific keywords (i.e. magic commands,
36 aliases = ['ip', 'ipython']
57 shell commands, etc.)
37 filenames = ['*.ipy']
58
38
59 Parameters
39 #Highlighting information
60 ----------
40 tokens = PythonLexer.tokens.copy()
61 python3 : bool
41 tokens['root'] = [
62 If `True`, then build an IPython lexer from a Python 3 lexer.
42 (r'(\%+)(\w+)\s+(\.*)(\n)', bygroups(Operator, Keyword,
63
43 using(BashLexer), Text)),
64 """
44 (r'(\%+)(\w+)\b', bygroups(Operator, Keyword)),
65 # It would be nice to have a single IPython lexer class which takes
45 (r'^(!)(.+)(\n)', bygroups(Operator, using(BashLexer), Text)),
66 # a boolean `python3`. But since there are two Python lexer classes,
46 ] + tokens['root']
67 # we will also have two IPython lexer classes.
+    if python3:
+        PyLexer = Python3Lexer
+        clsname = 'IPython3Lexer'
+        name = 'IPython3'
+        aliases = ['ipython3']
+        doc = """IPython3 Lexer"""
+    else:
+        PyLexer = PythonLexer
+        clsname = 'IPythonLexer'
+        name = 'IPython'
+        aliases = ['ipython']
+        doc = """IPython Lexer"""
+
+    tokens = PyLexer.tokens.copy()
+    tokens['root'] = ipython_tokens + tokens['root']
+
+    attrs = {'name': name, 'aliases': aliases,
+             '__doc__': doc, 'tokens': tokens}
+
+    return type(name, (PyLexer,), attrs)
+
+
+IPython3Lexer = build_ipy_lexer(python3=True)
+IPythonLexer = build_ipy_lexer(python3=False)
+
+
+class IPythonPartialTracebackLexer(RegexLexer):
+    """
+    Partial lexer for IPython tracebacks.
+
+    Handles all the non-python output. This works for both Python 2.x and 3.x.
+
+    """
+    name = 'IPython Partial Traceback'
+
+    tokens = {
+        'root': [
+            # Tracebacks for syntax errors have a different style.
+            # For both types of tracebacks, we mark the first line with
+            # Generic.Traceback.  For syntax errors, we mark the filename
+            # as we mark the filenames for non-syntax tracebacks.
+            #
+            # These two regexps define how IPythonConsoleLexer finds a
+            # traceback.
+            #
+            ## Non-syntax traceback
+            (r'^(\^C)?(-+\n)', bygroups(Error, Generic.Traceback)),
+            ## Syntax traceback
+            (r'^(  File)(.*)(, line )(\d+\n)',
+             bygroups(Generic.Traceback, Name.Namespace,
+                      Generic.Traceback, Literal.Number.Integer)),
+
+            # (Exception Identifier)(Whitespace)(Traceback Message)
+            (r'(?u)(^[^\d\W]\w*)(\s*)(Traceback.*?\n)',
+             bygroups(Name.Exception, Generic.Whitespace, Text)),
+            # (Module/Filename)(Text)(Callee)(Function Signature)
+            # Better options for callee and function signature?
+            (r'(.*)( in )(.*)(\(.*\)\n)',
+             bygroups(Name.Namespace, Text, Name.Entity, Name.Tag)),
+            # Regular line: (Whitespace)(Line Number)(Python Code)
+            (r'(\s*?)(\d+)(.*?\n)',
+             bygroups(Generic.Whitespace, Literal.Number.Integer, Other)),
+            # Emphasized line: (Arrow)(Line Number)(Python Code)
+            # Using Exception token so arrow color matches the Exception.
+            (r'(-*>?\s?)(\d+)(.*?\n)',
+             bygroups(Name.Exception, Literal.Number.Integer, Other)),
+            # (Exception Identifier)(Message)
+            (r'(?u)(^[^\d\W]\w*)(:.*?\n)',
+             bygroups(Name.Exception, Text)),
+            # Tag everything else as Other, will be handled later.
+            (r'.*\n', Other),
+        ],
+    }
+
+
+class IPythonTracebackLexer(DelegatingLexer):
+    """
+    IPython traceback lexer.
+
+    For doctests, the tracebacks can be snipped as much as desired with the
+    exception of the lines that designate a traceback. For non-syntax error
+    tracebacks, this is the line of hyphens. For syntax error tracebacks,
+    this is the line which lists the File and line number.
+
+    """
+    # The lexer inherits from DelegatingLexer.  The "root" lexer is an
+    # appropriate IPython lexer, which depends on the value of the boolean
+    # `python3`.  First, we parse with the partial IPython traceback lexer.
+    # Then, any code marked with the "Other" token is delegated to the root
+    # lexer.
+    #
+    name = 'IPython Traceback'
+    aliases = ['ipythontb']
+
+    def __init__(self, **options):
+        self.python3 = get_bool_opt(options, 'python3', False)
+
+        if self.python3:
+            IPyLexer = IPython3Lexer
+        else:
+            IPyLexer = IPythonLexer
+
+        DelegatingLexer.__init__(self, IPyLexer,
+                                 IPythonPartialTracebackLexer, **options)
+
+
+class IPythonConsoleLexer(Lexer):
+    """
+    An IPython console lexer for IPython code-blocks and doctests, such as:
+
+    .. sourcecode:: ipythoncon
+
+        In [1]: a = 'foo'
+
+        In [2]: a
+        Out[2]: 'foo'
+
+        In [3]: print a
+        foo
+
+        In [4]: 1 / 0
+
+    Support is also provided for IPython exceptions.
+
+    .. code-block:: ipythoncon
+
+        In [1]: raise Exception
+        ---------------------------------------------------------------------------
+        Exception                                 Traceback (most recent call last)
+        <ipython-input-1-fca2ab0ca76b> in <module>()
+        ----> 1 raise Exception
+
+        Exception:
+
+    """
+    name = 'IPython console session'
+    aliases = ['ipythoncon']
+    mimetypes = ['text/x-ipython-console']
+
+    # The regexps used to determine what is input and what is output.  The
+    # input regex should be consistent with and also be the combination of
+    # the values of the `in_template` and `in2_template`.  For example, the
+    # default prompts are:
+    #
+    #    c.PromptManager.in_template = 'In [\#]: '
+    #    c.PromptManager.in2_template = '   .\D.: '
+    #    c.PromptManager.out_template = 'Out[\#]: '
+    #
+    # Note, we do not include the trailing whitespace in the regex since
+    # we want to allow blank prompts (and editors often remove trailing
+    # whitespace).
+    #
+    in1_regex = r'In \[[0-9]+\]: '
+    in2_regex = r'   \.\.+\.: '
+    out_regex = r'Out\[[0-9]+\]: '
+
+    #: The regex to determine when a traceback starts.
+    ipytb_start = re.compile(r'^(\^C)?(-+\n)|^(  File)(.*)(, line )(\d+\n)')
+
+    def __init__(self, **options):
+        """Initialize the IPython console lexer.
+
+        Parameters
+        ----------
+        python3 : bool
+            If `True`, then the console inputs are parsed using a Python 3
+            lexer. Otherwise, they are parsed using a Python 2 lexer.
+        in1_regex : str
+            The regular expression used to detect the start of inputs
+            (it is compiled in the constructor). Although the IPython
+            configuration setting may have a trailing whitespace, do not
+            include it in the regex. If not given, then the default input
+            prompt is assumed.
+        in2_regex : str
+            The regular expression used to detect the continuation of
+            inputs. Although the IPython configuration setting may have a
+            trailing whitespace, do not include it in the regex. If not
+            given, then the default input prompt is assumed.
+        out_regex : str
+            The regular expression used to detect outputs. If not given,
+            then the default output prompt is assumed.
+
+        """
+        self.python3 = get_bool_opt(options, 'python3', False)
+
+        in1_regex = options.get('in1_regex', self.in1_regex)
+        in2_regex = options.get('in2_regex', self.in2_regex)
+        out_regex = options.get('out_regex', self.out_regex)
+
+        # So that we can work with input and output prompts which have been
+        # rstrip'd (possibly by editors) we also need rstrip'd variants. If
+        # we do not do this, then such prompts will be tagged as 'output'.
+        # The reason we can't just use the rstrip'd variants instead is that
+        # we want any whitespace associated with the prompt to be inserted
+        # with the token. This allows formatted code to be modified so as to
+        # hide the appearance of prompts. For example, see copybutton.js.
+        in1_regex_rstrip = in1_regex.rstrip() + '\n'
+        in2_regex_rstrip = in2_regex.rstrip() + '\n'
+        out_regex_rstrip = out_regex.rstrip() + '\n'
+
+        # Compile and save them all.
+        attrs = ['in1_regex', 'in2_regex', 'out_regex',
+                 'in1_regex_rstrip', 'in2_regex_rstrip', 'out_regex_rstrip']
+        for attr in attrs:
+            self.__setattr__(attr, re.compile(locals()[attr]))
+
+        Lexer.__init__(self, **options)
+
+        if self.python3:
+            pylexer = IPython3Lexer
+            tblexer = IPythonTracebackLexer
+        else:
+            pylexer = IPythonLexer
+            tblexer = IPythonTracebackLexer
+
+        self.pylexer = pylexer(**options)
+        self.tblexer = tblexer(**options)
+
+        self.reset()
+
+    def reset(self):
+        self.mode = 'output'
+        self.index = 0
+        self.buffer = u''
+        self.insertions = []
+
+    def buffered_tokens(self):
+        """
+        Generator of unprocessed tokens after doing insertions and before
+        changing to a new state.
+
+        """
+        if self.mode == 'output':
+            tokens = [(0, Generic.Output, self.buffer)]
+        elif self.mode == 'input':
+            tokens = self.pylexer.get_tokens_unprocessed(self.buffer)
+        else: # traceback
+            tokens = self.tblexer.get_tokens_unprocessed(self.buffer)
+
+        for i, t, v in do_insertions(self.insertions, tokens):
+            # All token indexes are relative to the buffer.
+            yield self.index + i, t, v
+
+        # Clear it all
+        self.index += len(self.buffer)
+        self.buffer = u''
+        self.insertions = []
+
+    def get_modecode(self, line):
+        """
+        Returns the next mode and code to be added to the next mode's buffer.
+
+        The next mode depends on the current mode and the contents of line.
+
+        """
+        # To reduce the number of regex match checks, we have multiple
+        # 'if' blocks instead of 'if-elif' blocks.
+
+        ### Check for possible end of input
+        ###
+        in2_match = self.in2_regex.match(line)
+        in2_match_rstrip = self.in2_regex_rstrip.match(line)
+        if (in2_match and in2_match.group().rstrip() == line.rstrip()) or \
+           in2_match_rstrip:
+            end_input = True
+        else:
+            end_input = False
+        if end_input and self.mode != 'tb':
+            # Only look for an end of input when not in tb mode.
+            # An ellipsis could appear within the traceback.
+            mode = 'output'
+            code = u''
+            insertion = (0, Generic.Prompt, line)
+            return mode, code, insertion
+
+        ### Check for output prompt
+        ###
+        out_match = self.out_regex.match(line)
+        out_match_rstrip = self.out_regex_rstrip.match(line)
+        if out_match or out_match_rstrip:
+            mode = 'output'
+            if out_match:
+                idx = out_match.end()
+            else:
+                idx = out_match_rstrip.end()
+            code = line[idx:]
+            # Use the 'heading' token for output.  We cannot use Generic.Error
+            # since it would conflict with exceptions.
+            insertion = (0, Generic.Heading, line[:idx])
+            return mode, code, insertion
+
+
+        ### Check for input or continuation prompt (non-stripped version)
+        ###
+        in1_match = self.in1_regex.match(line)
+        if in1_match or (in2_match and self.mode != 'tb'):
+            # New input or when not in tb, continued input.
+            # We do not check for continued input when in tb since it is
+            # allowable to replace a long stack with an ellipsis.
+            mode = 'input'
+            if in1_match:
+                idx = in1_match.end()
+            else: # in2_match
+                idx = in2_match.end()
+            code = line[idx:]
+            insertion = (0, Generic.Prompt, line[:idx])
+            return mode, code, insertion
+
+        ### Check for input or continuation prompt (stripped version)
+        ###
+        in1_match_rstrip = self.in1_regex_rstrip.match(line)
+        if in1_match_rstrip or (in2_match_rstrip and self.mode != 'tb'):
+            # New input or when not in tb, continued input.
+            # We do not check for continued input when in tb since it is
+            # allowable to replace a long stack with an ellipsis.
+            mode = 'input'
+            if in1_match_rstrip:
+                idx = in1_match_rstrip.end()
+            else: # in2_match_rstrip
+                idx = in2_match_rstrip.end()
+            code = line[idx:]
+            insertion = (0, Generic.Prompt, line[:idx])
+            return mode, code, insertion
+
+        ### Check for traceback
+        ###
+        if self.ipytb_start.match(line):
+            mode = 'tb'
+            code = line
+            insertion = None
+            return mode, code, insertion
+
+        ### All other stuff...
+        ###
+        if self.mode in ('input', 'output'):
+            # We assume all other text is output. Multiline input that
+            # does not use the continuation marker cannot be detected.
+            # For example, the 3 in the following is clearly output:
+            #
+            #    In [1]: print 3
+            #    3
+            #
+            # But the following second line is part of the input:
+            #
+            #    In [2]: while True:
+            #        print True
+            #
+            # In both cases, the 2nd line will be 'output'.
+            #
+            mode = 'output'
+        else:
+            mode = 'tb'
+
+        code = line
+        insertion = None
+
+        return mode, code, insertion
+
+    def get_tokens_unprocessed(self, text):
+        self.reset()
+        for match in line_re.finditer(text):
+            line = match.group()
+            mode, code, insertion = self.get_modecode(line)
+
+            if mode != self.mode:
+                # Yield buffered tokens before transitioning to new mode.
+                for token in self.buffered_tokens():
+                    yield token
+                self.mode = mode
+
+            if insertion:
+                self.insertions.append((len(self.buffer), [insertion]))
+            self.buffer += code
+        else:
+            # The for-else clause: once every line has been consumed,
+            # flush whatever is still sitting in the buffer.
+            for token in self.buffered_tokens():
+                yield token
+
+class IPyLexer(Lexer):
+    """
+    Primary lexer for all IPython-like code.
+
+    This is a simple helper lexer.  If the first line of the text begins with
+    "In \[[0-9]+\]:", then the entire text is parsed with an IPython console
+    lexer. If not, then the entire text is parsed with an IPython lexer.
+
+    The goal is to reduce the number of lexers that are registered
+    with Pygments.
+
+    """
+    name = 'IPy session'
+    aliases = ['ipy']
+
+    def __init__(self, **options):
+        self.python3 = get_bool_opt(options, 'python3', False)
+        Lexer.__init__(self, **options)
+
+        self.IPythonLexer = IPythonLexer(**options)
+        self.IPythonConsoleLexer = IPythonConsoleLexer(**options)
+
+    def get_tokens_unprocessed(self, text):
+        if re.match(r'(In \[[0-9]+\]:)', text.strip()):
+            lex = self.IPythonConsoleLexer
+        else:
+            lex = self.IPythonLexer
+        for token in lex.get_tokens_unprocessed(text):
+            yield token
+
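
The new `IPyLexer` at the bottom of the diff is, as its docstring says, probably the only lexer that needs to be explicitly registered with Pygments; it dispatches on the first line of the text. A minimal sketch of driving it directly (the `lexers` import path and the sample strings are illustrative, not part of the commit):

    from pygments import highlight
    from pygments.formatters import HtmlFormatter

    from lexers import IPyLexer  # hypothetical import path for this module

    # The first line matches 'In [N]:', so IPyLexer hands the whole text
    # to IPythonConsoleLexer.
    session = "In [1]: a = 'foo'\n\nIn [2]: a\nOut[2]: 'foo'\n"
    print(highlight(session, IPyLexer(), HtmlFormatter()))

    # No console prompt on the first line, so the plain IPythonLexer is used.
    script = "%timeit x = 1\n!ls\n"
    print(highlight(script, IPyLexer(), HtmlFormatter()))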
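
What separates the generated `IPythonLexer`/`IPython3Lexer` from a stock `PythonLexer` is just the three `ipython_tokens` rules prepended to the `'root'` state: `%magic` names come out as `Operator` plus `Keyword`, and `!` shell lines are delegated to `BashLexer`. A rough sketch of the resulting token stream (expected values shown as comments; the exact stream may vary by Pygments version):

    from lexers import IPythonLexer  # hypothetical import path

    lexer = IPythonLexer()

    # '%timeit' hits the (r'(\%+)(\w+)\b', ...) rule: '%' -> Operator and
    # 'timeit' -> Keyword; the rest of the line is ordinary Python.
    print(list(lexer.get_tokens('%timeit x = 1\n'))[:2])
    # -> [(Token.Operator, '%'), (Token.Keyword, 'timeit')]

    # A leading '!' hands the remainder of the line to BashLexer.
    print(list(lexer.get_tokens('!ls -l\n'))[0])
    # -> (Token.Operator, '!')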
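
`IPythonConsoleLexer` reads its prompt patterns out of `**options` as plain strings, derives the rstrip'd variants, and compiles all six attributes in `__init__`, so sessions recorded with non-default prompts can still be lexed. A sketch with hypothetical custom prompts (the patterns below are illustrative, not IPython defaults):

    from lexers import IPythonConsoleLexer  # hypothetical import path

    # Pattern strings, not compiled regexes; note the trailing space that
    # mirrors the real prompt (it is rstrip'd internally for the variants).
    lexer = IPythonConsoleLexer(python3=True,
                                in1_regex=r'py3 \[[0-9]+\]> ',
                                in2_regex=r'\.\.\.> ',
                                out_regex=r'out \[[0-9]+\]> ')

    text = "py3 [1]> print('hi')\nhi\n"
    for index, token, value in lexer.get_tokens_unprocessed(text):
        print(index, token, repr(value))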
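
Finally, `IPythonTracebackLexer` is a `DelegatingLexer`: the partial traceback lexer claims the frame decorations (the line of hyphens, arrows, exception names) and tags the embedded source lines as `Other`, which the delegating machinery then re-lexes with the chosen IPython lexer. A sketch using the traceback from the console-lexer docstring above:

    from pygments import highlight
    from pygments.formatters import TerminalFormatter

    from lexers import IPythonTracebackLexer  # hypothetical import path

    tb = ("---------------------------------------------------------------------------\n"
          "Exception                                 Traceback (most recent call last)\n"
          "<ipython-input-1-fca2ab0ca76b> in <module>()\n"
          "----> 1 raise Exception\n"
          "\n"
          "Exception:\n")

    # python3=True makes IPython3Lexer the delegated 'root' lexer.
    print(highlight(tb, IPythonTracebackLexer(python3=True), TerminalFormatter()))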