Same highlighting for %%file as for %%writefile...
Matthias Geier
@@ -1,531 +1,532 @@
1 1 # -*- coding: utf-8 -*-
2 2 """
3 3 Defines a variety of Pygments lexers for highlighting IPython code.
4 4
5 5 This includes:
6 6
7 7 IPythonLexer, IPython3Lexer
8 8 Lexers for pure IPython (python + magic/shell commands)
9 9
10 10 IPythonPartialTracebackLexer, IPythonTracebackLexer
11 11 Supports 2.x and 3.x via keyword `python3`. The partial traceback
12 12 lexer reads everything but the Python code appearing in a traceback.
13 13 The full lexer combines the partial lexer with an IPython lexer.
14 14
15 15 IPythonConsoleLexer
16 16 A lexer for IPython console sessions, with support for tracebacks.
17 17
18 18 IPyLexer
19 19 A friendly lexer which examines the first line of text and, from it,
20 20 decides whether to use an IPython lexer or an IPython console lexer.
21 21 This is probably the only lexer that needs to be explicitly added
22 22 to Pygments.
23 23
24 24 """
25 25 #-----------------------------------------------------------------------------
26 26 # Copyright (c) 2013, the IPython Development Team.
27 27 #
28 28 # Distributed under the terms of the Modified BSD License.
29 29 #
30 30 # The full license is in the file COPYING.txt, distributed with this software.
31 31 #-----------------------------------------------------------------------------
32 32
33 33 # Standard library
34 34 import re
35 35
36 36 # Third party
37 37 from pygments.lexers import (
38 38 BashLexer, HtmlLexer, JavascriptLexer, RubyLexer, PerlLexer, PythonLexer,
39 39 Python3Lexer, TexLexer)
40 40 from pygments.lexer import (
41 41 Lexer, DelegatingLexer, RegexLexer, do_insertions, bygroups, using,
42 42 )
43 43 from pygments.token import (
44 44 Generic, Keyword, Literal, Name, Operator, Other, Text, Error,
45 45 )
46 46 from pygments.util import get_bool_opt
47 47
48 48 # Local
49 49
50 50 line_re = re.compile('.*?\n')
51 51
52 52 __all__ = ['build_ipy_lexer', 'IPython3Lexer', 'IPythonLexer',
53 53 'IPythonPartialTracebackLexer', 'IPythonTracebackLexer',
54 54 'IPythonConsoleLexer', 'IPyLexer']
55 55
56 56
57 57 def build_ipy_lexer(python3):
58 58 """Builds IPython lexers depending on the value of `python3`.
59 59
60 60 The lexer inherits from an appropriate Python lexer and then adds
61 61 information about IPython specific keywords (i.e. magic commands,
62 62 shell commands, etc.)
63 63
64 64 Parameters
65 65 ----------
66 66 python3 : bool
67 67 If `True`, then build an IPython lexer from a Python 3 lexer.
68 68
69 69 """
70 70 # It would be nice to have a single IPython lexer class which takes
71 71 # a boolean `python3`. But since there are two Python lexer classes,
72 72 # we will also have two IPython lexer classes.
73 73 if python3:
74 74 PyLexer = Python3Lexer
75 75 name = 'IPython3'
76 76 aliases = ['ipython3']
77 77 doc = """IPython3 Lexer"""
78 78 else:
79 79 PyLexer = PythonLexer
80 80 name = 'IPython'
81 81 aliases = ['ipython2', 'ipython']
82 82 doc = """IPython Lexer"""
83 83
84 84 ipython_tokens = [
85 85 (r'(?s)(\s*)(%%capture)([^\n]*\n)(.*)', bygroups(Text, Operator, Text, using(PyLexer))),
86 86 (r'(?s)(\s*)(%%debug)([^\n]*\n)(.*)', bygroups(Text, Operator, Text, using(PyLexer))),
87 87 (r'(?is)(\s*)(%%html)([^\n]*\n)(.*)', bygroups(Text, Operator, Text, using(HtmlLexer))),
88 88 (r'(?s)(\s*)(%%javascript)([^\n]*\n)(.*)', bygroups(Text, Operator, Text, using(JavascriptLexer))),
89 89 (r'(?s)(\s*)(%%js)([^\n]*\n)(.*)', bygroups(Text, Operator, Text, using(JavascriptLexer))),
90 90 (r'(?s)(\s*)(%%latex)([^\n]*\n)(.*)', bygroups(Text, Operator, Text, using(TexLexer))),
91 91 (r'(?s)(\s*)(%%perl)([^\n]*\n)(.*)', bygroups(Text, Operator, Text, using(PerlLexer))),
92 92 (r'(?s)(\s*)(%%prun)([^\n]*\n)(.*)', bygroups(Text, Operator, Text, using(PyLexer))),
93 93 (r'(?s)(\s*)(%%pypy)([^\n]*\n)(.*)', bygroups(Text, Operator, Text, using(PyLexer))),
94 94 (r'(?s)(\s*)(%%python)([^\n]*\n)(.*)', bygroups(Text, Operator, Text, using(PyLexer))),
95 95 (r'(?s)(\s*)(%%python2)([^\n]*\n)(.*)', bygroups(Text, Operator, Text, using(PythonLexer))),
96 96 (r'(?s)(\s*)(%%python3)([^\n]*\n)(.*)', bygroups(Text, Operator, Text, using(Python3Lexer))),
97 97 (r'(?s)(\s*)(%%ruby)([^\n]*\n)(.*)', bygroups(Text, Operator, Text, using(RubyLexer))),
98 98 (r'(?s)(\s*)(%%time)([^\n]*\n)(.*)', bygroups(Text, Operator, Text, using(PyLexer))),
99 99 (r'(?s)(\s*)(%%timeit)([^\n]*\n)(.*)', bygroups(Text, Operator, Text, using(PyLexer))),
100 100 (r'(?s)(\s*)(%%writefile)([^\n]*\n)(.*)', bygroups(Text, Operator, Text, using(PyLexer))),
101 (r'(?s)(\s*)(%%file)([^\n]*\n)(.*)', bygroups(Text, Operator, Text, using(PyLexer))),
101 102 (r"(?s)(\s*)(%%)(\w+)(.*)", bygroups(Text, Operator, Keyword, Text)),
102 103 (r'(?s)(^\s*)(%%!)([^\n]*\n)(.*)', bygroups(Text, Operator, Text, using(BashLexer))),
103 104 (r"(%%?)(\w+)(\?\??)$", bygroups(Operator, Keyword, Operator)),
104 105 (r"\b(\?\??)(\s*)$", bygroups(Operator, Text)),
105 106 (r'(%)(sx|sc|system)(.*)(\n)', bygroups(Operator, Keyword,
106 107 using(BashLexer), Text)),
107 108 (r'(%)(\w+)(.*\n)', bygroups(Operator, Keyword, Text)),
108 109 (r'^(!!)(.+)(\n)', bygroups(Operator, using(BashLexer), Text)),
109 110 (r'(!)(?!=)(.+)(\n)', bygroups(Operator, using(BashLexer), Text)),
110 111 (r'^(\s*)(\?\??)(\s*%{0,2}[\w\.\*]*)', bygroups(Text, Operator, Text)),
111 112 (r'(\s*%{0,2}[\w\.\*]*)(\?\??)(\s*)$', bygroups(Text, Operator, Text)),
112 113 ]
113 114
114 115 tokens = PyLexer.tokens.copy()
115 116 tokens['root'] = ipython_tokens + tokens['root']
116 117
117 118 attrs = {'name': name, 'aliases': aliases, 'filenames': [],
118 119 '__doc__': doc, 'tokens': tokens}
119 120
120 121 return type(name, (PyLexer,), attrs)
121 122
122 123
123 124 IPython3Lexer = build_ipy_lexer(python3=True)
124 125 IPythonLexer = build_ipy_lexer(python3=False)
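A minimal sketch (assuming a standard Pygments install) of exercising the generated lexer classes; with the rule added in this commit, a `%%file` cell body should now tokenize as Python exactly like `%%writefile`:

    from pygments import highlight
    from pygments.formatters import TerminalFormatter

    # Hypothetical cell text; the %%file line is the magic, the rest is Python.
    cell = "%%file hello.py\nprint('hello')\n"
    print(highlight(cell, IPython3Lexer(), TerminalFormatter()))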
125 126
126 127
127 128 class IPythonPartialTracebackLexer(RegexLexer):
128 129 """
129 130 Partial lexer for IPython tracebacks.
130 131
131 132 Handles all the non-python output. This works for both Python 2.x and 3.x.
132 133
133 134 """
134 135 name = 'IPython Partial Traceback'
135 136
136 137 tokens = {
137 138 'root': [
138 139 # Tracebacks for syntax errors have a different style.
139 140 # For both types of tracebacks, we mark the first line with
140 141 # Generic.Traceback. For syntax errors, we mark the filename
141 142 # as we mark the filenames for non-syntax tracebacks.
142 143 #
143 144 # These two regexps define how IPythonConsoleLexer finds a
144 145 # traceback.
145 146 #
146 147 ## Non-syntax traceback
147 148 (r'^(\^C)?(-+\n)', bygroups(Error, Generic.Traceback)),
148 149 ## Syntax traceback
149 150 (r'^( File)(.*)(, line )(\d+\n)',
150 151 bygroups(Generic.Traceback, Name.Namespace,
151 152 Generic.Traceback, Literal.Number.Integer)),
152 153
153 154 # (Exception Identifier)(Whitespace)(Traceback Message)
154 155 (r'(?u)(^[^\d\W]\w*)(\s*)(Traceback.*?\n)',
155 156 bygroups(Name.Exception, Generic.Whitespace, Text)),
156 157 # (Module/Filename)(Text)(Callee)(Function Signature)
157 158 # Better options for callee and function signature?
158 159 (r'(.*)( in )(.*)(\(.*\)\n)',
159 160 bygroups(Name.Namespace, Text, Name.Entity, Name.Tag)),
160 161 # Regular line: (Whitespace)(Line Number)(Python Code)
161 162 (r'(\s*?)(\d+)(.*?\n)',
162 163 bygroups(Generic.Whitespace, Literal.Number.Integer, Other)),
163 164 # Emphasized line: (Arrow)(Line Number)(Python Code)
164 165 # Using Exception token so arrow color matches the Exception.
165 166 (r'(-*>?\s?)(\d+)(.*?\n)',
166 167 bygroups(Name.Exception, Literal.Number.Integer, Other)),
167 168 # (Exception Identifier)(Message)
168 169 (r'(?u)(^[^\d\W]\w*)(:.*?\n)',
169 170 bygroups(Name.Exception, Text)),
170 171 # Tag everything else as Other, will be handled later.
171 172 (r'.*\n', Other),
172 173 ],
173 174 }
174 175
175 176
176 177 class IPythonTracebackLexer(DelegatingLexer):
177 178 """
178 179 IPython traceback lexer.
179 180
180 181 For doctests, the tracebacks can be snipped as much as desired with the
181 182 exception to the lines that designate a traceback. For non-syntax error
182 183 tracebacks, this is the line of hyphens. For syntax error tracebacks,
183 184 this is the line which lists the File and line number.
184 185
185 186 """
186 187 # The lexer inherits from DelegatingLexer. The "root" lexer is an
187 188 # appropriate IPython lexer, which depends on the value of the boolean
188 189 # `python3`. First, we parse with the partial IPython traceback lexer.
189 190 # Then, any code marked with the "Other" token is delegated to the root
190 191 # lexer.
191 192 #
192 193 name = 'IPython Traceback'
193 194 aliases = ['ipythontb']
194 195
195 196 def __init__(self, **options):
196 197 self.python3 = get_bool_opt(options, 'python3', False)
197 198 if self.python3:
198 199 self.aliases = ['ipython3tb']
199 200 else:
200 201 self.aliases = ['ipython2tb', 'ipythontb']
201 202
202 203 if self.python3:
203 204 IPyLexer = IPython3Lexer
204 205 else:
205 206 IPyLexer = IPythonLexer
206 207
207 208 DelegatingLexer.__init__(self, IPyLexer,
208 209 IPythonPartialTracebackLexer, **options)
209 210
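A sketch of driving the traceback lexer directly (illustrative input; the `python3` option is consumed by the lexer itself):

    tb_lexer = IPythonTracebackLexer(python3=True)
    tb_text = (
        '---------------------------------------------------------------------------\n'
        'ZeroDivisionError                         Traceback (most recent call last)\n'
        '----> 1 1 / 0\n'
        '\n'
        'ZeroDivisionError: division by zero\n'
    )
    # Lines tagged Other by the partial lexer are re-lexed as IPython code.
    for index, token, value in tb_lexer.get_tokens_unprocessed(tb_text):
        print(index, token, repr(value))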
210 211 class IPythonConsoleLexer(Lexer):
211 212 """
212 213 An IPython console lexer for IPython code-blocks and doctests, such as:
213 214
214 215 .. code-block:: rst
215 216
216 217 .. code-block:: ipythonconsole
217 218
218 219 In [1]: a = 'foo'
219 220
220 221 In [2]: a
221 222 Out[2]: 'foo'
222 223
223 224 In [3]: print a
224 225 foo
225 226
226 227 In [4]: 1 / 0
227 228
228 229
229 230 Support is also provided for IPython exceptions:
230 231
231 232 .. code-block:: rst
232 233
233 234 .. code-block:: ipythonconsole
234 235
235 236 In [1]: raise Exception
236 237
237 238 ---------------------------------------------------------------------------
238 239 Exception Traceback (most recent call last)
239 240 <ipython-input-1-fca2ab0ca76b> in <module>
240 241 ----> 1 raise Exception
241 242
242 243 Exception:
243 244
244 245 """
245 246 name = 'IPython console session'
246 247 aliases = ['ipythonconsole']
247 248 mimetypes = ['text/x-ipython-console']
248 249
249 250 # The regexps used to determine what is input and what is output.
250 251 # The default prompts for IPython are:
251 252 #
252 253 # in = 'In [#]: '
253 254 # continuation = ' .D.: '
254 255 # template = 'Out[#]: '
255 256 #
256 257 # Where '#' is the 'prompt number' or 'execution count' and 'D'
257 258 # is a number of dots matching the width of the execution count.
258 259 #
259 260 in1_regex = r'In \[[0-9]+\]: '
260 261 in2_regex = r' \.\.+\.: '
261 262 out_regex = r'Out\[[0-9]+\]: '
262 263
263 264 #: The regex to determine when a traceback starts.
264 265 ipytb_start = re.compile(r'^(\^C)?(-+\n)|^( File)(.*)(, line )(\d+\n)')
265 266
266 267 def __init__(self, **options):
267 268 """Initialize the IPython console lexer.
268 269
269 270 Parameters
270 271 ----------
271 272 python3 : bool
272 273 If `True`, then the console inputs are parsed using a Python 3
273 274 lexer. Otherwise, they are parsed using a Python 2 lexer.
274 275 in1_regex : RegexObject
275 276 The compiled regular expression used to detect the start
276 277 of inputs. Although the IPython configuration setting may have a
277 278 trailing whitespace, do not include it in the regex. If `None`,
278 279 then the default input prompt is assumed.
279 280 in2_regex : RegexObject
280 281 The compiled regular expression used to detect the continuation
281 282 of inputs. Although the IPython configuration setting may have a
282 283 trailing whitespace, do not include it in the regex. If `None`,
283 284 then the default input prompt is assumed.
284 285 out_regex : RegexObject
285 286 The compiled regular expression used to detect outputs. If `None`,
286 287 then the default output prompt is assumed.
287 288
288 289 """
289 290 self.python3 = get_bool_opt(options, 'python3', False)
290 291 if self.python3:
291 292 self.aliases = ['ipython3console']
292 293 else:
293 294 self.aliases = ['ipython2console', 'ipythonconsole']
294 295
295 296 in1_regex = options.get('in1_regex', self.in1_regex)
296 297 in2_regex = options.get('in2_regex', self.in2_regex)
297 298 out_regex = options.get('out_regex', self.out_regex)
298 299
299 300 # So that we can work with input and output prompts which have been
300 301 # rstrip'd (possibly by editors) we also need rstrip'd variants. If
301 302 # we do not do this, then such prompts will be tagged as 'output'.
302 303 # The reason we can't just use the rstrip'd variants instead is that
303 304 # we want any whitespace associated with the prompt to be inserted
304 305 # with the token. This allows formatted code to be modified so as to hide
305 306 # the appearance of prompts, with the whitespace included. One example
306 307 # use of this is in copybutton.js from the standard lib Python docs.
307 308 in1_regex_rstrip = in1_regex.rstrip() + '\n'
308 309 in2_regex_rstrip = in2_regex.rstrip() + '\n'
309 310 out_regex_rstrip = out_regex.rstrip() + '\n'
310 311
311 312 # Compile and save them all.
312 313 attrs = ['in1_regex', 'in2_regex', 'out_regex',
313 314 'in1_regex_rstrip', 'in2_regex_rstrip', 'out_regex_rstrip']
314 315 for attr in attrs:
315 316 self.__setattr__(attr, re.compile(locals()[attr]))
316 317
317 318 Lexer.__init__(self, **options)
318 319
319 320 if self.python3:
320 321 pylexer = IPython3Lexer
321 322 tblexer = IPythonTracebackLexer  # same class either way: it reads 'python3' from options
322 323 else:
323 324 pylexer = IPythonLexer
324 325 tblexer = IPythonTracebackLexer
325 326
326 327 self.pylexer = pylexer(**options)
327 328 self.tblexer = tblexer(**options)
328 329
329 330 self.reset()
330 331
331 332 def reset(self):
332 333 self.mode = 'output'
333 334 self.index = 0
334 335 self.buffer = u''
335 336 self.insertions = []
336 337
337 338 def buffered_tokens(self):
338 339 """
339 340 Generator of unprocessed tokens after doing insertions and before
340 341 changing to a new state.
341 342
342 343 """
343 344 if self.mode == 'output':
344 345 tokens = [(0, Generic.Output, self.buffer)]
345 346 elif self.mode == 'input':
346 347 tokens = self.pylexer.get_tokens_unprocessed(self.buffer)
347 348 else: # traceback
348 349 tokens = self.tblexer.get_tokens_unprocessed(self.buffer)
349 350
350 351 for i, t, v in do_insertions(self.insertions, tokens):
351 352 # All token indexes are relative to the buffer.
352 353 yield self.index + i, t, v
353 354
354 355 # Clear it all
355 356 self.index += len(self.buffer)
356 357 self.buffer = u''
357 358 self.insertions = []
358 359
359 360 def get_mci(self, line):
360 361 """
361 362 Parses the line and returns a 3-tuple: (mode, code, insertion).
362 363
363 364 `mode` is the next mode (or state) of the lexer, and is always equal
364 365 to 'input', 'output', or 'tb'.
365 366
366 367 `code` is a portion of the line that should be added to the buffer
367 368 corresponding to the next mode and eventually lexed by another lexer.
368 369 For example, `code` could be Python code if `mode` were 'input'.
369 370
370 371 `insertion` is a 3-tuple (index, token, text) representing an
371 372 unprocessed "token" that will be inserted into the stream of tokens
372 373 that are created from the buffer once we change modes. This is usually
373 374 the input or output prompt.
374 375
375 376 In general, the next mode depends on current mode and on the contents
376 377 of `line`.
377 378
378 379 """
379 380 # To reduce the number of regex match checks, we have multiple
380 381 # 'if' blocks instead of 'if-elif' blocks.
381 382
382 383 # Check for possible end of input
383 384 in2_match = self.in2_regex.match(line)
384 385 in2_match_rstrip = self.in2_regex_rstrip.match(line)
385 386 if (in2_match and in2_match.group().rstrip() == line.rstrip()) or \
386 387 in2_match_rstrip:
387 388 end_input = True
388 389 else:
389 390 end_input = False
390 391 if end_input and self.mode != 'tb':
391 392 # Only look for an end of input when not in tb mode.
392 393 # An ellipsis could appear within the traceback.
393 394 mode = 'output'
394 395 code = u''
395 396 insertion = (0, Generic.Prompt, line)
396 397 return mode, code, insertion
397 398
398 399 # Check for output prompt
399 400 out_match = self.out_regex.match(line)
400 401 out_match_rstrip = self.out_regex_rstrip.match(line)
401 402 if out_match or out_match_rstrip:
402 403 mode = 'output'
403 404 if out_match:
404 405 idx = out_match.end()
405 406 else:
406 407 idx = out_match_rstrip.end()
407 408 code = line[idx:]
408 409 # Use the 'heading' token for output. We cannot use Generic.Error
409 410 # since it would conflict with exceptions.
410 411 insertion = (0, Generic.Heading, line[:idx])
411 412 return mode, code, insertion
412 413
413 414
414 415 # Check for input or continuation prompt (non stripped version)
415 416 in1_match = self.in1_regex.match(line)
416 417 if in1_match or (in2_match and self.mode != 'tb'):
417 418 # New input or when not in tb, continued input.
418 419 # We do not check for continued input when in tb since it is
419 420 # allowable to replace a long stack with an ellipsis.
420 421 mode = 'input'
421 422 if in1_match:
422 423 idx = in1_match.end()
423 424 else: # in2_match
424 425 idx = in2_match.end()
425 426 code = line[idx:]
426 427 insertion = (0, Generic.Prompt, line[:idx])
427 428 return mode, code, insertion
428 429
429 430 # Check for input or continuation prompt (stripped version)
430 431 in1_match_rstrip = self.in1_regex_rstrip.match(line)
431 432 if in1_match_rstrip or (in2_match_rstrip and self.mode != 'tb'):
432 433 # New input or when not in tb, continued input.
433 434 # We do not check for continued input when in tb since it is
434 435 # allowable to replace a long stack with an ellipsis.
435 436 mode = 'input'
436 437 if in1_match_rstrip:
437 438 idx = in1_match_rstrip.end()
438 439 else: # in2_match
439 440 idx = in2_match_rstrip.end()
440 441 code = line[idx:]
441 442 insertion = (0, Generic.Prompt, line[:idx])
442 443 return mode, code, insertion
443 444
444 445 # Check for traceback
445 446 if self.ipytb_start.match(line):
446 447 mode = 'tb'
447 448 code = line
448 449 insertion = None
449 450 return mode, code, insertion
450 451
451 452 # All other stuff...
452 453 if self.mode in ('input', 'output'):
453 454 # We assume all other text is output. Multiline input that
454 455 # does not use the continuation marker cannot be detected.
455 456 # For example, the 3 in the following is clearly output:
456 457 #
457 458 # In [1]: print 3
458 459 # 3
459 460 #
460 461 # But the following second line is part of the input:
461 462 #
462 463 # In [2]: while True:
463 464 # print True
464 465 #
465 466 # In both cases, the 2nd line will be 'output'.
466 467 #
467 468 mode = 'output'
468 469 else:
469 470 mode = 'tb'
470 471
471 472 code = line
472 473 insertion = None
473 474
474 475 return mode, code, insertion
475 476
476 477 def get_tokens_unprocessed(self, text):
477 478 self.reset()
478 479 for match in line_re.finditer(text):
479 480 line = match.group()
480 481 mode, code, insertion = self.get_mci(line)
481 482
482 483 if mode != self.mode:
483 484 # Yield buffered tokens before transitioning to new mode.
484 485 for token in self.buffered_tokens():
485 486 yield token
486 487 self.mode = mode
487 488
488 489 if insertion:
489 490 self.insertions.append((len(self.buffer), [insertion]))
490 491 self.buffer += code
491 492
492 493 for token in self.buffered_tokens():
493 494 yield token
494 495
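A small sketch of the mode machine (default prompts assumed): `get_mci` maps each line to the next mode, the code to buffer, and the prompt insertion:

    console = IPythonConsoleLexer()
    mode, code, insertion = console.get_mci('In [1]: a = 1\n')
    # mode == 'input', code == 'a = 1\n'
    # insertion == (0, Generic.Prompt, 'In [1]: ')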
495 496 class IPyLexer(Lexer):
496 497 """
497 498 Primary lexer for all IPython-like code.
498 499
499 500 This is a simple helper lexer. If the first line of the text begins with
500 501 "In \[[0-9]+\]:", then the entire text is parsed with an IPython console
501 502 lexer. If not, then the entire text is parsed with an IPython lexer.
502 503
503 504 The goal is to reduce the number of lexers that are registered
504 505 with Pygments.
505 506
506 507 """
507 508 name = 'IPy session'
508 509 aliases = ['ipy']
509 510
510 511 def __init__(self, **options):
511 512 self.python3 = get_bool_opt(options, 'python3', False)
512 513 if self.python3:
513 514 self.aliases = ['ipy3']
514 515 else:
515 516 self.aliases = ['ipy2', 'ipy']
516 517
517 518 Lexer.__init__(self, **options)
518 519
519 520 self.IPythonLexer = IPythonLexer(**options)
520 521 self.IPythonConsoleLexer = IPythonConsoleLexer(**options)
521 522
522 523 def get_tokens_unprocessed(self, text):
523 524 # Search for the input prompt anywhere...this allows code blocks to
524 525 # begin with comments as well.
525 526 if re.match(r'.*(In \[[0-9]+\]:)', text.strip(), re.DOTALL):
526 527 lex = self.IPythonConsoleLexer
527 528 else:
528 529 lex = self.IPythonLexer
529 530 for token in lex.get_tokens_unprocessed(text):
530 531 yield token
531 532
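Finally, a sketch of `IPyLexer` dispatch (default options assumed): text containing an input prompt is routed to the console lexer, everything else to the plain IPython lexer:

    ipy = IPyLexer()
    # Routed to IPythonConsoleLexer (prompt present):
    console_tokens = list(ipy.get_tokens_unprocessed("In [1]: print('x')\nx\n"))
    # Routed to IPythonLexer (no prompt):
    plain_tokens = list(ipy.get_tokens_unprocessed('for i in range(3):\n    print(i)\n'))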