##// END OF EJS Templates
Fix %%perl highlighting
Matthias Geier -
Show More
@@ -1,532 +1,532 b''
1 1 # -*- coding: utf-8 -*-
2 2 """
3 3 Defines a variety of Pygments lexers for highlighting IPython code.
4 4
5 5 This includes:
6 6
7 7 IPythonLexer, IPython3Lexer
8 8 Lexers for pure IPython (python + magic/shell commands)
9 9
10 10 IPythonPartialTracebackLexer, IPythonTracebackLexer
11 11 Supports 2.x and 3.x via keyword `python3`. The partial traceback
12 12 lexer reads everything but the Python code appearing in a traceback.
13 13 The full lexer combines the partial lexer with an IPython lexer.
14 14
15 15 IPythonConsoleLexer
16 16 A lexer for IPython console sessions, with support for tracebacks.
17 17
18 18 IPyLexer
19 19 A friendly lexer which examines the first line of text and from it,
20 20 decides whether to use an IPython lexer or an IPython console lexer.
21 21 This is probably the only lexer that needs to be explicitly added
22 22 to Pygments.
23 23
24 24 """
25 25 #-----------------------------------------------------------------------------
26 26 # Copyright (c) 2013, the IPython Development Team.
27 27 #
28 28 # Distributed under the terms of the Modified BSD License.
29 29 #
30 30 # The full license is in the file COPYING.txt, distributed with this software.
31 31 #-----------------------------------------------------------------------------
32 32
33 33 # Standard library
34 34 import re
35 35
36 36 # Third party
37 37 from pygments.lexers import (
38 38 BashLexer, HtmlLexer, JavascriptLexer, RubyLexer, PerlLexer, PythonLexer,
39 39 Python3Lexer, TexLexer)
40 40 from pygments.lexer import (
41 41 Lexer, DelegatingLexer, RegexLexer, do_insertions, bygroups, using,
42 42 )
43 43 from pygments.token import (
44 44 Generic, Keyword, Literal, Name, Operator, Other, Text, Error,
45 45 )
46 46 from pygments.util import get_bool_opt
47 47
48 48 # Local
49 49
# Matches a single line, including its trailing newline; used by
# IPythonConsoleLexer.get_tokens_unprocessed to walk the text line-by-line.
line_re = re.compile('.*?\n')

# Public API of this module; IPyLexer is typically the only lexer that
# needs to be explicitly registered with Pygments (see module docstring).
__all__ = ['build_ipy_lexer', 'IPython3Lexer', 'IPythonLexer',
           'IPythonPartialTracebackLexer', 'IPythonTracebackLexer',
           'IPythonConsoleLexer', 'IPyLexer']
55 55
56 56
def build_ipy_lexer(python3):
    """Builds IPython lexers depending on the value of `python3`.

    The lexer inherits from an appropriate Python lexer and then adds
    information about IPython specific keywords (i.e. magic commands,
    shell commands, etc.)

    Parameters
    ----------
    python3 : bool
        If `True`, then build an IPython lexer from a Python 3 lexer.

    Returns
    -------
    A new Lexer subclass (a `type`) named 'IPython3' or 'IPython'.

    """
    # It would be nice to have a single IPython lexer class which takes
    # a boolean `python3`. But since there are two Python lexer classes,
    # we will also have two IPython lexer classes.
    if python3:
        PyLexer = Python3Lexer
        name = 'IPython3'
        aliases = ['ipython3']
        doc = """IPython3 Lexer"""
    else:
        PyLexer = PythonLexer
        name = 'IPython'
        aliases = ['ipython2', 'ipython']
        doc = """IPython Lexer"""

    # NOTE: this is an ordered list -- the first matching rule wins.  Magics
    # that are a prefix of another magic (%%python vs. %%python2/%%python3,
    # %%time vs. %%timeit) must therefore list the LONGER name first,
    # otherwise the longer magic can never match.
    ipython_tokens = [
        (r'(?s)(\s*)(%%capture)([^\n]*\n)(.*)', bygroups(Text, Operator, Text, using(PyLexer))),
        (r'(?s)(\s*)(%%debug)([^\n]*\n)(.*)', bygroups(Text, Operator, Text, using(PyLexer))),
        (r'(?is)(\s*)(%%html)([^\n]*\n)(.*)', bygroups(Text, Operator, Text, using(HtmlLexer))),
        (r'(?s)(\s*)(%%javascript)([^\n]*\n)(.*)', bygroups(Text, Operator, Text, using(JavascriptLexer))),
        (r'(?s)(\s*)(%%js)([^\n]*\n)(.*)', bygroups(Text, Operator, Text, using(JavascriptLexer))),
        (r'(?s)(\s*)(%%latex)([^\n]*\n)(.*)', bygroups(Text, Operator, Text, using(TexLexer))),
        # %%perl cells are highlighted as Perl; %%pypy cells are Python code
        # run under PyPy, so they use the Python lexer (a stale duplicate
        # rule mapping %%pypy to PerlLexer has been removed).
        (r'(?s)(\s*)(%%perl)([^\n]*\n)(.*)', bygroups(Text, Operator, Text, using(PerlLexer))),
        (r'(?s)(\s*)(%%prun)([^\n]*\n)(.*)', bygroups(Text, Operator, Text, using(PyLexer))),
        (r'(?s)(\s*)(%%pypy)([^\n]*\n)(.*)', bygroups(Text, Operator, Text, using(PyLexer))),
        # Longer names first (see ordering note above).
        (r'(?s)(\s*)(%%python3)([^\n]*\n)(.*)', bygroups(Text, Operator, Text, using(Python3Lexer))),
        (r'(?s)(\s*)(%%python2)([^\n]*\n)(.*)', bygroups(Text, Operator, Text, using(PythonLexer))),
        (r'(?s)(\s*)(%%python)([^\n]*\n)(.*)', bygroups(Text, Operator, Text, using(PyLexer))),
        (r'(?s)(\s*)(%%ruby)([^\n]*\n)(.*)', bygroups(Text, Operator, Text, using(RubyLexer))),
        (r'(?s)(\s*)(%%timeit)([^\n]*\n)(.*)', bygroups(Text, Operator, Text, using(PyLexer))),
        (r'(?s)(\s*)(%%time)([^\n]*\n)(.*)', bygroups(Text, Operator, Text, using(PyLexer))),
        (r'(?s)(\s*)(%%writefile)([^\n]*\n)(.*)', bygroups(Text, Operator, Text, using(PyLexer))),
        (r'(?s)(\s*)(%%file)([^\n]*\n)(.*)', bygroups(Text, Operator, Text, using(PyLexer))),
        # Any other cell magic: mark the name as a keyword, leave body as text.
        (r"(?s)(\s*)(%%)(\w+)(.*)", bygroups(Text, Operator, Keyword, Text)),
        # %%! runs the cell through the system shell.
        (r'(?s)(^\s*)(%%!)([^\n]*\n)(.*)', bygroups(Text, Operator, Text, using(BashLexer))),
        # Help requests on magics: %magic? / %%magic??
        (r"(%%?)(\w+)(\?\??)$", bygroups(Operator, Keyword, Operator)),
        (r"\b(\?\??)(\s*)$", bygroups(Operator, Text)),
        # Line magics that delegate to the shell.
        (r'(%)(sx|sc|system)(.*)(\n)', bygroups(Operator, Keyword,
                                                using(BashLexer), Text)),
        # Any other line magic.
        (r'(%)(\w+)(.*\n)', bygroups(Operator, Keyword, Text)),
        # !! and ! shell escapes ((?!=) avoids matching the != operator).
        (r'^(!!)(.+)(\n)', bygroups(Operator, using(BashLexer), Text)),
        (r'(!)(?!=)(.+)(\n)', bygroups(Operator, using(BashLexer), Text)),
        # ?obj / obj? introspection requests.
        (r'^(\s*)(\?\??)(\s*%{0,2}[\w\.\*]*)', bygroups(Text, Operator, Text)),
        (r'(\s*%{0,2}[\w\.\*]*)(\?\??)(\s*)$', bygroups(Text, Operator, Text)),
    ]

    # Prepend the IPython rules to the base Python lexer's 'root' state so
    # they take precedence over plain Python tokenization.
    tokens = PyLexer.tokens.copy()
    tokens['root'] = ipython_tokens + tokens['root']

    attrs = {'name': name, 'aliases': aliases, 'filenames': [],
             '__doc__': doc, 'tokens': tokens}

    return type(name, (PyLexer,), attrs)
122 122
123 123
# Concrete lexer classes for Python-3 and Python-2 flavored IPython input.
IPython3Lexer = build_ipy_lexer(python3=True)
IPythonLexer = build_ipy_lexer(python3=False)
126 126
127 127
class IPythonPartialTracebackLexer(RegexLexer):
    """
    Partial lexer for IPython tracebacks.

    Handles all the non-python output. This works for both Python 2.x and 3.x.

    Anything tagged as `Other` (the Python source lines within the traceback)
    is later re-lexed by a full IPython lexer via `IPythonTracebackLexer`.
    """
    name = 'IPython Partial Traceback'

    tokens = {
        'root': [
            # Tracebacks for syntax errors have a different style.
            # For both types of tracebacks, we mark the first line with
            # Generic.Traceback. For syntax errors, we mark the filename
            # as we mark the filenames for non-syntax tracebacks.
            #
            # These two regexps define how IPythonConsoleLexer finds a
            # traceback.
            #
            ## Non-syntax traceback: the long line of hyphens (optionally
            ## preceded by ^C when the user interrupted).
            (r'^(\^C)?(-+\n)', bygroups(Error, Generic.Traceback)),
            ## Syntax traceback: the "File ..., line N" header.
            (r'^( File)(.*)(, line )(\d+\n)',
             bygroups(Generic.Traceback, Name.Namespace,
                      Generic.Traceback, Literal.Number.Integer)),

            # (Exception Identifier)(Whitespace)(Traceback Message)
            (r'(?u)(^[^\d\W]\w*)(\s*)(Traceback.*?\n)',
             bygroups(Name.Exception, Generic.Whitespace, Text)),
            # (Module/Filename)(Text)(Callee)(Function Signature)
            # Better options for callee and function signature?
            (r'(.*)( in )(.*)(\(.*\)\n)',
             bygroups(Name.Namespace, Text, Name.Entity, Name.Tag)),
            # Regular line: (Whitespace)(Line Number)(Python Code)
            (r'(\s*?)(\d+)(.*?\n)',
             bygroups(Generic.Whitespace, Literal.Number.Integer, Other)),
            # Emphasized line: (Arrow)(Line Number)(Python Code)
            # Using Exception token so arrow color matches the Exception.
            (r'(-*>?\s?)(\d+)(.*?\n)',
             bygroups(Name.Exception, Literal.Number.Integer, Other)),
            # (Exception Identifier)(Message)
            (r'(?u)(^[^\d\W]\w*)(:.*?\n)',
             bygroups(Name.Exception, Text)),
            # Tag everything else as Other, will be handled later.
            (r'.*\n', Other),
        ],
    }
175 175
176 176
class IPythonTracebackLexer(DelegatingLexer):
    """
    IPython traceback lexer.

    For doctests, the tracebacks can be snipped as much as desired with the
    exception to the lines that designate a traceback. For non-syntax error
    tracebacks, this is the line of hyphens. For syntax error tracebacks,
    this is the line which lists the File and line number.

    """
    # This is a DelegatingLexer: the text is first run through the partial
    # IPython traceback lexer, and every span it tags as `Other` (the Python
    # source lines of the traceback) is then handed to the "root" lexer,
    # which is the IPython lexer matching the requested Python version.
    name = 'IPython Traceback'
    aliases = ['ipythontb']

    def __init__(self, **options):
        # Pick aliases and the root code lexer in one pass, depending on
        # whether Python-3 flavored input was requested.
        self.python3 = get_bool_opt(options, 'python3', False)
        if self.python3:
            self.aliases = ['ipython3tb']
            root_lexer = IPython3Lexer
        else:
            self.aliases = ['ipython2tb', 'ipythontb']
            root_lexer = IPythonLexer

        DelegatingLexer.__init__(self, root_lexer,
                                 IPythonPartialTracebackLexer, **options)
210 210
class IPythonConsoleLexer(Lexer):
    """
    An IPython console lexer for IPython code-blocks and doctests, such as:

    .. code-block:: rst

        .. code-block:: ipythonconsole

            In [1]: a = 'foo'

            In [2]: a
            Out[2]: 'foo'

            In [3]: print a
            foo

            In [4]: 1 / 0


    Support is also provided for IPython exceptions:

    .. code-block:: rst

        .. code-block:: ipythonconsole

            In [1]: raise Exception

            ---------------------------------------------------------------------------
            Exception                                 Traceback (most recent call last)
            <ipython-input-1-fca2ab0ca76b> in <module>
            ----> 1 raise Exception

            Exception:

    """
    name = 'IPython console session'
    aliases = ['ipythonconsole']
    mimetypes = ['text/x-ipython-console']

    # The regexps used to determine what is input and what is output.
    # The default prompts for IPython are:
    #
    #     in = 'In [#]: '
    #     continuation = '   .D.: '
    #     template = 'Out[#]: '
    #
    # Where '#' is the 'prompt number' or 'execution count' and 'D'
    # D is a number of dots matching the width of the execution count
    #
    in1_regex = r'In \[[0-9]+\]: '
    in2_regex = r'   \.\.+\.: '
    out_regex = r'Out\[[0-9]+\]: '

    #: The regex to determine when a traceback starts.
    #: Matches either the hyphen rule of a runtime traceback or the
    #: "File ..., line N" header of a syntax error (must stay in sync with
    #: the first two rules of IPythonPartialTracebackLexer).
    ipytb_start = re.compile(r'^(\^C)?(-+\n)|^( File)(.*)(, line )(\d+\n)')

    def __init__(self, **options):
        """Initialize the IPython console lexer.

        Parameters
        ----------
        python3 : bool
            If `True`, then the console inputs are parsed using a Python 3
            lexer. Otherwise, they are parsed using a Python 2 lexer.
        in1_regex : RegexObject
            The compiled regular expression used to detect the start
            of inputs. Although the IPython configuration setting may have a
            trailing whitespace, do not include it in the regex. If `None`,
            then the default input prompt is assumed.
        in2_regex : RegexObject
            The compiled regular expression used to detect the continuation
            of inputs. Although the IPython configuration setting may have a
            trailing whitespace, do not include it in the regex. If `None`,
            then the default input prompt is assumed.
        out_regex : RegexObject
            The compiled regular expression used to detect outputs. If `None`,
            then the default output prompt is assumed.

        """
        self.python3 = get_bool_opt(options, 'python3', False)
        if self.python3:
            self.aliases = ['ipython3console']
        else:
            self.aliases = ['ipython2console', 'ipythonconsole']

        # Allow callers to override the prompt patterns; fall back to the
        # class-level defaults (plain strings at this point).
        in1_regex = options.get('in1_regex', self.in1_regex)
        in2_regex = options.get('in2_regex', self.in2_regex)
        out_regex = options.get('out_regex', self.out_regex)

        # So that we can work with input and output prompts which have been
        # rstrip'd (possibly by editors) we also need rstrip'd variants. If
        # we do not do this, then such prompts will be tagged as 'output'.
        # The reason can't just use the rstrip'd variants instead is because
        # we want any whitespace associated with the prompt to be inserted
        # with the token. This allows formatted code to be modified so as hide
        # the appearance of prompts, with the whitespace included. One example
        # use of this is in copybutton.js from the standard lib Python docs.
        in1_regex_rstrip = in1_regex.rstrip() + '\n'
        in2_regex_rstrip = in2_regex.rstrip() + '\n'
        out_regex_rstrip = out_regex.rstrip() + '\n'

        # Compile and save them all.  Note this rebinds the class-level
        # string attributes to compiled patterns on the instance.
        attrs = ['in1_regex', 'in2_regex', 'out_regex',
                 'in1_regex_rstrip', 'in2_regex_rstrip', 'out_regex_rstrip']
        for attr in attrs:
            self.__setattr__(attr, re.compile(locals()[attr]))

        Lexer.__init__(self, **options)

        if self.python3:
            pylexer = IPython3Lexer
            tblexer = IPythonTracebackLexer
        else:
            pylexer = IPythonLexer
            tblexer = IPythonTracebackLexer

        # Sub-lexers for input code and for tracebacks; output is emitted
        # directly as Generic.Output without further lexing.
        self.pylexer = pylexer(**options)
        self.tblexer = tblexer(**options)

        self.reset()

    def reset(self):
        # Lexing state: current mode ('input', 'output', or 'tb'), the
        # absolute index of the start of the buffer, the accumulated text
        # for the current mode, and the pending prompt insertions.
        self.mode = 'output'
        self.index = 0
        self.buffer = u''
        self.insertions = []

    def buffered_tokens(self):
        """
        Generator of unprocessed tokens after doing insertions and before
        changing to a new state.

        Flushes the accumulated buffer through the lexer appropriate for the
        current mode, re-inserting any buffered prompt tokens, then clears
        the buffer and advances `self.index` past it.
        """
        if self.mode == 'output':
            tokens = [(0, Generic.Output, self.buffer)]
        elif self.mode == 'input':
            tokens = self.pylexer.get_tokens_unprocessed(self.buffer)
        else: # traceback
            tokens = self.tblexer.get_tokens_unprocessed(self.buffer)

        for i, t, v in do_insertions(self.insertions, tokens):
            # All token indexes are relative to the buffer.
            yield self.index + i, t, v

        # Clear it all
        self.index += len(self.buffer)
        self.buffer = u''
        self.insertions = []

    def get_mci(self, line):
        """
        Parses the line and returns a 3-tuple: (mode, code, insertion).

        `mode` is the next mode (or state) of the lexer, and is always equal
        to 'input', 'output', or 'tb'.

        `code` is a portion of the line that should be added to the buffer
        corresponding to the next mode and eventually lexed by another lexer.
        For example, `code` could be Python code if `mode` were 'input'.

        `insertion` is a 3-tuple (index, token, text) representing an
        unprocessed "token" that will be inserted into the stream of tokens
        that are created from the buffer once we change modes. This is usually
        the input or output prompt.

        In general, the next mode depends on current mode and on the contents
        of `line`.

        """
        # To reduce the number of regex match checks, we have multiple
        # 'if' blocks instead of 'if-elif' blocks.

        # Check for possible end of input: a bare continuation prompt
        # (nothing but the prompt on the line) terminates the input block.
        in2_match = self.in2_regex.match(line)
        in2_match_rstrip = self.in2_regex_rstrip.match(line)
        if (in2_match and in2_match.group().rstrip() == line.rstrip()) or \
           in2_match_rstrip:
            end_input = True
        else:
            end_input = False
        if end_input and self.mode != 'tb':
            # Only look for an end of input when not in tb mode.
            # An ellipsis could appear within the traceback.
            mode = 'output'
            code = u''
            insertion = (0, Generic.Prompt, line)
            return mode, code, insertion

        # Check for output prompt
        out_match = self.out_regex.match(line)
        out_match_rstrip = self.out_regex_rstrip.match(line)
        if out_match or out_match_rstrip:
            mode = 'output'
            if out_match:
                idx = out_match.end()
            else:
                idx = out_match_rstrip.end()
            code = line[idx:]
            # Use the 'heading' token for output. We cannot use Generic.Error
            # since it would conflict with exceptions.
            insertion = (0, Generic.Heading, line[:idx])
            return mode, code, insertion


        # Check for input or continuation prompt (non stripped version)
        in1_match = self.in1_regex.match(line)
        if in1_match or (in2_match and self.mode != 'tb'):
            # New input or when not in tb, continued input.
            # We do not check for continued input when in tb since it is
            # allowable to replace a long stack with an ellipsis.
            mode = 'input'
            if in1_match:
                idx = in1_match.end()
            else: # in2_match
                idx = in2_match.end()
            code = line[idx:]
            insertion = (0, Generic.Prompt, line[:idx])
            return mode, code, insertion

        # Check for input or continuation prompt (stripped version)
        in1_match_rstrip = self.in1_regex_rstrip.match(line)
        if in1_match_rstrip or (in2_match_rstrip and self.mode != 'tb'):
            # New input or when not in tb, continued input.
            # We do not check for continued input when in tb since it is
            # allowable to replace a long stack with an ellipsis.
            mode = 'input'
            if in1_match_rstrip:
                idx = in1_match_rstrip.end()
            else: # in2_match
                idx = in2_match_rstrip.end()
            code = line[idx:]
            insertion = (0, Generic.Prompt, line[:idx])
            return mode, code, insertion

        # Check for traceback
        if self.ipytb_start.match(line):
            mode = 'tb'
            code = line
            insertion = None
            return mode, code, insertion

        # All other stuff...
        if self.mode in ('input', 'output'):
            # We assume all other text is output. Multiline input that
            # does not use the continuation marker cannot be detected.
            # For example, the 3 in the following is clearly output:
            #
            #    In [1]: print 3
            #    3
            #
            # But the following second line is part of the input:
            #
            #    In [2]: while True:
            #        print True
            #
            # In both cases, the 2nd line will be 'output'.
            #
            mode = 'output'
        else:
            mode = 'tb'

        code = line
        insertion = None

        return mode, code, insertion

    def get_tokens_unprocessed(self, text):
        # Walk the text one line at a time, accumulating lines into a buffer
        # per mode; the buffer is flushed (lexed) whenever the mode changes.
        self.reset()
        for match in line_re.finditer(text):
            line = match.group()
            mode, code, insertion = self.get_mci(line)

            if mode != self.mode:
                # Yield buffered tokens before transitioning to new mode.
                for token in self.buffered_tokens():
                    yield token
                self.mode = mode

            if insertion:
                self.insertions.append((len(self.buffer), [insertion]))
            self.buffer += code

        # Flush whatever remains at end of text.
        for token in self.buffered_tokens():
            yield token
495 495
class IPyLexer(Lexer):
    r"""
    Primary lexer for all IPython-like code.

    This is a simple helper lexer. If the first line of the text begins with
    "In \[[0-9]+\]:", then the entire text is parsed with an IPython console
    lexer. If not, then the entire text is parsed with an IPython lexer.

    The goal is to reduce the number of lexers that are registered
    with Pygments.

    """
    # NOTE: the docstring is a raw string because it contains "\[" -- in a
    # plain string that is an invalid escape sequence (a warning on modern
    # Python versions).
    name = 'IPy session'
    aliases = ['ipy']

    def __init__(self, **options):
        """Initialize the helper lexer.

        Parameters
        ----------
        python3 : bool
            If `True`, parse code input with a Python 3 flavored lexer.

        """
        self.python3 = get_bool_opt(options, 'python3', False)
        if self.python3:
            self.aliases = ['ipy3']
        else:
            self.aliases = ['ipy2', 'ipy']

        Lexer.__init__(self, **options)

        # Honor the `python3` option when choosing the code lexer.
        # (Previously the Python-2 flavored IPythonLexer was used
        # unconditionally.)  The console lexer handles `python3` itself.
        if self.python3:
            self.IPythonLexer = IPython3Lexer(**options)
        else:
            self.IPythonLexer = IPythonLexer(**options)
        self.IPythonConsoleLexer = IPythonConsoleLexer(**options)

    def get_tokens_unprocessed(self, text):
        # Search for the input prompt anywhere...this allows code blocks to
        # begin with comments as well.
        if re.match(r'.*(In \[[0-9]+\]:)', text.strip(), re.DOTALL):
            lex = self.IPythonConsoleLexer
        else:
            lex = self.IPythonLexer
        for token in lex.get_tokens_unprocessed(text):
            yield token
532 532
General Comments 0
You need to be logged in to leave comments. Login now