Closes #7558: Added a rule for cell magics.
Lev Abalkin
@@ -1,506 +1,507 @@
# -*- coding: utf-8 -*-
"""
Defines a variety of Pygments lexers for highlighting IPython code.

This includes:

    IPythonLexer, IPython3Lexer
        Lexers for pure IPython (python + magic/shell commands)

    IPythonPartialTracebackLexer, IPythonTracebackLexer
        Supports 2.x and 3.x via the keyword `python3`. The partial traceback
        lexer reads everything but the Python code appearing in a traceback.
        The full lexer combines the partial lexer with an IPython lexer.

    IPythonConsoleLexer
        A lexer for IPython console sessions, with support for tracebacks.

    IPyLexer
        A friendly lexer which examines the first line of text and, from it,
        decides whether to use an IPython lexer or an IPython console lexer.
        This is probably the only lexer that needs to be explicitly added
        to Pygments.

"""
#-----------------------------------------------------------------------------
# Copyright (c) 2013, the IPython Development Team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
#-----------------------------------------------------------------------------

# Standard library
import re

# Third party
from pygments.lexers import BashLexer, PythonLexer, Python3Lexer
from pygments.lexer import (
    Lexer, DelegatingLexer, RegexLexer, do_insertions, bygroups, using,
)
from pygments.token import (
    Comment, Generic, Keyword, Literal, Name, Operator, Other, Text, Error,
)
from pygments.util import get_bool_opt

# Local
from IPython.testing.skipdoctest import skip_doctest

line_re = re.compile('.*?\n')

ipython_tokens = [
    (r"(?s)(\s*)(%%)(\w+)(.*)", bygroups(Text, Operator, Keyword, Text)),
    (r'(%)(sx|sc|system)(.*)(\n)', bygroups(Operator, Keyword,
                                            using(BashLexer), Text)),
    (r'(%)(\w+)(.*\n)', bygroups(Operator, Keyword, Text)),
    (r'^(!)(.+)(\n)', bygroups(Operator, using(BashLexer), Text)),
    (r'^(.+)(=)(\s*)(!)(.+)(\n)', bygroups(
        # With the limited syntax allowed on the l.h.s. of a shell capture,
        # we don't need to differentiate between Python 2 and 3.
        using(Python3Lexer), Operator, Text, Operator, using(BashLexer), Text)),
]
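
# The first entry above is the rule this commit adds (#7558): it matches a
# cell magic such as "%%bash" at the start of a cell, tagging "%%" as Operator
# and the magic name as Keyword. A rough sketch of the token stream produced
# for '\t%%xyz\n$foo\n' (mirroring the new test case below):
#
#     (Text, '\t'), (Operator, '%%'), (Keyword, 'xyz'), (Text, '\n$foo\n')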

def build_ipy_lexer(python3):
    """Builds IPython lexers depending on the value of `python3`.

    The lexer inherits from an appropriate Python lexer and then adds
    information about IPython-specific keywords (magic commands, shell
    commands, etc.)

    Parameters
    ----------
    python3 : bool
        If `True`, then build an IPython lexer from a Python 3 lexer.

    """
    # It would be nice to have a single IPython lexer class which takes
    # a boolean `python3`. But since there are two Python lexer classes,
    # we will also have two IPython lexer classes.
    if python3:
        PyLexer = Python3Lexer
        clsname = 'IPython3Lexer'
        name = 'IPython3'
        aliases = ['ipython3']
        doc = """IPython3 Lexer"""
    else:
        PyLexer = PythonLexer
        clsname = 'IPythonLexer'
        name = 'IPython'
        aliases = ['ipython2', 'ipython']
        doc = """IPython Lexer"""

    tokens = PyLexer.tokens.copy()
    tokens['root'] = ipython_tokens + tokens['root']

    attrs = {'name': name, 'aliases': aliases,
             '__doc__': doc, 'tokens': tokens}

    return type(name, (PyLexer,), attrs)


IPython3Lexer = build_ipy_lexer(python3=True)
IPythonLexer = build_ipy_lexer(python3=False)
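

# A minimal usage sketch (illustrative, not part of the commit): the classes
# built above are ordinary Pygments lexers, so they can be driven directly.
# `_demo_highlight` is a hypothetical helper, not IPython API.
def _demo_highlight(code):
    from pygments import highlight
    from pygments.formatters import TerminalFormatter
    # Render an IPython fragment (e.g. "%timeit pass\n") for a terminal.
    return highlight(code, IPythonLexer(), TerminalFormatter())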


class IPythonPartialTracebackLexer(RegexLexer):
    """
    Partial lexer for IPython tracebacks.

    Handles all the non-Python output. This works for both Python 2.x and 3.x.

    """
    name = 'IPython Partial Traceback'

    tokens = {
        'root': [
            # Tracebacks for syntax errors have a different style.
            # For both types of tracebacks, we mark the first line with
            # Generic.Traceback. For syntax errors, we also mark the
            # filename, as we mark the filenames for non-syntax tracebacks.
            #
            # These two regexps define how IPythonConsoleLexer finds a
            # traceback.
            #
            ## Non-syntax traceback
            (r'^(\^C)?(-+\n)', bygroups(Error, Generic.Traceback)),
            ## Syntax traceback
            (r'^(  File)(.*)(, line )(\d+\n)',
             bygroups(Generic.Traceback, Name.Namespace,
                      Generic.Traceback, Literal.Number.Integer)),

            # (Exception Identifier)(Whitespace)(Traceback Message)
            (r'(?u)(^[^\d\W]\w*)(\s*)(Traceback.*?\n)',
             bygroups(Name.Exception, Generic.Whitespace, Text)),
            # (Module/Filename)(Text)(Callee)(Function Signature)
            # Better options for callee and function signature?
            (r'(.*)( in )(.*)(\(.*\)\n)',
             bygroups(Name.Namespace, Text, Name.Entity, Name.Tag)),
            # Regular line: (Whitespace)(Line Number)(Python Code)
            (r'(\s*?)(\d+)(.*?\n)',
             bygroups(Generic.Whitespace, Literal.Number.Integer, Other)),
            # Emphasized line: (Arrow)(Line Number)(Python Code)
            # Using Exception token so arrow color matches the Exception.
            (r'(-*>?\s?)(\d+)(.*?\n)',
             bygroups(Name.Exception, Literal.Number.Integer, Other)),
            # (Exception Identifier)(Message)
            (r'(?u)(^[^\d\W]\w*)(:.*?\n)',
             bygroups(Name.Exception, Text)),
            # Tag everything else as Other; it will be handled later.
            (r'.*\n', Other),
        ],
    }
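
    # A hedged illustration (lines invented for exposition) of how the rules
    # above classify a minimal traceback:
    #
    #     -----------------------------------------   -> Generic.Traceback
    #     ZeroDivisionError  Traceback (most recent call last)
    #                                                 -> Name.Exception, Text
    #     <ipython-input-1-...> in <module>()         -> Name.Namespace, Name.Entity
    #     ----> 1 1/0                                 -> Name.Exception, Integer, Other
    #     ZeroDivisionError: division by zero         -> Name.Exception, Text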


class IPythonTracebackLexer(DelegatingLexer):
    """
    IPython traceback lexer.

    For doctests, the tracebacks can be snipped as much as desired, with the
    exception of the lines that designate a traceback. For non-syntax error
    tracebacks, this is the line of hyphens. For syntax error tracebacks,
    this is the line which lists the File and line number.

    """
    # The lexer inherits from DelegatingLexer. The "root" lexer is an
    # appropriate IPython lexer, which depends on the value of the boolean
    # `python3`. First, we parse with the partial IPython traceback lexer.
    # Then, any code marked with the "Other" token is delegated to the root
    # lexer.
    #
    name = 'IPython Traceback'
    aliases = ['ipythontb']

    def __init__(self, **options):
        self.python3 = get_bool_opt(options, 'python3', False)
        if self.python3:
            self.aliases = ['ipython3tb']
        else:
            self.aliases = ['ipython2tb', 'ipythontb']

        if self.python3:
            IPyLexer = IPython3Lexer
        else:
            IPyLexer = IPythonLexer

        DelegatingLexer.__init__(self, IPyLexer,
                                 IPythonPartialTracebackLexer, **options)
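
    # A hedged usage sketch: once registered with Pygments, the combined
    # lexer is reachable via the aliases set above; it can also be
    # instantiated directly.
    #
    #     tb_lexer = IPythonTracebackLexer(python3=True)  # alias 'ipython3tb'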

@skip_doctest
class IPythonConsoleLexer(Lexer):
    """
    An IPython console lexer for IPython code-blocks and doctests, such as:

    .. code-block:: rst

        .. code-block:: ipythonconsole

            In [1]: a = 'foo'

            In [2]: a
            Out[2]: 'foo'

            In [3]: print a
            foo

            In [4]: 1 / 0


    Support is also provided for IPython exceptions:

    .. code-block:: rst

        .. code-block:: ipythonconsole

            In [1]: raise Exception

            ---------------------------------------------------------------------------
            Exception                                 Traceback (most recent call last)
            <ipython-input-1-fca2ab0ca76b> in <module>()
            ----> 1 raise Exception

            Exception:

    """
    name = 'IPython console session'
    aliases = ['ipythonconsole']
    mimetypes = ['text/x-ipython-console']

    # The regexps used to determine what is input and what is output.
    # The default prompts for IPython are:
    #
    #     c.PromptManager.in_template  = 'In [\#]: '
    #     c.PromptManager.in2_template = '   .\D.: '
    #     c.PromptManager.out_template = 'Out[\#]: '
    #
    in1_regex = r'In \[[0-9]+\]: '
    in2_regex = r'   \.\.\.+: '
    out_regex = r'Out\[[0-9]+\]: '

    #: The regex to determine when a traceback starts.
    ipytb_start = re.compile(r'^(\^C)?(-+\n)|^(  File)(.*)(, line )(\d+\n)')

    def __init__(self, **options):
        """Initialize the IPython console lexer.

        Parameters
        ----------
        python3 : bool
            If `True`, then the console inputs are parsed using a Python 3
            lexer. Otherwise, they are parsed using a Python 2 lexer.
        in1_regex : str
            The regular expression used to detect the start of inputs; it is
            rstrip'd and compiled below, so pass a string rather than a
            compiled pattern. Although the IPython configuration setting may
            have a trailing whitespace, do not include it in the regex. If
            `None`, then the default input prompt is assumed.
        in2_regex : str
            The regular expression used to detect the continuation of inputs.
            Although the IPython configuration setting may have a trailing
            whitespace, do not include it in the regex. If `None`, then the
            default input prompt is assumed.
        out_regex : str
            The regular expression used to detect outputs. If `None`, then
            the default output prompt is assumed.

        """
        self.python3 = get_bool_opt(options, 'python3', False)
        if self.python3:
            self.aliases = ['ipython3console']
        else:
            self.aliases = ['ipython2console', 'ipythonconsole']

        in1_regex = options.get('in1_regex', self.in1_regex)
        in2_regex = options.get('in2_regex', self.in2_regex)
        out_regex = options.get('out_regex', self.out_regex)

        # So that we can work with input and output prompts which have been
        # rstrip'd (possibly by editors), we also need rstrip'd variants. If
        # we do not do this, then such prompts will be tagged as 'output'.
        # The reason we can't just use the rstrip'd variants instead is that
        # we want any whitespace associated with the prompt to be inserted
        # with the token. This allows formatted code to be modified so as to
        # hide the appearance of prompts, with the whitespace included. One
        # example use of this is in copybutton.js from the standard lib
        # Python docs.
        in1_regex_rstrip = in1_regex.rstrip() + '\n'
        in2_regex_rstrip = in2_regex.rstrip() + '\n'
        out_regex_rstrip = out_regex.rstrip() + '\n'

        # Compile and save them all.
        attrs = ['in1_regex', 'in2_regex', 'out_regex',
                 'in1_regex_rstrip', 'in2_regex_rstrip', 'out_regex_rstrip']
        for attr in attrs:
            self.__setattr__(attr, re.compile(locals()[attr]))

        Lexer.__init__(self, **options)

        if self.python3:
            pylexer = IPython3Lexer
            tblexer = IPythonTracebackLexer
        else:
            pylexer = IPythonLexer
            tblexer = IPythonTracebackLexer

        self.pylexer = pylexer(**options)
        self.tblexer = tblexer(**options)

        self.reset()
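
    # A hedged construction sketch (values illustrative): the prompt options
    # are plain regex strings; __init__ rstrips and compiles them.
    #
    #     lexer = IPythonConsoleLexer(python3=True,
    #                                 in1_regex=r'In \[[0-9]+\]: ',
    #                                 out_regex=r'Out\[[0-9]+\]: ')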

    def reset(self):
        self.mode = 'output'
        self.index = 0
        self.buffer = u''
        self.insertions = []

    def buffered_tokens(self):
        """
        Generator of unprocessed tokens after doing insertions and before
        changing to a new state.

        """
        if self.mode == 'output':
            tokens = [(0, Generic.Output, self.buffer)]
        elif self.mode == 'input':
            tokens = self.pylexer.get_tokens_unprocessed(self.buffer)
        else: # traceback
            tokens = self.tblexer.get_tokens_unprocessed(self.buffer)

        for i, t, v in do_insertions(self.insertions, tokens):
            # All token indexes are relative to the buffer.
            yield self.index + i, t, v

        # Clear it all
        self.index += len(self.buffer)
        self.buffer = u''
        self.insertions = []
    def get_mci(self, line):
        """
        Parses the line and returns a 3-tuple: (mode, code, insertion).

        `mode` is the next mode (or state) of the lexer, and is always equal
        to 'input', 'output', or 'tb'.

        `code` is a portion of the line that should be added to the buffer
        corresponding to the next mode and eventually lexed by another lexer.
        For example, `code` could be Python code if `mode` were 'input'.

        `insertion` is a 3-tuple (index, token, text) representing an
        unprocessed "token" that will be inserted into the stream of tokens
        that are created from the buffer once we change modes. This is usually
        the input or output prompt.

        In general, the next mode depends on the current mode and on the
        contents of `line`.

        """
        # To reduce the number of regex match checks, we have multiple
        # 'if' blocks instead of 'if-elif' blocks.

        # Check for possible end of input
        in2_match = self.in2_regex.match(line)
        in2_match_rstrip = self.in2_regex_rstrip.match(line)
        if (in2_match and in2_match.group().rstrip() == line.rstrip()) or \
           in2_match_rstrip:
            end_input = True
        else:
            end_input = False
        if end_input and self.mode != 'tb':
            # Only look for an end of input when not in tb mode.
            # An ellipsis could appear within the traceback.
            mode = 'output'
            code = u''
            insertion = (0, Generic.Prompt, line)
            return mode, code, insertion

        # Check for output prompt
        out_match = self.out_regex.match(line)
        out_match_rstrip = self.out_regex_rstrip.match(line)
        if out_match or out_match_rstrip:
            mode = 'output'
            if out_match:
                idx = out_match.end()
            else:
                idx = out_match_rstrip.end()
            code = line[idx:]
            # Use the 'heading' token for output. We cannot use Generic.Error
            # since it would conflict with exceptions.
            insertion = (0, Generic.Heading, line[:idx])
            return mode, code, insertion

        # Check for input or continuation prompt (non-stripped version)
        in1_match = self.in1_regex.match(line)
        if in1_match or (in2_match and self.mode != 'tb'):
            # New input, or (when not in tb mode) continued input.
            # We do not check for continued input when in tb mode since it is
            # allowable to replace a long stack with an ellipsis.
            mode = 'input'
            if in1_match:
                idx = in1_match.end()
            else: # in2_match
                idx = in2_match.end()
            code = line[idx:]
            insertion = (0, Generic.Prompt, line[:idx])
            return mode, code, insertion

        # Check for input or continuation prompt (stripped version)
        in1_match_rstrip = self.in1_regex_rstrip.match(line)
        if in1_match_rstrip or (in2_match_rstrip and self.mode != 'tb'):
            # New input, or (when not in tb mode) continued input.
            # We do not check for continued input when in tb mode since it is
            # allowable to replace a long stack with an ellipsis.
            mode = 'input'
            if in1_match_rstrip:
                idx = in1_match_rstrip.end()
            else: # in2_match_rstrip
                idx = in2_match_rstrip.end()
            code = line[idx:]
            insertion = (0, Generic.Prompt, line[:idx])
            return mode, code, insertion

        # Check for traceback
        if self.ipytb_start.match(line):
            mode = 'tb'
            code = line
            insertion = None
            return mode, code, insertion

        # All other stuff...
        if self.mode in ('input', 'output'):
            # We assume all other text is output. Multiline input that
            # does not use the continuation marker cannot be detected.
            # For example, the 3 in the following is clearly output:
            #
            #    In [1]: print 3
            #    3
            #
            # But the following second line is part of the input:
            #
            #    In [2]: while True:
            #                print True
            #
            # In both cases, the 2nd line will be 'output'.
            #
            mode = 'output'
        else:
            mode = 'tb'

        code = line
        insertion = None

        return mode, code, insertion
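
    # A hedged walk-through (lines invented for exposition) of get_mci on a
    # short session:
    #
    #     'In [1]: x = 1\n' -> ('input',  'x = 1\n', (0, Prompt,  'In [1]: '))
    #     'Out[1]: 1\n'     -> ('output', '1\n',     (0, Heading, 'Out[1]: '))
    #     'stray text\n'    -> ('output', line,      None)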

    def get_tokens_unprocessed(self, text):
        self.reset()
        for match in line_re.finditer(text):
            line = match.group()
            mode, code, insertion = self.get_mci(line)

            if mode != self.mode:
                # Yield buffered tokens before transitioning to a new mode.
                for token in self.buffered_tokens():
                    yield token
                self.mode = mode

            if insertion:
                self.insertions.append((len(self.buffer), [insertion]))
            self.buffer += code
        else:
            # The for-else clause flushes whatever is still buffered once
            # every line has been consumed.
            for token in self.buffered_tokens():
                yield token

class IPyLexer(Lexer):
    """
    Primary lexer for all IPython-like code.

    This is a simple helper lexer. If the text contains an IPython input
    prompt (matching "In \[[0-9]+\]:"), then the entire text is parsed with
    an IPython console lexer. If not, then the entire text is parsed with an
    IPython lexer.

    The goal is to reduce the number of lexers that are registered
    with Pygments.

    """
    name = 'IPy session'
    aliases = ['ipy']

    def __init__(self, **options):
        self.python3 = get_bool_opt(options, 'python3', False)
        if self.python3:
            self.aliases = ['ipy3']
        else:
            self.aliases = ['ipy2', 'ipy']

        Lexer.__init__(self, **options)

        self.IPythonLexer = IPythonLexer(**options)
        self.IPythonConsoleLexer = IPythonConsoleLexer(**options)

    def get_tokens_unprocessed(self, text):
        # Search for the input prompt anywhere... this allows code blocks to
        # begin with comments as well.
        if re.match(r'.*(In \[[0-9]+\]:)', text.strip(), re.DOTALL):
            lex = self.IPythonConsoleLexer
        else:
            lex = self.IPythonLexer
        for token in lex.get_tokens_unprocessed(text):
            yield token
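
# A hedged dispatch sketch (inputs illustrative): text containing an
# "In [N]:" prompt takes the console path, anything else the pure-code path.
#
#     IPyLexer().get_tokens("In [1]: a = 'foo'\n")  # console session
#     IPyLexer().get_tokens("%timeit pass\n")       # plain IPython code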
@@ -1,79 +1,88 @@
1 1 """Test lexers module"""
2 2 #-----------------------------------------------------------------------------
3 3 # Copyright (C) 2014 The IPython Development Team
4 4 #
5 5 # Distributed under the terms of the BSD License. The full license is in
6 6 # the file COPYING, distributed as part of this software.
7 7 #-----------------------------------------------------------------------------
8 8
9 9 #-----------------------------------------------------------------------------
10 10 # Imports
11 11 #-----------------------------------------------------------------------------
12 12 from pygments.token import Token
13 13
14 14 from IPython.nbconvert.tests.base import TestsBase
15 15 from .. import lexers
16 16
17 17
18 18 #-----------------------------------------------------------------------------
19 19 # Classes and functions
20 20 #-----------------------------------------------------------------------------
21 21 class TestLexers(TestsBase):
22 22 """Collection of lexers tests"""
23 23 def setUp(self):
24 24 self.lexer = lexers.IPythonLexer()
25 25
    def testIPythonLexer(self):
        fragment = '!echo $HOME\n'
        tokens = [
            (Token.Operator, '!'),
            (Token.Name.Builtin, 'echo'),
            (Token.Text, ' '),
            (Token.Name.Variable, '$HOME'),
            (Token.Text, '\n'),
        ]
        self.assertEqual(tokens, list(self.lexer.get_tokens(fragment)))

        fragment_2 = 'x = ' + fragment
        tokens_2 = [
            (Token.Name, 'x'),
            (Token.Text, ' '),
            (Token.Operator, '='),
            (Token.Text, ' '),
        ] + tokens
        self.assertEqual(tokens_2, list(self.lexer.get_tokens(fragment_2)))

        fragment_2 = 'x, = ' + fragment
        tokens_2 = [
            (Token.Name, 'x'),
            (Token.Punctuation, ','),
            (Token.Text, ' '),
            (Token.Operator, '='),
            (Token.Text, ' '),
        ] + tokens
        self.assertEqual(tokens_2, list(self.lexer.get_tokens(fragment_2)))

        fragment_2 = 'x, = %sx ' + fragment[1:]
        tokens_2 = [
            (Token.Name, 'x'),
            (Token.Punctuation, ','),
            (Token.Text, ' '),
            (Token.Operator, '='),
            (Token.Text, ' '),
            (Token.Operator, '%'),
            (Token.Keyword, 'sx'),
            (Token.Text, ' '),
        ] + tokens[1:]
        self.assertEqual(tokens_2, list(self.lexer.get_tokens(fragment_2)))

        fragment_2 = 'f = %R function () {}\n'
        tokens_2 = [
            (Token.Name, 'f'),
            (Token.Text, ' '),
            (Token.Operator, '='),
            (Token.Text, ' '),
            (Token.Operator, '%'),
            (Token.Keyword, 'R'),
            (Token.Text, ' function () {}\n'),
        ]
        self.assertEqual(tokens_2, list(self.lexer.get_tokens(fragment_2)))

        # Cell magic: the case added by this commit (#7558).
        fragment_2 = '\t%%xyz\n$foo\n'
        tokens_2 = [
            (Token.Text, '\t'),
            (Token.Operator, '%%'),
            (Token.Keyword, 'xyz'),
            (Token.Text, '\n$foo\n'),
        ]
        self.assertEqual(tokens_2, list(self.lexer.get_tokens(fragment_2)))
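
    # A quick manual check of the same rule outside the test harness (hedged
    # sketch, not part of the commit):
    #
    #     list(lexers.IPythonLexer().get_tokens('\t%%xyz\n$foo\n'))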