Issue #7548: Fixed parsing of line magic....
Lev Abalkin
@@ -1,506 +1,506 b''
1 1 # -*- coding: utf-8 -*-
2 2 """
3 3 Defines a variety of Pygments lexers for highlighting IPython code.
4 4
5 5 This includes:
6 6
7 7 IPythonLexer, IPython3Lexer
8 8 Lexers for pure IPython (python + magic/shell commands)
9 9
10 10 IPythonPartialTracebackLexer, IPythonTracebackLexer
11 11 Supports 2.x and 3.x via keyword `python3`. The partial traceback
12 12 lexer reads everything but the Python code appearing in a traceback.
13 13 The full lexer combines the partial lexer with an IPython lexer.
14 14
15 15 IPythonConsoleLexer
16 16 A lexer for IPython console sessions, with support for tracebacks.
17 17
18 18 IPyLexer
19 19 A friendly lexer which examines the first line of text and, from it,
20 20 decides whether to use an IPython lexer or an IPython console lexer.
21 21 This is probably the only lexer that needs to be explicitly added
22 22 to Pygments.
23 23
24 24 """
25 25 #-----------------------------------------------------------------------------
26 26 # Copyright (c) 2013, the IPython Development Team.
27 27 #
28 28 # Distributed under the terms of the Modified BSD License.
29 29 #
30 30 # The full license is in the file COPYING.txt, distributed with this software.
31 31 #-----------------------------------------------------------------------------
32 32
33 33 # Standard library
34 34 import re
35 35
36 36 # Third party
37 37 from pygments.lexers import BashLexer, PythonLexer, Python3Lexer
38 38 from pygments.lexer import (
39 39 Lexer, DelegatingLexer, RegexLexer, do_insertions, bygroups, using,
40 40 )
41 41 from pygments.token import (
42 42 Comment, Generic, Keyword, Literal, Name, Operator, Other, Text, Error,
43 43 )
44 44 from pygments.util import get_bool_opt
45 45
46 46 # Local
47 47 from IPython.testing.skipdoctest import skip_doctest
48 48
49 49 line_re = re.compile('.*?\n')
50 50
51 51 ipython_tokens = [
52 (r'(\%+)(\w+)\s+(\.*)(\n)', bygroups(Operator, Keyword,
52 (r'(%)(sx|sc|system)(.*)(\n)', bygroups(Operator, Keyword,
53 53 using(BashLexer), Text)),
54 (r'(\%+)(\w+)\b', bygroups(Operator, Keyword)),
54 (r'(%)(\w+)(.*\n)', bygroups(Operator, Keyword, Text)),
55 55 (r'^(!)(.+)(\n)', bygroups(Operator, using(BashLexer), Text)),
56 56 (r'^(.+)(=)(\s*)(!)(.+)(\n)', bygroups(
57 57 # With the limited syntax allowed on the l.h.s. of a shell capture,
58 58 # we don't need to differentiate between Python 2 and 3.
59 59 using(Python3Lexer), Operator, Text, Operator, using(BashLexer), Text)),
60 60 ]
61 61
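The two rewritten rules above are the substance of this fix: previously every `%magic` delegated whatever followed it to BashLexer, whereas now only the shell-capture magics (`%sx`, `%sc`, `%system`) do, and any other line magic's arguments are tagged as plain Text. A minimal sketch of the difference, assuming the module is importable as `IPython.nbconvert.utils.lexers` (the diff itself does not name the file):

    from IPython.nbconvert.utils.lexers import IPythonLexer  # assumed path

    lexer = IPythonLexer()

    # Shell-capture magic: the arguments are lexed as Bash.
    print(list(lexer.get_tokens(u'%sx echo $HOME\n')))

    # Ordinary line magic: the arguments stay plain Text under the new rule.
    print(list(lexer.get_tokens(u'%matplotlib inline\n')))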
62 62 def build_ipy_lexer(python3):
63 63 """Builds IPython lexers depending on the value of `python3`.
64 64
65 65 The lexer inherits from an appropriate Python lexer and then adds
66 66 information about IPython specific keywords (i.e. magic commands,
67 67 shell commands, etc.)
68 68
69 69 Parameters
70 70 ----------
71 71 python3 : bool
72 72 If `True`, then build an IPython lexer from a Python 3 lexer.
73 73
74 74 """
75 75 # It would be nice to have a single IPython lexer class which takes
76 76 # a boolean `python3`. But since there are two Python lexer classes,
77 77 # we will also have two IPython lexer classes.
78 78 if python3:
79 79 PyLexer = Python3Lexer
80 80 clsname = 'IPython3Lexer'
81 81 name = 'IPython3'
82 82 aliases = ['ipython3']
83 83 doc = """IPython3 Lexer"""
84 84 else:
85 85 PyLexer = PythonLexer
86 86 clsname = 'IPythonLexer'
87 87 name = 'IPython'
88 88 aliases = ['ipython2', 'ipython']
89 89 doc = """IPython Lexer"""
90 90
91 91 tokens = PyLexer.tokens.copy()
92 92 tokens['root'] = ipython_tokens + tokens['root']
93 93
94 94 attrs = {'name': name, 'aliases': aliases,
95 95 '__doc__': doc, 'tokens': tokens}
96 96
97 97 return type(name, (PyLexer,), attrs)
98 98
99 99
100 100 IPython3Lexer = build_ipy_lexer(python3=True)
101 101 IPythonLexer = build_ipy_lexer(python3=False)
102 102
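Because `build_ipy_lexer` returns ordinary classes via `type()`, the two generated lexers plug into the standard Pygments pipeline like any hand-written lexer. A hedged usage sketch (import path assumed, as above):

    from pygments import highlight
    from pygments.formatters import HtmlFormatter
    from IPython.nbconvert.utils.lexers import IPython3Lexer  # assumed path

    code = u"!ls -l\nresult = %sx echo hi\n"
    print(highlight(code, IPython3Lexer(), HtmlFormatter()))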
103 103
104 104 class IPythonPartialTracebackLexer(RegexLexer):
105 105 """
106 106 Partial lexer for IPython tracebacks.
107 107
108 108 Handles all the non-python output. This works for both Python 2.x and 3.x.
109 109
110 110 """
111 111 name = 'IPython Partial Traceback'
112 112
113 113 tokens = {
114 114 'root': [
115 115 # Tracebacks for syntax errors have a different style.
116 116 # For both types of tracebacks, we mark the first line with
117 117 # Generic.Traceback. For syntax errors, we mark the filename
118 118 # as we mark the filenames for non-syntax tracebacks.
119 119 #
120 120 # These two regexps define how IPythonConsoleLexer finds a
121 121 # traceback.
122 122 #
123 123 ## Non-syntax traceback
124 124 (r'^(\^C)?(-+\n)', bygroups(Error, Generic.Traceback)),
125 125 ## Syntax traceback
126 126 (r'^( File)(.*)(, line )(\d+\n)',
127 127 bygroups(Generic.Traceback, Name.Namespace,
128 128 Generic.Traceback, Literal.Number.Integer)),
129 129
130 130 # (Exception Identifier)(Whitespace)(Traceback Message)
131 131 (r'(?u)(^[^\d\W]\w*)(\s*)(Traceback.*?\n)',
132 132 bygroups(Name.Exception, Generic.Whitespace, Text)),
133 133 # (Module/Filename)(Text)(Callee)(Function Signature)
134 134 # Better options for callee and function signature?
135 135 (r'(.*)( in )(.*)(\(.*\)\n)',
136 136 bygroups(Name.Namespace, Text, Name.Entity, Name.Tag)),
137 137 # Regular line: (Whitespace)(Line Number)(Python Code)
138 138 (r'(\s*?)(\d+)(.*?\n)',
139 139 bygroups(Generic.Whitespace, Literal.Number.Integer, Other)),
140 140 # Emphasized line: (Arrow)(Line Number)(Python Code)
141 141 # Using Exception token so arrow color matches the Exception.
142 142 (r'(-*>?\s?)(\d+)(.*?\n)',
143 143 bygroups(Name.Exception, Literal.Number.Integer, Other)),
144 144 # (Exception Identifier)(Message)
145 145 (r'(?u)(^[^\d\W]\w*)(:.*?\n)',
146 146 bygroups(Name.Exception, Text)),
147 147 # Tag everything else as Other, will be handled later.
148 148 (r'.*\n', Other),
149 149 ],
150 150 }
151 151
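Everything this partial lexer cannot claim is tagged `Other`, which is what lets the delegating lexer below re-lex the embedded Python source. A small sketch (assumed import path; the traceback text is illustrative):

    from IPython.nbconvert.utils.lexers import IPythonPartialTracebackLexer  # assumed path

    sample = (u'ZeroDivisionError                Traceback (most recent call last)\n'
              u'----> 1 1/0\n')
    for token, value in IPythonPartialTracebackLexer().get_tokens(sample):
        print(token, repr(value))  # the code portion comes back as Token.Other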
152 152
153 153 class IPythonTracebackLexer(DelegatingLexer):
154 154 """
155 155 IPython traceback lexer.
156 156
157 157 For doctests, the tracebacks can be snipped as much as desired, with the
158 158 exception of the lines that designate a traceback. For non-syntax error
159 159 tracebacks, this is the line of hyphens. For syntax error tracebacks,
160 160 this is the line which lists the File and line number.
161 161
162 162 """
163 163 # The lexer inherits from DelegatingLexer. The "root" lexer is an
164 164 # appropriate IPython lexer, which depends on the value of the boolean
165 165 # `python3`. First, we parse with the partial IPython traceback lexer.
166 166 # Then, any code marked with the "Other" token is delegated to the root
167 167 # lexer.
168 168 #
169 169 name = 'IPython Traceback'
170 170 aliases = ['ipythontb']
171 171
172 172 def __init__(self, **options):
173 173 self.python3 = get_bool_opt(options, 'python3', False)
174 174 if self.python3:
175 175 self.aliases = ['ipython3tb']
176 176 else:
177 177 self.aliases = ['ipython2tb', 'ipythontb']
178 178
179 179 if self.python3:
180 180 IPyLexer = IPython3Lexer
181 181 else:
182 182 IPyLexer = IPythonLexer
183 183
184 184 DelegatingLexer.__init__(self, IPyLexer,
185 185 IPythonPartialTracebackLexer, **options)
186 186
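In other words, `IPythonPartialTracebackLexer` runs first, and whatever it marked `Other` is handed to the IPython lexer selected by `python3`. A hedged sketch of highlighting a hand-written, illustrative traceback (assumed import path):

    from pygments import highlight
    from pygments.formatters import TerminalFormatter
    from IPython.nbconvert.utils.lexers import IPythonTracebackLexer  # assumed path

    tb = (u'---------------------------------------------------------------------------\n'
          u'ZeroDivisionError                Traceback (most recent call last)\n'
          u'----> 1 1/0\n'
          u'\n'
          u'ZeroDivisionError: integer division or modulo by zero\n')
    print(highlight(tb, IPythonTracebackLexer(python3=False), TerminalFormatter()))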
187 187 @skip_doctest
188 188 class IPythonConsoleLexer(Lexer):
189 189 """
190 190 An IPython console lexer for IPython code-blocks and doctests, such as:
191 191
192 192 .. code-block:: rst
193 193
194 194 .. code-block:: ipythonconsole
195 195
196 196 In [1]: a = 'foo'
197 197
198 198 In [2]: a
199 199 Out[2]: 'foo'
200 200
201 201 In [3]: print a
202 202 foo
203 203
204 204 In [4]: 1 / 0
205 205
206 206
207 207 Support is also provided for IPython exceptions:
208 208
209 209 .. code-block:: rst
210 210
211 211 .. code-block:: ipythonconsole
212 212
213 213 In [1]: raise Exception
214 214
215 215 ---------------------------------------------------------------------------
216 216 Exception Traceback (most recent call last)
217 217 <ipython-input-1-fca2ab0ca76b> in <module>()
218 218 ----> 1 raise Exception
219 219
220 220 Exception:
221 221
222 222 """
223 223 name = 'IPython console session'
224 224 aliases = ['ipythonconsole']
225 225 mimetypes = ['text/x-ipython-console']
226 226
227 227 # The regexps used to determine what is input and what is output.
228 228 # The default prompts for IPython are:
229 229 #
230 230 # c.PromptManager.in_template = 'In [\#]: '
231 231 # c.PromptManager.in2_template = ' .\D.: '
232 232 # c.PromptManager.out_template = 'Out[\#]: '
233 233 #
234 234 in1_regex = r'In \[[0-9]+\]: '
235 235 in2_regex = r' \.\.+\.: '
236 236 out_regex = r'Out\[[0-9]+\]: '
237 237
238 238 #: The regex to determine when a traceback starts.
239 239 ipytb_start = re.compile(r'^(\^C)?(-+\n)|^( File)(.*)(, line )(\d+\n)')
240 240
241 241 def __init__(self, **options):
242 242 """Initialize the IPython console lexer.
243 243
244 244 Parameters
245 245 ----------
246 246 python3 : bool
247 247 If `True`, then the console inputs are parsed using a Python 3
248 248 lexer. Otherwise, they are parsed using a Python 2 lexer.
249 249 in1_regex : str
250 250 A regular expression string (compiled by the constructor) used to
251 251 detect the start of inputs. Although the IPython configuration
252 252 setting may have trailing whitespace, do not include it in the
253 253 regex. If `None`, then the default input prompt is assumed.
254 254 in2_regex : str
255 255 A regular expression string (compiled by the constructor) used to
256 256 detect the continuation of inputs. Although the IPython
257 257 configuration setting may have trailing whitespace, do not include
258 258 it in the regex. If `None`, then the default input prompt is assumed.
259 259 out_regex : str
260 260 A regular expression string (compiled by the constructor) used to
261 261 detect outputs. If `None`, then the default output prompt is assumed.
262 262
263 263 """
264 264 self.python3 = get_bool_opt(options, 'python3', False)
265 265 if self.python3:
266 266 self.aliases = ['ipython3console']
267 267 else:
268 268 self.aliases = ['ipython2console', 'ipythonconsole']
269 269
270 270 in1_regex = options.get('in1_regex', self.in1_regex)
271 271 in2_regex = options.get('in2_regex', self.in2_regex)
272 272 out_regex = options.get('out_regex', self.out_regex)
273 273
274 274 # So that we can work with input and output prompts which have been
275 275 # rstrip'd (possibly by editors) we also need rstrip'd variants. If
276 276 # we do not do this, then such prompts will be tagged as 'output'.
277 277 # The reason we can't just use the rstrip'd variants instead is that
278 278 # we want any whitespace associated with the prompt to be inserted
279 279 # with the token. This allows formatted code to be modified so as to
280 280 # hide the appearance of prompts, with the whitespace included. One
281 281 # example use of this is copybutton.js in the standard Python docs.
282 282 in1_regex_rstrip = in1_regex.rstrip() + '\n'
283 283 in2_regex_rstrip = in2_regex.rstrip() + '\n'
284 284 out_regex_rstrip = out_regex.rstrip() + '\n'
285 285
286 286 # Compile and save them all.
287 287 attrs = ['in1_regex', 'in2_regex', 'out_regex',
288 288 'in1_regex_rstrip', 'in2_regex_rstrip', 'out_regex_rstrip']
289 289 for attr in attrs:
290 290 self.__setattr__(attr, re.compile(locals()[attr]))
291 291
292 292 Lexer.__init__(self, **options)
293 293
294 294 if self.python3:
295 295 pylexer = IPython3Lexer
296 296 tblexer = IPythonTracebackLexer
297 297 else:
298 298 pylexer = IPythonLexer
299 299 tblexer = IPythonTracebackLexer
300 300
301 301 self.pylexer = pylexer(**options)
302 302 self.tblexer = tblexer(**options)
303 303
304 304 self.reset()
305 305
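Note that the three prompt options are regular-expression *strings*: the constructor rstrips and compiles them itself (see the loop above), which is also why compiled patterns cannot be passed directly. A sketch with custom prompts (the prompt values here are hypothetical):

    from IPython.nbconvert.utils.lexers import IPythonConsoleLexer  # assumed path

    console = IPythonConsoleLexer(python3=True,
                                  in1_regex=r'>>> ',      # hypothetical prompts
                                  in2_regex=r'\.\.\. ',
                                  out_regex=r'--> ')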
306 306 def reset(self):
307 307 self.mode = 'output'
308 308 self.index = 0
309 309 self.buffer = u''
310 310 self.insertions = []
311 311
312 312 def buffered_tokens(self):
313 313 """
314 314 Generator of unprocessed tokens after doing insertions and before
315 315 changing to a new state.
316 316
317 317 """
318 318 if self.mode == 'output':
319 319 tokens = [(0, Generic.Output, self.buffer)]
320 320 elif self.mode == 'input':
321 321 tokens = self.pylexer.get_tokens_unprocessed(self.buffer)
322 322 else: # traceback
323 323 tokens = self.tblexer.get_tokens_unprocessed(self.buffer)
324 324
325 325 for i, t, v in do_insertions(self.insertions, tokens):
326 326 # All token indexes are relative to the buffer.
327 327 yield self.index + i, t, v
328 328
329 329 # Clear it all
330 330 self.index += len(self.buffer)
331 331 self.buffer = u''
332 332 self.insertions = []
333 333
334 334 def get_mci(self, line):
335 335 """
336 336 Parses the line and returns a 3-tuple: (mode, code, insertion).
337 337
338 338 `mode` is the next mode (or state) of the lexer, and is always equal
339 339 to 'input', 'output', or 'tb'.
340 340
341 341 `code` is a portion of the line that should be added to the buffer
342 342 corresponding to the next mode and eventually lexed by another lexer.
343 343 For example, `code` could be Python code if `mode` were 'input'.
344 344
345 345 `insertion` is a 3-tuple (index, token, text) representing an
346 346 unprocessed "token" that will be inserted into the stream of tokens
347 347 that are created from the buffer once we change modes. This is usually
348 348 the input or output prompt.
349 349
350 350 In general, the next mode depends on current mode and on the contents
351 351 of `line`.
352 352
353 353 """
354 354 # To reduce the number of regex match checks, we have multiple
355 355 # 'if' blocks instead of 'if-elif' blocks.
356 356
357 357 # Check for possible end of input
358 358 in2_match = self.in2_regex.match(line)
359 359 in2_match_rstrip = self.in2_regex_rstrip.match(line)
360 360 if (in2_match and in2_match.group().rstrip() == line.rstrip()) or \
361 361 in2_match_rstrip:
362 362 end_input = True
363 363 else:
364 364 end_input = False
365 365 if end_input and self.mode != 'tb':
366 366 # Only look for an end of input when not in tb mode.
367 367 # An ellipsis could appear within the traceback.
368 368 mode = 'output'
369 369 code = u''
370 370 insertion = (0, Generic.Prompt, line)
371 371 return mode, code, insertion
372 372
373 373 # Check for output prompt
374 374 out_match = self.out_regex.match(line)
375 375 out_match_rstrip = self.out_regex_rstrip.match(line)
376 376 if out_match or out_match_rstrip:
377 377 mode = 'output'
378 378 if out_match:
379 379 idx = out_match.end()
380 380 else:
381 381 idx = out_match_rstrip.end()
382 382 code = line[idx:]
383 383 # Use the 'heading' token for output. We cannot use Generic.Error
384 384 # since it would conflict with exceptions.
385 385 insertion = (0, Generic.Heading, line[:idx])
386 386 return mode, code, insertion
387 387
388 388
389 389 # Check for input or continuation prompt (non stripped version)
390 390 in1_match = self.in1_regex.match(line)
391 391 if in1_match or (in2_match and self.mode != 'tb'):
392 392 # New input or when not in tb, continued input.
393 393 # We do not check for continued input when in tb since it is
394 394 # allowable to replace a long stack with an ellipsis.
395 395 mode = 'input'
396 396 if in1_match:
397 397 idx = in1_match.end()
398 398 else: # in2_match
399 399 idx = in2_match.end()
400 400 code = line[idx:]
401 401 insertion = (0, Generic.Prompt, line[:idx])
402 402 return mode, code, insertion
403 403
404 404 # Check for input or continuation prompt (stripped version)
405 405 in1_match_rstrip = self.in1_regex_rstrip.match(line)
406 406 if in1_match_rstrip or (in2_match_rstrip and self.mode != 'tb'):
407 407 # New input or when not in tb, continued input.
408 408 # We do not check for continued input when in tb since it is
409 409 # allowable to replace a long stack with an ellipsis.
410 410 mode = 'input'
411 411 if in1_match_rstrip:
412 412 idx = in1_match_rstrip.end()
413 413 else: # in2_match
414 414 idx = in2_match_rstrip.end()
415 415 code = line[idx:]
416 416 insertion = (0, Generic.Prompt, line[:idx])
417 417 return mode, code, insertion
418 418
419 419 # Check for traceback
420 420 if self.ipytb_start.match(line):
421 421 mode = 'tb'
422 422 code = line
423 423 insertion = None
424 424 return mode, code, insertion
425 425
426 426 # All other stuff...
427 427 if self.mode in ('input', 'output'):
428 428 # We assume all other text is output. Multiline input that
429 429 # does not use the continuation marker cannot be detected.
430 430 # For example, the 3 in the following is clearly output:
431 431 #
432 432 # In [1]: print 3
433 433 # 3
434 434 #
435 435 # But the following second line is part of the input:
436 436 #
437 437 # In [2]: while True:
438 438 # print True
439 439 #
440 440 # In both cases, the 2nd line will be 'output'.
441 441 #
442 442 mode = 'output'
443 443 else:
444 444 mode = 'tb'
445 445
446 446 code = line
447 447 insertion = None
448 448
449 449 return mode, code, insertion
450 450
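A concrete trace may help: on a fresh lexer (which starts in 'output' mode, per `reset()`), a prompt line is split into code destined for the buffer plus a prompt insertion. Sketch (assumed path; the commented result is what the rules above produce):

    from IPython.nbconvert.utils.lexers import IPythonConsoleLexer  # assumed path

    console = IPythonConsoleLexer()
    print(console.get_mci(u'In [1]: x = 1\n'))
    # -> ('input', u'x = 1\n', (0, Token.Generic.Prompt, u'In [1]: '))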
451 451 def get_tokens_unprocessed(self, text):
452 452 self.reset()
453 453 for match in line_re.finditer(text):
454 454 line = match.group()
455 455 mode, code, insertion = self.get_mci(line)
456 456
457 457 if mode != self.mode:
458 458 # Yield buffered tokens before transitioning to new mode.
459 459 for token in self.buffered_tokens():
460 460 yield token
461 461 self.mode = mode
462 462
463 463 if insertion:
464 464 self.insertions.append((len(self.buffer), [insertion]))
465 465 self.buffer += code
466 466 else:
467 467 for token in self.buffered_tokens():
468 468 yield token
469 469
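Putting it together, a console transcript is chopped line by line into output, input, and traceback buffers, each flushed through the matching sub-lexer when the mode changes. Sketch (assumed path):

    from IPython.nbconvert.utils.lexers import IPythonConsoleLexer  # assumed path

    session = (u"In [1]: a = 'foo'\n"
               u"\n"
               u"In [2]: a\n"
               u"Out[2]: 'foo'\n")
    for token, value in IPythonConsoleLexer().get_tokens(session):
        print(token, repr(value))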
470 470 class IPyLexer(Lexer):
471 471 """
472 472 Primary lexer for all IPython-like code.
473 473
474 474 This is a simple helper lexer. If an input prompt matching
475 475 "In \[[0-9]+\]:" appears anywhere in the text, the entire text is parsed
476 476 with an IPython console lexer; otherwise, with an IPython lexer.
477 477
478 478 The goal is to reduce the number of lexers that are registered
479 479 with Pygments.
480 480
481 481 """
482 482 name = 'IPy session'
483 483 aliases = ['ipy']
484 484
485 485 def __init__(self, **options):
486 486 self.python3 = get_bool_opt(options, 'python3', False)
487 487 if self.python3:
488 488 self.aliases = ['ipy3']
489 489 else:
490 490 self.aliases = ['ipy2', 'ipy']
491 491
492 492 Lexer.__init__(self, **options)
493 493
494 494 self.IPythonLexer = IPythonLexer(**options)
495 495 self.IPythonConsoleLexer = IPythonConsoleLexer(**options)
496 496
497 497 def get_tokens_unprocessed(self, text):
498 498 # Search for the input prompt anywhere...this allows code blocks to
499 499 # begin with comments as well.
500 500 if re.match(r'.*(In \[[0-9]+\]:)', text.strip(), re.DOTALL):
501 501 lex = self.IPythonConsoleLexer
502 502 else:
503 503 lex = self.IPythonLexer
504 504 for token in lex.get_tokens_unprocessed(text):
505 505 yield token
506 506
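Since IPyLexer only dispatches, registering it alone with Pygments covers both cases, which is the module docstring's point. A dispatch sketch (assumed path):

    from IPython.nbconvert.utils.lexers import IPyLexer  # assumed path

    ipy = IPyLexer()
    # A prompt anywhere in the text routes to the console lexer ...
    list(ipy.get_tokens(u'# setup\nIn [1]: 1 + 1\nOut[1]: 2\n'))
    # ... and prompt-free text routes to the plain IPython lexer.
    list(ipy.get_tokens(u'%timeit f()\n'))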
@@ -1,53 +1,79 b''
1 1 """Test lexers module"""
2 2 #-----------------------------------------------------------------------------
3 3 # Copyright (C) 2014 The IPython Development Team
4 4 #
5 5 # Distributed under the terms of the BSD License. The full license is in
6 6 # the file COPYING, distributed as part of this software.
7 7 #-----------------------------------------------------------------------------
8 8
9 9 #-----------------------------------------------------------------------------
10 10 # Imports
11 11 #-----------------------------------------------------------------------------
12 12 from pygments.token import Token
13 13
14 14 from IPython.nbconvert.tests.base import TestsBase
15 15 from .. import lexers
16 16
17 17
18 18 #-----------------------------------------------------------------------------
19 19 # Classes and functions
20 20 #-----------------------------------------------------------------------------
21 21 class TestLexers(TestsBase):
22 22 """Collection of lexers tests"""
23 23 def setUp(self):
24 24 self.lexer = lexers.IPythonLexer()
25 25
26 26 def testIPythonLexer(self):
27 27 fragment = '!echo $HOME\n'
28 28 tokens = [
29 29 (Token.Operator, '!'),
30 30 (Token.Name.Builtin, 'echo'),
31 31 (Token.Text, ' '),
32 32 (Token.Name.Variable, '$HOME'),
33 33 (Token.Text, '\n'),
34 34 ]
35 35 self.assertEqual(tokens, list(self.lexer.get_tokens(fragment)))
36 36
37 37 fragment_2 = 'x = ' + fragment
38 38 tokens_2 = [
39 39 (Token.Name, 'x'),
40 40 (Token.Text, ' '),
41 41 (Token.Operator, '='),
42 42 (Token.Text, ' '),
43 43 ] + tokens
44 44 self.assertEqual(tokens_2, list(self.lexer.get_tokens(fragment_2)))
45
45 46 fragment_2 = 'x, = ' + fragment
46 47 tokens_2 = [
47 48 (Token.Name, 'x'),
48 49 (Token.Punctuation, ','),
49 50 (Token.Text, ' '),
50 51 (Token.Operator, '='),
51 52 (Token.Text, ' '),
52 53 ] + tokens
53 54 self.assertEqual(tokens_2, list(self.lexer.get_tokens(fragment_2)))
55
56 fragment_2 = 'x, = %sx ' + fragment[1:]
57 tokens_2 = [
58 (Token.Name, 'x'),
59 (Token.Punctuation, ','),
60 (Token.Text, ' '),
61 (Token.Operator, '='),
62 (Token.Text, ' '),
63 (Token.Operator, '%'),
64 (Token.Keyword, 'sx'),
65 (Token.Text, ' '),
66 ] + tokens[1:]
67 self.assertEqual(tokens_2, list(self.lexer.get_tokens(fragment_2)))
68
69 fragment_2 = 'f = %R function () {}\n'
70 tokens_2 = [
71 (Token.Name, 'f'),
72 (Token.Text, ' '),
73 (Token.Operator, '='),
74 (Token.Text, ' '),
75 (Token.Operator, '%'),
76 (Token.Keyword, 'R'),
77 (Token.Text, ' function () {}\n'),
78 ]
79 self.assertEqual(tokens_2, list(self.lexer.get_tokens(fragment_2)))
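One further regression check in the same style could pin down the second new rule, namely that a plain line magic keeps its arguments as Text. The fragment below is a hypothetical addition to this test method, not part of the commit:

    fragment_2 = '%matplotlib inline\n'
    tokens_2 = [
        (Token.Operator, '%'),
        (Token.Keyword, 'matplotlib'),
        (Token.Text, ' inline\n'),
    ]
    self.assertEqual(tokens_2, list(self.lexer.get_tokens(fragment_2)))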