##// END OF EJS Templates
#7558: Simplified handling of the ! escape.
Lev Abalkin -
Show More
@@ -1,507 +1,504 b''
1 1 # -*- coding: utf-8 -*-
2 2 """
3 3 Defines a variety of Pygments lexers for highlighting IPython code.
4 4
5 5 This includes:
6 6
7 7 IPythonLexer, IPython3Lexer
8 8 Lexers for pure IPython (python + magic/shell commands)
9 9
10 10 IPythonPartialTracebackLexer, IPythonTracebackLexer
11 11 Supports 2.x and 3.x via keyword `python3`. The partial traceback
12 12 lexer reads everything but the Python code appearing in a traceback.
13 13 The full lexer combines the partial lexer with an IPython lexer.
14 14
15 15 IPythonConsoleLexer
16 16 A lexer for IPython console sessions, with support for tracebacks.
17 17
18 18 IPyLexer
19 19 A friendly lexer which examines the first line of text and from it,
20 20 decides whether to use an IPython lexer or an IPython console lexer.
21 21 This is probably the only lexer that needs to be explicitly added
22 22 to Pygments.
23 23
24 24 """
25 25 #-----------------------------------------------------------------------------
26 26 # Copyright (c) 2013, the IPython Development Team.
27 27 #
28 28 # Distributed under the terms of the Modified BSD License.
29 29 #
30 30 # The full license is in the file COPYING.txt, distributed with this software.
31 31 #-----------------------------------------------------------------------------
32 32
33 33 # Standard library
34 34 import re
35 35
36 36 # Third party
37 37 from pygments.lexers import BashLexer, PythonLexer, Python3Lexer
38 38 from pygments.lexer import (
39 39 Lexer, DelegatingLexer, RegexLexer, do_insertions, bygroups, using,
40 40 )
41 41 from pygments.token import (
42 42 Comment, Generic, Keyword, Literal, Name, Operator, Other, Text, Error,
43 43 )
44 44 from pygments.util import get_bool_opt
45 45
46 46 # Local
47 47 from IPython.testing.skipdoctest import skip_doctest
48 48
# Matches a single line of text, including its trailing newline; used by
# IPythonConsoleLexer to walk console transcripts one line at a time.
line_re = re.compile('.*?\n')
50 50
# Token rules prepended to the Python lexer's 'root' state so that
# IPython-specific syntax (magics and shell escapes) is recognized before
# the plain Python rules.  (The rendered diff contained both the old
# `^(!)` / `lhs = !cmd` rules and their replacements; this is the
# post-change rule set from the "Simplified handling of the ! escape"
# commit: `!!` at line start, then `!` anywhere.)
ipython_tokens = [
    # Cell magic: %%name followed by the rest of the cell.
    (r"(?s)(\s*)(%%)(\w+)(.*)", bygroups(Text, Operator, Keyword, Text)),
    # Shell-related line magics whose argument is shell code.
    (r'(%)(sx|sc|system)(.*)(\n)', bygroups(Operator, Keyword,
                                            using(BashLexer), Text)),
    # Any other line magic: %name args.
    (r'(%)(\w+)(.*\n)', bygroups(Operator, Keyword, Text)),
    # Double escape at line start: !!cmd (capture output); must precede
    # the single-! rule so '!!' lexes as one Operator token.
    (r'^(!!)(.+)(\n)', bygroups(Operator, using(BashLexer), Text)),
    # Shell escape anywhere on the line: !cmd, lexed as bash.
    (r'(!)(.+)(\n)', bygroups(Operator, using(BashLexer), Text)),
]
62 59
def build_ipy_lexer(python3):
    """Builds IPython lexers depending on the value of `python3`.

    The lexer inherits from an appropriate Python lexer and then adds
    information about IPython specific keywords (i.e. magic commands,
    shell commands, etc.)

    Parameters
    ----------
    python3 : bool
        If `True`, then build an IPython lexer from a Python 3 lexer.

    Returns
    -------
    type
        A new lexer class (named 'IPython3' or 'IPython') derived from
        the corresponding Pygments Python lexer.

    """
    # It would be nice to have a single IPython lexer class which takes
    # a boolean `python3`. But since there are two Python lexer classes,
    # we will also have two IPython lexer classes.
    # NOTE: the original also computed a `clsname` variable here, but it
    # was never used -- `type()` below takes `name` -- so it is removed.
    if python3:
        PyLexer = Python3Lexer
        name = 'IPython3'
        aliases = ['ipython3']
        doc = """IPython3 Lexer"""
    else:
        PyLexer = PythonLexer
        name = 'IPython'
        aliases = ['ipython2', 'ipython']
        doc = """IPython Lexer"""

    # Prepend the IPython rules so they take precedence over the plain
    # Python rules in the 'root' state.
    tokens = PyLexer.tokens.copy()
    tokens['root'] = ipython_tokens + tokens['root']

    attrs = {'name': name, 'aliases': aliases,
             '__doc__': doc, 'tokens': tokens}

    return type(name, (PyLexer,), attrs)
99 96
100 97
# Concrete lexer classes produced by the factory above; these are the
# names registered with Pygments.
IPython3Lexer = build_ipy_lexer(python3=True)
IPythonLexer = build_ipy_lexer(python3=False)
103 100
104 101
class IPythonPartialTracebackLexer(RegexLexer):
    """
    Partial lexer for IPython tracebacks.

    Handles all the non-python output. This works for both Python 2.x and 3.x.

    Anything the rules below cannot classify is tagged ``Other``; the
    delegating lexer later re-lexes those spans as Python code.
    """
    name = 'IPython Partial Traceback'

    # NOTE: rule order matters -- earlier patterns win, so the traceback
    # header rules must precede the generic line rules below.
    tokens = {
        'root': [
            # Tracebacks for syntax errors have a different style.
            # For both types of tracebacks, we mark the first line with
            # Generic.Traceback.  For syntax errors, we mark the filename
            # as we mark the filenames for non-syntax tracebacks.
            #
            # These two regexps define how IPythonConsoleLexer finds a
            # traceback.
            #
            ## Non-syntax traceback: optional ^C then the hyphen rule line.
            (r'^(\^C)?(-+\n)', bygroups(Error, Generic.Traceback)),
            ## Syntax traceback: '  File "...", line N'
            # (leading two spaces reconstructed; the HTML render collapsed
            # whitespace -- confirm against upstream source)
            (r'^(  File)(.*)(, line )(\d+\n)',
             bygroups(Generic.Traceback, Name.Namespace,
                      Generic.Traceback, Literal.Number.Integer)),

            # (Exception Identifier)(Whitespace)(Traceback Message)
            (r'(?u)(^[^\d\W]\w*)(\s*)(Traceback.*?\n)',
             bygroups(Name.Exception, Generic.Whitespace, Text)),
            # (Module/Filename)(Text)(Callee)(Function Signature)
            # Better options for callee and function signature?
            (r'(.*)( in )(.*)(\(.*\)\n)',
             bygroups(Name.Namespace, Text, Name.Entity, Name.Tag)),
            # Regular line: (Whitespace)(Line Number)(Python Code)
            (r'(\s*?)(\d+)(.*?\n)',
             bygroups(Generic.Whitespace, Literal.Number.Integer, Other)),
            # Emphasized line: (Arrow)(Line Number)(Python Code)
            # Using Exception token so arrow color matches the Exception.
            (r'(-*>?\s?)(\d+)(.*?\n)',
             bygroups(Name.Exception, Literal.Number.Integer, Other)),
            # (Exception Identifier)(Message)
            (r'(?u)(^[^\d\W]\w*)(:.*?\n)',
             bygroups(Name.Exception, Text)),
            # Tag everything else as Other, will be handled later.
            (r'.*\n', Other),
        ],
    }
152 149
153 150
class IPythonTracebackLexer(DelegatingLexer):
    """
    IPython traceback lexer.

    For doctests, the tracebacks can be snipped as much as desired with the
    exception to the lines that designate a traceback. For non-syntax error
    tracebacks, this is the line of hyphens. For syntax error tracebacks,
    this is the line which lists the File and line number.

    """
    # The lexer inherits from DelegatingLexer.  The "root" lexer is an
    # appropriate IPython lexer, which depends on the value of the boolean
    # `python3`.  First, we parse with the partial IPython traceback lexer.
    # Then, any code marked with the "Other" token is delegated to the root
    # lexer.
    #
    name = 'IPython Traceback'
    aliases = ['ipythontb']

    def __init__(self, **options):
        """Initialize the traceback lexer.

        Parameters
        ----------
        python3 : bool
            If `True`, Python code in the traceback is delegated to the
            IPython3 lexer; otherwise to the Python-2 IPython lexer.
        """
        self.python3 = get_bool_opt(options, 'python3', False)
        # Choose both the aliases and the delegate ("root") lexer in a
        # single branch; the original tested `self.python3` twice.
        if self.python3:
            self.aliases = ['ipython3tb']
            IPyLexer = IPython3Lexer
        else:
            self.aliases = ['ipython2tb', 'ipythontb']
            IPyLexer = IPythonLexer

        DelegatingLexer.__init__(self, IPyLexer,
                                 IPythonPartialTracebackLexer, **options)
187 184
@skip_doctest
class IPythonConsoleLexer(Lexer):
    """
    An IPython console lexer for IPython code-blocks and doctests, such as:

    .. code-block:: rst

        .. code-block:: ipythonconsole

            In [1]: a = 'foo'

            In [2]: a
            Out[2]: 'foo'

            In [3]: print a
            foo

            In [4]: 1 / 0


    Support is also provided for IPython exceptions:

    .. code-block:: rst

        .. code-block:: ipythonconsole

            In [1]: raise Exception

            ---------------------------------------------------------------------------
            Exception                                 Traceback (most recent call last)
            <ipython-input-1-fca2ab0ca76b> in <module>()
            ----> 1 raise Exception

            Exception:

    """
    name = 'IPython console session'
    aliases = ['ipythonconsole']
    mimetypes = ['text/x-ipython-console']

    # The regexps used to determine what is input and what is output.
    # The default prompts for IPython are:
    #
    #    c.PromptManager.in_template  = 'In [\#]: '
    #    c.PromptManager.in2_template = '   .\D.: '
    #    c.PromptManager.out_template = 'Out[\#]: '
    #
    in1_regex = r'In \[[0-9]+\]: '
    # Continuation prompt '   ...: ' -- leading spaces reconstructed; the
    # HTML render collapsed whitespace, confirm against upstream source.
    in2_regex = r'   \.\.+\.: '
    out_regex = r'Out\[[0-9]+\]: '

    #: The regex to determine when a traceback starts (hyphen rule or a
    #: syntax-error '  File ..., line N' header; mirrors the first two
    #: rules of IPythonPartialTracebackLexer).
    ipytb_start = re.compile(r'^(\^C)?(-+\n)|^(  File)(.*)(, line )(\d+\n)')

    def __init__(self, **options):
        """Initialize the IPython console lexer.

        Parameters
        ----------
        python3 : bool
            If `True`, then the console inputs are parsed using a Python 3
            lexer. Otherwise, they are parsed using a Python 2 lexer.
        in1_regex : RegexObject
            The compiled regular expression used to detect the start
            of inputs. Although the IPython configuration setting may have a
            trailing whitespace, do not include it in the regex. If `None`,
            then the default input prompt is assumed.
        in2_regex : RegexObject
            The compiled regular expression used to detect the continuation
            of inputs. Although the IPython configuration setting may have a
            trailing whitespace, do not include it in the regex. If `None`,
            then the default input prompt is assumed.
        out_regex : RegexObject
            The compiled regular expression used to detect outputs. If `None`,
            then the default output prompt is assumed.

        """
        self.python3 = get_bool_opt(options, 'python3', False)
        if self.python3:
            self.aliases = ['ipython3console']
        else:
            self.aliases = ['ipython2console', 'ipythonconsole']

        in1_regex = options.get('in1_regex', self.in1_regex)
        in2_regex = options.get('in2_regex', self.in2_regex)
        out_regex = options.get('out_regex', self.out_regex)

        # So that we can work with input and output prompts which have been
        # rstrip'd (possibly by editors) we also need rstrip'd variants. If
        # we do not do this, then such prompts will be tagged as 'output'.
        # The reason can't just use the rstrip'd variants instead is because
        # we want any whitespace associated with the prompt to be inserted
        # with the token. This allows formatted code to be modified so as hide
        # the appearance of prompts, with the whitespace included. One example
        # use of this is in copybutton.js from the standard lib Python docs.
        in1_regex_rstrip = in1_regex.rstrip() + '\n'
        in2_regex_rstrip = in2_regex.rstrip() + '\n'
        out_regex_rstrip = out_regex.rstrip() + '\n'

        # Compile and save them all.
        # NOTE(review): looking up the locals by name via locals() is
        # fragile (breaks if a local is renamed) -- consider an explicit
        # dict of name -> pattern instead.
        attrs = ['in1_regex', 'in2_regex', 'out_regex',
                 'in1_regex_rstrip', 'in2_regex_rstrip', 'out_regex_rstrip']
        for attr in attrs:
            self.__setattr__(attr, re.compile(locals()[attr]))

        Lexer.__init__(self, **options)

        # NOTE(review): both branches assign the same traceback lexer;
        # presumably a Python-3-specific one was intended or the branch
        # could collapse -- confirm before simplifying.
        if self.python3:
            pylexer = IPython3Lexer
            tblexer = IPythonTracebackLexer
        else:
            pylexer = IPythonLexer
            tblexer = IPythonTracebackLexer

        self.pylexer = pylexer(**options)
        self.tblexer = tblexer(**options)

        self.reset()

    def reset(self):
        """Reset the lexer's line-accumulation state before a new text."""
        # Current mode: 'output', 'input' or 'tb' (traceback).
        self.mode = 'output'
        # Offset of self.buffer within the overall text being lexed.
        self.index = 0
        # Lines accumulated for the current mode, flushed on mode change.
        self.buffer = u''
        # Pending (position, [(idx, token, text)]) prompt insertions.
        self.insertions = []

    def buffered_tokens(self):
        """
        Generator of unprocessed tokens after doing insertions and before
        changing to a new state.

        """
        # Lex the buffered text with the lexer matching the current mode.
        if self.mode == 'output':
            tokens = [(0, Generic.Output, self.buffer)]
        elif self.mode == 'input':
            tokens = self.pylexer.get_tokens_unprocessed(self.buffer)
        else: # traceback
            tokens = self.tblexer.get_tokens_unprocessed(self.buffer)

        for i, t, v in do_insertions(self.insertions, tokens):
            # All token indexes are relative to the buffer.
            yield self.index + i, t, v

        # Clear it all
        self.index += len(self.buffer)
        self.buffer = u''
        self.insertions = []

    def get_mci(self, line):
        """
        Parses the line and returns a 3-tuple: (mode, code, insertion).

        `mode` is the next mode (or state) of the lexer, and is always equal
        to 'input', 'output', or 'tb'.

        `code` is a portion of the line that should be added to the buffer
        corresponding to the next mode and eventually lexed by another lexer.
        For example, `code` could be Python code if `mode` were 'input'.

        `insertion` is a 3-tuple (index, token, text) representing an
        unprocessed "token" that will be inserted into the stream of tokens
        that are created from the buffer once we change modes. This is usually
        the input or output prompt.

        In general, the next mode depends on current mode and on the contents
        of `line`.

        """
        # To reduce the number of regex match checks, we have multiple
        # 'if' blocks instead of 'if-elif' blocks.

        # Check for possible end of input: a bare continuation prompt
        # (nothing after '...:') terminates a multiline input.
        in2_match = self.in2_regex.match(line)
        in2_match_rstrip = self.in2_regex_rstrip.match(line)
        if (in2_match and in2_match.group().rstrip() == line.rstrip()) or \
           in2_match_rstrip:
            end_input = True
        else:
            end_input = False
        if end_input and self.mode != 'tb':
            # Only look for an end of input when not in tb mode.
            # An ellipsis could appear within the traceback.
            mode = 'output'
            code = u''
            insertion = (0, Generic.Prompt, line)
            return mode, code, insertion

        # Check for output prompt
        out_match = self.out_regex.match(line)
        out_match_rstrip = self.out_regex_rstrip.match(line)
        if out_match or out_match_rstrip:
            mode = 'output'
            if out_match:
                idx = out_match.end()
            else:
                idx = out_match_rstrip.end()
            code = line[idx:]
            # Use the 'heading' token for output.  We cannot use Generic.Error
            # since it would conflict with exceptions.
            insertion = (0, Generic.Heading, line[:idx])
            return mode, code, insertion


        # Check for input or continuation prompt (non stripped version)
        in1_match = self.in1_regex.match(line)
        if in1_match or (in2_match and self.mode != 'tb'):
            # New input or when not in tb, continued input.
            # We do not check for continued input when in tb since it is
            # allowable to replace a long stack with an ellipsis.
            mode = 'input'
            if in1_match:
                idx = in1_match.end()
            else: # in2_match
                idx = in2_match.end()
            code = line[idx:]
            insertion = (0, Generic.Prompt, line[:idx])
            return mode, code, insertion

        # Check for input or continuation prompt (stripped version)
        in1_match_rstrip = self.in1_regex_rstrip.match(line)
        if in1_match_rstrip or (in2_match_rstrip and self.mode != 'tb'):
            # New input or when not in tb, continued input.
            # We do not check for continued input when in tb since it is
            # allowable to replace a long stack with an ellipsis.
            mode = 'input'
            if in1_match_rstrip:
                idx = in1_match_rstrip.end()
            else: # in2_match
                idx = in2_match_rstrip.end()
            code = line[idx:]
            insertion = (0, Generic.Prompt, line[:idx])
            return mode, code, insertion

        # Check for traceback
        if self.ipytb_start.match(line):
            mode = 'tb'
            code = line
            insertion = None
            return mode, code, insertion

        # All other stuff...
        if self.mode in ('input', 'output'):
            # We assume all other text is output. Multiline input that
            # does not use the continuation marker cannot be detected.
            # For example, the 3 in the following is clearly output:
            #
            #    In [1]: print 3
            #    3
            #
            # But the following second line is part of the input:
            #
            #    In [2]: while True:
            #        print True
            #
            # In both cases, the 2nd line will be 'output'.
            #
            mode = 'output'
        else:
            mode = 'tb'

        code = line
        insertion = None

        return mode, code, insertion

    def get_tokens_unprocessed(self, text):
        """Lex `text` line by line, flushing the buffer on mode changes."""
        self.reset()
        for match in line_re.finditer(text):
            line = match.group()
            mode, code, insertion = self.get_mci(line)

            if mode != self.mode:
                # Yield buffered tokens before transitioning to new mode.
                for token in self.buffered_tokens():
                    yield token
                self.mode = mode

            if insertion:
                self.insertions.append((len(self.buffer), [insertion]))
            self.buffer += code
        else:
            # for-else: the loop always completes; flush whatever is left
            # in the buffer after the final line.
            for token in self.buffered_tokens():
                yield token
470 467
class IPyLexer(Lexer):
    r"""
    Primary lexer for all IPython-like code.

    This is a simple helper lexer. If the first line of the text begins with
    "In \[[0-9]+\]:", then the entire text is parsed with an IPython console
    lexer. If not, then the entire text is parsed with an IPython lexer.

    The goal is to reduce the number of lexers that are registered
    with Pygments.

    """
    name = 'IPy session'
    aliases = ['ipy']

    def __init__(self, **options):
        """Set up aliases and the two delegate lexers.

        Parameters
        ----------
        python3 : bool
            If `True`, register under the 'ipy3' alias.
        """
        self.python3 = get_bool_opt(options, 'python3', False)
        self.aliases = ['ipy3'] if self.python3 else ['ipy2', 'ipy']

        Lexer.__init__(self, **options)

        # One lexer per possible interpretation of the text.
        self.IPythonLexer = IPythonLexer(**options)
        self.IPythonConsoleLexer = IPythonConsoleLexer(**options)

    def get_tokens_unprocessed(self, text):
        """Dispatch the whole text to the console or plain IPython lexer."""
        # Search for the input prompt anywhere...this allows code blocks to
        # begin with comments as well.
        has_prompt = re.match(r'.*(In \[[0-9]+\]:)', text.strip(), re.DOTALL)
        delegate = self.IPythonConsoleLexer if has_prompt else self.IPythonLexer
        for token in delegate.get_tokens_unprocessed(text):
            yield token
507 504
@@ -1,88 +1,94 b''
1 1 """Test lexers module"""
2 2 #-----------------------------------------------------------------------------
3 3 # Copyright (C) 2014 The IPython Development Team
4 4 #
5 5 # Distributed under the terms of the BSD License. The full license is in
6 6 # the file COPYING, distributed as part of this software.
7 7 #-----------------------------------------------------------------------------
8 8
9 9 #-----------------------------------------------------------------------------
10 10 # Imports
11 11 #-----------------------------------------------------------------------------
12 12 from pygments.token import Token
13 13
14 14 from IPython.nbconvert.tests.base import TestsBase
15 15 from .. import lexers
16 16
17 17
18 18 #-----------------------------------------------------------------------------
19 19 # Classes and functions
20 20 #-----------------------------------------------------------------------------
class TestLexers(TestsBase):
    """Collection of lexers tests"""
    def setUp(self):
        # Python-2 flavored IPython lexer; sufficient for these fragments.
        self.lexer = lexers.IPythonLexer()

    def testIPythonLexer(self):
        # Plain shell escape: '!' runs the rest of the line through bash.
        fragment = '!echo $HOME\n'
        tokens = [
            (Token.Operator, '!'),
            (Token.Name.Builtin, 'echo'),
            (Token.Text, ' '),
            (Token.Name.Variable, '$HOME'),
            (Token.Text, '\n'),
        ]
        self.assertEqual(tokens, list(self.lexer.get_tokens(fragment)))

        # Double escape '!!cmd': both bangs lex as a single Operator token.
        fragment_2 = '!' + fragment
        tokens_2 = [
            (Token.Operator, '!!'),
        ] + tokens[1:]
        self.assertEqual(tokens_2, list(self.lexer.get_tokens(fragment_2)))

        # Shell capture on the right-hand side of an assignment.
        fragment_2 = 'x = ' + fragment
        tokens_2 = [
            (Token.Name, 'x'),
            (Token.Text, ' '),
            (Token.Operator, '='),
            (Token.Text, ' '),
        ] + tokens
        self.assertEqual(tokens_2, list(self.lexer.get_tokens(fragment_2)))

        # Tuple-target assignment from a shell capture.
        fragment_2 = 'x, = ' + fragment
        tokens_2 = [
            (Token.Name, 'x'),
            (Token.Punctuation, ','),
            (Token.Text, ' '),
            (Token.Operator, '='),
            (Token.Text, ' '),
        ] + tokens
        self.assertEqual(tokens_2, list(self.lexer.get_tokens(fragment_2)))

        # %sx magic on the r.h.s.: its argument is lexed as bash.
        fragment_2 = 'x, = %sx ' + fragment[1:]
        tokens_2 = [
            (Token.Name, 'x'),
            (Token.Punctuation, ','),
            (Token.Text, ' '),
            (Token.Operator, '='),
            (Token.Text, ' '),
            (Token.Operator, '%'),
            (Token.Keyword, 'sx'),
            (Token.Text, ' '),
        ] + tokens[1:]
        self.assertEqual(tokens_2, list(self.lexer.get_tokens(fragment_2)))

        # Generic line magic: rest of the line is plain Text.
        fragment_2 = 'f = %R function () {}\n'
        tokens_2 = [
            (Token.Name, 'f'),
            (Token.Text, ' '),
            (Token.Operator, '='),
            (Token.Text, ' '),
            (Token.Operator, '%'),
            (Token.Keyword, 'R'),
            (Token.Text, ' function () {}\n'),
        ]
        self.assertEqual(tokens_2, list(self.lexer.get_tokens(fragment_2)))

        # Cell magic after leading whitespace; body stays untokenized Text.
        fragment_2 = '\t%%xyz\n$foo\n'
        tokens_2 = [
            (Token.Text, '\t'),
            (Token.Operator, '%%'),
            (Token.Keyword, 'xyz'),
            (Token.Text, '\n$foo\n'),
        ]
        self.assertEqual(tokens_2, list(self.lexer.get_tokens(fragment_2)))
General Comments 0
You need to be logged in to leave comments. Login now