Issue #7548: Apply bash highlighting on the r.h.s. of ! escape.
Lev Abalkin
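In effect, an assignment of the form `name = !cmd` now gets Python highlighting on the left of the `=` and bash highlighting on the right. A minimal sketch of the new behavior (the import path is an assumption inferred from the test module's relative import, not something this diff confirms; adjust it to your checkout):

# Sketch only: tokenize a shell capture with the patched lexer.
# Import path assumed, as noted above.
from IPython.nbconvert.utils.lexers import IPythonLexer

lexer = IPythonLexer()
for token_type, value in lexer.get_tokens(u'x = !echo $HOME\n'):
    print('%s %r' % (token_type, value))

# 'x' and '=' come out as Python tokens; after the '!', 'echo' is
# Token.Name.Builtin and '$HOME' is Token.Name.Variable, because the
# r.h.s. is delegated to BashLexer.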
@@ -1,502 +1,506 @@
1 1 # -*- coding: utf-8 -*-
2 2 """
3 3 Defines a variety of Pygments lexers for highlighting IPython code.
4 4
5 5 This includes:
6 6
7 7 IPythonLexer, IPython3Lexer
8 8 Lexers for pure IPython (python + magic/shell commands)
9 9
10 10 IPythonPartialTracebackLexer, IPythonTracebackLexer
11 11 Supports 2.x and 3.x via keyword `python3`. The partial traceback
12 12 lexer reads everything but the Python code appearing in a traceback.
13 13 The full lexer combines the partial lexer with an IPython lexer.
14 14
15 15 IPythonConsoleLexer
16 16 A lexer for IPython console sessions, with support for tracebacks.
17 17
18 18 IPyLexer
19 19 A friendly lexer which examines the first line of text and from it
20 20 decides whether to use an IPython lexer or an IPython console lexer.
21 21 This is probably the only lexer that needs to be explicitly added
22 22 to Pygments.
23 23
24 24 """
25 25 #-----------------------------------------------------------------------------
26 26 # Copyright (c) 2013, the IPython Development Team.
27 27 #
28 28 # Distributed under the terms of the Modified BSD License.
29 29 #
30 30 # The full license is in the file COPYING.txt, distributed with this software.
31 31 #-----------------------------------------------------------------------------
32 32
33 33 # Standard library
34 34 import re
35 35
36 36 # Third party
37 37 from pygments.lexers import BashLexer, PythonLexer, Python3Lexer
38 38 from pygments.lexer import (
39 39 Lexer, DelegatingLexer, RegexLexer, do_insertions, bygroups, using,
40 40 )
41 41 from pygments.token import (
42 42 Comment, Generic, Keyword, Literal, Name, Operator, Other, Text, Error,
43 43 )
44 44 from pygments.util import get_bool_opt
45 45
46 46 # Local
47 47 from IPython.testing.skipdoctest import skip_doctest
48 48
49 49 line_re = re.compile('.*?\n')
50 50
51 51 ipython_tokens = [
52 52 (r'(\%+)(\w+)\s+(\.*)(\n)', bygroups(Operator, Keyword,
53 53 using(BashLexer), Text)),
54 54 (r'(\%+)(\w+)\b', bygroups(Operator, Keyword)),
55 55 (r'^(!)(.+)(\n)', bygroups(Operator, using(BashLexer), Text)),
56 (r'^(.+)(=)(\s*)(!)(.+)(\n)', bygroups(
57 # With the limited syntax allowed on the l.h.s. of a shell capture,
58 # we don't need to differentiate between Python 2 and 3.
59 using(Python3Lexer), Operator, Text, Operator, using(BashLexer), Text)),
56 60 ]
57 61
58 62 def build_ipy_lexer(python3):
59 63 """Builds IPython lexers depending on the value of `python3`.
60 64
61 65 The lexer inherits from an appropriate Python lexer and then adds
62 66 information about IPython specific keywords (i.e. magic commands,
63 67 shell commands, etc.)
64 68
65 69 Parameters
66 70 ----------
67 71 python3 : bool
68 72 If `True`, then build an IPython lexer from a Python 3 lexer.
69 73
70 74 """
71 75 # It would be nice to have a single IPython lexer class which takes
72 76 # a boolean `python3`. But since there are two Python lexer classes,
73 77 # we will also have two IPython lexer classes.
74 78 if python3:
75 79 PyLexer = Python3Lexer
76 80 clsname = 'IPython3Lexer'
77 81 name = 'IPython3'
78 82 aliases = ['ipython3']
79 83 doc = """IPython3 Lexer"""
80 84 else:
81 85 PyLexer = PythonLexer
82 86 clsname = 'IPythonLexer'
83 87 name = 'IPython'
84 88 aliases = ['ipython2', 'ipython']
85 89 doc = """IPython Lexer"""
86 90
87 91 tokens = PyLexer.tokens.copy()
88 92 tokens['root'] = ipython_tokens + tokens['root']
89 93
90 94 attrs = {'name': name, 'aliases': aliases,
91 95 '__doc__': doc, 'tokens': tokens}
92 96
93 97 return type(name, (PyLexer,), attrs)
94 98
95 99
96 100 IPython3Lexer = build_ipy_lexer(python3=True)
97 101 IPythonLexer = build_ipy_lexer(python3=False)
98 102
99 103
100 104 class IPythonPartialTracebackLexer(RegexLexer):
101 105 """
102 106 Partial lexer for IPython tracebacks.
103 107
104 108 Handles all the non-python output. This works for both Python 2.x and 3.x.
105 109
106 110 """
107 111 name = 'IPython Partial Traceback'
108 112
109 113 tokens = {
110 114 'root': [
111 115 # Tracebacks for syntax errors have a different style.
112 116 # For both types of tracebacks, we mark the first line with
113 117 # Generic.Traceback. For syntax errors, we mark the filename
114 118 # as we mark the filenames for non-syntax tracebacks.
115 119 #
116 120 # These two regexps define how IPythonConsoleLexer finds a
117 121 # traceback.
118 122 #
119 123 ## Non-syntax traceback
120 124 (r'^(\^C)?(-+\n)', bygroups(Error, Generic.Traceback)),
121 125 ## Syntax traceback
122 126 (r'^( File)(.*)(, line )(\d+\n)',
123 127 bygroups(Generic.Traceback, Name.Namespace,
124 128 Generic.Traceback, Literal.Number.Integer)),
125 129
126 130 # (Exception Identifier)(Whitespace)(Traceback Message)
127 131 (r'(?u)(^[^\d\W]\w*)(\s*)(Traceback.*?\n)',
128 132 bygroups(Name.Exception, Generic.Whitespace, Text)),
129 133 # (Module/Filename)(Text)(Callee)(Function Signature)
130 134 # Better options for callee and function signature?
131 135 (r'(.*)( in )(.*)(\(.*\)\n)',
132 136 bygroups(Name.Namespace, Text, Name.Entity, Name.Tag)),
133 137 # Regular line: (Whitespace)(Line Number)(Python Code)
134 138 (r'(\s*?)(\d+)(.*?\n)',
135 139 bygroups(Generic.Whitespace, Literal.Number.Integer, Other)),
136 140 # Emphasized line: (Arrow)(Line Number)(Python Code)
137 141 # Using Exception token so arrow color matches the Exception.
138 142 (r'(-*>?\s?)(\d+)(.*?\n)',
139 143 bygroups(Name.Exception, Literal.Number.Integer, Other)),
140 144 # (Exception Identifier)(Message)
141 145 (r'(?u)(^[^\d\W]\w*)(:.*?\n)',
142 146 bygroups(Name.Exception, Text)),
143 147 # Tag everything else as Other, will be handled later.
144 148 (r'.*\n', Other),
145 149 ],
146 150 }
147 151
148 152
149 153 class IPythonTracebackLexer(DelegatingLexer):
150 154 """
151 155 IPython traceback lexer.
152 156
153 157 For doctests, the tracebacks can be snipped as much as desired with the
154 158 exception of the lines that designate a traceback. For non-syntax error
155 159 tracebacks, this is the line of hyphens. For syntax error tracebacks,
156 160 this is the line which lists the File and line number.
157 161
158 162 """
159 163 # The lexer inherits from DelegatingLexer. The "root" lexer is an
160 164 # appropriate IPython lexer, which depends on the value of the boolean
161 165 # `python3`. First, we parse with the partial IPython traceback lexer.
162 166 # Then, any code marked with the "Other" token is delegated to the root
163 167 # lexer.
164 168 #
165 169 name = 'IPython Traceback'
166 170 aliases = ['ipythontb']
167 171
168 172 def __init__(self, **options):
169 173 self.python3 = get_bool_opt(options, 'python3', False)
170 174 if self.python3:
171 175 self.aliases = ['ipython3tb']
172 176 else:
173 177 self.aliases = ['ipython2tb', 'ipythontb']
174 178
175 179 if self.python3:
176 180 IPyLexer = IPython3Lexer
177 181 else:
178 182 IPyLexer = IPythonLexer
179 183
180 184 DelegatingLexer.__init__(self, IPyLexer,
181 185 IPythonPartialTracebackLexer, **options)
182 186
183 187 @skip_doctest
184 188 class IPythonConsoleLexer(Lexer):
185 189 """
186 190 An IPython console lexer for IPython code-blocks and doctests, such as:
187 191
188 192 .. code-block:: rst
189 193
190 194 .. code-block:: ipythonconsole
191 195
192 196 In [1]: a = 'foo'
193 197
194 198 In [2]: a
195 199 Out[2]: 'foo'
196 200
197 201 In [3]: print a
198 202 foo
199 203
200 204 In [4]: 1 / 0
201 205
202 206
203 207 Support is also provided for IPython exceptions:
204 208
205 209 .. code-block:: rst
206 210
207 211 .. code-block:: ipythonconsole
208 212
209 213 In [1]: raise Exception
210 214
211 215 ---------------------------------------------------------------------------
212 216 Exception Traceback (most recent call last)
213 217 <ipython-input-1-fca2ab0ca76b> in <module>()
214 218 ----> 1 raise Exception
215 219
216 220 Exception:
217 221
218 222 """
219 223 name = 'IPython console session'
220 224 aliases = ['ipythonconsole']
221 225 mimetypes = ['text/x-ipython-console']
222 226
223 227 # The regexps used to determine what is input and what is output.
224 228 # The default prompts for IPython are:
225 229 #
226 230 # c.PromptManager.in_template = 'In [\#]: '
227 231 # c.PromptManager.in2_template = ' .\D.: '
228 232 # c.PromptManager.out_template = 'Out[\#]: '
229 233 #
230 234 in1_regex = r'In \[[0-9]+\]: '
231 235 in2_regex = r' \.\.+\.: '
232 236 out_regex = r'Out\[[0-9]+\]: '
233 237
234 238 #: The regex to determine when a traceback starts.
235 239 ipytb_start = re.compile(r'^(\^C)?(-+\n)|^( File)(.*)(, line )(\d+\n)')
236 240
237 241 def __init__(self, **options):
238 242 """Initialize the IPython console lexer.
239 243
240 244 Parameters
241 245 ----------
242 246 python3 : bool
243 247 If `True`, then the console inputs are parsed using a Python 3
244 248 lexer. Otherwise, they are parsed using a Python 2 lexer.
245 249 in1_regex : RegexObject
246 250 The compiled regular expression used to detect the start
247 251 of inputs. Although the IPython configuration setting may have a
248 252 trailing whitespace, do not include it in the regex. If `None`,
249 253 then the default input prompt is assumed.
250 254 in2_regex : RegexObject
251 255 The compiled regular expression used to detect the continuation
252 256 of inputs. Although the IPython configuration setting may have a
253 257 trailing whitespace, do not include it in the regex. If `None`,
254 258 then the default input prompt is assumed.
255 259 out_regex : RegexObject
256 260 The compiled regular expression used to detect outputs. If `None`,
257 261 then the default output prompt is assumed.
258 262
259 263 """
260 264 self.python3 = get_bool_opt(options, 'python3', False)
261 265 if self.python3:
262 266 self.aliases = ['ipython3console']
263 267 else:
264 268 self.aliases = ['ipython2console', 'ipythonconsole']
265 269
266 270 in1_regex = options.get('in1_regex', self.in1_regex)
267 271 in2_regex = options.get('in2_regex', self.in2_regex)
268 272 out_regex = options.get('out_regex', self.out_regex)
269 273
270 274 # So that we can work with input and output prompts which have been
271 275 # rstrip'd (possibly by editors) we also need rstrip'd variants. If
272 276 # we do not do this, then such prompts will be tagged as 'output'.
273 277 # The reason we can't just use the rstrip'd variants is that we want
274 278 # any whitespace associated with the prompt to be inserted with the
275 279 # token. This allows formatted code to be modified so as to hide the
276 280 # appearance of prompts, with the whitespace included. One example use
277 281 # of this is in copybutton.js from the standard lib Python docs.
278 282 in1_regex_rstrip = in1_regex.rstrip() + '\n'
279 283 in2_regex_rstrip = in2_regex.rstrip() + '\n'
280 284 out_regex_rstrip = out_regex.rstrip() + '\n'
281 285
282 286 # Compile and save them all.
283 287 attrs = ['in1_regex', 'in2_regex', 'out_regex',
284 288 'in1_regex_rstrip', 'in2_regex_rstrip', 'out_regex_rstrip']
285 289 for attr in attrs:
286 290 self.__setattr__(attr, re.compile(locals()[attr]))
287 291
288 292 Lexer.__init__(self, **options)
289 293
290 294 if self.python3:
291 295 pylexer = IPython3Lexer
292 296 tblexer = IPythonTracebackLexer
293 297 else:
294 298 pylexer = IPythonLexer
295 299 tblexer = IPythonTracebackLexer
296 300
297 301 self.pylexer = pylexer(**options)
298 302 self.tblexer = tblexer(**options)
299 303
300 304 self.reset()
301 305
302 306 def reset(self):
303 307 self.mode = 'output'
304 308 self.index = 0
305 309 self.buffer = u''
306 310 self.insertions = []
307 311
308 312 def buffered_tokens(self):
309 313 """
310 314 Generator of unprocessed tokens after doing insertions and before
311 315 changing to a new state.
312 316
313 317 """
314 318 if self.mode == 'output':
315 319 tokens = [(0, Generic.Output, self.buffer)]
316 320 elif self.mode == 'input':
317 321 tokens = self.pylexer.get_tokens_unprocessed(self.buffer)
318 322 else: # traceback
319 323 tokens = self.tblexer.get_tokens_unprocessed(self.buffer)
320 324
321 325 for i, t, v in do_insertions(self.insertions, tokens):
322 326 # All token indexes are relative to the buffer.
323 327 yield self.index + i, t, v
324 328
325 329 # Clear it all
326 330 self.index += len(self.buffer)
327 331 self.buffer = u''
328 332 self.insertions = []
329 333
330 334 def get_mci(self, line):
331 335 """
332 336 Parses the line and returns a 3-tuple: (mode, code, insertion).
333 337
334 338 `mode` is the next mode (or state) of the lexer, and is always equal
335 339 to 'input', 'output', or 'tb'.
336 340
337 341 `code` is a portion of the line that should be added to the buffer
338 342 corresponding to the next mode and eventually lexed by another lexer.
339 343 For example, `code` could be Python code if `mode` were 'input'.
340 344
341 345 `insertion` is a 3-tuple (index, token, text) representing an
342 346 unprocessed "token" that will be inserted into the stream of tokens
343 347 that are created from the buffer once we change modes. This is usually
344 348 the input or output prompt.
345 349
346 350 In general, the next mode depends on current mode and on the contents
347 351 of `line`.
348 352
349 353 """
350 354 # To reduce the number of regex match checks, we have multiple
351 355 # 'if' blocks instead of 'if-elif' blocks.
352 356
353 357 # Check for possible end of input
354 358 in2_match = self.in2_regex.match(line)
355 359 in2_match_rstrip = self.in2_regex_rstrip.match(line)
356 360 if (in2_match and in2_match.group().rstrip() == line.rstrip()) or \
357 361 in2_match_rstrip:
358 362 end_input = True
359 363 else:
360 364 end_input = False
361 365 if end_input and self.mode != 'tb':
362 366 # Only look for an end of input when not in tb mode.
363 367 # An ellipsis could appear within the traceback.
364 368 mode = 'output'
365 369 code = u''
366 370 insertion = (0, Generic.Prompt, line)
367 371 return mode, code, insertion
368 372
369 373 # Check for output prompt
370 374 out_match = self.out_regex.match(line)
371 375 out_match_rstrip = self.out_regex_rstrip.match(line)
372 376 if out_match or out_match_rstrip:
373 377 mode = 'output'
374 378 if out_match:
375 379 idx = out_match.end()
376 380 else:
377 381 idx = out_match_rstrip.end()
378 382 code = line[idx:]
379 383 # Use the 'heading' token for output. We cannot use Generic.Error
380 384 # since it would conflict with exceptions.
381 385 insertion = (0, Generic.Heading, line[:idx])
382 386 return mode, code, insertion
383 387
384 388
385 389 # Check for input or continuation prompt (non-stripped version)
386 390 in1_match = self.in1_regex.match(line)
387 391 if in1_match or (in2_match and self.mode != 'tb'):
388 392 # New input or when not in tb, continued input.
389 393 # We do not check for continued input when in tb since it is
390 394 # allowable to replace a long stack with an ellipsis.
391 395 mode = 'input'
392 396 if in1_match:
393 397 idx = in1_match.end()
394 398 else: # in2_match
395 399 idx = in2_match.end()
396 400 code = line[idx:]
397 401 insertion = (0, Generic.Prompt, line[:idx])
398 402 return mode, code, insertion
399 403
400 404 # Check for input or continuation prompt (stripped version)
401 405 in1_match_rstrip = self.in1_regex_rstrip.match(line)
402 406 if in1_match_rstrip or (in2_match_rstrip and self.mode != 'tb'):
403 407 # New input or when not in tb, continued input.
404 408 # We do not check for continued input when in tb since it is
405 409 # allowable to replace a long stack with an ellipsis.
406 410 mode = 'input'
407 411 if in1_match_rstrip:
408 412 idx = in1_match_rstrip.end()
409 413 else: # in2_match_rstrip
410 414 idx = in2_match_rstrip.end()
411 415 code = line[idx:]
412 416 insertion = (0, Generic.Prompt, line[:idx])
413 417 return mode, code, insertion
414 418
415 419 # Check for traceback
416 420 if self.ipytb_start.match(line):
417 421 mode = 'tb'
418 422 code = line
419 423 insertion = None
420 424 return mode, code, insertion
421 425
422 426 # All other stuff...
423 427 if self.mode in ('input', 'output'):
424 428 # We assume all other text is output. Multiline input that
425 429 # does not use the continuation marker cannot be detected.
426 430 # For example, the 3 in the following is clearly output:
427 431 #
428 432 # In [1]: print 3
429 433 # 3
430 434 #
431 435 # But the following second line is part of the input:
432 436 #
433 437 # In [2]: while True:
434 438 # print True
435 439 #
436 440 # In both cases, the 2nd line will be 'output'.
437 441 #
438 442 mode = 'output'
439 443 else:
440 444 mode = 'tb'
441 445
442 446 code = line
443 447 insertion = None
444 448
445 449 return mode, code, insertion
446 450
447 451 def get_tokens_unprocessed(self, text):
448 452 self.reset()
449 453 for match in line_re.finditer(text):
450 454 line = match.group()
451 455 mode, code, insertion = self.get_mci(line)
452 456
453 457 if mode != self.mode:
454 458 # Yield buffered tokens before transitioning to new mode.
455 459 for token in self.buffered_tokens():
456 460 yield token
457 461 self.mode = mode
458 462
459 463 if insertion:
460 464 self.insertions.append((len(self.buffer), [insertion]))
461 465 self.buffer += code
462 466 else:
463 467 for token in self.buffered_tokens():
464 468 yield token
465 469
466 470 class IPyLexer(Lexer):
467 471 """
468 472 Primary lexer for all IPython-like code.
469 473
470 474 This is a simple helper lexer. If the text contains an input prompt
471 475 "In \[[0-9]+\]:" anywhere, the entire text is parsed with an IPython
472 476 console lexer. If not, the entire text is parsed with an IPython lexer.
473 477
474 478 The goal is to reduce the number of lexers that are registered
475 479 with Pygments.
476 480
477 481 """
478 482 name = 'IPy session'
479 483 aliases = ['ipy']
480 484
481 485 def __init__(self, **options):
482 486 self.python3 = get_bool_opt(options, 'python3', False)
483 487 if self.python3:
484 488 self.aliases = ['ipy3']
485 489 else:
486 490 self.aliases = ['ipy2', 'ipy']
487 491
488 492 Lexer.__init__(self, **options)
489 493
490 494 self.IPythonLexer = IPythonLexer(**options)
491 495 self.IPythonConsoleLexer = IPythonConsoleLexer(**options)
492 496
493 497 def get_tokens_unprocessed(self, text):
494 498 # Search for the input prompt anywhere...this allows code blocks to
495 499 # begin with comments as well.
496 500 if re.match(r'.*(In \[[0-9]+\]:)', text.strip(), re.DOTALL):
497 501 lex = self.IPythonConsoleLexer
498 502 else:
499 503 lex = self.IPythonLexer
500 504 for token in lex.get_tokens_unprocessed(text):
501 505 yield token
502 506
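For reference, the new shell-capture pattern can be checked in isolation. This standalone sketch copies the regex from the hunk above into a bare re match; the variable names are hypothetical and nothing here is part of the patch itself:

import re

# Pattern copied verbatim from the new ipython_tokens rule:
# (l.h.s. Python)(=)(whitespace)(!)(r.h.s. bash)(newline)
capture = re.compile(r'^(.+)(=)(\s*)(!)(.+)(\n)')

m = capture.match('files = !ls -l\n')
print(m.groups())  # ('files ', '=', ' ', '!', 'ls -l', '\n')

Each captured group is handed to the lexer or token named in the matching bygroups() slot, which is how Python and bash highlighting land on opposite sides of the `=`.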
@@ -1,35 +1,53 @@
1 1 """Test lexers module"""
2 2 #-----------------------------------------------------------------------------
3 3 # Copyright (C) 2014 The IPython Development Team
4 4 #
5 5 # Distributed under the terms of the BSD License. The full license is in
6 6 # the file COPYING, distributed as part of this software.
7 7 #-----------------------------------------------------------------------------
8 8
9 9 #-----------------------------------------------------------------------------
10 10 # Imports
11 11 #-----------------------------------------------------------------------------
12 12 from pygments.token import Token
13 13
14 14 from IPython.nbconvert.tests.base import TestsBase
15 15 from .. import lexers
16 16
17 17
18 18 #-----------------------------------------------------------------------------
19 19 # Classes and functions
20 20 #-----------------------------------------------------------------------------
21 21 class TestLexers(TestsBase):
22 22 """Collection of lexers tests"""
23 23 def setUp(self):
24 24 self.lexer = lexers.IPythonLexer()
25 25
26 26 def testIPythonLexer(self):
27 27 fragment = '!echo $HOME\n'
28 28 tokens = [
29 29 (Token.Operator, '!'),
30 30 (Token.Name.Builtin, 'echo'),
31 31 (Token.Text, ' '),
32 32 (Token.Name.Variable, '$HOME'),
33 33 (Token.Text, '\n'),
34 34 ]
35 35 self.assertEqual(tokens, list(self.lexer.get_tokens(fragment)))
36
37 fragment_2 = 'x = ' + fragment
38 tokens_2 = [
39 (Token.Name, 'x'),
40 (Token.Text, ' '),
41 (Token.Operator, '='),
42 (Token.Text, ' '),
43 ] + tokens
44 self.assertEqual(tokens_2, list(self.lexer.get_tokens(fragment_2)))
45 fragment_2 = 'x, = ' + fragment
46 tokens_2 = [
47 (Token.Name, 'x'),
48 (Token.Punctuation, ','),
49 (Token.Text, ' '),
50 (Token.Operator, '='),
51 (Token.Text, ' '),
52 ] + tokens
53 self.assertEqual(tokens_2, list(self.lexer.get_tokens(fragment_2)))
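To see the change end to end, a fragment can be rendered through a standard Pygments formatter (again a sketch: highlight and HtmlFormatter are ordinary Pygments API, while the lexer import path is assumed as in the first sketch):

from pygments import highlight
from pygments.formatters import HtmlFormatter

from IPython.nbconvert.utils.lexers import IPythonLexer  # path assumed

# Tuple-unpacking capture, mirroring the second new test case above.
html = highlight(u'x, = !echo $HOME\n', IPythonLexer(), HtmlFormatter())
print(html)  # spans on the r.h.s. of '!' now carry bash token classes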