Now include patched copies of tokenize for Python 2 and 3.
Thomas Kluyver
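For orientation, the rest of this changeset consumes just three names from the new module (see the input-transformer diff at the end). A minimal sketch of that surface, assuming the files land under IPython.utils as the import hunks below suggest; tokens_for() is an illustrative helper, not part of the commit:

from io import StringIO
from IPython.utils.tokenize2 import generate_tokens, untokenize, TokenError

def tokens_for(source):
    # Tokenize a (unicode) source string; raises TokenError if it is incomplete.
    return list(generate_tokens(StringIO(source).readline))

print(untokenize(tokens_for(u"a = [1, 2]\n")))   # round-trips the source text

try:
    tokens_for(u"a = [1, 2\n")                   # unclosed bracket
except TokenError:
    print("incomplete input")    # how IPython detects multi-line statements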
@@ -0,0 +1,438 b''
1 """Patched version of standard library tokenize, to deal with various bugs.
2
3 Patches
4
5 - Relevant parts of Gareth Rees' patch for Python issue #12691 (untokenizing),
6 manually applied.
7 - Newlines in comments and blank lines should be either NL or NEWLINE, depending
8 on whether they are in a multi-line statement. Filed as Python issue #17061.
9
10 -------------------------------------------------------------------------------
11 Tokenization help for Python programs.
12
13 generate_tokens(readline) is a generator that breaks a stream of
14 text into Python tokens. It accepts a readline-like method which is called
15 repeatedly to get the next line of input (or "" for EOF). It generates
16 5-tuples with these members:
17
18 the token type (see token.py)
19 the token (a string)
20 the starting (row, column) indices of the token (a 2-tuple of ints)
21 the ending (row, column) indices of the token (a 2-tuple of ints)
22 the original line (string)
23
24 It is designed to match the working of the Python tokenizer exactly, except
25 that it produces COMMENT tokens for comments and gives type OP for all
26 operators
27
28 Older entry points
29 tokenize_loop(readline, tokeneater)
30 tokenize(readline, tokeneater=printtoken)
31 are the same, except instead of generating tokens, tokeneater is a callback
32 function to which the 5 fields described above are passed as 5 arguments,
33 each time a new token is found."""
34
35 __author__ = 'Ka-Ping Yee <ping@lfw.org>'
36 __credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
37 'Skip Montanaro, Raymond Hettinger')
38
39 import string, re
40 from token import *
41
42 import token
43 __all__ = [x for x in dir(token) if not x.startswith("_")]
44 __all__ += ["COMMENT", "tokenize", "generate_tokens", "NL", "untokenize"]
45 del x
46 del token
47
48 __all__ += ["TokenError"]
49
50 COMMENT = N_TOKENS
51 tok_name[COMMENT] = 'COMMENT'
52 NL = N_TOKENS + 1
53 tok_name[NL] = 'NL'
54 N_TOKENS += 2
55
56 def group(*choices): return '(' + '|'.join(choices) + ')'
57 def any(*choices): return group(*choices) + '*'
58 def maybe(*choices): return group(*choices) + '?'
59
60 Whitespace = r'[ \f\t]*'
61 Comment = r'#[^\r\n]*'
62 Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
63 Name = r'[a-zA-Z_]\w*'
64
65 Hexnumber = r'0[xX][\da-fA-F]+[lL]?'
66 Octnumber = r'(0[oO][0-7]+)|(0[0-7]*)[lL]?'
67 Binnumber = r'0[bB][01]+[lL]?'
68 Decnumber = r'[1-9]\d*[lL]?'
69 Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
70 Exponent = r'[eE][-+]?\d+'
71 Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
72 Expfloat = r'\d+' + Exponent
73 Floatnumber = group(Pointfloat, Expfloat)
74 Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
75 Number = group(Imagnumber, Floatnumber, Intnumber)
76
77 # Tail end of ' string.
78 Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
79 # Tail end of " string.
80 Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
81 # Tail end of ''' string.
82 Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
83 # Tail end of """ string.
84 Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
85 Triple = group("[uUbB]?[rR]?'''", '[uUbB]?[rR]?"""')
86 # Single-line ' or " string.
87 String = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
88 r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
89
90 # Because of leftmost-then-longest match semantics, be sure to put the
91 # longest operators first (e.g., if = came before ==, == would get
92 # recognized as two instances of =).
93 Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
94 r"//=?",
95 r"[+\-*/%&|^=<>]=?",
96 r"~")
97
98 Bracket = '[][(){}]'
99 Special = group(r'\r?\n', r'[:;.,`@]')
100 Funny = group(Operator, Bracket, Special)
101
102 PlainToken = group(Number, Funny, String, Name)
103 Token = Ignore + PlainToken
104
105 # First (or only) line of ' or " string.
106 ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
107 group("'", r'\\\r?\n'),
108 r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
109 group('"', r'\\\r?\n'))
110 PseudoExtras = group(r'\\\r?\n', Comment, Triple)
111 PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
112
113 tokenprog, pseudoprog, single3prog, double3prog = map(
114 re.compile, (Token, PseudoToken, Single3, Double3))
115 endprogs = {"'": re.compile(Single), '"': re.compile(Double),
116 "'''": single3prog, '"""': double3prog,
117 "r'''": single3prog, 'r"""': double3prog,
118 "u'''": single3prog, 'u"""': double3prog,
119 "ur'''": single3prog, 'ur"""': double3prog,
120 "R'''": single3prog, 'R"""': double3prog,
121 "U'''": single3prog, 'U"""': double3prog,
122 "uR'''": single3prog, 'uR"""': double3prog,
123 "Ur'''": single3prog, 'Ur"""': double3prog,
124 "UR'''": single3prog, 'UR"""': double3prog,
125 "b'''": single3prog, 'b"""': double3prog,
126 "br'''": single3prog, 'br"""': double3prog,
127 "B'''": single3prog, 'B"""': double3prog,
128 "bR'''": single3prog, 'bR"""': double3prog,
129 "Br'''": single3prog, 'Br"""': double3prog,
130 "BR'''": single3prog, 'BR"""': double3prog,
131 'r': None, 'R': None, 'u': None, 'U': None,
132 'b': None, 'B': None}
133
134 triple_quoted = {}
135 for t in ("'''", '"""',
136 "r'''", 'r"""', "R'''", 'R"""',
137 "u'''", 'u"""', "U'''", 'U"""',
138 "ur'''", 'ur"""', "Ur'''", 'Ur"""',
139 "uR'''", 'uR"""', "UR'''", 'UR"""',
140 "b'''", 'b"""', "B'''", 'B"""',
141 "br'''", 'br"""', "Br'''", 'Br"""',
142 "bR'''", 'bR"""', "BR'''", 'BR"""'):
143 triple_quoted[t] = t
144 single_quoted = {}
145 for t in ("'", '"',
146 "r'", 'r"', "R'", 'R"',
147 "u'", 'u"', "U'", 'U"',
148 "ur'", 'ur"', "Ur'", 'Ur"',
149 "uR'", 'uR"', "UR'", 'UR"',
150 "b'", 'b"', "B'", 'B"',
151 "br'", 'br"', "Br'", 'Br"',
152 "bR'", 'bR"', "BR'", 'BR"' ):
153 single_quoted[t] = t
154
155 tabsize = 8
156
157 class TokenError(Exception): pass
158
159 class StopTokenizing(Exception): pass
160
161 def printtoken(type, token, srow_scol, erow_ecol, line): # for testing
162 srow, scol = srow_scol
163 erow, ecol = erow_ecol
164 print "%d,%d-%d,%d:\t%s\t%s" % \
165 (srow, scol, erow, ecol, tok_name[type], repr(token))
166
167 def tokenize(readline, tokeneater=printtoken):
168 """
169 The tokenize() function accepts two parameters: one representing the
170 input stream, and one providing an output mechanism for tokenize().
171
172 The first parameter, readline, must be a callable object which provides
173 the same interface as the readline() method of built-in file objects.
174 Each call to the function should return one line of input as a string.
175
176 The second parameter, tokeneater, must also be a callable object. It is
177 called once for each token, with five arguments, corresponding to the
178 tuples generated by generate_tokens().
179 """
180 try:
181 tokenize_loop(readline, tokeneater)
182 except StopTokenizing:
183 pass
184
185 # backwards compatible interface
186 def tokenize_loop(readline, tokeneater):
187 for token_info in generate_tokens(readline):
188 tokeneater(*token_info)
189
190 class Untokenizer:
191
192 def __init__(self):
193 self.tokens = []
194 self.prev_row = 1
195 self.prev_col = 0
196
197 def add_whitespace(self, tok_type, start):
198 row, col = start
199 assert row >= self.prev_row
200 col_offset = col - self.prev_col
201 if col_offset > 0:
202 self.tokens.append(" " * col_offset)
203 elif row > self.prev_row and tok_type not in (NEWLINE, NL, ENDMARKER):
204 # Line was backslash-continued
205 self.tokens.append(" ")
206
207 def untokenize(self, tokens):
208 iterable = iter(tokens)
209 for t in iterable:
210 if len(t) == 2:
211 self.compat(t, iterable)
212 break
213 tok_type, token, start, end = t[:4]
214 self.add_whitespace(tok_type, start)
215 self.tokens.append(token)
216 self.prev_row, self.prev_col = end
217 if tok_type in (NEWLINE, NL):
218 self.prev_row += 1
219 self.prev_col = 0
220 return "".join(self.tokens)
221
222 def compat(self, token, iterable):
223 # This import is here to avoid problems when the itertools
224 # module is not built yet and tokenize is imported.
225 from itertools import chain
226 startline = False
227 prevstring = False
228 indents = []
229 toks_append = self.tokens.append
230 for tok in chain([token], iterable):
231 toknum, tokval = tok[:2]
232
233 if toknum in (NAME, NUMBER):
234 tokval += ' '
235
236 # Insert a space between two consecutive strings
237 if toknum == STRING:
238 if prevstring:
239 tokval = ' ' + tokval
240 prevstring = True
241 else:
242 prevstring = False
243
244 if toknum == INDENT:
245 indents.append(tokval)
246 continue
247 elif toknum == DEDENT:
248 indents.pop()
249 continue
250 elif toknum in (NEWLINE, NL):
251 startline = True
252 elif startline and indents:
253 toks_append(indents[-1])
254 startline = False
255 toks_append(tokval)
256
257 def untokenize(iterable):
258 """Transform tokens back into Python source code.
259
260 Each element returned by the iterable must be a token sequence
261 with at least two elements, a token number and token value. If
262 only two tokens are passed, the resulting output is poor.
263
264 Round-trip invariant for full input:
265 Untokenized source will match input source exactly
266
267 Round-trip invariant for limited input:
268 # Output text will tokenize back to the input
269 t1 = [tok[:2] for tok in generate_tokens(f.readline)]
270 newcode = untokenize(t1)
271 readline = iter(newcode.splitlines(1)).next
272 t2 = [tok[:2] for tok in generate_tokens(readline)]
273 assert t1 == t2
274 """
275 ut = Untokenizer()
276 return ut.untokenize(iterable)
277
278 def generate_tokens(readline):
279 """
280 The generate_tokens() generator requires one argument, readline, which
281 must be a callable object which provides the same interface as the
282 readline() method of built-in file objects. Each call to the function
283 should return one line of input as a string. Alternately, readline
284 can be a callable function terminating with StopIteration:
285 readline = open(myfile).next # Example of alternate readline
286
287 The generator produces 5-tuples with these members: the token type; the
288 token string; a 2-tuple (srow, scol) of ints specifying the row and
289 column where the token begins in the source; a 2-tuple (erow, ecol) of
290 ints specifying the row and column where the token ends in the source;
291 and the line on which the token was found. The line passed is the
292 logical line; continuation lines are included.
293 """
294 lnum = parenlev = continued = 0
295 namechars, numchars = string.ascii_letters + '_', '0123456789'
296 contstr, needcont = '', 0
297 contline = None
298 indents = [0]
299
300 while 1: # loop over lines in stream
301 try:
302 line = readline()
303 except StopIteration:
304 line = ''
305 lnum += 1
306 pos, max = 0, len(line)
307
308 if contstr: # continued string
309 if not line:
310 raise TokenError, ("EOF in multi-line string", strstart)
311 endmatch = endprog.match(line)
312 if endmatch:
313 pos = end = endmatch.end(0)
314 yield (STRING, contstr + line[:end],
315 strstart, (lnum, end), contline + line)
316 contstr, needcont = '', 0
317 contline = None
318 elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
319 yield (ERRORTOKEN, contstr + line,
320 strstart, (lnum, len(line)), contline)
321 contstr = ''
322 contline = None
323 continue
324 else:
325 contstr = contstr + line
326 contline = contline + line
327 continue
328
329 elif parenlev == 0 and not continued: # new statement
330 if not line: break
331 column = 0
332 while pos < max: # measure leading whitespace
333 if line[pos] == ' ':
334 column += 1
335 elif line[pos] == '\t':
336 column = (column//tabsize + 1)*tabsize
337 elif line[pos] == '\f':
338 column = 0
339 else:
340 break
341 pos += 1
342 if pos == max:
343 break
344
345 if line[pos] in '#\r\n': # skip comments or blank lines
346 if line[pos] == '#':
347 comment_token = line[pos:].rstrip('\r\n')
348 nl_pos = pos + len(comment_token)
349 yield (COMMENT, comment_token,
350 (lnum, pos), (lnum, pos + len(comment_token)), line)
351 yield (NEWLINE, line[nl_pos:],
352 (lnum, nl_pos), (lnum, len(line)), line)
353 else:
354 yield (NEWLINE, line[pos:],
355 (lnum, pos), (lnum, len(line)), line)
356 continue
357
358 if column > indents[-1]: # count indents or dedents
359 indents.append(column)
360 yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
361 while column < indents[-1]:
362 if column not in indents:
363 raise IndentationError(
364 "unindent does not match any outer indentation level",
365 ("<tokenize>", lnum, pos, line))
366 indents = indents[:-1]
367 yield (DEDENT, '', (lnum, pos), (lnum, pos), line)
368
369 else: # continued statement
370 if not line:
371 raise TokenError, ("EOF in multi-line statement", (lnum, 0))
372 continued = 0
373
374 while pos < max:
375 pseudomatch = pseudoprog.match(line, pos)
376 if pseudomatch: # scan for tokens
377 start, end = pseudomatch.span(1)
378 spos, epos, pos = (lnum, start), (lnum, end), end
379 token, initial = line[start:end], line[start]
380
381 if initial in numchars or \
382 (initial == '.' and token != '.'): # ordinary number
383 yield (NUMBER, token, spos, epos, line)
384 elif initial in '\r\n':
385 yield (NL if parenlev > 0 else NEWLINE,
386 token, spos, epos, line)
387 elif initial == '#':
388 assert not token.endswith("\n")
389 yield (COMMENT, token, spos, epos, line)
390 elif token in triple_quoted:
391 endprog = endprogs[token]
392 endmatch = endprog.match(line, pos)
393 if endmatch: # all on one line
394 pos = endmatch.end(0)
395 token = line[start:pos]
396 yield (STRING, token, spos, (lnum, pos), line)
397 else:
398 strstart = (lnum, start) # multiple lines
399 contstr = line[start:]
400 contline = line
401 break
402 elif initial in single_quoted or \
403 token[:2] in single_quoted or \
404 token[:3] in single_quoted:
405 if token[-1] == '\n': # continued string
406 strstart = (lnum, start)
407 endprog = (endprogs[initial] or endprogs[token[1]] or
408 endprogs[token[2]])
409 contstr, needcont = line[start:], 1
410 contline = line
411 break
412 else: # ordinary string
413 yield (STRING, token, spos, epos, line)
414 elif initial in namechars: # ordinary name
415 yield (NAME, token, spos, epos, line)
416 elif initial == '\\': # continued stmt
417 continued = 1
418 else:
419 if initial in '([{':
420 parenlev += 1
421 elif initial in ')]}':
422 parenlev -= 1
423 yield (OP, token, spos, epos, line)
424 else:
425 yield (ERRORTOKEN, line[pos],
426 (lnum, pos), (lnum, pos+1), line)
427 pos += 1
428
429 for indent in indents[1:]: # pop remaining indent levels
430 yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
431 yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
432
433 if __name__ == '__main__': # testing
434 import sys
435 if len(sys.argv) > 1:
436 tokenize(open(sys.argv[1]).readline)
437 else:
438 tokenize(sys.stdin.readline)
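A quick way to see the issue #17061 patch above in action: at statement level, a comment-only or blank line now ends in a NEWLINE token rather than the stock library's NL. A minimal sketch (Python 2, to match the module above; assumes it is importable as _tokenize_py2):

from io import StringIO
from _tokenize_py2 import generate_tokens, tok_name

src = u"# just a comment\nx = 1\n"
for tok in generate_tokens(StringIO(src).readline):
    print("%-8s %r" % (tok_name[tok[0]], tok[1]))
# COMMENT  u'# just a comment'
# NEWLINE  u'\n'     <- stock tokenize emits NL here
# NAME     u'x'
# ...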
@@ -0,0 +1,574 b''
1 """Patched version of standard library tokenize, to deal with various bugs.
2
3 Based on Python 3.2 code.
4
5 Patches:
6
7 - Gareth Rees' patch for Python issue #12691 (untokenizing)
8 - Except we don't encode the output of untokenize
9 - Python 2 compatible syntax, so that it can be byte-compiled at installation
10 - Newlines in comments and blank lines should be either NL or NEWLINE, depending
11 on whether they are in a multi-line statement. Filed as Python issue #17061.
12 - Export generate_tokens & TokenError
13
14 ------------------------------------------------------------------------------
15 Tokenization help for Python programs.
16
17 tokenize(readline) is a generator that breaks a stream of bytes into
18 Python tokens. It decodes the bytes according to PEP-0263 for
19 determining source file encoding.
20
21 It accepts a readline-like method which is called repeatedly to get the
22 next line of input (or b"" for EOF). It generates 5-tuples with these
23 members:
24
25 the token type (see token.py)
26 the token (a string)
27 the starting (row, column) indices of the token (a 2-tuple of ints)
28 the ending (row, column) indices of the token (a 2-tuple of ints)
29 the original line (string)
30
31 It is designed to match the working of the Python tokenizer exactly, except
32 that it produces COMMENT tokens for comments and gives type OP for all
33 operators. Additionally, all token lists start with an ENCODING token
34 which tells you which encoding was used to decode the bytes stream.
35 """
36 from __future__ import absolute_import
37
38 __author__ = 'Ka-Ping Yee <ping@lfw.org>'
39 __credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
40 'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
41 'Michael Foord')
42 import builtins
43 import re
44 import sys
45 from token import *
46 from codecs import lookup, BOM_UTF8
47 import collections
48 from io import TextIOWrapper
49 cookie_re = re.compile("coding[:=]\s*([-\w.]+)")
50
51 import token
52 __all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
53 "NL", "untokenize", "ENCODING", "TokenInfo"]
54 del token
55
56 __all__ += ["generate_tokens", "TokenError"]
57
58 COMMENT = N_TOKENS
59 tok_name[COMMENT] = 'COMMENT'
60 NL = N_TOKENS + 1
61 tok_name[NL] = 'NL'
62 ENCODING = N_TOKENS + 2
63 tok_name[ENCODING] = 'ENCODING'
64 N_TOKENS += 3
65
66 class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
67 def __repr__(self):
68 annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
69 return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
70 self._replace(type=annotated_type))
71
72 def group(*choices): return '(' + '|'.join(choices) + ')'
73 def any(*choices): return group(*choices) + '*'
74 def maybe(*choices): return group(*choices) + '?'
75
76 # Note: we use unicode matching for names ("\w") but ascii matching for
77 # number literals.
78 Whitespace = r'[ \f\t]*'
79 Comment = r'#[^\r\n]*'
80 Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
81 Name = r'\w+'
82
83 Hexnumber = r'0[xX][0-9a-fA-F]+'
84 Binnumber = r'0[bB][01]+'
85 Octnumber = r'0[oO][0-7]+'
86 Decnumber = r'(?:0+|[1-9][0-9]*)'
87 Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
88 Exponent = r'[eE][-+]?[0-9]+'
89 Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
90 Expfloat = r'[0-9]+' + Exponent
91 Floatnumber = group(Pointfloat, Expfloat)
92 Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
93 Number = group(Imagnumber, Floatnumber, Intnumber)
94
95 # Tail end of ' string.
96 Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
97 # Tail end of " string.
98 Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
99 # Tail end of ''' string.
100 Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
101 # Tail end of """ string.
102 Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
103 Triple = group("[bB]?[rR]?'''", '[bB]?[rR]?"""')
104 # Single-line ' or " string.
105 String = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
106 r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
107
108 # Because of leftmost-then-longest match semantics, be sure to put the
109 # longest operators first (e.g., if = came before ==, == would get
110 # recognized as two instances of =).
111 Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
112 r"//=?", r"->",
113 r"[+\-*/%&|^=<>]=?",
114 r"~")
115
116 Bracket = '[][(){}]'
117 Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
118 Funny = group(Operator, Bracket, Special)
119
120 PlainToken = group(Number, Funny, String, Name)
121 Token = Ignore + PlainToken
122
123 # First (or only) line of ' or " string.
124 ContStr = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
125 group("'", r'\\\r?\n'),
126 r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
127 group('"', r'\\\r?\n'))
128 PseudoExtras = group(r'\\\r?\n', Comment, Triple)
129 PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
130
131 def _compile(expr):
132 return re.compile(expr, re.UNICODE)
133
134 tokenprog, pseudoprog, single3prog, double3prog = map(
135 _compile, (Token, PseudoToken, Single3, Double3))
136 endprogs = {"'": _compile(Single), '"': _compile(Double),
137 "'''": single3prog, '"""': double3prog,
138 "r'''": single3prog, 'r"""': double3prog,
139 "b'''": single3prog, 'b"""': double3prog,
140 "br'''": single3prog, 'br"""': double3prog,
141 "R'''": single3prog, 'R"""': double3prog,
142 "B'''": single3prog, 'B"""': double3prog,
143 "bR'''": single3prog, 'bR"""': double3prog,
144 "Br'''": single3prog, 'Br"""': double3prog,
145 "BR'''": single3prog, 'BR"""': double3prog,
146 'r': None, 'R': None, 'b': None, 'B': None}
147
148 triple_quoted = {}
149 for t in ("'''", '"""',
150 "r'''", 'r"""', "R'''", 'R"""',
151 "b'''", 'b"""', "B'''", 'B"""',
152 "br'''", 'br"""', "Br'''", 'Br"""',
153 "bR'''", 'bR"""', "BR'''", 'BR"""'):
154 triple_quoted[t] = t
155 single_quoted = {}
156 for t in ("'", '"',
157 "r'", 'r"', "R'", 'R"',
158 "b'", 'b"', "B'", 'B"',
159 "br'", 'br"', "Br'", 'Br"',
160 "bR'", 'bR"', "BR'", 'BR"' ):
161 single_quoted[t] = t
162
163 del _compile
164
165 tabsize = 8
166
167 class TokenError(Exception): pass
168
169 class StopTokenizing(Exception): pass
170
171
172 class Untokenizer:
173
174 def __init__(self):
175 self.tokens = []
176 self.prev_row = 1
177 self.prev_col = 0
178 self.encoding = 'utf-8'
179
180 def add_whitespace(self, tok_type, start):
181 row, col = start
182 assert row >= self.prev_row
183 col_offset = col - self.prev_col
184 if col_offset > 0:
185 self.tokens.append(" " * col_offset)
186 elif row > self.prev_row and tok_type not in (NEWLINE, NL, ENDMARKER):
187 # Line was backslash-continued.
188 self.tokens.append(" ")
189
190 def untokenize(self, tokens):
191 iterable = iter(tokens)
192 for t in iterable:
193 if len(t) == 2:
194 self.compat(t, iterable)
195 break
196 tok_type, token, start, end = t[:4]
197 if tok_type == ENCODING:
198 self.encoding = token
199 continue
200 self.add_whitespace(tok_type, start)
201 self.tokens.append(token)
202 self.prev_row, self.prev_col = end
203 if tok_type in (NEWLINE, NL):
204 self.prev_row += 1
205 self.prev_col = 0
206 return "".join(self.tokens)
207
208 def compat(self, token, iterable):
209 # This import is here to avoid problems when the itertools
210 # module is not built yet and tokenize is imported.
211 from itertools import chain
212 startline = False
213 prevstring = False
214 indents = []
215 toks_append = self.tokens.append
216
217 for tok in chain([token], iterable):
218 toknum, tokval = tok[:2]
219 if toknum == ENCODING:
220 self.encoding = tokval
221 continue
222
223 if toknum in (NAME, NUMBER):
224 tokval += ' '
225
226 # Insert a space between two consecutive strings
227 if toknum == STRING:
228 if prevstring:
229 tokval = ' ' + tokval
230 prevstring = True
231 else:
232 prevstring = False
233
234 if toknum == INDENT:
235 indents.append(tokval)
236 continue
237 elif toknum == DEDENT:
238 indents.pop()
239 continue
240 elif toknum in (NEWLINE, NL):
241 startline = True
242 elif startline and indents:
243 toks_append(indents[-1])
244 startline = False
245 toks_append(tokval)
246
247
248 def untokenize(tokens):
249 """
250 Convert ``tokens`` (an iterable) back into Python source code, returned
251 as a string. (Unlike the standard library version, this patched copy
252 does not encode its output; any ENCODING token is read but skipped.)
253
254 The result is guaranteed to tokenize back to match the input so that
255 the conversion is lossless and round-trips are assured. The
256 guarantee applies only to the token type and token string as the
257 spacing between tokens (column positions) may change.
258
259 :func:`untokenize` has two modes. If the input tokens are sequences
260 of length 2 (``type``, ``string``) then spaces are added as necessary to
261 preserve the round-trip property.
262
263 If the input tokens are sequences of length 4 or more (``type``,
264 ``string``, ``start``, ``end``), as returned by :func:`tokenize`, then
265 spaces are added so that each token appears in the result at the
266 position indicated by ``start`` and ``end``, if possible.
267 """
268 return Untokenizer().untokenize(tokens)
269
270
271 def _get_normal_name(orig_enc):
272 """Imitates get_normal_name in tokenizer.c."""
273 # Only care about the first 12 characters.
274 enc = orig_enc[:12].lower().replace("_", "-")
275 if enc == "utf-8" or enc.startswith("utf-8-"):
276 return "utf-8"
277 if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
278 enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
279 return "iso-8859-1"
280 return orig_enc
281
282 def detect_encoding(readline):
283 """
284 The detect_encoding() function is used to detect the encoding that should
285 be used to decode a Python source file. It requires one argument, readline,
286 in the same way as the tokenize() generator.
287
288 It will call readline a maximum of twice, and return the encoding used
289 (as a string) and a list of any lines (left as bytes) it has read in.
290
291 It detects the encoding from the presence of a utf-8 bom or an encoding
292 cookie as specified in pep-0263. If both a bom and a cookie are present,
293 but disagree, a SyntaxError will be raised. If the encoding cookie is an
294 invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,
295 'utf-8-sig' is returned.
296
297 If no encoding is specified, then the default of 'utf-8' will be returned.
298 """
299 bom_found = False
300 encoding = None
301 default = 'utf-8'
302 def read_or_stop():
303 try:
304 return readline()
305 except StopIteration:
306 return b''
307
308 def find_cookie(line):
309 try:
310 # Decode as UTF-8. Either the line is an encoding declaration,
311 # in which case it should be pure ASCII, or it must be UTF-8
312 # per default encoding.
313 line_string = line.decode('utf-8')
314 except UnicodeDecodeError:
315 raise SyntaxError("invalid or missing encoding declaration")
316
317 matches = cookie_re.findall(line_string)
318 if not matches:
319 return None
320 encoding = _get_normal_name(matches[0])
321 try:
322 codec = lookup(encoding)
323 except LookupError:
324 # This behaviour mimics the Python interpreter
325 raise SyntaxError("unknown encoding: " + encoding)
326
327 if bom_found:
328 if encoding != 'utf-8':
329 # This behaviour mimics the Python interpreter
330 raise SyntaxError('encoding problem: utf-8')
331 encoding += '-sig'
332 return encoding
333
334 first = read_or_stop()
335 if first.startswith(BOM_UTF8):
336 bom_found = True
337 first = first[3:]
338 default = 'utf-8-sig'
339 if not first:
340 return default, []
341
342 encoding = find_cookie(first)
343 if encoding:
344 return encoding, [first]
345
346 second = read_or_stop()
347 if not second:
348 return default, [first]
349
350 encoding = find_cookie(second)
351 if encoding:
352 return encoding, [first, second]
353
354 return default, [first, second]
355
356
357 def open(filename):
358 """Open a file in read only mode using the encoding detected by
359 detect_encoding().
360 """
361 buffer = builtins.open(filename, 'rb')
362 encoding, lines = detect_encoding(buffer.readline)
363 buffer.seek(0)
364 text = TextIOWrapper(buffer, encoding, line_buffering=True)
365 text.mode = 'r'
366 return text
367
368
369 def tokenize(readline):
370 """
371 The tokenize() generator requires one argument, readline, which
372 must be a callable object which provides the same interface as the
373 readline() method of built-in file objects. Each call to the function
374 should return one line of input as bytes. Alternately, readline
375 can be a callable function terminating with StopIteration:
376 readline = open(myfile, 'rb').__next__ # Example of alternate readline
377
378 The generator produces 5-tuples with these members: the token type; the
379 token string; a 2-tuple (srow, scol) of ints specifying the row and
380 column where the token begins in the source; a 2-tuple (erow, ecol) of
381 ints specifying the row and column where the token ends in the source;
382 and the line on which the token was found. The line passed is the
383 logical line; continuation lines are included.
384
385 The first token sequence will always be an ENCODING token
386 which tells you which encoding was used to decode the bytes stream.
387 """
388 # This import is here to avoid problems when the itertools module is not
389 # built yet and tokenize is imported.
390 from itertools import chain, repeat
391 encoding, consumed = detect_encoding(readline)
392 rl_gen = iter(readline, b"")
393 empty = repeat(b"")
394 return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding)
395
396
397 def _tokenize(readline, encoding):
398 lnum = parenlev = continued = 0
399 numchars = '0123456789'
400 contstr, needcont = '', 0
401 contline = None
402 indents = [0]
403
404 if encoding is not None:
405 if encoding == "utf-8-sig":
406 # BOM will already have been stripped.
407 encoding = "utf-8"
408 yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
409 while True: # loop over lines in stream
410 try:
411 line = readline()
412 except StopIteration:
413 line = b''
414
415 if encoding is not None:
416 line = line.decode(encoding)
417 lnum += 1
418 pos, max = 0, len(line)
419
420 if contstr: # continued string
421 if not line:
422 raise TokenError("EOF in multi-line string", strstart)
423 endmatch = endprog.match(line)
424 if endmatch:
425 pos = end = endmatch.end(0)
426 yield TokenInfo(STRING, contstr + line[:end],
427 strstart, (lnum, end), contline + line)
428 contstr, needcont = '', 0
429 contline = None
430 elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
431 yield TokenInfo(ERRORTOKEN, contstr + line,
432 strstart, (lnum, len(line)), contline)
433 contstr = ''
434 contline = None
435 continue
436 else:
437 contstr = contstr + line
438 contline = contline + line
439 continue
440
441 elif parenlev == 0 and not continued: # new statement
442 if not line: break
443 column = 0
444 while pos < max: # measure leading whitespace
445 if line[pos] == ' ':
446 column += 1
447 elif line[pos] == '\t':
448 column = (column//tabsize + 1)*tabsize
449 elif line[pos] == '\f':
450 column = 0
451 else:
452 break
453 pos += 1
454 if pos == max:
455 break
456
457 if line[pos] in '#\r\n': # skip comments or blank lines
458 if line[pos] == '#':
459 comment_token = line[pos:].rstrip('\r\n')
460 nl_pos = pos + len(comment_token)
461 yield TokenInfo(COMMENT, comment_token,
462 (lnum, pos), (lnum, pos + len(comment_token)), line)
463 yield TokenInfo(NEWLINE, line[nl_pos:],
464 (lnum, nl_pos), (lnum, len(line)), line)
465 else:
466 yield TokenInfo(NEWLINE, line[pos:],
467 (lnum, pos), (lnum, len(line)), line)
468 continue
469
470 if column > indents[-1]: # count indents or dedents
471 indents.append(column)
472 yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
473 while column < indents[-1]:
474 if column not in indents:
475 raise IndentationError(
476 "unindent does not match any outer indentation level",
477 ("<tokenize>", lnum, pos, line))
478 indents = indents[:-1]
479 yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)
480
481 else: # continued statement
482 if not line:
483 raise TokenError("EOF in multi-line statement", (lnum, 0))
484 continued = 0
485
486 while pos < max:
487 pseudomatch = pseudoprog.match(line, pos)
488 if pseudomatch: # scan for tokens
489 start, end = pseudomatch.span(1)
490 spos, epos, pos = (lnum, start), (lnum, end), end
491 token, initial = line[start:end], line[start]
492
493 if (initial in numchars or # ordinary number
494 (initial == '.' and token != '.' and token != '...')):
495 yield TokenInfo(NUMBER, token, spos, epos, line)
496 elif initial in '\r\n':
497 yield TokenInfo(NL if parenlev > 0 else NEWLINE,
498 token, spos, epos, line)
499 elif initial == '#':
500 assert not token.endswith("\n")
501 yield TokenInfo(COMMENT, token, spos, epos, line)
502 elif token in triple_quoted:
503 endprog = endprogs[token]
504 endmatch = endprog.match(line, pos)
505 if endmatch: # all on one line
506 pos = endmatch.end(0)
507 token = line[start:pos]
508 yield TokenInfo(STRING, token, spos, (lnum, pos), line)
509 else:
510 strstart = (lnum, start) # multiple lines
511 contstr = line[start:]
512 contline = line
513 break
514 elif initial in single_quoted or \
515 token[:2] in single_quoted or \
516 token[:3] in single_quoted:
517 if token[-1] == '\n': # continued string
518 strstart = (lnum, start)
519 endprog = (endprogs[initial] or endprogs[token[1]] or
520 endprogs[token[2]])
521 contstr, needcont = line[start:], 1
522 contline = line
523 break
524 else: # ordinary string
525 yield TokenInfo(STRING, token, spos, epos, line)
526 elif initial.isidentifier(): # ordinary name
527 yield TokenInfo(NAME, token, spos, epos, line)
528 elif initial == '\\': # continued stmt
529 continued = 1
530 else:
531 if initial in '([{':
532 parenlev += 1
533 elif initial in ')]}':
534 parenlev -= 1
535 yield TokenInfo(OP, token, spos, epos, line)
536 else:
537 yield TokenInfo(ERRORTOKEN, line[pos],
538 (lnum, pos), (lnum, pos+1), line)
539 pos += 1
540
541 for indent in indents[1:]: # pop remaining indent levels
542 yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
543 yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
544
545
546 # An undocumented, backwards compatible, API for all the places in the standard
547 # library that expect to be able to use tokenize with strings
548 def generate_tokens(readline):
549 return _tokenize(readline, None)
550
551 if __name__ == "__main__":
552 # Quick sanity check
553 s = b'''def parseline(self, line):
554 """Parse the line into a command name and a string containing
555 the arguments. Returns a tuple containing (command, args, line).
556 'command' and 'args' may be None if the line couldn't be parsed.
557 """
558 line = line.strip()
559 if not line:
560 return None, None, line
561 elif line[0] == '?':
562 line = 'help ' + line[1:]
563 elif line[0] == '!':
564 if hasattr(self, 'do_shell'):
565 line = 'shell ' + line[1:]
566 else:
567 return None, None, line
568 i, n = 0, len(line)
569 while i < n and line[i] in self.identchars: i = i+1
570 cmd, arg = line[:i], line[i:].strip()
571 return cmd, arg, line
572 '''
573 for tok in tokenize(iter(s.splitlines()).__next__):
574 print(tok)
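To see the ENCODING token and the PEP 263 handling described in the docstrings above, a minimal sketch (Python 3; assumes the module above is importable as _tokenize_py3):

import io
from _tokenize_py3 import tokenize, tok_name

src = b"# coding: latin-1\nname = 'caf\xe9'\n"
for tok in tokenize(io.BytesIO(src).readline):
    print("%-8s %r" % (tok_name[tok.type], tok.string))
# ENCODING 'iso-8859-1'
# COMMENT  '# coding: latin-1'
# NEWLINE  '\n'
# NAME     'name'
# ...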
@@ -0,0 +1,9 b''
1 """Load our patched versions of tokenize.
2 """
3
4 import sys
5
6 if sys.version_info[0] >= 3:
7 from ._tokenize_py3 import *
8 else:
9 from ._tokenize_py2 import *
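The two untokenize() modes documented in the files above, in one minimal sketch (the IPython.utils.tokenize2 path is the one the rest of this changeset uses):

from io import StringIO
from IPython.utils.tokenize2 import generate_tokens, untokenize

src = u"if x:\n    y = (1 +\n         2)\n"
full = list(generate_tokens(StringIO(src).readline))

# Full 5-tuples: spacing is rebuilt from the start/end positions,
# so the source round-trips exactly.
assert untokenize(full) == src

# (type, string) pairs only: spacing is approximate, but re-tokenizing
# the output gives back the same (type, string) sequence.
pairs = [tok[:2] for tok in full]
rebuilt = untokenize(pairs)
assert [t[:2] for t in generate_tokens(StringIO(rebuilt).readline)] == pairs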
@@ -1,441 +1,434 b''
1 import abc
1 import abc
2 import functools
2 import functools
3 import re
3 import re
4 from StringIO import StringIO
4 from StringIO import StringIO
5 import tokenize
6
7 try:
8 generate_tokens = tokenize.generate_tokens
9 except AttributeError:
10 # Python 3. Note that we use the undocumented _tokenize because it expects
11 # strings, not bytes. See also Python issue #9969.
12 generate_tokens = tokenize._tokenize
13
5
14 from IPython.core.splitinput import split_user_input, LineInfo
6 from IPython.core.splitinput import split_user_input, LineInfo
15 from IPython.utils.untokenize import untokenize
7 from IPython.utils import tokenize2
8 from IPython.utils.tokenize2 import generate_tokens, untokenize, TokenError
16
9
17 #-----------------------------------------------------------------------------
10 #-----------------------------------------------------------------------------
18 # Globals
11 # Globals
19 #-----------------------------------------------------------------------------
12 #-----------------------------------------------------------------------------
20
13
21 # The escape sequences that define the syntax transformations IPython will
14 # The escape sequences that define the syntax transformations IPython will
22 # apply to user input. These can NOT be just changed here: many regular
15 # apply to user input. These can NOT be just changed here: many regular
23 # expressions and other parts of the code may use their hardcoded values, and
16 # expressions and other parts of the code may use their hardcoded values, and
24 # for all intents and purposes they constitute the 'IPython syntax', so they
17 # for all intents and purposes they constitute the 'IPython syntax', so they
25 # should be considered fixed.
18 # should be considered fixed.
26
19
27 ESC_SHELL = '!' # Send line to underlying system shell
20 ESC_SHELL = '!' # Send line to underlying system shell
28 ESC_SH_CAP = '!!' # Send line to system shell and capture output
21 ESC_SH_CAP = '!!' # Send line to system shell and capture output
29 ESC_HELP = '?' # Find information about object
22 ESC_HELP = '?' # Find information about object
30 ESC_HELP2 = '??' # Find extra-detailed information about object
23 ESC_HELP2 = '??' # Find extra-detailed information about object
31 ESC_MAGIC = '%' # Call magic function
24 ESC_MAGIC = '%' # Call magic function
32 ESC_MAGIC2 = '%%' # Call cell-magic function
25 ESC_MAGIC2 = '%%' # Call cell-magic function
33 ESC_QUOTE = ',' # Split args on whitespace, quote each as string and call
26 ESC_QUOTE = ',' # Split args on whitespace, quote each as string and call
34 ESC_QUOTE2 = ';' # Quote all args as a single string, call
27 ESC_QUOTE2 = ';' # Quote all args as a single string, call
35 ESC_PAREN = '/' # Call first argument with rest of line as arguments
28 ESC_PAREN = '/' # Call first argument with rest of line as arguments
36
29
37 ESC_SEQUENCES = [ESC_SHELL, ESC_SH_CAP, ESC_HELP ,\
30 ESC_SEQUENCES = [ESC_SHELL, ESC_SH_CAP, ESC_HELP ,\
38 ESC_HELP2, ESC_MAGIC, ESC_MAGIC2,\
31 ESC_HELP2, ESC_MAGIC, ESC_MAGIC2,\
39 ESC_QUOTE, ESC_QUOTE2, ESC_PAREN ]
32 ESC_QUOTE, ESC_QUOTE2, ESC_PAREN ]
40
33
41
34
42 class InputTransformer(object):
35 class InputTransformer(object):
43 """Abstract base class for line-based input transformers."""
36 """Abstract base class for line-based input transformers."""
44 __metaclass__ = abc.ABCMeta
37 __metaclass__ = abc.ABCMeta
45
38
46 @abc.abstractmethod
39 @abc.abstractmethod
47 def push(self, line):
40 def push(self, line):
48 """Send a line of input to the transformer, returning the transformed
41 """Send a line of input to the transformer, returning the transformed
49 input or None if the transformer is waiting for more input.
42 input or None if the transformer is waiting for more input.
50
43
51 Must be overridden by subclasses.
44 Must be overridden by subclasses.
52 """
45 """
53 pass
46 pass
54
47
55 @abc.abstractmethod
48 @abc.abstractmethod
56 def reset(self):
49 def reset(self):
57 """Return, transformed any lines that the transformer has accumulated,
50 """Return, transformed any lines that the transformer has accumulated,
58 and reset its internal state.
51 and reset its internal state.
59
52
60 Must be overridden by subclasses.
53 Must be overridden by subclasses.
61 """
54 """
62 pass
55 pass
63
56
64 @classmethod
57 @classmethod
65 def wrap(cls, func):
58 def wrap(cls, func):
66 """Can be used by subclasses as a decorator, to return a factory that
59 """Can be used by subclasses as a decorator, to return a factory that
67 will allow instantiation with the decorated object.
60 will allow instantiation with the decorated object.
68 """
61 """
69 @functools.wraps(func)
62 @functools.wraps(func)
70 def transformer_factory():
63 def transformer_factory():
71 return cls(func)
64 return cls(func)
72
65
73 return transformer_factory
66 return transformer_factory
74
67
75 class StatelessInputTransformer(InputTransformer):
68 class StatelessInputTransformer(InputTransformer):
76 """Wrapper for a stateless input transformer implemented as a function."""
69 """Wrapper for a stateless input transformer implemented as a function."""
77 def __init__(self, func):
70 def __init__(self, func):
78 self.func = func
71 self.func = func
79
72
80 def __repr__(self):
73 def __repr__(self):
81 return "StatelessInputTransformer(func={!r})".format(self.func)
74 return "StatelessInputTransformer(func={!r})".format(self.func)
82
75
83 def push(self, line):
76 def push(self, line):
84 """Send a line of input to the transformer, returning the
77 """Send a line of input to the transformer, returning the
85 transformed input."""
78 transformed input."""
86 return self.func(line)
79 return self.func(line)
87
80
88 def reset(self):
81 def reset(self):
89 """No-op - exists for compatibility."""
82 """No-op - exists for compatibility."""
90 pass
83 pass
91
84
92 class CoroutineInputTransformer(InputTransformer):
85 class CoroutineInputTransformer(InputTransformer):
93 """Wrapper for an input transformer implemented as a coroutine."""
86 """Wrapper for an input transformer implemented as a coroutine."""
94 def __init__(self, coro):
87 def __init__(self, coro):
95 # Prime it
88 # Prime it
96 self.coro = coro()
89 self.coro = coro()
97 next(self.coro)
90 next(self.coro)
98
91
99 def __repr__(self):
92 def __repr__(self):
100 return "CoroutineInputTransformer(coro={!r})".format(self.coro)
93 return "CoroutineInputTransformer(coro={!r})".format(self.coro)
101
94
102 def push(self, line):
95 def push(self, line):
103 """Send a line of input to the transformer, returning the
96 """Send a line of input to the transformer, returning the
104 transformed input or None if the transformer is waiting for more
97 transformed input or None if the transformer is waiting for more
105 input.
98 input.
106 """
99 """
107 return self.coro.send(line)
100 return self.coro.send(line)
108
101
109 def reset(self):
102 def reset(self):
110 """Return, transformed any lines that the transformer has
103 """Return, transformed any lines that the transformer has
111 accumulated, and reset its internal state.
104 accumulated, and reset its internal state.
112 """
105 """
113 return self.coro.send(None)
106 return self.coro.send(None)
114
107
115 class TokenInputTransformer(InputTransformer):
108 class TokenInputTransformer(InputTransformer):
116 """Wrapper for a token-based input transformer.
109 """Wrapper for a token-based input transformer.
117
110
118 func should accept a list of tokens (5-tuples, see tokenize docs), and
111 func should accept a list of tokens (5-tuples, see tokenize docs), and
119 return an iterable which can be passed to tokenize.untokenize().
112 return an iterable which can be passed to tokenize.untokenize().
120 """
113 """
121 def __init__(self, func):
114 def __init__(self, func):
122 self.func = func
115 self.func = func
123 self.current_line = ""
116 self.current_line = ""
124 self.line_used = False
117 self.line_used = False
125 self.reset_tokenizer()
118 self.reset_tokenizer()
126
119
127 def reset_tokenizer(self):
120 def reset_tokenizer(self):
128 self.tokenizer = generate_tokens(self.get_line)
121 self.tokenizer = generate_tokens(self.get_line)
129
122
130 def get_line(self):
123 def get_line(self):
131 if self.line_used:
124 if self.line_used:
132 raise tokenize.TokenError
125 raise TokenError
133 self.line_used = True
126 self.line_used = True
134 return self.current_line
127 return self.current_line
135
128
136 def push(self, line):
129 def push(self, line):
137 self.current_line += line + "\n"
130 self.current_line += line + "\n"
138 if self.current_line.isspace():
131 if self.current_line.isspace():
139 return self.reset()
132 return self.reset()
140
133
141 self.line_used = False
134 self.line_used = False
142 tokens = []
135 tokens = []
143 stop_at_NL = False
136 stop_at_NL = False
144 try:
137 try:
145 for intok in self.tokenizer:
138 for intok in self.tokenizer:
146 tokens.append(intok)
139 tokens.append(intok)
147 t = intok[0]
140 t = intok[0]
148 if t == tokenize.NEWLINE or (stop_at_NL and t == tokenize.NL):
141 if t == tokenize2.NEWLINE or (stop_at_NL and t == tokenize2.NL):
149 # Stop before we try to pull a line we don't have yet
142 # Stop before we try to pull a line we don't have yet
150 break
143 break
151 elif t in (tokenize.COMMENT, tokenize.ERRORTOKEN):
144 elif t == tokenize2.ERRORTOKEN:
152 stop_at_NL = True
145 stop_at_NL = True
153 except tokenize.TokenError:
146 except TokenError:
154 # Multi-line statement - stop and try again with the next line
147 # Multi-line statement - stop and try again with the next line
155 self.reset_tokenizer()
148 self.reset_tokenizer()
156 return None
149 return None
157
150
158 return self.output(tokens)
151 return self.output(tokens)
159
152
160 def output(self, tokens):
153 def output(self, tokens):
161 self.current_line = ""
154 self.current_line = ""
162 self.reset_tokenizer()
155 self.reset_tokenizer()
163 return untokenize(self.func(tokens)).rstrip('\n')
156 return untokenize(self.func(tokens)).rstrip('\n')
164
157
165 def reset(self):
158 def reset(self):
166 l = self.current_line
159 l = self.current_line
167 self.current_line = ""
160 self.current_line = ""
168 self.reset_tokenizer()
161 self.reset_tokenizer()
169 if l:
162 if l:
170 return l.rstrip('\n')
163 return l.rstrip('\n')
171
164
172 class assemble_python_lines(TokenInputTransformer):
165 class assemble_python_lines(TokenInputTransformer):
173 def __init__(self):
166 def __init__(self):
174 super(assemble_python_lines, self).__init__(None)
167 super(assemble_python_lines, self).__init__(None)
175
168
176 def output(self, tokens):
169 def output(self, tokens):
177 return self.reset()
170 return self.reset()
178
171
179 @CoroutineInputTransformer.wrap
172 @CoroutineInputTransformer.wrap
180 def assemble_logical_lines():
173 def assemble_logical_lines():
181 """Join lines following explicit line continuations (\)"""
174 """Join lines following explicit line continuations (\)"""
182 line = ''
175 line = ''
183 while True:
176 while True:
184 line = (yield line)
177 line = (yield line)
185 if not line or line.isspace():
178 if not line or line.isspace():
186 continue
179 continue
187
180
188 parts = []
181 parts = []
189 while line is not None:
182 while line is not None:
190 parts.append(line.rstrip('\\'))
183 parts.append(line.rstrip('\\'))
191 if not line.endswith('\\'):
184 if not line.endswith('\\'):
192 break
185 break
193 line = (yield None)
186 line = (yield None)
194
187
195 # Output
188 # Output
196 line = ' '.join(parts)
189 line = ' '.join(parts)
197
190
198 # Utilities
191 # Utilities
199 def _make_help_call(target, esc, lspace, next_input=None):
192 def _make_help_call(target, esc, lspace, next_input=None):
200 """Prepares a pinfo(2)/psearch call from a target name and the escape
193 """Prepares a pinfo(2)/psearch call from a target name and the escape
201 (i.e. ? or ??)"""
194 (i.e. ? or ??)"""
202 method = 'pinfo2' if esc == '??' \
195 method = 'pinfo2' if esc == '??' \
203 else 'psearch' if '*' in target \
196 else 'psearch' if '*' in target \
204 else 'pinfo'
197 else 'pinfo'
205 arg = " ".join([method, target])
198 arg = " ".join([method, target])
206 if next_input is None:
199 if next_input is None:
207 return '%sget_ipython().magic(%r)' % (lspace, arg)
200 return '%sget_ipython().magic(%r)' % (lspace, arg)
208 else:
201 else:
209 return '%sget_ipython().set_next_input(%r);get_ipython().magic(%r)' % \
202 return '%sget_ipython().set_next_input(%r);get_ipython().magic(%r)' % \
210 (lspace, next_input, arg)
203 (lspace, next_input, arg)
211
204
212 # These define the transformations for the different escape characters.
205 # These define the transformations for the different escape characters.
213 def _tr_system(line_info):
206 def _tr_system(line_info):
214 "Translate lines escaped with: !"
207 "Translate lines escaped with: !"
215 cmd = line_info.line.lstrip().lstrip(ESC_SHELL)
208 cmd = line_info.line.lstrip().lstrip(ESC_SHELL)
216 return '%sget_ipython().system(%r)' % (line_info.pre, cmd)
209 return '%sget_ipython().system(%r)' % (line_info.pre, cmd)
217
210
218 def _tr_system2(line_info):
211 def _tr_system2(line_info):
219 "Translate lines escaped with: !!"
212 "Translate lines escaped with: !!"
220 cmd = line_info.line.lstrip()[2:]
213 cmd = line_info.line.lstrip()[2:]
221 return '%sget_ipython().getoutput(%r)' % (line_info.pre, cmd)
214 return '%sget_ipython().getoutput(%r)' % (line_info.pre, cmd)
222
215
223 def _tr_help(line_info):
216 def _tr_help(line_info):
224 "Translate lines escaped with: ?/??"
217 "Translate lines escaped with: ?/??"
225 # A naked help line should just fire the intro help screen
218 # A naked help line should just fire the intro help screen
226 if not line_info.line[1:]:
219 if not line_info.line[1:]:
227 return 'get_ipython().show_usage()'
220 return 'get_ipython().show_usage()'
228
221
229 return _make_help_call(line_info.ifun, line_info.esc, line_info.pre)
222 return _make_help_call(line_info.ifun, line_info.esc, line_info.pre)
230
223
231 def _tr_magic(line_info):
224 def _tr_magic(line_info):
232 "Translate lines escaped with: %"
225 "Translate lines escaped with: %"
233 tpl = '%sget_ipython().magic(%r)'
226 tpl = '%sget_ipython().magic(%r)'
234 cmd = ' '.join([line_info.ifun, line_info.the_rest]).strip()
227 cmd = ' '.join([line_info.ifun, line_info.the_rest]).strip()
235 return tpl % (line_info.pre, cmd)
228 return tpl % (line_info.pre, cmd)
236
229
237 def _tr_quote(line_info):
230 def _tr_quote(line_info):
238 "Translate lines escaped with: ,"
231 "Translate lines escaped with: ,"
239 return '%s%s("%s")' % (line_info.pre, line_info.ifun,
232 return '%s%s("%s")' % (line_info.pre, line_info.ifun,
240 '", "'.join(line_info.the_rest.split()) )
233 '", "'.join(line_info.the_rest.split()) )
241
234
242 def _tr_quote2(line_info):
235 def _tr_quote2(line_info):
243 "Translate lines escaped with: ;"
236 "Translate lines escaped with: ;"
244 return '%s%s("%s")' % (line_info.pre, line_info.ifun,
237 return '%s%s("%s")' % (line_info.pre, line_info.ifun,
245 line_info.the_rest)
238 line_info.the_rest)
246
239
247 def _tr_paren(line_info):
240 def _tr_paren(line_info):
248 "Translate lines escaped with: /"
241 "Translate lines escaped with: /"
249 return '%s%s(%s)' % (line_info.pre, line_info.ifun,
242 return '%s%s(%s)' % (line_info.pre, line_info.ifun,
250 ", ".join(line_info.the_rest.split()))
243 ", ".join(line_info.the_rest.split()))
251
244
252 tr = { ESC_SHELL : _tr_system,
245 tr = { ESC_SHELL : _tr_system,
253 ESC_SH_CAP : _tr_system2,
246 ESC_SH_CAP : _tr_system2,
254 ESC_HELP : _tr_help,
247 ESC_HELP : _tr_help,
255 ESC_HELP2 : _tr_help,
248 ESC_HELP2 : _tr_help,
256 ESC_MAGIC : _tr_magic,
249 ESC_MAGIC : _tr_magic,
257 ESC_QUOTE : _tr_quote,
250 ESC_QUOTE : _tr_quote,
258 ESC_QUOTE2 : _tr_quote2,
251 ESC_QUOTE2 : _tr_quote2,
259 ESC_PAREN : _tr_paren }
252 ESC_PAREN : _tr_paren }
260
253
261 @StatelessInputTransformer.wrap
254 @StatelessInputTransformer.wrap
262 def escaped_commands(line):
255 def escaped_commands(line):
263 """Transform escaped commands - %magic, !system, ?help + various autocalls.
256 """Transform escaped commands - %magic, !system, ?help + various autocalls.
264 """
257 """
265 if not line or line.isspace():
258 if not line or line.isspace():
266 return line
259 return line
267 lineinf = LineInfo(line)
260 lineinf = LineInfo(line)
268 if lineinf.esc not in tr:
261 if lineinf.esc not in tr:
269 return line
262 return line
270
263
271 return tr[lineinf.esc](lineinf)
264 return tr[lineinf.esc](lineinf)
272
265
273 _initial_space_re = re.compile(r'\s*')
266 _initial_space_re = re.compile(r'\s*')
274
267
275 _help_end_re = re.compile(r"""(%{0,2}
268 _help_end_re = re.compile(r"""(%{0,2}
276 [a-zA-Z_*][\w*]* # Variable name
269 [a-zA-Z_*][\w*]* # Variable name
277 (\.[a-zA-Z_*][\w*]*)* # .etc.etc
270 (\.[a-zA-Z_*][\w*]*)* # .etc.etc
278 )
271 )
279 (\?\??)$ # ? or ??""",
272 (\?\??)$ # ? or ??""",
280 re.VERBOSE)
273 re.VERBOSE)
281
274
282 def has_comment(src):
275 def has_comment(src):
283 """Indicate whether an input line has (i.e. ends in, or is) a comment.
276 """Indicate whether an input line has (i.e. ends in, or is) a comment.
284
277
285 This uses tokenize, so it can distinguish comments from # inside strings.
278 This uses tokenize, so it can distinguish comments from # inside strings.
286
279
287 Parameters
280 Parameters
288 ----------
281 ----------
289 src : string
282 src : string
290 A single line input string.
283 A single line input string.
291
284
292 Returns
285 Returns
293 -------
286 -------
294 comment : bool
287 comment : bool
295 True if source has a comment.
288 True if source has a comment.
296 """
289 """
297 readline = StringIO(src).readline
290 readline = StringIO(src).readline
298 toktypes = set()
291 toktypes = set()
299 try:
292 try:
300 for t in tokenize.generate_tokens(readline):
293 for t in generate_tokens(readline):
301 toktypes.add(t[0])
294 toktypes.add(t[0])
302 except tokenize.TokenError:
295 except TokenError:
303 pass
296 pass
304 return(tokenize.COMMENT in toktypes)
297 return(tokenize2.COMMENT in toktypes)
305
298
306
299
307 @StatelessInputTransformer.wrap
300 @StatelessInputTransformer.wrap
308 def help_end(line):
301 def help_end(line):
309 """Translate lines with ?/?? at the end"""
302 """Translate lines with ?/?? at the end"""
310 m = _help_end_re.search(line)
303 m = _help_end_re.search(line)
311 if m is None or has_comment(line):
304 if m is None or has_comment(line):
312 return line
305 return line
313 target = m.group(1)
306 target = m.group(1)
314 esc = m.group(3)
307 esc = m.group(3)
315 lspace = _initial_space_re.match(line).group(0)
308 lspace = _initial_space_re.match(line).group(0)
316
309
317 # If we're mid-command, put it back on the next prompt for the user.
310 # If we're mid-command, put it back on the next prompt for the user.
318 next_input = line.rstrip('?') if line.strip() != m.group(0) else None
311 next_input = line.rstrip('?') if line.strip() != m.group(0) else None
319
312
320 return _make_help_call(target, esc, lspace, next_input)
313 return _make_help_call(target, esc, lspace, next_input)
321
314
322
315
323 @CoroutineInputTransformer.wrap
316 @CoroutineInputTransformer.wrap
324 def cellmagic():
317 def cellmagic():
325 """Captures & transforms cell magics.
318 """Captures & transforms cell magics.
326
319
327 After a cell magic is started, this stores up any lines it gets until it is
320 After a cell magic is started, this stores up any lines it gets until it is
328 reset (sent None).
321 reset (sent None).
329 """
322 """
330 tpl = 'get_ipython().run_cell_magic(%r, %r, %r)'
323 tpl = 'get_ipython().run_cell_magic(%r, %r, %r)'
331 cellmagic_help_re = re.compile('%%\w+\?')
324 cellmagic_help_re = re.compile('%%\w+\?')
332 line = ''
325 line = ''
333 while True:
326 while True:
334 line = (yield line)
327 line = (yield line)
335 if (not line) or (not line.startswith(ESC_MAGIC2)):
328 if (not line) or (not line.startswith(ESC_MAGIC2)):
336 continue
329 continue
337
330
338 if cellmagic_help_re.match(line):
331 if cellmagic_help_re.match(line):
339 # This case will be handled by help_end
332 # This case will be handled by help_end
340 continue
333 continue
341
334
342 first = line
335 first = line
343 body = []
336 body = []
344 line = (yield None)
337 line = (yield None)
345 while (line is not None) and (line.strip() != ''):
338 while (line is not None) and (line.strip() != ''):
346 body.append(line)
339 body.append(line)
347 line = (yield None)
340 line = (yield None)
348
341
349 # Output
342 # Output
350 magic_name, _, first = first.partition(' ')
343 magic_name, _, first = first.partition(' ')
351 magic_name = magic_name.lstrip(ESC_MAGIC2)
344 magic_name = magic_name.lstrip(ESC_MAGIC2)
352 line = tpl % (magic_name, first, u'\n'.join(body))
345 line = tpl % (magic_name, first, u'\n'.join(body))
353
346
354
347
355 def _strip_prompts(prompt1_re, prompt2_re):
348 def _strip_prompts(prompt1_re, prompt2_re):
356 """Remove matching input prompts from a block of input."""
349 """Remove matching input prompts from a block of input."""
357 line = ''
350 line = ''
358 while True:
351 while True:
359 line = (yield line)
352 line = (yield line)
360
353
361 if line is None:
354 if line is None:
362 continue
355 continue
363
356
364 m = prompt1_re.match(line)
357 m = prompt1_re.match(line)
365 if m:
358 if m:
366 while m:
359 while m:
367 line = (yield line[len(m.group(0)):])
360 line = (yield line[len(m.group(0)):])
368 if line is None:
361 if line is None:
369 break
362 break
370 m = prompt2_re.match(line)
363 m = prompt2_re.match(line)
371 else:
364 else:
372 # Prompts not in input - wait for reset
365 # Prompts not in input - wait for reset
373 while line is not None:
366 while line is not None:
374 line = (yield line)
367 line = (yield line)
375
368
376 @CoroutineInputTransformer.wrap
369 @CoroutineInputTransformer.wrap
377 def classic_prompt():
370 def classic_prompt():
378 """Strip the >>>/... prompts of the Python interactive shell."""
371 """Strip the >>>/... prompts of the Python interactive shell."""
379 prompt1_re = re.compile(r'^(>>> )')
372 prompt1_re = re.compile(r'^(>>> )')
380 prompt2_re = re.compile(r'^(>>> |^\.\.\. )')
373 prompt2_re = re.compile(r'^(>>> |^\.\.\. )')
381 return _strip_prompts(prompt1_re, prompt2_re)
374 return _strip_prompts(prompt1_re, prompt2_re)
382
375
383 @CoroutineInputTransformer.wrap
376 @CoroutineInputTransformer.wrap
384 def ipy_prompt():
377 def ipy_prompt():
385 """Strip IPython's In [1]:/...: prompts."""
378 """Strip IPython's In [1]:/...: prompts."""
386 prompt1_re = re.compile(r'^In \[\d+\]: ')
379 prompt1_re = re.compile(r'^In \[\d+\]: ')
387 prompt2_re = re.compile(r'^(In \[\d+\]: |^\ \ \ \.\.\.+: )')
380 prompt2_re = re.compile(r'^(In \[\d+\]: |^\ \ \ \.\.\.+: )')
388 return _strip_prompts(prompt1_re, prompt2_re)
381 return _strip_prompts(prompt1_re, prompt2_re)
389
382
390
383
391 @CoroutineInputTransformer.wrap
384 @CoroutineInputTransformer.wrap
392 def leading_indent():
385 def leading_indent():
393 """Remove leading indentation.
386 """Remove leading indentation.
394
387
395 If the first line starts with a spaces or tabs, the same whitespace will be
388 If the first line starts with a spaces or tabs, the same whitespace will be
396 removed from each following line until it is reset.
389 removed from each following line until it is reset.
397 """
390 """
398 space_re = re.compile(r'^[ \t]+')
391 space_re = re.compile(r'^[ \t]+')
399 line = ''
392 line = ''
400 while True:
393 while True:
401 line = (yield line)
394 line = (yield line)
402
395
403 if line is None:
396 if line is None:
404 continue
397 continue
405
398
406 m = space_re.match(line)
399 m = space_re.match(line)
407 if m:
400 if m:
408 space = m.group(0)
401 space = m.group(0)
409 while line is not None:
402 while line is not None:
410 if line.startswith(space):
403 if line.startswith(space):
411 line = line[len(space):]
404 line = line[len(space):]
412 line = (yield line)
405 line = (yield line)
413 else:
406 else:
414 # No leading spaces - wait for reset
407 # No leading spaces - wait for reset
415 while line is not None:
408 while line is not None:
416 line = (yield line)
409 line = (yield line)
417
410
418
411
419 assign_system_re = re.compile(r'(?P<lhs>(\s*)([\w\.]+)((\s*,\s*[\w\.]+)*))'
412 assign_system_re = re.compile(r'(?P<lhs>(\s*)([\w\.]+)((\s*,\s*[\w\.]+)*))'
420 r'\s*=\s*!\s*(?P<cmd>.*)')
413 r'\s*=\s*!\s*(?P<cmd>.*)')
421 assign_system_template = '%s = get_ipython().getoutput(%r)'
414 assign_system_template = '%s = get_ipython().getoutput(%r)'
422 @StatelessInputTransformer.wrap
415 @StatelessInputTransformer.wrap
423 def assign_from_system(line):
416 def assign_from_system(line):
424 """Transform assignment from system commands (e.g. files = !ls)"""
417 """Transform assignment from system commands (e.g. files = !ls)"""
425 m = assign_system_re.match(line)
418 m = assign_system_re.match(line)
426 if m is None:
419 if m is None:
427 return line
420 return line
428
421
429 return assign_system_template % m.group('lhs', 'cmd')
422 return assign_system_template % m.group('lhs', 'cmd')
430
423
431 assign_magic_re = re.compile(r'(?P<lhs>(\s*)([\w\.]+)((\s*,\s*[\w\.]+)*))'
424 assign_magic_re = re.compile(r'(?P<lhs>(\s*)([\w\.]+)((\s*,\s*[\w\.]+)*))'
432 r'\s*=\s*%\s*(?P<cmd>.*)')
425 r'\s*=\s*%\s*(?P<cmd>.*)')
433 assign_magic_template = '%s = get_ipython().magic(%r)'
426 assign_magic_template = '%s = get_ipython().magic(%r)'
434 @StatelessInputTransformer.wrap
427 @StatelessInputTransformer.wrap
435 def assign_from_magic(line):
428 def assign_from_magic(line):
436 """Transform assignment from magic commands (e.g. a = %who_ls)"""
429 """Transform assignment from magic commands (e.g. a = %who_ls)"""
437 m = assign_magic_re.match(line)
430 m = assign_magic_re.match(line)
438 if m is None:
431 if m is None:
439 return line
432 return line
440
433
441 return assign_magic_template % m.group('lhs', 'cmd')
434 return assign_magic_template % m.group('lhs', 'cmd')
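As the TokenInputTransformer docstring above says, the wrapped function receives a list of token 5-tuples and returns tokens suitable for untokenize(). A minimal sketch (Python 2, matching the imports in this file; double_ints is an illustrative transformer, not part of the commit, and the file is assumed to be importable as IPython.core.inputtransformer):

from IPython.core.inputtransformer import TokenInputTransformer
from IPython.utils.tokenize2 import NUMBER

@TokenInputTransformer.wrap
def double_ints(tokens):
    # Double every integer literal, keeping the original positions
    # (same-width replacements keep the spacing intact).
    for tok in tokens:
        if tok[0] == NUMBER and tok[1].isdigit():
            yield (NUMBER, str(int(tok[1]) * 2)) + tuple(tok[2:])
        else:
            yield tok

t = double_ints()            # .wrap turns the function into a factory
print(t.push("x = 2 + 3"))   # -> x = 4 + 6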