Now include patched copies of tokenize for Python 2 and 3.
Thomas Kluyver -
@@ -0,0 +1,438 b''
1 """Patched version of standard library tokenize, to deal with various bugs.
2
3 Patches
4
5 - Relevant parts of Gareth Rees' patch for Python issue #12691 (untokenizing),
6 manually applied.
7 - Newlines in comments and blank lines should be either NL or NEWLINE, depending
8 on whether they are in a multi-line statement. Filed as Python issue #17061.
9
10 -------------------------------------------------------------------------------
11 Tokenization help for Python programs.
12
13 generate_tokens(readline) is a generator that breaks a stream of
14 text into Python tokens. It accepts a readline-like method which is called
15 repeatedly to get the next line of input (or "" for EOF). It generates
16 5-tuples with these members:
17
18 the token type (see token.py)
19 the token (a string)
20 the starting (row, column) indices of the token (a 2-tuple of ints)
21 the ending (row, column) indices of the token (a 2-tuple of ints)
22 the original line (string)
23
24 It is designed to match the working of the Python tokenizer exactly, except
25 that it produces COMMENT tokens for comments and gives type OP for all
26 operators.
27
28 Older entry points
29 tokenize_loop(readline, tokeneater)
30 tokenize(readline, tokeneater=printtoken)
31 are the same, except instead of generating tokens, tokeneater is a callback
32 function to which the 5 fields described above are passed as 5 arguments,
33 each time a new token is found."""
34
35 __author__ = 'Ka-Ping Yee <ping@lfw.org>'
36 __credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
37 'Skip Montanaro, Raymond Hettinger')
38
39 import string, re
40 from token import *
41
42 import token
43 __all__ = [x for x in dir(token) if not x.startswith("_")]
44 __all__ += ["COMMENT", "tokenize", "generate_tokens", "NL", "untokenize"]
45 del x
46 del token
47
48 __all__ += ["TokenError"]
49
50 COMMENT = N_TOKENS
51 tok_name[COMMENT] = 'COMMENT'
52 NL = N_TOKENS + 1
53 tok_name[NL] = 'NL'
54 N_TOKENS += 2
55
56 def group(*choices): return '(' + '|'.join(choices) + ')'
57 def any(*choices): return group(*choices) + '*'
58 def maybe(*choices): return group(*choices) + '?'
59
60 Whitespace = r'[ \f\t]*'
61 Comment = r'#[^\r\n]*'
62 Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
63 Name = r'[a-zA-Z_]\w*'
64
65 Hexnumber = r'0[xX][\da-fA-F]+[lL]?'
66 Octnumber = r'(0[oO][0-7]+)|(0[0-7]*)[lL]?'
67 Binnumber = r'0[bB][01]+[lL]?'
68 Decnumber = r'[1-9]\d*[lL]?'
69 Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
70 Exponent = r'[eE][-+]?\d+'
71 Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
72 Expfloat = r'\d+' + Exponent
73 Floatnumber = group(Pointfloat, Expfloat)
74 Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
75 Number = group(Imagnumber, Floatnumber, Intnumber)
76
77 # Tail end of ' string.
78 Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
79 # Tail end of " string.
80 Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
81 # Tail end of ''' string.
82 Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
83 # Tail end of """ string.
84 Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
85 Triple = group("[uUbB]?[rR]?'''", '[uUbB]?[rR]?"""')
86 # Single-line ' or " string.
87 String = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
88 r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
89
90 # Because of leftmost-then-longest match semantics, be sure to put the
91 # longest operators first (e.g., if = came before ==, == would get
92 # recognized as two instances of =).
93 Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
94 r"//=?",
95 r"[+\-*/%&|^=<>]=?",
96 r"~")
97
98 Bracket = '[][(){}]'
99 Special = group(r'\r?\n', r'[:;.,`@]')
100 Funny = group(Operator, Bracket, Special)
101
102 PlainToken = group(Number, Funny, String, Name)
103 Token = Ignore + PlainToken
104
105 # First (or only) line of ' or " string.
106 ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
107 group("'", r'\\\r?\n'),
108 r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
109 group('"', r'\\\r?\n'))
110 PseudoExtras = group(r'\\\r?\n', Comment, Triple)
111 PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
112
113 tokenprog, pseudoprog, single3prog, double3prog = map(
114 re.compile, (Token, PseudoToken, Single3, Double3))
115 endprogs = {"'": re.compile(Single), '"': re.compile(Double),
116 "'''": single3prog, '"""': double3prog,
117 "r'''": single3prog, 'r"""': double3prog,
118 "u'''": single3prog, 'u"""': double3prog,
119 "ur'''": single3prog, 'ur"""': double3prog,
120 "R'''": single3prog, 'R"""': double3prog,
121 "U'''": single3prog, 'U"""': double3prog,
122 "uR'''": single3prog, 'uR"""': double3prog,
123 "Ur'''": single3prog, 'Ur"""': double3prog,
124 "UR'''": single3prog, 'UR"""': double3prog,
125 "b'''": single3prog, 'b"""': double3prog,
126 "br'''": single3prog, 'br"""': double3prog,
127 "B'''": single3prog, 'B"""': double3prog,
128 "bR'''": single3prog, 'bR"""': double3prog,
129 "Br'''": single3prog, 'Br"""': double3prog,
130 "BR'''": single3prog, 'BR"""': double3prog,
131 'r': None, 'R': None, 'u': None, 'U': None,
132 'b': None, 'B': None}
133
134 triple_quoted = {}
135 for t in ("'''", '"""',
136 "r'''", 'r"""', "R'''", 'R"""',
137 "u'''", 'u"""', "U'''", 'U"""',
138 "ur'''", 'ur"""', "Ur'''", 'Ur"""',
139 "uR'''", 'uR"""', "UR'''", 'UR"""',
140 "b'''", 'b"""', "B'''", 'B"""',
141 "br'''", 'br"""', "Br'''", 'Br"""',
142 "bR'''", 'bR"""', "BR'''", 'BR"""'):
143 triple_quoted[t] = t
144 single_quoted = {}
145 for t in ("'", '"',
146 "r'", 'r"', "R'", 'R"',
147 "u'", 'u"', "U'", 'U"',
148 "ur'", 'ur"', "Ur'", 'Ur"',
149 "uR'", 'uR"', "UR'", 'UR"',
150 "b'", 'b"', "B'", 'B"',
151 "br'", 'br"', "Br'", 'Br"',
152 "bR'", 'bR"', "BR'", 'BR"' ):
153 single_quoted[t] = t
154
155 tabsize = 8
156
157 class TokenError(Exception): pass
158
159 class StopTokenizing(Exception): pass
160
161 def printtoken(type, token, srow_scol, erow_ecol, line): # for testing
162 srow, scol = srow_scol
163 erow, ecol = erow_ecol
164 print "%d,%d-%d,%d:\t%s\t%s" % \
165 (srow, scol, erow, ecol, tok_name[type], repr(token))
166
167 def tokenize(readline, tokeneater=printtoken):
168 """
169 The tokenize() function accepts two parameters: one representing the
170 input stream, and one providing an output mechanism for tokenize().
171
172 The first parameter, readline, must be a callable object which provides
173 the same interface as the readline() method of built-in file objects.
174 Each call to the function should return one line of input as a string.
175
176 The second parameter, tokeneater, must also be a callable object. It is
177 called once for each token, with five arguments, corresponding to the
178 tuples generated by generate_tokens().
179 """
180 try:
181 tokenize_loop(readline, tokeneater)
182 except StopTokenizing:
183 pass
184
185 # backwards compatible interface
186 def tokenize_loop(readline, tokeneater):
187 for token_info in generate_tokens(readline):
188 tokeneater(*token_info)
189
190 class Untokenizer:
191
192 def __init__(self):
193 self.tokens = []
194 self.prev_row = 1
195 self.prev_col = 0
196
197 def add_whitespace(self, tok_type, start):
198 row, col = start
199 assert row >= self.prev_row
200 col_offset = col - self.prev_col
201 if col_offset > 0:
202 self.tokens.append(" " * col_offset)
203 elif row > self.prev_row and tok_type not in (NEWLINE, NL, ENDMARKER):
204 # Line was backslash-continued
205 self.tokens.append(" ")
206
207 def untokenize(self, tokens):
208 iterable = iter(tokens)
209 for t in iterable:
210 if len(t) == 2:
211 self.compat(t, iterable)
212 break
213 tok_type, token, start, end = t[:4]
214 self.add_whitespace(tok_type, start)
215 self.tokens.append(token)
216 self.prev_row, self.prev_col = end
217 if tok_type in (NEWLINE, NL):
218 self.prev_row += 1
219 self.prev_col = 0
220 return "".join(self.tokens)
221
222 def compat(self, token, iterable):
223 # This import is here to avoid problems when the itertools
224 # module is not built yet and tokenize is imported.
225 from itertools import chain
226 startline = False
227 prevstring = False
228 indents = []
229 toks_append = self.tokens.append
230 for tok in chain([token], iterable):
231 toknum, tokval = tok[:2]
232
233 if toknum in (NAME, NUMBER):
234 tokval += ' '
235
236 # Insert a space between two consecutive strings
237 if toknum == STRING:
238 if prevstring:
239 tokval = ' ' + tokval
240 prevstring = True
241 else:
242 prevstring = False
243
244 if toknum == INDENT:
245 indents.append(tokval)
246 continue
247 elif toknum == DEDENT:
248 indents.pop()
249 continue
250 elif toknum in (NEWLINE, NL):
251 startline = True
252 elif startline and indents:
253 toks_append(indents[-1])
254 startline = False
255 toks_append(tokval)
256
257 def untokenize(iterable):
258 """Transform tokens back into Python source code.
259
260 Each element returned by the iterable must be a token sequence
261 with at least two elements, a token number and token value. If
262 only two tokens are passed, the resulting output is poor.
263
264 Round-trip invariant for full input:
265 Untokenized source will match input source exactly
266
267 Round-trip invariant for limited input:
268 # Output text will tokenize back to the input
269 t1 = [tok[:2] for tok in generate_tokens(f.readline)]
270 newcode = untokenize(t1)
271 readline = iter(newcode.splitlines(1)).next
272 t2 = [tok[:2] for tok in generate_tokens(readline)]
273 assert t1 == t2
274 """
275 ut = Untokenizer()
276 return ut.untokenize(iterable)
277
278 def generate_tokens(readline):
279 """
280 The generate_tokens() generator requires one argument, readline, which
281 must be a callable object which provides the same interface as the
282 readline() method of built-in file objects. Each call to the function
283 should return one line of input as a string. Alternately, readline
284 can be a callable function terminating with StopIteration:
285 readline = open(myfile).next # Example of alternate readline
286
287 The generator produces 5-tuples with these members: the token type; the
288 token string; a 2-tuple (srow, scol) of ints specifying the row and
289 column where the token begins in the source; a 2-tuple (erow, ecol) of
290 ints specifying the row and column where the token ends in the source;
291 and the line on which the token was found. The line passed is the
292 logical line; continuation lines are included.
293 """
294 lnum = parenlev = continued = 0
295 namechars, numchars = string.ascii_letters + '_', '0123456789'
296 contstr, needcont = '', 0
297 contline = None
298 indents = [0]
299
300 while 1: # loop over lines in stream
301 try:
302 line = readline()
303 except StopIteration:
304 line = ''
305 lnum += 1
306 pos, max = 0, len(line)
307
308 if contstr: # continued string
309 if not line:
310 raise TokenError, ("EOF in multi-line string", strstart)
311 endmatch = endprog.match(line)
312 if endmatch:
313 pos = end = endmatch.end(0)
314 yield (STRING, contstr + line[:end],
315 strstart, (lnum, end), contline + line)
316 contstr, needcont = '', 0
317 contline = None
318 elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
319 yield (ERRORTOKEN, contstr + line,
320 strstart, (lnum, len(line)), contline)
321 contstr = ''
322 contline = None
323 continue
324 else:
325 contstr = contstr + line
326 contline = contline + line
327 continue
328
329 elif parenlev == 0 and not continued: # new statement
330 if not line: break
331 column = 0
332 while pos < max: # measure leading whitespace
333 if line[pos] == ' ':
334 column += 1
335 elif line[pos] == '\t':
336 column = (column//tabsize + 1)*tabsize
337 elif line[pos] == '\f':
338 column = 0
339 else:
340 break
341 pos += 1
342 if pos == max:
343 break
344
345 if line[pos] in '#\r\n': # skip comments or blank lines
346 if line[pos] == '#':
347 comment_token = line[pos:].rstrip('\r\n')
348 nl_pos = pos + len(comment_token)
349 yield (COMMENT, comment_token,
350 (lnum, pos), (lnum, pos + len(comment_token)), line)
351 yield (NEWLINE, line[nl_pos:],
352 (lnum, nl_pos), (lnum, len(line)), line)
353 else:
354 yield (NEWLINE, line[pos:],
355 (lnum, pos), (lnum, len(line)), line)
356 continue
357
358 if column > indents[-1]: # count indents or dedents
359 indents.append(column)
360 yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
361 while column < indents[-1]:
362 if column not in indents:
363 raise IndentationError(
364 "unindent does not match any outer indentation level",
365 ("<tokenize>", lnum, pos, line))
366 indents = indents[:-1]
367 yield (DEDENT, '', (lnum, pos), (lnum, pos), line)
368
369 else: # continued statement
370 if not line:
371 raise TokenError, ("EOF in multi-line statement", (lnum, 0))
372 continued = 0
373
374 while pos < max:
375 pseudomatch = pseudoprog.match(line, pos)
376 if pseudomatch: # scan for tokens
377 start, end = pseudomatch.span(1)
378 spos, epos, pos = (lnum, start), (lnum, end), end
379 token, initial = line[start:end], line[start]
380
381 if initial in numchars or \
382 (initial == '.' and token != '.'): # ordinary number
383 yield (NUMBER, token, spos, epos, line)
384 elif initial in '\r\n':
385 yield (NL if parenlev > 0 else NEWLINE,
386 token, spos, epos, line)
387 elif initial == '#':
388 assert not token.endswith("\n")
389 yield (COMMENT, token, spos, epos, line)
390 elif token in triple_quoted:
391 endprog = endprogs[token]
392 endmatch = endprog.match(line, pos)
393 if endmatch: # all on one line
394 pos = endmatch.end(0)
395 token = line[start:pos]
396 yield (STRING, token, spos, (lnum, pos), line)
397 else:
398 strstart = (lnum, start) # multiple lines
399 contstr = line[start:]
400 contline = line
401 break
402 elif initial in single_quoted or \
403 token[:2] in single_quoted or \
404 token[:3] in single_quoted:
405 if token[-1] == '\n': # continued string
406 strstart = (lnum, start)
407 endprog = (endprogs[initial] or endprogs[token[1]] or
408 endprogs[token[2]])
409 contstr, needcont = line[start:], 1
410 contline = line
411 break
412 else: # ordinary string
413 yield (STRING, token, spos, epos, line)
414 elif initial in namechars: # ordinary name
415 yield (NAME, token, spos, epos, line)
416 elif initial == '\\': # continued stmt
417 continued = 1
418 else:
419 if initial in '([{':
420 parenlev += 1
421 elif initial in ')]}':
422 parenlev -= 1
423 yield (OP, token, spos, epos, line)
424 else:
425 yield (ERRORTOKEN, line[pos],
426 (lnum, pos), (lnum, pos+1), line)
427 pos += 1
428
429 for indent in indents[1:]: # pop remaining indent levels
430 yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
431 yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
432
433 if __name__ == '__main__': # testing
434 import sys
435 if len(sys.argv) > 1:
436 tokenize(open(sys.argv[1]).readline)
437 else:
438 tokenize(sys.stdin.readline)
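For orientation, here is a small usage sketch of the Python 2 module above. It is illustrative only: the import path IPython.utils._tokenize_py2 is an assumption about where this file ends up (the tokenize2 dispatcher further down suggests it), and the sample source string is arbitrary.

    # Illustrative sketch (Python 2); the import path is an assumption.
    from StringIO import StringIO
    from IPython.utils._tokenize_py2 import generate_tokens, untokenize, tok_name

    src = "a = (1 +\n     2)\n# trailing comment\n"
    toks = list(generate_tokens(StringIO(src).readline))
    for ttype, tstring, start, end, line in toks:
        print "%-10s %r" % (tok_name[ttype], tstring)

    # With full 5-tuples, the patched untokenize reproduces the source exactly.
    assert untokenize(toks) == src

Note how the trailing comment line ends in a NEWLINE token rather than NL, which is the behaviour change described in the patch notes (Python issue #17061).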
@@ -0,0 +1,574 b''
1 """Patched version of standard library tokenize, to deal with various bugs.
2
3 Based on Python 3.2 code.
4
5 Patches:
6
7 - Gareth Rees' patch for Python issue #12691 (untokenizing)
8 - Except we don't encode the output of untokenize
9 - Python 2 compatible syntax, so that it can be byte-compiled at installation
10 - Newlines in comments and blank lines should be either NL or NEWLINE, depending
11 on whether they are in a multi-line statement. Filed as Python issue #17061.
12 - Export generate_tokens & TokenError
13
14 ------------------------------------------------------------------------------
15 Tokenization help for Python programs.
16
17 tokenize(readline) is a generator that breaks a stream of bytes into
18 Python tokens. It decodes the bytes according to PEP-0263 for
19 determining source file encoding.
20
21 It accepts a readline-like method which is called repeatedly to get the
22 next line of input (or b"" for EOF). It generates 5-tuples with these
23 members:
24
25 the token type (see token.py)
26 the token (a string)
27 the starting (row, column) indices of the token (a 2-tuple of ints)
28 the ending (row, column) indices of the token (a 2-tuple of ints)
29 the original line (string)
30
31 It is designed to match the working of the Python tokenizer exactly, except
32 that it produces COMMENT tokens for comments and gives type OP for all
33 operators. Additionally, all token lists start with an ENCODING token
34 which tells you which encoding was used to decode the bytes stream.
35 """
36 from __future__ import absolute_import
37
38 __author__ = 'Ka-Ping Yee <ping@lfw.org>'
39 __credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
40 'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
41 'Michael Foord')
42 import builtins
43 import re
44 import sys
45 from token import *
46 from codecs import lookup, BOM_UTF8
47 import collections
48 from io import TextIOWrapper
49 cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")
50
51 import token
52 __all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
53 "NL", "untokenize", "ENCODING", "TokenInfo"]
54 del token
55
56 __all__ += ["generate_tokens", "TokenError"]
57
58 COMMENT = N_TOKENS
59 tok_name[COMMENT] = 'COMMENT'
60 NL = N_TOKENS + 1
61 tok_name[NL] = 'NL'
62 ENCODING = N_TOKENS + 2
63 tok_name[ENCODING] = 'ENCODING'
64 N_TOKENS += 3
65
66 class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
67 def __repr__(self):
68 annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
69 return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
70 self._replace(type=annotated_type))
71
72 def group(*choices): return '(' + '|'.join(choices) + ')'
73 def any(*choices): return group(*choices) + '*'
74 def maybe(*choices): return group(*choices) + '?'
75
76 # Note: we use unicode matching for names ("\w") but ascii matching for
77 # number literals.
78 Whitespace = r'[ \f\t]*'
79 Comment = r'#[^\r\n]*'
80 Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
81 Name = r'\w+'
82
83 Hexnumber = r'0[xX][0-9a-fA-F]+'
84 Binnumber = r'0[bB][01]+'
85 Octnumber = r'0[oO][0-7]+'
86 Decnumber = r'(?:0+|[1-9][0-9]*)'
87 Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
88 Exponent = r'[eE][-+]?[0-9]+'
89 Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
90 Expfloat = r'[0-9]+' + Exponent
91 Floatnumber = group(Pointfloat, Expfloat)
92 Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
93 Number = group(Imagnumber, Floatnumber, Intnumber)
94
95 # Tail end of ' string.
96 Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
97 # Tail end of " string.
98 Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
99 # Tail end of ''' string.
100 Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
101 # Tail end of """ string.
102 Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
103 Triple = group("[bB]?[rR]?'''", '[bB]?[rR]?"""')
104 # Single-line ' or " string.
105 String = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
106 r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
107
108 # Because of leftmost-then-longest match semantics, be sure to put the
109 # longest operators first (e.g., if = came before ==, == would get
110 # recognized as two instances of =).
111 Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
112 r"//=?", r"->",
113 r"[+\-*/%&|^=<>]=?",
114 r"~")
115
116 Bracket = '[][(){}]'
117 Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
118 Funny = group(Operator, Bracket, Special)
119
120 PlainToken = group(Number, Funny, String, Name)
121 Token = Ignore + PlainToken
122
123 # First (or only) line of ' or " string.
124 ContStr = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
125 group("'", r'\\\r?\n'),
126 r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
127 group('"', r'\\\r?\n'))
128 PseudoExtras = group(r'\\\r?\n', Comment, Triple)
129 PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
130
131 def _compile(expr):
132 return re.compile(expr, re.UNICODE)
133
134 tokenprog, pseudoprog, single3prog, double3prog = map(
135 _compile, (Token, PseudoToken, Single3, Double3))
136 endprogs = {"'": _compile(Single), '"': _compile(Double),
137 "'''": single3prog, '"""': double3prog,
138 "r'''": single3prog, 'r"""': double3prog,
139 "b'''": single3prog, 'b"""': double3prog,
140 "br'''": single3prog, 'br"""': double3prog,
141 "R'''": single3prog, 'R"""': double3prog,
142 "B'''": single3prog, 'B"""': double3prog,
143 "bR'''": single3prog, 'bR"""': double3prog,
144 "Br'''": single3prog, 'Br"""': double3prog,
145 "BR'''": single3prog, 'BR"""': double3prog,
146 'r': None, 'R': None, 'b': None, 'B': None}
147
148 triple_quoted = {}
149 for t in ("'''", '"""',
150 "r'''", 'r"""', "R'''", 'R"""',
151 "b'''", 'b"""', "B'''", 'B"""',
152 "br'''", 'br"""', "Br'''", 'Br"""',
153 "bR'''", 'bR"""', "BR'''", 'BR"""'):
154 triple_quoted[t] = t
155 single_quoted = {}
156 for t in ("'", '"',
157 "r'", 'r"', "R'", 'R"',
158 "b'", 'b"', "B'", 'B"',
159 "br'", 'br"', "Br'", 'Br"',
160 "bR'", 'bR"', "BR'", 'BR"' ):
161 single_quoted[t] = t
162
163 del _compile
164
165 tabsize = 8
166
167 class TokenError(Exception): pass
168
169 class StopTokenizing(Exception): pass
170
171
172 class Untokenizer:
173
174 def __init__(self):
175 self.tokens = []
176 self.prev_row = 1
177 self.prev_col = 0
178 self.encoding = 'utf-8'
179
180 def add_whitespace(self, tok_type, start):
181 row, col = start
182 assert row >= self.prev_row
183 col_offset = col - self.prev_col
184 if col_offset > 0:
185 self.tokens.append(" " * col_offset)
186 elif row > self.prev_row and tok_type not in (NEWLINE, NL, ENDMARKER):
187 # Line was backslash-continued.
188 self.tokens.append(" ")
189
190 def untokenize(self, tokens):
191 iterable = iter(tokens)
192 for t in iterable:
193 if len(t) == 2:
194 self.compat(t, iterable)
195 break
196 tok_type, token, start, end = t[:4]
197 if tok_type == ENCODING:
198 self.encoding = token
199 continue
200 self.add_whitespace(tok_type, start)
201 self.tokens.append(token)
202 self.prev_row, self.prev_col = end
203 if tok_type in (NEWLINE, NL):
204 self.prev_row += 1
205 self.prev_col = 0
206 return "".join(self.tokens)
207
208 def compat(self, token, iterable):
209 # This import is here to avoid problems when the itertools
210 # module is not built yet and tokenize is imported.
211 from itertools import chain
212 startline = False
213 prevstring = False
214 indents = []
215 toks_append = self.tokens.append
216
217 for tok in chain([token], iterable):
218 toknum, tokval = tok[:2]
219 if toknum == ENCODING:
220 self.encoding = tokval
221 continue
222
223 if toknum in (NAME, NUMBER):
224 tokval += ' '
225
226 # Insert a space between two consecutive strings
227 if toknum == STRING:
228 if prevstring:
229 tokval = ' ' + tokval
230 prevstring = True
231 else:
232 prevstring = False
233
234 if toknum == INDENT:
235 indents.append(tokval)
236 continue
237 elif toknum == DEDENT:
238 indents.pop()
239 continue
240 elif toknum in (NEWLINE, NL):
241 startline = True
242 elif startline and indents:
243 toks_append(indents[-1])
244 startline = False
245 toks_append(tokval)
246
247
248 def untokenize(tokens):
249 """
250 Convert ``tokens`` (an iterable) back into Python source code. Return
251 a string (unlike the standard-library version, the output is not encoded
252 to bytes); an ENCODING token, if present, only records the source encoding.
253
254 The result is guaranteed to tokenize back to match the input so that
255 the conversion is lossless and round-trips are assured. The
256 guarantee applies only to the token type and token string as the
257 spacing between tokens (column positions) may change.
258
259 :func:`untokenize` has two modes. If the input tokens are sequences
260 of length 2 (``type``, ``string``) then spaces are added as necessary to
261 preserve the round-trip property.
262
263 If the input tokens are sequences of length 4 or more (``type``,
264 ``string``, ``start``, ``end``), as returned by :func:`tokenize`, then
265 spaces are added so that each token appears in the result at the
266 position indicated by ``start`` and ``end``, if possible.
267 """
268 return Untokenizer().untokenize(tokens)
269
270
271 def _get_normal_name(orig_enc):
272 """Imitates get_normal_name in tokenizer.c."""
273 # Only care about the first 12 characters.
274 enc = orig_enc[:12].lower().replace("_", "-")
275 if enc == "utf-8" or enc.startswith("utf-8-"):
276 return "utf-8"
277 if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
278 enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
279 return "iso-8859-1"
280 return orig_enc
281
282 def detect_encoding(readline):
283 """
284 The detect_encoding() function is used to detect the encoding that should
285 be used to decode a Python source file. It requires one argument, readline,
286 in the same way as the tokenize() generator.
287
288 It will call readline a maximum of twice, and return the encoding used
289 (as a string) and a list of any lines (left as bytes) it has read in.
290
291 It detects the encoding from the presence of a utf-8 bom or an encoding
292 cookie as specified in pep-0263. If both a bom and a cookie are present,
293 but disagree, a SyntaxError will be raised. If the encoding cookie is an
294 invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,
295 'utf-8-sig' is returned.
296
297 If no encoding is specified, then the default of 'utf-8' will be returned.
298 """
299 bom_found = False
300 encoding = None
301 default = 'utf-8'
302 def read_or_stop():
303 try:
304 return readline()
305 except StopIteration:
306 return b''
307
308 def find_cookie(line):
309 try:
310 # Decode as UTF-8. Either the line is an encoding declaration,
311 # in which case it should be pure ASCII, or it must be UTF-8
312 # per default encoding.
313 line_string = line.decode('utf-8')
314 except UnicodeDecodeError:
315 raise SyntaxError("invalid or missing encoding declaration")
316
317 matches = cookie_re.findall(line_string)
318 if not matches:
319 return None
320 encoding = _get_normal_name(matches[0])
321 try:
322 codec = lookup(encoding)
323 except LookupError:
324 # This behaviour mimics the Python interpreter
325 raise SyntaxError("unknown encoding: " + encoding)
326
327 if bom_found:
328 if encoding != 'utf-8':
329 # This behaviour mimics the Python interpreter
330 raise SyntaxError('encoding problem: utf-8')
331 encoding += '-sig'
332 return encoding
333
334 first = read_or_stop()
335 if first.startswith(BOM_UTF8):
336 bom_found = True
337 first = first[3:]
338 default = 'utf-8-sig'
339 if not first:
340 return default, []
341
342 encoding = find_cookie(first)
343 if encoding:
344 return encoding, [first]
345
346 second = read_or_stop()
347 if not second:
348 return default, [first]
349
350 encoding = find_cookie(second)
351 if encoding:
352 return encoding, [first, second]
353
354 return default, [first, second]
355
356
357 def open(filename):
358 """Open a file in read only mode using the encoding detected by
359 detect_encoding().
360 """
361 buffer = builtins.open(filename, 'rb')
362 encoding, lines = detect_encoding(buffer.readline)
363 buffer.seek(0)
364 text = TextIOWrapper(buffer, encoding, line_buffering=True)
365 text.mode = 'r'
366 return text
367
368
369 def tokenize(readline):
370 """
371 The tokenize() generator requires one argument, readline, which
372 must be a callable object which provides the same interface as the
373 readline() method of built-in file objects. Each call to the function
374 should return one line of input as bytes. Alternately, readline
375 can be a callable function terminating with StopIteration:
376 readline = open(myfile, 'rb').__next__ # Example of alternate readline
377
378 The generator produces 5-tuples with these members: the token type; the
379 token string; a 2-tuple (srow, scol) of ints specifying the row and
380 column where the token begins in the source; a 2-tuple (erow, ecol) of
381 ints specifying the row and column where the token ends in the source;
382 and the line on which the token was found. The line passed is the
383 logical line; continuation lines are included.
384
385 The first token sequence will always be an ENCODING token
386 which tells you which encoding was used to decode the bytes stream.
387 """
388 # This import is here to avoid problems when the itertools module is not
389 # built yet and tokenize is imported.
390 from itertools import chain, repeat
391 encoding, consumed = detect_encoding(readline)
392 rl_gen = iter(readline, b"")
393 empty = repeat(b"")
394 return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding)
395
396
397 def _tokenize(readline, encoding):
398 lnum = parenlev = continued = 0
399 numchars = '0123456789'
400 contstr, needcont = '', 0
401 contline = None
402 indents = [0]
403
404 if encoding is not None:
405 if encoding == "utf-8-sig":
406 # BOM will already have been stripped.
407 encoding = "utf-8"
408 yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
409 while True: # loop over lines in stream
410 try:
411 line = readline()
412 except StopIteration:
413 line = b''
414
415 if encoding is not None:
416 line = line.decode(encoding)
417 lnum += 1
418 pos, max = 0, len(line)
419
420 if contstr: # continued string
421 if not line:
422 raise TokenError("EOF in multi-line string", strstart)
423 endmatch = endprog.match(line)
424 if endmatch:
425 pos = end = endmatch.end(0)
426 yield TokenInfo(STRING, contstr + line[:end],
427 strstart, (lnum, end), contline + line)
428 contstr, needcont = '', 0
429 contline = None
430 elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
431 yield TokenInfo(ERRORTOKEN, contstr + line,
432 strstart, (lnum, len(line)), contline)
433 contstr = ''
434 contline = None
435 continue
436 else:
437 contstr = contstr + line
438 contline = contline + line
439 continue
440
441 elif parenlev == 0 and not continued: # new statement
442 if not line: break
443 column = 0
444 while pos < max: # measure leading whitespace
445 if line[pos] == ' ':
446 column += 1
447 elif line[pos] == '\t':
448 column = (column//tabsize + 1)*tabsize
449 elif line[pos] == '\f':
450 column = 0
451 else:
452 break
453 pos += 1
454 if pos == max:
455 break
456
457 if line[pos] in '#\r\n': # skip comments or blank lines
458 if line[pos] == '#':
459 comment_token = line[pos:].rstrip('\r\n')
460 nl_pos = pos + len(comment_token)
461 yield TokenInfo(COMMENT, comment_token,
462 (lnum, pos), (lnum, pos + len(comment_token)), line)
463 yield TokenInfo(NEWLINE, line[nl_pos:],
464 (lnum, nl_pos), (lnum, len(line)), line)
465 else:
466 yield TokenInfo(NEWLINE, line[pos:],
467 (lnum, pos), (lnum, len(line)), line)
468 continue
469
470 if column > indents[-1]: # count indents or dedents
471 indents.append(column)
472 yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
473 while column < indents[-1]:
474 if column not in indents:
475 raise IndentationError(
476 "unindent does not match any outer indentation level",
477 ("<tokenize>", lnum, pos, line))
478 indents = indents[:-1]
479 yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)
480
481 else: # continued statement
482 if not line:
483 raise TokenError("EOF in multi-line statement", (lnum, 0))
484 continued = 0
485
486 while pos < max:
487 pseudomatch = pseudoprog.match(line, pos)
488 if pseudomatch: # scan for tokens
489 start, end = pseudomatch.span(1)
490 spos, epos, pos = (lnum, start), (lnum, end), end
491 token, initial = line[start:end], line[start]
492
493 if (initial in numchars or # ordinary number
494 (initial == '.' and token != '.' and token != '...')):
495 yield TokenInfo(NUMBER, token, spos, epos, line)
496 elif initial in '\r\n':
497 yield TokenInfo(NL if parenlev > 0 else NEWLINE,
498 token, spos, epos, line)
499 elif initial == '#':
500 assert not token.endswith("\n")
501 yield TokenInfo(COMMENT, token, spos, epos, line)
502 elif token in triple_quoted:
503 endprog = endprogs[token]
504 endmatch = endprog.match(line, pos)
505 if endmatch: # all on one line
506 pos = endmatch.end(0)
507 token = line[start:pos]
508 yield TokenInfo(STRING, token, spos, (lnum, pos), line)
509 else:
510 strstart = (lnum, start) # multiple lines
511 contstr = line[start:]
512 contline = line
513 break
514 elif initial in single_quoted or \
515 token[:2] in single_quoted or \
516 token[:3] in single_quoted:
517 if token[-1] == '\n': # continued string
518 strstart = (lnum, start)
519 endprog = (endprogs[initial] or endprogs[token[1]] or
520 endprogs[token[2]])
521 contstr, needcont = line[start:], 1
522 contline = line
523 break
524 else: # ordinary string
525 yield TokenInfo(STRING, token, spos, epos, line)
526 elif initial.isidentifier(): # ordinary name
527 yield TokenInfo(NAME, token, spos, epos, line)
528 elif initial == '\\': # continued stmt
529 continued = 1
530 else:
531 if initial in '([{':
532 parenlev += 1
533 elif initial in ')]}':
534 parenlev -= 1
535 yield TokenInfo(OP, token, spos, epos, line)
536 else:
537 yield TokenInfo(ERRORTOKEN, line[pos],
538 (lnum, pos), (lnum, pos+1), line)
539 pos += 1
540
541 for indent in indents[1:]: # pop remaining indent levels
542 yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
543 yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
544
545
546 # An undocumented, backwards compatible, API for all the places in the standard
547 # library that expect to be able to use tokenize with strings
548 def generate_tokens(readline):
549 return _tokenize(readline, None)
550
551 if __name__ == "__main__":
552 # Quick sanity check
553 s = b'''def parseline(self, line):
554 """Parse the line into a command name and a string containing
555 the arguments. Returns a tuple containing (command, args, line).
556 'command' and 'args' may be None if the line couldn't be parsed.
557 """
558 line = line.strip()
559 if not line:
560 return None, None, line
561 elif line[0] == '?':
562 line = 'help ' + line[1:]
563 elif line[0] == '!':
564 if hasattr(self, 'do_shell'):
565 line = 'shell ' + line[1:]
566 else:
567 return None, None, line
568 i, n = 0, len(line)
569 while i < n and line[i] in self.identchars: i = i+1
570 cmd, arg = line[:i], line[i:].strip()
571 return cmd, arg, line
572 '''
573 for tok in tokenize(iter(s.splitlines()).__next__):
574 print(tok)
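A matching sketch for the Python 3 module above, again with an assumed import path (IPython.utils._tokenize_py3). It shows the two things the patch notes call out: the stream starts with an ENCODING token, and untokenize returns a str instead of encoded bytes.

    # Illustrative sketch (Python 3); the import path is an assumption.
    import io
    from IPython.utils._tokenize_py3 import tokenize, untokenize, ENCODING, tok_name

    source = b"# -*- coding: utf-8 -*-\nx = 1 + 2\n"
    tokens = list(tokenize(io.BytesIO(source).readline))

    assert tokens[0].type == ENCODING     # the encoding token always comes first
    for tok in tokens:
        print(tok_name[tok.type], repr(tok.string))

    # Patched behaviour: the result is a str, not bytes as in the stdlib.
    assert isinstance(untokenize(tokens), str)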
@@ -0,0 +1,9 b''
1 """Load our patched versions of tokenize.
2 """
3
4 import sys
5
6 if sys.version_info[0] >= 3:
7 from ._tokenize_py3 import *
8 else:
9 from ._tokenize_py2 import *
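The dispatcher above gives call sites one version-agnostic import surface, which is what the inputtransformer.py hunks below switch to. A minimal sketch of that usage pattern; the helper name tokens_or_none is hypothetical:

    from IPython.utils.tokenize2 import generate_tokens, TokenError

    def tokens_or_none(readline):
        """Collect tokens; return None while a statement is still incomplete."""
        try:
            return list(generate_tokens(readline))
        except TokenError:
            # e.g. "EOF in multi-line statement" -- caller should supply more input
            return None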
@@ -2,17 +2,10 b' import abc'
 import functools
 import re
 from StringIO import StringIO
-import tokenize
-
-try:
-    generate_tokens = tokenize.generate_tokens
-except AttributeError:
-    # Python 3. Note that we use the undocumented _tokenize because it expects
-    # strings, not bytes. See also Python issue #9969.
-    generate_tokens = tokenize._tokenize
 
 from IPython.core.splitinput import split_user_input, LineInfo
-from IPython.utils.untokenize import untokenize
+from IPython.utils import tokenize2
+from IPython.utils.tokenize2 import generate_tokens, untokenize, TokenError
 
 #-----------------------------------------------------------------------------
 # Globals
@@ -129,7 +122,7 b' class TokenInputTransformer(InputTransformer):'
 
     def get_line(self):
         if self.line_used:
-            raise tokenize.TokenError
+            raise TokenError
         self.line_used = True
         return self.current_line
 
@@ -145,12 +138,12 b' class TokenInputTransformer(InputTransformer):'
             for intok in self.tokenizer:
                 tokens.append(intok)
                 t = intok[0]
-                if t == tokenize.NEWLINE or (stop_at_NL and t == tokenize.NL):
+                if t == tokenize2.NEWLINE or (stop_at_NL and t == tokenize2.NL):
                     # Stop before we try to pull a line we don't have yet
                     break
-                elif t in (tokenize.COMMENT, tokenize.ERRORTOKEN):
+                elif t == tokenize2.ERRORTOKEN:
                     stop_at_NL = True
-        except tokenize.TokenError:
+        except TokenError:
             # Multi-line statement - stop and try again with the next line
             self.reset_tokenizer()
             return None
@@ -297,11 +290,11 b' def has_comment(src):'
     readline = StringIO(src).readline
     toktypes = set()
     try:
-        for t in tokenize.generate_tokens(readline):
+        for t in generate_tokens(readline):
             toktypes.add(t[0])
-    except tokenize.TokenError:
+    except TokenError:
         pass
-    return(tokenize.COMMENT in toktypes)
+    return(tokenize2.COMMENT in toktypes)
 
 
 @StatelessInputTransformer.wrap
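Read together, the new has_comment() is compact enough to restate as a self-contained sketch (Python 2 shown, matching the StringIO import in this file; the sample calls are purely illustrative):

    from StringIO import StringIO
    from IPython.utils import tokenize2
    from IPython.utils.tokenize2 import generate_tokens, TokenError

    def has_comment(src):
        """Return True if src (a single input line) contains a # comment."""
        readline = StringIO(src).readline
        toktypes = set()
        try:
            for t in generate_tokens(readline):
                toktypes.add(t[0])
        except TokenError:
            # Incomplete statement or unterminated string: judge from what we saw.
            pass
        return tokenize2.COMMENT in toktypes

    print has_comment("a = 1  # set a")          # True
    print has_comment("s = 'not # a comment'")   # False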
1 NO CONTENT: file was removed
NO CONTENT: file was removed