remove sys_version for Python 3...
Paul Ivanov
@@ -604,7 +604,6 @@ class Pdb(OldPdb, object):
                       ('Globals', self.curframe.f_globals)]
         self.shell.find_line_magic('psource')(arg, namespaces=namespaces)
 
-    if sys.version_info > (3, ):
     def do_where(self, arg):
         """w(here)
         Print a stack trace, with the most recent frame at the bottom.
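Every hunk in this commit deletes a variant of the same guard. It works because sys.version_info is a tuple that compares element-wise against a literal: equal leading elements are skipped and the longer tuple then compares greater, so any Python 3 interpreter satisfies > (3,). A standalone sketch of the idiom being removed:

    import sys

    # (3, 6, 0, 'final', 0) > (3,): the leading 3s are equal, and the
    # longer tuple compares greater, so this is True on any Python 3.
    if sys.version_info > (3,):
        print("Python 3 branch")
    else:
        print("Python 2 branch")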
@@ -187,10 +187,7 @@ class ExecutionResult(object):
             raise self.error_in_exec
 
     def __repr__(self):
-        if sys.version_info > (3,):
         name = self.__class__.__qualname__
-        else:
-            name = self.__class__.__name__
         return '<%s object at %x, execution_count=%s error_before_exec=%s error_in_exec=%s result=%s>' %\
           (name, id(self), self.execution_count, self.error_before_exec, self.error_in_exec, repr(self.result))
 
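__qualname__ (PEP 3155, Python 3.3+) is preferred over __name__ because it keeps the dotted path of nested definitions; once Python 2 support is dropped, the fallback branch is dead code. A quick illustration with a throwaway nested class:

    class Outer:
        class Inner:
            pass

    print(Outer.Inner.__name__)      # Inner
    print(Outer.Inner.__qualname__)  # Outer.Inner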
@@ -155,7 +155,6 @@ def test_latex_completions():
 
 
 
-@dec.onlyif(sys.version_info[0] >= 3, 'This test only apply on python3')
 def test_back_latex_completion():
     ip = get_ipython()
 
@@ -164,7 +163,6 @@ def test_back_latex_completion():
     nt.assert_equal(len(matches), 1)
     nt.assert_equal(matches[0], '\\beta')
 
-@dec.onlyif(sys.version_info[0] >= 3, 'This test only apply on python3')
 def test_back_unicode_completion():
     ip = get_ipython()
 
@@ -173,7 +171,6 @@ def test_back_unicode_completion():
     nt.assert_equal(matches[0], '\\ROMAN NUMERAL FIVE')
 
 
-@dec.onlyif(sys.version_info[0] >= 3, 'This test only apply on python3')
 def test_forward_unicode_completion():
     ip = get_ipython()
 
@@ -181,7 +178,6 @@ def test_forward_unicode_completion():
     nt.assert_equal(len(matches), 1)
     nt.assert_equal(matches[0], 'Ⅴ')
 
-@dec.onlyif(sys.version_info[0] >= 3, 'This test only apply on python3')
 @dec.knownfailureif(sys.platform == 'win32', 'Fails if there is a C:\\j... path')
 def test_no_ascii_back_completion():
     ip = get_ipython()
@@ -588,7 +584,6 @@ def test_dict_key_completion_contexts():
 
 
 
-@dec.onlyif(sys.version_info[0] >= 3, 'This test only applies in Py>=3')
 def test_dict_key_completion_bytes():
     """Test handling of bytes in dict key completion"""
     ip = get_ipython()
@@ -618,7 +613,6 @@ def test_dict_key_completion_bytes():
     nt.assert_not_in("abd", matches)
 
 
-@dec.onlyif(sys.version_info[0] < 3, 'This test only applies in Py<3')
 def test_dict_key_completion_unicode_py2():
     """Test handling of unicode in dict key completion"""
     ip = get_ipython()
@@ -679,7 +673,6 @@ def test_dict_key_completion_unicode_py2():
     nt.assert_in(u"d[u'a\u05d0b']", matches)
 
 
-@dec.onlyif(sys.version_info[0] >= 3, 'This test only applies in Py>=3')
 def test_dict_key_completion_unicode_py3():
     """Test handling of unicode in dict key completion"""
     ip = get_ipython()
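Each decorator stripped in the hunks above is the same gate: run the test only when a condition holds, otherwise skip it. A simplified sketch of how such a gate can be written (IPython's real onlyif lives in IPython.testing.decorators and is more general):

    import functools
    import unittest

    def onlyif(condition, msg):
        """Run the decorated test only when `condition` is true; skip otherwise."""
        def decorator(f):
            @functools.wraps(f)
            def wrapper(*args, **kwargs):
                if not condition:
                    raise unittest.SkipTest(msg)
                return f(*args, **kwargs)
            return wrapper
        return decorator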
@@ -338,19 +338,6 @@ class InteractiveShellTestCase(unittest.TestCase):
         finally:
             trap.hook = save_hook
 
-    @skipif(sys.version_info[0] >= 3, "softspace removed in py3")
-    def test_print_softspace(self):
-        """Verify that softspace is handled correctly when executing multiple
-        statements.
-
-        In [1]: print 1; print 2
-        1
-        2
-
-        In [2]: print 1,; print 2
-        1 2
-        """
-
     def test_ofind_line_magic(self):
         from IPython.core.magic import register_line_magic
 
@@ -466,22 +453,6 @@ class InteractiveShellTestCase(unittest.TestCase):
         # Reset the custom exception hook
         ip.set_custom_exc((), None)
 
-    @skipif(sys.version_info[0] >= 3, "no differences with __future__ in py3")
-    def test_future_environment(self):
-        "Can we run code with & without the shell's __future__ imports?"
-        ip.run_cell("from __future__ import division")
-        ip.run_cell("a = 1/2", shell_futures=True)
-        self.assertEqual(ip.user_ns['a'], 0.5)
-        ip.run_cell("b = 1/2", shell_futures=False)
-        self.assertEqual(ip.user_ns['b'], 0)
-
-        ip.compile.reset_compiler_flags()
-        # This shouldn't leak to the shell's compiler
-        ip.run_cell("from __future__ import division \nc=1/2", shell_futures=False)
-        self.assertEqual(ip.user_ns['c'], 0.5)
-        ip.run_cell("d = 1/2", shell_futures=True)
-        self.assertEqual(ip.user_ns['d'], 0)
-
     def test_mktempfile(self):
         filename = ip.mktempfile()
         # Check that we can open the file again on Windows
@@ -509,9 +480,6 @@ class InteractiveShellTestCase(unittest.TestCase):
             raise DerivedInterrupt("foo")
         except KeyboardInterrupt:
             msg = ip.get_exception_only()
-        if sys.version_info[0] <= 2:
-            self.assertEqual(msg, 'DerivedInterrupt: foo\n')
-        else:
         self.assertEqual(msg, 'IPython.core.tests.test_interactiveshell.DerivedInterrupt: foo\n')
 
     def test_inspect_text(self):
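The single remaining assertion encodes a Python 3 behavior: exceptions defined outside builtins are rendered with their fully qualified name. A minimal reproduction using only the standard library:

    import traceback

    class DerivedInterrupt(KeyboardInterrupt):
        pass

    try:
        raise DerivedInterrupt("foo")
    except KeyboardInterrupt as e:
        # Prints '__main__.DerivedInterrupt: foo' on Python 3; a bare
        # 'DerivedInterrupt: foo' was the Python 2 rendering.
        print(''.join(traceback.format_exception_only(type(e), e)))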
@@ -377,18 +377,6 @@ def test_time3():
              "run = 0\n"
              "run += 1")
 
-@dec.skipif(sys.version_info[0] >= 3, "no differences with __future__ in py3")
-def test_time_futures():
-    "Test %time with __future__ environments"
-    ip = get_ipython()
-    ip.autocall = 0
-    ip.run_cell("from __future__ import division")
-    with tt.AssertPrints('0.25'):
-        ip.run_line_magic('time', 'print(1/4)')
-    ip.compile.reset_compiler_flags()
-    with tt.AssertNotPrints('0.25'):
-        ip.run_line_magic('time', 'print(1/4)')
-
 def test_doctest_mode():
     "Toggle doctest_mode twice, it should be a no-op and run without error"
     _ip.magic('doctest_mode')
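The deleted test documented how __future__ imports persist: IPython keeps the accumulated compiler flags on its cached compiler and applies them to later cells. The same effect can be reproduced with the plain compile() builtin; this is a sketch of the mechanism, not IPython's code:

    import __future__

    # Passing a feature's compiler_flag activates it inside the compiled
    # code, just as IPython's shell carries flags from cell to cell.
    flags = __future__.division.compiler_flag
    code = compile("print(1/4)", "<cell>", "exec", flags)
    exec(code)  # 0.25 even on Python 2; always 0.25 on Python 3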
@@ -573,17 +561,6 @@ def test_timeit_return_quiet():
     res = _ip.run_line_magic('timeit', '-n1 -r1 -q -o 1')
     assert (res is not None)
 
-@dec.skipif(sys.version_info[0] >= 3, "no differences with __future__ in py3")
-def test_timeit_futures():
-    "Test %timeit with __future__ environments"
-    ip = get_ipython()
-    ip.run_cell("from __future__ import division")
-    with tt.AssertPrints('0.25'):
-        ip.run_line_magic('timeit', '-n1 -r1 print(1/4)')
-    ip.compile.reset_compiler_flags()
-    with tt.AssertNotPrints('0.25'):
-        ip.run_line_magic('timeit', '-n1 -r1 print(1/4)')
-
 @dec.skipif(execution.profile is None)
 def test_prun_special_syntax():
     "Test %%prun with IPython special syntax"
@@ -57,8 +57,6 @@ def check_cpaste(code, should_fail=False):
     finally:
         sys.stdin = stdin_save
 
-PY31 = sys.version_info[:2] == (3,1)
-
 def test_cpaste():
     """Test cpaste magic"""
 
@@ -77,14 +75,9 @@ def test_cpaste():
              ],
 
              'fail': ["1 + runf()",
+                      "++ runf()",
              ]}
 
-    # I don't know why this is failing specifically on Python 3.1. I've
-    # checked it manually interactively, but we don't care enough about 3.1
-    # to spend time fiddling with the tests, so we just skip it.
-    if not PY31:
-        tests['fail'].append("++ runf()")
-
     ip.user_ns['runf'] = runf
 
     for code in tests['pass']:
@@ -275,7 +275,6 @@ def test_info():
     nt.assert_equal(i['type_name'], 'type')
     expted_class = str(type(type)) # <class 'type'> (Python 3) or <type 'type'>
     nt.assert_equal(i['base_class'], expted_class)
-    if sys.version_info > (3,):
     nt.assert_regex(i['string_form'], "<class 'IPython.core.tests.test_oinspect.Call'( at 0x[0-9a-f]{1,9})?>")
     fname = __file__
     if fname.endswith(".pyc"):
@@ -313,7 +313,6 @@ def r3o2():
 #----------------------------------------------------------------------------
 
 # module testing (minimal)
-if sys.version_info > (3,):
 def test_handlers():
     def spam(c, d_e):
         (d, e) = d_e
@@ -45,7 +45,6 @@ pjoin = path.join
 
 # Enable printing all warnings raise by IPython's modules
 warnings.filterwarnings('ignore', message='.*Matplotlib is building the font cache.*', category=UserWarning, module='.*')
-if sys.version_info > (3,0):
 warnings.filterwarnings('error', message='.*', category=ResourceWarning, module='.*')
 warnings.filterwarnings('error', message=".*{'config': True}.*", category=DeprecationWarning, module='IPy.*')
 warnings.filterwarnings('default', message='.*', category=Warning, module='IPy.*')
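The now-unconditional 'error' filter is what lets the test suite fail on unclosed files and sockets: ResourceWarning, normally ignored, is escalated to an exception wherever it reaches the warnings machinery. A standalone demonstration:

    import warnings

    # Escalate ResourceWarning (silenced by default) into an exception.
    warnings.filterwarnings('error', category=ResourceWarning)

    try:
        warnings.warn("unclosed file <...>", ResourceWarning)
    except ResourceWarning as w:
        print("escalated:", w)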
@@ -1,26 +1,13 @@
-"""Decorators marks that a doctest should be skipped, for both python 2 and 3.
+"""Decorators marks that a doctest should be skipped.
 
 The IPython.testing.decorators module triggers various extra imports, including
 numpy and sympy if they're present. Since this decorator is used in core parts
 of IPython, it's in a separate module so that running IPython doesn't trigger
 those imports."""
 
-#-----------------------------------------------------------------------------
-# Copyright (C) 2009-2011 The IPython Development Team
-#
-# Distributed under the terms of the BSD License. The full license is in
-# the file COPYING, distributed as part of this software.
-#-----------------------------------------------------------------------------
+# Copyright (C) IPython Development Team
+# Distributed under the terms of the Modified BSD License.
 
-#-----------------------------------------------------------------------------
-# Imports
-#-----------------------------------------------------------------------------
-
-import sys
-
-#-----------------------------------------------------------------------------
-# Decorators
-#-----------------------------------------------------------------------------
 
 def skip_doctest(f):
     """Decorator - mark a function or method for skipping its doctest.
@@ -1,9 +1,595 @@
-"""Load our patched versions of tokenize.
-"""
-
-import sys
-
-if sys.version_info[0] >= 3:
-    from ._tokenize_py3 import *
-else:
-    from ._tokenize_py2 import *

The replacement module, shown with its new line numbers:

1 """Patched version of standard library tokenize, to deal with various bugs.
2
3 Based on Python 3.2 code.
4
5 Patches:
6
7 - Gareth Rees' patch for Python issue #12691 (untokenizing)
8 - Except we don't encode the output of untokenize
9 - Python 2 compatible syntax, so that it can be byte-compiled at installation
10 - Newlines in comments and blank lines should be either NL or NEWLINE, depending
11 on whether they are in a multi-line statement. Filed as Python issue #17061.
12 - Export generate_tokens & TokenError
13 - u and rb literals are allowed under Python 3.3 and above.
14
15 ------------------------------------------------------------------------------
16 Tokenization help for Python programs.
17
18 tokenize(readline) is a generator that breaks a stream of bytes into
19 Python tokens. It decodes the bytes according to PEP-0263 for
20 determining source file encoding.
21
22 It accepts a readline-like method which is called repeatedly to get the
23 next line of input (or b"" for EOF). It generates 5-tuples with these
24 members:
25
26 the token type (see token.py)
27 the token (a string)
28 the starting (row, column) indices of the token (a 2-tuple of ints)
29 the ending (row, column) indices of the token (a 2-tuple of ints)
30 the original line (string)
31
32 It is designed to match the working of the Python tokenizer exactly, except
33 that it produces COMMENT tokens for comments and gives type OP for all
34 operators. Additionally, all token lists start with an ENCODING token
35 which tells you which encoding was used to decode the bytes stream.
2 """
36 """
37 from __future__ import absolute_import
38
39 __author__ = 'Ka-Ping Yee <ping@lfw.org>'
40 __credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
41 'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
42 'Michael Foord')
43 import builtins
44 import re
45 import sys
46 from token import *
47 from codecs import lookup, BOM_UTF8
48 import collections
49 from io import TextIOWrapper
50 cookie_re = re.compile("coding[:=]\s*([-\w.]+)")
51
52 import token
53 __all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
54 "NL", "untokenize", "ENCODING", "TokenInfo"]
55 del token
56
57 __all__ += ["generate_tokens", "TokenError"]
58
59 COMMENT = N_TOKENS
60 tok_name[COMMENT] = 'COMMENT'
61 NL = N_TOKENS + 1
62 tok_name[NL] = 'NL'
63 ENCODING = N_TOKENS + 2
64 tok_name[ENCODING] = 'ENCODING'
65 N_TOKENS += 3
66
67 class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
68 def __repr__(self):
69 annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
70 return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
71 self._replace(type=annotated_type))
72
73 def group(*choices): return '(' + '|'.join(choices) + ')'
74 def any(*choices): return group(*choices) + '*'
75 def maybe(*choices): return group(*choices) + '?'
76
77 # Note: we use unicode matching for names ("\w") but ascii matching for
78 # number literals.
79 Whitespace = r'[ \f\t]*'
80 Comment = r'#[^\r\n]*'
81 Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
82 Name = r'\w+'
83
84 Hexnumber = r'0[xX][0-9a-fA-F]+'
85 Binnumber = r'0[bB][01]+'
86 Octnumber = r'0[oO][0-7]+'
87 Decnumber = r'(?:0+|[1-9][0-9]*)'
88 Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
89 Exponent = r'[eE][-+]?[0-9]+'
90 Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
91 Expfloat = r'[0-9]+' + Exponent
92 Floatnumber = group(Pointfloat, Expfloat)
93 Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
94 Number = group(Imagnumber, Floatnumber, Intnumber)
95
96 if sys.version_info.minor >= 3:
97 StringPrefix = r'(?:[bB][rR]?|[rR][bB]?|[uU])?'
98 else:
99 StringPrefix = r'(?:[bB]?[rR]?)?'
100
101 # Tail end of ' string.
102 Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
103 # Tail end of " string.
104 Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
105 # Tail end of ''' string.
106 Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
107 # Tail end of """ string.
108 Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
109 Triple = group(StringPrefix + "'''", StringPrefix + '"""')
110 # Single-line ' or " string.
111 String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
112 StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
113
114 # Because of leftmost-then-longest match semantics, be sure to put the
115 # longest operators first (e.g., if = came before ==, == would get
116 # recognized as two instances of =).
117 Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
118 r"//=?", r"->",
119 r"[+\-*/%&|^=<>]=?",
120 r"~")
121
122 Bracket = '[][(){}]'
123 Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
124 Funny = group(Operator, Bracket, Special)
125
126 PlainToken = group(Number, Funny, String, Name)
127 Token = Ignore + PlainToken
128
129 # First (or only) line of ' or " string.
130 ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
131 group("'", r'\\\r?\n'),
132 StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
133 group('"', r'\\\r?\n'))
134 PseudoExtras = group(r'\\\r?\n', Comment, Triple)
135 PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
136
137 def _compile(expr):
138 return re.compile(expr, re.UNICODE)
139
140 tokenprog, pseudoprog, single3prog, double3prog = map(
141 _compile, (Token, PseudoToken, Single3, Double3))
142 endprogs = {"'": _compile(Single), '"': _compile(Double),
143 "'''": single3prog, '"""': double3prog,
144 "r'''": single3prog, 'r"""': double3prog,
145 "b'''": single3prog, 'b"""': double3prog,
146 "R'''": single3prog, 'R"""': double3prog,
147 "B'''": single3prog, 'B"""': double3prog,
148 "br'''": single3prog, 'br"""': double3prog,
149 "bR'''": single3prog, 'bR"""': double3prog,
150 "Br'''": single3prog, 'Br"""': double3prog,
151 "BR'''": single3prog, 'BR"""': double3prog,
152 'r': None, 'R': None, 'b': None, 'B': None}
153
154 triple_quoted = {}
155 for t in ("'''", '"""',
156 "r'''", 'r"""', "R'''", 'R"""',
157 "b'''", 'b"""', "B'''", 'B"""',
158 "br'''", 'br"""', "Br'''", 'Br"""',
159 "bR'''", 'bR"""', "BR'''", 'BR"""'):
160 triple_quoted[t] = t
161 single_quoted = {}
162 for t in ("'", '"',
163 "r'", 'r"', "R'", 'R"',
164 "b'", 'b"', "B'", 'B"',
165 "br'", 'br"', "Br'", 'Br"',
166 "bR'", 'bR"', "BR'", 'BR"' ):
167 single_quoted[t] = t
168
169 if sys.version_info.minor >= 3:
170 # Python 3.3
171 for _prefix in ['rb', 'rB', 'Rb', 'RB', 'u', 'U']:
172 _t2 = _prefix+'"""'
173 endprogs[_t2] = double3prog
174 triple_quoted[_t2] = _t2
175 _t1 = _prefix + "'''"
176 endprogs[_t1] = single3prog
177 triple_quoted[_t1] = _t1
178 single_quoted[_prefix+'"'] = _prefix+'"'
179 single_quoted[_prefix+"'"] = _prefix+"'"
180 del _prefix, _t2, _t1
181 endprogs['u'] = None
182 endprogs['U'] = None
183
184 del _compile
185
186 tabsize = 8
187
188 class TokenError(Exception): pass
189
190 class StopTokenizing(Exception): pass
191
192
193 class Untokenizer:
194
195 def __init__(self):
196 self.tokens = []
197 self.prev_row = 1
198 self.prev_col = 0
199 self.encoding = 'utf-8'
200
201 def add_whitespace(self, tok_type, start):
202 row, col = start
203 assert row >= self.prev_row
204 col_offset = col - self.prev_col
205 if col_offset > 0:
206 self.tokens.append(" " * col_offset)
207 elif row > self.prev_row and tok_type not in (NEWLINE, NL, ENDMARKER):
208 # Line was backslash-continued.
209 self.tokens.append(" ")
210
211 def untokenize(self, tokens):
212 iterable = iter(tokens)
213 for t in iterable:
214 if len(t) == 2:
215 self.compat(t, iterable)
216 break
217 tok_type, token, start, end = t[:4]
218 if tok_type == ENCODING:
219 self.encoding = token
220 continue
221 self.add_whitespace(tok_type, start)
222 self.tokens.append(token)
223 self.prev_row, self.prev_col = end
224 if tok_type in (NEWLINE, NL):
225 self.prev_row += 1
226 self.prev_col = 0
227 return "".join(self.tokens)
228
229 def compat(self, token, iterable):
230 # This import is here to avoid problems when the itertools
231 # module is not built yet and tokenize is imported.
232 from itertools import chain
233 startline = False
234 prevstring = False
235 indents = []
236 toks_append = self.tokens.append
237
238 for tok in chain([token], iterable):
239 toknum, tokval = tok[:2]
240 if toknum == ENCODING:
241 self.encoding = tokval
242 continue
243
244 if toknum in (NAME, NUMBER):
245 tokval += ' '
246
247 # Insert a space between two consecutive strings
248 if toknum == STRING:
249 if prevstring:
250 tokval = ' ' + tokval
251 prevstring = True
252 else:
253 prevstring = False
254
255 if toknum == INDENT:
256 indents.append(tokval)
257 continue
258 elif toknum == DEDENT:
259 indents.pop()
260 continue
261 elif toknum in (NEWLINE, NL):
262 startline = True
263 elif startline and indents:
264 toks_append(indents[-1])
265 startline = False
266 toks_append(tokval)
267
268
269 def untokenize(tokens):
270 """
271 Convert ``tokens`` (an iterable) back into Python source code. Return
272 a bytes object, encoded using the encoding specified by the last
273 ENCODING token in ``tokens``, or UTF-8 if no ENCODING token is found.
274
275 The result is guaranteed to tokenize back to match the input so that
276 the conversion is lossless and round-trips are assured. The
277 guarantee applies only to the token type and token string as the
278 spacing between tokens (column positions) may change.
279
280 :func:`untokenize` has two modes. If the input tokens are sequences
281 of length 2 (``type``, ``string``) then spaces are added as necessary to
282 preserve the round-trip property.
283
284 If the input tokens are sequences of length 4 or more (``type``,
285 ``string``, ``start``, ``end``), as returned by :func:`tokenize`, then
286 spaces are added so that each token appears in the result at the
287 position indicated by ``start`` and ``end``, if possible.
288 """
289 return Untokenizer().untokenize(tokens)
290
291
292 def _get_normal_name(orig_enc):
293 """Imitates get_normal_name in tokenizer.c."""
294 # Only care about the first 12 characters.
295 enc = orig_enc[:12].lower().replace("_", "-")
296 if enc == "utf-8" or enc.startswith("utf-8-"):
297 return "utf-8"
298 if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
299 enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
300 return "iso-8859-1"
301 return orig_enc
302
303 def detect_encoding(readline):
304 """
305 The detect_encoding() function is used to detect the encoding that should
306 be used to decode a Python source file. It requires one argment, readline,
307 in the same way as the tokenize() generator.
308
309 It will call readline a maximum of twice, and return the encoding used
310 (as a string) and a list of any lines (left as bytes) it has read in.
311
312 It detects the encoding from the presence of a utf-8 bom or an encoding
313 cookie as specified in pep-0263. If both a bom and a cookie are present,
314 but disagree, a SyntaxError will be raised. If the encoding cookie is an
315 invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,
316 'utf-8-sig' is returned.
317
318 If no encoding is specified, then the default of 'utf-8' will be returned.
319 """
320 bom_found = False
321 encoding = None
322 default = 'utf-8'
323 def read_or_stop():
324 try:
325 return readline()
326 except StopIteration:
327 return b''
328
329 def find_cookie(line):
330 try:
331 # Decode as UTF-8. Either the line is an encoding declaration,
332 # in which case it should be pure ASCII, or it must be UTF-8
333 # per default encoding.
334 line_string = line.decode('utf-8')
335 except UnicodeDecodeError:
336 raise SyntaxError("invalid or missing encoding declaration")
337
338 matches = cookie_re.findall(line_string)
339 if not matches:
340 return None
341 encoding = _get_normal_name(matches[0])
342 try:
343 codec = lookup(encoding)
344 except LookupError:
345 # This behaviour mimics the Python interpreter
346 raise SyntaxError("unknown encoding: " + encoding)
347
348 if bom_found:
349 if encoding != 'utf-8':
350 # This behaviour mimics the Python interpreter
351 raise SyntaxError('encoding problem: utf-8')
352 encoding += '-sig'
353 return encoding
354
355 first = read_or_stop()
356 if first.startswith(BOM_UTF8):
357 bom_found = True
358 first = first[3:]
359 default = 'utf-8-sig'
360 if not first:
361 return default, []
362
363 encoding = find_cookie(first)
364 if encoding:
365 return encoding, [first]
366
367 second = read_or_stop()
368 if not second:
369 return default, [first]
370
371 encoding = find_cookie(second)
372 if encoding:
373 return encoding, [first, second]
374
375 return default, [first, second]
376
377
378 def open(filename):
379 """Open a file in read only mode using the encoding detected by
380 detect_encoding().
381 """
382 buffer = builtins.open(filename, 'rb')
383 encoding, lines = detect_encoding(buffer.readline)
384 buffer.seek(0)
385 text = TextIOWrapper(buffer, encoding, line_buffering=True)
386 text.mode = 'r'
387 return text
388
389
390 def tokenize(readline):
391 """
392 The tokenize() generator requires one argment, readline, which
393 must be a callable object which provides the same interface as the
394 readline() method of built-in file objects. Each call to the function
395 should return one line of input as bytes. Alternately, readline
396 can be a callable function terminating with StopIteration:
397 readline = open(myfile, 'rb').__next__ # Example of alternate readline
398
399 The generator produces 5-tuples with these members: the token type; the
400 token string; a 2-tuple (srow, scol) of ints specifying the row and
401 column where the token begins in the source; a 2-tuple (erow, ecol) of
402 ints specifying the row and column where the token ends in the source;
403 and the line on which the token was found. The line passed is the
404 logical line; continuation lines are included.
405
406 The first token sequence will always be an ENCODING token
407 which tells you which encoding was used to decode the bytes stream.
408 """
409 # This import is here to avoid problems when the itertools module is not
410 # built yet and tokenize is imported.
411 from itertools import chain, repeat
412 encoding, consumed = detect_encoding(readline)
413 rl_gen = iter(readline, b"")
414 empty = repeat(b"")
415 return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding)
416
417
418 def _tokenize(readline, encoding):
419 lnum = parenlev = continued = 0
420 numchars = '0123456789'
421 contstr, needcont = '', 0
422 contline = None
423 indents = [0]
424
425 if encoding is not None:
426 if encoding == "utf-8-sig":
427 # BOM will already have been stripped.
428 encoding = "utf-8"
429 yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
430 while True: # loop over lines in stream
431 try:
432 line = readline()
433 except StopIteration:
434 line = b''
435
436 if encoding is not None:
437 line = line.decode(encoding)
438 lnum += 1
439 pos, max = 0, len(line)
440
441 if contstr: # continued string
442 if not line:
443 raise TokenError("EOF in multi-line string", strstart)
444 endmatch = endprog.match(line)
445 if endmatch:
446 pos = end = endmatch.end(0)
447 yield TokenInfo(STRING, contstr + line[:end],
448 strstart, (lnum, end), contline + line)
449 contstr, needcont = '', 0
450 contline = None
451 elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
452 yield TokenInfo(ERRORTOKEN, contstr + line,
453 strstart, (lnum, len(line)), contline)
454 contstr = ''
455 contline = None
456 continue
457 else:
458 contstr = contstr + line
459 contline = contline + line
460 continue
461
462 elif parenlev == 0 and not continued: # new statement
463 if not line: break
464 column = 0
465 while pos < max: # measure leading whitespace
466 if line[pos] == ' ':
467 column += 1
468 elif line[pos] == '\t':
469 column = (column//tabsize + 1)*tabsize
470 elif line[pos] == '\f':
471 column = 0
472 else:
473 break
474 pos += 1
475 if pos == max:
476 break
477
478 if line[pos] in '#\r\n': # skip comments or blank lines
479 if line[pos] == '#':
480 comment_token = line[pos:].rstrip('\r\n')
481 nl_pos = pos + len(comment_token)
482 yield TokenInfo(COMMENT, comment_token,
483 (lnum, pos), (lnum, pos + len(comment_token)), line)
484 yield TokenInfo(NEWLINE, line[nl_pos:],
485 (lnum, nl_pos), (lnum, len(line)), line)
486 else:
487 yield TokenInfo(NEWLINE, line[pos:],
488 (lnum, pos), (lnum, len(line)), line)
489 continue
490
491 if column > indents[-1]: # count indents or dedents
492 indents.append(column)
493 yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
494 while column < indents[-1]:
495 if column not in indents:
496 raise IndentationError(
497 "unindent does not match any outer indentation level",
498 ("<tokenize>", lnum, pos, line))
499 indents = indents[:-1]
500 yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)
501
502 else: # continued statement
503 if not line:
504 raise TokenError("EOF in multi-line statement", (lnum, 0))
505 continued = 0
506
507 while pos < max:
508 pseudomatch = pseudoprog.match(line, pos)
509 if pseudomatch: # scan for tokens
510 start, end = pseudomatch.span(1)
511 spos, epos, pos = (lnum, start), (lnum, end), end
512 token, initial = line[start:end], line[start]
513
514 if (initial in numchars or # ordinary number
515 (initial == '.' and token != '.' and token != '...')):
516 yield TokenInfo(NUMBER, token, spos, epos, line)
517 elif initial in '\r\n':
518 yield TokenInfo(NL if parenlev > 0 else NEWLINE,
519 token, spos, epos, line)
520 elif initial == '#':
521 assert not token.endswith("\n")
522 yield TokenInfo(COMMENT, token, spos, epos, line)
523 elif token in triple_quoted:
524 endprog = endprogs[token]
525 endmatch = endprog.match(line, pos)
526 if endmatch: # all on one line
527 pos = endmatch.end(0)
528 token = line[start:pos]
529 yield TokenInfo(STRING, token, spos, (lnum, pos), line)
530 else:
531 strstart = (lnum, start) # multiple lines
532 contstr = line[start:]
533 contline = line
534 break
535 elif initial in single_quoted or \
536 token[:2] in single_quoted or \
537 token[:3] in single_quoted:
538 if token[-1] == '\n': # continued string
539 strstart = (lnum, start)
540 endprog = (endprogs[initial] or endprogs[token[1]] or
541 endprogs[token[2]])
542 contstr, needcont = line[start:], 1
543 contline = line
544 break
545 else: # ordinary string
546 yield TokenInfo(STRING, token, spos, epos, line)
547 elif initial.isidentifier(): # ordinary name
548 yield TokenInfo(NAME, token, spos, epos, line)
549 elif initial == '\\': # continued stmt
550 continued = 1
551 else:
552 if initial in '([{':
553 parenlev += 1
554 elif initial in ')]}':
555 parenlev -= 1
556 yield TokenInfo(OP, token, spos, epos, line)
557 else:
558 yield TokenInfo(ERRORTOKEN, line[pos],
559 (lnum, pos), (lnum, pos+1), line)
560 pos += 1
561
562 for indent in indents[1:]: # pop remaining indent levels
563 yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
564 yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
565
566
567 # An undocumented, backwards compatible, API for all the places in the standard
568 # library that expect to be able to use tokenize with strings
569 def generate_tokens(readline):
570 return _tokenize(readline, None)
571
572 if __name__ == "__main__":
573 # Quick sanity check
574 s = b'''def parseline(self, line):
575 """Parse the line into a command name and a string containing
576 the arguments. Returns a tuple containing (command, args, line).
577 'command' and 'args' may be None if the line couldn't be parsed.
578 """
579 line = line.strip()
580 if not line:
581 return None, None, line
582 elif line[0] == '?':
583 line = 'help ' + line[1:]
584 elif line[0] == '!':
585 if hasattr(self, 'do_shell'):
586 line = 'shell ' + line[1:]
587 else:
588 return None, None, line
589 i, n = 0, len(line)
590 while i < n and line[i] in self.identchars: i = i+1
591 cmd, arg = line[:i], line[i:].strip()
592 return cmd, arg, line
593 '''
594 for tok in tokenize(iter(s.splitlines()).__next__):
595 print(tok)
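The merged module keeps the bytes-in, namedtuple-out interface of stdlib tokenize, so it can be exercised the same way. A round-trip sketch against the functions defined above (the module path is assumed; in IPython it is loaded through the shim this commit replaces):

    import io

    src = b"x = 1 + 2\n"
    tokens = list(tokenize(io.BytesIO(src).readline))
    for tok in tokens:
        print(tok.type, tok.string)   # TokenInfo fields, ENCODING first

    # Token types and strings survive the round trip; spacing may not
    # (see the untokenize docstring above). The output here is str, since
    # this patched untokenize deliberately skips the encode step.
    print(untokenize(tokens))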
@@ -41,8 +41,6 @@ See IPython `README.rst` file for more information:
     print(error, file=sys.stderr)
     sys.exit(1)
 
-PY3 = (sys.version_info[0] >= 3)
-
 # At least we're on the python version we need, move on.
 
 #-------------------------------------------------------------------------------
@@ -114,9 +114,6 @@ if __name__ == "__main__":
     print("DEPRECATE: backport_pr.py is deprecated and is is now recommended"
         "to install `ghpro` from PyPI.", file=sys.stderr)
 
-    # deal with unicode
-    if sys.version_info < (3,):
-        sys.stdout = codecs.getwriter('utf8')(sys.stdout)
 
     # Whether to add reST urls for all issues in printout.
     show_urls = True
@@ -1,439 +0,0 @@

The deleted Python 2 module (imported above as ._tokenize_py2), shown with its old line numbers:
1 """Patched version of standard library tokenize, to deal with various bugs.
2
3 Patches
4
5 - Relevant parts of Gareth Rees' patch for Python issue #12691 (untokenizing),
6 manually applied.
7 - Newlines in comments and blank lines should be either NL or NEWLINE, depending
8 on whether they are in a multi-line statement. Filed as Python issue #17061.
9
10 -------------------------------------------------------------------------------
11 Tokenization help for Python programs.
12
13 generate_tokens(readline) is a generator that breaks a stream of
14 text into Python tokens. It accepts a readline-like method which is called
15 repeatedly to get the next line of input (or "" for EOF). It generates
16 5-tuples with these members:
17
18 the token type (see token.py)
19 the token (a string)
20 the starting (row, column) indices of the token (a 2-tuple of ints)
21 the ending (row, column) indices of the token (a 2-tuple of ints)
22 the original line (string)
23
24 It is designed to match the working of the Python tokenizer exactly, except
25 that it produces COMMENT tokens for comments and gives type OP for all
26 operators
27
28 Older entry points
29 tokenize_loop(readline, tokeneater)
30 tokenize(readline, tokeneater=printtoken)
31 are the same, except instead of generating tokens, tokeneater is a callback
32 function to which the 5 fields described above are passed as 5 arguments,
33 each time a new token is found."""
34 from __future__ import print_function
35
36 __author__ = 'Ka-Ping Yee <ping@lfw.org>'
37 __credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
38 'Skip Montanaro, Raymond Hettinger')
39
40 import string, re
41 from token import *
42
43 import token
44 __all__ = [x for x in dir(token) if not x.startswith("_")]
45 __all__ += ["COMMENT", "tokenize", "generate_tokens", "NL", "untokenize"]
46 del x
47 del token
48
49 __all__ += ["TokenError"]
50
51 COMMENT = N_TOKENS
52 tok_name[COMMENT] = 'COMMENT'
53 NL = N_TOKENS + 1
54 tok_name[NL] = 'NL'
55 N_TOKENS += 2
56
57 def group(*choices): return '(' + '|'.join(choices) + ')'
58 def any(*choices): return group(*choices) + '*'
59 def maybe(*choices): return group(*choices) + '?'
60
61 Whitespace = r'[ \f\t]*'
62 Comment = r'#[^\r\n]*'
63 Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
64 Name = r'[a-zA-Z_]\w*'
65
66 Hexnumber = r'0[xX][\da-fA-F]+[lL]?'
67 Octnumber = r'(0[oO][0-7]+)|(0[0-7]*)[lL]?'
68 Binnumber = r'0[bB][01]+[lL]?'
69 Decnumber = r'[1-9]\d*[lL]?'
70 Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
71 Exponent = r'[eE][-+]?\d+'
72 Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
73 Expfloat = r'\d+' + Exponent
74 Floatnumber = group(Pointfloat, Expfloat)
75 Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
76 Number = group(Imagnumber, Floatnumber, Intnumber)
77
78 # Tail end of ' string.
79 Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
80 # Tail end of " string.
81 Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
82 # Tail end of ''' string.
83 Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
84 # Tail end of """ string.
85 Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
86 Triple = group("[uUbB]?[rR]?'''", '[uUbB]?[rR]?"""')
87 # Single-line ' or " string.
88 String = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
89 r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
90
91 # Because of leftmost-then-longest match semantics, be sure to put the
92 # longest operators first (e.g., if = came before ==, == would get
93 # recognized as two instances of =).
94 Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
95 r"//=?",
96 r"[+\-*/%&|^=<>]=?",
97 r"~")
98
99 Bracket = '[][(){}]'
100 Special = group(r'\r?\n', r'[:;.,`@]')
101 Funny = group(Operator, Bracket, Special)
102
103 PlainToken = group(Number, Funny, String, Name)
104 Token = Ignore + PlainToken
105
106 # First (or only) line of ' or " string.
107 ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
108 group("'", r'\\\r?\n'),
109 r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
110 group('"', r'\\\r?\n'))
111 PseudoExtras = group(r'\\\r?\n', Comment, Triple)
112 PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
113
114 tokenprog, pseudoprog, single3prog, double3prog = map(
115 re.compile, (Token, PseudoToken, Single3, Double3))
116 endprogs = {"'": re.compile(Single), '"': re.compile(Double),
117 "'''": single3prog, '"""': double3prog,
118 "r'''": single3prog, 'r"""': double3prog,
119 "u'''": single3prog, 'u"""': double3prog,
120 "ur'''": single3prog, 'ur"""': double3prog,
121 "R'''": single3prog, 'R"""': double3prog,
122 "U'''": single3prog, 'U"""': double3prog,
123 "uR'''": single3prog, 'uR"""': double3prog,
124 "Ur'''": single3prog, 'Ur"""': double3prog,
125 "UR'''": single3prog, 'UR"""': double3prog,
126 "b'''": single3prog, 'b"""': double3prog,
127 "br'''": single3prog, 'br"""': double3prog,
128 "B'''": single3prog, 'B"""': double3prog,
129 "bR'''": single3prog, 'bR"""': double3prog,
130 "Br'''": single3prog, 'Br"""': double3prog,
131 "BR'''": single3prog, 'BR"""': double3prog,
132 'r': None, 'R': None, 'u': None, 'U': None,
133 'b': None, 'B': None}
134
135 triple_quoted = {}
136 for t in ("'''", '"""',
137 "r'''", 'r"""', "R'''", 'R"""',
138 "u'''", 'u"""', "U'''", 'U"""',
139 "ur'''", 'ur"""', "Ur'''", 'Ur"""',
140 "uR'''", 'uR"""', "UR'''", 'UR"""',
141 "b'''", 'b"""', "B'''", 'B"""',
142 "br'''", 'br"""', "Br'''", 'Br"""',
143 "bR'''", 'bR"""', "BR'''", 'BR"""'):
144 triple_quoted[t] = t
145 single_quoted = {}
146 for t in ("'", '"',
147 "r'", 'r"', "R'", 'R"',
148 "u'", 'u"', "U'", 'U"',
149 "ur'", 'ur"', "Ur'", 'Ur"',
150 "uR'", 'uR"', "UR'", 'UR"',
151 "b'", 'b"', "B'", 'B"',
152 "br'", 'br"', "Br'", 'Br"',
153 "bR'", 'bR"', "BR'", 'BR"' ):
154 single_quoted[t] = t
155
156 tabsize = 8
157
158 class TokenError(Exception): pass
159
160 class StopTokenizing(Exception): pass
161
162 def printtoken(type, token, srow_scol, erow_ecol, line): # for testing
163 srow, scol = srow_scol
164 erow, ecol = erow_ecol
165 print("%d,%d-%d,%d:\t%s\t%s" % \
166 (srow, scol, erow, ecol, tok_name[type], repr(token)))
167
168 def tokenize(readline, tokeneater=printtoken):
169 """
170 The tokenize() function accepts two parameters: one representing the
171 input stream, and one providing an output mechanism for tokenize().
172
173 The first parameter, readline, must be a callable object which provides
174 the same interface as the readline() method of built-in file objects.
175 Each call to the function should return one line of input as a string.
176
177 The second parameter, tokeneater, must also be a callable object. It is
178 called once for each token, with five arguments, corresponding to the
179 tuples generated by generate_tokens().
180 """
181 try:
182 tokenize_loop(readline, tokeneater)
183 except StopTokenizing:
184 pass
185
186 # backwards compatible interface
187 def tokenize_loop(readline, tokeneater):
188 for token_info in generate_tokens(readline):
189 tokeneater(*token_info)
190
191 class Untokenizer:
192
193 def __init__(self):
194 self.tokens = []
195 self.prev_row = 1
196 self.prev_col = 0
197
198 def add_whitespace(self, start):
199 row, col = start
200 assert row >= self.prev_row
201 col_offset = col - self.prev_col
202 if col_offset > 0:
203 self.tokens.append(" " * col_offset)
204 elif row > self.prev_row and tok_type not in (NEWLINE, NL, ENDMARKER):
205 # Line was backslash-continued
206 self.tokens.append(" ")
207
208 def untokenize(self, tokens):
209 iterable = iter(tokens)
210 for t in iterable:
211 if len(t) == 2:
212 self.compat(t, iterable)
213 break
214 tok_type, token, start, end = t[:4]
215 self.add_whitespace(start)
216 self.tokens.append(token)
217 self.prev_row, self.prev_col = end
218 if tok_type in (NEWLINE, NL):
219 self.prev_row += 1
220 self.prev_col = 0
221 return "".join(self.tokens)
222
223 def compat(self, token, iterable):
224 # This import is here to avoid problems when the itertools
225 # module is not built yet and tokenize is imported.
226 from itertools import chain
227 startline = False
228 prevstring = False
229 indents = []
230 toks_append = self.tokens.append
231 for tok in chain([token], iterable):
232 toknum, tokval = tok[:2]
233
234 if toknum in (NAME, NUMBER):
235 tokval += ' '
236
237 # Insert a space between two consecutive strings
238 if toknum == STRING:
239 if prevstring:
240 tokval = ' ' + tokval
241 prevstring = True
242 else:
243 prevstring = False
244
245 if toknum == INDENT:
246 indents.append(tokval)
247 continue
248 elif toknum == DEDENT:
249 indents.pop()
250 continue
251 elif toknum in (NEWLINE, NL):
252 startline = True
253 elif startline and indents:
254 toks_append(indents[-1])
255 startline = False
256 toks_append(tokval)
257
258 def untokenize(iterable):
259 """Transform tokens back into Python source code.
260
261 Each element returned by the iterable must be a token sequence
262 with at least two elements, a token number and token value. If
263 only two tokens are passed, the resulting output is poor.
264
265 Round-trip invariant for full input:
266 Untokenized source will match input source exactly
267
268 Round-trip invariant for limited intput:
269 # Output text will tokenize the back to the input
270 t1 = [tok[:2] for tok in generate_tokens(f.readline)]
271 newcode = untokenize(t1)
272 readline = iter(newcode.splitlines(1)).next
273 t2 = [tok[:2] for tok in generate_tokens(readline)]
274 assert t1 == t2
275 """
276 ut = Untokenizer()
277 return ut.untokenize(iterable)
278
279 def generate_tokens(readline):
280 """
281 The generate_tokens() generator requires one argment, readline, which
282 must be a callable object which provides the same interface as the
283 readline() method of built-in file objects. Each call to the function
284 should return one line of input as a string. Alternately, readline
285 can be a callable function terminating with StopIteration:
286 readline = open(myfile).next # Example of alternate readline
287
288 The generator produces 5-tuples with these members: the token type; the
289 token string; a 2-tuple (srow, scol) of ints specifying the row and
290 column where the token begins in the source; a 2-tuple (erow, ecol) of
291 ints specifying the row and column where the token ends in the source;
292 and the line on which the token was found. The line passed is the
293 logical line; continuation lines are included.
294 """
295 lnum = parenlev = continued = 0
296 namechars, numchars = string.ascii_letters + '_', '0123456789'
297 contstr, needcont = '', 0
298 contline = None
299 indents = [0]
300
301 while 1: # loop over lines in stream
302 try:
303 line = readline()
304 except StopIteration:
305 line = ''
306 lnum += 1
307 pos, max = 0, len(line)
308
309 if contstr: # continued string
310 if not line:
311 raise TokenError("EOF in multi-line string", strstart)
312 endmatch = endprog.match(line)
313 if endmatch:
314 pos = end = endmatch.end(0)
315 yield (STRING, contstr + line[:end],
316 strstart, (lnum, end), contline + line)
317 contstr, needcont = '', 0
318 contline = None
319 elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
320 yield (ERRORTOKEN, contstr + line,
321 strstart, (lnum, len(line)), contline)
322 contstr = ''
323 contline = None
324 continue
325 else:
326 contstr = contstr + line
327 contline = contline + line
328 continue
329
330 elif parenlev == 0 and not continued: # new statement
331 if not line: break
332 column = 0
333 while pos < max: # measure leading whitespace
334 if line[pos] == ' ':
335 column += 1
336 elif line[pos] == '\t':
337 column = (column//tabsize + 1)*tabsize
338 elif line[pos] == '\f':
339 column = 0
340 else:
341 break
342 pos += 1
343 if pos == max:
344 break
345
346 if line[pos] in '#\r\n': # skip comments or blank lines
347 if line[pos] == '#':
348 comment_token = line[pos:].rstrip('\r\n')
349 nl_pos = pos + len(comment_token)
350 yield (COMMENT, comment_token,
351 (lnum, pos), (lnum, pos + len(comment_token)), line)
352 yield (NEWLINE, line[nl_pos:],
353 (lnum, nl_pos), (lnum, len(line)), line)
354 else:
355 yield (NEWLINE, line[pos:],
356 (lnum, pos), (lnum, len(line)), line)
357 continue
358
359 if column > indents[-1]: # count indents or dedents
360 indents.append(column)
361 yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
362 while column < indents[-1]:
363 if column not in indents:
364 raise IndentationError(
365 "unindent does not match any outer indentation level",
366 ("<tokenize>", lnum, pos, line))
367 indents = indents[:-1]
368 yield (DEDENT, '', (lnum, pos), (lnum, pos), line)
369
370 else: # continued statement
371 if not line:
372 raise TokenError("EOF in multi-line statement", (lnum, 0))
373 continued = 0
374
375 while pos < max:
376 pseudomatch = pseudoprog.match(line, pos)
377 if pseudomatch: # scan for tokens
378 start, end = pseudomatch.span(1)
379 spos, epos, pos = (lnum, start), (lnum, end), end
380 token, initial = line[start:end], line[start]
381
382 if initial in numchars or \
383 (initial == '.' and token != '.'): # ordinary number
384 yield (NUMBER, token, spos, epos, line)
385 elif initial in '\r\n':
386 yield (NL if parenlev > 0 else NEWLINE,
387 token, spos, epos, line)
388 elif initial == '#':
389 assert not token.endswith("\n")
390 yield (COMMENT, token, spos, epos, line)
391 elif token in triple_quoted:
392 endprog = endprogs[token]
393 endmatch = endprog.match(line, pos)
394 if endmatch: # all on one line
395 pos = endmatch.end(0)
396 token = line[start:pos]
397 yield (STRING, token, spos, (lnum, pos), line)
398 else:
399 strstart = (lnum, start) # multiple lines
400 contstr = line[start:]
401 contline = line
402 break
403 elif initial in single_quoted or \
404 token[:2] in single_quoted or \
405 token[:3] in single_quoted:
406 if token[-1] == '\n': # continued string
407 strstart = (lnum, start)
408 endprog = (endprogs[initial] or endprogs[token[1]] or
409 endprogs[token[2]])
410 contstr, needcont = line[start:], 1
411 contline = line
412 break
413 else: # ordinary string
414 yield (STRING, token, spos, epos, line)
415 elif initial in namechars: # ordinary name
416 yield (NAME, token, spos, epos, line)
417 elif initial == '\\': # continued stmt
418 continued = 1
419 else:
420 if initial in '([{':
421 parenlev += 1
422 elif initial in ')]}':
423 parenlev -= 1
424 yield (OP, token, spos, epos, line)
425 else:
426 yield (ERRORTOKEN, line[pos],
427 (lnum, pos), (lnum, pos+1), line)
428 pos += 1
429
430 for indent in indents[1:]: # pop remaining indent levels
431 yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
432 yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
433
434 if __name__ == '__main__': # testing
435 import sys
436 if len(sys.argv) > 1:
437 tokenize(open(sys.argv[1]).readline)
438 else:
439 tokenize(sys.stdin.readline)
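For contrast with the merged module above: this deleted Python 2 variant worked on text rather than bytes, yielded plain 5-tuples instead of TokenInfo, and had no ENCODING token. Its calling convention looked like this (Python 2 only):

    from StringIO import StringIO

    src = u"x = 1 + 2\n"
    # generate_tokens takes a readline returning str and yields 5-tuples.
    for tok_type, tok_string, start, end, line in generate_tokens(StringIO(src).readline):
        print tok_type, tok_string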
@@ -1,595 +0,0 @@

The deleted Python 3 module (imported above as ._tokenize_py3), shown with its old line numbers:
1 """Patched version of standard library tokenize, to deal with various bugs.
2
3 Based on Python 3.2 code.
4
5 Patches:
6
7 - Gareth Rees' patch for Python issue #12691 (untokenizing)
8 - Except we don't encode the output of untokenize
9 - Python 2 compatible syntax, so that it can be byte-compiled at installation
10 - Newlines in comments and blank lines should be either NL or NEWLINE, depending
11 on whether they are in a multi-line statement. Filed as Python issue #17061.
12 - Export generate_tokens & TokenError
13 - u and rb literals are allowed under Python 3.3 and above.
14
15 ------------------------------------------------------------------------------
16 Tokenization help for Python programs.
17
18 tokenize(readline) is a generator that breaks a stream of bytes into
19 Python tokens. It decodes the bytes according to PEP-0263 for
20 determining source file encoding.
21
22 It accepts a readline-like method which is called repeatedly to get the
23 next line of input (or b"" for EOF). It generates 5-tuples with these
24 members:
25
26 the token type (see token.py)
27 the token (a string)
28 the starting (row, column) indices of the token (a 2-tuple of ints)
29 the ending (row, column) indices of the token (a 2-tuple of ints)
30 the original line (string)
31
32 It is designed to match the working of the Python tokenizer exactly, except
33 that it produces COMMENT tokens for comments and gives type OP for all
34 operators. Additionally, all token lists start with an ENCODING token
35 which tells you which encoding was used to decode the bytes stream.
36 """
37 from __future__ import absolute_import
38
39 __author__ = 'Ka-Ping Yee <ping@lfw.org>'
40 __credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
41 'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
42 'Michael Foord')
43 import builtins
44 import re
45 import sys
46 from token import *
47 from codecs import lookup, BOM_UTF8
48 import collections
49 from io import TextIOWrapper
50 cookie_re = re.compile("coding[:=]\s*([-\w.]+)")
51
52 import token
53 __all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
54 "NL", "untokenize", "ENCODING", "TokenInfo"]
55 del token
56
57 __all__ += ["generate_tokens", "TokenError"]
58
59 COMMENT = N_TOKENS
60 tok_name[COMMENT] = 'COMMENT'
61 NL = N_TOKENS + 1
62 tok_name[NL] = 'NL'
63 ENCODING = N_TOKENS + 2
64 tok_name[ENCODING] = 'ENCODING'
65 N_TOKENS += 3
66
67 class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
68 def __repr__(self):
69 annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
70 return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
71 self._replace(type=annotated_type))
72
73 def group(*choices): return '(' + '|'.join(choices) + ')'
74 def any(*choices): return group(*choices) + '*'
75 def maybe(*choices): return group(*choices) + '?'
76
77 # Note: we use unicode matching for names ("\w") but ascii matching for
78 # number literals.
79 Whitespace = r'[ \f\t]*'
80 Comment = r'#[^\r\n]*'
81 Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
82 Name = r'\w+'
83
84 Hexnumber = r'0[xX][0-9a-fA-F]+'
85 Binnumber = r'0[bB][01]+'
86 Octnumber = r'0[oO][0-7]+'
87 Decnumber = r'(?:0+|[1-9][0-9]*)'
88 Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
89 Exponent = r'[eE][-+]?[0-9]+'
90 Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
91 Expfloat = r'[0-9]+' + Exponent
92 Floatnumber = group(Pointfloat, Expfloat)
93 Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
94 Number = group(Imagnumber, Floatnumber, Intnumber)
95
96 if sys.version_info.minor >= 3:
97 StringPrefix = r'(?:[bB][rR]?|[rR][bB]?|[uU])?'
98 else:
99 StringPrefix = r'(?:[bB]?[rR]?)?'
100
101 # Tail end of ' string.
102 Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
103 # Tail end of " string.
104 Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
105 # Tail end of ''' string.
106 Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
107 # Tail end of """ string.
108 Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
109 Triple = group(StringPrefix + "'''", StringPrefix + '"""')
110 # Single-line ' or " string.
111 String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
112 StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
113
114 # Because of leftmost-then-longest match semantics, be sure to put the
115 # longest operators first (e.g., if = came before ==, == would get
116 # recognized as two instances of =).
117 Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
118 r"//=?", r"->",
119 r"[+\-*/%&|^=<>]=?",
120 r"~")
121
122 Bracket = '[][(){}]'
123 Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
124 Funny = group(Operator, Bracket, Special)
125
126 PlainToken = group(Number, Funny, String, Name)
127 Token = Ignore + PlainToken
128
129 # First (or only) line of ' or " string.
130 ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
131 group("'", r'\\\r?\n'),
132 StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
133 group('"', r'\\\r?\n'))
134 PseudoExtras = group(r'\\\r?\n', Comment, Triple)
135 PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
136
137 def _compile(expr):
138 return re.compile(expr, re.UNICODE)
139
140 tokenprog, pseudoprog, single3prog, double3prog = map(
141 _compile, (Token, PseudoToken, Single3, Double3))
142 endprogs = {"'": _compile(Single), '"': _compile(Double),
143 "'''": single3prog, '"""': double3prog,
144 "r'''": single3prog, 'r"""': double3prog,
145 "b'''": single3prog, 'b"""': double3prog,
146 "R'''": single3prog, 'R"""': double3prog,
147 "B'''": single3prog, 'B"""': double3prog,
148 "br'''": single3prog, 'br"""': double3prog,
149 "bR'''": single3prog, 'bR"""': double3prog,
150 "Br'''": single3prog, 'Br"""': double3prog,
151 "BR'''": single3prog, 'BR"""': double3prog,
152 'r': None, 'R': None, 'b': None, 'B': None}
153
154 triple_quoted = {}
155 for t in ("'''", '"""',
156 "r'''", 'r"""', "R'''", 'R"""',
157 "b'''", 'b"""', "B'''", 'B"""',
158 "br'''", 'br"""', "Br'''", 'Br"""',
159 "bR'''", 'bR"""', "BR'''", 'BR"""'):
160 triple_quoted[t] = t
161 single_quoted = {}
162 for t in ("'", '"',
163 "r'", 'r"', "R'", 'R"',
164 "b'", 'b"', "B'", 'B"',
165 "br'", 'br"', "Br'", 'Br"',
166 "bR'", 'bR"', "BR'", 'BR"' ):
167 single_quoted[t] = t
168
169 if sys.version_info.minor >= 3:
170 # Python 3.3
171 for _prefix in ['rb', 'rB', 'Rb', 'RB', 'u', 'U']:
172 _t2 = _prefix+'"""'
173 endprogs[_t2] = double3prog
174 triple_quoted[_t2] = _t2
175 _t1 = _prefix + "'''"
176 endprogs[_t1] = single3prog
177 triple_quoted[_t1] = _t1
178 single_quoted[_prefix+'"'] = _prefix+'"'
179 single_quoted[_prefix+"'"] = _prefix+"'"
180 del _prefix, _t2, _t1
181 endprogs['u'] = None
182 endprogs['U'] = None
183
184 del _compile
185
186 tabsize = 8
187
188 class TokenError(Exception): pass
189
190 class StopTokenizing(Exception): pass
191
192
193 class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.encoding = 'utf-8'

    def add_whitespace(self, tok_type, start):
        row, col = start
        assert row >= self.prev_row
        col_offset = col - self.prev_col
        if col_offset > 0:
            self.tokens.append(" " * col_offset)
        elif row > self.prev_row and tok_type not in (NEWLINE, NL, ENDMARKER):
            # Line was backslash-continued.
            self.tokens.append(" ")

    def untokenize(self, tokens):
        iterable = iter(tokens)
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end = t[:4]
            if tok_type == ENCODING:
                self.encoding = token
                continue
            self.add_whitespace(tok_type, start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        # This import is here to avoid problems when the itertools
        # module is not built yet and tokenize is imported.
        from itertools import chain
        startline = False
        prevstring = False
        indents = []
        toks_append = self.tokens.append

        for tok in chain([token], iterable):
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


def untokenize(tokens):
    """
    Convert ``tokens`` (an iterable) back into Python source code. Return
    a str of source text. (The encoding named by the last ENCODING token
    in ``tokens`` is recorded on the Untokenizer, but the result itself is
    returned as text, not encoded to bytes.)

    The result is guaranteed to tokenize back to match the input, so the
    conversion is lossless and round-trips are assured. The guarantee
    applies only to the token type and token string; the spacing between
    tokens (column positions) may change.

    :func:`untokenize` has two modes. If the input tokens are sequences
    of length 2 (``type``, ``string``) then spaces are added as necessary to
    preserve the round-trip property.

    If the input tokens are sequences of length 4 or more (``type``,
    ``string``, ``start``, ``end``), as returned by :func:`tokenize`, then
    spaces are added so that each token appears in the result at the
    position indicated by ``start`` and ``end``, if possible.
    """
    return Untokenizer().untokenize(tokens)
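
# Round-trip sketch (illustrative; uses this module's own tokenize()):
#   source = b"x = 1\n"
#   toks = list(tokenize(iter(source.splitlines(True)).__next__))
#   untokenize(toks)    # -> 'x = 1\n'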


def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc
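
# e.g. _get_normal_name("UTF_8") -> "utf-8";
#      _get_normal_name("Latin-1") -> "iso-8859-1"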

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 263. If both a BOM and a cookie are present,
    but disagree, a SyntaxError will be raised. If the encoding cookie is an
    invalid charset, raise a SyntaxError. Note that if a UTF-8 BOM is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = 'utf-8'

    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            # Decode as UTF-8. Either the line is an encoding declaration,
            # in which case it should be pure ASCII, or it must be UTF-8
            # per default encoding.
            line_string = line.decode('utf-8')
        except UnicodeDecodeError:
            raise SyntaxError("invalid or missing encoding declaration")

        matches = cookie_re.findall(line_string)
        if not matches:
            return None
        encoding = _get_normal_name(matches[0])
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if encoding != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
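
# e.g. (assumes 'example.py' exists on disk):
#   with builtins.open('example.py', 'rb') as f:
#       enc, first_lines = detect_encoding(f.readline)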


def open(filename):
    """Open a file in read-only mode using the encoding detected by
    detect_encoding().
    """
    buffer = builtins.open(filename, 'rb')
    encoding, lines = detect_encoding(buffer.readline)
    buffer.seek(0)
    text = TextIOWrapper(buffer, encoding, line_buffering=True)
    text.mode = 'r'
    return text
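
# e.g. (hypothetical file on disk):
#   f = open('example.py')    # this module's open(), not builtins.open
#   f.encoding                # the encoding detected by detect_encoding()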


def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as bytes. Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
    """
    # This import is here to avoid problems when the itertools module is not
    # built yet and tokenize is imported.
    from itertools import chain, repeat
    encoding, consumed = detect_encoding(readline)
    rl_gen = iter(readline, b"")
    empty = repeat(b"")
    return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding)
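
# e.g. tokenizing a file from disk (hypothetical path):
#   with builtins.open('example.py', 'rb') as f:
#       for tok in tokenize(f.readline):
#           print(tok)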


def _tokenize(readline, encoding):
    lnum = parenlev = continued = 0
    numchars = '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    if encoding is not None:
        if encoding == "utf-8-sig":
            # BOM will already have been stripped.
            encoding = "utf-8"
        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
    while True:                                # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = b''

        if encoding is not None:
            line = line.decode(encoding)
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield TokenInfo(STRING, contstr + line[:end],
                                strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield TokenInfo(ERRORTOKEN, contstr + line,
                                strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield TokenInfo(COMMENT, comment_token,
                                    (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield TokenInfo(NEWLINE, line[nl_pos:],
                                    (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield TokenInfo(NEWLINE, line[pos:],
                                    (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                    # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if (initial in numchars or     # ordinary number
                        (initial == '.' and token != '.' and token != '...')):
                    yield TokenInfo(NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield TokenInfo(NL if parenlev > 0 else NEWLINE,
                                    token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield TokenInfo(COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:               # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)    # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                        token[:2] in single_quoted or \
                        token[:3] in single_quoted:
                    if token[-1] == '\n':      # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                      # ordinary string
                        yield TokenInfo(STRING, token, spos, epos, line)
                elif initial.isidentifier():   # ordinary name
                    yield TokenInfo(NAME, token, spos, epos, line)
                elif initial == '\\':          # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield TokenInfo(OP, token, spos, epos, line)
            else:
                yield TokenInfo(ERRORTOKEN, line[pos],
                                (lnum, pos), (lnum, pos+1), line)
                pos += 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')


# An undocumented, backwards-compatible API for all the places in the
# standard library that expect to be able to use tokenize with strings.
def generate_tokens(readline):
    return _tokenize(readline, None)
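
# e.g. tokenizing str input rather than bytes:
#   import io
#   for tok in generate_tokens(io.StringIO("x = 1\n").readline):
#       print(tok)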

if __name__ == "__main__":
    # Quick sanity check
    s = b'''def parseline(self, line):
        """Parse the line into a command name and a string containing
        the arguments. Returns a tuple containing (command, args, line).
        'command' and 'args' may be None if the line couldn't be parsed.
        """
        line = line.strip()
        if not line:
            return None, None, line
        elif line[0] == '?':
            line = 'help ' + line[1:]
        elif line[0] == '!':
            if hasattr(self, 'do_shell'):
                line = 'shell ' + line[1:]
            else:
                return None, None, line
        i, n = 0, len(line)
        while i < n and line[i] in self.identchars: i = i+1
        cmd, arg = line[:i], line[i:].strip()
        return cmd, arg, line
    '''
    for tok in tokenize(iter(s.splitlines()).__next__):
        print(tok)