Fixes for tokenize in Python 3.3
Thomas Kluyver
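The patch replaces two references to the undefined name `prefix` with the loop variable `_prefix` (lines 172 and 175 of the file) and turns the stray `+` at line 179 into the intended assignment, so the table of `u`/`rb` string prefixes added for Python 3.3 is actually populated instead of raising NameError when the module is imported on 3.3. For orientation, here is a minimal usage sketch that follows the module docstring in the diff below; the import name `patched_tokenize` is a placeholder, since this view does not show the file's path.

    import io
    import patched_tokenize as tokenize  # placeholder name; the real module path is not shown in this view

    source = b"x = 1  # comment\nif x:\n    y = u'text'\n"

    # tokenize() takes a readline callable that returns bytes (b'' at EOF) and
    # yields TokenInfo 5-tuples: type, string, (row, col) start, (row, col) end,
    # and the source line. The first token is always ENCODING.
    for tok in tokenize.tokenize(io.BytesIO(source).readline):
        print(tok)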
@@ -1,595 +1,595 @@
1 """Patched version of standard library tokenize, to deal with various bugs.
1 """Patched version of standard library tokenize, to deal with various bugs.
2
2
3 Based on Python 3.2 code.
3 Based on Python 3.2 code.
4
4
5 Patches:
5 Patches:
6
6
7 - Gareth Rees' patch for Python issue #12691 (untokenizing)
7 - Gareth Rees' patch for Python issue #12691 (untokenizing)
8 - Except we don't encode the output of untokenize
8 - Except we don't encode the output of untokenize
9 - Python 2 compatible syntax, so that it can be byte-compiled at installation
9 - Python 2 compatible syntax, so that it can be byte-compiled at installation
10 - Newlines in comments and blank lines should be either NL or NEWLINE, depending
10 - Newlines in comments and blank lines should be either NL or NEWLINE, depending
11 on whether they are in a multi-line statement. Filed as Python issue #17061.
11 on whether they are in a multi-line statement. Filed as Python issue #17061.
12 - Export generate_tokens & TokenError
12 - Export generate_tokens & TokenError
13 - u and rb literals are allowed under Python 3.3 and above.
13 - u and rb literals are allowed under Python 3.3 and above.
14
14
15 ------------------------------------------------------------------------------
15 ------------------------------------------------------------------------------
16 Tokenization help for Python programs.
16 Tokenization help for Python programs.
17
17
18 tokenize(readline) is a generator that breaks a stream of bytes into
18 tokenize(readline) is a generator that breaks a stream of bytes into
19 Python tokens. It decodes the bytes according to PEP-0263 for
19 Python tokens. It decodes the bytes according to PEP-0263 for
20 determining source file encoding.
20 determining source file encoding.
21
21
22 It accepts a readline-like method which is called repeatedly to get the
22 It accepts a readline-like method which is called repeatedly to get the
23 next line of input (or b"" for EOF). It generates 5-tuples with these
23 next line of input (or b"" for EOF). It generates 5-tuples with these
24 members:
24 members:
25
25
26 the token type (see token.py)
26 the token type (see token.py)
27 the token (a string)
27 the token (a string)
28 the starting (row, column) indices of the token (a 2-tuple of ints)
28 the starting (row, column) indices of the token (a 2-tuple of ints)
29 the ending (row, column) indices of the token (a 2-tuple of ints)
29 the ending (row, column) indices of the token (a 2-tuple of ints)
30 the original line (string)
30 the original line (string)
31
31
32 It is designed to match the working of the Python tokenizer exactly, except
32 It is designed to match the working of the Python tokenizer exactly, except
33 that it produces COMMENT tokens for comments and gives type OP for all
33 that it produces COMMENT tokens for comments and gives type OP for all
34 operators. Additionally, all token lists start with an ENCODING token
34 operators. Additionally, all token lists start with an ENCODING token
35 which tells you which encoding was used to decode the bytes stream.
35 which tells you which encoding was used to decode the bytes stream.
36 """
36 """
37 from __future__ import absolute_import
37 from __future__ import absolute_import
38
38
39 __author__ = 'Ka-Ping Yee <ping@lfw.org>'
39 __author__ = 'Ka-Ping Yee <ping@lfw.org>'
40 __credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
40 __credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
41 'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
41 'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
42 'Michael Foord')
42 'Michael Foord')
43 import builtins
43 import builtins
44 import re
44 import re
45 import sys
45 import sys
46 from token import *
46 from token import *
47 from codecs import lookup, BOM_UTF8
47 from codecs import lookup, BOM_UTF8
48 import collections
48 import collections
49 from io import TextIOWrapper
49 from io import TextIOWrapper
50 cookie_re = re.compile("coding[:=]\s*([-\w.]+)")
50 cookie_re = re.compile("coding[:=]\s*([-\w.]+)")
51
51
52 import token
52 import token
53 __all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
53 __all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
54 "NL", "untokenize", "ENCODING", "TokenInfo"]
54 "NL", "untokenize", "ENCODING", "TokenInfo"]
55 del token
55 del token
56
56
57 __all__ += ["generate_tokens", "TokenError"]
57 __all__ += ["generate_tokens", "TokenError"]
58
58
59 COMMENT = N_TOKENS
59 COMMENT = N_TOKENS
60 tok_name[COMMENT] = 'COMMENT'
60 tok_name[COMMENT] = 'COMMENT'
61 NL = N_TOKENS + 1
61 NL = N_TOKENS + 1
62 tok_name[NL] = 'NL'
62 tok_name[NL] = 'NL'
63 ENCODING = N_TOKENS + 2
63 ENCODING = N_TOKENS + 2
64 tok_name[ENCODING] = 'ENCODING'
64 tok_name[ENCODING] = 'ENCODING'
65 N_TOKENS += 3
65 N_TOKENS += 3
66
66
67 class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
67 class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
68 def __repr__(self):
68 def __repr__(self):
69 annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
69 annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
70 return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
70 return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
71 self._replace(type=annotated_type))
71 self._replace(type=annotated_type))
72
72
73 def group(*choices): return '(' + '|'.join(choices) + ')'
73 def group(*choices): return '(' + '|'.join(choices) + ')'
74 def any(*choices): return group(*choices) + '*'
74 def any(*choices): return group(*choices) + '*'
75 def maybe(*choices): return group(*choices) + '?'
75 def maybe(*choices): return group(*choices) + '?'
76
76
77 # Note: we use unicode matching for names ("\w") but ascii matching for
77 # Note: we use unicode matching for names ("\w") but ascii matching for
78 # number literals.
78 # number literals.
79 Whitespace = r'[ \f\t]*'
79 Whitespace = r'[ \f\t]*'
80 Comment = r'#[^\r\n]*'
80 Comment = r'#[^\r\n]*'
81 Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
81 Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
82 Name = r'\w+'
82 Name = r'\w+'
83
83
84 Hexnumber = r'0[xX][0-9a-fA-F]+'
84 Hexnumber = r'0[xX][0-9a-fA-F]+'
85 Binnumber = r'0[bB][01]+'
85 Binnumber = r'0[bB][01]+'
86 Octnumber = r'0[oO][0-7]+'
86 Octnumber = r'0[oO][0-7]+'
87 Decnumber = r'(?:0+|[1-9][0-9]*)'
87 Decnumber = r'(?:0+|[1-9][0-9]*)'
88 Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
88 Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
89 Exponent = r'[eE][-+]?[0-9]+'
89 Exponent = r'[eE][-+]?[0-9]+'
90 Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
90 Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
91 Expfloat = r'[0-9]+' + Exponent
91 Expfloat = r'[0-9]+' + Exponent
92 Floatnumber = group(Pointfloat, Expfloat)
92 Floatnumber = group(Pointfloat, Expfloat)
93 Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
93 Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
94 Number = group(Imagnumber, Floatnumber, Intnumber)
94 Number = group(Imagnumber, Floatnumber, Intnumber)
95
95
96 if sys.version_info.minor >= 3:
96 if sys.version_info.minor >= 3:
97 StringPrefix = r'(?:[bB][rR]?|[rR][bB]?|[uU])?'
97 StringPrefix = r'(?:[bB][rR]?|[rR][bB]?|[uU])?'
98 else:
98 else:
99 StringPrefix = r'(?:[bB]?[rR]?)?'
99 StringPrefix = r'(?:[bB]?[rR]?)?'
100
100
101 # Tail end of ' string.
101 # Tail end of ' string.
102 Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
102 Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
103 # Tail end of " string.
103 # Tail end of " string.
104 Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
104 Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
105 # Tail end of ''' string.
105 # Tail end of ''' string.
106 Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
106 Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
107 # Tail end of """ string.
107 # Tail end of """ string.
108 Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
108 Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
109 Triple = group(StringPrefix + "'''", StringPrefix + '"""')
109 Triple = group(StringPrefix + "'''", StringPrefix + '"""')
110 # Single-line ' or " string.
110 # Single-line ' or " string.
111 String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
111 String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
112 StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
112 StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
113
113
114 # Because of leftmost-then-longest match semantics, be sure to put the
114 # Because of leftmost-then-longest match semantics, be sure to put the
115 # longest operators first (e.g., if = came before ==, == would get
115 # longest operators first (e.g., if = came before ==, == would get
116 # recognized as two instances of =).
116 # recognized as two instances of =).
117 Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
117 Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
118 r"//=?", r"->",
118 r"//=?", r"->",
119 r"[+\-*/%&|^=<>]=?",
119 r"[+\-*/%&|^=<>]=?",
120 r"~")
120 r"~")
121
121
122 Bracket = '[][(){}]'
122 Bracket = '[][(){}]'
123 Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
123 Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
124 Funny = group(Operator, Bracket, Special)
124 Funny = group(Operator, Bracket, Special)
125
125
126 PlainToken = group(Number, Funny, String, Name)
126 PlainToken = group(Number, Funny, String, Name)
127 Token = Ignore + PlainToken
127 Token = Ignore + PlainToken
128
128
129 # First (or only) line of ' or " string.
129 # First (or only) line of ' or " string.
130 ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
130 ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
131 group("'", r'\\\r?\n'),
131 group("'", r'\\\r?\n'),
132 StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
132 StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
133 group('"', r'\\\r?\n'))
133 group('"', r'\\\r?\n'))
134 PseudoExtras = group(r'\\\r?\n', Comment, Triple)
134 PseudoExtras = group(r'\\\r?\n', Comment, Triple)
135 PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
135 PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
136
136
137 def _compile(expr):
137 def _compile(expr):
138 return re.compile(expr, re.UNICODE)
138 return re.compile(expr, re.UNICODE)
139
139
140 tokenprog, pseudoprog, single3prog, double3prog = map(
140 tokenprog, pseudoprog, single3prog, double3prog = map(
141 _compile, (Token, PseudoToken, Single3, Double3))
141 _compile, (Token, PseudoToken, Single3, Double3))
142 endprogs = {"'": _compile(Single), '"': _compile(Double),
142 endprogs = {"'": _compile(Single), '"': _compile(Double),
143 "'''": single3prog, '"""': double3prog,
143 "'''": single3prog, '"""': double3prog,
144 "r'''": single3prog, 'r"""': double3prog,
144 "r'''": single3prog, 'r"""': double3prog,
145 "b'''": single3prog, 'b"""': double3prog,
145 "b'''": single3prog, 'b"""': double3prog,
146 "R'''": single3prog, 'R"""': double3prog,
146 "R'''": single3prog, 'R"""': double3prog,
147 "B'''": single3prog, 'B"""': double3prog,
147 "B'''": single3prog, 'B"""': double3prog,
148 "br'''": single3prog, 'br"""': double3prog,
148 "br'''": single3prog, 'br"""': double3prog,
149 "bR'''": single3prog, 'bR"""': double3prog,
149 "bR'''": single3prog, 'bR"""': double3prog,
150 "Br'''": single3prog, 'Br"""': double3prog,
150 "Br'''": single3prog, 'Br"""': double3prog,
151 "BR'''": single3prog, 'BR"""': double3prog,
151 "BR'''": single3prog, 'BR"""': double3prog,
152 'r': None, 'R': None, 'b': None, 'B': None}
152 'r': None, 'R': None, 'b': None, 'B': None}
153
153
154 triple_quoted = {}
154 triple_quoted = {}
155 for t in ("'''", '"""',
155 for t in ("'''", '"""',
156 "r'''", 'r"""', "R'''", 'R"""',
156 "r'''", 'r"""', "R'''", 'R"""',
157 "b'''", 'b"""', "B'''", 'B"""',
157 "b'''", 'b"""', "B'''", 'B"""',
158 "br'''", 'br"""', "Br'''", 'Br"""',
158 "br'''", 'br"""', "Br'''", 'Br"""',
159 "bR'''", 'bR"""', "BR'''", 'BR"""'):
159 "bR'''", 'bR"""', "BR'''", 'BR"""'):
160 triple_quoted[t] = t
160 triple_quoted[t] = t
161 single_quoted = {}
161 single_quoted = {}
162 for t in ("'", '"',
162 for t in ("'", '"',
163 "r'", 'r"', "R'", 'R"',
163 "r'", 'r"', "R'", 'R"',
164 "b'", 'b"', "B'", 'B"',
164 "b'", 'b"', "B'", 'B"',
165 "br'", 'br"', "Br'", 'Br"',
165 "br'", 'br"', "Br'", 'Br"',
166 "bR'", 'bR"', "BR'", 'BR"' ):
166 "bR'", 'bR"', "BR'", 'BR"' ):
167 single_quoted[t] = t
167 single_quoted[t] = t
168
168
169 if sys.version_info.minor >= 3:
169 if sys.version_info.minor >= 3:
170 # Python 3.3
170 # Python 3.3
171 for _prefix in ['rb', 'rB', 'Rb', 'RB', 'u', 'U']:
171 for _prefix in ['rb', 'rB', 'Rb', 'RB', 'u', 'U']:
172 _t2 = prefix+'"""'
172 _t2 = _prefix+'"""'
173 endprogs[_t2] = double3prog
173 endprogs[_t2] = double3prog
174 triple_quoted[_t2] = _t2
174 triple_quoted[_t2] = _t2
175 _t1 = prefix + "'''"
175 _t1 = _prefix + "'''"
176 endprogs[_t1] = single3prog
176 endprogs[_t1] = single3prog
177 triple_quoted[_t1] = _t1
177 triple_quoted[_t1] = _t1
178 single_quoted[_prefix+'"'] = _prefix+'"'
178 single_quoted[_prefix+'"'] = _prefix+'"'
179 single_quoted[_prefix+"'"] + _prefix+"'"
179 single_quoted[_prefix+"'"] = _prefix+"'"
180 del _prefix, _t2, _t1
180 del _prefix, _t2, _t1
181 endprogs['u'] = None
181 endprogs['u'] = None
182 endprogs['U'] = None
182 endprogs['U'] = None
183
183
184 del _compile
184 del _compile
185
185
186 tabsize = 8
186 tabsize = 8
187
187
188 class TokenError(Exception): pass
188 class TokenError(Exception): pass
189
189
190 class StopTokenizing(Exception): pass
190 class StopTokenizing(Exception): pass
191
191
192
192
193 class Untokenizer:
193 class Untokenizer:
194
194
195 def __init__(self):
195 def __init__(self):
196 self.tokens = []
196 self.tokens = []
197 self.prev_row = 1
197 self.prev_row = 1
198 self.prev_col = 0
198 self.prev_col = 0
199 self.encoding = 'utf-8'
199 self.encoding = 'utf-8'
200
200
201 def add_whitespace(self, tok_type, start):
201 def add_whitespace(self, tok_type, start):
202 row, col = start
202 row, col = start
203 assert row >= self.prev_row
203 assert row >= self.prev_row
204 col_offset = col - self.prev_col
204 col_offset = col - self.prev_col
205 if col_offset > 0:
205 if col_offset > 0:
206 self.tokens.append(" " * col_offset)
206 self.tokens.append(" " * col_offset)
207 elif row > self.prev_row and tok_type not in (NEWLINE, NL, ENDMARKER):
207 elif row > self.prev_row and tok_type not in (NEWLINE, NL, ENDMARKER):
208 # Line was backslash-continued.
208 # Line was backslash-continued.
209 self.tokens.append(" ")
209 self.tokens.append(" ")
210
210
211 def untokenize(self, tokens):
211 def untokenize(self, tokens):
212 iterable = iter(tokens)
212 iterable = iter(tokens)
213 for t in iterable:
213 for t in iterable:
214 if len(t) == 2:
214 if len(t) == 2:
215 self.compat(t, iterable)
215 self.compat(t, iterable)
216 break
216 break
217 tok_type, token, start, end = t[:4]
217 tok_type, token, start, end = t[:4]
218 if tok_type == ENCODING:
218 if tok_type == ENCODING:
219 self.encoding = token
219 self.encoding = token
220 continue
220 continue
221 self.add_whitespace(tok_type, start)
221 self.add_whitespace(tok_type, start)
222 self.tokens.append(token)
222 self.tokens.append(token)
223 self.prev_row, self.prev_col = end
223 self.prev_row, self.prev_col = end
224 if tok_type in (NEWLINE, NL):
224 if tok_type in (NEWLINE, NL):
225 self.prev_row += 1
225 self.prev_row += 1
226 self.prev_col = 0
226 self.prev_col = 0
227 return "".join(self.tokens)
227 return "".join(self.tokens)
228
228
229 def compat(self, token, iterable):
229 def compat(self, token, iterable):
230 # This import is here to avoid problems when the itertools
230 # This import is here to avoid problems when the itertools
231 # module is not built yet and tokenize is imported.
231 # module is not built yet and tokenize is imported.
232 from itertools import chain
232 from itertools import chain
233 startline = False
233 startline = False
234 prevstring = False
234 prevstring = False
235 indents = []
235 indents = []
236 toks_append = self.tokens.append
236 toks_append = self.tokens.append
237
237
238 for tok in chain([token], iterable):
238 for tok in chain([token], iterable):
239 toknum, tokval = tok[:2]
239 toknum, tokval = tok[:2]
240 if toknum == ENCODING:
240 if toknum == ENCODING:
241 self.encoding = tokval
241 self.encoding = tokval
242 continue
242 continue
243
243
244 if toknum in (NAME, NUMBER):
244 if toknum in (NAME, NUMBER):
245 tokval += ' '
245 tokval += ' '
246
246
247 # Insert a space between two consecutive strings
247 # Insert a space between two consecutive strings
248 if toknum == STRING:
248 if toknum == STRING:
249 if prevstring:
249 if prevstring:
250 tokval = ' ' + tokval
250 tokval = ' ' + tokval
251 prevstring = True
251 prevstring = True
252 else:
252 else:
253 prevstring = False
253 prevstring = False
254
254
255 if toknum == INDENT:
255 if toknum == INDENT:
256 indents.append(tokval)
256 indents.append(tokval)
257 continue
257 continue
258 elif toknum == DEDENT:
258 elif toknum == DEDENT:
259 indents.pop()
259 indents.pop()
260 continue
260 continue
261 elif toknum in (NEWLINE, NL):
261 elif toknum in (NEWLINE, NL):
262 startline = True
262 startline = True
263 elif startline and indents:
263 elif startline and indents:
264 toks_append(indents[-1])
264 toks_append(indents[-1])
265 startline = False
265 startline = False
266 toks_append(tokval)
266 toks_append(tokval)
267
267
268
268
269 def untokenize(tokens):
269 def untokenize(tokens):
270 """
270 """
271 Convert ``tokens`` (an iterable) back into Python source code. Return
271 Convert ``tokens`` (an iterable) back into Python source code. Return
272 a bytes object, encoded using the encoding specified by the last
272 a bytes object, encoded using the encoding specified by the last
273 ENCODING token in ``tokens``, or UTF-8 if no ENCODING token is found.
273 ENCODING token in ``tokens``, or UTF-8 if no ENCODING token is found.
274
274
275 The result is guaranteed to tokenize back to match the input so that
275 The result is guaranteed to tokenize back to match the input so that
276 the conversion is lossless and round-trips are assured. The
276 the conversion is lossless and round-trips are assured. The
277 guarantee applies only to the token type and token string as the
277 guarantee applies only to the token type and token string as the
278 spacing between tokens (column positions) may change.
278 spacing between tokens (column positions) may change.
279
279
280 :func:`untokenize` has two modes. If the input tokens are sequences
280 :func:`untokenize` has two modes. If the input tokens are sequences
281 of length 2 (``type``, ``string``) then spaces are added as necessary to
281 of length 2 (``type``, ``string``) then spaces are added as necessary to
282 preserve the round-trip property.
282 preserve the round-trip property.
283
283
284 If the input tokens are sequences of length 4 or more (``type``,
284 If the input tokens are sequences of length 4 or more (``type``,
285 ``string``, ``start``, ``end``), as returned by :func:`tokenize`, then
285 ``string``, ``start``, ``end``), as returned by :func:`tokenize`, then
286 spaces are added so that each token appears in the result at the
286 spaces are added so that each token appears in the result at the
287 position indicated by ``start`` and ``end``, if possible.
287 position indicated by ``start`` and ``end``, if possible.
288 """
288 """
289 return Untokenizer().untokenize(tokens)
289 return Untokenizer().untokenize(tokens)
290
290
291
291
292 def _get_normal_name(orig_enc):
292 def _get_normal_name(orig_enc):
293 """Imitates get_normal_name in tokenizer.c."""
293 """Imitates get_normal_name in tokenizer.c."""
294 # Only care about the first 12 characters.
294 # Only care about the first 12 characters.
295 enc = orig_enc[:12].lower().replace("_", "-")
295 enc = orig_enc[:12].lower().replace("_", "-")
296 if enc == "utf-8" or enc.startswith("utf-8-"):
296 if enc == "utf-8" or enc.startswith("utf-8-"):
297 return "utf-8"
297 return "utf-8"
298 if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
298 if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
299 enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
299 enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
300 return "iso-8859-1"
300 return "iso-8859-1"
301 return orig_enc
301 return orig_enc
302
302
303 def detect_encoding(readline):
303 def detect_encoding(readline):
304 """
304 """
305 The detect_encoding() function is used to detect the encoding that should
305 The detect_encoding() function is used to detect the encoding that should
306 be used to decode a Python source file. It requires one argment, readline,
306 be used to decode a Python source file. It requires one argment, readline,
307 in the same way as the tokenize() generator.
307 in the same way as the tokenize() generator.
308
308
309 It will call readline a maximum of twice, and return the encoding used
309 It will call readline a maximum of twice, and return the encoding used
310 (as a string) and a list of any lines (left as bytes) it has read in.
310 (as a string) and a list of any lines (left as bytes) it has read in.
311
311
312 It detects the encoding from the presence of a utf-8 bom or an encoding
312 It detects the encoding from the presence of a utf-8 bom or an encoding
313 cookie as specified in pep-0263. If both a bom and a cookie are present,
313 cookie as specified in pep-0263. If both a bom and a cookie are present,
314 but disagree, a SyntaxError will be raised. If the encoding cookie is an
314 but disagree, a SyntaxError will be raised. If the encoding cookie is an
315 invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,
315 invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,
316 'utf-8-sig' is returned.
316 'utf-8-sig' is returned.
317
317
318 If no encoding is specified, then the default of 'utf-8' will be returned.
318 If no encoding is specified, then the default of 'utf-8' will be returned.
319 """
319 """
320 bom_found = False
320 bom_found = False
321 encoding = None
321 encoding = None
322 default = 'utf-8'
322 default = 'utf-8'
323 def read_or_stop():
323 def read_or_stop():
324 try:
324 try:
325 return readline()
325 return readline()
326 except StopIteration:
326 except StopIteration:
327 return b''
327 return b''
328
328
329 def find_cookie(line):
329 def find_cookie(line):
330 try:
330 try:
331 # Decode as UTF-8. Either the line is an encoding declaration,
331 # Decode as UTF-8. Either the line is an encoding declaration,
332 # in which case it should be pure ASCII, or it must be UTF-8
332 # in which case it should be pure ASCII, or it must be UTF-8
333 # per default encoding.
333 # per default encoding.
334 line_string = line.decode('utf-8')
334 line_string = line.decode('utf-8')
335 except UnicodeDecodeError:
335 except UnicodeDecodeError:
336 raise SyntaxError("invalid or missing encoding declaration")
336 raise SyntaxError("invalid or missing encoding declaration")
337
337
338 matches = cookie_re.findall(line_string)
338 matches = cookie_re.findall(line_string)
339 if not matches:
339 if not matches:
340 return None
340 return None
341 encoding = _get_normal_name(matches[0])
341 encoding = _get_normal_name(matches[0])
342 try:
342 try:
343 codec = lookup(encoding)
343 codec = lookup(encoding)
344 except LookupError:
344 except LookupError:
345 # This behaviour mimics the Python interpreter
345 # This behaviour mimics the Python interpreter
346 raise SyntaxError("unknown encoding: " + encoding)
346 raise SyntaxError("unknown encoding: " + encoding)
347
347
348 if bom_found:
348 if bom_found:
349 if encoding != 'utf-8':
349 if encoding != 'utf-8':
350 # This behaviour mimics the Python interpreter
350 # This behaviour mimics the Python interpreter
351 raise SyntaxError('encoding problem: utf-8')
351 raise SyntaxError('encoding problem: utf-8')
352 encoding += '-sig'
352 encoding += '-sig'
353 return encoding
353 return encoding
354
354
355 first = read_or_stop()
355 first = read_or_stop()
356 if first.startswith(BOM_UTF8):
356 if first.startswith(BOM_UTF8):
357 bom_found = True
357 bom_found = True
358 first = first[3:]
358 first = first[3:]
359 default = 'utf-8-sig'
359 default = 'utf-8-sig'
360 if not first:
360 if not first:
361 return default, []
361 return default, []
362
362
363 encoding = find_cookie(first)
363 encoding = find_cookie(first)
364 if encoding:
364 if encoding:
365 return encoding, [first]
365 return encoding, [first]
366
366
367 second = read_or_stop()
367 second = read_or_stop()
368 if not second:
368 if not second:
369 return default, [first]
369 return default, [first]
370
370
371 encoding = find_cookie(second)
371 encoding = find_cookie(second)
372 if encoding:
372 if encoding:
373 return encoding, [first, second]
373 return encoding, [first, second]
374
374
375 return default, [first, second]
375 return default, [first, second]
376
376
377
377
378 def open(filename):
378 def open(filename):
379 """Open a file in read only mode using the encoding detected by
379 """Open a file in read only mode using the encoding detected by
380 detect_encoding().
380 detect_encoding().
381 """
381 """
382 buffer = builtins.open(filename, 'rb')
382 buffer = builtins.open(filename, 'rb')
383 encoding, lines = detect_encoding(buffer.readline)
383 encoding, lines = detect_encoding(buffer.readline)
384 buffer.seek(0)
384 buffer.seek(0)
385 text = TextIOWrapper(buffer, encoding, line_buffering=True)
385 text = TextIOWrapper(buffer, encoding, line_buffering=True)
386 text.mode = 'r'
386 text.mode = 'r'
387 return text
387 return text
388
388
389
389
390 def tokenize(readline):
390 def tokenize(readline):
391 """
391 """
392 The tokenize() generator requires one argment, readline, which
392 The tokenize() generator requires one argment, readline, which
393 must be a callable object which provides the same interface as the
393 must be a callable object which provides the same interface as the
394 readline() method of built-in file objects. Each call to the function
394 readline() method of built-in file objects. Each call to the function
395 should return one line of input as bytes. Alternately, readline
395 should return one line of input as bytes. Alternately, readline
396 can be a callable function terminating with StopIteration:
396 can be a callable function terminating with StopIteration:
397 readline = open(myfile, 'rb').__next__ # Example of alternate readline
397 readline = open(myfile, 'rb').__next__ # Example of alternate readline
398
398
399 The generator produces 5-tuples with these members: the token type; the
399 The generator produces 5-tuples with these members: the token type; the
400 token string; a 2-tuple (srow, scol) of ints specifying the row and
400 token string; a 2-tuple (srow, scol) of ints specifying the row and
401 column where the token begins in the source; a 2-tuple (erow, ecol) of
401 column where the token begins in the source; a 2-tuple (erow, ecol) of
402 ints specifying the row and column where the token ends in the source;
402 ints specifying the row and column where the token ends in the source;
403 and the line on which the token was found. The line passed is the
403 and the line on which the token was found. The line passed is the
404 logical line; continuation lines are included.
404 logical line; continuation lines are included.
405
405
406 The first token sequence will always be an ENCODING token
406 The first token sequence will always be an ENCODING token
407 which tells you which encoding was used to decode the bytes stream.
407 which tells you which encoding was used to decode the bytes stream.
408 """
408 """
409 # This import is here to avoid problems when the itertools module is not
409 # This import is here to avoid problems when the itertools module is not
410 # built yet and tokenize is imported.
410 # built yet and tokenize is imported.
411 from itertools import chain, repeat
411 from itertools import chain, repeat
412 encoding, consumed = detect_encoding(readline)
412 encoding, consumed = detect_encoding(readline)
413 rl_gen = iter(readline, b"")
413 rl_gen = iter(readline, b"")
414 empty = repeat(b"")
414 empty = repeat(b"")
415 return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding)
415 return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding)
416
416
417
417
418 def _tokenize(readline, encoding):
418 def _tokenize(readline, encoding):
419 lnum = parenlev = continued = 0
419 lnum = parenlev = continued = 0
420 numchars = '0123456789'
420 numchars = '0123456789'
421 contstr, needcont = '', 0
421 contstr, needcont = '', 0
422 contline = None
422 contline = None
423 indents = [0]
423 indents = [0]
424
424
425 if encoding is not None:
425 if encoding is not None:
426 if encoding == "utf-8-sig":
426 if encoding == "utf-8-sig":
427 # BOM will already have been stripped.
427 # BOM will already have been stripped.
428 encoding = "utf-8"
428 encoding = "utf-8"
429 yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
429 yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
430 while True: # loop over lines in stream
430 while True: # loop over lines in stream
431 try:
431 try:
432 line = readline()
432 line = readline()
433 except StopIteration:
433 except StopIteration:
434 line = b''
434 line = b''
435
435
436 if encoding is not None:
436 if encoding is not None:
437 line = line.decode(encoding)
437 line = line.decode(encoding)
438 lnum += 1
438 lnum += 1
439 pos, max = 0, len(line)
439 pos, max = 0, len(line)
440
440
441 if contstr: # continued string
441 if contstr: # continued string
442 if not line:
442 if not line:
443 raise TokenError("EOF in multi-line string", strstart)
443 raise TokenError("EOF in multi-line string", strstart)
444 endmatch = endprog.match(line)
444 endmatch = endprog.match(line)
445 if endmatch:
445 if endmatch:
446 pos = end = endmatch.end(0)
446 pos = end = endmatch.end(0)
447 yield TokenInfo(STRING, contstr + line[:end],
447 yield TokenInfo(STRING, contstr + line[:end],
448 strstart, (lnum, end), contline + line)
448 strstart, (lnum, end), contline + line)
449 contstr, needcont = '', 0
449 contstr, needcont = '', 0
450 contline = None
450 contline = None
451 elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
451 elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
452 yield TokenInfo(ERRORTOKEN, contstr + line,
452 yield TokenInfo(ERRORTOKEN, contstr + line,
453 strstart, (lnum, len(line)), contline)
453 strstart, (lnum, len(line)), contline)
454 contstr = ''
454 contstr = ''
455 contline = None
455 contline = None
456 continue
456 continue
457 else:
457 else:
458 contstr = contstr + line
458 contstr = contstr + line
459 contline = contline + line
459 contline = contline + line
460 continue
460 continue
461
461
462 elif parenlev == 0 and not continued: # new statement
462 elif parenlev == 0 and not continued: # new statement
463 if not line: break
463 if not line: break
464 column = 0
464 column = 0
465 while pos < max: # measure leading whitespace
465 while pos < max: # measure leading whitespace
466 if line[pos] == ' ':
466 if line[pos] == ' ':
467 column += 1
467 column += 1
468 elif line[pos] == '\t':
468 elif line[pos] == '\t':
469 column = (column//tabsize + 1)*tabsize
469 column = (column//tabsize + 1)*tabsize
470 elif line[pos] == '\f':
470 elif line[pos] == '\f':
471 column = 0
471 column = 0
472 else:
472 else:
473 break
473 break
474 pos += 1
474 pos += 1
475 if pos == max:
475 if pos == max:
476 break
476 break
477
477
478 if line[pos] in '#\r\n': # skip comments or blank lines
478 if line[pos] in '#\r\n': # skip comments or blank lines
479 if line[pos] == '#':
479 if line[pos] == '#':
480 comment_token = line[pos:].rstrip('\r\n')
480 comment_token = line[pos:].rstrip('\r\n')
481 nl_pos = pos + len(comment_token)
481 nl_pos = pos + len(comment_token)
482 yield TokenInfo(COMMENT, comment_token,
482 yield TokenInfo(COMMENT, comment_token,
483 (lnum, pos), (lnum, pos + len(comment_token)), line)
483 (lnum, pos), (lnum, pos + len(comment_token)), line)
484 yield TokenInfo(NEWLINE, line[nl_pos:],
484 yield TokenInfo(NEWLINE, line[nl_pos:],
485 (lnum, nl_pos), (lnum, len(line)), line)
485 (lnum, nl_pos), (lnum, len(line)), line)
486 else:
486 else:
487 yield TokenInfo(NEWLINE, line[pos:],
487 yield TokenInfo(NEWLINE, line[pos:],
488 (lnum, pos), (lnum, len(line)), line)
488 (lnum, pos), (lnum, len(line)), line)
489 continue
489 continue
490
490
491 if column > indents[-1]: # count indents or dedents
491 if column > indents[-1]: # count indents or dedents
492 indents.append(column)
492 indents.append(column)
493 yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
493 yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
494 while column < indents[-1]:
494 while column < indents[-1]:
495 if column not in indents:
495 if column not in indents:
496 raise IndentationError(
496 raise IndentationError(
497 "unindent does not match any outer indentation level",
497 "unindent does not match any outer indentation level",
498 ("<tokenize>", lnum, pos, line))
498 ("<tokenize>", lnum, pos, line))
499 indents = indents[:-1]
499 indents = indents[:-1]
500 yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)
500 yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)
501
501
502 else: # continued statement
502 else: # continued statement
503 if not line:
503 if not line:
504 raise TokenError("EOF in multi-line statement", (lnum, 0))
504 raise TokenError("EOF in multi-line statement", (lnum, 0))
505 continued = 0
505 continued = 0
506
506
507 while pos < max:
507 while pos < max:
508 pseudomatch = pseudoprog.match(line, pos)
508 pseudomatch = pseudoprog.match(line, pos)
509 if pseudomatch: # scan for tokens
509 if pseudomatch: # scan for tokens
510 start, end = pseudomatch.span(1)
510 start, end = pseudomatch.span(1)
511 spos, epos, pos = (lnum, start), (lnum, end), end
511 spos, epos, pos = (lnum, start), (lnum, end), end
512 token, initial = line[start:end], line[start]
512 token, initial = line[start:end], line[start]
513
513
514 if (initial in numchars or # ordinary number
514 if (initial in numchars or # ordinary number
515 (initial == '.' and token != '.' and token != '...')):
515 (initial == '.' and token != '.' and token != '...')):
516 yield TokenInfo(NUMBER, token, spos, epos, line)
516 yield TokenInfo(NUMBER, token, spos, epos, line)
517 elif initial in '\r\n':
517 elif initial in '\r\n':
518 yield TokenInfo(NL if parenlev > 0 else NEWLINE,
518 yield TokenInfo(NL if parenlev > 0 else NEWLINE,
519 token, spos, epos, line)
519 token, spos, epos, line)
520 elif initial == '#':
520 elif initial == '#':
521 assert not token.endswith("\n")
521 assert not token.endswith("\n")
522 yield TokenInfo(COMMENT, token, spos, epos, line)
522 yield TokenInfo(COMMENT, token, spos, epos, line)
523 elif token in triple_quoted:
523 elif token in triple_quoted:
524 endprog = endprogs[token]
524 endprog = endprogs[token]
525 endmatch = endprog.match(line, pos)
525 endmatch = endprog.match(line, pos)
526 if endmatch: # all on one line
526 if endmatch: # all on one line
527 pos = endmatch.end(0)
527 pos = endmatch.end(0)
528 token = line[start:pos]
528 token = line[start:pos]
529 yield TokenInfo(STRING, token, spos, (lnum, pos), line)
529 yield TokenInfo(STRING, token, spos, (lnum, pos), line)
530 else:
530 else:
531 strstart = (lnum, start) # multiple lines
531 strstart = (lnum, start) # multiple lines
532 contstr = line[start:]
532 contstr = line[start:]
533 contline = line
533 contline = line
534 break
534 break
535 elif initial in single_quoted or \
535 elif initial in single_quoted or \
536 token[:2] in single_quoted or \
536 token[:2] in single_quoted or \
537 token[:3] in single_quoted:
537 token[:3] in single_quoted:
538 if token[-1] == '\n': # continued string
538 if token[-1] == '\n': # continued string
539 strstart = (lnum, start)
539 strstart = (lnum, start)
540 endprog = (endprogs[initial] or endprogs[token[1]] or
540 endprog = (endprogs[initial] or endprogs[token[1]] or
541 endprogs[token[2]])
541 endprogs[token[2]])
542 contstr, needcont = line[start:], 1
542 contstr, needcont = line[start:], 1
543 contline = line
543 contline = line
544 break
544 break
545 else: # ordinary string
545 else: # ordinary string
546 yield TokenInfo(STRING, token, spos, epos, line)
546 yield TokenInfo(STRING, token, spos, epos, line)
547 elif initial.isidentifier(): # ordinary name
547 elif initial.isidentifier(): # ordinary name
548 yield TokenInfo(NAME, token, spos, epos, line)
548 yield TokenInfo(NAME, token, spos, epos, line)
549 elif initial == '\\': # continued stmt
549 elif initial == '\\': # continued stmt
550 continued = 1
550 continued = 1
551 else:
551 else:
552 if initial in '([{':
552 if initial in '([{':
553 parenlev += 1
553 parenlev += 1
554 elif initial in ')]}':
554 elif initial in ')]}':
555 parenlev -= 1
555 parenlev -= 1
556 yield TokenInfo(OP, token, spos, epos, line)
556 yield TokenInfo(OP, token, spos, epos, line)
557 else:
557 else:
558 yield TokenInfo(ERRORTOKEN, line[pos],
558 yield TokenInfo(ERRORTOKEN, line[pos],
559 (lnum, pos), (lnum, pos+1), line)
559 (lnum, pos), (lnum, pos+1), line)
560 pos += 1
560 pos += 1
561
561
562 for indent in indents[1:]: # pop remaining indent levels
562 for indent in indents[1:]: # pop remaining indent levels
563 yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
563 yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
564 yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
564 yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
565
565
566
566
567 # An undocumented, backwards compatible, API for all the places in the standard
567 # An undocumented, backwards compatible, API for all the places in the standard
568 # library that expect to be able to use tokenize with strings
568 # library that expect to be able to use tokenize with strings
569 def generate_tokens(readline):
569 def generate_tokens(readline):
570 return _tokenize(readline, None)
570 return _tokenize(readline, None)
571
571
572 if __name__ == "__main__":
572 if __name__ == "__main__":
573 # Quick sanity check
573 # Quick sanity check
574 s = b'''def parseline(self, line):
574 s = b'''def parseline(self, line):
575 """Parse the line into a command name and a string containing
575 """Parse the line into a command name and a string containing
576 the arguments. Returns a tuple containing (command, args, line).
576 the arguments. Returns a tuple containing (command, args, line).
577 'command' and 'args' may be None if the line couldn't be parsed.
577 'command' and 'args' may be None if the line couldn't be parsed.
578 """
578 """
579 line = line.strip()
579 line = line.strip()
580 if not line:
580 if not line:
581 return None, None, line
581 return None, None, line
582 elif line[0] == '?':
582 elif line[0] == '?':
583 line = 'help ' + line[1:]
583 line = 'help ' + line[1:]
584 elif line[0] == '!':
584 elif line[0] == '!':
585 if hasattr(self, 'do_shell'):
585 if hasattr(self, 'do_shell'):
586 line = 'shell ' + line[1:]
586 line = 'shell ' + line[1:]
587 else:
587 else:
588 return None, None, line
588 return None, None, line
589 i, n = 0, len(line)
589 i, n = 0, len(line)
590 while i < n and line[i] in self.identchars: i = i+1
590 while i < n and line[i] in self.identchars: i = i+1
591 cmd, arg = line[:i], line[i:].strip()
591 cmd, arg = line[:i], line[i:].strip()
592 return cmd, arg, line
592 return cmd, arg, line
593 '''
593 '''
594 for tok in tokenize(iter(s.splitlines()).__next__):
594 for tok in tokenize(iter(s.splitlines()).__next__):
595 print(tok)
595 print(tok)
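For reference, a short sketch of the two helpers documented above, detect_encoding() and untokenize(), again assuming the patched module is importable under the placeholder name `patched_tokenize`:

    import io
    import patched_tokenize as tokenize  # placeholder name; the real module path is not shown in this view

    source = b"# -*- coding: latin-1 -*-\nval = rb'abc'\n"

    # detect_encoding() reads at most two lines and returns the declared encoding
    # plus the raw byte lines it consumed; the cookie is normalized by
    # _get_normal_name(), so 'latin-1' comes back as 'iso-8859-1'.
    encoding, consumed = tokenize.detect_encoding(io.BytesIO(source).readline)
    print(encoding, consumed)

    # untokenize() accepts the 5-tuples produced by tokenize() and, unlike the
    # stdlib version, returns a str rather than encoded bytes (see the patch
    # notes at the top of the file).
    tokens = list(tokenize.tokenize(io.BytesIO(source).readline))
    print(tokenize.untokenize(tokens))

On Python 3.3 the rb'abc' literal above comes back as a single STRING token, which is exactly what the string-prefix table fixed in this changeset enables.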