Add linebreak to docstring to fix markup and silence Sphinx warning....
Doug Latornell
@@ -1,589 +1,590
1 """Patched version of standard library tokenize, to deal with various bugs.
1 """Patched version of standard library tokenize, to deal with various bugs.
2
2
3 Based on Python 3.2 code.
3 Based on Python 3.2 code.
4
4
5 Patches:
5 Patches:
6
6
7 - Gareth Rees' patch for Python issue #12691 (untokenizing)
7 - Gareth Rees' patch for Python issue #12691 (untokenizing)
8 - Except we don't encode the output of untokenize
8 - Except we don't encode the output of untokenize
9 - Python 2 compatible syntax, so that it can be byte-compiled at installation
9 - Python 2 compatible syntax, so that it can be byte-compiled at installation
10 - Newlines in comments and blank lines should be either NL or NEWLINE, depending
10 - Newlines in comments and blank lines should be either NL or NEWLINE, depending
11 on whether they are in a multi-line statement. Filed as Python issue #17061.
11 on whether they are in a multi-line statement. Filed as Python issue #17061.
12 - Export generate_tokens & TokenError
12 - Export generate_tokens & TokenError
13 - u and rb literals are allowed under Python 3.3 and above.
13 - u and rb literals are allowed under Python 3.3 and above.
14
14
15 ------------------------------------------------------------------------------
15 ------------------------------------------------------------------------------
16
16 Tokenization help for Python programs.
17 Tokenization help for Python programs.
17
18
18 tokenize(readline) is a generator that breaks a stream of bytes into
19 tokenize(readline) is a generator that breaks a stream of bytes into
19 Python tokens. It decodes the bytes according to PEP-0263 for
20 Python tokens. It decodes the bytes according to PEP-0263 for
20 determining source file encoding.
21 determining source file encoding.
21
22
22 It accepts a readline-like method which is called repeatedly to get the
23 It accepts a readline-like method which is called repeatedly to get the
23 next line of input (or b"" for EOF). It generates 5-tuples with these
24 next line of input (or b"" for EOF). It generates 5-tuples with these
24 members:
25 members:
25
26
26 the token type (see token.py)
27 the token type (see token.py)
27 the token (a string)
28 the token (a string)
28 the starting (row, column) indices of the token (a 2-tuple of ints)
29 the starting (row, column) indices of the token (a 2-tuple of ints)
29 the ending (row, column) indices of the token (a 2-tuple of ints)
30 the ending (row, column) indices of the token (a 2-tuple of ints)
30 the original line (string)
31 the original line (string)
31
32
32 It is designed to match the working of the Python tokenizer exactly, except
33 It is designed to match the working of the Python tokenizer exactly, except
33 that it produces COMMENT tokens for comments and gives type OP for all
34 that it produces COMMENT tokens for comments and gives type OP for all
34 operators. Additionally, all token lists start with an ENCODING token
35 operators. Additionally, all token lists start with an ENCODING token
35 which tells you which encoding was used to decode the bytes stream.
36 which tells you which encoding was used to decode the bytes stream.
36 """
37 """
37
38
38 __author__ = 'Ka-Ping Yee <ping@lfw.org>'
39 __author__ = 'Ka-Ping Yee <ping@lfw.org>'
39 __credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
40 __credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
40 'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
41 'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
41 'Michael Foord')
42 'Michael Foord')
42 import builtins
43 import builtins
43 import re
44 import re
44 import sys
45 import sys
45 from token import *
46 from token import *
46 from codecs import lookup, BOM_UTF8
47 from codecs import lookup, BOM_UTF8
47 import collections
48 import collections
48 from io import TextIOWrapper
49 from io import TextIOWrapper
49 cookie_re = re.compile("coding[:=]\s*([-\w.]+)")
50 cookie_re = re.compile("coding[:=]\s*([-\w.]+)")
50
51
51 import token
52 import token
52 __all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
53 __all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
53 "NL", "untokenize", "ENCODING", "TokenInfo"]
54 "NL", "untokenize", "ENCODING", "TokenInfo"]
54 del token
55 del token
55
56
56 __all__ += ["generate_tokens", "TokenError"]
57 __all__ += ["generate_tokens", "TokenError"]
57
58
58 COMMENT = N_TOKENS
59 COMMENT = N_TOKENS
59 tok_name[COMMENT] = 'COMMENT'
60 tok_name[COMMENT] = 'COMMENT'
60 NL = N_TOKENS + 1
61 NL = N_TOKENS + 1
61 tok_name[NL] = 'NL'
62 tok_name[NL] = 'NL'
62 ENCODING = N_TOKENS + 2
63 ENCODING = N_TOKENS + 2
63 tok_name[ENCODING] = 'ENCODING'
64 tok_name[ENCODING] = 'ENCODING'
64 N_TOKENS += 3
65 N_TOKENS += 3
65
66
66 class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
67 class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
67 def __repr__(self):
68 def __repr__(self):
68 annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
69 annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
69 return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
70 return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
70 self._replace(type=annotated_type))
71 self._replace(type=annotated_type))
71
72
72 def group(*choices): return '(' + '|'.join(choices) + ')'
73 def group(*choices): return '(' + '|'.join(choices) + ')'
73 def any(*choices): return group(*choices) + '*'
74 def any(*choices): return group(*choices) + '*'
74 def maybe(*choices): return group(*choices) + '?'
75 def maybe(*choices): return group(*choices) + '?'
75
76
76 # Note: we use unicode matching for names ("\w") but ascii matching for
77 # Note: we use unicode matching for names ("\w") but ascii matching for
77 # number literals.
78 # number literals.
78 Whitespace = r'[ \f\t]*'
79 Whitespace = r'[ \f\t]*'
79 Comment = r'#[^\r\n]*'
80 Comment = r'#[^\r\n]*'
80 Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
81 Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
81 Name = r'\w+'
82 Name = r'\w+'
82
83
83 Hexnumber = r'0[xX][0-9a-fA-F]+'
84 Hexnumber = r'0[xX][0-9a-fA-F]+'
84 Binnumber = r'0[bB][01]+'
85 Binnumber = r'0[bB][01]+'
85 Octnumber = r'0[oO][0-7]+'
86 Octnumber = r'0[oO][0-7]+'
86 Decnumber = r'(?:0+|[1-9][0-9]*)'
87 Decnumber = r'(?:0+|[1-9][0-9]*)'
87 Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
88 Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
88 Exponent = r'[eE][-+]?[0-9]+'
89 Exponent = r'[eE][-+]?[0-9]+'
89 Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
90 Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
90 Expfloat = r'[0-9]+' + Exponent
91 Expfloat = r'[0-9]+' + Exponent
91 Floatnumber = group(Pointfloat, Expfloat)
92 Floatnumber = group(Pointfloat, Expfloat)
92 Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
93 Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
93 Number = group(Imagnumber, Floatnumber, Intnumber)
94 Number = group(Imagnumber, Floatnumber, Intnumber)
94 StringPrefix = r'(?:[bB][rR]?|[rR][bB]?|[uU])?'
95 StringPrefix = r'(?:[bB][rR]?|[rR][bB]?|[uU])?'
95
96
96 # Tail end of ' string.
97 # Tail end of ' string.
97 Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
98 Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
98 # Tail end of " string.
99 # Tail end of " string.
99 Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
100 Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
100 # Tail end of ''' string.
101 # Tail end of ''' string.
101 Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
102 Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
102 # Tail end of """ string.
103 # Tail end of """ string.
103 Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
104 Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
104 Triple = group(StringPrefix + "'''", StringPrefix + '"""')
105 Triple = group(StringPrefix + "'''", StringPrefix + '"""')
105 # Single-line ' or " string.
106 # Single-line ' or " string.
106 String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
107 String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
107 StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
108 StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
108
109
109 # Because of leftmost-then-longest match semantics, be sure to put the
110 # Because of leftmost-then-longest match semantics, be sure to put the
110 # longest operators first (e.g., if = came before ==, == would get
111 # longest operators first (e.g., if = came before ==, == would get
111 # recognized as two instances of =).
112 # recognized as two instances of =).
112 Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
113 Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
113 r"//=?", r"->",
114 r"//=?", r"->",
114 r"[+\-*/%&|^=<>]=?",
115 r"[+\-*/%&|^=<>]=?",
115 r"~")
116 r"~")
116
117
117 Bracket = '[][(){}]'
118 Bracket = '[][(){}]'
118 Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
119 Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
119 Funny = group(Operator, Bracket, Special)
120 Funny = group(Operator, Bracket, Special)
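# Illustrative sketch, not part of the original diff: Python regex alternation
# is leftmost-first, so putting '=' ahead of '==' would split '==' into two
# tokens -- which is why the longer operators are listed first above.
import re
print(re.match(group('=', '=='), '==').group())   # '='  (wrong: only one char)
print(re.match(group('==', '='), '==').group())   # '=='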
120
121
121 PlainToken = group(Number, Funny, String, Name)
122 PlainToken = group(Number, Funny, String, Name)
122 Token = Ignore + PlainToken
123 Token = Ignore + PlainToken
123
124
124 # First (or only) line of ' or " string.
125 # First (or only) line of ' or " string.
125 ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
126 ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
126 group("'", r'\\\r?\n'),
127 group("'", r'\\\r?\n'),
127 StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
128 StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
128 group('"', r'\\\r?\n'))
129 group('"', r'\\\r?\n'))
129 PseudoExtras = group(r'\\\r?\n', Comment, Triple)
130 PseudoExtras = group(r'\\\r?\n', Comment, Triple)
130 PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
131 PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
131
132
132 def _compile(expr):
133 def _compile(expr):
133 return re.compile(expr, re.UNICODE)
134 return re.compile(expr, re.UNICODE)
134
135
135 tokenprog, pseudoprog, single3prog, double3prog = map(
136 tokenprog, pseudoprog, single3prog, double3prog = map(
136 _compile, (Token, PseudoToken, Single3, Double3))
137 _compile, (Token, PseudoToken, Single3, Double3))
137 endprogs = {"'": _compile(Single), '"': _compile(Double),
138 endprogs = {"'": _compile(Single), '"': _compile(Double),
138 "'''": single3prog, '"""': double3prog,
139 "'''": single3prog, '"""': double3prog,
139 "r'''": single3prog, 'r"""': double3prog,
140 "r'''": single3prog, 'r"""': double3prog,
140 "b'''": single3prog, 'b"""': double3prog,
141 "b'''": single3prog, 'b"""': double3prog,
141 "R'''": single3prog, 'R"""': double3prog,
142 "R'''": single3prog, 'R"""': double3prog,
142 "B'''": single3prog, 'B"""': double3prog,
143 "B'''": single3prog, 'B"""': double3prog,
143 "br'''": single3prog, 'br"""': double3prog,
144 "br'''": single3prog, 'br"""': double3prog,
144 "bR'''": single3prog, 'bR"""': double3prog,
145 "bR'''": single3prog, 'bR"""': double3prog,
145 "Br'''": single3prog, 'Br"""': double3prog,
146 "Br'''": single3prog, 'Br"""': double3prog,
146 "BR'''": single3prog, 'BR"""': double3prog,
147 "BR'''": single3prog, 'BR"""': double3prog,
147 'r': None, 'R': None, 'b': None, 'B': None}
148 'r': None, 'R': None, 'b': None, 'B': None}
148
149
149 triple_quoted = {}
150 triple_quoted = {}
150 for t in ("'''", '"""',
151 for t in ("'''", '"""',
151 "r'''", 'r"""', "R'''", 'R"""',
152 "r'''", 'r"""', "R'''", 'R"""',
152 "b'''", 'b"""', "B'''", 'B"""',
153 "b'''", 'b"""', "B'''", 'B"""',
153 "br'''", 'br"""', "Br'''", 'Br"""',
154 "br'''", 'br"""', "Br'''", 'Br"""',
154 "bR'''", 'bR"""', "BR'''", 'BR"""'):
155 "bR'''", 'bR"""', "BR'''", 'BR"""'):
155 triple_quoted[t] = t
156 triple_quoted[t] = t
156 single_quoted = {}
157 single_quoted = {}
157 for t in ("'", '"',
158 for t in ("'", '"',
158 "r'", 'r"', "R'", 'R"',
159 "r'", 'r"', "R'", 'R"',
159 "b'", 'b"', "B'", 'B"',
160 "b'", 'b"', "B'", 'B"',
160 "br'", 'br"', "Br'", 'Br"',
161 "br'", 'br"', "Br'", 'Br"',
161 "bR'", 'bR"', "BR'", 'BR"' ):
162 "bR'", 'bR"', "BR'", 'BR"' ):
162 single_quoted[t] = t
163 single_quoted[t] = t
163
164
164 for _prefix in ['rb', 'rB', 'Rb', 'RB', 'u', 'U']:
165 for _prefix in ['rb', 'rB', 'Rb', 'RB', 'u', 'U']:
165 _t2 = _prefix+'"""'
166 _t2 = _prefix+'"""'
166 endprogs[_t2] = double3prog
167 endprogs[_t2] = double3prog
167 triple_quoted[_t2] = _t2
168 triple_quoted[_t2] = _t2
168 _t1 = _prefix + "'''"
169 _t1 = _prefix + "'''"
169 endprogs[_t1] = single3prog
170 endprogs[_t1] = single3prog
170 triple_quoted[_t1] = _t1
171 triple_quoted[_t1] = _t1
171 single_quoted[_prefix+'"'] = _prefix+'"'
172 single_quoted[_prefix+'"'] = _prefix+'"'
172 single_quoted[_prefix+"'"] = _prefix+"'"
173 single_quoted[_prefix+"'"] = _prefix+"'"
173 del _prefix, _t2, _t1
174 del _prefix, _t2, _t1
174 endprogs['u'] = None
175 endprogs['u'] = None
175 endprogs['U'] = None
176 endprogs['U'] = None
176
177
177 del _compile
178 del _compile
178
179
179 tabsize = 8
180 tabsize = 8
180
181
181 class TokenError(Exception): pass
182 class TokenError(Exception): pass
182
183
183 class StopTokenizing(Exception): pass
184 class StopTokenizing(Exception): pass
184
185
185
186
186 class Untokenizer:
187 class Untokenizer:
187
188
188 def __init__(self):
189 def __init__(self):
189 self.tokens = []
190 self.tokens = []
190 self.prev_row = 1
191 self.prev_row = 1
191 self.prev_col = 0
192 self.prev_col = 0
192 self.encoding = 'utf-8'
193 self.encoding = 'utf-8'
193
194
194 def add_whitespace(self, tok_type, start):
195 def add_whitespace(self, tok_type, start):
195 row, col = start
196 row, col = start
196 assert row >= self.prev_row
197 assert row >= self.prev_row
197 col_offset = col - self.prev_col
198 col_offset = col - self.prev_col
198 if col_offset > 0:
199 if col_offset > 0:
199 self.tokens.append(" " * col_offset)
200 self.tokens.append(" " * col_offset)
200 elif row > self.prev_row and tok_type not in (NEWLINE, NL, ENDMARKER):
201 elif row > self.prev_row and tok_type not in (NEWLINE, NL, ENDMARKER):
201 # Line was backslash-continued.
202 # Line was backslash-continued.
202 self.tokens.append(" ")
203 self.tokens.append(" ")
203
204
204 def untokenize(self, tokens):
205 def untokenize(self, tokens):
205 iterable = iter(tokens)
206 iterable = iter(tokens)
206 for t in iterable:
207 for t in iterable:
207 if len(t) == 2:
208 if len(t) == 2:
208 self.compat(t, iterable)
209 self.compat(t, iterable)
209 break
210 break
210 tok_type, token, start, end = t[:4]
211 tok_type, token, start, end = t[:4]
211 if tok_type == ENCODING:
212 if tok_type == ENCODING:
212 self.encoding = token
213 self.encoding = token
213 continue
214 continue
214 self.add_whitespace(tok_type, start)
215 self.add_whitespace(tok_type, start)
215 self.tokens.append(token)
216 self.tokens.append(token)
216 self.prev_row, self.prev_col = end
217 self.prev_row, self.prev_col = end
217 if tok_type in (NEWLINE, NL):
218 if tok_type in (NEWLINE, NL):
218 self.prev_row += 1
219 self.prev_row += 1
219 self.prev_col = 0
220 self.prev_col = 0
220 return "".join(self.tokens)
221 return "".join(self.tokens)
221
222
222 def compat(self, token, iterable):
223 def compat(self, token, iterable):
223 # This import is here to avoid problems when the itertools
224 # This import is here to avoid problems when the itertools
224 # module is not built yet and tokenize is imported.
225 # module is not built yet and tokenize is imported.
225 from itertools import chain
226 from itertools import chain
226 startline = False
227 startline = False
227 prevstring = False
228 prevstring = False
228 indents = []
229 indents = []
229 toks_append = self.tokens.append
230 toks_append = self.tokens.append
230
231
231 for tok in chain([token], iterable):
232 for tok in chain([token], iterable):
232 toknum, tokval = tok[:2]
233 toknum, tokval = tok[:2]
233 if toknum == ENCODING:
234 if toknum == ENCODING:
234 self.encoding = tokval
235 self.encoding = tokval
235 continue
236 continue
236
237
237 if toknum in (NAME, NUMBER):
238 if toknum in (NAME, NUMBER):
238 tokval += ' '
239 tokval += ' '
239
240
240 # Insert a space between two consecutive strings
241 # Insert a space between two consecutive strings
241 if toknum == STRING:
242 if toknum == STRING:
242 if prevstring:
243 if prevstring:
243 tokval = ' ' + tokval
244 tokval = ' ' + tokval
244 prevstring = True
245 prevstring = True
245 else:
246 else:
246 prevstring = False
247 prevstring = False
247
248
248 if toknum == INDENT:
249 if toknum == INDENT:
249 indents.append(tokval)
250 indents.append(tokval)
250 continue
251 continue
251 elif toknum == DEDENT:
252 elif toknum == DEDENT:
252 indents.pop()
253 indents.pop()
253 continue
254 continue
254 elif toknum in (NEWLINE, NL):
255 elif toknum in (NEWLINE, NL):
255 startline = True
256 startline = True
256 elif startline and indents:
257 elif startline and indents:
257 toks_append(indents[-1])
258 toks_append(indents[-1])
258 startline = False
259 startline = False
259 toks_append(tokval)
260 toks_append(tokval)
260
261
261
262
262 def untokenize(tokens):
263 def untokenize(tokens):
263 """
264 """
264 Convert ``tokens`` (an iterable) back into Python source code. Return
265 Convert ``tokens`` (an iterable) back into Python source code. Return
265 a bytes object, encoded using the encoding specified by the last
266 a bytes object, encoded using the encoding specified by the last
266 ENCODING token in ``tokens``, or UTF-8 if no ENCODING token is found.
267 ENCODING token in ``tokens``, or UTF-8 if no ENCODING token is found.
267
268
268 The result is guaranteed to tokenize back to match the input so that
269 The result is guaranteed to tokenize back to match the input so that
269 the conversion is lossless and round-trips are assured. The
270 the conversion is lossless and round-trips are assured. The
270 guarantee applies only to the token type and token string as the
271 guarantee applies only to the token type and token string as the
271 spacing between tokens (column positions) may change.
272 spacing between tokens (column positions) may change.
272
273
273 :func:`untokenize` has two modes. If the input tokens are sequences
274 :func:`untokenize` has two modes. If the input tokens are sequences
274 of length 2 (``type``, ``string``) then spaces are added as necessary to
275 of length 2 (``type``, ``string``) then spaces are added as necessary to
275 preserve the round-trip property.
276 preserve the round-trip property.
276
277
277 If the input tokens are sequences of length 4 or more (``type``,
278 If the input tokens are sequences of length 4 or more (``type``,
278 ``string``, ``start``, ``end``), as returned by :func:`tokenize`, then
279 ``string``, ``start``, ``end``), as returned by :func:`tokenize`, then
279 spaces are added so that each token appears in the result at the
280 spaces are added so that each token appears in the result at the
280 position indicated by ``start`` and ``end``, if possible.
281 position indicated by ``start`` and ``end``, if possible.
281 """
282 """
282 return Untokenizer().untokenize(tokens)
283 return Untokenizer().untokenize(tokens)
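# Illustrative sketch, not part of the original diff: the length-2 ("compat")
# mode described above pads NAME/NUMBER tokens with a space so the result
# re-tokenizes to the same stream.  Note that this patched version returns a
# str rather than encoded bytes (see the patch notes at the top of the file).
pairs = [(NAME, 'def'), (NAME, 'f'), (OP, '('), (OP, ')'), (OP, ':'),
         (NAME, 'pass'), (NEWLINE, '\n')]
print(repr(untokenize(pairs)))   # e.g. 'def f ():pass \n' -- spacing may differ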
283
284
284
285
285 def _get_normal_name(orig_enc):
286 def _get_normal_name(orig_enc):
286 """Imitates get_normal_name in tokenizer.c."""
287 """Imitates get_normal_name in tokenizer.c."""
287 # Only care about the first 12 characters.
288 # Only care about the first 12 characters.
288 enc = orig_enc[:12].lower().replace("_", "-")
289 enc = orig_enc[:12].lower().replace("_", "-")
289 if enc == "utf-8" or enc.startswith("utf-8-"):
290 if enc == "utf-8" or enc.startswith("utf-8-"):
290 return "utf-8"
291 return "utf-8"
291 if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
292 if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
292 enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
293 enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
293 return "iso-8859-1"
294 return "iso-8859-1"
294 return orig_enc
295 return orig_enc
295
296
296 def detect_encoding(readline):
297 def detect_encoding(readline):
297 """
298 """
298 The detect_encoding() function is used to detect the encoding that should
299 The detect_encoding() function is used to detect the encoding that should
299 be used to decode a Python source file. It requires one argument, readline,
300 be used to decode a Python source file. It requires one argument, readline,
300 in the same way as the tokenize() generator.
301 in the same way as the tokenize() generator.
301
302
302 It will call readline a maximum of twice, and return the encoding used
303 It will call readline a maximum of twice, and return the encoding used
303 (as a string) and a list of any lines (left as bytes) it has read in.
304 (as a string) and a list of any lines (left as bytes) it has read in.
304
305
305 It detects the encoding from the presence of a utf-8 bom or an encoding
306 It detects the encoding from the presence of a utf-8 bom or an encoding
306 cookie as specified in pep-0263. If both a bom and a cookie are present,
307 cookie as specified in pep-0263. If both a bom and a cookie are present,
307 but disagree, a SyntaxError will be raised. If the encoding cookie is an
308 but disagree, a SyntaxError will be raised. If the encoding cookie is an
308 invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,
309 invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,
309 'utf-8-sig' is returned.
310 'utf-8-sig' is returned.
310
311
311 If no encoding is specified, then the default of 'utf-8' will be returned.
312 If no encoding is specified, then the default of 'utf-8' will be returned.
312 """
313 """
313 bom_found = False
314 bom_found = False
314 encoding = None
315 encoding = None
315 default = 'utf-8'
316 default = 'utf-8'
316 def read_or_stop():
317 def read_or_stop():
317 try:
318 try:
318 return readline()
319 return readline()
319 except StopIteration:
320 except StopIteration:
320 return b''
321 return b''
321
322
322 def find_cookie(line):
323 def find_cookie(line):
323 try:
324 try:
324 # Decode as UTF-8. Either the line is an encoding declaration,
325 # Decode as UTF-8. Either the line is an encoding declaration,
325 # in which case it should be pure ASCII, or it must be UTF-8
326 # in which case it should be pure ASCII, or it must be UTF-8
326 # per default encoding.
327 # per default encoding.
327 line_string = line.decode('utf-8')
328 line_string = line.decode('utf-8')
328 except UnicodeDecodeError:
329 except UnicodeDecodeError:
329 raise SyntaxError("invalid or missing encoding declaration")
330 raise SyntaxError("invalid or missing encoding declaration")
330
331
331 matches = cookie_re.findall(line_string)
332 matches = cookie_re.findall(line_string)
332 if not matches:
333 if not matches:
333 return None
334 return None
334 encoding = _get_normal_name(matches[0])
335 encoding = _get_normal_name(matches[0])
335 try:
336 try:
336 codec = lookup(encoding)
337 codec = lookup(encoding)
337 except LookupError:
338 except LookupError:
338 # This behaviour mimics the Python interpreter
339 # This behaviour mimics the Python interpreter
339 raise SyntaxError("unknown encoding: " + encoding)
340 raise SyntaxError("unknown encoding: " + encoding)
340
341
341 if bom_found:
342 if bom_found:
342 if encoding != 'utf-8':
343 if encoding != 'utf-8':
343 # This behaviour mimics the Python interpreter
344 # This behaviour mimics the Python interpreter
344 raise SyntaxError('encoding problem: utf-8')
345 raise SyntaxError('encoding problem: utf-8')
345 encoding += '-sig'
346 encoding += '-sig'
346 return encoding
347 return encoding
347
348
348 first = read_or_stop()
349 first = read_or_stop()
349 if first.startswith(BOM_UTF8):
350 if first.startswith(BOM_UTF8):
350 bom_found = True
351 bom_found = True
351 first = first[3:]
352 first = first[3:]
352 default = 'utf-8-sig'
353 default = 'utf-8-sig'
353 if not first:
354 if not first:
354 return default, []
355 return default, []
355
356
356 encoding = find_cookie(first)
357 encoding = find_cookie(first)
357 if encoding:
358 if encoding:
358 return encoding, [first]
359 return encoding, [first]
359
360
360 second = read_or_stop()
361 second = read_or_stop()
361 if not second:
362 if not second:
362 return default, [first]
363 return default, [first]
363
364
364 encoding = find_cookie(second)
365 encoding = find_cookie(second)
365 if encoding:
366 if encoding:
366 return encoding, [first, second]
367 return encoding, [first, second]
367
368
368 return default, [first, second]
369 return default, [first, second]
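# Illustrative sketch, not part of the original diff: a PEP 263 coding cookie
# on the first line is detected and normalised, and the lines consumed while
# looking for it are handed back so the caller can replay them.
from io import BytesIO
enc, consumed = detect_encoding(BytesIO(b"# -*- coding: latin-1 -*-\nx = 1\n").readline)
print(enc)        # 'iso-8859-1' ('latin-1' is normalised by _get_normal_name)
print(consumed)   # [b'# -*- coding: latin-1 -*-\n']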
369
370
370
371
371 def open(filename):
372 def open(filename):
372 """Open a file in read only mode using the encoding detected by
373 """Open a file in read only mode using the encoding detected by
373 detect_encoding().
374 detect_encoding().
374 """
375 """
375 buffer = builtins.open(filename, 'rb')
376 buffer = builtins.open(filename, 'rb')
376 encoding, lines = detect_encoding(buffer.readline)
377 encoding, lines = detect_encoding(buffer.readline)
377 buffer.seek(0)
378 buffer.seek(0)
378 text = TextIOWrapper(buffer, encoding, line_buffering=True)
379 text = TextIOWrapper(buffer, encoding, line_buffering=True)
379 text.mode = 'r'
380 text.mode = 'r'
380 return text
381 return text
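# Illustrative sketch, not part of the original diff ('some_module.py' is a
# placeholder path): open() reads the file in binary mode, detects its
# encoding, then rewinds and wraps it in a TextIOWrapper with that encoding.
f = open('some_module.py')
print(f.encoding)   # e.g. 'utf-8' unless a BOM or coding cookie says otherwise
f.close()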
381
382
382
383
383 def tokenize(readline):
384 def tokenize(readline):
384 """
385 """
385 The tokenize() generator requires one argument, readline, which
386 The tokenize() generator requires one argument, readline, which
386 must be a callable object which provides the same interface as the
387 must be a callable object which provides the same interface as the
387 readline() method of built-in file objects. Each call to the function
388 readline() method of built-in file objects. Each call to the function
388 should return one line of input as bytes. Alternately, readline
389 should return one line of input as bytes. Alternately, readline
389 can be a callable function terminating with :class:`StopIteration`::
390 can be a callable function terminating with :class:`StopIteration`::
390
391
391 readline = open(myfile, 'rb').__next__ # Example of alternate readline
392 readline = open(myfile, 'rb').__next__ # Example of alternate readline
392
393
393 The generator produces 5-tuples with these members: the token type; the
394 The generator produces 5-tuples with these members: the token type; the
394 token string; a 2-tuple (srow, scol) of ints specifying the row and
395 token string; a 2-tuple (srow, scol) of ints specifying the row and
395 column where the token begins in the source; a 2-tuple (erow, ecol) of
396 column where the token begins in the source; a 2-tuple (erow, ecol) of
396 ints specifying the row and column where the token ends in the source;
397 ints specifying the row and column where the token ends in the source;
397 and the line on which the token was found. The line passed is the
398 and the line on which the token was found. The line passed is the
398 logical line; continuation lines are included.
399 logical line; continuation lines are included.
399
400
400 The first token sequence will always be an ENCODING token
401 The first token sequence will always be an ENCODING token
401 which tells you which encoding was used to decode the bytes stream.
402 which tells you which encoding was used to decode the bytes stream.
402 """
403 """
403 # This import is here to avoid problems when the itertools module is not
404 # This import is here to avoid problems when the itertools module is not
404 # built yet and tokenize is imported.
405 # built yet and tokenize is imported.
405 from itertools import chain, repeat
406 from itertools import chain, repeat
406 encoding, consumed = detect_encoding(readline)
407 encoding, consumed = detect_encoding(readline)
407 rl_gen = iter(readline, b"")
408 rl_gen = iter(readline, b"")
408 empty = repeat(b"")
409 empty = repeat(b"")
409 return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding)
410 return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding)
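# Illustrative sketch, not part of the original diff: as the docstring notes,
# readline can be any callable that returns bytes lines and raises
# StopIteration when exhausted, such as an iterator's __next__.
lines = iter([b"total = price * qty\n"])
for tok in tokenize(lines.__next__):
    print(tok_name[tok.type], tok.string)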
410
411
411
412
412 def _tokenize(readline, encoding):
413 def _tokenize(readline, encoding):
413 lnum = parenlev = continued = 0
414 lnum = parenlev = continued = 0
414 numchars = '0123456789'
415 numchars = '0123456789'
415 contstr, needcont = '', 0
416 contstr, needcont = '', 0
416 contline = None
417 contline = None
417 indents = [0]
418 indents = [0]
418
419
419 if encoding is not None:
420 if encoding is not None:
420 if encoding == "utf-8-sig":
421 if encoding == "utf-8-sig":
421 # BOM will already have been stripped.
422 # BOM will already have been stripped.
422 encoding = "utf-8"
423 encoding = "utf-8"
423 yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
424 yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
424 while True: # loop over lines in stream
425 while True: # loop over lines in stream
425 try:
426 try:
426 line = readline()
427 line = readline()
427 except StopIteration:
428 except StopIteration:
428 line = b''
429 line = b''
429
430
430 if encoding is not None:
431 if encoding is not None:
431 line = line.decode(encoding)
432 line = line.decode(encoding)
432 lnum += 1
433 lnum += 1
433 pos, max = 0, len(line)
434 pos, max = 0, len(line)
434
435
435 if contstr: # continued string
436 if contstr: # continued string
436 if not line:
437 if not line:
437 raise TokenError("EOF in multi-line string", strstart)
438 raise TokenError("EOF in multi-line string", strstart)
438 endmatch = endprog.match(line)
439 endmatch = endprog.match(line)
439 if endmatch:
440 if endmatch:
440 pos = end = endmatch.end(0)
441 pos = end = endmatch.end(0)
441 yield TokenInfo(STRING, contstr + line[:end],
442 yield TokenInfo(STRING, contstr + line[:end],
442 strstart, (lnum, end), contline + line)
443 strstart, (lnum, end), contline + line)
443 contstr, needcont = '', 0
444 contstr, needcont = '', 0
444 contline = None
445 contline = None
445 elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
446 elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
446 yield TokenInfo(ERRORTOKEN, contstr + line,
447 yield TokenInfo(ERRORTOKEN, contstr + line,
447 strstart, (lnum, len(line)), contline)
448 strstart, (lnum, len(line)), contline)
448 contstr = ''
449 contstr = ''
449 contline = None
450 contline = None
450 continue
451 continue
451 else:
452 else:
452 contstr = contstr + line
453 contstr = contstr + line
453 contline = contline + line
454 contline = contline + line
454 continue
455 continue
455
456
456 elif parenlev == 0 and not continued: # new statement
457 elif parenlev == 0 and not continued: # new statement
457 if not line: break
458 if not line: break
458 column = 0
459 column = 0
459 while pos < max: # measure leading whitespace
460 while pos < max: # measure leading whitespace
460 if line[pos] == ' ':
461 if line[pos] == ' ':
461 column += 1
462 column += 1
462 elif line[pos] == '\t':
463 elif line[pos] == '\t':
463 column = (column//tabsize + 1)*tabsize
464 column = (column//tabsize + 1)*tabsize
464 elif line[pos] == '\f':
465 elif line[pos] == '\f':
465 column = 0
466 column = 0
466 else:
467 else:
467 break
468 break
468 pos += 1
469 pos += 1
469 if pos == max:
470 if pos == max:
470 break
471 break
471
472
472 if line[pos] in '#\r\n': # skip comments or blank lines
473 if line[pos] in '#\r\n': # skip comments or blank lines
473 if line[pos] == '#':
474 if line[pos] == '#':
474 comment_token = line[pos:].rstrip('\r\n')
475 comment_token = line[pos:].rstrip('\r\n')
475 nl_pos = pos + len(comment_token)
476 nl_pos = pos + len(comment_token)
476 yield TokenInfo(COMMENT, comment_token,
477 yield TokenInfo(COMMENT, comment_token,
477 (lnum, pos), (lnum, pos + len(comment_token)), line)
478 (lnum, pos), (lnum, pos + len(comment_token)), line)
478 yield TokenInfo(NEWLINE, line[nl_pos:],
479 yield TokenInfo(NEWLINE, line[nl_pos:],
479 (lnum, nl_pos), (lnum, len(line)), line)
480 (lnum, nl_pos), (lnum, len(line)), line)
480 else:
481 else:
481 yield TokenInfo(NEWLINE, line[pos:],
482 yield TokenInfo(NEWLINE, line[pos:],
482 (lnum, pos), (lnum, len(line)), line)
483 (lnum, pos), (lnum, len(line)), line)
483 continue
484 continue
484
485
485 if column > indents[-1]: # count indents or dedents
486 if column > indents[-1]: # count indents or dedents
486 indents.append(column)
487 indents.append(column)
487 yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
488 yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
488 while column < indents[-1]:
489 while column < indents[-1]:
489 if column not in indents:
490 if column not in indents:
490 raise IndentationError(
491 raise IndentationError(
491 "unindent does not match any outer indentation level",
492 "unindent does not match any outer indentation level",
492 ("<tokenize>", lnum, pos, line))
493 ("<tokenize>", lnum, pos, line))
493 indents = indents[:-1]
494 indents = indents[:-1]
494 yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)
495 yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)
495
496
496 else: # continued statement
497 else: # continued statement
497 if not line:
498 if not line:
498 raise TokenError("EOF in multi-line statement", (lnum, 0))
499 raise TokenError("EOF in multi-line statement", (lnum, 0))
499 continued = 0
500 continued = 0
500
501
501 while pos < max:
502 while pos < max:
502 pseudomatch = pseudoprog.match(line, pos)
503 pseudomatch = pseudoprog.match(line, pos)
503 if pseudomatch: # scan for tokens
504 if pseudomatch: # scan for tokens
504 start, end = pseudomatch.span(1)
505 start, end = pseudomatch.span(1)
505 spos, epos, pos = (lnum, start), (lnum, end), end
506 spos, epos, pos = (lnum, start), (lnum, end), end
506 token, initial = line[start:end], line[start]
507 token, initial = line[start:end], line[start]
507
508
508 if (initial in numchars or # ordinary number
509 if (initial in numchars or # ordinary number
509 (initial == '.' and token != '.' and token != '...')):
510 (initial == '.' and token != '.' and token != '...')):
510 yield TokenInfo(NUMBER, token, spos, epos, line)
511 yield TokenInfo(NUMBER, token, spos, epos, line)
511 elif initial in '\r\n':
512 elif initial in '\r\n':
512 yield TokenInfo(NL if parenlev > 0 else NEWLINE,
513 yield TokenInfo(NL if parenlev > 0 else NEWLINE,
513 token, spos, epos, line)
514 token, spos, epos, line)
514 elif initial == '#':
515 elif initial == '#':
515 assert not token.endswith("\n")
516 assert not token.endswith("\n")
516 yield TokenInfo(COMMENT, token, spos, epos, line)
517 yield TokenInfo(COMMENT, token, spos, epos, line)
517 elif token in triple_quoted:
518 elif token in triple_quoted:
518 endprog = endprogs[token]
519 endprog = endprogs[token]
519 endmatch = endprog.match(line, pos)
520 endmatch = endprog.match(line, pos)
520 if endmatch: # all on one line
521 if endmatch: # all on one line
521 pos = endmatch.end(0)
522 pos = endmatch.end(0)
522 token = line[start:pos]
523 token = line[start:pos]
523 yield TokenInfo(STRING, token, spos, (lnum, pos), line)
524 yield TokenInfo(STRING, token, spos, (lnum, pos), line)
524 else:
525 else:
525 strstart = (lnum, start) # multiple lines
526 strstart = (lnum, start) # multiple lines
526 contstr = line[start:]
527 contstr = line[start:]
527 contline = line
528 contline = line
528 break
529 break
529 elif initial in single_quoted or \
530 elif initial in single_quoted or \
530 token[:2] in single_quoted or \
531 token[:2] in single_quoted or \
531 token[:3] in single_quoted:
532 token[:3] in single_quoted:
532 if token[-1] == '\n': # continued string
533 if token[-1] == '\n': # continued string
533 strstart = (lnum, start)
534 strstart = (lnum, start)
534 endprog = (endprogs[initial] or endprogs[token[1]] or
535 endprog = (endprogs[initial] or endprogs[token[1]] or
535 endprogs[token[2]])
536 endprogs[token[2]])
536 contstr, needcont = line[start:], 1
537 contstr, needcont = line[start:], 1
537 contline = line
538 contline = line
538 break
539 break
539 else: # ordinary string
540 else: # ordinary string
540 yield TokenInfo(STRING, token, spos, epos, line)
541 yield TokenInfo(STRING, token, spos, epos, line)
541 elif initial.isidentifier(): # ordinary name
542 elif initial.isidentifier(): # ordinary name
542 yield TokenInfo(NAME, token, spos, epos, line)
543 yield TokenInfo(NAME, token, spos, epos, line)
543 elif initial == '\\': # continued stmt
544 elif initial == '\\': # continued stmt
544 continued = 1
545 continued = 1
545 else:
546 else:
546 if initial in '([{':
547 if initial in '([{':
547 parenlev += 1
548 parenlev += 1
548 elif initial in ')]}':
549 elif initial in ')]}':
549 parenlev -= 1
550 parenlev -= 1
550 yield TokenInfo(OP, token, spos, epos, line)
551 yield TokenInfo(OP, token, spos, epos, line)
551 else:
552 else:
552 yield TokenInfo(ERRORTOKEN, line[pos],
553 yield TokenInfo(ERRORTOKEN, line[pos],
553 (lnum, pos), (lnum, pos+1), line)
554 (lnum, pos), (lnum, pos+1), line)
554 pos += 1
555 pos += 1
555
556
556 for indent in indents[1:]: # pop remaining indent levels
557 for indent in indents[1:]: # pop remaining indent levels
557 yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
558 yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
558 yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
559 yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
559
560
560
561
561 # An undocumented, backwards compatible, API for all the places in the standard
562 # An undocumented, backwards compatible, API for all the places in the standard
562 # library that expect to be able to use tokenize with strings
563 # library that expect to be able to use tokenize with strings
563 def generate_tokens(readline):
564 def generate_tokens(readline):
564 return _tokenize(readline, None)
565 return _tokenize(readline, None)
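# Illustrative sketch, not part of the original diff: the string-based
# compatibility API takes a readline returning str and, because no decoding
# is involved, emits no ENCODING token.
from io import StringIO
for tok in generate_tokens(StringIO("a = b + 1\n").readline):
    print(tok)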
565
566
566 if __name__ == "__main__":
567 if __name__ == "__main__":
567 # Quick sanity check
568 # Quick sanity check
568 s = b'''def parseline(self, line):
569 s = b'''def parseline(self, line):
569 """Parse the line into a command name and a string containing
570 """Parse the line into a command name and a string containing
570 the arguments. Returns a tuple containing (command, args, line).
571 the arguments. Returns a tuple containing (command, args, line).
571 'command' and 'args' may be None if the line couldn't be parsed.
572 'command' and 'args' may be None if the line couldn't be parsed.
572 """
573 """
573 line = line.strip()
574 line = line.strip()
574 if not line:
575 if not line:
575 return None, None, line
576 return None, None, line
576 elif line[0] == '?':
577 elif line[0] == '?':
577 line = 'help ' + line[1:]
578 line = 'help ' + line[1:]
578 elif line[0] == '!':
579 elif line[0] == '!':
579 if hasattr(self, 'do_shell'):
580 if hasattr(self, 'do_shell'):
580 line = 'shell ' + line[1:]
581 line = 'shell ' + line[1:]
581 else:
582 else:
582 return None, None, line
583 return None, None, line
583 i, n = 0, len(line)
584 i, n = 0, len(line)
584 while i < n and line[i] in self.identchars: i = i+1
585 while i < n and line[i] in self.identchars: i = i+1
585 cmd, arg = line[:i], line[i:].strip()
586 cmd, arg = line[:i], line[i:].strip()
586 return cmd, arg, line
587 return cmd, arg, line
587 '''
588 '''
588 for tok in tokenize(iter(s.splitlines()).__next__):
589 for tok in tokenize(iter(s.splitlines()).__next__):
589 print(tok)
590 print(tok)