From 30fce8a6d59c9a5984f20d787cdd9c3885d85bea 2013-03-31 09:03:55 From: Thomas Kluyver Date: 2013-03-31 09:03:55 Subject: [PATCH] Now include patched copies of tokenize for Python 2 and 3. --- diff --git a/IPython/core/inputtransformer.py b/IPython/core/inputtransformer.py index 5a6c717..f5a9fe5 100644 --- a/IPython/core/inputtransformer.py +++ b/IPython/core/inputtransformer.py @@ -2,17 +2,10 @@ import abc import functools import re from StringIO import StringIO -import tokenize - -try: - generate_tokens = tokenize.generate_tokens -except AttributeError: - # Python 3. Note that we use the undocumented _tokenize because it expects - # strings, not bytes. See also Python issue #9969. - generate_tokens = tokenize._tokenize from IPython.core.splitinput import split_user_input, LineInfo -from IPython.utils.untokenize import untokenize +from IPython.utils import tokenize2 +from IPython.utils.tokenize2 import generate_tokens, untokenize, TokenError #----------------------------------------------------------------------------- # Globals @@ -129,7 +122,7 @@ class TokenInputTransformer(InputTransformer): def get_line(self): if self.line_used: - raise tokenize.TokenError + raise TokenError self.line_used = True return self.current_line @@ -145,12 +138,12 @@ class TokenInputTransformer(InputTransformer): for intok in self.tokenizer: tokens.append(intok) t = intok[0] - if t == tokenize.NEWLINE or (stop_at_NL and t == tokenize.NL): + if t == tokenize2.NEWLINE or (stop_at_NL and t == tokenize2.NL): # Stop before we try to pull a line we don't have yet break - elif t in (tokenize.COMMENT, tokenize.ERRORTOKEN): + elif t == tokenize2.ERRORTOKEN: stop_at_NL = True - except tokenize.TokenError: + except TokenError: # Multi-line statement - stop and try again with the next line self.reset_tokenizer() return None @@ -297,11 +290,11 @@ def has_comment(src): readline = StringIO(src).readline toktypes = set() try: - for t in tokenize.generate_tokens(readline): + for t in generate_tokens(readline): toktypes.add(t[0]) - except tokenize.TokenError: + except TokenError: pass - return(tokenize.COMMENT in toktypes) + return(tokenize2.COMMENT in toktypes) @StatelessInputTransformer.wrap diff --git a/IPython/utils/_tokenize_py2.py b/IPython/utils/_tokenize_py2.py new file mode 100644 index 0000000..16a3c9d --- /dev/null +++ b/IPython/utils/_tokenize_py2.py @@ -0,0 +1,438 @@ +"""Patched version of standard library tokenize, to deal with various bugs. + +Patches + +- Relevant parts of Gareth Rees' patch for Python issue #12691 (untokenizing), + manually applied. +- Newlines in comments and blank lines should be either NL or NEWLINE, depending + on whether they are in a multi-line statement. Filed as Python issue #17061. + +------------------------------------------------------------------------------- +Tokenization help for Python programs. + +generate_tokens(readline) is a generator that breaks a stream of +text into Python tokens. It accepts a readline-like method which is called +repeatedly to get the next line of input (or "" for EOF). 
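# A minimal sketch (not part of this diff) of the readline-driven interface the
# docstring above describes, shown with the Python 3 standard-library tokenize;
# the patched generate_tokens used by inputtransformer.py is driven the same way.
from io import StringIO
import tokenize

src = "total = 1 + 2  # running sum\n"
for tok in tokenize.generate_tokens(StringIO(src).readline):
    # Each token is a 5-tuple: (type, string, (srow, scol), (erow, ecol), line).
    print(tokenize.tok_name[tok[0]], repr(tok[1]))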
It generates +5-tuples with these members: + + the token type (see token.py) + the token (a string) + the starting (row, column) indices of the token (a 2-tuple of ints) + the ending (row, column) indices of the token (a 2-tuple of ints) + the original line (string) + +It is designed to match the working of the Python tokenizer exactly, except +that it produces COMMENT tokens for comments and gives type OP for all +operators + +Older entry points + tokenize_loop(readline, tokeneater) + tokenize(readline, tokeneater=printtoken) +are the same, except instead of generating tokens, tokeneater is a callback +function to which the 5 fields described above are passed as 5 arguments, +each time a new token is found.""" + +__author__ = 'Ka-Ping Yee ' +__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, ' + 'Skip Montanaro, Raymond Hettinger') + +import string, re +from token import * + +import token +__all__ = [x for x in dir(token) if not x.startswith("_")] +__all__ += ["COMMENT", "tokenize", "generate_tokens", "NL", "untokenize"] +del x +del token + +__all__ += ["TokenError"] + +COMMENT = N_TOKENS +tok_name[COMMENT] = 'COMMENT' +NL = N_TOKENS + 1 +tok_name[NL] = 'NL' +N_TOKENS += 2 + +def group(*choices): return '(' + '|'.join(choices) + ')' +def any(*choices): return group(*choices) + '*' +def maybe(*choices): return group(*choices) + '?' + +Whitespace = r'[ \f\t]*' +Comment = r'#[^\r\n]*' +Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment) +Name = r'[a-zA-Z_]\w*' + +Hexnumber = r'0[xX][\da-fA-F]+[lL]?' +Octnumber = r'(0[oO][0-7]+)|(0[0-7]*)[lL]?' +Binnumber = r'0[bB][01]+[lL]?' +Decnumber = r'[1-9]\d*[lL]?' +Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber) +Exponent = r'[eE][-+]?\d+' +Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent) +Expfloat = r'\d+' + Exponent +Floatnumber = group(Pointfloat, Expfloat) +Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]') +Number = group(Imagnumber, Floatnumber, Intnumber) + +# Tail end of ' string. +Single = r"[^'\\]*(?:\\.[^'\\]*)*'" +# Tail end of " string. +Double = r'[^"\\]*(?:\\.[^"\\]*)*"' +# Tail end of ''' string. +Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''" +# Tail end of """ string. +Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""' +Triple = group("[uUbB]?[rR]?'''", '[uUbB]?[rR]?"""') +# Single-line ' or " string. +String = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'", + r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"') + +# Because of leftmost-then-longest match semantics, be sure to put the +# longest operators first (e.g., if = came before ==, == would get +# recognized as two instances of =). +Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=", + r"//=?", + r"[+\-*/%&|^=<>]=?", + r"~") + +Bracket = '[][(){}]' +Special = group(r'\r?\n', r'[:;.,`@]') +Funny = group(Operator, Bracket, Special) + +PlainToken = group(Number, Funny, String, Name) +Token = Ignore + PlainToken + +# First (or only) line of ' or " string. 
+ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" + + group("'", r'\\\r?\n'), + r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' + + group('"', r'\\\r?\n')) +PseudoExtras = group(r'\\\r?\n', Comment, Triple) +PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name) + +tokenprog, pseudoprog, single3prog, double3prog = map( + re.compile, (Token, PseudoToken, Single3, Double3)) +endprogs = {"'": re.compile(Single), '"': re.compile(Double), + "'''": single3prog, '"""': double3prog, + "r'''": single3prog, 'r"""': double3prog, + "u'''": single3prog, 'u"""': double3prog, + "ur'''": single3prog, 'ur"""': double3prog, + "R'''": single3prog, 'R"""': double3prog, + "U'''": single3prog, 'U"""': double3prog, + "uR'''": single3prog, 'uR"""': double3prog, + "Ur'''": single3prog, 'Ur"""': double3prog, + "UR'''": single3prog, 'UR"""': double3prog, + "b'''": single3prog, 'b"""': double3prog, + "br'''": single3prog, 'br"""': double3prog, + "B'''": single3prog, 'B"""': double3prog, + "bR'''": single3prog, 'bR"""': double3prog, + "Br'''": single3prog, 'Br"""': double3prog, + "BR'''": single3prog, 'BR"""': double3prog, + 'r': None, 'R': None, 'u': None, 'U': None, + 'b': None, 'B': None} + +triple_quoted = {} +for t in ("'''", '"""', + "r'''", 'r"""', "R'''", 'R"""', + "u'''", 'u"""', "U'''", 'U"""', + "ur'''", 'ur"""', "Ur'''", 'Ur"""', + "uR'''", 'uR"""', "UR'''", 'UR"""', + "b'''", 'b"""', "B'''", 'B"""', + "br'''", 'br"""', "Br'''", 'Br"""', + "bR'''", 'bR"""', "BR'''", 'BR"""'): + triple_quoted[t] = t +single_quoted = {} +for t in ("'", '"', + "r'", 'r"', "R'", 'R"', + "u'", 'u"', "U'", 'U"', + "ur'", 'ur"', "Ur'", 'Ur"', + "uR'", 'uR"', "UR'", 'UR"', + "b'", 'b"', "B'", 'B"', + "br'", 'br"', "Br'", 'Br"', + "bR'", 'bR"', "BR'", 'BR"' ): + single_quoted[t] = t + +tabsize = 8 + +class TokenError(Exception): pass + +class StopTokenizing(Exception): pass + +def printtoken(type, token, srow_scol, erow_ecol, line): # for testing + srow, scol = srow_scol + erow, ecol = erow_ecol + print "%d,%d-%d,%d:\t%s\t%s" % \ + (srow, scol, erow, ecol, tok_name[type], repr(token)) + +def tokenize(readline, tokeneater=printtoken): + """ + The tokenize() function accepts two parameters: one representing the + input stream, and one providing an output mechanism for tokenize(). + + The first parameter, readline, must be a callable object which provides + the same interface as the readline() method of built-in file objects. + Each call to the function should return one line of input as a string. + + The second parameter, tokeneater, must also be a callable object. It is + called once for each token, with five arguments, corresponding to the + tuples generated by generate_tokens(). 
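# The older callback entry point described above (Python 2 era) hands the same
# five fields to a tokeneater function; a hedged sketch, assuming the patched
# module is importable as IPython.utils.tokenize2 and re-exports both names.
from IPython.utils.tokenize2 import tokenize, tok_name  # assumed import path

def print_comments(tok_type, tok_string, start, end, line):
    # start and end are (row, col) pairs; line is the full source line.
    if tok_name[tok_type] == 'COMMENT':
        print("%d,%d: %s" % (start[0], start[1], tok_string))

# tokenize(readline, tokeneater) calls the callback once per token, e.g.:
# tokenize(open('some_file.py').readline, print_comments)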
+ """ + try: + tokenize_loop(readline, tokeneater) + except StopTokenizing: + pass + +# backwards compatible interface +def tokenize_loop(readline, tokeneater): + for token_info in generate_tokens(readline): + tokeneater(*token_info) + +class Untokenizer: + + def __init__(self): + self.tokens = [] + self.prev_row = 1 + self.prev_col = 0 + + def add_whitespace(self, start): + row, col = start + assert row >= self.prev_row + col_offset = col - self.prev_col + if col_offset > 0: + self.tokens.append(" " * col_offset) + elif row > self.prev_row and tok_type not in (NEWLINE, NL, ENDMARKER): + # Line was backslash-continued + self.tokens.append(" ") + + def untokenize(self, tokens): + iterable = iter(tokens) + for t in iterable: + if len(t) == 2: + self.compat(t, iterable) + break + tok_type, token, start, end = t[:4] + self.add_whitespace(start) + self.tokens.append(token) + self.prev_row, self.prev_col = end + if tok_type in (NEWLINE, NL): + self.prev_row += 1 + self.prev_col = 0 + return "".join(self.tokens) + + def compat(self, token, iterable): + # This import is here to avoid problems when the itertools + # module is not built yet and tokenize is imported. + from itertools import chain + startline = False + prevstring = False + indents = [] + toks_append = self.tokens.append + for tok in chain([token], iterable): + toknum, tokval = tok[:2] + + if toknum in (NAME, NUMBER): + tokval += ' ' + + # Insert a space between two consecutive strings + if toknum == STRING: + if prevstring: + tokval = ' ' + tokval + prevstring = True + else: + prevstring = False + + if toknum == INDENT: + indents.append(tokval) + continue + elif toknum == DEDENT: + indents.pop() + continue + elif toknum in (NEWLINE, NL): + startline = True + elif startline and indents: + toks_append(indents[-1]) + startline = False + toks_append(tokval) + +def untokenize(iterable): + """Transform tokens back into Python source code. + + Each element returned by the iterable must be a token sequence + with at least two elements, a token number and token value. If + only two tokens are passed, the resulting output is poor. + + Round-trip invariant for full input: + Untokenized source will match input source exactly + + Round-trip invariant for limited intput: + # Output text will tokenize the back to the input + t1 = [tok[:2] for tok in generate_tokens(f.readline)] + newcode = untokenize(t1) + readline = iter(newcode.splitlines(1)).next + t2 = [tok[:2] for tok in generate_tokens(readline)] + assert t1 == t2 + """ + ut = Untokenizer() + return ut.untokenize(iterable) + +def generate_tokens(readline): + """ + The generate_tokens() generator requires one argment, readline, which + must be a callable object which provides the same interface as the + readline() method of built-in file objects. Each call to the function + should return one line of input as a string. Alternately, readline + can be a callable function terminating with StopIteration: + readline = open(myfile).next # Example of alternate readline + + The generator produces 5-tuples with these members: the token type; the + token string; a 2-tuple (srow, scol) of ints specifying the row and + column where the token begins in the source; a 2-tuple (erow, ecol) of + ints specifying the row and column where the token ends in the source; + and the line on which the token was found. The line passed is the + logical line; continuation lines are included. 
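# The round-trip invariants stated in the untokenize() docstring above can be
# exercised directly; a sketch for Python 3, assuming the patched module is
# importable as IPython.utils.tokenize2.
from io import StringIO
from IPython.utils.tokenize2 import generate_tokens, untokenize  # assumed path

src = "if x:\n    y = 1  # comment\n"
tokens = list(generate_tokens(StringIO(src).readline))

# Full 5-tuples carry positions, so the original spacing is reproduced exactly.
assert untokenize(tokens) == src

# 2-tuples (type, string) only guarantee that the output re-tokenizes the same.
pairs = [tok[:2] for tok in tokens]
roundtrip = untokenize(pairs)
assert [t[:2] for t in generate_tokens(StringIO(roundtrip).readline)] == pairs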
+ """ + lnum = parenlev = continued = 0 + namechars, numchars = string.ascii_letters + '_', '0123456789' + contstr, needcont = '', 0 + contline = None + indents = [0] + + while 1: # loop over lines in stream + try: + line = readline() + except StopIteration: + line = '' + lnum += 1 + pos, max = 0, len(line) + + if contstr: # continued string + if not line: + raise TokenError, ("EOF in multi-line string", strstart) + endmatch = endprog.match(line) + if endmatch: + pos = end = endmatch.end(0) + yield (STRING, contstr + line[:end], + strstart, (lnum, end), contline + line) + contstr, needcont = '', 0 + contline = None + elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n': + yield (ERRORTOKEN, contstr + line, + strstart, (lnum, len(line)), contline) + contstr = '' + contline = None + continue + else: + contstr = contstr + line + contline = contline + line + continue + + elif parenlev == 0 and not continued: # new statement + if not line: break + column = 0 + while pos < max: # measure leading whitespace + if line[pos] == ' ': + column += 1 + elif line[pos] == '\t': + column = (column//tabsize + 1)*tabsize + elif line[pos] == '\f': + column = 0 + else: + break + pos += 1 + if pos == max: + break + + if line[pos] in '#\r\n': # skip comments or blank lines + if line[pos] == '#': + comment_token = line[pos:].rstrip('\r\n') + nl_pos = pos + len(comment_token) + yield (COMMENT, comment_token, + (lnum, pos), (lnum, pos + len(comment_token)), line) + yield (NEWLINE, line[nl_pos:], + (lnum, nl_pos), (lnum, len(line)), line) + else: + yield (NEWLINE, line[pos:], + (lnum, pos), (lnum, len(line)), line) + continue + + if column > indents[-1]: # count indents or dedents + indents.append(column) + yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line) + while column < indents[-1]: + if column not in indents: + raise IndentationError( + "unindent does not match any outer indentation level", + ("", lnum, pos, line)) + indents = indents[:-1] + yield (DEDENT, '', (lnum, pos), (lnum, pos), line) + + else: # continued statement + if not line: + raise TokenError, ("EOF in multi-line statement", (lnum, 0)) + continued = 0 + + while pos < max: + pseudomatch = pseudoprog.match(line, pos) + if pseudomatch: # scan for tokens + start, end = pseudomatch.span(1) + spos, epos, pos = (lnum, start), (lnum, end), end + token, initial = line[start:end], line[start] + + if initial in numchars or \ + (initial == '.' 
and token != '.'): # ordinary number + yield (NUMBER, token, spos, epos, line) + elif initial in '\r\n': + yield (NL if parenlev > 0 else NEWLINE, + token, spos, epos, line) + elif initial == '#': + assert not token.endswith("\n") + yield (COMMENT, token, spos, epos, line) + elif token in triple_quoted: + endprog = endprogs[token] + endmatch = endprog.match(line, pos) + if endmatch: # all on one line + pos = endmatch.end(0) + token = line[start:pos] + yield (STRING, token, spos, (lnum, pos), line) + else: + strstart = (lnum, start) # multiple lines + contstr = line[start:] + contline = line + break + elif initial in single_quoted or \ + token[:2] in single_quoted or \ + token[:3] in single_quoted: + if token[-1] == '\n': # continued string + strstart = (lnum, start) + endprog = (endprogs[initial] or endprogs[token[1]] or + endprogs[token[2]]) + contstr, needcont = line[start:], 1 + contline = line + break + else: # ordinary string + yield (STRING, token, spos, epos, line) + elif initial in namechars: # ordinary name + yield (NAME, token, spos, epos, line) + elif initial == '\\': # continued stmt + continued = 1 + else: + if initial in '([{': + parenlev += 1 + elif initial in ')]}': + parenlev -= 1 + yield (OP, token, spos, epos, line) + else: + yield (ERRORTOKEN, line[pos], + (lnum, pos), (lnum, pos+1), line) + pos += 1 + + for indent in indents[1:]: # pop remaining indent levels + yield (DEDENT, '', (lnum, 0), (lnum, 0), '') + yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '') + +if __name__ == '__main__': # testing + import sys + if len(sys.argv) > 1: + tokenize(open(sys.argv[1]).readline) + else: + tokenize(sys.stdin.readline) diff --git a/IPython/utils/_tokenize_py3.py b/IPython/utils/_tokenize_py3.py new file mode 100644 index 0000000..a596fe4 --- /dev/null +++ b/IPython/utils/_tokenize_py3.py @@ -0,0 +1,574 @@ +"""Patched version of standard library tokenize, to deal with various bugs. + +Based on Python 3.2 code. + +Patches: + +- Gareth Rees' patch for Python issue #12691 (untokenizing) + - Except we don't encode the output of untokenize + - Python 2 compatible syntax, so that it can be byte-compiled at installation +- Newlines in comments and blank lines should be either NL or NEWLINE, depending + on whether they are in a multi-line statement. Filed as Python issue #17061. +- Export generate_tokens & TokenError + +------------------------------------------------------------------------------ +Tokenization help for Python programs. + +tokenize(readline) is a generator that breaks a stream of bytes into +Python tokens. It decodes the bytes according to PEP-0263 for +determining source file encoding. + +It accepts a readline-like method which is called repeatedly to get the +next line of input (or b"" for EOF). It generates 5-tuples with these +members: + + the token type (see token.py) + the token (a string) + the starting (row, column) indices of the token (a 2-tuple of ints) + the ending (row, column) indices of the token (a 2-tuple of ints) + the original line (string) + +It is designed to match the working of the Python tokenizer exactly, except +that it produces COMMENT tokens for comments and gives type OP for all +operators. Additionally, all token lists start with an ENCODING token +which tells you which encoding was used to decode the bytes stream. 
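# A minimal sketch of the bytes-oriented interface described above, shown with
# the Python 3 standard-library tokenize (the patched copy mirrors this shape):
# the first token of every stream reports the encoding used to decode the bytes.
import io
import tokenize

source = "x = 'h\u00e9llo'\n".encode('utf-8')
tokens = list(tokenize.tokenize(io.BytesIO(source).readline))

assert tokens[0].type == tokenize.ENCODING
assert tokens[0].string == 'utf-8'  # default when no BOM or coding cookie is present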
+""" +from __future__ import absolute_import + +__author__ = 'Ka-Ping Yee ' +__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, ' + 'Skip Montanaro, Raymond Hettinger, Trent Nelson, ' + 'Michael Foord') +import builtins +import re +import sys +from token import * +from codecs import lookup, BOM_UTF8 +import collections +from io import TextIOWrapper +cookie_re = re.compile("coding[:=]\s*([-\w.]+)") + +import token +__all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding", + "NL", "untokenize", "ENCODING", "TokenInfo"] +del token + +__all__ += ["generate_tokens", "TokenError"] + +COMMENT = N_TOKENS +tok_name[COMMENT] = 'COMMENT' +NL = N_TOKENS + 1 +tok_name[NL] = 'NL' +ENCODING = N_TOKENS + 2 +tok_name[ENCODING] = 'ENCODING' +N_TOKENS += 3 + +class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')): + def __repr__(self): + annotated_type = '%d (%s)' % (self.type, tok_name[self.type]) + return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' % + self._replace(type=annotated_type)) + +def group(*choices): return '(' + '|'.join(choices) + ')' +def any(*choices): return group(*choices) + '*' +def maybe(*choices): return group(*choices) + '?' + +# Note: we use unicode matching for names ("\w") but ascii matching for +# number literals. +Whitespace = r'[ \f\t]*' +Comment = r'#[^\r\n]*' +Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment) +Name = r'\w+' + +Hexnumber = r'0[xX][0-9a-fA-F]+' +Binnumber = r'0[bB][01]+' +Octnumber = r'0[oO][0-7]+' +Decnumber = r'(?:0+|[1-9][0-9]*)' +Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber) +Exponent = r'[eE][-+]?[0-9]+' +Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent) +Expfloat = r'[0-9]+' + Exponent +Floatnumber = group(Pointfloat, Expfloat) +Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]') +Number = group(Imagnumber, Floatnumber, Intnumber) + +# Tail end of ' string. +Single = r"[^'\\]*(?:\\.[^'\\]*)*'" +# Tail end of " string. +Double = r'[^"\\]*(?:\\.[^"\\]*)*"' +# Tail end of ''' string. +Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''" +# Tail end of """ string. +Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""' +Triple = group("[bB]?[rR]?'''", '[bB]?[rR]?"""') +# Single-line ' or " string. +String = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'", + r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"') + +# Because of leftmost-then-longest match semantics, be sure to put the +# longest operators first (e.g., if = came before ==, == would get +# recognized as two instances of =). +Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=", + r"//=?", r"->", + r"[+\-*/%&|^=<>]=?", + r"~") + +Bracket = '[][(){}]' +Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]') +Funny = group(Operator, Bracket, Special) + +PlainToken = group(Number, Funny, String, Name) +Token = Ignore + PlainToken + +# First (or only) line of ' or " string. 
+ContStr = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" + + group("'", r'\\\r?\n'), + r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' + + group('"', r'\\\r?\n')) +PseudoExtras = group(r'\\\r?\n', Comment, Triple) +PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name) + +def _compile(expr): + return re.compile(expr, re.UNICODE) + +tokenprog, pseudoprog, single3prog, double3prog = map( + _compile, (Token, PseudoToken, Single3, Double3)) +endprogs = {"'": _compile(Single), '"': _compile(Double), + "'''": single3prog, '"""': double3prog, + "r'''": single3prog, 'r"""': double3prog, + "b'''": single3prog, 'b"""': double3prog, + "br'''": single3prog, 'br"""': double3prog, + "R'''": single3prog, 'R"""': double3prog, + "B'''": single3prog, 'B"""': double3prog, + "bR'''": single3prog, 'bR"""': double3prog, + "Br'''": single3prog, 'Br"""': double3prog, + "BR'''": single3prog, 'BR"""': double3prog, + 'r': None, 'R': None, 'b': None, 'B': None} + +triple_quoted = {} +for t in ("'''", '"""', + "r'''", 'r"""', "R'''", 'R"""', + "b'''", 'b"""', "B'''", 'B"""', + "br'''", 'br"""', "Br'''", 'Br"""', + "bR'''", 'bR"""', "BR'''", 'BR"""'): + triple_quoted[t] = t +single_quoted = {} +for t in ("'", '"', + "r'", 'r"', "R'", 'R"', + "b'", 'b"', "B'", 'B"', + "br'", 'br"', "Br'", 'Br"', + "bR'", 'bR"', "BR'", 'BR"' ): + single_quoted[t] = t + +del _compile + +tabsize = 8 + +class TokenError(Exception): pass + +class StopTokenizing(Exception): pass + + +class Untokenizer: + + def __init__(self): + self.tokens = [] + self.prev_row = 1 + self.prev_col = 0 + self.encoding = 'utf-8' + + def add_whitespace(self, tok_type, start): + row, col = start + assert row >= self.prev_row + col_offset = col - self.prev_col + if col_offset > 0: + self.tokens.append(" " * col_offset) + elif row > self.prev_row and tok_type not in (NEWLINE, NL, ENDMARKER): + # Line was backslash-continued. + self.tokens.append(" ") + + def untokenize(self, tokens): + iterable = iter(tokens) + for t in iterable: + if len(t) == 2: + self.compat(t, iterable) + break + tok_type, token, start, end = t[:4] + if tok_type == ENCODING: + self.encoding = token + continue + self.add_whitespace(tok_type, start) + self.tokens.append(token) + self.prev_row, self.prev_col = end + if tok_type in (NEWLINE, NL): + self.prev_row += 1 + self.prev_col = 0 + return "".join(self.tokens) + + def compat(self, token, iterable): + # This import is here to avoid problems when the itertools + # module is not built yet and tokenize is imported. + from itertools import chain + startline = False + prevstring = False + indents = [] + toks_append = self.tokens.append + + for tok in chain([token], iterable): + toknum, tokval = tok[:2] + if toknum == ENCODING: + self.encoding = tokval + continue + + if toknum in (NAME, NUMBER): + tokval += ' ' + + # Insert a space between two consecutive strings + if toknum == STRING: + if prevstring: + tokval = ' ' + tokval + prevstring = True + else: + prevstring = False + + if toknum == INDENT: + indents.append(tokval) + continue + elif toknum == DEDENT: + indents.pop() + continue + elif toknum in (NEWLINE, NL): + startline = True + elif startline and indents: + toks_append(indents[-1]) + startline = False + toks_append(tokval) + + +def untokenize(tokens): + """ + Convert ``tokens`` (an iterable) back into Python source code. Return + a bytes object, encoded using the encoding specified by the last + ENCODING token in ``tokens``, or UTF-8 if no ENCODING token is found. 
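# The add_whitespace() change above (part of the issue 12691 patch) inserts a
# single space when a token starts on a later row without an intervening
# NEWLINE/NL token, which is what a backslash-continued line produces; a sketch
# for Python 3, assuming the patched module is importable as IPython.utils.tokenize2.
from io import StringIO
from IPython.utils.tokenize2 import generate_tokens, untokenize  # assumed path

src = "value = 1 + \\\n    2\n"
tokens = list(generate_tokens(StringIO(src).readline))

# The backslash continuation collapses to a single separating space rather than
# the trailing token being glued onto the previous one;
# expected output: 'value = 1 + 2\n'
print(repr(untokenize(tokens)))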
+ + The result is guaranteed to tokenize back to match the input so that + the conversion is lossless and round-trips are assured. The + guarantee applies only to the token type and token string as the + spacing between tokens (column positions) may change. + + :func:`untokenize` has two modes. If the input tokens are sequences + of length 2 (``type``, ``string``) then spaces are added as necessary to + preserve the round-trip property. + + If the input tokens are sequences of length 4 or more (``type``, + ``string``, ``start``, ``end``), as returned by :func:`tokenize`, then + spaces are added so that each token appears in the result at the + position indicated by ``start`` and ``end``, if possible. + """ + return Untokenizer().untokenize(tokens) + + +def _get_normal_name(orig_enc): + """Imitates get_normal_name in tokenizer.c.""" + # Only care about the first 12 characters. + enc = orig_enc[:12].lower().replace("_", "-") + if enc == "utf-8" or enc.startswith("utf-8-"): + return "utf-8" + if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \ + enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")): + return "iso-8859-1" + return orig_enc + +def detect_encoding(readline): + """ + The detect_encoding() function is used to detect the encoding that should + be used to decode a Python source file. It requires one argment, readline, + in the same way as the tokenize() generator. + + It will call readline a maximum of twice, and return the encoding used + (as a string) and a list of any lines (left as bytes) it has read in. + + It detects the encoding from the presence of a utf-8 bom or an encoding + cookie as specified in pep-0263. If both a bom and a cookie are present, + but disagree, a SyntaxError will be raised. If the encoding cookie is an + invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found, + 'utf-8-sig' is returned. + + If no encoding is specified, then the default of 'utf-8' will be returned. + """ + bom_found = False + encoding = None + default = 'utf-8' + def read_or_stop(): + try: + return readline() + except StopIteration: + return b'' + + def find_cookie(line): + try: + # Decode as UTF-8. Either the line is an encoding declaration, + # in which case it should be pure ASCII, or it must be UTF-8 + # per default encoding. + line_string = line.decode('utf-8') + except UnicodeDecodeError: + raise SyntaxError("invalid or missing encoding declaration") + + matches = cookie_re.findall(line_string) + if not matches: + return None + encoding = _get_normal_name(matches[0]) + try: + codec = lookup(encoding) + except LookupError: + # This behaviour mimics the Python interpreter + raise SyntaxError("unknown encoding: " + encoding) + + if bom_found: + if encoding != 'utf-8': + # This behaviour mimics the Python interpreter + raise SyntaxError('encoding problem: utf-8') + encoding += '-sig' + return encoding + + first = read_or_stop() + if first.startswith(BOM_UTF8): + bom_found = True + first = first[3:] + default = 'utf-8-sig' + if not first: + return default, [] + + encoding = find_cookie(first) + if encoding: + return encoding, [first] + + second = read_or_stop() + if not second: + return default, [first] + + encoding = find_cookie(second) + if encoding: + return encoding, [first, second] + + return default, [first, second] + + +def open(filename): + """Open a file in read only mode using the encoding detected by + detect_encoding(). 
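# detect_encoding() as described above can be exercised on an in-memory buffer;
# a minimal sketch using the Python 3 standard-library function, which has the
# same contract as the copy in this patch.
import io
import tokenize

source = b"# -*- coding: latin-1 -*-\nname = 'value'\n"
encoding, lines = tokenize.detect_encoding(io.BytesIO(source).readline)

# The PEP 263 cookie on the first line wins, and the lines already consumed are
# returned so the caller can replay them to the tokenizer.
assert encoding == 'iso-8859-1'
assert lines == [b"# -*- coding: latin-1 -*-\n"]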
+ """ + buffer = builtins.open(filename, 'rb') + encoding, lines = detect_encoding(buffer.readline) + buffer.seek(0) + text = TextIOWrapper(buffer, encoding, line_buffering=True) + text.mode = 'r' + return text + + +def tokenize(readline): + """ + The tokenize() generator requires one argment, readline, which + must be a callable object which provides the same interface as the + readline() method of built-in file objects. Each call to the function + should return one line of input as bytes. Alternately, readline + can be a callable function terminating with StopIteration: + readline = open(myfile, 'rb').__next__ # Example of alternate readline + + The generator produces 5-tuples with these members: the token type; the + token string; a 2-tuple (srow, scol) of ints specifying the row and + column where the token begins in the source; a 2-tuple (erow, ecol) of + ints specifying the row and column where the token ends in the source; + and the line on which the token was found. The line passed is the + logical line; continuation lines are included. + + The first token sequence will always be an ENCODING token + which tells you which encoding was used to decode the bytes stream. + """ + # This import is here to avoid problems when the itertools module is not + # built yet and tokenize is imported. + from itertools import chain, repeat + encoding, consumed = detect_encoding(readline) + rl_gen = iter(readline, b"") + empty = repeat(b"") + return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding) + + +def _tokenize(readline, encoding): + lnum = parenlev = continued = 0 + numchars = '0123456789' + contstr, needcont = '', 0 + contline = None + indents = [0] + + if encoding is not None: + if encoding == "utf-8-sig": + # BOM will already have been stripped. 
+ encoding = "utf-8" + yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '') + while True: # loop over lines in stream + try: + line = readline() + except StopIteration: + line = b'' + + if encoding is not None: + line = line.decode(encoding) + lnum += 1 + pos, max = 0, len(line) + + if contstr: # continued string + if not line: + raise TokenError("EOF in multi-line string", strstart) + endmatch = endprog.match(line) + if endmatch: + pos = end = endmatch.end(0) + yield TokenInfo(STRING, contstr + line[:end], + strstart, (lnum, end), contline + line) + contstr, needcont = '', 0 + contline = None + elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n': + yield TokenInfo(ERRORTOKEN, contstr + line, + strstart, (lnum, len(line)), contline) + contstr = '' + contline = None + continue + else: + contstr = contstr + line + contline = contline + line + continue + + elif parenlev == 0 and not continued: # new statement + if not line: break + column = 0 + while pos < max: # measure leading whitespace + if line[pos] == ' ': + column += 1 + elif line[pos] == '\t': + column = (column//tabsize + 1)*tabsize + elif line[pos] == '\f': + column = 0 + else: + break + pos += 1 + if pos == max: + break + + if line[pos] in '#\r\n': # skip comments or blank lines + if line[pos] == '#': + comment_token = line[pos:].rstrip('\r\n') + nl_pos = pos + len(comment_token) + yield TokenInfo(COMMENT, comment_token, + (lnum, pos), (lnum, pos + len(comment_token)), line) + yield TokenInfo(NEWLINE, line[nl_pos:], + (lnum, nl_pos), (lnum, len(line)), line) + else: + yield TokenInfo(NEWLINE, line[pos:], + (lnum, pos), (lnum, len(line)), line) + continue + + if column > indents[-1]: # count indents or dedents + indents.append(column) + yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line) + while column < indents[-1]: + if column not in indents: + raise IndentationError( + "unindent does not match any outer indentation level", + ("", lnum, pos, line)) + indents = indents[:-1] + yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line) + + else: # continued statement + if not line: + raise TokenError("EOF in multi-line statement", (lnum, 0)) + continued = 0 + + while pos < max: + pseudomatch = pseudoprog.match(line, pos) + if pseudomatch: # scan for tokens + start, end = pseudomatch.span(1) + spos, epos, pos = (lnum, start), (lnum, end), end + token, initial = line[start:end], line[start] + + if (initial in numchars or # ordinary number + (initial == '.' and token != '.' 
and token != '...')): + yield TokenInfo(NUMBER, token, spos, epos, line) + elif initial in '\r\n': + yield TokenInfo(NL if parenlev > 0 else NEWLINE, + token, spos, epos, line) + elif initial == '#': + assert not token.endswith("\n") + yield TokenInfo(COMMENT, token, spos, epos, line) + elif token in triple_quoted: + endprog = endprogs[token] + endmatch = endprog.match(line, pos) + if endmatch: # all on one line + pos = endmatch.end(0) + token = line[start:pos] + yield TokenInfo(STRING, token, spos, (lnum, pos), line) + else: + strstart = (lnum, start) # multiple lines + contstr = line[start:] + contline = line + break + elif initial in single_quoted or \ + token[:2] in single_quoted or \ + token[:3] in single_quoted: + if token[-1] == '\n': # continued string + strstart = (lnum, start) + endprog = (endprogs[initial] or endprogs[token[1]] or + endprogs[token[2]]) + contstr, needcont = line[start:], 1 + contline = line + break + else: # ordinary string + yield TokenInfo(STRING, token, spos, epos, line) + elif initial.isidentifier(): # ordinary name + yield TokenInfo(NAME, token, spos, epos, line) + elif initial == '\\': # continued stmt + continued = 1 + else: + if initial in '([{': + parenlev += 1 + elif initial in ')]}': + parenlev -= 1 + yield TokenInfo(OP, token, spos, epos, line) + else: + yield TokenInfo(ERRORTOKEN, line[pos], + (lnum, pos), (lnum, pos+1), line) + pos += 1 + + for indent in indents[1:]: # pop remaining indent levels + yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '') + yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '') + + +# An undocumented, backwards compatible, API for all the places in the standard +# library that expect to be able to use tokenize with strings +def generate_tokens(readline): + return _tokenize(readline, None) + +if __name__ == "__main__": + # Quick sanity check + s = b'''def parseline(self, line): + """Parse the line into a command name and a string containing + the arguments. Returns a tuple containing (command, args, line). + 'command' and 'args' may be None if the line couldn't be parsed. + """ + line = line.strip() + if not line: + return None, None, line + elif line[0] == '?': + line = 'help ' + line[1:] + elif line[0] == '!': + if hasattr(self, 'do_shell'): + line = 'shell ' + line[1:] + else: + return None, None, line + i, n = 0, len(line) + while i < n and line[i] in self.identchars: i = i+1 + cmd, arg = line[:i], line[i:].strip() + return cmd, arg, line + ''' + for tok in tokenize(iter(s.splitlines()).__next__): + print(tok) diff --git a/IPython/utils/tokenize2.py b/IPython/utils/tokenize2.py new file mode 100644 index 0000000..d44a2f7 --- /dev/null +++ b/IPython/utils/tokenize2.py @@ -0,0 +1,9 @@ +"""Load our patched versions of tokenize. +""" + +import sys + +if sys.version_info[0] >= 3: + from _tokenize_py3 import * +else: + from _tokenize_py2 import * diff --git a/IPython/utils/untokenize.py b/IPython/utils/untokenize.py deleted file mode 100644 index ee87f89..0000000 --- a/IPython/utils/untokenize.py +++ /dev/null @@ -1,125 +0,0 @@ -"""This is a patched copy of the untokenize machinery from the standard library. - -untokenize has a number of major bugs that render it almost useless. We're using -the patch written by Gareth Rees on Python issue 12961: - -http://bugs.python.org/issue12691 - -We've undone one part of the patch - it encoded the output to bytes, to neatly -round-trip from tokenize. We want to keep working with text, so we don't encode. 
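# As noted above, IPython's copy keeps untokenize() working with text; a sketch,
# assuming the replacement IPython.utils.tokenize2 module behaves the same way.
from io import StringIO
from IPython.utils.tokenize2 import generate_tokens, untokenize  # assumed path

tokens = generate_tokens(StringIO("a = b\n").readline)
result = untokenize(tokens)

# No ENCODING token is involved when tokenizing text, so the result stays a str
# rather than being encoded to bytes as the stdlib Python 3 round-trip would do.
assert isinstance(result, str)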
-""" - -__author__ = 'Ka-Ping Yee ' -__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, ' - 'Skip Montanaro, Raymond Hettinger, Trent Nelson, ' - 'Michael Foord') -from token import * - - -from tokenize import COMMENT, NL - -try: - # Python 3 - from tokenize import ENCODING -except: - ENCODING = 987654321 - -class Untokenizer: - - def __init__(self): - self.tokens = [] - self.prev_row = 1 - self.prev_col = 0 - self.encoding = 'utf-8' - - def add_whitespace(self, tok_type, start): - row, col = start - assert row >= self.prev_row - col_offset = col - self.prev_col - if col_offset > 0: - self.tokens.append(" " * col_offset) - elif row > self.prev_row and tok_type not in (NEWLINE, NL, ENDMARKER): - # Line was backslash-continued. - self.tokens.append(" ") - - def untokenize(self, tokens): - iterable = iter(tokens) - for t in iterable: - if len(t) == 2: - self.compat(t, iterable) - break - # IPython modification - valid Python 2 syntax - tok_type, token, start, end = t[:4] - if tok_type == ENCODING: - self.encoding = token - continue - self.add_whitespace(tok_type, start) - self.tokens.append(token) - self.prev_row, self.prev_col = end - if tok_type in (NEWLINE, NL): - self.prev_row += 1 - self.prev_col = 0 - # IPython modification - don't encode output - return "".join(self.tokens) - - def compat(self, token, iterable): - # This import is here to avoid problems when the itertools - # module is not built yet and tokenize is imported. - from itertools import chain - startline = False - prevstring = False - indents = [] - toks_append = self.tokens.append - - for tok in chain([token], iterable): - toknum, tokval = tok[:2] - if toknum == ENCODING: - self.encoding = tokval - continue - - if toknum in (NAME, NUMBER): - tokval += ' ' - - # Insert a space between two consecutive strings - if toknum == STRING: - if prevstring: - tokval = ' ' + tokval - prevstring = True - else: - prevstring = False - - if toknum == INDENT: - indents.append(tokval) - continue - elif toknum == DEDENT: - indents.pop() - continue - elif toknum in (NEWLINE, NL): - startline = True - elif startline and indents: - toks_append(indents[-1]) - startline = False - toks_append(tokval) - - -def untokenize(tokens): - """ - Convert ``tokens`` (an iterable) back into Python source code. Return - a bytes object, encoded using the encoding specified by the last - ENCODING token in ``tokens``, or UTF-8 if no ENCODING token is found. - - The result is guaranteed to tokenize back to match the input so that - the conversion is lossless and round-trips are assured. The - guarantee applies only to the token type and token string as the - spacing between tokens (column positions) may change. - - :func:`untokenize` has two modes. If the input tokens are sequences - of length 2 (``type``, ``string``) then spaces are added as necessary to - preserve the round-trip property. - - If the input tokens are sequences of length 4 or more (``type``, - ``string``, ``start``, ``end``), as returned by :func:`tokenize`, then - spaces are added so that each token appears in the result at the - position indicated by ``start`` and ``end``, if possible. - """ - return Untokenizer().untokenize(tokens)