diff --git a/IPython/core/inputtransformer.py b/IPython/core/inputtransformer.py index 2be8760..b1c0dd2 100644 --- a/IPython/core/inputtransformer.py +++ b/IPython/core/inputtransformer.py @@ -8,11 +8,11 @@ This includes the machinery to recognise and transform ``%magic`` commands, import abc import functools import re +import tokenize +from tokenize import generate_tokens, untokenize, TokenError from io import StringIO from IPython.core.splitinput import LineInfo -from IPython.utils import tokenize2 -from IPython.utils.tokenize2 import generate_tokens, untokenize, TokenError #----------------------------------------------------------------------------- # Globals @@ -140,10 +140,10 @@ class TokenInputTransformer(InputTransformer): for intok in self.tokenizer: tokens.append(intok) t = intok[0] - if t == tokenize2.NEWLINE or (stop_at_NL and t == tokenize2.NL): + if t == tokenize.NEWLINE or (stop_at_NL and t == tokenize.NL): # Stop before we try to pull a line we don't have yet break - elif t == tokenize2.ERRORTOKEN: + elif t == tokenize.ERRORTOKEN: stop_at_NL = True except TokenError: # Multi-line statement - stop and try again with the next line @@ -319,7 +319,7 @@ def has_comment(src): comment : bool True if source has a comment. """ - return (tokenize2.COMMENT in _line_tokens(src)) + return (tokenize.COMMENT in _line_tokens(src)) def ends_in_comment_or_string(src): """Indicates whether or not an input line ends in a comment or within @@ -336,7 +336,7 @@ def ends_in_comment_or_string(src): True if source ends in a comment or multiline string. """ toktypes = _line_tokens(src) - return (tokenize2.COMMENT in toktypes) or (_MULTILINE_STRING in toktypes) + return (tokenize.COMMENT in toktypes) or (_MULTILINE_STRING in toktypes) @StatelessInputTransformer.wrap diff --git a/IPython/utils/tokenize2.py b/IPython/utils/tokenize2.py deleted file mode 100644 index 97ac18d..0000000 --- a/IPython/utils/tokenize2.py +++ /dev/null @@ -1,590 +0,0 @@ -"""Patched version of standard library tokenize, to deal with various bugs. - -Based on Python 3.2 code. - -Patches: - -- Gareth Rees' patch for Python issue #12691 (untokenizing) - - Except we don't encode the output of untokenize - - Python 2 compatible syntax, so that it can be byte-compiled at installation -- Newlines in comments and blank lines should be either NL or NEWLINE, depending - on whether they are in a multi-line statement. Filed as Python issue #17061. -- Export generate_tokens & TokenError -- u and rb literals are allowed under Python 3.3 and above. - ------------------------------------------------------------------------------- - -Tokenization help for Python programs. - -tokenize(readline) is a generator that breaks a stream of bytes into -Python tokens. It decodes the bytes according to PEP-0263 for -determining source file encoding. - -It accepts a readline-like method which is called repeatedly to get the -next line of input (or b"" for EOF). It generates 5-tuples with these -members: - - the token type (see token.py) - the token (a string) - the starting (row, column) indices of the token (a 2-tuple of ints) - the ending (row, column) indices of the token (a 2-tuple of ints) - the original line (string) - -It is designed to match the working of the Python tokenizer exactly, except -that it produces COMMENT tokens for comments and gives type OP for all -operators. Additionally, all token lists start with an ENCODING token -which tells you which encoding was used to decode the bytes stream. -""" - -__author__ = 'Ka-Ping Yee ' -__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, ' - 'Skip Montanaro, Raymond Hettinger, Trent Nelson, ' - 'Michael Foord') -import builtins -import re -import sys -from token import * -from codecs import lookup, BOM_UTF8 -import collections -from io import TextIOWrapper -cookie_re = re.compile("coding[:=]\s*([-\w.]+)") - -import token -__all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding", - "NL", "untokenize", "ENCODING", "TokenInfo"] -del token - -__all__ += ["generate_tokens", "TokenError"] - -COMMENT = N_TOKENS -tok_name[COMMENT] = 'COMMENT' -NL = N_TOKENS + 1 -tok_name[NL] = 'NL' -ENCODING = N_TOKENS + 2 -tok_name[ENCODING] = 'ENCODING' -N_TOKENS += 3 - -class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')): - def __repr__(self): - annotated_type = '%d (%s)' % (self.type, tok_name[self.type]) - return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' % - self._replace(type=annotated_type)) - -def group(*choices): return '(' + '|'.join(choices) + ')' -def any(*choices): return group(*choices) + '*' -def maybe(*choices): return group(*choices) + '?' - -# Note: we use unicode matching for names ("\w") but ascii matching for -# number literals. -Whitespace = r'[ \f\t]*' -Comment = r'#[^\r\n]*' -Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment) -Name = r'\w+' - -Hexnumber = r'0[xX][0-9a-fA-F]+' -Binnumber = r'0[bB][01]+' -Octnumber = r'0[oO][0-7]+' -Decnumber = r'(?:0+|[1-9][0-9]*)' -Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber) -Exponent = r'[eE][-+]?[0-9]+' -Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent) -Expfloat = r'[0-9]+' + Exponent -Floatnumber = group(Pointfloat, Expfloat) -Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]') -Number = group(Imagnumber, Floatnumber, Intnumber) -StringPrefix = r'(?:[bB][rR]?|[rR][bB]?|[uU])?' - -# Tail end of ' string. -Single = r"[^'\\]*(?:\\.[^'\\]*)*'" -# Tail end of " string. -Double = r'[^"\\]*(?:\\.[^"\\]*)*"' -# Tail end of ''' string. -Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''" -# Tail end of """ string. -Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""' -Triple = group(StringPrefix + "'''", StringPrefix + '"""') -# Single-line ' or " string. -String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'", - StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"') - -# Because of leftmost-then-longest match semantics, be sure to put the -# longest operators first (e.g., if = came before ==, == would get -# recognized as two instances of =). -Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=", - r"//=?", r"->", - r"[+\-*/%&|^=<>]=?", - r"~") - -Bracket = '[][(){}]' -Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]') -Funny = group(Operator, Bracket, Special) - -PlainToken = group(Number, Funny, String, Name) -Token = Ignore + PlainToken - -# First (or only) line of ' or " string. -ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" + - group("'", r'\\\r?\n'), - StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' + - group('"', r'\\\r?\n')) -PseudoExtras = group(r'\\\r?\n', Comment, Triple) -PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name) - -def _compile(expr): - return re.compile(expr, re.UNICODE) - -tokenprog, pseudoprog, single3prog, double3prog = map( - _compile, (Token, PseudoToken, Single3, Double3)) -endprogs = {"'": _compile(Single), '"': _compile(Double), - "'''": single3prog, '"""': double3prog, - "r'''": single3prog, 'r"""': double3prog, - "b'''": single3prog, 'b"""': double3prog, - "R'''": single3prog, 'R"""': double3prog, - "B'''": single3prog, 'B"""': double3prog, - "br'''": single3prog, 'br"""': double3prog, - "bR'''": single3prog, 'bR"""': double3prog, - "Br'''": single3prog, 'Br"""': double3prog, - "BR'''": single3prog, 'BR"""': double3prog, - 'r': None, 'R': None, 'b': None, 'B': None} - -triple_quoted = {} -for t in ("'''", '"""', - "r'''", 'r"""', "R'''", 'R"""', - "b'''", 'b"""', "B'''", 'B"""', - "br'''", 'br"""', "Br'''", 'Br"""', - "bR'''", 'bR"""', "BR'''", 'BR"""'): - triple_quoted[t] = t -single_quoted = {} -for t in ("'", '"', - "r'", 'r"', "R'", 'R"', - "b'", 'b"', "B'", 'B"', - "br'", 'br"', "Br'", 'Br"', - "bR'", 'bR"', "BR'", 'BR"' ): - single_quoted[t] = t - -for _prefix in ['rb', 'rB', 'Rb', 'RB', 'u', 'U']: - _t2 = _prefix+'"""' - endprogs[_t2] = double3prog - triple_quoted[_t2] = _t2 - _t1 = _prefix + "'''" - endprogs[_t1] = single3prog - triple_quoted[_t1] = _t1 - single_quoted[_prefix+'"'] = _prefix+'"' - single_quoted[_prefix+"'"] = _prefix+"'" -del _prefix, _t2, _t1 -endprogs['u'] = None -endprogs['U'] = None - -del _compile - -tabsize = 8 - -class TokenError(Exception): pass - -class StopTokenizing(Exception): pass - - -class Untokenizer: - - def __init__(self): - self.tokens = [] - self.prev_row = 1 - self.prev_col = 0 - self.encoding = 'utf-8' - - def add_whitespace(self, tok_type, start): - row, col = start - assert row >= self.prev_row - col_offset = col - self.prev_col - if col_offset > 0: - self.tokens.append(" " * col_offset) - elif row > self.prev_row and tok_type not in (NEWLINE, NL, ENDMARKER): - # Line was backslash-continued. - self.tokens.append(" ") - - def untokenize(self, tokens): - iterable = iter(tokens) - for t in iterable: - if len(t) == 2: - self.compat(t, iterable) - break - tok_type, token, start, end = t[:4] - if tok_type == ENCODING: - self.encoding = token - continue - self.add_whitespace(tok_type, start) - self.tokens.append(token) - self.prev_row, self.prev_col = end - if tok_type in (NEWLINE, NL): - self.prev_row += 1 - self.prev_col = 0 - return "".join(self.tokens) - - def compat(self, token, iterable): - # This import is here to avoid problems when the itertools - # module is not built yet and tokenize is imported. - from itertools import chain - startline = False - prevstring = False - indents = [] - toks_append = self.tokens.append - - for tok in chain([token], iterable): - toknum, tokval = tok[:2] - if toknum == ENCODING: - self.encoding = tokval - continue - - if toknum in (NAME, NUMBER): - tokval += ' ' - - # Insert a space between two consecutive strings - if toknum == STRING: - if prevstring: - tokval = ' ' + tokval - prevstring = True - else: - prevstring = False - - if toknum == INDENT: - indents.append(tokval) - continue - elif toknum == DEDENT: - indents.pop() - continue - elif toknum in (NEWLINE, NL): - startline = True - elif startline and indents: - toks_append(indents[-1]) - startline = False - toks_append(tokval) - - -def untokenize(tokens): - """ - Convert ``tokens`` (an iterable) back into Python source code. Return - a bytes object, encoded using the encoding specified by the last - ENCODING token in ``tokens``, or UTF-8 if no ENCODING token is found. - - The result is guaranteed to tokenize back to match the input so that - the conversion is lossless and round-trips are assured. The - guarantee applies only to the token type and token string as the - spacing between tokens (column positions) may change. - - :func:`untokenize` has two modes. If the input tokens are sequences - of length 2 (``type``, ``string``) then spaces are added as necessary to - preserve the round-trip property. - - If the input tokens are sequences of length 4 or more (``type``, - ``string``, ``start``, ``end``), as returned by :func:`tokenize`, then - spaces are added so that each token appears in the result at the - position indicated by ``start`` and ``end``, if possible. - """ - return Untokenizer().untokenize(tokens) - - -def _get_normal_name(orig_enc): - """Imitates get_normal_name in tokenizer.c.""" - # Only care about the first 12 characters. - enc = orig_enc[:12].lower().replace("_", "-") - if enc == "utf-8" or enc.startswith("utf-8-"): - return "utf-8" - if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \ - enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")): - return "iso-8859-1" - return orig_enc - -def detect_encoding(readline): - """ - The detect_encoding() function is used to detect the encoding that should - be used to decode a Python source file. It requires one argument, readline, - in the same way as the tokenize() generator. - - It will call readline a maximum of twice, and return the encoding used - (as a string) and a list of any lines (left as bytes) it has read in. - - It detects the encoding from the presence of a utf-8 bom or an encoding - cookie as specified in pep-0263. If both a bom and a cookie are present, - but disagree, a SyntaxError will be raised. If the encoding cookie is an - invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found, - 'utf-8-sig' is returned. - - If no encoding is specified, then the default of 'utf-8' will be returned. - """ - bom_found = False - encoding = None - default = 'utf-8' - def read_or_stop(): - try: - return readline() - except StopIteration: - return b'' - - def find_cookie(line): - try: - # Decode as UTF-8. Either the line is an encoding declaration, - # in which case it should be pure ASCII, or it must be UTF-8 - # per default encoding. - line_string = line.decode('utf-8') - except UnicodeDecodeError: - raise SyntaxError("invalid or missing encoding declaration") - - matches = cookie_re.findall(line_string) - if not matches: - return None - encoding = _get_normal_name(matches[0]) - try: - codec = lookup(encoding) - except LookupError: - # This behaviour mimics the Python interpreter - raise SyntaxError("unknown encoding: " + encoding) - - if bom_found: - if encoding != 'utf-8': - # This behaviour mimics the Python interpreter - raise SyntaxError('encoding problem: utf-8') - encoding += '-sig' - return encoding - - first = read_or_stop() - if first.startswith(BOM_UTF8): - bom_found = True - first = first[3:] - default = 'utf-8-sig' - if not first: - return default, [] - - encoding = find_cookie(first) - if encoding: - return encoding, [first] - - second = read_or_stop() - if not second: - return default, [first] - - encoding = find_cookie(second) - if encoding: - return encoding, [first, second] - - return default, [first, second] - - -def open(filename): - """Open a file in read only mode using the encoding detected by - detect_encoding(). - """ - buffer = builtins.open(filename, 'rb') - encoding, lines = detect_encoding(buffer.readline) - buffer.seek(0) - text = TextIOWrapper(buffer, encoding, line_buffering=True) - text.mode = 'r' - return text - - -def tokenize(readline): - """ - The tokenize() generator requires one argument, readline, which - must be a callable object which provides the same interface as the - readline() method of built-in file objects. Each call to the function - should return one line of input as bytes. Alternately, readline - can be a callable function terminating with :class:`StopIteration`:: - - readline = open(myfile, 'rb').__next__ # Example of alternate readline - - The generator produces 5-tuples with these members: the token type; the - token string; a 2-tuple (srow, scol) of ints specifying the row and - column where the token begins in the source; a 2-tuple (erow, ecol) of - ints specifying the row and column where the token ends in the source; - and the line on which the token was found. The line passed is the - logical line; continuation lines are included. - - The first token sequence will always be an ENCODING token - which tells you which encoding was used to decode the bytes stream. - """ - # This import is here to avoid problems when the itertools module is not - # built yet and tokenize is imported. - from itertools import chain, repeat - encoding, consumed = detect_encoding(readline) - rl_gen = iter(readline, b"") - empty = repeat(b"") - return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding) - - -def _tokenize(readline, encoding): - lnum = parenlev = continued = 0 - numchars = '0123456789' - contstr, needcont = '', 0 - contline = None - indents = [0] - - if encoding is not None: - if encoding == "utf-8-sig": - # BOM will already have been stripped. - encoding = "utf-8" - yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '') - while True: # loop over lines in stream - try: - line = readline() - except StopIteration: - line = b'' - - if encoding is not None: - line = line.decode(encoding) - lnum += 1 - pos, max = 0, len(line) - - if contstr: # continued string - if not line: - raise TokenError("EOF in multi-line string", strstart) - endmatch = endprog.match(line) - if endmatch: - pos = end = endmatch.end(0) - yield TokenInfo(STRING, contstr + line[:end], - strstart, (lnum, end), contline + line) - contstr, needcont = '', 0 - contline = None - elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n': - yield TokenInfo(ERRORTOKEN, contstr + line, - strstart, (lnum, len(line)), contline) - contstr = '' - contline = None - continue - else: - contstr = contstr + line - contline = contline + line - continue - - elif parenlev == 0 and not continued: # new statement - if not line: break - column = 0 - while pos < max: # measure leading whitespace - if line[pos] == ' ': - column += 1 - elif line[pos] == '\t': - column = (column//tabsize + 1)*tabsize - elif line[pos] == '\f': - column = 0 - else: - break - pos += 1 - if pos == max: - break - - if line[pos] in '#\r\n': # skip comments or blank lines - if line[pos] == '#': - comment_token = line[pos:].rstrip('\r\n') - nl_pos = pos + len(comment_token) - yield TokenInfo(COMMENT, comment_token, - (lnum, pos), (lnum, pos + len(comment_token)), line) - yield TokenInfo(NEWLINE, line[nl_pos:], - (lnum, nl_pos), (lnum, len(line)), line) - else: - yield TokenInfo(NEWLINE, line[pos:], - (lnum, pos), (lnum, len(line)), line) - continue - - if column > indents[-1]: # count indents or dedents - indents.append(column) - yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line) - while column < indents[-1]: - if column not in indents: - raise IndentationError( - "unindent does not match any outer indentation level", - ("", lnum, pos, line)) - indents = indents[:-1] - yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line) - - else: # continued statement - if not line: - raise TokenError("EOF in multi-line statement", (lnum, 0)) - continued = 0 - - while pos < max: - pseudomatch = pseudoprog.match(line, pos) - if pseudomatch: # scan for tokens - start, end = pseudomatch.span(1) - spos, epos, pos = (lnum, start), (lnum, end), end - token, initial = line[start:end], line[start] - - if (initial in numchars or # ordinary number - (initial == '.' and token != '.' and token != '...')): - yield TokenInfo(NUMBER, token, spos, epos, line) - elif initial in '\r\n': - yield TokenInfo(NL if parenlev > 0 else NEWLINE, - token, spos, epos, line) - elif initial == '#': - assert not token.endswith("\n") - yield TokenInfo(COMMENT, token, spos, epos, line) - elif token in triple_quoted: - endprog = endprogs[token] - endmatch = endprog.match(line, pos) - if endmatch: # all on one line - pos = endmatch.end(0) - token = line[start:pos] - yield TokenInfo(STRING, token, spos, (lnum, pos), line) - else: - strstart = (lnum, start) # multiple lines - contstr = line[start:] - contline = line - break - elif initial in single_quoted or \ - token[:2] in single_quoted or \ - token[:3] in single_quoted: - if token[-1] == '\n': # continued string - strstart = (lnum, start) - endprog = (endprogs[initial] or endprogs[token[1]] or - endprogs[token[2]]) - contstr, needcont = line[start:], 1 - contline = line - break - else: # ordinary string - yield TokenInfo(STRING, token, spos, epos, line) - elif initial.isidentifier(): # ordinary name - yield TokenInfo(NAME, token, spos, epos, line) - elif initial == '\\': # continued stmt - continued = 1 - else: - if initial in '([{': - parenlev += 1 - elif initial in ')]}': - parenlev -= 1 - yield TokenInfo(OP, token, spos, epos, line) - else: - yield TokenInfo(ERRORTOKEN, line[pos], - (lnum, pos), (lnum, pos+1), line) - pos += 1 - - for indent in indents[1:]: # pop remaining indent levels - yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '') - yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '') - - -# An undocumented, backwards compatible, API for all the places in the standard -# library that expect to be able to use tokenize with strings -def generate_tokens(readline): - return _tokenize(readline, None) - -if __name__ == "__main__": - # Quick sanity check - s = b'''def parseline(self, line): - """Parse the line into a command name and a string containing - the arguments. Returns a tuple containing (command, args, line). - 'command' and 'args' may be None if the line couldn't be parsed. - """ - line = line.strip() - if not line: - return None, None, line - elif line[0] == '?': - line = 'help ' + line[1:] - elif line[0] == '!': - if hasattr(self, 'do_shell'): - line = 'shell ' + line[1:] - else: - return None, None, line - i, n = 0, len(line) - while i < n and line[i] in self.identchars: i = i+1 - cmd, arg = line[:i], line[i:].strip() - return cmd, arg, line - ''' - for tok in tokenize(iter(s.splitlines()).__next__): - print(tok) diff --git a/IPython/utils/tokenutil.py b/IPython/utils/tokenutil.py index 6e283a5..28f8b6d 100644 --- a/IPython/utils/tokenutil.py +++ b/IPython/utils/tokenutil.py @@ -7,7 +7,7 @@ from collections import namedtuple from io import StringIO from keyword import iskeyword -from . import tokenize2 +import tokenize Token = namedtuple('Token', ['token', 'text', 'start', 'end', 'line']) @@ -15,9 +15,9 @@ Token = namedtuple('Token', ['token', 'text', 'start', 'end', 'line']) def generate_tokens(readline): """wrap generate_tokens to catch EOF errors""" try: - for token in tokenize2.generate_tokens(readline): + for token in tokenize.generate_tokens(readline): yield token - except tokenize2.TokenError: + except tokenize.TokenError: # catch EOF error return @@ -99,12 +99,12 @@ def token_at_cursor(cell, cursor_pos=0): # don't consume it break - if tok.token == tokenize2.NAME and not iskeyword(tok.text): - if names and tokens and tokens[-1].token == tokenize2.OP and tokens[-1].text == '.': + if tok.token == tokenize.NAME and not iskeyword(tok.text): + if names and tokens and tokens[-1].token == tokenize.OP and tokens[-1].text == '.': names[-1] = "%s.%s" % (names[-1], tok.text) else: names.append(tok.text) - elif tok.token == tokenize2.OP: + elif tok.token == tokenize.OP: if tok.text == '=' and names: # don't inspect the lhs of an assignment names.pop(-1)