From 30fce8a6d59c9a5984f20d787cdd9c3885d85bea 2013-03-31 09:03:55 From: Thomas Kluyver Date: 2013-03-31 09:03:55 Subject: [PATCH] Now include patched copies of tokenize for Python 2 and 3. --- diff --git a/IPython/core/inputtransformer.py b/IPython/core/inputtransformer.py index 5a6c717..f5a9fe5 100644 --- a/IPython/core/inputtransformer.py +++ b/IPython/core/inputtransformer.py @@ -2,17 +2,10 @@ import abc import functools import re from StringIO import StringIO -import tokenize - -try: - generate_tokens = tokenize.generate_tokens -except AttributeError: - # Python 3. Note that we use the undocumented _tokenize because it expects - # strings, not bytes. See also Python issue #9969. - generate_tokens = tokenize._tokenize from IPython.core.splitinput import split_user_input, LineInfo -from IPython.utils.untokenize import untokenize +from IPython.utils import tokenize2 +from IPython.utils.tokenize2 import generate_tokens, untokenize, TokenError #----------------------------------------------------------------------------- # Globals @@ -129,7 +122,7 @@ class TokenInputTransformer(InputTransformer): def get_line(self): if self.line_used: - raise tokenize.TokenError + raise TokenError self.line_used = True return self.current_line @@ -145,12 +138,12 @@ class TokenInputTransformer(InputTransformer): for intok in self.tokenizer: tokens.append(intok) t = intok[0] - if t == tokenize.NEWLINE or (stop_at_NL and t == tokenize.NL): + if t == tokenize2.NEWLINE or (stop_at_NL and t == tokenize2.NL): # Stop before we try to pull a line we don't have yet break - elif t in (tokenize.COMMENT, tokenize.ERRORTOKEN): + elif t == tokenize2.ERRORTOKEN: stop_at_NL = True - except tokenize.TokenError: + except TokenError: # Multi-line statement - stop and try again with the next line self.reset_tokenizer() return None @@ -297,11 +290,11 @@ def has_comment(src): readline = StringIO(src).readline toktypes = set() try: - for t in tokenize.generate_tokens(readline): + for t in generate_tokens(readline): toktypes.add(t[0]) - except tokenize.TokenError: + except TokenError: pass - return(tokenize.COMMENT in toktypes) + return(tokenize2.COMMENT in toktypes) @StatelessInputTransformer.wrap diff --git a/IPython/utils/_tokenize_py2.py b/IPython/utils/_tokenize_py2.py new file mode 100644 index 0000000..16a3c9d --- /dev/null +++ b/IPython/utils/_tokenize_py2.py @@ -0,0 +1,438 @@ +"""Patched version of standard library tokenize, to deal with various bugs. + +Patches + +- Relevant parts of Gareth Rees' patch for Python issue #12691 (untokenizing), + manually applied. +- Newlines in comments and blank lines should be either NL or NEWLINE, depending + on whether they are in a multi-line statement. Filed as Python issue #17061. + +------------------------------------------------------------------------------- +Tokenization help for Python programs. + +generate_tokens(readline) is a generator that breaks a stream of +text into Python tokens. It accepts a readline-like method which is called +repeatedly to get the next line of input (or "" for EOF). 
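# A minimal sketch (not part of this diff) of the readline-driven interface the
# docstring above describes, shown with the Python 3 standard-library tokenize;
# the patched generate_tokens used by inputtransformer.py is driven the same way.
from io import StringIO
import tokenize

src = "total = 1 + 2  # running sum\n"
for tok in tokenize.generate_tokens(StringIO(src).readline):
    # Each token is a 5-tuple: (type, string, (srow, scol), (erow, ecol), line).
    print(tokenize.tok_name[tok[0]], repr(tok[1]))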
It generates +5-tuples with these members: + + the token type (see token.py) + the token (a string) + the starting (row, column) indices of the token (a 2-tuple of ints) + the ending (row, column) indices of the token (a 2-tuple of ints) + the original line (string) + +It is designed to match the working of the Python tokenizer exactly, except +that it produces COMMENT tokens for comments and gives type OP for all +operators + +Older entry points + tokenize_loop(readline, tokeneater) + tokenize(readline, tokeneater=printtoken) +are the same, except instead of generating tokens, tokeneater is a callback +function to which the 5 fields described above are passed as 5 arguments, +each time a new token is found.""" + +__author__ = 'Ka-Ping Yee ' +__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, ' + 'Skip Montanaro, Raymond Hettinger') + +import string, re +from token import * + +import token +__all__ = [x for x in dir(token) if not x.startswith("_")] +__all__ += ["COMMENT", "tokenize", "generate_tokens", "NL", "untokenize"] +del x +del token + +__all__ += ["TokenError"] + +COMMENT = N_TOKENS +tok_name[COMMENT] = 'COMMENT' +NL = N_TOKENS + 1 +tok_name[NL] = 'NL' +N_TOKENS += 2 + +def group(*choices): return '(' + '|'.join(choices) + ')' +def any(*choices): return group(*choices) + '*' +def maybe(*choices): return group(*choices) + '?' + +Whitespace = r'[ \f\t]*' +Comment = r'#[^\r\n]*' +Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment) +Name = r'[a-zA-Z_]\w*' + +Hexnumber = r'0[xX][\da-fA-F]+[lL]?' +Octnumber = r'(0[oO][0-7]+)|(0[0-7]*)[lL]?' +Binnumber = r'0[bB][01]+[lL]?' +Decnumber = r'[1-9]\d*[lL]?' +Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber) +Exponent = r'[eE][-+]?\d+' +Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent) +Expfloat = r'\d+' + Exponent +Floatnumber = group(Pointfloat, Expfloat) +Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]') +Number = group(Imagnumber, Floatnumber, Intnumber) + +# Tail end of ' string. +Single = r"[^'\\]*(?:\\.[^'\\]*)*'" +# Tail end of " string. +Double = r'[^"\\]*(?:\\.[^"\\]*)*"' +# Tail end of ''' string. +Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''" +# Tail end of """ string. +Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""' +Triple = group("[uUbB]?[rR]?'''", '[uUbB]?[rR]?"""') +# Single-line ' or " string. +String = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'", + r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"') + +# Because of leftmost-then-longest match semantics, be sure to put the +# longest operators first (e.g., if = came before ==, == would get +# recognized as two instances of =). +Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=", + r"//=?", + r"[+\-*/%&|^=<>]=?", + r"~") + +Bracket = '[][(){}]' +Special = group(r'\r?\n', r'[:;.,`@]') +Funny = group(Operator, Bracket, Special) + +PlainToken = group(Number, Funny, String, Name) +Token = Ignore + PlainToken + +# First (or only) line of ' or " string. 
+ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" + + group("'", r'\\\r?\n'), + r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' + + group('"', r'\\\r?\n')) +PseudoExtras = group(r'\\\r?\n', Comment, Triple) +PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name) + +tokenprog, pseudoprog, single3prog, double3prog = map( + re.compile, (Token, PseudoToken, Single3, Double3)) +endprogs = {"'": re.compile(Single), '"': re.compile(Double), + "'''": single3prog, '"""': double3prog, + "r'''": single3prog, 'r"""': double3prog, + "u'''": single3prog, 'u"""': double3prog, + "ur'''": single3prog, 'ur"""': double3prog, + "R'''": single3prog, 'R"""': double3prog, + "U'''": single3prog, 'U"""': double3prog, + "uR'''": single3prog, 'uR"""': double3prog, + "Ur'''": single3prog, 'Ur"""': double3prog, + "UR'''": single3prog, 'UR"""': double3prog, + "b'''": single3prog, 'b"""': double3prog, + "br'''": single3prog, 'br"""': double3prog, + "B'''": single3prog, 'B"""': double3prog, + "bR'''": single3prog, 'bR"""': double3prog, + "Br'''": single3prog, 'Br"""': double3prog, + "BR'''": single3prog, 'BR"""': double3prog, + 'r': None, 'R': None, 'u': None, 'U': None, + 'b': None, 'B': None} + +triple_quoted = {} +for t in ("'''", '"""', + "r'''", 'r"""', "R'''", 'R"""', + "u'''", 'u"""', "U'''", 'U"""', + "ur'''", 'ur"""', "Ur'''", 'Ur"""', + "uR'''", 'uR"""', "UR'''", 'UR"""', + "b'''", 'b"""', "B'''", 'B"""', + "br'''", 'br"""', "Br'''", 'Br"""', + "bR'''", 'bR"""', "BR'''", 'BR"""'): + triple_quoted[t] = t +single_quoted = {} +for t in ("'", '"', + "r'", 'r"', "R'", 'R"', + "u'", 'u"', "U'", 'U"', + "ur'", 'ur"', "Ur'", 'Ur"', + "uR'", 'uR"', "UR'", 'UR"', + "b'", 'b"', "B'", 'B"', + "br'", 'br"', "Br'", 'Br"', + "bR'", 'bR"', "BR'", 'BR"' ): + single_quoted[t] = t + +tabsize = 8 + +class TokenError(Exception): pass + +class StopTokenizing(Exception): pass + +def printtoken(type, token, srow_scol, erow_ecol, line): # for testing + srow, scol = srow_scol + erow, ecol = erow_ecol + print "%d,%d-%d,%d:\t%s\t%s" % \ + (srow, scol, erow, ecol, tok_name[type], repr(token)) + +def tokenize(readline, tokeneater=printtoken): + """ + The tokenize() function accepts two parameters: one representing the + input stream, and one providing an output mechanism for tokenize(). + + The first parameter, readline, must be a callable object which provides + the same interface as the readline() method of built-in file objects. + Each call to the function should return one line of input as a string. + + The second parameter, tokeneater, must also be a callable object. It is + called once for each token, with five arguments, corresponding to the + tuples generated by generate_tokens(). 
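# The older callback entry point described above (Python 2 era) hands the same
# five fields to a tokeneater function; a hedged sketch, assuming the patched
# module is importable as IPython.utils.tokenize2 and re-exports both names.
from IPython.utils.tokenize2 import tokenize, tok_name  # assumed import path

def print_comments(tok_type, tok_string, start, end, line):
    # start and end are (row, col) pairs; line is the full source line.
    if tok_name[tok_type] == 'COMMENT':
        print("%d,%d: %s" % (start[0], start[1], tok_string))

# tokenize(readline, tokeneater) calls the callback once per token, e.g.:
# tokenize(open('some_file.py').readline, print_comments)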
+ """ + try: + tokenize_loop(readline, tokeneater) + except StopTokenizing: + pass + +# backwards compatible interface +def tokenize_loop(readline, tokeneater): + for token_info in generate_tokens(readline): + tokeneater(*token_info) + +class Untokenizer: + + def __init__(self): + self.tokens = [] + self.prev_row = 1 + self.prev_col = 0 + + def add_whitespace(self, start): + row, col = start + assert row >= self.prev_row + col_offset = col - self.prev_col + if col_offset > 0: + self.tokens.append(" " * col_offset) + elif row > self.prev_row and tok_type not in (NEWLINE, NL, ENDMARKER): + # Line was backslash-continued + self.tokens.append(" ") + + def untokenize(self, tokens): + iterable = iter(tokens) + for t in iterable: + if len(t) == 2: + self.compat(t, iterable) + break + tok_type, token, start, end = t[:4] + self.add_whitespace(start) + self.tokens.append(token) + self.prev_row, self.prev_col = end + if tok_type in (NEWLINE, NL): + self.prev_row += 1 + self.prev_col = 0 + return "".join(self.tokens) + + def compat(self, token, iterable): + # This import is here to avoid problems when the itertools + # module is not built yet and tokenize is imported. + from itertools import chain + startline = False + prevstring = False + indents = [] + toks_append = self.tokens.append + for tok in chain([token], iterable): + toknum, tokval = tok[:2] + + if toknum in (NAME, NUMBER): + tokval += ' ' + + # Insert a space between two consecutive strings + if toknum == STRING: + if prevstring: + tokval = ' ' + tokval + prevstring = True + else: + prevstring = False + + if toknum == INDENT: + indents.append(tokval) + continue + elif toknum == DEDENT: + indents.pop() + continue + elif toknum in (NEWLINE, NL): + startline = True + elif startline and indents: + toks_append(indents[-1]) + startline = False + toks_append(tokval) + +def untokenize(iterable): + """Transform tokens back into Python source code. + + Each element returned by the iterable must be a token sequence + with at least two elements, a token number and token value. If + only two tokens are passed, the resulting output is poor. + + Round-trip invariant for full input: + Untokenized source will match input source exactly + + Round-trip invariant for limited intput: + # Output text will tokenize the back to the input + t1 = [tok[:2] for tok in generate_tokens(f.readline)] + newcode = untokenize(t1) + readline = iter(newcode.splitlines(1)).next + t2 = [tok[:2] for tok in generate_tokens(readline)] + assert t1 == t2 + """ + ut = Untokenizer() + return ut.untokenize(iterable) + +def generate_tokens(readline): + """ + The generate_tokens() generator requires one argment, readline, which + must be a callable object which provides the same interface as the + readline() method of built-in file objects. Each call to the function + should return one line of input as a string. Alternately, readline + can be a callable function terminating with StopIteration: + readline = open(myfile).next # Example of alternate readline + + The generator produces 5-tuples with these members: the token type; the + token string; a 2-tuple (srow, scol) of ints specifying the row and + column where the token begins in the source; a 2-tuple (erow, ecol) of + ints specifying the row and column where the token ends in the source; + and the line on which the token was found. The line passed is the + logical line; continuation lines are included. 
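# The round-trip invariants stated in the untokenize() docstring above can be
# exercised directly; a sketch for Python 3, assuming the patched module is
# importable as IPython.utils.tokenize2.
from io import StringIO
from IPython.utils.tokenize2 import generate_tokens, untokenize  # assumed path

src = "if x:\n    y = 1  # comment\n"
tokens = list(generate_tokens(StringIO(src).readline))

# Full 5-tuples carry positions, so the original spacing is reproduced exactly.
assert untokenize(tokens) == src

# 2-tuples (type, string) only guarantee that the output re-tokenizes the same.
pairs = [tok[:2] for tok in tokens]
roundtrip = untokenize(pairs)
assert [t[:2] for t in generate_tokens(StringIO(roundtrip).readline)] == pairs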
+ """ + lnum = parenlev = continued = 0 + namechars, numchars = string.ascii_letters + '_', '0123456789' + contstr, needcont = '', 0 + contline = None + indents = [0] + + while 1: # loop over lines in stream + try: + line = readline() + except StopIteration: + line = '' + lnum += 1 + pos, max = 0, len(line) + + if contstr: # continued string + if not line: + raise TokenError, ("EOF in multi-line string", strstart) + endmatch = endprog.match(line) + if endmatch: + pos = end = endmatch.end(0) + yield (STRING, contstr + line[:end], + strstart, (lnum, end), contline + line) + contstr, needcont = '', 0 + contline = None + elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n': + yield (ERRORTOKEN, contstr + line, + strstart, (lnum, len(line)), contline) + contstr = '' + contline = None + continue + else: + contstr = contstr + line + contline = contline + line + continue + + elif parenlev == 0 and not continued: # new statement + if not line: break + column = 0 + while pos < max: # measure leading whitespace + if line[pos] == ' ': + column += 1 + elif line[pos] == '\t': + column = (column//tabsize + 1)*tabsize + elif line[pos] == '\f': + column = 0 + else: + break + pos += 1 + if pos == max: + break + + if line[pos] in '#\r\n': # skip comments or blank lines + if line[pos] == '#': + comment_token = line[pos:].rstrip('\r\n') + nl_pos = pos + len(comment_token) + yield (COMMENT, comment_token, + (lnum, pos), (lnum, pos + len(comment_token)), line) + yield (NEWLINE, line[nl_pos:], + (lnum, nl_pos), (lnum, len(line)), line) + else: + yield (NEWLINE, line[pos:], + (lnum, pos), (lnum, len(line)), line) + continue + + if column > indents[-1]: # count indents or dedents + indents.append(column) + yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line) + while column < indents[-1]: + if column not in indents: + raise IndentationError( + "unindent does not match any outer indentation level", + ("", lnum, pos, line)) + indents = indents[:-1] + yield (DEDENT, '', (lnum, pos), (lnum, pos), line) + + else: # continued statement + if not line: + raise TokenError, ("EOF in multi-line statement", (lnum, 0)) + continued = 0 + + while pos < max: + pseudomatch = pseudoprog.match(line, pos) + if pseudomatch: # scan for tokens + start, end = pseudomatch.span(1) + spos, epos, pos = (lnum, start), (lnum, end), end + token, initial = line[start:end], line[start] + + if initial in numchars or \ + (initial == '.' 
and token != '.'): # ordinary number + yield (NUMBER, token, spos, epos, line) + elif initial in '\r\n': + yield (NL if parenlev > 0 else NEWLINE, + token, spos, epos, line) + elif initial == '#': + assert not token.endswith("\n") + yield (COMMENT, token, spos, epos, line) + elif token in triple_quoted: + endprog = endprogs[token] + endmatch = endprog.match(line, pos) + if endmatch: # all on one line + pos = endmatch.end(0) + token = line[start:pos] + yield (STRING, token, spos, (lnum, pos), line) + else: + strstart = (lnum, start) # multiple lines + contstr = line[start:] + contline = line + break + elif initial in single_quoted or \ + token[:2] in single_quoted or \ + token[:3] in single_quoted: + if token[-1] == '\n': # continued string + strstart = (lnum, start) + endprog = (endprogs[initial] or endprogs[token[1]] or + endprogs[token[2]]) + contstr, needcont = line[start:], 1 + contline = line + break + else: # ordinary string + yield (STRING, token, spos, epos, line) + elif initial in namechars: # ordinary name + yield (NAME, token, spos, epos, line) + elif initial == '\\': # continued stmt + continued = 1 + else: + if initial in '([{': + parenlev += 1 + elif initial in ')]}': + parenlev -= 1 + yield (OP, token, spos, epos, line) + else: + yield (ERRORTOKEN, line[pos], + (lnum, pos), (lnum, pos+1), line) + pos += 1 + + for indent in indents[1:]: # pop remaining indent levels + yield (DEDENT, '', (lnum, 0), (lnum, 0), '') + yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '') + +if __name__ == '__main__': # testing + import sys + if len(sys.argv) > 1: + tokenize(open(sys.argv[1]).readline) + else: + tokenize(sys.stdin.readline) diff --git a/IPython/utils/_tokenize_py3.py b/IPython/utils/_tokenize_py3.py new file mode 100644 index 0000000..a596fe4 --- /dev/null +++ b/IPython/utils/_tokenize_py3.py @@ -0,0 +1,574 @@ +"""Patched version of standard library tokenize, to deal with various bugs. + +Based on Python 3.2 code. + +Patches: + +- Gareth Rees' patch for Python issue #12691 (untokenizing) + - Except we don't encode the output of untokenize + - Python 2 compatible syntax, so that it can be byte-compiled at installation +- Newlines in comments and blank lines should be either NL or NEWLINE, depending + on whether they are in a multi-line statement. Filed as Python issue #17061. +- Export generate_tokens & TokenError + +------------------------------------------------------------------------------ +Tokenization help for Python programs. + +tokenize(readline) is a generator that breaks a stream of bytes into +Python tokens. It decodes the bytes according to PEP-0263 for +determining source file encoding. + +It accepts a readline-like method which is called repeatedly to get the +next line of input (or b"" for EOF). It generates 5-tuples with these +members: + + the token type (see token.py) + the token (a string) + the starting (row, column) indices of the token (a 2-tuple of ints) + the ending (row, column) indices of the token (a 2-tuple of ints) + the original line (string) + +It is designed to match the working of the Python tokenizer exactly, except +that it produces COMMENT tokens for comments and gives type OP for all +operators. Additionally, all token lists start with an ENCODING token +which tells you which encoding was used to decode the bytes stream. 
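# A minimal sketch of the bytes-oriented interface described above, shown with
# the Python 3 standard-library tokenize (the patched copy mirrors this shape):
# the first token of every stream reports the encoding used to decode the bytes.
import io
import tokenize

source = "x = 'h\u00e9llo'\n".encode('utf-8')
tokens = list(tokenize.tokenize(io.BytesIO(source).readline))

assert tokens[0].type == tokenize.ENCODING
assert tokens[0].string == 'utf-8'  # default when no BOM or coding cookie is present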
+""" +from __future__ import absolute_import + +__author__ = 'Ka-Ping Yee ' +__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, ' + 'Skip Montanaro, Raymond Hettinger, Trent Nelson, ' + 'Michael Foord') +import builtins +import re +import sys +from token import * +from codecs import lookup, BOM_UTF8 +import collections +from io import TextIOWrapper +cookie_re = re.compile("coding[:=]\s*([-\w.]+)") + +import token +__all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding", + "NL", "untokenize", "ENCODING", "TokenInfo"] +del token + +__all__ += ["generate_tokens", "TokenError"] + +COMMENT = N_TOKENS +tok_name[COMMENT] = 'COMMENT' +NL = N_TOKENS + 1 +tok_name[NL] = 'NL' +ENCODING = N_TOKENS + 2 +tok_name[ENCODING] = 'ENCODING' +N_TOKENS += 3 + +class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')): + def __repr__(self): + annotated_type = '%d (%s)' % (self.type, tok_name[self.type]) + return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' % + self._replace(type=annotated_type)) + +def group(*choices): return '(' + '|'.join(choices) + ')' +def any(*choices): return group(*choices) + '*' +def maybe(*choices): return group(*choices) + '?' + +# Note: we use unicode matching for names ("\w") but ascii matching for +# number literals. +Whitespace = r'[ \f\t]*' +Comment = r'#[^\r\n]*' +Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment) +Name = r'\w+' + +Hexnumber = r'0[xX][0-9a-fA-F]+' +Binnumber = r'0[bB][01]+' +Octnumber = r'0[oO][0-7]+' +Decnumber = r'(?:0+|[1-9][0-9]*)' +Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber) +Exponent = r'[eE][-+]?[0-9]+' +Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent) +Expfloat = r'[0-9]+' + Exponent +Floatnumber = group(Pointfloat, Expfloat) +Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]') +Number = group(Imagnumber, Floatnumber, Intnumber) + +# Tail end of ' string. +Single = r"[^'\\]*(?:\\.[^'\\]*)*'" +# Tail end of " string. +Double = r'[^"\\]*(?:\\.[^"\\]*)*"' +# Tail end of ''' string. +Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''" +# Tail end of """ string. +Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""' +Triple = group("[bB]?[rR]?'''", '[bB]?[rR]?"""') +# Single-line ' or " string. +String = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'", + r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"') + +# Because of leftmost-then-longest match semantics, be sure to put the +# longest operators first (e.g., if = came before ==, == would get +# recognized as two instances of =). +Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=", + r"//=?", r"->", + r"[+\-*/%&|^=<>]=?", + r"~") + +Bracket = '[][(){}]' +Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]') +Funny = group(Operator, Bracket, Special) + +PlainToken = group(Number, Funny, String, Name) +Token = Ignore + PlainToken + +# First (or only) line of ' or " string. 
+ContStr = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" + + group("'", r'\\\r?\n'), + r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' + + group('"', r'\\\r?\n')) +PseudoExtras = group(r'\\\r?\n', Comment, Triple) +PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name) + +def _compile(expr): + return re.compile(expr, re.UNICODE) + +tokenprog, pseudoprog, single3prog, double3prog = map( + _compile, (Token, PseudoToken, Single3, Double3)) +endprogs = {"'": _compile(Single), '"': _compile(Double), + "'''": single3prog, '"""': double3prog, + "r'''": single3prog, 'r"""': double3prog, + "b'''": single3prog, 'b"""': double3prog, + "br'''": single3prog, 'br"""': double3prog, + "R'''": single3prog, 'R"""': double3prog, + "B'''": single3prog, 'B"""': double3prog, + "bR'''": single3prog, 'bR"""': double3prog, + "Br'''": single3prog, 'Br"""': double3prog, + "BR'''": single3prog, 'BR"""': double3prog, + 'r': None, 'R': None, 'b': None, 'B': None} + +triple_quoted = {} +for t in ("'''", '"""', + "r'''", 'r"""', "R'''", 'R"""', + "b'''", 'b"""', "B'''", 'B"""', + "br'''", 'br"""', "Br'''", 'Br"""', + "bR'''", 'bR"""', "BR'''", 'BR"""'): + triple_quoted[t] = t +single_quoted = {} +for t in ("'", '"', + "r'", 'r"', "R'", 'R"', + "b'", 'b"', "B'", 'B"', + "br'", 'br"', "Br'", 'Br"', + "bR'", 'bR"', "BR'", 'BR"' ): + single_quoted[t] = t + +del _compile + +tabsize = 8 + +class TokenError(Exception): pass + +class StopTokenizing(Exception): pass + + +class Untokenizer: + + def __init__(self): + self.tokens = [] + self.prev_row = 1 + self.prev_col = 0 + self.encoding = 'utf-8' + + def add_whitespace(self, tok_type, start): + row, col = start + assert row >= self.prev_row + col_offset = col - self.prev_col + if col_offset > 0: + self.tokens.append(" " * col_offset) + elif row > self.prev_row and tok_type not in (NEWLINE, NL, ENDMARKER): + # Line was backslash-continued. + self.tokens.append(" ") + + def untokenize(self, tokens): + iterable = iter(tokens) + for t in iterable: + if len(t) == 2: + self.compat(t, iterable) + break + tok_type, token, start, end = t[:4] + if tok_type == ENCODING: + self.encoding = token + continue + self.add_whitespace(tok_type, start) + self.tokens.append(token) + self.prev_row, self.prev_col = end + if tok_type in (NEWLINE, NL): + self.prev_row += 1 + self.prev_col = 0 + return "".join(self.tokens) + + def compat(self, token, iterable): + # This import is here to avoid problems when the itertools + # module is not built yet and tokenize is imported. + from itertools import chain + startline = False + prevstring = False + indents = [] + toks_append = self.tokens.append + + for tok in chain([token], iterable): + toknum, tokval = tok[:2] + if toknum == ENCODING: + self.encoding = tokval + continue + + if toknum in (NAME, NUMBER): + tokval += ' ' + + # Insert a space between two consecutive strings + if toknum == STRING: + if prevstring: + tokval = ' ' + tokval + prevstring = True + else: + prevstring = False + + if toknum == INDENT: + indents.append(tokval) + continue + elif toknum == DEDENT: + indents.pop() + continue + elif toknum in (NEWLINE, NL): + startline = True + elif startline and indents: + toks_append(indents[-1]) + startline = False + toks_append(tokval) + + +def untokenize(tokens): + """ + Convert ``tokens`` (an iterable) back into Python source code. Return + a bytes object, encoded using the encoding specified by the last + ENCODING token in ``tokens``, or UTF-8 if no ENCODING token is found. 
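# The add_whitespace() change above (part of the issue 12691 patch) inserts a
# single space when a token starts on a later row without an intervening
# NEWLINE/NL token, which is what a backslash-continued line produces; a sketch
# for Python 3, assuming the patched module is importable as IPython.utils.tokenize2.
from io import StringIO
from IPython.utils.tokenize2 import generate_tokens, untokenize  # assumed path

src = "value = 1 + \\\n    2\n"
tokens = list(generate_tokens(StringIO(src).readline))

# The backslash continuation collapses to a single separating space rather than
# the trailing token being glued onto the previous one;
# expected output: 'value = 1 + 2\n'
print(repr(untokenize(tokens)))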
+ + The result is guaranteed to tokenize back to match the input so that + the conversion is lossless and round-trips are assured. The + guarantee applies only to the token type and token string as the + spacing between tokens (column positions) may change. + + :func:`untokenize` has two modes. If the input tokens are sequences + of length 2 (``type``, ``string``) then spaces are added as necessary to + preserve the round-trip property. + + If the input tokens are sequences of length 4 or more (``type``, + ``string``, ``start``, ``end``), as returned by :func:`tokenize`, then + spaces are added so that each token appears in the result at the + position indicated by ``start`` and ``end``, if possible. + """ + return Untokenizer().untokenize(tokens) + + +def _get_normal_name(orig_enc): + """Imitates get_normal_name in tokenizer.c.""" + # Only care about the first 12 characters. + enc = orig_enc[:12].lower().replace("_", "-") + if enc == "utf-8" or enc.startswith("utf-8-"): + return "utf-8" + if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \ + enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")): + return "iso-8859-1" + return orig_enc + +def detect_encoding(readline): + """ + The detect_encoding() function is used to detect the encoding that should + be used to decode a Python source file. It requires one argment, readline, + in the same way as the tokenize() generator. + + It will call readline a maximum of twice, and return the encoding used + (as a string) and a list of any lines (left as bytes) it has read in. + + It detects the encoding from the presence of a utf-8 bom or an encoding + cookie as specified in pep-0263. If both a bom and a cookie are present, + but disagree, a SyntaxError will be raised. If the encoding cookie is an + invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found, + 'utf-8-sig' is returned. + + If no encoding is specified, then the default of 'utf-8' will be returned. + """ + bom_found = False + encoding = None + default = 'utf-8' + def read_or_stop(): + try: + return readline() + except StopIteration: + return b'' + + def find_cookie(line): + try: + # Decode as UTF-8. Either the line is an encoding declaration, + # in which case it should be pure ASCII, or it must be UTF-8 + # per default encoding. + line_string = line.decode('utf-8') + except UnicodeDecodeError: + raise SyntaxError("invalid or missing encoding declaration") + + matches = cookie_re.findall(line_string) + if not matches: + return None + encoding = _get_normal_name(matches[0]) + try: + codec = lookup(encoding) + except LookupError: + # This behaviour mimics the Python interpreter + raise SyntaxError("unknown encoding: " + encoding) + + if bom_found: + if encoding != 'utf-8': + # This behaviour mimics the Python interpreter + raise SyntaxError('encoding problem: utf-8') + encoding += '-sig' + return encoding + + first = read_or_stop() + if first.startswith(BOM_UTF8): + bom_found = True + first = first[3:] + default = 'utf-8-sig' + if not first: + return default, [] + + encoding = find_cookie(first) + if encoding: + return encoding, [first] + + second = read_or_stop() + if not second: + return default, [first] + + encoding = find_cookie(second) + if encoding: + return encoding, [first, second] + + return default, [first, second] + + +def open(filename): + """Open a file in read only mode using the encoding detected by + detect_encoding(). 
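# detect_encoding() as described above can be exercised on an in-memory buffer;
# a minimal sketch using the Python 3 standard-library function, which has the
# same contract as the copy in this patch.
import io
import tokenize

source = b"# -*- coding: latin-1 -*-\nname = 'value'\n"
encoding, lines = tokenize.detect_encoding(io.BytesIO(source).readline)

# The PEP 263 cookie on the first line wins, and the lines already consumed are
# returned so the caller can replay them to the tokenizer.
assert encoding == 'iso-8859-1'
assert lines == [b"# -*- coding: latin-1 -*-\n"]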
+ """ + buffer = builtins.open(filename, 'rb') + encoding, lines = detect_encoding(buffer.readline) + buffer.seek(0) + text = TextIOWrapper(buffer, encoding, line_buffering=True) + text.mode = 'r' + return text + + +def tokenize(readline): + """ + The tokenize() generator requires one argment, readline, which + must be a callable object which provides the same interface as the + readline() method of built-in file objects. Each call to the function + should return one line of input as bytes. Alternately, readline + can be a callable function terminating with StopIteration: + readline = open(myfile, 'rb').__next__ # Example of alternate readline + + The generator produces 5-tuples with these members: the token type; the + token string; a 2-tuple (srow, scol) of ints specifying the row and + column where the token begins in the source; a 2-tuple (erow, ecol) of + ints specifying the row and column where the token ends in the source; + and the line on which the token was found. The line passed is the + logical line; continuation lines are included. + + The first token sequence will always be an ENCODING token + which tells you which encoding was used to decode the bytes stream. + """ + # This import is here to avoid problems when the itertools module is not + # built yet and tokenize is imported. + from itertools import chain, repeat + encoding, consumed = detect_encoding(readline) + rl_gen = iter(readline, b"") + empty = repeat(b"") + return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding) + + +def _tokenize(readline, encoding): + lnum = parenlev = continued = 0 + numchars = '0123456789' + contstr, needcont = '', 0 + contline = None + indents = [0] + + if encoding is not None: + if encoding == "utf-8-sig": + # BOM will already have been stripped. 
+ encoding = "utf-8" + yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '') + while True: # loop over lines in stream + try: + line = readline() + except StopIteration: + line = b'' + + if encoding is not None: + line = line.decode(encoding) + lnum += 1 + pos, max = 0, len(line) + + if contstr: # continued string + if not line: + raise TokenError("EOF in multi-line string", strstart) + endmatch = endprog.match(line) + if endmatch: + pos = end = endmatch.end(0) + yield TokenInfo(STRING, contstr + line[:end], + strstart, (lnum, end), contline + line) + contstr, needcont = '', 0 + contline = None + elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n': + yield TokenInfo(ERRORTOKEN, contstr + line, + strstart, (lnum, len(line)), contline) + contstr = '' + contline = None + continue + else: + contstr = contstr + line + contline = contline + line + continue + + elif parenlev == 0 and not continued: # new statement + if not line: break + column = 0 + while pos < max: # measure leading whitespace + if line[pos] == ' ': + column += 1 + elif line[pos] == '\t': + column = (column//tabsize + 1)*tabsize + elif line[pos] == '\f': + column = 0 + else: + break + pos += 1 + if pos == max: + break + + if line[pos] in '#\r\n': # skip comments or blank lines + if line[pos] == '#': + comment_token = line[pos:].rstrip('\r\n') + nl_pos = pos + len(comment_token) + yield TokenInfo(COMMENT, comment_token, + (lnum, pos), (lnum, pos + len(comment_token)), line) + yield TokenInfo(NEWLINE, line[nl_pos:], + (lnum, nl_pos), (lnum, len(line)), line) + else: + yield TokenInfo(NEWLINE, line[pos:], + (lnum, pos), (lnum, len(line)), line) + continue + + if column > indents[-1]: # count indents or dedents + indents.append(column) + yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line) + while column < indents[-1]: + if column not in indents: + raise IndentationError( + "unindent does not match any outer indentation level", + ("", lnum, pos, line)) + indents = indents[:-1] + yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line) + + else: # continued statement + if not line: + raise TokenError("EOF in multi-line statement", (lnum, 0)) + continued = 0 + + while pos < max: + pseudomatch = pseudoprog.match(line, pos) + if pseudomatch: # scan for tokens + start, end = pseudomatch.span(1) + spos, epos, pos = (lnum, start), (lnum, end), end + token, initial = line[start:end], line[start] + + if (initial in numchars or # ordinary number + (initial == '.' and token != '.' 
and token != '...')): + yield TokenInfo(NUMBER, token, spos, epos, line) + elif initial in '\r\n': + yield TokenInfo(NL if parenlev > 0 else NEWLINE, + token, spos, epos, line) + elif initial == '#': + assert not token.endswith("\n") + yield TokenInfo(COMMENT, token, spos, epos, line) + elif token in triple_quoted: + endprog = endprogs[token] + endmatch = endprog.match(line, pos) + if endmatch: # all on one line + pos = endmatch.end(0) + token = line[start:pos] + yield TokenInfo(STRING, token, spos, (lnum, pos), line) + else: + strstart = (lnum, start) # multiple lines + contstr = line[start:] + contline = line + break + elif initial in single_quoted or \ + token[:2] in single_quoted or \ + token[:3] in single_quoted: + if token[-1] == '\n': # continued string + strstart = (lnum, start) + endprog = (endprogs[initial] or endprogs[token[1]] or + endprogs[token[2]]) + contstr, needcont = line[start:], 1 + contline = line + break + else: # ordinary string + yield TokenInfo(STRING, token, spos, epos, line) + elif initial.isidentifier(): # ordinary name + yield TokenInfo(NAME, token, spos, epos, line) + elif initial == '\\': # continued stmt + continued = 1 + else: + if initial in '([{': + parenlev += 1 + elif initial in ')]}': + parenlev -= 1 + yield TokenInfo(OP, token, spos, epos, line) + else: + yield TokenInfo(ERRORTOKEN, line[pos], + (lnum, pos), (lnum, pos+1), line) + pos += 1 + + for indent in indents[1:]: # pop remaining indent levels + yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '') + yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '') + + +# An undocumented, backwards compatible, API for all the places in the standard +# library that expect to be able to use tokenize with strings +def generate_tokens(readline): + return _tokenize(readline, None) + +if __name__ == "__main__": + # Quick sanity check + s = b'''def parseline(self, line): + """Parse the line into a command name and a string containing + the arguments. Returns a tuple containing (command, args, line). + 'command' and 'args' may be None if the line couldn't be parsed. + """ + line = line.strip() + if not line: + return None, None, line + elif line[0] == '?': + line = 'help ' + line[1:] + elif line[0] == '!': + if hasattr(self, 'do_shell'): + line = 'shell ' + line[1:] + else: + return None, None, line + i, n = 0, len(line) + while i < n and line[i] in self.identchars: i = i+1 + cmd, arg = line[:i], line[i:].strip() + return cmd, arg, line + ''' + for tok in tokenize(iter(s.splitlines()).__next__): + print(tok) diff --git a/IPython/utils/tokenize2.py b/IPython/utils/tokenize2.py new file mode 100644 index 0000000..d44a2f7 --- /dev/null +++ b/IPython/utils/tokenize2.py @@ -0,0 +1,9 @@ +"""Load our patched versions of tokenize. +""" + +import sys + +if sys.version_info[0] >= 3: + from _tokenize_py3 import * +else: + from _tokenize_py2 import * diff --git a/IPython/utils/untokenize.py b/IPython/utils/untokenize.py deleted file mode 100644 index ee87f89..0000000 --- a/IPython/utils/untokenize.py +++ /dev/null @@ -1,125 +0,0 @@ -"""This is a patched copy of the untokenize machinery from the standard library. - -untokenize has a number of major bugs that render it almost useless. We're using -the patch written by Gareth Rees on Python issue 12961: - -http://bugs.python.org/issue12691 - -We've undone one part of the patch - it encoded the output to bytes, to neatly -round-trip from tokenize. We want to keep working with text, so we don't encode. 
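# As noted above, IPython's copy keeps untokenize() working with text; a sketch,
# assuming the replacement IPython.utils.tokenize2 module behaves the same way.
from io import StringIO
from IPython.utils.tokenize2 import generate_tokens, untokenize  # assumed path

tokens = generate_tokens(StringIO("a = b\n").readline)
result = untokenize(tokens)

# No ENCODING token is involved when tokenizing text, so the result stays a str
# rather than being encoded to bytes as the stdlib Python 3 round-trip would do.
assert isinstance(result, str)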
-""" - -__author__ = 'Ka-Ping Yee ' -__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, ' - 'Skip Montanaro, Raymond Hettinger, Trent Nelson, ' - 'Michael Foord') -from token import * - - -from tokenize import COMMENT, NL - -try: - # Python 3 - from tokenize import ENCODING -except: - ENCODING = 987654321 - -class Untokenizer: - - def __init__(self): - self.tokens = [] - self.prev_row = 1 - self.prev_col = 0 - self.encoding = 'utf-8' - - def add_whitespace(self, tok_type, start): - row, col = start - assert row >= self.prev_row - col_offset = col - self.prev_col - if col_offset > 0: - self.tokens.append(" " * col_offset) - elif row > self.prev_row and tok_type not in (NEWLINE, NL, ENDMARKER): - # Line was backslash-continued. - self.tokens.append(" ") - - def untokenize(self, tokens): - iterable = iter(tokens) - for t in iterable: - if len(t) == 2: - self.compat(t, iterable) - break - # IPython modification - valid Python 2 syntax - tok_type, token, start, end = t[:4] - if tok_type == ENCODING: - self.encoding = token - continue - self.add_whitespace(tok_type, start) - self.tokens.append(token) - self.prev_row, self.prev_col = end - if tok_type in (NEWLINE, NL): - self.prev_row += 1 - self.prev_col = 0 - # IPython modification - don't encode output - return "".join(self.tokens) - - def compat(self, token, iterable): - # This import is here to avoid problems when the itertools - # module is not built yet and tokenize is imported. - from itertools import chain - startline = False - prevstring = False - indents = [] - toks_append = self.tokens.append - - for tok in chain([token], iterable): - toknum, tokval = tok[:2] - if toknum == ENCODING: - self.encoding = tokval - continue - - if toknum in (NAME, NUMBER): - tokval += ' ' - - # Insert a space between two consecutive strings - if toknum == STRING: - if prevstring: - tokval = ' ' + tokval - prevstring = True - else: - prevstring = False - - if toknum == INDENT: - indents.append(tokval) - continue - elif toknum == DEDENT: - indents.pop() - continue - elif toknum in (NEWLINE, NL): - startline = True - elif startline and indents: - toks_append(indents[-1]) - startline = False - toks_append(tokval) - - -def untokenize(tokens): - """ - Convert ``tokens`` (an iterable) back into Python source code. Return - a bytes object, encoded using the encoding specified by the last - ENCODING token in ``tokens``, or UTF-8 if no ENCODING token is found. - - The result is guaranteed to tokenize back to match the input so that - the conversion is lossless and round-trips are assured. The - guarantee applies only to the token type and token string as the - spacing between tokens (column positions) may change. - - :func:`untokenize` has two modes. If the input tokens are sequences - of length 2 (``type``, ``string``) then spaces are added as necessary to - preserve the round-trip property. - - If the input tokens are sequences of length 4 or more (``type``, - ``string``, ``start``, ``end``), as returned by :func:`tokenize`, then - spaces are added so that each token appears in the result at the - position indicated by ``start`` and ``end``, if possible. - """ - return Untokenizer().untokenize(tokens)