From ee6d0e6a95a5f8b2c6d4bcfe21418318dd6e85bb 2017-02-09 20:43:54
From: Matthias Bussonnier <bussonniermatthias@gmail.com>
Date: 2017-02-09 20:43:54
Subject: [PATCH] Merge pull request #10270 from takluyver/tokenize-indent

Calculate indentation based on tokens, not regexes
---

diff --git a/IPython/core/inputsplitter.py b/IPython/core/inputsplitter.py
index 2d0ca2d..267b73a 100644
--- a/IPython/core/inputsplitter.py
+++ b/IPython/core/inputsplitter.py
@@ -18,8 +18,10 @@ For more details, see the class docstrings below.
 # Distributed under the terms of the Modified BSD License.
 import ast
 import codeop
+import io
 import re
 import sys
+import tokenize
 import warnings
 
 from IPython.utils.py3compat import cast_unicode
@@ -87,6 +89,113 @@ def num_ini_spaces(s):
     else:
         return 0
 
+# Fake token types for partial_tokens(), numbered upwards from tokenize.N_TOKENS:
+INCOMPLETE_STRING = tokenize.N_TOKENS
+IN_MULTILINE_STATEMENT = tokenize.N_TOKENS + 1
+
+# The 2 classes below have the same API as TokenInfo, but don't try to look up
+# a token type name that they won't find.
+class IncompleteString:
+    # Pseudo-token that partial_tokens() yields when the input ends inside a
+    # string; `s` holds the text from the end of the last real token onwards.
+    exact_type = type = INCOMPLETE_STRING
+    def __init__(self, s, start, end, line):
+        self.s, self.line = s, line
+        self.start, self.end = start, end
+
+class InMultilineStatement:
+    # Zero-width pseudo-token: the input ended part-way through a statement.
+    exact_type = type = IN_MULTILINE_STATEMENT
+    def __init__(self, pos, line):
+        self.s, self.line = '', line
+        self.start = self.end = pos
+
+def partial_tokens(s):
+    """Yield tokens from source code that may stop part-way through.
+
+    Ordinary tokens come straight from tokenize. If tokenize reports EOF inside
+    a string or statement, a final INCOMPLETE_STRING or IN_MULTILINE_STATEMENT
+    pseudo-token is yielded instead; any other TokenError propagates.
+    """
+    # Placeholder so that last.end is valid even if nothing gets tokenized
+    last = tokenize.TokenInfo(tokenize.NEWLINE, '', (1, 0), (1, 0), '')
+    try:
+        for last in tokenize.generate_tokens(io.StringIO(s).readline):
+            yield last
+    except tokenize.TokenError as err:
+        message = err.args[0]  # says which kind of EOF was hit
+        src_lines = s.splitlines(keepends=True)
+        eof = (len(src_lines), len(src_lines[-1]))
+        if 'multi-line string' in message:
+            row, col = last.end
+            tail = src_lines[row-1][col:] + ''.join(src_lines[row:])
+            yield IncompleteString(tail, last.end, eof, src_lines[-1])
+        elif 'multi-line statement' in message:
+            yield InMultilineStatement(eof, src_lines[-1])
+        else:
+            raise
+
+def find_next_indent(code):
+    """Return the indent, in spaces, for the line after possibly-incomplete code."""
+    tokens = list(partial_tokens(code))
+    if tokens[-1].type == tokenize.ENDMARKER:
+        tokens.pop()
+    # Drop trailing DEDENT/NEWLINE/COMMENT tokens. Testing `tokens` first stops
+    # the loop at an empty list instead of indexing into it.
+    while tokens and tokens[-1].type in {tokenize.DEDENT, tokenize.NEWLINE, tokenize.COMMENT}:
+        tokens.pop()
+    if not tokens:
+        return 0
+
+    if tokens[-1].type == INCOMPLETE_STRING:
+        return 0  # inside a multiline string
+
+    # Find the indents used before
+    prev_indents = [0]
+    def _add_indent(n):
+        if n != prev_indents[-1]:
+            prev_indents.append(n)
+
+    tokiter = iter(tokens)
+    for tok in tokiter:
+        if tok.type in {tokenize.INDENT, tokenize.DEDENT}:
+            _add_indent(tok.end[1])
+        elif (tok.type == tokenize.NL):
+            try:
+                _add_indent(next(tokiter).start[1])
+            except StopIteration:
+                break
+
+    last_indent = prev_indents.pop()
+
+    # If we've just opened a multiline statement (e.g. 'a = ['), indent more
+    if tokens[-1].type == IN_MULTILINE_STATEMENT:
+        openers = {tokenize.LPAR, tokenize.LSQB, tokenize.LBRACE}
+        if len(tokens) > 1 and tokens[-2].exact_type in openers:  # marker may be alone
+            return last_indent + 4
+        return last_indent
+
+    if tokens[-1].exact_type == tokenize.COLON:
+        return last_indent + 4  # line ends with colon - indent
+
+    if last_indent:
+        # Dedent after statements like return or raise, which normally end a block.
+        last_line_starts = 0
+        for i, tok in enumerate(tokens):
+            if tok.type == tokenize.NEWLINE:
+                last_line_starts = i + 1
+
+        last_line_tokens = tokens[last_line_starts:]
+        names = [t.string for t in last_line_tokens if t.type == tokenize.NAME]
+        if names and names[0] in {'raise', 'return', 'pass', 'break', 'continue'}:
+            # Find the most recent indentation less than the current level
+            for indent in reversed(prev_indents):
+                if indent < last_indent:
+                    return indent
+
+    return last_indent
+
+
 def last_blank(src):
     """Determine if the input source ends in a blank.
 
@@ -306,7 +415,7 @@ class InputSplitter(object):
         if source.endswith('\\\n'):
             return False
 
-        self._update_indent(lines)
+        self._update_indent()
         try:
             with warnings.catch_warnings():
                 warnings.simplefilter('error', SyntaxWarning)
@@ -382,55 +491,10 @@ class InputSplitter(object):
         # General fallback - accept more code
         return True
 
-    #------------------------------------------------------------------------
-    # Private interface
-    #------------------------------------------------------------------------
-
-    def _find_indent(self, line):
-        """Compute the new indentation level for a single line.
-
-        Parameters
-        ----------
-        line : str
-          A single new line of non-whitespace, non-comment Python input.
-
-        Returns
-        -------
-        indent_spaces : int
-          New value for the indent level (it may be equal to self.indent_spaces
-        if indentation doesn't change.
-
-        full_dedent : boolean
-          Whether the new line causes a full flush-left dedent.
-        """
-        indent_spaces = self.indent_spaces
-        full_dedent = self._full_dedent
-
-        inisp = num_ini_spaces(line)
-        if inisp < indent_spaces:
-            indent_spaces = inisp
-            if indent_spaces <= 0:
-                #print 'Full dedent in text',self.source # dbg
-                full_dedent = True
-
-        if line.rstrip()[-1] == ':':
-            indent_spaces += 4
-        elif dedent_re.match(line):
-            indent_spaces -= 4
-            if indent_spaces <= 0:
-                full_dedent = True
-
-        # Safety
-        if indent_spaces < 0:
-            indent_spaces = 0
-            #print 'safety' # dbg
-
-        return indent_spaces, full_dedent
-
-    def _update_indent(self, lines):
-        for line in remove_comments(lines).splitlines():
-            if line and not line.isspace():
-                self.indent_spaces, self._full_dedent = self._find_indent(line)
+    def _update_indent(self):
+        # self.source always ends with a newline; leave it out of the analysis
+        self.indent_spaces = find_next_indent(self.source[:-1])
+        self._full_dedent = not self.indent_spaces
 
     def _store(self, lines, buffer=None, store='source'):
         """Store one or more lines of input.
diff --git a/IPython/core/tests/test_inputsplitter.py b/IPython/core/tests/test_inputsplitter.py
index 90dd911..54e85c0 100644
--- a/IPython/core/tests/test_inputsplitter.py
+++ b/IPython/core/tests/test_inputsplitter.py
@@ -612,3 +612,30 @@ class LineModeCellMagics(CellMagicsCommon, unittest.TestCase):
         sp.push('\n')
         # In this case, a blank line should end the cell magic
         nt.assert_false(sp.push_accepts_more()) #2
+
+indentation_samples = [  # (code, indent expected for the next line)
+    ('a = 1', 0),
+    ('for a in b:', 4),
+    ('def f():', 4),
+    ('def f(): #comment', 4),
+    ('a = ":#not a comment"', 0),
+    ('def f():\n    a = 1', 4),
+    ('def f():\n    return 1', 0),
+    ('for a in b:\n'
+     '   if a < 0:\n'
+     '       continue', 3),
+    ('a = {', 4),
+    ('a = {\n'
+     '     1,', 5),
+    ('b = """123', 0),
+    ('', 0),
+    ('def f():\n    pass', 0),
+    ('class Bar:\n    def f():\n        pass', 4),
+    ('class Bar:\n    def f():\n        raise', 4),
+]
+
+def test_find_next_indent():
+    template = "{!r} != {!r} (expected)\n Code: {!r}"
+    for snippet, expected in indentation_samples:
+        actual = isp.find_next_indent(snippet)
+        assert actual == expected, template.format(actual, expected, snippet)