upstream/ipython Commit - r10150:4ed49bf8

Fixes for tokenize in Python 3.3

Thomas Kluyver -

r10150:4ed49bf8

parent child

IPython/utils/_tokenize_py3.py

0 +3 -3

              """Patched version of standard library tokenize, to deal with various bugs.
              Based on Python 3.2 code.
              Patches:
              - Gareth Rees' patch for Python issue #12691 (untokenizing)
                - Except we don't encode the output of untokenize
                - Python 2 compatible syntax, so that it can be byte-compiled at installation
              - Newlines in comments and blank lines should be either NL or NEWLINE, depending
                on whether they are in a multi-line statement. Filed as Python issue #17061.
              - Export generate_tokens & TokenError
              - u and rb literals are allowed under Python 3.3 and above.
              ------------------------------------------------------------------------------
              Tokenization help for Python programs.
              tokenize(readline) is a generator that breaks a stream of bytes into
              Python tokens.  It decodes the bytes according to PEP-0263 for
              determining source file encoding.
              It accepts a readline-like method which is called repeatedly to get the
              next line of input (or b"" for EOF).  It generates 5-tuples with these
              members:
                  the token type (see token.py)
                  the token (a string)
                  the starting (row, column) indices of the token (a 2-tuple of ints)
                  the ending (row, column) indices of the token (a 2-tuple of ints)
                  the original line (string)
              It is designed to match the working of the Python tokenizer exactly, except
              that it produces COMMENT tokens for comments and gives type OP for all
              operators.  Additionally, all token lists start with an ENCODING token
              which tells you which encoding was used to decode the bytes stream.
              """
              from __future__ import absolute_import
              __author__ = 'Ka-Ping Yee <ping@lfw.org>'
              __credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
                             'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
                             'Michael Foord')
              import builtins
              import re
              import sys
              from token import *
              from codecs import lookup, BOM_UTF8
              import collections
              from io import TextIOWrapper
              # Matches a PEP 263 style encoding declaration; group 1 is the encoding name.
              # Written as a raw string so that \s and \w reach the regex engine untouched
              # instead of being parsed as (invalid) string escapes by the compiler.
              cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")
              import token
              __all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
                                         "NL", "untokenize", "ENCODING", "TokenInfo"]
              del token
              __all__ += ["generate_tokens", "TokenError"]
              COMMENT = N_TOKENS
              tok_name[COMMENT] = 'COMMENT'
              NL = N_TOKENS + 1
              tok_name[NL] = 'NL'
              ENCODING = N_TOKENS + 2
              tok_name[ENCODING] = 'ENCODING'
              N_TOKENS += 3
              class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
                  """Token 5-tuple whose repr shows the token type's name beside its number."""
                  def __repr__(self):
                      # Swap the numeric type for "<number> (<name>)" before formatting.
                      shown = self._replace(type='%d (%s)' % (self.type, tok_name[self.type]))
                      template = 'TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)'
                      return template % shown
              def group(*choices):
                  """Return a parenthesised regex alternation of ``choices``."""
                  alternatives = '|'.join(choices)
                  return '(' + alternatives + ')'
              def any(*choices):
                  """Return a regex matching zero or more repetitions of the alternation."""
                  repeated = group(*choices)
                  return repeated + '*'
              def maybe(*choices):
                  """Return a regex matching the alternation zero or one time."""
                  optional = group(*choices)
                  return optional + '?'
              # Note: we use unicode matching for names ("\w") but ascii matching for
              # number literals.
              Whitespace = r'[ \f\t]*'
              Comment = r'#[^\r\n]*'
              Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
              Name = r'\w+'
              Hexnumber = r'0[xX][0-9a-fA-F]+'
              Binnumber = r'0[bB][01]+'
              Octnumber = r'0[oO][0-7]+'
              Decnumber = r'(?:0+|[1-9][0-9]*)'
              Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
              Exponent = r'[eE][-+]?[0-9]+'
              Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
              Expfloat = r'[0-9]+' + Exponent
              Floatnumber = group(Pointfloat, Expfloat)
              Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
              Number = group(Imagnumber, Floatnumber, Intnumber)
              # From Python 3.3 on, the u'' prefix is accepted again and rb'' is allowed
              # alongside br''.  Compare the whole version tuple: testing only .minor
              # would also be true on 2.7 and false on any future X.0 - X.2 release.
              if sys.version_info >= (3, 3):
                  StringPrefix = r'(?:[bB][rR]?|[rR][bB]?|[uU])?'
              else:
                  StringPrefix = r'(?:[bB]?[rR]?)?'
              # Tail end of ' string.
              Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
              # Tail end of " string.
              Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
              # Tail end of ''' string.
              Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
              # Tail end of """ string.
              Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
              Triple = group(StringPrefix + "'''", StringPrefix + '"""')
              # Single-line ' or " string.
              String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
                             StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
              # Because of leftmost-then-longest match semantics, be sure to put the
              # longest operators first (e.g., if = came before ==, == would get
              # recognized as two instances of =).
              Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
                               r"//=?", r"->",
                               r"[+\-*/%&|^=<>]=?",
                               r"~")
              Bracket = '[][(){}]'
              Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
              Funny = group(Operator, Bracket, Special)
              PlainToken = group(Number, Funny, String, Name)
              Token = Ignore + PlainToken
              # First (or only) line of ' or " string.
              ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                              group("'", r'\\\r?\n'),
                              StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                              group('"', r'\\\r?\n'))
              PseudoExtras = group(r'\\\r?\n', Comment, Triple)
              PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
              def _compile(expr):
                  """Compile ``expr`` into a pattern object with the re.UNICODE flag set."""
                  unicode_pattern = re.compile(expr, flags=re.UNICODE)
                  return unicode_pattern
              tokenprog, pseudoprog, single3prog, double3prog = map(
                  _compile, (Token, PseudoToken, Single3, Double3))
              endprogs = {"'": _compile(Single), '"': _compile(Double),
                          "'''": single3prog, '"""': double3prog,
                          "r'''": single3prog, 'r"""': double3prog,
                          "b'''": single3prog, 'b"""': double3prog,
                          "R'''": single3prog, 'R"""': double3prog,
                          "B'''": single3prog, 'B"""': double3prog,
                          "br'''": single3prog, 'br"""': double3prog,
                          "bR'''": single3prog, 'bR"""': double3prog,
                          "Br'''": single3prog, 'Br"""': double3prog,
                          "BR'''": single3prog, 'BR"""': double3prog,
                          'r': None, 'R': None, 'b': None, 'B': None}
              triple_quoted = {}
              for t in ("'''", '"""',
                        "r'''", 'r"""', "R'''", 'R"""',
                        "b'''", 'b"""', "B'''", 'B"""',
                        "br'''", 'br"""', "Br'''", 'Br"""',
                        "bR'''", 'bR"""', "BR'''", 'BR"""'):
                  triple_quoted[t] = t
              single_quoted = {}
              for t in ("'", '"',
                        "r'", 'r"', "R'", 'R"',
                        "b'", 'b"', "B'", 'B"',
                        "br'", 'br"', "Br'", 'Br"',
                        "bR'", 'bR"', "BR'", 'BR"' ):
                  single_quoted[t] = t
              if sys.version_info.minor >= 3:
                  # Python 3.3
                  for _prefix in ['rb', 'rB', 'Rb', 'RB', 'u', 'U']:
-                     _t2 = prefix+'"""'
+                     _t2 = _prefix+'"""'
                      endprogs[_t2] = double3prog
                      triple_quoted[_t2] = _t2
-                     _t1 = prefix + "'''"
+                     _t1 = _prefix + "'''"
                      endprogs[_t1] = single3prog
                      triple_quoted[_t1] = _t1
                      single_quoted[_prefix+'"'] = _prefix+'"'
-                     single_quoted[_prefix+"'"] + _prefix+"'"
+                     single_quoted[_prefix+"'"] = _prefix+"'"
                  del _prefix, _t2, _t1
                  endprogs['u'] = None
                  endprogs['U'] = None
              del _compile
              tabsize = 8
              class TokenError(Exception): pass
              class StopTokenizing(Exception): pass
              class Untokenizer:
                  """Rebuild source text from a stream of tokens.

                  Text fragments accumulate in ``self.tokens``.  ``prev_row`` and
                  ``prev_col`` record where the previously emitted token ended, and
                  ``encoding`` holds the value of the last ENCODING token seen.
                  """
                  def __init__(self):
                      self.encoding = 'utf-8'
                      self.prev_col = 0
                      self.prev_row = 1
                      self.tokens = []
                  def add_whitespace(self, tok_type, start):
                      """Emit the spacing needed before a token that begins at ``start``."""
                      row, col = start
                      assert row >= self.prev_row
                      gap = col - self.prev_col
                      if gap > 0:
                          self.tokens.append(" " * gap)
                          return
                      if row > self.prev_row and tok_type not in (NEWLINE, NL, ENDMARKER):
                          # Line was backslash-continued.
                          self.tokens.append(" ")
                  def untokenize(self, tokens):
                      """Return the source text for ``tokens`` as a single string.

                      Tokens carrying positions are laid out using them.  As soon as a
                      2-item token is met, it and everything after it are delegated to
                      compat().
                      """
                      stream = iter(tokens)
                      for entry in stream:
                          if len(entry) == 2:
                              # No position information: switch to the spacing heuristics.
                              self.compat(entry, stream)
                              break
                          kind, text, begin, finish = entry[:4]
                          if kind == ENCODING:
                              self.encoding = text
                              continue
                          self.add_whitespace(kind, begin)
                          self.tokens.append(text)
                          self.prev_row, self.prev_col = finish
                          if kind in (NEWLINE, NL):
                              # The next token starts at column 0 of the following row.
                              self.prev_row += 1
                              self.prev_col = 0
                      return "".join(self.tokens)
                  def compat(self, token, iterable):
                      """Append text for ``token`` and the rest of ``iterable`` using only
                      the (type, string) part of each token.
                      """
                      # This import is here to avoid problems when the itertools
                      # module is not built yet and tokenize is imported.
                      from itertools import chain
                      emit = self.tokens.append
                      indent_stack = []
                      at_line_start = False
                      after_string = False
                      for entry in chain([token], iterable):
                          kind, text = entry[:2]
                          if kind == ENCODING:
                              self.encoding = text
                              continue
                          if kind in (NAME, NUMBER):
                              text += ' '
                          # Insert a space between two consecutive strings
                          is_string = kind == STRING
                          if is_string and after_string:
                              text = ' ' + text
                          after_string = is_string
                          if kind == INDENT:
                              indent_stack.append(text)
                              continue
                          if kind == DEDENT:
                              indent_stack.pop()
                              continue
                          if kind in (NEWLINE, NL):
                              at_line_start = True
                          elif at_line_start and indent_stack:
                              # First token of an indented line: re-emit the current indent.
                              emit(indent_stack[-1])
                              at_line_start = False
                          emit(text)
              def untokenize(tokens):
                  """
                  Convert ``tokens`` (an iterable) back into Python source code.

                  The result is returned as an unencoded string: an ENCODING token in
                  ``tokens`` is recorded by the Untokenizer but is not used to encode
                  the output.

                  :func:`untokenize` has two modes. If the input tokens are sequences
                  of length 2 (``type``, ``string``) then spaces are inserted by simple
                  rules (for example after names and numbers), so column positions
                  are not preserved.

                  If the input tokens are sequences of length 4 or more (``type``,
                  ``string``, ``start``, ``end``), as returned by :func:`tokenize`, then
                  spaces are added so that each token appears in the result at the
                  position indicated by ``start`` and ``end``, if possible.
                  """
                  rebuilder = Untokenizer()
                  return rebuilder.untokenize(tokens)
              def _get_normal_name(orig_enc):
                  """Imitates get_normal_name in tokenizer.c.

                  Returns "utf-8" or "iso-8859-1" for the recognised spellings of those
                  encodings, and ``orig_enc`` unchanged for anything else.
                  """
                  # Only care about the first 12 characters.
                  head = orig_enc[:12].lower().replace("_", "-")
                  utf8_like = head == "utf-8" or head.startswith("utf-8-")
                  if utf8_like:
                      return "utf-8"
                  latin1_names = ("latin-1", "iso-8859-1", "iso-latin-1")
                  latin1_prefixes = ("latin-1-", "iso-8859-1-", "iso-latin-1-")
                  if head in latin1_names or head.startswith(latin1_prefixes):
                      return "iso-8859-1"
                  return orig_enc
              def detect_encoding(readline):
                  """
                  Work out which encoding should be used to decode a Python source file.

                  It requires one argument, readline, in the same way as the tokenize()
                  generator.  readline is called at most twice.  The return value is a
                  pair: the encoding (as a string) and the list of lines (left as
                  bytes) that were read.

                  The encoding comes from a UTF-8 BOM and/or an encoding cookie as
                  specified in PEP 263.  If a BOM is present and a cookie names
                  anything other than UTF-8, a SyntaxError is raised.  A cookie naming
                  an unknown charset, or a line that cannot be decoded as UTF-8, also
                  raises SyntaxError.  When a BOM is found, 'utf-8-sig' is returned.

                  If no encoding is specified, the default of 'utf-8' is returned.
                  """
                  saw_bom = False
                  fallback = 'utf-8'
                  def next_line():
                      # A readline that signals EOF with StopIteration is treated as b''.
                      try:
                          return readline()
                      except StopIteration:
                          return b''
                  def cookie_from(raw):
                      try:
                          # Decode as UTF-8. Either the line is an encoding declaration,
                          # in which case it should be pure ASCII, or it must be UTF-8
                          # per default encoding.
                          text = raw.decode('utf-8')
                      except UnicodeDecodeError:
                          raise SyntaxError("invalid or missing encoding declaration")
                      found = cookie_re.findall(text)
                      if not found:
                          return None
                      name = _get_normal_name(found[0])
                      try:
                          # Only the existence of the codec matters here.
                          lookup(name)
                      except LookupError:
                          # This behaviour mimics the Python interpreter
                          raise SyntaxError("unknown encoding: " + name)
                      if not saw_bom:
                          return name
                      if name != 'utf-8':
                          # This behaviour mimics the Python interpreter
                          raise SyntaxError('encoding problem: utf-8')
                      return name + '-sig'
                  first = next_line()
                  if first.startswith(BOM_UTF8):
                      saw_bom = True
                      first = first[3:]
                      fallback = 'utf-8-sig'
                  if not first:
                      return fallback, []
                  declared = cookie_from(first)
                  if declared:
                      return declared, [first]
                  second = next_line()
                  if second:
                      declared = cookie_from(second)
                      return (declared if declared else fallback), [first, second]
                  return fallback, [first]
              def open(filename):
                  """Open a file in read only mode using the encoding detected by
                  detect_encoding().

                  Returns a TextIOWrapper positioned at the start of the file.  If
                  detecting the encoding or wrapping the file fails, the underlying
                  binary file is closed before the exception propagates.
                  """
                  buffer = builtins.open(filename, 'rb')
                  try:
                      encoding, _ = detect_encoding(buffer.readline)
                      # detect_encoding() has consumed up to two lines; rewind so the
                      # text wrapper starts from the beginning of the file.
                      buffer.seek(0)
                      text = TextIOWrapper(buffer, encoding, line_buffering=True)
                      text.mode = 'r'
                      return text
                  except BaseException:
                      # Do not leak the open file when e.g. the encoding cookie is bad.
                      buffer.close()
                      raise
              def tokenize(readline):
                  """
                  Tokenize a stream of bytes lines.

                  The tokenize() generator requires one argument, readline, which
                  must be a callable object which provides the same interface as the
                  readline() method of built-in file objects.  Each call to the function
                  should return one line of input as bytes.  Alternately, readline
                  can be a callable function terminating with StopIteration:
                      readline = open(myfile, 'rb').__next__  # Example of alternate readline

                  The generator produces 5-tuples with these members: the token type; the
                  token string; a 2-tuple (srow, scol) of ints specifying the row and
                  column where the token begins in the source; a 2-tuple (erow, ecol) of
                  ints specifying the row and column where the token ends in the source;
                  and the line on which the token was found.  The line passed is the
                  logical line; continuation lines are included.

                  The first token is always an ENCODING token, which tells you which
                  encoding was used to decode the bytes stream.
                  """
                  # This import is here to avoid problems when the itertools module is not
                  # built yet and tokenize is imported.
                  from itertools import chain, repeat
                  encoding, consumed = detect_encoding(readline)
                  # Replay the lines detect_encoding() already read, then keep calling
                  # readline until it returns b"", then supply b"" indefinitely.
                  remaining = iter(readline, b"")
                  line_source = chain(consumed, remaining, repeat(b""))
                  return _tokenize(line_source.__next__, encoding)
              def _tokenize(readline, encoding):
                  """Generate TokenInfo tuples for the lines returned by ``readline``.

                  If ``encoding`` is not None, an ENCODING token is yielded first
                  ('utf-8-sig' is reported as 'utf-8') and every line is decoded with
                  that encoding; if it is None, lines are used as ``readline`` returns
                  them and no ENCODING token is produced.  A StopIteration raised by
                  ``readline`` is treated like an empty line, i.e. end of input.

                  Raises TokenError when input ends inside a multi-line string or a
                  multi-line statement, and IndentationError when a dedent does not
                  match any enclosing indentation level.
                  """
                  # lnum: current line number; parenlev: bracket nesting depth;
                  # continued: set when the previous line ended with a backslash.
                  lnum = parenlev = continued = 0
                  numchars = '0123456789'
                  # contstr accumulates a string that spans lines; needcont is set for
                  # single-quoted strings, which need a backslash to continue.
                  contstr, needcont = '', 0
                  contline = None
                  # Stack of indentation columns currently open.
                  indents = [0]
                  if encoding is not None:
                      if encoding == "utf-8-sig":
                          # BOM will already have been stripped.
                          encoding = "utf-8"
                      yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
                  while True:             # loop over lines in stream
                      try:
                          line = readline()
                      except StopIteration:
                          line = b''
                      if encoding is not None:
                          line = line.decode(encoding)
                      lnum += 1
                      pos, max = 0, len(line)
                      if contstr:                            # continued string
                          if not line:
                              raise TokenError("EOF in multi-line string", strstart)
                          endmatch = endprog.match(line)
                          if endmatch:
                              pos = end = endmatch.end(0)
                              yield TokenInfo(STRING, contstr + line[:end],
                                     strstart, (lnum, end), contline + line)
                              contstr, needcont = '', 0
                              contline = None
                          elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                              # Single-quoted string not continued with a backslash.
                              yield TokenInfo(ERRORTOKEN, contstr + line,
                                         strstart, (lnum, len(line)), contline)
                              contstr = ''
                              contline = None
                              continue
                          else:
                              contstr = contstr + line
                              contline = contline + line
                              continue
                      elif parenlev == 0 and not continued:  # new statement
                          if not line: break
                          column = 0
                          while pos < max:                   # measure leading whitespace
                              if line[pos] == ' ':
                                  column += 1
                              elif line[pos] == '\t':
                                  column = (column//tabsize + 1)*tabsize
                              elif line[pos] == '\f':
                                  column = 0
                              else:
                                  break
                              pos += 1
                          if pos == max:
                              # Line held only whitespace and no line terminator.
                              break
                          if line[pos] in '#\r\n':           # skip comments or blank lines
                              if line[pos] == '#':
                                  comment_token = line[pos:].rstrip('\r\n')
                                  nl_pos = pos + len(comment_token)
                                  yield TokenInfo(COMMENT, comment_token,
                                         (lnum, pos), (lnum, pos + len(comment_token)), line)
                                  # Outside a multi-line statement the line end is NEWLINE.
                                  yield TokenInfo(NEWLINE, line[nl_pos:],
                                         (lnum, nl_pos), (lnum, len(line)), line)
                              else:
                                  yield TokenInfo(NEWLINE, line[pos:],
                                         (lnum, pos), (lnum, len(line)), line)
                              continue
                          if column > indents[-1]:           # count indents or dedents
                              indents.append(column)
                              yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
                          while column < indents[-1]:
                              if column not in indents:
                                  raise IndentationError(
                                      "unindent does not match any outer indentation level",
                                      ("<tokenize>", lnum, pos, line))
                              indents = indents[:-1]
                              yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)
                      else:                                  # continued statement
                          if not line:
                              raise TokenError("EOF in multi-line statement", (lnum, 0))
                          continued = 0
                      while pos < max:
                          pseudomatch = pseudoprog.match(line, pos)
                          if pseudomatch:                                # scan for tokens
                              start, end = pseudomatch.span(1)
                              spos, epos, pos = (lnum, start), (lnum, end), end
                              token, initial = line[start:end], line[start]
                              if (initial in numchars or                  # ordinary number
                                  (initial == '.' and token != '.' and token != '...')):
                                  yield TokenInfo(NUMBER, token, spos, epos, line)
                              elif initial in '\r\n':
                                  # Inside brackets a line end does not terminate the statement.
                                  yield TokenInfo(NL if parenlev > 0 else NEWLINE,
                                         token, spos, epos, line)
                              elif initial == '#':
                                  assert not token.endswith("\n")
                                  yield TokenInfo(COMMENT, token, spos, epos, line)
                              elif token in triple_quoted:
                                  endprog = endprogs[token]
                                  endmatch = endprog.match(line, pos)
                                  if endmatch:                           # all on one line
                                      pos = endmatch.end(0)
                                      token = line[start:pos]
                                      yield TokenInfo(STRING, token, spos, (lnum, pos), line)
                                  else:
                                      strstart = (lnum, start)           # multiple lines
                                      contstr = line[start:]
                                      contline = line
                                      break
                              elif initial in single_quoted or \
                                  token[:2] in single_quoted or \
                                  token[:3] in single_quoted:
                                  if token[-1] == '\n':                  # continued string
                                      strstart = (lnum, start)
                                      # Prefix letters map to None in endprogs, so this picks
                                      # the entry for the first character that is a quote.
                                      endprog = (endprogs[initial] or endprogs[token[1]] or
                                                 endprogs[token[2]])
                                      contstr, needcont = line[start:], 1
                                      contline = line
                                      break
                                  else:                                  # ordinary string
                                      yield TokenInfo(STRING, token, spos, epos, line)
                              elif initial.isidentifier():               # ordinary name
                                  yield TokenInfo(NAME, token, spos, epos, line)
                              elif initial == '\\':                      # continued stmt
                                  continued = 1
                              else:
                                  if initial in '([{':
                                      parenlev += 1
                                  elif initial in ')]}':
                                      parenlev -= 1
                                  yield TokenInfo(OP, token, spos, epos, line)
                          else:
                              # Nothing matched here: report one character and move on.
                              yield TokenInfo(ERRORTOKEN, line[pos],
                                         (lnum, pos), (lnum, pos+1), line)
                              pos += 1
                  for indent in indents[1:]:                 # pop remaining indent levels
                      yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
                  yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
              # An undocumented, backwards compatible, API for all the places in the standard
              # library that expect to be able to use tokenize with strings
              def generate_tokens(readline):
                  """Tokenize lines supplied by ``readline`` without decoding them.

                  Passes an encoding of None to _tokenize(), so the lines are used as
                  returned and no ENCODING token is produced.
                  """
                  no_encoding = None
                  return _tokenize(readline, no_encoding)
              if __name__ == "__main__":
                  # Quick sanity check: tokenize a small sample and print every token.
                  s = b'''def parseline(self, line):
                          """Parse the line into a command name and a string containing
                          the arguments.  Returns a tuple containing (command, args, line).
                          'command' and 'args' may be None if the line couldn't be parsed.
                          """
                          line = line.strip()
                          if not line:
                              return None, None, line
                          elif line[0] == '?':
                              line = 'help ' + line[1:]
                          elif line[0] == '!':
                              if hasattr(self, 'do_shell'):
                                  line = 'shell ' + line[1:]
                              else:
                                  return None, None, line
                          i, n = 0, len(line)
                          while i < n and line[i] in self.identchars: i = i+1
                          cmd, arg = line[:i], line[i:].strip()
                          return cmd, arg, line
                  '''
                  # Keep the line terminators: tokenize() expects readline-style lines, and
                  # without them no NEWLINE tokens are produced and the text of the
                  # multi-line string is joined without its line breaks.
                  for tok in tokenize(iter(s.splitlines(True)).__next__):
                      print(tok)

General Comments 0

Write
Preview

You need to be logged in to leave comments. Login now

No TODOs yet

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages