upstream/ipython Files · IPython/utils/_tokenize_py3.py

don't instantiate IPython shell as class attr...

don't instantiate IPython shell as class attr in sphinxext.ipython_directive also fix capitalization, because reasons.

Thomas Kluyver - - Load All Authors

File last commit:

r10150:4ed49bf8


                r13150:e5f05d22

Download file

             _tokenize_py3.py
        
                    595 lines
            
             | 22.0 KiB
            
                | text/x-python
            
             |
                PythonLexer
            
             / IPython / utils / _tokenize_py3.py
          
                    History
                
                 |
                  Source
                 | Raw
                 |Copy content
                 |Copy permalink

        Thomas Kluyver
    
Now include patched copies of tokenize for Python 2 and 3.

              r10110
            
      """Patched version of standard library tokenize, to deal with various bugs.

      Based on Python 3.2 code.

      Patches:

      - Gareth Rees' patch for Python issue #12691 (untokenizing)

        - Except we don't encode the output of untokenize

        - Python 2 compatible syntax, so that it can be byte-compiled at installation

      - Newlines in comments and blank lines should be either NL or NEWLINE, depending

        on whether they are in a multi-line statement. Filed as Python issue #17061.

      - Export generate_tokens & TokenError

        Thomas Kluyver
    
Allow u and rb strings in tokenize for Python 3.3+

              r10111
            
      - u and rb literals are allowed under Python 3.3 and above.

        Thomas Kluyver
    
Now include patched copies of tokenize for Python 2 and 3.

              r10110
            
      ------------------------------------------------------------------------------

      Tokenization help for Python programs.

      tokenize(readline) is a generator that breaks a stream of bytes into

      Python tokens.  It decodes the bytes according to PEP-0263 for

      determining source file encoding.

      It accepts a readline-like method which is called repeatedly to get the

      next line of input (or b"" for EOF).  It generates 5-tuples with these

      members:

          the token type (see token.py)

          the token (a string)

          the starting (row, column) indices of the token (a 2-tuple of ints)

          the ending (row, column) indices of the token (a 2-tuple of ints)

          the original line (string)

      It is designed to match the working of the Python tokenizer exactly, except

      that it produces COMMENT tokens for comments and gives type OP for all

      operators.  Additionally, all token lists start with an ENCODING token

      which tells you which encoding was used to decode the bytes stream.

      """

      from __future__ import absolute_import

      __author__ = 'Ka-Ping Yee <ping@lfw.org>'

      __credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '

                     'Skip Montanaro, Raymond Hettinger, Trent Nelson, '

                     'Michael Foord')

      import builtins

      import re

      import sys

      from token import *

      from codecs import lookup, BOM_UTF8

      import collections

      from io import TextIOWrapper

      # Finds a "coding:" / "coding=" declaration and captures the codec name.
      # Written as a raw string so "\s" and "\w" reach the regex engine as-is
      # instead of being treated as (invalid) string escapes.
      cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")

      import token

      __all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",

                                 "NL", "untokenize", "ENCODING", "TokenInfo"]

      del token

      __all__ += ["generate_tokens", "TokenError"]

      # Three additional token types, numbered from N_TOKENS upwards and
      # registered in tok_name so they can be printed by name.
      COMMENT = N_TOKENS
      tok_name[COMMENT] = 'COMMENT'
      NL = N_TOKENS + 1
      tok_name[NL] = 'NL'
      ENCODING = N_TOKENS + 2
      tok_name[ENCODING] = 'ENCODING'
      # Account for the three numbers claimed above.
      N_TOKENS += 3

      class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
          """A token as a named 5-tuple: type, string, start, end, line."""

          def __repr__(self):
              # Show the numeric type together with its name from tok_name.
              type_label = '%d (%s)' % (self.type, tok_name[self.type])
              shown = self._replace(type=type_label)
              template = 'TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)'
              return template % shown

      def group(*choices):
          """Return a parenthesised regex alternation of ``choices``."""
          return '(%s)' % '|'.join(choices)

      def any(*choices):
          """Return the alternation of ``choices`` repeated zero or more times."""
          return '%s*' % group(*choices)

      def maybe(*choices):
          """Return the alternation of ``choices`` made optional."""
          return '%s?' % group(*choices)

      # Note: we use unicode matching for names ("\w") but ascii matching for
      # number literals.

      # A possibly empty run of spaces, form feeds and tabs.
      Whitespace = r'[ \f\t]*'
      # A '#' and everything after it, up to but not including the line ending.
      Comment = r'#[^\r\n]*'
      # Whitespace, any number of backslash-newline continuations (each followed
      # by whitespace), then an optional comment.
      Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
      Name = r'\w+'

      # Integer literals.  Decnumber is either all zeros or starts with 1-9.
      Hexnumber = r'0[xX][0-9a-fA-F]+'
      Binnumber = r'0[bB][01]+'
      Octnumber = r'0[oO][0-7]+'
      Decnumber = r'(?:0+|[1-9][0-9]*)'
      Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
      # Float literals: either a mantissa containing a point with an optional
      # exponent, or plain digits followed by a mandatory exponent.
      Exponent = r'[eE][-+]?[0-9]+'
      Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
      Expfloat = r'[0-9]+' + Exponent
      Floatnumber = group(Pointfloat, Expfloat)
      # Imaginary literals: digits or a float followed by j/J.
      Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
      # Alternatives are tried in this order: imaginary, float, integer.
      Number = group(Imagnumber, Floatnumber, Intnumber)

        Thomas Kluyver
    
Allow u and rb strings in tokenize for Python 3.3+

              r10111
            
      # Optional literal prefix allowed in front of a string's opening quote.
      # The whole version tuple is compared: looking at ``.minor`` alone would
      # pick the wrong branch for any release whose major version is not 3.
      if sys.version_info >= (3, 3):
          # b, br, rb (any case) or u.
          StringPrefix = r'(?:[bB][rR]?|[rR][bB]?|[uU])?'
      else:
          # An optional b followed by an optional r.
          StringPrefix = r'(?:[bB]?[rR]?)?'

        Thomas Kluyver
    
Now include patched copies of tokenize for Python 2 and 3.

              r10110
            
      # Each pattern below matches the remainder of a string literal after its
      # opening quote(s), up to and including the closing quote(s).  A backslash
      # together with the character that follows it is skipped as a unit.

      # Tail end of ' string.
      Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
      # Tail end of " string.
      Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
      # Tail end of ''' string.  A quote may appear inside as long as it is not
      # followed by two more quotes.
      Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
      # Tail end of """ string.  Same rule as above for the double quote.
      Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'

        Thomas Kluyver
    
Allow u and rb strings in tokenize for Python 3.3+

              r10111
            
      Triple = group(StringPrefix + "'''", StringPrefix + '"""')

        Thomas Kluyver
    
Now include patched copies of tokenize for Python 2 and 3.

              r10110
            
      # Single-line ' or " string.

        Thomas Kluyver
    
Allow u and rb strings in tokenize for Python 3.3+

              r10111
            
      String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",

                     StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

        Thomas Kluyver
    
Now include patched copies of tokenize for Python 2 and 3.

              r10110
            
      # Because of leftmost-then-longest match semantics, be sure to put the
      # longest operators first (e.g., if = came before ==, == would get
      # recognized as two instances of =).
      Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
                       r"//=?", r"->",
                       r"[+\-*/%&|^=<>]=?",
                       r"~")

      # A character class matching any single bracket: ] [ ( ) { }
      Bracket = '[][(){}]'
      # A line ending, an ellipsis, or one of the characters : ; . , @
      Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
      Funny = group(Operator, Bracket, Special)

      # A complete token, and the same preceded by ignorable whitespace,
      # continuations and an optional comment.
      PlainToken = group(Number, Funny, String, Name)
      Token = Ignore + PlainToken

      # First (or only) line of ' or " string.

        Thomas Kluyver
    
Allow u and rb strings in tokenize for Python 3.3+

              r10111
            
      ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +

        Thomas Kluyver
    
Now include patched copies of tokenize for Python 2 and 3.

              r10110
            
                      group("'", r'\\\r?\n'),

        Thomas Kluyver
    
Allow u and rb strings in tokenize for Python 3.3+

              r10111
            
                      StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +

        Thomas Kluyver
    
Now include patched copies of tokenize for Python 2 and 3.

              r10110
            
                      group('"', r'\\\r?\n'))

      # Extra alternatives for line-by-line scanning: a backslash continuation,
      # a comment, or the opening of a triple-quoted string.
      PseudoExtras = group(r'\\\r?\n', Comment, Triple)
      # Leading whitespace followed by one capturing group holding the token;
      # compiled below as pseudoprog, which _tokenize matches against each line.
      PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

      def _compile(expr):
          """Compile ``expr`` with the re.UNICODE flag."""
          return re.compile(expr, re.UNICODE)

      tokenprog, pseudoprog, single3prog, double3prog = map(
          _compile, (Token, PseudoToken, Single3, Double3))

      endprogs = {"'": _compile(Single), '"': _compile(Double),

                  "'''": single3prog, '"""': double3prog,

                  "r'''": single3prog, 'r"""': double3prog,

                  "b'''": single3prog, 'b"""': double3prog,

                  "R'''": single3prog, 'R"""': double3prog,

                  "B'''": single3prog, 'B"""': double3prog,

        Thomas Kluyver
    
Allow u and rb strings in tokenize for Python 3.3+

              r10111
            
                  "br'''": single3prog, 'br"""': double3prog,

        Thomas Kluyver
    
Now include patched copies of tokenize for Python 2 and 3.

              r10110
            
                  "bR'''": single3prog, 'bR"""': double3prog,

                  "Br'''": single3prog, 'Br"""': double3prog,

                  "BR'''": single3prog, 'BR"""': double3prog,

                  'r': None, 'R': None, 'b': None, 'B': None}

      # Lookup tables of string openers (optional prefix plus quote characters).
      # Each key maps to itself; _tokenize only tests membership.

      # Openers of triple-quoted strings.
      triple_quoted = {}
      for t in ("'''", '"""',
                "r'''", 'r"""', "R'''", 'R"""',
                "b'''", 'b"""', "B'''", 'B"""',
                "br'''", 'br"""', "Br'''", 'Br"""',
                "bR'''", 'bR"""', "BR'''", 'BR"""'):
          triple_quoted[t] = t
      # Openers of single-quoted strings.
      single_quoted = {}
      for t in ("'", '"',
                "r'", 'r"', "R'", 'R"',
                "b'", 'b"', "B'", 'B"',
                "br'", 'br"', "Br'", 'Br"',
                "bR'", 'bR"', "BR'", 'BR"' ):
          single_quoted[t] = t

        Thomas Kluyver
    
Allow u and rb strings in tokenize for Python 3.3+

              r10111
            
      if sys.version_info.minor >= 3:

          # Python 3.3

          for _prefix in ['rb', 'rB', 'Rb', 'RB', 'u', 'U']:

        Thomas Kluyver
    
Fixes for tokenize in Python 3.3

              r10150
            
              _t2 = _prefix+'"""'

        Thomas Kluyver
    
Allow u and rb strings in tokenize for Python 3.3+

              r10111
            
              endprogs[_t2] = double3prog

              triple_quoted[_t2] = _t2

        Thomas Kluyver
    
Fixes for tokenize in Python 3.3

              r10150
            
              _t1 = _prefix + "'''"

        Thomas Kluyver
    
Allow u and rb strings in tokenize for Python 3.3+

              r10111
            
              endprogs[_t1] = single3prog

              triple_quoted[_t1] = _t1

              single_quoted[_prefix+'"'] = _prefix+'"'

        Thomas Kluyver
    
Fixes for tokenize in Python 3.3

              r10150
            
              single_quoted[_prefix+"'"] = _prefix+"'"

        Thomas Kluyver
    
Allow u and rb strings in tokenize for Python 3.3+

              r10111
            
          del _prefix, _t2, _t1

          endprogs['u'] = None

          endprogs['U'] = None

        Thomas Kluyver
    
Now include patched copies of tokenize for Python 2 and 3.

              r10110
            
      del _compile

      tabsize = 8

      # Raised by _tokenize when input ends inside a multi-line string or a
      # multi-line statement.
      class TokenError(Exception): pass

      # NOTE(review): not raised anywhere in the code visible here.
      class StopTokenizing(Exception): pass

      class Untokenizer:
          """Rebuild source text from a stream of tokens.

          Tokens that carry positions are laid out using their start/end
          coordinates; as soon as a bare ``(type, string)`` pair is seen, the
          remainder of the stream is handled by :meth:`compat`, which spaces
          tokens heuristically instead.
          """

          def __init__(self):
              self.tokens = []                      # output fragments, joined at the end
              self.prev_row, self.prev_col = 1, 0   # position just after the last token
              self.encoding = 'utf-8'               # updated by any ENCODING token

          def add_whitespace(self, tok_type, start):
              """Emit the spaces needed before a token that begins at ``start``."""
              row, col = start
              assert row >= self.prev_row
              gap = col - self.prev_col
              if gap > 0:
                  self.tokens.append(" " * gap)
                  return
              moved_down = row > self.prev_row
              if moved_down and tok_type not in (NEWLINE, NL, ENDMARKER):
                  # Line was backslash-continued.
                  self.tokens.append(" ")

          def untokenize(self, tokens):
              """Consume ``tokens`` and return the reconstructed source string."""
              stream = iter(tokens)
              for item in stream:
                  if len(item) == 2:
                      # No position information: hand this token and the rest
                      # of the stream to the position-free fallback.
                      self.compat(item, stream)
                      break
                  kind, text, start, end = item[:4]
                  if kind == ENCODING:
                      self.encoding = text
                  else:
                      self.add_whitespace(kind, start)
                      self.tokens.append(text)
                      self.prev_row, self.prev_col = end
                      if kind in (NEWLINE, NL):
                          # The next token starts on a fresh line.
                          self.prev_row, self.prev_col = self.prev_row + 1, 0
              return "".join(self.tokens)

          def compat(self, token, iterable):
              """Append ``token`` and the rest of ``iterable`` without using positions."""
              # This import is here to avoid problems when the itertools
              # module is not built yet and tokenize is imported.
              from itertools import chain
              at_line_start = False
              after_string = False
              indent_stack = []
              emit = self.tokens.append
              for tok in chain([token], iterable):
                  toknum, tokval = tok[:2]
                  if toknum == ENCODING:
                      self.encoding = tokval
                      continue

                  if toknum in (NAME, NUMBER):
                      tokval += ' '

                  # Insert a space between two consecutive strings
                  is_string = toknum == STRING
                  if is_string and after_string:
                      tokval = ' ' + tokval
                  after_string = is_string

                  if toknum == INDENT:
                      indent_stack.append(tokval)
                      continue
                  if toknum == DEDENT:
                      indent_stack.pop()
                      continue
                  if toknum in (NEWLINE, NL):
                      at_line_start = True
                  elif at_line_start and indent_stack:
                      # First token on a new line: re-emit the current indent.
                      emit(indent_stack[-1])
                      at_line_start = False
                  emit(tokval)

      def untokenize(tokens):
          """
          Convert ``tokens`` (an iterable) back into Python source code.

          The result is returned as a string.  Unlike the standard library
          version this one does not encode its output: an ENCODING token in
          ``tokens`` is recorded by the Untokenizer but the text is returned
          as-is.

          The result is guaranteed to tokenize back to match the input so that
          the conversion is lossless and round-trips are assured.  The
          guarantee applies only to the token type and token string as the
          spacing between tokens (column positions) may change.

          :func:`untokenize` has two modes. If the input tokens are sequences
          of length 2 (``type``, ``string``) then spaces are added as necessary to
          preserve the round-trip property.

          If the input tokens are sequences of length 4 or more (``type``,
          ``string``, ``start``, ``end``), as returned by :func:`tokenize`, then
          spaces are added so that each token appears in the result at the
          position indicated by ``start`` and ``end``, if possible.
          """
          return Untokenizer().untokenize(tokens)

      def _get_normal_name(orig_enc):
          """Imitates get_normal_name in tokenizer.c.

          Returns "utf-8" or "iso-8859-1" when ``orig_enc`` is one of their
          recognised spellings (optionally followed by a "-suffix"); otherwise
          returns ``orig_enc`` unchanged.
          """
          # Only care about the first 12 characters.
          key = orig_enc[:12].lower().replace("_", "-")
          known = (
              ("utf-8", ("utf-8",)),
              ("iso-8859-1", ("latin-1", "iso-8859-1", "iso-latin-1")),
          )
          for canonical, spellings in known:
              for spelling in spellings:
                  if key == spelling or key.startswith(spelling + "-"):
                      return canonical
          return orig_enc

      def detect_encoding(readline):
          """
          Work out which encoding should be used to decode a Python source file.

          ``readline`` is a callable used in the same way as for the tokenize()
          generator.  It is called at most twice.

          Returns a pair: the encoding name (a string) and the list of lines
          (still bytes) that were read while looking for it.

          The encoding comes from a UTF-8 BOM and/or an encoding cookie as
          specified in PEP 263.  A SyntaxError is raised if a BOM is present but
          the cookie names something other than utf-8, if the cookie names an
          unknown codec, or if an inspected line is not valid UTF-8.  When a BOM
          is found the returned name is 'utf-8-sig'.

          If nothing is declared, 'utf-8' is returned.
          """
          has_bom = False
          fallback = 'utf-8'

          def next_line():
              # A readline that signals EOF by raising StopIteration is treated
              # like one that returns empty bytes.
              try:
                  return readline()
              except StopIteration:
                  return b''

          def cookie_in(raw_line):
              try:
                  # Decode as UTF-8. Either the line is an encoding declaration,
                  # in which case it should be pure ASCII, or it must be UTF-8
                  # per default encoding.
                  text = raw_line.decode('utf-8')
              except UnicodeDecodeError:
                  raise SyntaxError("invalid or missing encoding declaration")

              declared = cookie_re.findall(text)
              if not declared:
                  return None
              name = _get_normal_name(declared[0])
              try:
                  # Called only to check that the codec exists.
                  lookup(name)
              except LookupError:
                  # This behaviour mimics the Python interpreter
                  raise SyntaxError("unknown encoding: " + name)

              if not has_bom:
                  return name
              if name != 'utf-8':
                  # This behaviour mimics the Python interpreter
                  raise SyntaxError('encoding problem: utf-8')
              return name + '-sig'

          head = next_line()
          if head.startswith(BOM_UTF8):
              has_bom = True
              head = head[len(BOM_UTF8):]
              fallback = 'utf-8-sig'
          if not head:
              return fallback, []

          consumed = [head]
          found = cookie_in(head)
          if not found:
              # No cookie on the first line: give the second line a chance.
              tail = next_line()
              if tail:
                  consumed.append(tail)
                  found = cookie_in(tail)
          return (found or fallback), consumed

      def open(filename):
          """Open a file in read only mode using the encoding detected by
          detect_encoding().

          Returns a text-mode wrapper around the file whose ``mode`` attribute
          is set to 'r'.  If detecting the encoding or building the wrapper
          fails, the underlying binary file is closed before the exception
          propagates, so no file handle is leaked.
          """
          buffer = builtins.open(filename, 'rb')
          try:
              encoding = detect_encoding(buffer.readline)[0]
              # detect_encoding consumed up to two lines; start over.
              buffer.seek(0)
              text = TextIOWrapper(buffer, encoding, line_buffering=True)
              text.mode = 'r'
              return text
          except BaseException:
              buffer.close()
              raise

      def tokenize(readline):
          """
          Tokenize a stream of bytes lines.

          ``readline`` must be a callable offering the same interface as the
          readline() method of built-in file objects: every call returns one
          line of input as bytes.  Alternately, it can be a callable that
          terminates by raising StopIteration:
              readline = open(myfile, 'rb').__next__  # Example of alternate readline

          The generator produces 5-tuples with these members: the token type; the
          token string; a 2-tuple (srow, scol) of ints specifying the row and
          column where the token begins in the source; a 2-tuple (erow, ecol) of
          ints specifying the row and column where the token ends in the source;
          and the line on which the token was found.  The line passed is the
          logical line; continuation lines are included.

          The first token is always an ENCODING token, which tells you which
          encoding was used to decode the bytes stream.
          """
          # This import is here to avoid problems when the itertools module is not
          # built yet and tokenize is imported.
          import itertools
          encoding, consumed = detect_encoding(readline)
          # Replay the lines detect_encoding already read, then the rest of the
          # input, then an endless supply of EOF markers.
          remaining = iter(readline, b"")
          eof_forever = itertools.repeat(b"")
          line_source = itertools.chain(consumed, remaining, eof_forever)
          return _tokenize(line_source.__next__, encoding)

      def _tokenize(readline, encoding):
          """Generate TokenInfo tuples for the lines returned by ``readline``.

          ``readline`` is called once per line; raising StopIteration is treated
          as end of input.  When ``encoding`` is not None an ENCODING token is
          yielded first and every line is decoded with that encoding; when it is
          None the lines are used as returned.

          Raises TokenError if input ends inside a multi-line string or
          statement, and IndentationError if a dedent does not return to an
          enclosing indentation level.
          """
          # lnum: number of the line being scanned; parenlev: bracket nesting
          # depth; continued: set after a backslash continuation.
          lnum = parenlev = continued = 0
          numchars = '0123456789'
          # contstr: text so far of a string that spans lines; needcont: 1 when
          # that string is single-quoted and each line must end in a backslash.
          contstr, needcont = '', 0
          # contline: the physical lines covered by the unfinished string.
          contline = None
          # Stack of indentation columns currently open.
          indents = [0]

          if encoding is not None:
              if encoding == "utf-8-sig":
                  # BOM will already have been stripped.
                  encoding = "utf-8"
              yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
          while True:             # loop over lines in stream
              try:
                  line = readline()
              except StopIteration:
                  line = b''

              if encoding is not None:
                  line = line.decode(encoding)
              lnum += 1
              pos, max = 0, len(line)

              if contstr:                            # continued string
                  if not line:
                      raise TokenError("EOF in multi-line string", strstart)
                  endmatch = endprog.match(line)
                  if endmatch:
                      # The string closes on this line; scanning resumes after it.
                      pos = end = endmatch.end(0)
                      yield TokenInfo(STRING, contstr + line[:end],
                             strstart, (lnum, end), contline + line)
                      contstr, needcont = '', 0
                      contline = None
                  elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                      # Single-quoted string not continued with a backslash.
                      yield TokenInfo(ERRORTOKEN, contstr + line,
                                 strstart, (lnum, len(line)), contline)
                      contstr = ''
                      contline = None
                      continue
                  else:
                      # Still inside the string: accumulate and read another line.
                      contstr = contstr + line
                      contline = contline + line
                      continue

              elif parenlev == 0 and not continued:  # new statement
                  if not line: break
                  column = 0
                  while pos < max:                   # measure leading whitespace
                      if line[pos] == ' ':
                          column += 1
                      elif line[pos] == '\t':
                          # Advance to the next multiple of tabsize.
                          column = (column//tabsize + 1)*tabsize
                      elif line[pos] == '\f':
                          column = 0
                      else:
                          break
                      pos += 1
                  if pos == max:
                      break

                  if line[pos] in '#\r\n':           # skip comments or blank lines
                      # This branch is only reached outside brackets and
                      # continuations, so the line ending is yielded as NEWLINE.
                      if line[pos] == '#':
                          comment_token = line[pos:].rstrip('\r\n')
                          nl_pos = pos + len(comment_token)
                          yield TokenInfo(COMMENT, comment_token,
                                 (lnum, pos), (lnum, pos + len(comment_token)), line)
                          yield TokenInfo(NEWLINE, line[nl_pos:],
                                 (lnum, nl_pos), (lnum, len(line)), line)
                      else:
                          yield TokenInfo(NEWLINE, line[pos:],
                                 (lnum, pos), (lnum, len(line)), line)
                      continue

                  if column > indents[-1]:           # count indents or dedents
                      indents.append(column)
                      yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
                  while column < indents[-1]:
                      if column not in indents:
                          raise IndentationError(
                              "unindent does not match any outer indentation level",
                              ("<tokenize>", lnum, pos, line))
                      indents = indents[:-1]
                      yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)

              else:                                  # continued statement
                  if not line:
                      raise TokenError("EOF in multi-line statement", (lnum, 0))
                  continued = 0

              while pos < max:
                  pseudomatch = pseudoprog.match(line, pos)
                  if pseudomatch:                                # scan for tokens
                      # Group 1 is the token itself, without leading whitespace.
                      start, end = pseudomatch.span(1)
                      spos, epos, pos = (lnum, start), (lnum, end), end
                      token, initial = line[start:end], line[start]

                      if (initial in numchars or                  # ordinary number
                          (initial == '.' and token != '.' and token != '...')):
                          yield TokenInfo(NUMBER, token, spos, epos, line)
                      elif initial in '\r\n':
                          # NL inside brackets, NEWLINE otherwise.
                          yield TokenInfo(NL if parenlev > 0 else NEWLINE,
                                 token, spos, epos, line)
                      elif initial == '#':
                          assert not token.endswith("\n")
                          yield TokenInfo(COMMENT, token, spos, epos, line)
                      elif token in triple_quoted:
                          endprog = endprogs[token]
                          endmatch = endprog.match(line, pos)
                          if endmatch:                           # all on one line
                              pos = endmatch.end(0)
                              token = line[start:pos]
                              yield TokenInfo(STRING, token, spos, (lnum, pos), line)
                          else:
                              strstart = (lnum, start)           # multiple lines
                              contstr = line[start:]
                              contline = line
                              break
                      elif initial in single_quoted or \
                          token[:2] in single_quoted or \
                          token[:3] in single_quoted:
                          if token[-1] == '\n':                  # continued string
                              strstart = (lnum, start)
                              # Prefix letters map to None in endprogs, so this
                              # picks the entry for the first quote character.
                              endprog = (endprogs[initial] or endprogs[token[1]] or
                                         endprogs[token[2]])
                              contstr, needcont = line[start:], 1
                              contline = line
                              break
                          else:                                  # ordinary string
                              yield TokenInfo(STRING, token, spos, epos, line)
                      elif initial.isidentifier():               # ordinary name
                          yield TokenInfo(NAME, token, spos, epos, line)
                      elif initial == '\\':                      # continued stmt
                          continued = 1
                      else:
                          # Everything else is an operator; track bracket depth.
                          if initial in '([{':
                              parenlev += 1
                          elif initial in ')]}':
                              parenlev -= 1
                          yield TokenInfo(OP, token, spos, epos, line)
                  else:
                      # Nothing matched here: report one character and move on.
                      yield TokenInfo(ERRORTOKEN, line[pos],
                                 (lnum, pos), (lnum, pos+1), line)
                      pos += 1

          for indent in indents[1:]:                 # pop remaining indent levels
              yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
          yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')

      # An undocumented, backwards compatible, API for all the places in the standard

      # library that expect to be able to use tokenize with strings

      def generate_tokens(readline):
          """Tokenize the lines returned by ``readline`` without decoding them.

          No encoding is passed on, so no ENCODING token is produced and each
          line is used exactly as ``readline`` returns it.
          """
          return _tokenize(readline, encoding=None)

      if __name__ == "__main__":

          # Quick sanity check: tokenize a small sample and print every token.

          s = b'''def parseline(self, line):

                  """Parse the line into a command name and a string containing

                  the arguments.  Returns a tuple containing (command, args, line).

                  'command' and 'args' may be None if the line couldn't be parsed.

                  """

                  line = line.strip()

                  if not line:

                      return None, None, line

                  elif line[0] == '?':

                      line = 'help ' + line[1:]

                  elif line[0] == '!':

                      if hasattr(self, 'do_shell'):

                          line = 'shell ' + line[1:]

                      else:

                          return None, None, line

                  i, n = 0, len(line)

                  while i < n and line[i] in self.identchars: i = i+1

                  cmd, arg = line[:i], line[i:].strip()

                  return cmd, arg, line

          '''

          # Keep the line terminators: tokenize() expects each line to carry its
          # newline, and without them a blank line would become b"", which is
          # the end-of-input sentinel and would cut the sample short.
          for tok in tokenize(iter(s.splitlines(True)).__next__):

              print(tok)

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

Thomas Kluyver Now include patched copies of tokenize for Python 2 and 3.	r10110	"""Patched version of standard library tokenize, to deal with various bugs.

		Based on Python 3.2 code.

		Patches:

		- Gareth Rees' patch for Python issue #12691 (untokenizing)
		- Except we don't encode the output of untokenize
		- Python 2 compatible syntax, so that it can be byte-compiled at installation
		- Newlines in comments and blank lines should be either NL or NEWLINE, depending
		on whether they are in a multi-line statement. Filed as Python issue #17061.
		- Export generate_tokens & TokenError
Thomas Kluyver Allow u and rb strings in tokenize for Python 3.3+	r10111	- u and rb literals are allowed under Python 3.3 and above.
Thomas Kluyver Now include patched copies of tokenize for Python 2 and 3.	r10110
		------------------------------------------------------------------------------
		Tokenization help for Python programs.

		tokenize(readline) is a generator that breaks a stream of bytes into
		Python tokens. It decodes the bytes according to PEP-0263 for
		determining source file encoding.

		It accepts a readline-like method which is called repeatedly to get the
		next line of input (or b"" for EOF). It generates 5-tuples with these
		members:

		the token type (see token.py)
		the token (a string)
		the starting (row, column) indices of the token (a 2-tuple of ints)
		the ending (row, column) indices of the token (a 2-tuple of ints)
		the original line (string)

		It is designed to match the working of the Python tokenizer exactly, except
		that it produces COMMENT tokens for comments and gives type OP for all
		operators. Additionally, all token lists start with an ENCODING token
		which tells you which encoding was used to decode the bytes stream.
		"""
		from __future__ import absolute_import

		__author__ = 'Ka-Ping Yee <ping@lfw.org>'
		__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
		'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
		'Michael Foord')
		import builtins
		import re
		import sys
		from token import *
		from codecs import lookup, BOM_UTF8
		import collections
		from io import TextIOWrapper
		# Matches a "coding:<name>" / "coding=<name>" declaration and captures the
		# encoding name. Written as a raw string so that \s and \w reach the regex
		# engine instead of being parsed as (invalid) string escapes.
		cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")

		# Public names: everything exported by the stdlib ``token`` module plus
		# the tokenizer-specific names defined in this file.
		import token
		__all__ = list(token.__all__)
		del token
		__all__.extend(["COMMENT", "tokenize", "detect_encoding",
		                "NL", "untokenize", "ENCODING", "TokenInfo",
		                "generate_tokens", "TokenError"])

		# Three extra token types are numbered directly after the ones imported
		# from ``token``; N_TOKENS is then advanced past them.
		COMMENT, NL, ENCODING = N_TOKENS, N_TOKENS + 1, N_TOKENS + 2
		tok_name[COMMENT] = 'COMMENT'
		tok_name[NL] = 'NL'
		tok_name[ENCODING] = 'ENCODING'
		N_TOKENS += 3

		class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
		    """Named 5-tuple ``(type, string, start, end, line)`` describing one token.

		    Behaves exactly like the underlying namedtuple; only ``repr()`` is
		    customised so that the numeric token type is shown together with its
		    symbolic name from ``tok_name``.
		    """

		    def __repr__(self):
		        # Show e.g. "1 (NAME)" instead of the bare integer type.
		        annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
		        return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
		                self._replace(type=annotated_type))

		def group(*choices):
		    """Return a regex group that matches any one of *choices*."""
		    return '(' + '|'.join(choices) + ')'


		def any(*choices):
		    """Return a regex matching zero or more repetitions of any of *choices*.

		    Deliberately keeps the module-level name ``any`` (shadowing the builtin)
		    because it is part of this module's existing namespace.
		    """
		    return group(*choices) + '*'


		def maybe(*choices):
		    """Return a regex optionally matching one of *choices*."""
		    return group(*choices) + '?'


		# Note: we use unicode matching for names ("\w") but ascii matching for
		# number literals.
		Whitespace = r'[ \f\t]*'
		Comment = r'#[^\r\n]*'
		Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
		Name = r'\w+'

		Hexnumber = r'0[xX][0-9a-fA-F]+'
		Binnumber = r'0[bB][01]+'
		Octnumber = r'0[oO][0-7]+'
		Decnumber = r'(?:0+|[1-9][0-9]*)'
		Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
		Exponent = r'[eE][-+]?[0-9]+'
		Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
		Expfloat = r'[0-9]+' + Exponent
		Floatnumber = group(Pointfloat, Expfloat)
		Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
		# Imaginary and float alternatives come first so that the longest
		# numeric form wins over a plain integer prefix.
		Number = group(Imagnumber, Floatnumber, Intnumber)

		# Compare the whole version tuple, not just ``minor``, so the check stays
		# correct for any major version.
		if sys.version_info >= (3, 3):
		    StringPrefix = r'(?:[bB][rR]?|[rR][bB]?|[uU])?'
		else:
		    StringPrefix = r'(?:[bB]?[rR]?)?'

		# Tail end of ' string.
		Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
		# Tail end of " string.
		Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
		# Tail end of ''' string.
		Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
		# Tail end of """ string.
		Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
		Triple = group(StringPrefix + "'''", StringPrefix + '"""')
		# Single-line ' or " string.
		String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
		               StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

		# Because of leftmost-then-longest match semantics, be sure to put the
		# longest operators first (e.g., if = came before ==, == would get
		# recognized as two instances of =).
		Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
		                 r"//=?", r"->",
		                 r"[+\-*/%&|^=<>]=?",
		                 r"~")

		Bracket = '[][(){}]'
		Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
		Funny = group(Operator, Bracket, Special)

		PlainToken = group(Number, Funny, String, Name)
		Token = Ignore + PlainToken

		# First (or only) line of ' or " string.
		ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
		                group("'", r'\\\r?\n'),
		                StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
		                group('"', r'\\\r?\n'))
		PseudoExtras = group(r'\\\r?\n', Comment, Triple)
		# Group 1 of PseudoToken is the token text with leading whitespace skipped.
		PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

		def _compile(expr):
		    """Compile *expr* with unicode matching; removed again once the tables are built."""
		    return re.compile(expr, re.UNICODE)

		tokenprog, pseudoprog, single3prog, double3prog = map(
		    _compile, (Token, PseudoToken, Single3, Double3))

		# Maps a string opener (or a lone prefix letter) to the compiled pattern
		# that matches the rest of that string. Prefix letters map to None so
		# that a lookup chain using ``or`` falls through to the quote character.
		endprogs = {"'": _compile(Single), '"': _compile(Double),
		            "'''": single3prog, '"""': double3prog,
		            "r'''": single3prog, 'r"""': double3prog,
		            "b'''": single3prog, 'b"""': double3prog,
		            "R'''": single3prog, 'R"""': double3prog,
		            "B'''": single3prog, 'B"""': double3prog,
		            "br'''": single3prog, 'br"""': double3prog,
		            "bR'''": single3prog, 'bR"""': double3prog,
		            "Br'''": single3prog, 'Br"""': double3prog,
		            "BR'''": single3prog, 'BR"""': double3prog,
		            'r': None, 'R': None, 'b': None, 'B': None}

		# Membership tables of every recognised triple-quote / single-quote opener.
		triple_quoted = {}
		for t in ("'''", '"""',
		          "r'''", 'r"""', "R'''", 'R"""',
		          "b'''", 'b"""', "B'''", 'B"""',
		          "br'''", 'br"""', "Br'''", 'Br"""',
		          "bR'''", 'bR"""', "BR'''", 'BR"""'):
		    triple_quoted[t] = t
		single_quoted = {}
		for t in ("'", '"',
		          "r'", 'r"', "R'", 'R"',
		          "b'", 'b"', "B'", 'B"',
		          "br'", 'br"', "Br'", 'Br"',
		          "bR'", 'bR"', "BR'", 'BR"'):
		    single_quoted[t] = t

		# Compare the whole version tuple rather than only ``minor`` so the
		# check stays correct for any major version.
		if sys.version_info >= (3, 3):
		    # Python 3.3 adds the rb/u prefixes.
		    for _prefix in ['rb', 'rB', 'Rb', 'RB', 'u', 'U']:
		        _t2 = _prefix + '"""'
		        endprogs[_t2] = double3prog
		        triple_quoted[_t2] = _t2
		        _t1 = _prefix + "'''"
		        endprogs[_t1] = single3prog
		        triple_quoted[_t1] = _t1
		        single_quoted[_prefix + '"'] = _prefix + '"'
		        single_quoted[_prefix + "'"] = _prefix + "'"
		    del _prefix, _t2, _t1
		    endprogs['u'] = None
		    endprogs['U'] = None

		del _compile

		# Column width of a tab stop, used when measuring leading whitespace.
		tabsize = 8


		class TokenError(Exception):
		    """Raised by the tokenizer when input ends inside a multi-line
		    string or a multi-line statement."""


		class StopTokenizing(Exception):
		    """Exception type kept in the module namespace; nothing in this part
		    of the module raises it."""


		class Untokenizer:
		    """Rebuild source text from a stream of tokens.

		    Pieces of output are accumulated in ``self.tokens``; ``prev_row`` and
		    ``prev_col`` track where the previously emitted token ended so that the
		    right amount of whitespace can be inserted before the next one.
		    ``encoding`` records the value of the last ENCODING token seen (it is
		    stored only; the output is not encoded here).
		    """

		    def __init__(self):
		        self.tokens = []
		        self.prev_row = 1
		        self.prev_col = 0
		        self.encoding = 'utf-8'

		    def add_whitespace(self, tok_type, start):
		        """Append the spacing needed before a token of *tok_type* at *start*.

		        *start* is a ``(row, col)`` pair. It must not lie on a row before
		        the previous token's end row (checked with ``assert``).
		        """
		        row, col = start
		        assert row >= self.prev_row
		        col_offset = col - self.prev_col
		        if col_offset > 0:
		            self.tokens.append(" " * col_offset)
		        elif row > self.prev_row and tok_type not in (NEWLINE, NL, ENDMARKER):
		            # Line was backslash-continued.
		            self.tokens.append(" ")

		    def untokenize(self, tokens):
		        """Return the source text (a str) for the iterable *tokens*.

		        Tokens with at least four fields are placed using their start/end
		        positions. As soon as a 2-tuple is met, the rest of the stream
		        (including that tuple) is handed to :meth:`compat`.
		        """
		        iterable = iter(tokens)
		        for t in iterable:
		            if len(t) == 2:
		                self.compat(t, iterable)
		                break
		            tok_type, token, start, end = t[:4]
		            if tok_type == ENCODING:
		                self.encoding = token
		                continue
		            self.add_whitespace(tok_type, start)
		            self.tokens.append(token)
		            self.prev_row, self.prev_col = end
		            if tok_type in (NEWLINE, NL):
		                # The next token starts on a fresh row at column 0.
		                self.prev_row += 1
		                self.prev_col = 0
		        return "".join(self.tokens)

		    def compat(self, token, iterable):
		        """Position-free mode: emit *token* and the rest of *iterable*.

		        Only the first two fields (type, string) of each token are used;
		        spaces are added after names/numbers and between adjacent strings,
		        and the innermost INDENT string is re-emitted at each line start.
		        """
		        # This import is here to avoid problems when the itertools
		        # module is not built yet and tokenize is imported.
		        from itertools import chain
		        startline = False
		        prevstring = False
		        indents = []
		        toks_append = self.tokens.append

		        for tok in chain([token], iterable):
		            toknum, tokval = tok[:2]
		            if toknum == ENCODING:
		                self.encoding = tokval
		                continue

		            if toknum in (NAME, NUMBER):
		                tokval += ' '

		            # Insert a space between two consecutive strings
		            if toknum == STRING:
		                if prevstring:
		                    tokval = ' ' + tokval
		                prevstring = True
		            else:
		                prevstring = False

		            if toknum == INDENT:
		                indents.append(tokval)
		                continue
		            elif toknum == DEDENT:
		                indents.pop()
		                continue
		            elif toknum in (NEWLINE, NL):
		                startline = True
		            elif startline and indents:
		                toks_append(indents[-1])
		                startline = False
		            toks_append(tokval)


		def untokenize(tokens):
		    """
		    Convert ``tokens`` (an iterable) back into Python source code. Return
		    a str; unlike the standard library version the result is not encoded,
		    so any ENCODING token only affects the ``encoding`` attribute of the
		    internal :class:`Untokenizer`.

		    The result is intended to tokenize back to match the input so that
		    the conversion is lossless and round-trips are assured. The
		    guarantee applies only to the token type and token string as the
		    spacing between tokens (column positions) may change.

		    :func:`untokenize` has two modes. If the input tokens are sequences
		    of length 2 (``type``, ``string``) then spaces are added as necessary to
		    preserve the round-trip property.

		    If the input tokens are sequences of length 4 or more (``type``,
		    ``string``, ``start``, ``end``), as returned by :func:`tokenize`, then
		    spaces are added so that each token appears in the result at the
		    position indicated by ``start`` and ``end``, if possible.
		    """
		    return Untokenizer().untokenize(tokens)


		def _get_normal_name(orig_enc):
		    """Imitates get_normal_name in tokenizer.c.

		    Returns ``"utf-8"`` or ``"iso-8859-1"`` when *orig_enc* is a spelling
		    of one of those (case-insensitively, with ``_`` treated as ``-``, and
		    allowing a ``-suffix``); any other name is returned unchanged.
		    """
		    # Only care about the first 12 characters.
		    enc = orig_enc[:12].lower().replace("_", "-")
		    if enc == "utf-8" or enc.startswith("utf-8-"):
		        return "utf-8"
		    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
		       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
		        return "iso-8859-1"
		    return orig_enc

		def detect_encoding(readline):
		    """
		    The detect_encoding() function is used to detect the encoding that should
		    be used to decode a Python source file. It requires one argument, readline,
		    in the same way as the tokenize() generator.

		    It will call readline a maximum of twice, and return the encoding used
		    (as a string) and a list of any lines (left as bytes) it has read in.

		    It detects the encoding from the presence of a utf-8 bom or an encoding
		    cookie as specified in pep-0263. If both a bom and a cookie are present,
		    but disagree, a SyntaxError will be raised. If the encoding cookie is an
		    invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,
		    'utf-8-sig' is returned.

		    If no encoding is specified, then the default of 'utf-8' will be returned.
		    """
		    bom_found = False
		    encoding = None
		    default = 'utf-8'

		    def read_or_stop():
		        # A readline that signals EOF with StopIteration is treated the
		        # same as one that returns an empty bytes object.
		        try:
		            return readline()
		        except StopIteration:
		            return b''

		    def find_cookie(line):
		        # Returns the normalised encoding declared on *line*, or None.
		        try:
		            # Decode as UTF-8. Either the line is an encoding declaration,
		            # in which case it should be pure ASCII, or it must be UTF-8
		            # per default encoding.
		            line_string = line.decode('utf-8')
		        except UnicodeDecodeError:
		            raise SyntaxError("invalid or missing encoding declaration")

		        matches = cookie_re.findall(line_string)
		        if not matches:
		            return None
		        encoding = _get_normal_name(matches[0])
		        try:
		            # Called only to validate the name; the codec is not needed.
		            lookup(encoding)
		        except LookupError:
		            # This behaviour mimics the Python interpreter
		            raise SyntaxError("unknown encoding: " + encoding)

		        if bom_found:
		            if encoding != 'utf-8':
		                # This behaviour mimics the Python interpreter
		                raise SyntaxError('encoding problem: utf-8')
		            encoding += '-sig'
		        return encoding

		    first = read_or_stop()
		    if first.startswith(BOM_UTF8):
		        bom_found = True
		        first = first[3:]
		        default = 'utf-8-sig'
		    if not first:
		        return default, []

		    encoding = find_cookie(first)
		    if encoding:
		        return encoding, [first]

		    second = read_or_stop()
		    if not second:
		        return default, [first]

		    encoding = find_cookie(second)
		    if encoding:
		        return encoding, [first, second]

		    return default, [first, second]


		def open(filename):
		    """Open a file in read only mode using the encoding detected by
		    detect_encoding().

		    Returns a line-buffered TextIOWrapper positioned at the start of the
		    file, with its ``mode`` attribute set to ``'r'``. If detecting the
		    encoding or building the wrapper fails, the underlying binary file is
		    closed before the exception propagates.
		    """
		    buffer = builtins.open(filename, 'rb')
		    try:
		        encoding = detect_encoding(buffer.readline)[0]
		        # detect_encoding consumed up to two lines; rewind so the text
		        # wrapper sees the whole file.
		        buffer.seek(0)
		        text = TextIOWrapper(buffer, encoding, line_buffering=True)
		        text.mode = 'r'
		        return text
		    except BaseException:
		        # Do not leak the file handle on failure; re-raise unchanged.
		        buffer.close()
		        raise


		def tokenize(readline):
		    """
		    The tokenize() generator requires one argument, readline, which
		    must be a callable object which provides the same interface as the
		    readline() method of built-in file objects. Each call to the function
		    should return one line of input as bytes. Alternately, readline
		    can be a callable function terminating with StopIteration:
		        readline = open(myfile, 'rb').__next__  # Example of alternate readline

		    The generator produces 5-tuples with these members: the token type; the
		    token string; a 2-tuple (srow, scol) of ints specifying the row and
		    column where the token begins in the source; a 2-tuple (erow, ecol) of
		    ints specifying the row and column where the token ends in the source;
		    and the line on which the token was found. The line passed is the
		    logical line; continuation lines are included.

		    The first token sequence will always be an ENCODING token
		    which tells you which encoding was used to decode the bytes stream.
		    """
		    # This import is here to avoid problems when the itertools module is not
		    # built yet and tokenize is imported.
		    from itertools import chain, repeat
		    encoding, consumed = detect_encoding(readline)
		    # Replay the lines detect_encoding already read, then the remaining
		    # input, then an endless supply of b"" so that EOF can be read repeatedly.
		    rl_gen = iter(readline, b"")
		    empty = repeat(b"")
		    return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding)


		def _tokenize(readline, encoding):
		    """Generate TokenInfo tuples for the lines produced by *readline*.

		    *readline* is called once per line; an empty result (or StopIteration)
		    means end of input. When *encoding* is not None each line is decoded
		    with it and an ENCODING token is yielded first ("utf-8-sig" is reported
		    as "utf-8"); when it is None the lines are used as returned.

		    Raises TokenError if input ends inside a multi-line string or
		    statement, and IndentationError on a dedent that matches no enclosing
		    indentation level.
		    """
		    lnum = parenlev = continued = 0
		    numchars = '0123456789'
		    contstr, needcont = '', 0
		    contline = None
		    indents = [0]

		    if encoding is not None:
		        if encoding == "utf-8-sig":
		            # BOM will already have been stripped.
		            encoding = "utf-8"
		        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
		    while True:             # loop over lines in stream
		        try:
		            line = readline()
		        except StopIteration:
		            line = b''

		        if encoding is not None:
		            line = line.decode(encoding)
		        lnum += 1
		        pos, line_len = 0, len(line)

		        if contstr:                            # continued string
		            if not line:
		                raise TokenError("EOF in multi-line string", strstart)
		            endmatch = endprog.match(line)
		            if endmatch:
		                pos = end = endmatch.end(0)
		                yield TokenInfo(STRING, contstr + line[:end],
		                                strstart, (lnum, end), contline + line)
		                contstr, needcont = '', 0
		                contline = None
		            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
		                # A single-quoted string was continued with a backslash
		                # but this line neither closes nor continues it.
		                yield TokenInfo(ERRORTOKEN, contstr + line,
		                                strstart, (lnum, len(line)), contline)
		                contstr = ''
		                contline = None
		                continue
		            else:
		                contstr = contstr + line
		                contline = contline + line
		                continue

		        elif parenlev == 0 and not continued:  # new statement
		            if not line:
		                break
		            column = 0
		            while pos < line_len:              # measure leading whitespace
		                if line[pos] == ' ':
		                    column += 1
		                elif line[pos] == '\t':
		                    column = (column//tabsize + 1)*tabsize
		                elif line[pos] == '\f':
		                    column = 0
		                else:
		                    break
		                pos += 1
		            if pos == line_len:
		                break

		            if line[pos] in '#\r\n':           # skip comments or blank lines
		                # At statement level these end with a NEWLINE token.
		                if line[pos] == '#':
		                    comment_token = line[pos:].rstrip('\r\n')
		                    nl_pos = pos + len(comment_token)
		                    yield TokenInfo(COMMENT, comment_token,
		                                    (lnum, pos), (lnum, pos + len(comment_token)), line)
		                    yield TokenInfo(NEWLINE, line[nl_pos:],
		                                    (lnum, nl_pos), (lnum, len(line)), line)
		                else:
		                    yield TokenInfo(NEWLINE, line[pos:],
		                                    (lnum, pos), (lnum, len(line)), line)
		                continue

		            if column > indents[-1]:           # count indents or dedents
		                indents.append(column)
		                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
		            while column < indents[-1]:
		                if column not in indents:
		                    raise IndentationError(
		                        "unindent does not match any outer indentation level",
		                        ("<tokenize>", lnum, pos, line))
		                indents.pop()
		                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)

		        else:                                  # continued statement
		            if not line:
		                raise TokenError("EOF in multi-line statement", (lnum, 0))
		            continued = 0

		        while pos < line_len:
		            pseudomatch = pseudoprog.match(line, pos)
		            if pseudomatch:                                # scan for tokens
		                start, end = pseudomatch.span(1)
		                spos, epos, pos = (lnum, start), (lnum, end), end
		                token, initial = line[start:end], line[start]

		                if (initial in numchars or                 # ordinary number
		                        (initial == '.' and token != '.' and token != '...')):
		                    yield TokenInfo(NUMBER, token, spos, epos, line)
		                elif initial in '\r\n':
		                    # Inside brackets a line break does not end the statement.
		                    yield TokenInfo(NL if parenlev > 0 else NEWLINE,
		                                    token, spos, epos, line)
		                elif initial == '#':
		                    assert not token.endswith("\n")
		                    yield TokenInfo(COMMENT, token, spos, epos, line)
		                elif token in triple_quoted:
		                    endprog = endprogs[token]
		                    endmatch = endprog.match(line, pos)
		                    if endmatch:                           # all on one line
		                        pos = endmatch.end(0)
		                        token = line[start:pos]
		                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
		                    else:
		                        strstart = (lnum, start)           # multiple lines
		                        contstr = line[start:]
		                        contline = line
		                        break
		                elif initial in single_quoted or \
		                        token[:2] in single_quoted or \
		                        token[:3] in single_quoted:
		                    if token[-1] == '\n':                  # continued string
		                        strstart = (lnum, start)
		                        # Prefix letters map to None, so this picks the
		                        # pattern registered for the quote character.
		                        endprog = (endprogs[initial] or endprogs[token[1]] or
		                                   endprogs[token[2]])
		                        contstr, needcont = line[start:], 1
		                        contline = line
		                        break
		                    else:                                  # ordinary string
		                        yield TokenInfo(STRING, token, spos, epos, line)
		                elif initial.isidentifier():               # ordinary name
		                    yield TokenInfo(NAME, token, spos, epos, line)
		                elif initial == '\\':                      # continued stmt
		                    continued = 1
		                else:
		                    if initial in '([{':
		                        parenlev += 1
		                    elif initial in ')]}':
		                        parenlev -= 1
		                    yield TokenInfo(OP, token, spos, epos, line)
		            else:
		                yield TokenInfo(ERRORTOKEN, line[pos],
		                                (lnum, pos), (lnum, pos+1), line)
		                pos += 1

		    for _ in indents[1:]:                 # pop remaining indent levels
		        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
		    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')


		# An undocumented, backwards compatible, API for all the places in the standard
		# library that expect to be able to use tokenize with strings
		def generate_tokens(readline):
		    """Tokenize lines that *readline* returns as str rather than bytes.

		    Delegates to ``_tokenize`` with ``encoding=None``, so no decoding is
		    performed and no ENCODING token is produced.
		    """
		    return _tokenize(readline, None)

		if __name__ == "__main__":
		    # Quick sanity check: tokenize a small sample function and print
		    # every token. The sample is built from explicit byte lines so its
		    # indentation does not depend on how this file is laid out.
		    s = (
		        b'def parseline(self, line):\n'
		        b'    """Parse the line into a command name and a string containing\n'
		        b'    the arguments. Returns a tuple containing (command, args, line).\n'
		        b"    'command' and 'args' may be None if the line couldn't be parsed.\n"
		        b'    """\n'
		        b'    line = line.strip()\n'
		        b'    if not line:\n'
		        b'        return None, None, line\n'
		        b"    elif line[0] == '?':\n"
		        b"        line = 'help ' + line[1:]\n"
		        b"    elif line[0] == '!':\n"
		        b"        if hasattr(self, 'do_shell'):\n"
		        b"            line = 'shell ' + line[1:]\n"
		        b'        else:\n'
		        b'            return None, None, line\n'
		        b'    i, n = 0, len(line)\n'
		        b'    while i < n and line[i] in self.identchars: i = i+1\n'
		        b'    cmd, arg = line[:i], line[i:].strip()\n'
		        b'    return cmd, arg, line\n'
		    )
		    # keepends=True: the tokenizer needs the line terminators to emit
		    # NEWLINE tokens and to keep multi-line strings intact.
		    for tok in tokenize(iter(s.splitlines(True)).__next__):
		        print(tok)