upstream/ipython Files · IPython/utils/_tokenize_py2.py

use Tuple, not nonexistent CTuple

Thomas Kluyver - - Load All Authors

File last commit:

r13349:6edc3771


                r15471:a5e8082e

Download file

             _tokenize_py2.py
        
                    439 lines
            
             | 16.8 KiB
            
                | text/x-python
            
             |
                PythonLexer
            
             / IPython / utils / _tokenize_py2.py
          
                    History
                
                 |
                  Source
                 | Raw
                 |Copy content
                 |Copy permalink

        Thomas Kluyver
    
Now include patched copies of tokenize for Python 2 and 3.

              r10110
            
      """Patched version of standard library tokenize, to deal with various bugs.

      Patches

      - Relevant parts of Gareth Rees' patch for Python issue #12691 (untokenizing),

        manually applied.

      - Newlines in comments and blank lines should be either NL or NEWLINE, depending

        on whether they are in a multi-line statement. Filed as Python issue #17061.

      -------------------------------------------------------------------------------

      Tokenization help for Python programs.

      generate_tokens(readline) is a generator that breaks a stream of

      text into Python tokens.  It accepts a readline-like method which is called

      repeatedly to get the next line of input (or "" for EOF).  It generates

      5-tuples with these members:

          the token type (see token.py)

          the token (a string)

          the starting (row, column) indices of the token (a 2-tuple of ints)

          the ending (row, column) indices of the token (a 2-tuple of ints)

          the original line (string)

      It is designed to match the working of the Python tokenizer exactly, except

      that it produces COMMENT tokens for comments and gives type OP for all

      operators

      Older entry points

          tokenize_loop(readline, tokeneater)

          tokenize(readline, tokeneater=printtoken)

      are the same, except instead of generating tokens, tokeneater is a callback

      function to which the 5 fields described above are passed as 5 arguments,

      each time a new token is found."""

        Thomas Kluyver
    
Convert print statements to print function calls...

              r13348
            
      from __future__ import print_function

        Thomas Kluyver
    
Now include patched copies of tokenize for Python 2 and 3.

              r10110
            
      __author__ = 'Ka-Ping Yee <ping@lfw.org>'

      __credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '

                     'Skip Montanaro, Raymond Hettinger')

      import string, re

      from token import *

      import token

      __all__ = [x for x in dir(token) if not x.startswith("_")]

      __all__ += ["COMMENT", "tokenize", "generate_tokens", "NL", "untokenize"]

      del x

      del token

      __all__ += ["TokenError"]

      # Register two token types on top of those imported from the token module:
      # COMMENT for comments and NL for line breaks that do not end a statement.
      # They take the next two free token numbers.
      COMMENT = N_TOKENS

      tok_name[COMMENT] = 'COMMENT'

      NL = N_TOKENS + 1

      tok_name[NL] = 'NL'

      # Keep the token count in step with the two additions above.
      N_TOKENS += 2

      def group(*choices):
          """Combine the given regex fragments into one parenthesised alternation."""
          alternation = '|'.join(choices)
          return '(' + alternation + ')'

      # NOTE: this name shadows the builtin any() for the rest of the module.
      def any(*choices):
          """Return a pattern matching zero or more repetitions of group(*choices)."""
          return ''.join((group(*choices), '*'))

      def maybe(*choices):
          """Return a pattern matching an optional occurrence of group(*choices)."""
          return ''.join((group(*choices), '?'))

      Whitespace = r'[ \f\t]*'

      Comment = r'#[^\r\n]*'

      Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)

      Name = r'[a-zA-Z_]\w*'

      Hexnumber = r'0[xX][\da-fA-F]+[lL]?'

      Octnumber = r'(0[oO][0-7]+)|(0[0-7]*)[lL]?'

      Binnumber = r'0[bB][01]+[lL]?'

      Decnumber = r'[1-9]\d*[lL]?'

      Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)

      Exponent = r'[eE][-+]?\d+'

      Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)

      Expfloat = r'\d+' + Exponent

      Floatnumber = group(Pointfloat, Expfloat)

      Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')

      Number = group(Imagnumber, Floatnumber, Intnumber)

      # Tail end of ' string.

      Single = r"[^'\\]*(?:\\.[^'\\]*)*'"

      # Tail end of " string.

      Double = r'[^"\\]*(?:\\.[^"\\]*)*"'

      # Tail end of ''' string.

      Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"

      # Tail end of """ string.

      Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'

      Triple = group("[uUbB]?[rR]?'''", '[uUbB]?[rR]?"""')

      # Single-line ' or " string.

      String = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",

                     r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

      # Because of leftmost-then-longest match semantics, be sure to put the

      # longest operators first (e.g., if = came before ==, == would get

      # recognized as two instances of =).

      Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",

                       r"//=?",

                       r"[+\-*/%&|^=<>]=?",

                       r"~")

      Bracket = '[][(){}]'

      Special = group(r'\r?\n', r'[:;.,`@]')

      Funny = group(Operator, Bracket, Special)

      PlainToken = group(Number, Funny, String, Name)

      Token = Ignore + PlainToken

      # First (or only) line of ' or " string.

      ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +

                      group("'", r'\\\r?\n'),

                      r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +

                      group('"', r'\\\r?\n'))

      PseudoExtras = group(r'\\\r?\n', Comment, Triple)

      PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

      # Compile the token patterns once at import time.  single3prog and
      # double3prog match the tail end of a ''' and a """ string respectively.
      tokenprog, pseudoprog, single3prog, double3prog = map(

          re.compile, (Token, PseudoToken, Single3, Double3))

      # endprogs maps a string opener (optional prefix letters followed by the
      # quote characters) to the compiled pattern matching the rest of that
      # string.  Bare prefix letters map to None, so a lookup chain of the form
      # ``endprogs[a] or endprogs[b] or endprogs[c]`` skips the prefix letters
      # and settles on the entry for the quote character itself.
      endprogs = {"'": re.compile(Single), '"': re.compile(Double),

                  "'''": single3prog, '"""': double3prog,

                  "r'''": single3prog, 'r"""': double3prog,

                  "u'''": single3prog, 'u"""': double3prog,

                  "ur'''": single3prog, 'ur"""': double3prog,

                  "R'''": single3prog, 'R"""': double3prog,

                  "U'''": single3prog, 'U"""': double3prog,

                  "uR'''": single3prog, 'uR"""': double3prog,

                  "Ur'''": single3prog, 'Ur"""': double3prog,

                  "UR'''": single3prog, 'UR"""': double3prog,

                  "b'''": single3prog, 'b"""': double3prog,

                  "br'''": single3prog, 'br"""': double3prog,

                  "B'''": single3prog, 'B"""': double3prog,

                  "bR'''": single3prog, 'bR"""': double3prog,

                  "Br'''": single3prog, 'Br"""': double3prog,

                  "BR'''": single3prog, 'BR"""': double3prog,

                  'r': None, 'R': None, 'u': None, 'U': None,

                  'b': None, 'B': None}

      # Lookup table of every triple-quote string opener, with each optional
      # u/U/b/B and r/R prefix spelling; each key maps to itself.  The tokenizer
      # only tests membership (``token in triple_quoted``).
      triple_quoted = {}

      for t in ("'''", '"""',

                "r'''", 'r"""', "R'''", 'R"""',

                "u'''", 'u"""', "U'''", 'U"""',

                "ur'''", 'ur"""', "Ur'''", 'Ur"""',

                "uR'''", 'uR"""', "UR'''", 'UR"""',

                "b'''", 'b"""', "B'''", 'B"""',

                "br'''", 'br"""', "Br'''", 'Br"""',

                "bR'''", 'bR"""', "BR'''", 'BR"""'):

          triple_quoted[t] = t

      # Lookup table of every single-quote string opener, with each optional
      # u/U/b/B and r/R prefix spelling; each key maps to itself.  The tokenizer
      # tests the first one, two or three characters of a token against it.
      single_quoted = {}

      for t in ("'", '"',

                "r'", 'r"', "R'", 'R"',

                "u'", 'u"', "U'", 'U"',

                "ur'", 'ur"', "Ur'", 'Ur"',

                "uR'", 'uR"', "UR'", 'UR"',

                "b'", 'b"', "B'", 'B"',

                "br'", 'br"', "Br'", 'Br"',

                "bR'", 'bR"', "BR'", 'BR"' ):

          single_quoted[t] = t

      tabsize = 8

      class TokenError(Exception):
          """Raised by generate_tokens() when the input ends inside a multi-line
          string or a multi-line statement."""

      class StopTokenizing(Exception):
          """Exception that tokenize() catches and discards.

          Nothing in this module raises it; presumably it is meant for tokeneater
          callbacks that want to end tokenizing early.
          """

      def printtoken(type, token, srow_scol, erow_ecol, line): # for testing

          srow, scol = srow_scol

          erow, ecol = erow_ecol

        Thomas Kluyver
    
Convert print statements to print function calls...

              r13348
            
          print("%d,%d-%d,%d:\t%s\t%s" % \

              (srow, scol, erow, ecol, tok_name[type], repr(token)))

        Thomas Kluyver
    
Now include patched copies of tokenize for Python 2 and 3.

              r10110
            
      def tokenize(readline, tokeneater=printtoken):
          """Tokenize the input behind *readline*, handing each token to *tokeneater*.

          readline   -- callable offering the same interface as the readline()
                        method of built-in file objects; each call should return
                        one line of input as a string.
          tokeneater -- callable invoked once per token with five arguments,
                        matching the tuples produced by generate_tokens().
                        Defaults to printtoken.

          A StopTokenizing exception raised while tokenizing is absorbed here and
          simply ends the run.
          """
          try:
              tokenize_loop(readline, tokeneater)
          except StopTokenizing:
              # Early stop: nothing more to do.
              return

      # backwards compatible interface

      def tokenize_loop(readline, tokeneater):
          """Call tokeneater(*fields) for every token generate_tokens() yields
          from *readline*; exceptions are not handled here."""
          token_stream = generate_tokens(readline)
          for fields in token_stream:
              tokeneater(*fields)

      class Untokenizer:
          """Rebuild source text from a stream of tokens.

          State carried from one token to the next:
            tokens   -- list of text pieces, joined when untokenize() returns
            prev_row -- row on which the next token is expected to start
            prev_col -- column at which the previous token ended
          """

          def __init__(self):
              self.tokens = []
              self.prev_row = 1
              self.prev_col = 0

          def add_whitespace(self, start, tok_type=None):
              """Append the blanks that separate the previous token from the next.

              start    -- (row, col) position at which the next token begins
              tok_type -- type of the next token.  Optional so that existing
                          one-argument calls keep working; when omitted, a token
                          starting on a later row is treated as the continuation
                          of a backslash-continued line.

              Raises AssertionError if *start* is on a row before prev_row.
              """
              row, col = start
              assert row >= self.prev_row
              col_offset = col - self.prev_col
              if col_offset > 0:
                  self.tokens.append(" " * col_offset)
              elif row > self.prev_row and tok_type not in (NEWLINE, NL, ENDMARKER):
                  # Line was backslash-continued
                  self.tokens.append(" ")

          def untokenize(self, tokens):
              """Return the source text for *tokens*.

              Tokens of at least four fields (type, string, start, end) are laid
              out using their positions.  As soon as a two-field token is met,
              that token and all remaining ones are handed to compat().
              """
              iterable = iter(tokens)
              for t in iterable:
                  if len(t) == 2:
                      # No position information: compat() consumes the rest.
                      self.compat(t, iterable)
                      break
                  tok_type, token, start, end = t[:4]
                  # The token type is needed to tell a backslash continuation
                  # from an ordinary line break.
                  self.add_whitespace(start, tok_type)
                  self.tokens.append(token)
                  self.prev_row, self.prev_col = end
                  if tok_type in (NEWLINE, NL):
                      # The next token starts at the beginning of the next row.
                      self.prev_row += 1
                      self.prev_col = 0
              return "".join(self.tokens)

          def compat(self, token, iterable):
              """Append text for (type, string) tokens that carry no positions.

              token    -- the first two-field token, already taken from iterable
              iterable -- iterator over the remaining tokens

              Spacing is approximated: a blank follows every NAME and NUMBER, and
              a blank is put between consecutive STRING tokens.  INDENT/DEDENT
              tokens maintain a stack whose top is written at the start of each
              line.
              """
              # This import is here to avoid problems when the itertools
              # module is not built yet and tokenize is imported.
              from itertools import chain
              startline = False
              prevstring = False
              indents = []
              toks_append = self.tokens.append
              for tok in chain([token], iterable):
                  toknum, tokval = tok[:2]

                  if toknum in (NAME, NUMBER):
                      tokval += ' '

                  # Insert a space between two consecutive strings
                  if toknum == STRING:
                      if prevstring:
                          tokval = ' ' + tokval
                      prevstring = True
                  else:
                      prevstring = False

                  if toknum == INDENT:
                      indents.append(tokval)
                      continue
                  elif toknum == DEDENT:
                      indents.pop()
                      continue
                  elif toknum in (NEWLINE, NL):
                      startline = True
                  elif startline and indents:
                      # First token of a line: emit the current indentation.
                      toks_append(indents[-1])
                      startline = False
                  toks_append(tokval)

      def untokenize(iterable):
          """Convert a sequence of tokens back into Python source text.

          Every item produced by *iterable* must be a token sequence of at least
          two elements: the token number and the token value.  When the items
          carry only those two elements, the resulting output is poor.

          Round-trip invariant for full input:
              Untokenized source will match input source exactly

          Round-trip invariant for limited input:
              # Output text will tokenize back to the input
              t1 = [tok[:2] for tok in generate_tokens(f.readline)]
              newcode = untokenize(t1)
              readline = iter(newcode.splitlines(1)).next
              t2 = [tok[:2] for tok in generate_tokens(readline)]
              assert t1 == t2
          """
          return Untokenizer().untokenize(iterable)

      def generate_tokens(readline):

          """

          The generate_tokens() generator requires one argument, readline, which

          must be a callable object which provides the same interface as the

          readline() method of built-in file objects. Each call to the function

          should return one line of input as a string.  Alternately, readline

          can be a callable function terminating with StopIteration:

              readline = open(myfile).next    # Example of alternate readline

          The generator produces 5-tuples with these members: the token type; the

          token string; a 2-tuple (srow, scol) of ints specifying the row and

          column where the token begins in the source; a 2-tuple (erow, ecol) of

          ints specifying the row and column where the token ends in the source;

          and the line on which the token was found. The line passed is the

          logical line; continuation lines are included.

          """

          lnum = parenlev = continued = 0

          namechars, numchars = string.ascii_letters + '_', '0123456789'

          contstr, needcont = '', 0

          contline = None

          indents = [0]

          while 1:                                   # loop over lines in stream

              try:

                  line = readline()

              except StopIteration:

                  line = ''

              lnum += 1

              pos, max = 0, len(line)

              if contstr:                            # continued string

                  if not line:

        Thomas Kluyver
    
Make raise statements Python 3 compatible....

              r13349
            
                      raise TokenError("EOF in multi-line string", strstart)

        Thomas Kluyver
    
Now include patched copies of tokenize for Python 2 and 3.

              r10110
            
                  endmatch = endprog.match(line)

                  if endmatch:

                      pos = end = endmatch.end(0)

                      yield (STRING, contstr + line[:end],

                             strstart, (lnum, end), contline + line)

                      contstr, needcont = '', 0

                      contline = None

                  elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':

                      yield (ERRORTOKEN, contstr + line,

                                 strstart, (lnum, len(line)), contline)

                      contstr = ''

                      contline = None

                      continue

                  else:

                      contstr = contstr + line

                      contline = contline + line

                      continue

              elif parenlev == 0 and not continued:  # new statement

                  if not line: break

                  column = 0

                  while pos < max:                   # measure leading whitespace

                      if line[pos] == ' ':

                          column += 1

                      elif line[pos] == '\t':

                          column = (column//tabsize + 1)*tabsize

                      elif line[pos] == '\f':

                          column = 0

                      else:

                          break

                      pos += 1

                  if pos == max:

                      break

                  if line[pos] in '#\r\n':           # skip comments or blank lines

                      if line[pos] == '#':

                          comment_token = line[pos:].rstrip('\r\n')

                          nl_pos = pos + len(comment_token)

                          yield (COMMENT, comment_token,

                                 (lnum, pos), (lnum, pos + len(comment_token)), line)

                          yield (NEWLINE, line[nl_pos:],

                                 (lnum, nl_pos), (lnum, len(line)), line)

                      else:

                          yield (NEWLINE, line[pos:],

                                 (lnum, pos), (lnum, len(line)), line)

                      continue

                  if column > indents[-1]:           # count indents or dedents

                      indents.append(column)

                      yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)

                  while column < indents[-1]:

                      if column not in indents:

                          raise IndentationError(

                              "unindent does not match any outer indentation level",

                              ("<tokenize>", lnum, pos, line))

                      indents = indents[:-1]

                      yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

              else:                                  # continued statement

                  if not line:

        Thomas Kluyver
    
Make raise statements Python 3 compatible....

              r13349
            
                      raise TokenError("EOF in multi-line statement", (lnum, 0))

        Thomas Kluyver
    
Now include patched copies of tokenize for Python 2 and 3.

              r10110
            
                  continued = 0

              while pos < max:

                  pseudomatch = pseudoprog.match(line, pos)

                  if pseudomatch:                                # scan for tokens

                      start, end = pseudomatch.span(1)

                      spos, epos, pos = (lnum, start), (lnum, end), end

                      token, initial = line[start:end], line[start]

                      if initial in numchars or \

                         (initial == '.' and token != '.'):      # ordinary number

                          yield (NUMBER, token, spos, epos, line)

                      elif initial in '\r\n':

                          yield (NL if parenlev > 0 else NEWLINE,

                                 token, spos, epos, line)

                      elif initial == '#':

                          assert not token.endswith("\n")

                          yield (COMMENT, token, spos, epos, line)

                      elif token in triple_quoted:

                          endprog = endprogs[token]

                          endmatch = endprog.match(line, pos)

                          if endmatch:                           # all on one line

                              pos = endmatch.end(0)

                              token = line[start:pos]

                              yield (STRING, token, spos, (lnum, pos), line)

                          else:

                              strstart = (lnum, start)           # multiple lines

                              contstr = line[start:]

                              contline = line

                              break

                      elif initial in single_quoted or \

                          token[:2] in single_quoted or \

                          token[:3] in single_quoted:

                          if token[-1] == '\n':                  # continued string

                              strstart = (lnum, start)

                              endprog = (endprogs[initial] or endprogs[token[1]] or

                                         endprogs[token[2]])

                              contstr, needcont = line[start:], 1

                              contline = line

                              break

                          else:                                  # ordinary string

                              yield (STRING, token, spos, epos, line)

                      elif initial in namechars:                 # ordinary name

                          yield (NAME, token, spos, epos, line)

                      elif initial == '\\':                      # continued stmt

                          continued = 1

                      else:

                          if initial in '([{':

                              parenlev += 1

                          elif initial in ')]}':

                              parenlev -= 1

                          yield (OP, token, spos, epos, line)

                  else:

                      yield (ERRORTOKEN, line[pos],

                                 (lnum, pos), (lnum, pos+1), line)

                      pos += 1

          for indent in indents[1:]:                 # pop remaining indent levels

              yield (DEDENT, '', (lnum, 0), (lnum, 0), '')

          yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

      if __name__ == '__main__':                     # testing
          # Print the tokens of the file named on the command line, or of
          # standard input when no file name is given.
          import sys
          if len(sys.argv) > 1:
              # Close the source file once tokenizing finishes or fails.
              with open(sys.argv[1]) as source:
                  tokenize(source.readline)
          else:
              tokenize(sys.stdin.readline)

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

Thomas Kluyver Now include patched copies of tokenize for Python 2 and 3.	r10110	"""Patched version of standard library tokenize, to deal with various bugs.

		Patches

		- Relevant parts of Gareth Rees' patch for Python issue #12691 (untokenizing),
		manually applied.
		- Newlines in comments and blank lines should be either NL or NEWLINE, depending
		on whether they are in a multi-line statement. Filed as Python issue #17061.

		-------------------------------------------------------------------------------
		Tokenization help for Python programs.

		generate_tokens(readline) is a generator that breaks a stream of
		text into Python tokens. It accepts a readline-like method which is called
		repeatedly to get the next line of input (or "" for EOF). It generates
		5-tuples with these members:

		the token type (see token.py)
		the token (a string)
		the starting (row, column) indices of the token (a 2-tuple of ints)
		the ending (row, column) indices of the token (a 2-tuple of ints)
		the original line (string)

		It is designed to match the working of the Python tokenizer exactly, except
		that it produces COMMENT tokens for comments and gives type OP for all
		operators

		Older entry points
		tokenize_loop(readline, tokeneater)
		tokenize(readline, tokeneater=printtoken)
		are the same, except instead of generating tokens, tokeneater is a callback
		function to which the 5 fields described above are passed as 5 arguments,
		each time a new token is found."""
Thomas Kluyver Convert print statements to print function calls...	r13348	from __future__ import print_function
Thomas Kluyver Now include patched copies of tokenize for Python 2 and 3.	r10110
		__author__ = 'Ka-Ping Yee <ping@lfw.org>'
		__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
		'Skip Montanaro, Raymond Hettinger')

		import string, re
		from token import *

		import token
		__all__ = [x for x in dir(token) if not x.startswith("_")]
		__all__ += ["COMMENT", "tokenize", "generate_tokens", "NL", "untokenize"]
		del x
		del token

		__all__ += ["TokenError"]

		COMMENT = N_TOKENS
		tok_name[COMMENT] = 'COMMENT'
		NL = N_TOKENS + 1
		tok_name[NL] = 'NL'
		N_TOKENS += 2

		def group(*choices): return '(' + '\|'.join(choices) + ')'
		def any(choices): return group(choices) + '*'
		def maybe(choices): return group(choices) + '?'

		Whitespace = r'[ \f\t]*'
		Comment = r'#[^\r\n]*'
		Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
		Name = r'[a-zA-Z_]\w*'

		Hexnumber = r'0[xX][\da-fA-F]+[lL]?'
		Octnumber = r'(0[oO][0-7]+)\|(0[0-7]*)[lL]?'
		Binnumber = r'0[bB][01]+[lL]?'
		Decnumber = r'[1-9]\d*[lL]?'
		Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
		Exponent = r'[eE][-+]?\d+'
		Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
		Expfloat = r'\d+' + Exponent
		Floatnumber = group(Pointfloat, Expfloat)
		Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
		Number = group(Imagnumber, Floatnumber, Intnumber)

		# Tail end of ' string.
		Single = r"[^'\\](?:\\.[^'\\])*'"
		# Tail end of " string.
		Double = r'[^"\\](?:\\.[^"\\])*"'
		# Tail end of ''' string.
		Single3 = r"[^'\\](?:(?:\\.\|'(?!''))[^'\\])*'''"
		# Tail end of """ string.
		Double3 = r'[^"\\](?:(?:\\.\|"(?!""))[^"\\])*"""'
		Triple = group("[uUbB]?[rR]?'''", '[uUbB]?[rR]?"""')
		# Single-line ' or " string.
		String = group(r"[uUbB]?[rR]?'[^\n'\\](?:\\.[^\n'\\])*'",
		r'[uUbB]?[rR]?"[^\n"\\](?:\\.[^\n"\\])*"')

		# Because of leftmost-then-longest match semantics, be sure to put the
		# longest operators first (e.g., if = came before ==, == would get
		# recognized as two instances of =).
		Operator = group(r"\\=?", r">>=?", r"<<=?", r"<>", r"!=",
		r"//=?",
		r"[+\-*/%&\|^=<>]=?",
		r"~")

		Bracket = '[][(){}]'
		Special = group(r'\r?\n', r'[:;.,`@]')
		Funny = group(Operator, Bracket, Special)

		PlainToken = group(Number, Funny, String, Name)
		Token = Ignore + PlainToken

		# First (or only) line of ' or " string.
		ContStr = group(r"[uUbB]?[rR]?'[^\n'\\](?:\\.[^\n'\\])*" +
		group("'", r'\\\r?\n'),
		r'[uUbB]?[rR]?"[^\n"\\](?:\\.[^\n"\\])*' +
		group('"', r'\\\r?\n'))
		PseudoExtras = group(r'\\\r?\n', Comment, Triple)
		PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

		tokenprog, pseudoprog, single3prog, double3prog = map(
		re.compile, (Token, PseudoToken, Single3, Double3))
		endprogs = {"'": re.compile(Single), '"': re.compile(Double),
		"'''": single3prog, '"""': double3prog,
		"r'''": single3prog, 'r"""': double3prog,
		"u'''": single3prog, 'u"""': double3prog,
		"ur'''": single3prog, 'ur"""': double3prog,
		"R'''": single3prog, 'R"""': double3prog,
		"U'''": single3prog, 'U"""': double3prog,
		"uR'''": single3prog, 'uR"""': double3prog,
		"Ur'''": single3prog, 'Ur"""': double3prog,
		"UR'''": single3prog, 'UR"""': double3prog,
		"b'''": single3prog, 'b"""': double3prog,
		"br'''": single3prog, 'br"""': double3prog,
		"B'''": single3prog, 'B"""': double3prog,
		"bR'''": single3prog, 'bR"""': double3prog,
		"Br'''": single3prog, 'Br"""': double3prog,
		"BR'''": single3prog, 'BR"""': double3prog,
		'r': None, 'R': None, 'u': None, 'U': None,
		'b': None, 'B': None}

		triple_quoted = {}
		for t in ("'''", '"""',
		"r'''", 'r"""', "R'''", 'R"""',
		"u'''", 'u"""', "U'''", 'U"""',
		"ur'''", 'ur"""', "Ur'''", 'Ur"""',
		"uR'''", 'uR"""', "UR'''", 'UR"""',
		"b'''", 'b"""', "B'''", 'B"""',
		"br'''", 'br"""', "Br'''", 'Br"""',
		"bR'''", 'bR"""', "BR'''", 'BR"""'):
		triple_quoted[t] = t
		single_quoted = {}
		for t in ("'", '"',
		"r'", 'r"', "R'", 'R"',
		"u'", 'u"', "U'", 'U"',
		"ur'", 'ur"', "Ur'", 'Ur"',
		"uR'", 'uR"', "UR'", 'UR"',
		"b'", 'b"', "B'", 'B"',
		"br'", 'br"', "Br'", 'Br"',
		"bR'", 'bR"', "BR'", 'BR"' ):
		single_quoted[t] = t

		tabsize = 8

		class TokenError(Exception): pass

		class StopTokenizing(Exception): pass

		def printtoken(type, token, srow_scol, erow_ecol, line): # for testing
		srow, scol = srow_scol
		erow, ecol = erow_ecol
Thomas Kluyver Convert print statements to print function calls...	r13348	print("%d,%d-%d,%d:\t%s\t%s" % \
		(srow, scol, erow, ecol, tok_name[type], repr(token)))
Thomas Kluyver Now include patched copies of tokenize for Python 2 and 3.	r10110
		def tokenize(readline, tokeneater=printtoken):
		"""
		The tokenize() function accepts two parameters: one representing the
		input stream, and one providing an output mechanism for tokenize().

		The first parameter, readline, must be a callable object which provides
		the same interface as the readline() method of built-in file objects.
		Each call to the function should return one line of input as a string.

		The second parameter, tokeneater, must also be a callable object. It is
		called once for each token, with five arguments, corresponding to the
		tuples generated by generate_tokens().
		"""
		try:
		tokenize_loop(readline, tokeneater)
		except StopTokenizing:
		pass

		# backwards compatible interface
		def tokenize_loop(readline, tokeneater):
		for token_info in generate_tokens(readline):
		tokeneater(*token_info)

		class Untokenizer:

		def __init__(self):
		self.tokens = []
		self.prev_row = 1
		self.prev_col = 0

		def add_whitespace(self, start):
		row, col = start
		assert row >= self.prev_row
		col_offset = col - self.prev_col
		if col_offset > 0:
		self.tokens.append(" " * col_offset)
		elif row > self.prev_row and tok_type not in (NEWLINE, NL, ENDMARKER):
		# Line was backslash-continued
		self.tokens.append(" ")

		def untokenize(self, tokens):
		iterable = iter(tokens)
		for t in iterable:
		if len(t) == 2:
		self.compat(t, iterable)
		break
		tok_type, token, start, end = t[:4]
		self.add_whitespace(start)
		self.tokens.append(token)
		self.prev_row, self.prev_col = end
		if tok_type in (NEWLINE, NL):
		self.prev_row += 1
		self.prev_col = 0
		return "".join(self.tokens)

		def compat(self, token, iterable):
		# This import is here to avoid problems when the itertools
		# module is not built yet and tokenize is imported.
		from itertools import chain
		startline = False
		prevstring = False
		indents = []
		toks_append = self.tokens.append
		for tok in chain([token], iterable):
		toknum, tokval = tok[:2]

		if toknum in (NAME, NUMBER):
		tokval += ' '

		# Insert a space between two consecutive strings
		if toknum == STRING:
		if prevstring:
		tokval = ' ' + tokval
		prevstring = True
		else:
		prevstring = False

		if toknum == INDENT:
		indents.append(tokval)
		continue
		elif toknum == DEDENT:
		indents.pop()
		continue
		elif toknum in (NEWLINE, NL):
		startline = True
		elif startline and indents:
		toks_append(indents[-1])
		startline = False
		toks_append(tokval)

		def untokenize(iterable):
		"""Transform tokens back into Python source code.

		Each element returned by the iterable must be a token sequence
		with at least two elements, a token number and token value. If
		only two tokens are passed, the resulting output is poor.

		Round-trip invariant for full input:
		Untokenized source will match input source exactly

		Round-trip invariant for limited input:
		# Output text will tokenize back to the input
		t1 = [tok[:2] for tok in generate_tokens(f.readline)]
		newcode = untokenize(t1)
		readline = iter(newcode.splitlines(1)).next
		t2 = [tok[:2] for tok in generate_tokens(readline)]
		assert t1 == t2
		"""
		ut = Untokenizer()
		return ut.untokenize(iterable)

		def generate_tokens(readline):
		"""
		The generate_tokens() generator requires one argument, readline, which
		must be a callable object which provides the same interface as the
		readline() method of built-in file objects. Each call to the function
		should return one line of input as a string. Alternately, readline
		can be a callable function terminating with StopIteration:
		readline = open(myfile).next # Example of alternate readline

		The generator produces 5-tuples with these members: the token type; the
		token string; a 2-tuple (srow, scol) of ints specifying the row and
		column where the token begins in the source; a 2-tuple (erow, ecol) of
		ints specifying the row and column where the token ends in the source;
		and the line on which the token was found. The line passed is the
		logical line; continuation lines are included.
		"""
		lnum = parenlev = continued = 0
		namechars, numchars = string.ascii_letters + '_', '0123456789'
		contstr, needcont = '', 0
		contline = None
		indents = [0]

		while 1: # loop over lines in stream
		try:
		line = readline()
		except StopIteration:
		line = ''
		lnum += 1
		pos, max = 0, len(line)

		if contstr: # continued string
		if not line:
Thomas Kluyver Make raise statements Python 3 compatible....	r13349	raise TokenError("EOF in multi-line string", strstart)
Thomas Kluyver Now include patched copies of tokenize for Python 2 and 3.	r10110	endmatch = endprog.match(line)
		if endmatch:
		pos = end = endmatch.end(0)
		yield (STRING, contstr + line[:end],
		strstart, (lnum, end), contline + line)
		contstr, needcont = '', 0
		contline = None
		elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
		yield (ERRORTOKEN, contstr + line,
		strstart, (lnum, len(line)), contline)
		contstr = ''
		contline = None
		continue
		else:
		contstr = contstr + line
		contline = contline + line
		continue

		elif parenlev == 0 and not continued: # new statement
		if not line: break
		column = 0
		while pos < max: # measure leading whitespace
		if line[pos] == ' ':
		column += 1
		elif line[pos] == '\t':
		column = (column//tabsize + 1)*tabsize
		elif line[pos] == '\f':
		column = 0
		else:
		break
		pos += 1
		if pos == max:
		break

		if line[pos] in '#\r\n': # skip comments or blank lines
		if line[pos] == '#':
		comment_token = line[pos:].rstrip('\r\n')
		nl_pos = pos + len(comment_token)
		yield (COMMENT, comment_token,
		(lnum, pos), (lnum, pos + len(comment_token)), line)
		yield (NEWLINE, line[nl_pos:],
		(lnum, nl_pos), (lnum, len(line)), line)
		else:
		yield (NEWLINE, line[pos:],
		(lnum, pos), (lnum, len(line)), line)
		continue

		if column > indents[-1]: # count indents or dedents
		indents.append(column)
		yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
		while column < indents[-1]:
		if column not in indents:
		raise IndentationError(
		"unindent does not match any outer indentation level",
		("<tokenize>", lnum, pos, line))
		indents = indents[:-1]
		yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

		else: # continued statement
		if not line:
Thomas Kluyver Make raise statements Python 3 compatible....	r13349	raise TokenError("EOF in multi-line statement", (lnum, 0))
Thomas Kluyver Now include patched copies of tokenize for Python 2 and 3.	r10110	continued = 0

		while pos < max:
		pseudomatch = pseudoprog.match(line, pos)
		if pseudomatch: # scan for tokens
		start, end = pseudomatch.span(1)
		spos, epos, pos = (lnum, start), (lnum, end), end
		token, initial = line[start:end], line[start]

		if initial in numchars or \
		(initial == '.' and token != '.'): # ordinary number
		yield (NUMBER, token, spos, epos, line)
		elif initial in '\r\n':
		yield (NL if parenlev > 0 else NEWLINE,
		token, spos, epos, line)
		elif initial == '#':
		assert not token.endswith("\n")
		yield (COMMENT, token, spos, epos, line)
		elif token in triple_quoted:
		endprog = endprogs[token]
		endmatch = endprog.match(line, pos)
		if endmatch: # all on one line
		pos = endmatch.end(0)
		token = line[start:pos]
		yield (STRING, token, spos, (lnum, pos), line)
		else:
		strstart = (lnum, start) # multiple lines
		contstr = line[start:]
		contline = line
		break
		elif initial in single_quoted or \
		token[:2] in single_quoted or \
		token[:3] in single_quoted:
		if token[-1] == '\n': # continued string
		strstart = (lnum, start)
		endprog = (endprogs[initial] or endprogs[token[1]] or
		endprogs[token[2]])
		contstr, needcont = line[start:], 1
		contline = line
		break
		else: # ordinary string
		yield (STRING, token, spos, epos, line)
		elif initial in namechars: # ordinary name
		yield (NAME, token, spos, epos, line)
		elif initial == '\\': # continued stmt
		continued = 1
		else:
		if initial in '([{':
		parenlev += 1
		elif initial in ')]}':
		parenlev -= 1
		yield (OP, token, spos, epos, line)
		else:
		yield (ERRORTOKEN, line[pos],
		(lnum, pos), (lnum, pos+1), line)
		pos += 1

		for indent in indents[1:]: # pop remaining indent levels
		yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
		yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

		if __name__ == '__main__':                     # testing
		    # Script entry point: run tokenize() over the file named by the first
		    # command-line argument, or over standard input when none is given.
		    import sys
		    if len(sys.argv) > 1:
		        # Use a context manager so the input file is closed once
		        # tokenizing finishes, instead of leaking the open handle.
		        with open(sys.argv[1]) as source:
		            tokenize(source.readline)
		    else:
		        tokenize(sys.stdin.readline)