@@ -0,0 +1,438 b''
"""Patched version of standard library tokenize, to deal with various bugs.

Patches:

- Relevant parts of Gareth Rees' patch for Python issue #12691 (untokenizing),
  manually applied.
- Newlines in comments and blank lines should be either NL or NEWLINE, depending
  on whether they are in a multi-line statement. Filed as Python issue #17061.

-------------------------------------------------------------------------------
Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger')

import string, re
from token import *

import token
__all__ = [x for x in dir(token) if not x.startswith("_")]
__all__ += ["COMMENT", "tokenize", "generate_tokens", "NL", "untokenize"]
del x
del token

__all__ += ["TokenError"]

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
N_TOKENS += 2

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Hexnumber = r'0[xX][\da-fA-F]+[lL]?'
Octnumber = r'(0[oO][0-7]+)|(0[0-7]*)[lL]?'
Binnumber = r'0[bB][01]+[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[uUbB]?[rR]?'''", '[uUbB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None,
            'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"' ):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, srow_scol, erow_ecol, line): # for testing
    srow, scol = srow_scol
    erow, ecol = erow_ecol
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, tok_type, start):
        row, col = start
        assert row >= self.prev_row
        col_offset = col - self.prev_col
        if col_offset > 0:
            self.tokens.append(" " * col_offset)
        elif row > self.prev_row and tok_type not in (NEWLINE, NL, ENDMARKER):
            # Line was backslash-continued.
            self.tokens.append(" ")

    def untokenize(self, tokens):
        iterable = iter(tokens)
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end = t[:4]
            self.add_whitespace(tok_type, start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        # This import is here to avoid problems when the itertools
        # module is not built yet and tokenize is imported.
        from itertools import chain
        startline = False
        prevstring = False
        indents = []
        toks_append = self.tokens.append
        for tok in chain([token], iterable):
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to match the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NEWLINE, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield (NEWLINE, line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield (NL if parenlev > 0 else NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                     token[:2] in single_quoted or \
                     token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos += 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1:
        tokenize(open(sys.argv[1]).readline)
    else:
        tokenize(sys.stdin.readline)
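
The two round-trip invariants documented in untokenize() above are the substance of the #12691 patch. Here is a minimal sketch of both modes under Python 2; the import path is an assumption, namely that this file is installed as the _tokenize_py2 module behind the tokenize2 shim added later in this changeset.

    from StringIO import StringIO
    from IPython.utils.tokenize2 import generate_tokens, untokenize

    source = "a = (1 +\n     2)\n"

    # Full 5-tuples: the patched untokenize reproduces the input exactly,
    # including the newline inside the parentheses (an NL token).
    tokens = list(generate_tokens(StringIO(source).readline))
    assert untokenize(tokens) == source

    # Length-2 tuples: only the (type, string) sequence is guaranteed to
    # survive a re-tokenization; column positions may differ.
    t1 = [tok[:2] for tok in tokens]
    newcode = untokenize(t1)
    readline = iter(newcode.splitlines(1)).next
    t2 = [tok[:2] for tok in generate_tokens(readline)]
    assert t1 == t2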
@@ -0,0 +1,574 b''
"""Patched version of standard library tokenize, to deal with various bugs.

Based on Python 3.2 code.

Patches:

- Gareth Rees' patch for Python issue #12691 (untokenizing)
  - Except we don't encode the output of untokenize
- Python 2 compatible syntax, so that it can be byte-compiled at installation
- Newlines in comments and blank lines should be either NL or NEWLINE, depending
  on whether they are in a multi-line statement. Filed as Python issue #17061.
- Export generate_tokens & TokenError

------------------------------------------------------------------------------
Tokenization help for Python programs.

tokenize(readline) is a generator that breaks a stream of bytes into
Python tokens.  It decodes the bytes according to PEP-0263 for
determining source file encoding.

It accepts a readline-like method which is called repeatedly to get the
next line of input (or b"" for EOF).  It generates 5-tuples with these
members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.  Additionally, all token lists start with an ENCODING token
which tells you which encoding was used to decode the bytes stream.
"""
from __future__ import absolute_import

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
               'Michael Foord')
import builtins
import re
import sys
from token import *
from codecs import lookup, BOM_UTF8
import collections
from io import TextIOWrapper
cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")

import token
__all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
                           "NL", "untokenize", "ENCODING", "TokenInfo"]
del token

__all__ += ["generate_tokens", "TokenError"]

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
ENCODING = N_TOKENS + 2
tok_name[ENCODING] = 'ENCODING'
N_TOKENS += 3

class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
    def __repr__(self):
        annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
        return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
                self._replace(type=annotated_type))

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'

# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

Hexnumber = r'0[xX][0-9a-fA-F]+'
Binnumber = r'0[bB][01]+'
Octnumber = r'0[oO][0-7]+'
Decnumber = r'(?:0+|[1-9][0-9]*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?[0-9]+'
Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
Expfloat = r'[0-9]+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[bB]?[rR]?'''", '[bB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

def _compile(expr):
    return re.compile(expr, re.UNICODE)

tokenprog, pseudoprog, single3prog, double3prog = map(
    _compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": _compile(Single), '"': _compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None, 'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"' ):
    single_quoted[t] = t

del _compile

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass


class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.encoding = 'utf-8'

    def add_whitespace(self, tok_type, start):
        row, col = start
        assert row >= self.prev_row
        col_offset = col - self.prev_col
        if col_offset > 0:
            self.tokens.append(" " * col_offset)
        elif row > self.prev_row and tok_type not in (NEWLINE, NL, ENDMARKER):
            # Line was backslash-continued.
            self.tokens.append(" ")

    def untokenize(self, tokens):
        iterable = iter(tokens)
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end = t[:4]
            if tok_type == ENCODING:
                self.encoding = token
                continue
            self.add_whitespace(tok_type, start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        # This import is here to avoid problems when the itertools
        # module is not built yet and tokenize is imported.
        from itertools import chain
        startline = False
        prevstring = False
        indents = []
        toks_append = self.tokens.append

        for tok in chain([token], iterable):
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


def untokenize(tokens):
    """
    Convert ``tokens`` (an iterable) back into Python source code.  Return
    a string.  (The standard library version returns bytes, encoded using
    the encoding specified by the last ENCODING token in ``tokens``; as
    noted in the patches above, this version does not encode its output.)

    The result is guaranteed to tokenize back to match the input so that
    the conversion is lossless and round-trips are assured.  The
    guarantee applies only to the token type and token string, as the
    spacing between tokens (column positions) may change.

    :func:`untokenize` has two modes. If the input tokens are sequences
    of length 2 (``type``, ``string``) then spaces are added as necessary to
    preserve the round-trip property.

    If the input tokens are sequences of length 4 or more (``type``,
    ``string``, ``start``, ``end``), as returned by :func:`tokenize`, then
    spaces are added so that each token appears in the result at the
    position indicated by ``start`` and ``end``, if possible.
    """
    return Untokenizer().untokenize(tokens)


def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file.  It requires one argument,
    readline, in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a utf-8 bom or an encoding
    cookie as specified in pep-0263.  If both a bom and a cookie are present,
    but disagree, a SyntaxError will be raised.  If the encoding cookie is an
    invalid charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            # Decode as UTF-8. Either the line is an encoding declaration,
            # in which case it should be pure ASCII, or it must be UTF-8
            # per default encoding.
            line_string = line.decode('utf-8')
        except UnicodeDecodeError:
            raise SyntaxError("invalid or missing encoding declaration")

        matches = cookie_re.findall(line_string)
        if not matches:
            return None
        encoding = _get_normal_name(matches[0])
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if encoding != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]


def open(filename):
    """Open a file in read only mode using the encoding detected by
    detect_encoding().
    """
    buffer = builtins.open(filename, 'rb')
    encoding, lines = detect_encoding(buffer.readline)
    buffer.seek(0)
    text = TextIOWrapper(buffer, encoding, line_buffering=True)
    text.mode = 'r'
    return text


def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as bytes.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
    """
    # This import is here to avoid problems when the itertools module is not
    # built yet and tokenize is imported.
    from itertools import chain, repeat
    encoding, consumed = detect_encoding(readline)
    rl_gen = iter(readline, b"")
    empty = repeat(b"")
    return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding)


def _tokenize(readline, encoding):
    lnum = parenlev = continued = 0
    numchars = '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    if encoding is not None:
        if encoding == "utf-8-sig":
            # BOM will already have been stripped.
            encoding = "utf-8"
        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
    while True:                                # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = b''

        if encoding is not None:
            line = line.decode(encoding)
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield TokenInfo(STRING, contstr + line[:end],
                                strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield TokenInfo(ERRORTOKEN, contstr + line,
                                strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield TokenInfo(COMMENT, comment_token,
                                    (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield TokenInfo(NEWLINE, line[nl_pos:],
                                    (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield TokenInfo(NEWLINE, line[pos:],
                                    (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if (initial in numchars or                 # ordinary number
                    (initial == '.' and token != '.' and token != '...')):
                    yield TokenInfo(NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield TokenInfo(NL if parenlev > 0 else NEWLINE,
                                    token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield TokenInfo(COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                     token[:2] in single_quoted or \
                     token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield TokenInfo(STRING, token, spos, epos, line)
                elif initial.isidentifier():               # ordinary name
                    yield TokenInfo(NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield TokenInfo(OP, token, spos, epos, line)
            else:
                yield TokenInfo(ERRORTOKEN, line[pos],
                                (lnum, pos), (lnum, pos+1), line)
                pos += 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')


# An undocumented, backwards compatible, API for all the places in the standard
# library that expect to be able to use tokenize with strings
def generate_tokens(readline):
    return _tokenize(readline, None)

if __name__ == "__main__":
    # Quick sanity check
    s = b'''def parseline(self, line):
            """Parse the line into a command name and a string containing
            the arguments.  Returns a tuple containing (command, args, line).
            'command' and 'args' may be None if the line couldn't be parsed.
            """
            line = line.strip()
            if not line:
                return None, None, line
            elif line[0] == '?':
                line = 'help ' + line[1:]
            elif line[0] == '!':
                if hasattr(self, 'do_shell'):
                    line = 'shell ' + line[1:]
                else:
                    return None, None, line
            i, n = 0, len(line)
            while i < n and line[i] in self.identchars: i = i+1
            cmd, arg = line[:i], line[i:].strip()
            return cmd, arg, line
        '''
    for tok in tokenize(iter(s.splitlines()).__next__):
        print(tok)
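
The byte-oriented interface of this Python 3 version is easiest to see end to end. A small sketch under Python 3, with the same assumption as before that the module is reachable through the tokenize2 shim below: it shows detect_encoding() normalizing a PEP 263 cookie and tokenize() prepending the matching ENCODING token.

    from io import BytesIO
    from IPython.utils.tokenize2 import tokenize, detect_encoding, ENCODING

    source = b"# -*- coding: latin-1 -*-\nx = 1\n"

    # detect_encoding() reads at most two lines; here the cookie on the
    # first line decides, and the name is normalized as tokenizer.c does.
    encoding, consumed = detect_encoding(BytesIO(source).readline)
    assert encoding == "iso-8859-1"
    assert consumed == [b"# -*- coding: latin-1 -*-\n"]

    # tokenize() yields an ENCODING token first, recording that decision.
    first = next(tokenize(BytesIO(source).readline))
    assert first.type == ENCODING and first.string == "iso-8859-1"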
@@ -0,0 +1,9 b''
"""Load our patched versions of tokenize.
"""

import sys

if sys.version_info[0] >= 3:
    from _tokenize_py3 import *
else:
    from _tokenize_py2 import *
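
The shim gives the rest of the codebase a single, version-independent import path. The inputtransformer change below consumes it with exactly these imports:

    from IPython.utils import tokenize2
    from IPython.utils.tokenize2 import generate_tokens, untokenize, TokenError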
@@ -2,17 +2,10 b' import abc'
 import functools
 import re
 from StringIO import StringIO
-import tokenize
-
-try:
-    generate_tokens = tokenize.generate_tokens
-except AttributeError:
-    # Python 3. Note that we use the undocumented _tokenize because it expects
-    # strings, not bytes. See also Python issue #9969.
-    generate_tokens = tokenize._tokenize
 
 from IPython.core.splitinput import split_user_input, LineInfo
-from IPython.utils
+from IPython.utils import tokenize2
+from IPython.utils.tokenize2 import generate_tokens, untokenize, TokenError
 
 #-----------------------------------------------------------------------------
 # Globals
@@ -129,7 +122,7 b' class TokenInputTransformer(InputTransformer):'
 
     def get_line(self):
         if self.line_used:
-            raise tokenize.TokenError
+            raise TokenError
         self.line_used = True
         return self.current_line
 
@@ -145,12 +138,12 b' class TokenInputTransformer(InputTransformer):'
             for intok in self.tokenizer:
                 tokens.append(intok)
                 t = intok[0]
-                if t == tokenize.NEWLINE or (stop_at_NL and t == tokenize.NL):
+                if t == tokenize2.NEWLINE or (stop_at_NL and t == tokenize2.NL):
                     # Stop before we try to pull a line we don't have yet
                     break
-                elif t == tokenize.ERRORTOKEN:
+                elif t == tokenize2.ERRORTOKEN:
                     stop_at_NL = True
-        except tokenize.TokenError:
+        except TokenError:
             # Multi-line statement - stop and try again with the next line
             self.reset_tokenizer()
             return None
@@ -297,11 +290,11 b' def has_comment(src):'
     readline = StringIO(src).readline
     toktypes = set()
     try:
-        for t in tokenize.generate_tokens(readline):
+        for t in generate_tokens(readline):
            toktypes.add(t[0])
-    except tokenize.TokenError:
+    except TokenError:
        pass
-    return(tokenize.COMMENT in toktypes)
+    return(tokenize2.COMMENT in toktypes)
 
 
 @StatelessInputTransformer.wrap
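
One behaviour worth noting for the has_comment() hunk: because detection is token-based rather than a plain substring search, a '#' inside a string literal is not reported, and an unterminated multi-line construct simply yields False via the TokenError handler. A hypothetical usage sketch (these calls are illustrative, not part of the changeset):

    from IPython.core.inputtransformer import has_comment

    assert has_comment("a = 1  # set a")                 # real comment -> COMMENT token
    assert not has_comment("a = '# not a comment'")      # '#' inside a STRING token
    assert not has_comment("s = '''unclosed")            # TokenError swallowed -> False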