upstream/ipython Commit - r6247:8ec813a4

Add IPython.utils.openpy to decode Python files.

Thomas Kluyver -

r6247:8ec813a4

parent child

IPython/utils/openpy.py

0 created 644 +158 0

			@@ -0,0 +1,158 b''
		1	"""
		2	Tools to open .py files as Unicode, using the encoding specified within the file,
		3	as per PEP 263.
		4
		5	Much of the code is taken from the tokenize module in Python 3.2.
		6	"""
		7
		8	import __builtin__
		9	import io
		10	from io import TextIOWrapper
		11	import re
		12	import urllib
		13
		# Matches a PEP 263 "coding" declaration anywhere in a line; group 1 is the
		# declared encoding name.  A plain raw string is used instead of the
		# Python 2-only ``ur""`` prefix so the pattern compiles on Python 2 and 3.
		cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)", re.UNICODE)
		# Matches a whole comment line carrying such a declaration: optional leading
		# whitespace, ``#``, any text, then the declaration.  The ``*`` quantifiers
		# are required so that ordinary cookies such as ``# -*- coding: utf-8 -*-``
		# are recognised.
		cookie_comment_re = re.compile(r"^\s*#.*coding[:=]\s*([-\w.]+)", re.UNICODE)
		16
		try:
		    # Available in Python 3
		    from tokenize import detect_encoding
		except ImportError:
		    from codecs import lookup, BOM_UTF8

		    # Copied from Python 3.2 tokenize
		    def _get_normal_name(orig_enc):
		        """Imitates get_normal_name in tokenizer.c.

		        Returns "utf-8" or "iso-8859-1" when the first 12 characters of
		        *orig_enc* (lower-cased, underscores turned into hyphens) name one
		        of the known spellings of those encodings; otherwise returns
		        *orig_enc* unchanged.
		        """
		        # Only care about the first 12 characters.
		        enc = orig_enc[:12].lower().replace("_", "-")
		        if enc == "utf-8" or enc.startswith("utf-8-"):
		            return "utf-8"
		        if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
		                enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
		            return "iso-8859-1"
		        return orig_enc

		    # Copied from Python 3.2 tokenize
		    def detect_encoding(readline):
		        """
		        The detect_encoding() function is used to detect the encoding that
		        should be used to decode a Python source file. It requires one
		        argument, readline, in the same way as the tokenize() generator.

		        It will call readline a maximum of twice, and return the encoding
		        used (as a string) and a list of any lines (left as bytes) it has
		        read in.

		        It detects the encoding from the presence of a utf-8 bom or an
		        encoding cookie as specified in pep-0263. If both a bom and a cookie
		        are present, but disagree, a SyntaxError will be raised. If the
		        encoding cookie is an invalid charset, raise a SyntaxError. Note
		        that if a utf-8 bom is found, 'utf-8-sig' is returned.

		        If no encoding is specified, then the default of 'utf-8' will be
		        returned.
		        """
		        bom_found = False
		        encoding = None
		        default = 'utf-8'

		        def read_or_stop():
		            # Treat an exhausted line source as an empty line.
		            try:
		                return readline()
		            except StopIteration:
		                return b''

		        def find_cookie(line):
		            # Return the normalised encoding declared on *line*, or None
		            # when the line is not ASCII or carries no declaration.
		            try:
		                line_string = line.decode('ascii')
		            except UnicodeDecodeError:
		                return None

		            matches = cookie_re.findall(line_string)
		            if not matches:
		                return None
		            encoding = _get_normal_name(matches[0])
		            try:
		                codec = lookup(encoding)
		            except LookupError:
		                # This behaviour mimics the Python interpreter
		                raise SyntaxError("unknown encoding: " + encoding)

		            if bom_found:
		                if codec.name != 'utf-8':
		                    # This behaviour mimics the Python interpreter
		                    raise SyntaxError('encoding problem: utf-8')
		                encoding += '-sig'
		            return encoding

		        first = read_or_stop()
		        if first.startswith(BOM_UTF8):
		            bom_found = True
		            # Drop the BOM so it is not part of the returned line.
		            first = first[3:]
		            default = 'utf-8-sig'
		        if not first:
		            return default, []

		        encoding = find_cookie(first)
		        if encoding:
		            return encoding, [first]

		        second = read_or_stop()
		        if not second:
		            return default, [first]

		        encoding = find_cookie(second)
		        if encoding:
		            return encoding, [first, second]

		        return default, [first, second]
		106
		try:
		    # Available in Python 3.2 and above.
		    from tokenize import open
		except ImportError:
		    # Only a missing tokenize.open should trigger the fallback; any other
		    # error is allowed to propagate instead of being silently swallowed.

		    # Copied from Python 3.2 tokenize
		    def open(filename):
		        """Open a file in read only mode using the encoding detected by
		        detect_encoding().

		        Returns a text-mode wrapper positioned at the start of the file.
		        If detecting the encoding or building the wrapper fails, the
		        underlying binary file is closed before the error is re-raised.
		        """
		        buffer = io.open(filename, 'rb')  # Tweaked to use io.open for Python 2
		        try:
		            encoding, _ = detect_encoding(buffer.readline)
		            # detect_encoding consumed up to two lines; rewind so the
		            # text wrapper decodes the file from the beginning.
		            buffer.seek(0)
		            text = TextIOWrapper(buffer, encoding, line_buffering=True)
		            text.mode = 'r'
		            return text
		        except BaseException:
		            buffer.close()
		            raise
		122
		def open_url(url, errors='replace'):
		    """Open a URL to a raw Python file, using the encoding detected by
		    detect_encoding().

		    The whole response body is read into memory so that it can be rewound
		    after the encoding has been detected; a URL response cannot be
		    rewound itself.

		    url : location of the Python source to fetch.
		    errors : decoding error handler passed to the text wrapper.

		    Returns a text-mode wrapper positioned at the start of the content.
		    """
		    try:
		        from urllib.request import urlopen  # Python 3
		    except ImportError:
		        from urllib import urlopen  # Python 2
		    response = urlopen(url)
		    try:
		        buffer = io.BytesIO(response.read())
		    finally:
		        response.close()
		    encoding, _ = detect_encoding(buffer.readline)
		    # detect_encoding consumed up to two lines; rewind before decoding.
		    buffer.seek(0)
		    text = TextIOWrapper(buffer, encoding, errors=errors, line_buffering=True)
		    text.mode = 'r'
		    return text
		134
		def strip_encoding_cookie(filelike):
		    """Generator to pull lines from a text-mode file, skipping the encoding
		    cookie if it is found in the first two lines.

		    Every line from the third onwards is yielded unchanged; each of the
		    first two lines is dropped only when it matches cookie_comment_re.
		    """
		    for index, line in enumerate(filelike):
		        # Only the first two lines may hold an encoding cookie.
		        if index < 2 and cookie_comment_re.match(line):
		            continue
		        yield line
		152
		def read_py_file(filename, skip_encoding_cookie=True):
		    """Read a Python source file and return its contents as one string.

		    filename : path of the file to read.
		    skip_encoding_cookie : when true (the default), the lines are passed
		        through strip_encoding_cookie() before being joined; otherwise
		        the file content is returned exactly as read.

		    The file is closed before returning, including when reading fails.
		    """
		    # ``open`` is the function defined in this module, not the builtin.
		    with open(filename) as f:
		        if skip_encoding_cookie:
		            return "".join(strip_encoding_cookie(f))
		        return f.read()

General Comments 0

Write
Preview

You need to be logged in to leave comments. Login now

No TODOs yet

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages