upstream/ipython Commit - r6247:8ec813a4

Add IPython.utils.openpy to decode Python files.

Thomas Kluyver -

r6247:8ec813a4

parent child

IPython/utils/openpy.py

0 created 644 +158 0

@@ -0,0 +1,158 b''
	1	"""
	2	Tools to open .py files as Unicode, using the encoding specified within the file,
	3	as per PEP 263.
	4
	5	Much of the code is taken from the tokenize module in Python 3.2.
	6	"""
	7
	8	import __builtin__
	9	import io
	10	from io import TextIOWrapper
	11	import re
	12	import urllib
	13
	14	cookie_re = re.compile(ur"coding[:=]\s*([-\w.]+)", re.UNICODE)
	15	cookie_comment_re = re.compile(ur"^\s#.coding[:=]\s*([-\w.]+)", re.UNICODE)
	16
	17	try:
	18	# Available in Python 3
	19	from tokenize import detect_encoding
	20	except ImportError:
	21	from codecs import lookup, BOM_UTF8
	22
	23	# Copied from Python 3.2 tokenize
	24	def _get_normal_name(orig_enc):
	25	"""Imitates get_normal_name in tokenizer.c."""
	26	# Only care about the first 12 characters.
	27	enc = orig_enc[:12].lower().replace("_", "-")
	28	if enc == "utf-8" or enc.startswith("utf-8-"):
	29	return "utf-8"
	30	if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
	31	enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
	32	return "iso-8859-1"
	33	return orig_enc
	34
	35	# Copied from Python 3.2 tokenize
	36	def detect_encoding(readline):
	37	"""
	38	The detect_encoding() function is used to detect the encoding that should
	39	be used to decode a Python source file. It requires one argment, readline,
	40	in the same way as the tokenize() generator.
	41
	42	It will call readline a maximum of twice, and return the encoding used
	43	(as a string) and a list of any lines (left as bytes) it has read in.
	44
	45	It detects the encoding from the presence of a utf-8 bom or an encoding
	46	cookie as specified in pep-0263. If both a bom and a cookie are present,
	47	but disagree, a SyntaxError will be raised. If the encoding cookie is an
	48	invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,
	49	'utf-8-sig' is returned.
	50
	51	If no encoding is specified, then the default of 'utf-8' will be returned.
	52	"""
	53	bom_found = False
	54	encoding = None
	55	default = 'utf-8'
	56	def read_or_stop():
	57	try:
	58	return readline()
	59	except StopIteration:
	60	return b''
	61
	62	def find_cookie(line):
	63	try:
	64	line_string = line.decode('ascii')
	65	except UnicodeDecodeError:
	66	return None
	67
	68	matches = cookie_re.findall(line_string)
	69	if not matches:
	70	return None
	71	encoding = _get_normal_name(matches[0])
	72	try:
	73	codec = lookup(encoding)
	74	except LookupError:
	75	# This behaviour mimics the Python interpreter
	76	raise SyntaxError("unknown encoding: " + encoding)
	77
	78	if bom_found:
	79	if codec.name != 'utf-8':
	80	# This behaviour mimics the Python interpreter
	81	raise SyntaxError('encoding problem: utf-8')
	82	encoding += '-sig'
	83	return encoding
	84
	85	first = read_or_stop()
	86	if first.startswith(BOM_UTF8):
	87	bom_found = True
	88	first = first[3:]
	89	default = 'utf-8-sig'
	90	if not first:
	91	return default, []
	92
	93	encoding = find_cookie(first)
	94	if encoding:
	95	return encoding, [first]
	96
	97	second = read_or_stop()
	98	if not second:
	99	return default, [first]
	100
	101	encoding = find_cookie(second)
	102	if encoding:
	103	return encoding, [first, second]
	104
	105	return default, [first, second]
	106
	107	try:
	108	# Available in Python 3.2 and above.
	109	from tokenize import open
	110	except:
	111	# Copied from Python 3.2 tokenize
	112	def open(filename):
	113	"""Open a file in read only mode using the encoding detected by
	114	detect_encoding().
	115	"""
	116	buffer = io.open(filename, 'rb') # Tweaked to use io.open for Python 2
	117	encoding, lines = detect_encoding(buffer.readline)
	118	buffer.seek(0)
	119	text = TextIOWrapper(buffer, encoding, line_buffering=True)
	120	text.mode = 'r'
	121	return text
	122
	123	def open_url(url, errors='replace'):
	124	"""Open a URL to a raw Python file, using the encoding detected by
	125	detect_encoding().
	126	"""
	127	response = urllib.urlopen(url)
	128	buffer = io.BufferedRandom(response)
	129	encoding, lines = detect_encoding(buffer.readline)
	130	buffer.seek(0)
	131	text = TextIOWrapper(buffer, encoding, errors=errors, line_buffering=True)
	132	text.mode = 'r'
	133	return text
	134
	135	def strip_encoding_cookie(filelike):
	136	"""Generator to pull lines from a text-mode file, skipping the encoding
	137	cookie if it is found in the first two lines.
	138	"""
	139	it = iter(filelike)
	140	try:
	141	first = next(it)
	142	if not cookie_comment_re.match(first):
	143	yield first
	144	second = next(it)
	145	if not cookie_comment_re.match(second):
	146	yield second
	147	except StopIteration:
	148	return
	149
	150	for line in it:
	151	yield line
	152
	153	def read_py_file(filename, skip_encoding_cookie=True):
	154	f = open(filename) # the open function defined in this module.
	155	if skip_encoding_cookie:
	156	return "".join(strip_encoding_cookie(f))
	157	else:
	158	return f.read()

General Comments 0

Write
Preview

You need to be logged in to leave comments. Login now

No TODOs yet

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages