##// END OF EJS Templates
Merge pull request #1526 from takluyver/openpy...
Fernando Perez -
r6454:0fed70ce merge
parent child Browse files
Show More
@@ -0,0 +1,5 b''
1 # encoding: iso-8859-5
2 # (Unlikely to be the default encoding for most testers.)
3 # ������������������� <- Cyrillic characters
4 from __future__ import unicode_literals
5 u = '����'
@@ -0,0 +1,192 b''
1 """
2 Tools to open .py files as Unicode, using the encoding specified within the file,
3 as per PEP 263.
4
5 Much of the code is taken from the tokenize module in Python 3.2.
6 """
7 from __future__ import absolute_import
8
9 import __builtin__
10 import io
11 from io import TextIOWrapper
12 import re
13 import urllib
14
15 cookie_re = re.compile(ur"coding[:=]\s*([-\w.]+)", re.UNICODE)
16 cookie_comment_re = re.compile(ur"^\s*#.*coding[:=]\s*([-\w.]+)", re.UNICODE)
17
try:
    # Available in Python 3
    from tokenize import detect_encoding
except ImportError:
    # Python 2: backport the Python 3.2 implementation below.
    from codecs import lookup, BOM_UTF8

    # Copied from Python 3.2 tokenize
    def _get_normal_name(orig_enc):
        """Imitates get_normal_name in tokenizer.c."""
        # Only care about the first 12 characters.
        enc = orig_enc[:12].lower().replace("_", "-")
        if enc == "utf-8" or enc.startswith("utf-8-"):
            return "utf-8"
        if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
           enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
            return "iso-8859-1"
        return orig_enc

    # Copied from Python 3.2 tokenize
    def detect_encoding(readline):
        """
        The detect_encoding() function is used to detect the encoding that should
        be used to decode a Python source file. It requires one argument, readline,
        in the same way as the tokenize() generator.

        It will call readline a maximum of twice, and return the encoding used
        (as a string) and a list of any lines (left as bytes) it has read in.

        It detects the encoding from the presence of a utf-8 bom or an encoding
        cookie as specified in pep-0263. If both a bom and a cookie are present,
        but disagree, a SyntaxError will be raised. If the encoding cookie is an
        invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,
        'utf-8-sig' is returned.

        If no encoding is specified, then the default of 'utf-8' will be returned.
        """
        bom_found = False
        encoding = None
        default = 'utf-8'
        def read_or_stop():
            # Treat an exhausted readline as end-of-file.
            try:
                return readline()
            except StopIteration:
                return b''

        def find_cookie(line):
            # PEP 263 requires the cookie line to be pure ASCII; a line that
            # does not decode as ASCII cannot carry a valid declaration.
            try:
                line_string = line.decode('ascii')
            except UnicodeDecodeError:
                return None

            # NOTE: cookie_re (module level) is not anchored to a comment, so
            # a "coding[:=]" match anywhere in the line is accepted — this
            # mirrors the tokenize module this code was copied from.
            matches = cookie_re.findall(line_string)
            if not matches:
                return None
            encoding = _get_normal_name(matches[0])
            try:
                codec = lookup(encoding)
            except LookupError:
                # This behaviour mimics the Python interpreter
                raise SyntaxError("unknown encoding: " + encoding)

            if bom_found:
                # A UTF-8 BOM plus a cookie that disagrees is an error.
                if codec.name != 'utf-8':
                    # This behaviour mimics the Python interpreter
                    raise SyntaxError('encoding problem: utf-8')
                encoding += '-sig'
            return encoding

        first = read_or_stop()
        if first.startswith(BOM_UTF8):
            bom_found = True
            first = first[3:]        # strip the 3-byte BOM before returning the line
            default = 'utf-8-sig'
        if not first:
            return default, []

        encoding = find_cookie(first)
        if encoding:
            return encoding, [first]

        # The cookie may legally appear on the second line as well.
        second = read_or_stop()
        if not second:
            return default, [first]

        encoding = find_cookie(second)
        if encoding:
            return encoding, [first, second]

        return default, [first, second]
107
try:
    # Available in Python 3.2 and above.
    from tokenize import open
except ImportError:
    # Backport for Python 2, adapted from the Python 3.2 tokenize module.
    def open(filename):
        """Open a file in read only mode using the encoding detected by
        detect_encoding().
        """
        raw = io.open(filename, 'rb')  # Tweaked to use io.open for Python 2
        encoding, _ = detect_encoding(raw.readline)
        raw.seek(0)
        wrapper = TextIOWrapper(raw, encoding, line_buffering=True)
        wrapper.mode = 'r'
        return wrapper
123
def strip_encoding_cookie(filelike):
    """Generator to pull lines from a text-mode file, skipping the encoding
    cookie if it is found in the first two lines.

    Per PEP 263 the coding declaration must be on line 1 or line 2, and only
    one declaration is effective — so at most one line is ever dropped.
    """
    it = iter(filelike)
    try:
        first = next(it)
        if not cookie_comment_re.match(first):
            yield first
            second = next(it)
            if not cookie_comment_re.match(second):
                yield second
        # If line 1 carried the cookie it was the (only) declaration;
        # line 2 and everything after is passed through by the loop below.
        # (Previously a cookie-shaped comment on line 2 was also stripped.)
    except StopIteration:
        return

    for line in it:
        yield line
141
def read_py_file(filename, skip_encoding_cookie=True):
    """Read a Python file, using the encoding declared inside the file.

    Parameters
    ----------
    filename : str
      The path to the file to read.
    skip_encoding_cookie : bool
      If True (the default), and the encoding declaration is found in the first
      two lines, that line will be excluded from the output - compiling a
      unicode string with an encoding declaration is a SyntaxError in Python 2.

    Returns
    -------
    A unicode string containing the contents of the file.
    """
    # Note: 'open' is this module's PEP-263-aware open, not the builtin.
    with open(filename) as stream:
        if not skip_encoding_cookie:
            return stream.read()
        return "".join(strip_encoding_cookie(stream))
163
def read_py_url(url, errors='replace', skip_encoding_cookie=True):
    """Read a Python file from a URL, using the encoding declared inside the file.

    Parameters
    ----------
    url : str
      The URL from which to fetch the file.
    errors : str
      How to handle decoding errors in the file. Options are the same as for
      bytes.decode(), but here 'replace' is the default.
    skip_encoding_cookie : bool
      If True (the default), and the encoding declaration is found in the first
      two lines, that line will be excluded from the output - compiling a
      unicode string with an encoding declaration is a SyntaxError in Python 2.

    Returns
    -------
    A unicode string containing the contents of the file.
    """
    response = urllib.urlopen(url)
    try:
        # Drain the response into memory so we can seek (HTTP bodies can't),
        # then close the connection promptly instead of leaking the socket.
        buffer = io.BytesIO(response.read())
    finally:
        response.close()
    encoding, lines = detect_encoding(buffer.readline)
    buffer.seek(0)
    text = TextIOWrapper(buffer, encoding, errors=errors, line_buffering=True)
    text.mode = 'r'
    if skip_encoding_cookie:
        return "".join(strip_encoding_cookie(text))
    else:
        return text.read()
@@ -0,0 +1,23 b''
1 import io
2 import os.path
3 import nose.tools as nt
4
5 from IPython.utils import openpy
6
7 mydir = os.path.dirname(__file__)
8 nonascii_path = os.path.join(mydir, '../../core/tests/nonascii.py')
9
def test_detect_encoding():
    """detect_encoding must find the iso-8859-5 cookie in the fixture file."""
    # Use a context manager so the handle is closed even if the test fails
    # (the original left the file open).
    with open(nonascii_path, 'rb') as f:
        enc, lines = openpy.detect_encoding(f.readline)
    nt.assert_equal(enc, 'iso-8859-5')
14
def test_read_file():
    """read_py_file must honour the declared encoding and strip the cookie on request."""
    # Reference decoding: read the fixture with its declared encoding directly,
    # closing the handle deterministically (the original leaked it).
    with io.open(nonascii_path, encoding='iso-8859-5') as f:
        read_specified_enc = f.read()
    read_detected_enc = openpy.read_py_file(nonascii_path, skip_encoding_cookie=False)
    nt.assert_equal(read_detected_enc, read_specified_enc)
    # Cookie line survives when stripping is disabled...
    assert u'encoding: iso-8859-5' in read_detected_enc

    # ...and is removed when stripping is enabled.
    read_strip_enc_cookie = openpy.read_py_file(nonascii_path, skip_encoding_cookie=True)
    assert u'encoding: iso-8859-5' not in read_strip_enc_cookie
23
@@ -55,6 +55,7 b' from IPython.core.prefilter import ESC_MAGIC'
55 from IPython.core.pylabtools import mpl_runner
55 from IPython.core.pylabtools import mpl_runner
56 from IPython.testing.skipdoctest import skip_doctest
56 from IPython.testing.skipdoctest import skip_doctest
57 from IPython.utils import py3compat
57 from IPython.utils import py3compat
58 from IPython.utils import openpy
58 from IPython.utils.io import file_read, nlprint
59 from IPython.utils.io import file_read, nlprint
59 from IPython.utils.module_paths import find_mod
60 from IPython.utils.module_paths import find_mod
60 from IPython.utils.path import get_py_filename, unquote_filename
61 from IPython.utils.path import get_py_filename, unquote_filename
@@ -98,9 +99,6 b' def needs_local_scope(func):'
98 # Used for exception handling in magic_edit
99 # Used for exception handling in magic_edit
99 class MacroToEdit(ValueError): pass
100 class MacroToEdit(ValueError): pass
100
101
101 # Taken from PEP 263, this is the official encoding regexp.
102 _encoding_declaration_re = re.compile(r"^#.*coding[:=]\s*([-\w.]+)")
103
104 #***************************************************************************
102 #***************************************************************************
105 # Main class implementing Magic functionality
103 # Main class implementing Magic functionality
106
104
@@ -2256,28 +2254,15 b' Currently the magic system has the following functions:\\n"""'
2256 # Local files must be .py; for remote URLs it's possible that the
2254 # Local files must be .py; for remote URLs it's possible that the
2257 # fetch URL doesn't have a .py in it (many servers have an opaque
2255 # fetch URL doesn't have a .py in it (many servers have an opaque
2258 # URL, such as scipy-central.org).
2256 # URL, such as scipy-central.org).
2259 raise ValueError('%%load only works with .py files: %s' % arg_s)
2257 raise ValueError('%%loadpy only works with .py files: %s' % arg_s)
2258
2259 # openpy takes care of finding the source encoding (per PEP 263)
2260 if remote_url:
2260 if remote_url:
2261 import urllib2
2261 contents = openpy.read_py_url(arg_s, skip_encoding_cookie=True)
2262 fileobj = urllib2.urlopen(arg_s)
2263 # While responses have a .info().getencoding() way of asking for
2264 # their encoding, in *many* cases the return value is bogus. In
2265 # the wild, servers serving utf-8 but declaring latin-1 are
2266 # extremely common, as the old HTTP standards specify latin-1 as
2267 # the default but many modern filesystems use utf-8. So we can NOT
2268 # rely on the headers. Short of building complex encoding-guessing
2269 # logic, going with utf-8 is a simple solution likely to be right
2270 # in most real-world cases.
2271 linesource = fileobj.read().decode('utf-8', 'replace').splitlines()
2272 fileobj.close()
2273 else:
2262 else:
2274 with open(arg_s) as fileobj:
2263 contents = openpy.read_py_file(arg_s, skip_encoding_cookie=True)
2275 linesource = fileobj.read().splitlines()
2276
2277 # Strip out encoding declarations
2278 lines = [l for l in linesource if not _encoding_declaration_re.match(l)]
2279
2264
2280 self.set_next_input(os.linesep.join(lines))
2265 self.set_next_input(contents)
2281
2266
2282 def _find_edit_target(self, args, opts, last_call):
2267 def _find_edit_target(self, args, opts, last_call):
2283 """Utility method used by magic_edit to find what to edit."""
2268 """Utility method used by magic_edit to find what to edit."""
@@ -1,3 +1,4 b''
1 # encoding: utf-8
1 """Tests for code execution (%run and related), which is particularly tricky.
2 """Tests for code execution (%run and related), which is particularly tricky.
2
3
3 Because of how %run manages namespaces, and the fact that we are trying here to
4 Because of how %run manages namespaces, and the fact that we are trying here to
@@ -240,3 +241,10 b' tclass.py: deleting object: C-third'
240 _ip.run_cell("zz = 23")
241 _ip.run_cell("zz = 23")
241 _ip.magic('run -i %s' % self.fname)
242 _ip.magic('run -i %s' % self.fname)
242 tt.assert_equals(_ip.user_ns['yy'], 23)
243 tt.assert_equals(_ip.user_ns['yy'], 23)
244
    def test_unicode(self):
        """Check that files in odd encodings are accepted."""
        # nonascii.py declares '# encoding: iso-8859-5' and assigns a
        # Cyrillic literal to ``u``; %run must honour that declaration
        # rather than assuming the default source encoding.
        mydir = os.path.dirname(__file__)
        na = os.path.join(mydir, 'nonascii.py')
        _ip.magic('run %s' % na)
        tt.assert_equals(_ip.user_ns['u'], u'Ўт№Ф')
@@ -70,7 +70,7 b' if sys.version_info[0] >= 3:'
70
70
71 def execfile(fname, glob, loc=None):
71 def execfile(fname, glob, loc=None):
72 loc = loc if (loc is not None) else glob
72 loc = loc if (loc is not None) else glob
73 exec compile(open(fname).read(), fname, 'exec') in glob, loc
73 exec compile(open(fname, 'rb').read(), fname, 'exec') in glob, loc
74
74
75 # Refactor print statements in doctests.
75 # Refactor print statements in doctests.
76 _print_statement_re = re.compile(r"\bprint (?P<expr>.*)$", re.MULTILINE)
76 _print_statement_re = re.compile(r"\bprint (?P<expr>.*)$", re.MULTILINE)
General Comments 0
You need to be logged in to leave comments. Login now