##// END OF EJS Templates
Merge pull request #1526 from takluyver/openpy...
Fernando Perez -
r6454:0fed70ce merge
parent child Browse files
Show More
@@ -0,0 +1,5 b''
1 # encoding: iso-8859-5
2 # (Unlikely to be the default encoding for most testers.)
3 # ������������������� <- Cyrillic characters
4 from __future__ import unicode_literals
5 u = '����'
@@ -0,0 +1,192 b''
1 """
2 Tools to open .py files as Unicode, using the encoding specified within the file,
3 as per PEP 263.
4
5 Much of the code is taken from the tokenize module in Python 3.2.
6 """
7 from __future__ import absolute_import
8
9 import __builtin__
10 import io
11 from io import TextIOWrapper
12 import re
13 import urllib
14
15 cookie_re = re.compile(ur"coding[:=]\s*([-\w.]+)", re.UNICODE)
16 cookie_comment_re = re.compile(ur"^\s*#.*coding[:=]\s*([-\w.]+)", re.UNICODE)
17
try:
    # Available in Python 3
    from tokenize import detect_encoding
except ImportError:
    # Python 2: backport the Python 3.2 implementation below.
    from codecs import lookup, BOM_UTF8

    # Copied from Python 3.2 tokenize
    def _get_normal_name(orig_enc):
        """Imitates get_normal_name in tokenizer.c."""
        # Only care about the first 12 characters.
        enc = orig_enc[:12].lower().replace("_", "-")
        if enc == "utf-8" or enc.startswith("utf-8-"):
            return "utf-8"
        if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
           enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
            return "iso-8859-1"
        return orig_enc

    # Copied from Python 3.2 tokenize
    def detect_encoding(readline):
        """
        The detect_encoding() function is used to detect the encoding that should
        be used to decode a Python source file. It requires one argument, readline,
        in the same way as the tokenize() generator.

        It will call readline a maximum of twice, and return the encoding used
        (as a string) and a list of any lines (left as bytes) it has read in.

        It detects the encoding from the presence of a utf-8 bom or an encoding
        cookie as specified in pep-0263. If both a bom and a cookie are present,
        but disagree, a SyntaxError will be raised. If the encoding cookie is an
        invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,
        'utf-8-sig' is returned.

        If no encoding is specified, then the default of 'utf-8' will be returned.
        """
        bom_found = False
        encoding = None
        default = 'utf-8'
        def read_or_stop():
            # Treat an exhausted readline as end-of-file.
            try:
                return readline()
            except StopIteration:
                return b''

        def find_cookie(line):
            # PEP 263 requires the cookie line to be pure ASCII; a line that
            # does not decode as ASCII cannot carry a valid declaration.
            try:
                line_string = line.decode('ascii')
            except UnicodeDecodeError:
                return None

            # NOTE: cookie_re (module level) is not anchored to a comment, so
            # a "coding[:=]" match anywhere in the line is accepted — this
            # mirrors the tokenize module this code was copied from.
            matches = cookie_re.findall(line_string)
            if not matches:
                return None
            encoding = _get_normal_name(matches[0])
            try:
                codec = lookup(encoding)
            except LookupError:
                # This behaviour mimics the Python interpreter
                raise SyntaxError("unknown encoding: " + encoding)

            if bom_found:
                # A UTF-8 BOM plus a cookie that disagrees is an error.
                if codec.name != 'utf-8':
                    # This behaviour mimics the Python interpreter
                    raise SyntaxError('encoding problem: utf-8')
                encoding += '-sig'
            return encoding

        first = read_or_stop()
        if first.startswith(BOM_UTF8):
            bom_found = True
            first = first[3:]        # strip the 3-byte BOM before returning the line
            default = 'utf-8-sig'
        if not first:
            return default, []

        encoding = find_cookie(first)
        if encoding:
            return encoding, [first]

        # The cookie may legally appear on the second line as well.
        second = read_or_stop()
        if not second:
            return default, [first]

        encoding = find_cookie(second)
        if encoding:
            return encoding, [first, second]

        return default, [first, second]
107
try:
    # Available in Python 3.2 and above.
    from tokenize import open
except ImportError:
    # Backport for Python 2, adapted from the Python 3.2 tokenize module.
    def open(filename):
        """Open a file in read only mode using the encoding detected by
        detect_encoding().
        """
        raw = io.open(filename, 'rb')  # Tweaked to use io.open for Python 2
        encoding, _ = detect_encoding(raw.readline)
        raw.seek(0)
        wrapper = TextIOWrapper(raw, encoding, line_buffering=True)
        wrapper.mode = 'r'
        return wrapper
123
def strip_encoding_cookie(filelike):
    """Generator to pull lines from a text-mode file, skipping the encoding
    cookie if it is found in the first two lines.

    Per PEP 263 the coding declaration must be on line 1 or line 2, and only
    one declaration is effective — so at most one line is ever dropped.
    """
    it = iter(filelike)
    try:
        first = next(it)
        if not cookie_comment_re.match(first):
            yield first
            second = next(it)
            if not cookie_comment_re.match(second):
                yield second
        # If line 1 carried the cookie it was the (only) declaration;
        # line 2 and everything after is passed through by the loop below.
        # (Previously a cookie-shaped comment on line 2 was also stripped.)
    except StopIteration:
        return

    for line in it:
        yield line
141
def read_py_file(filename, skip_encoding_cookie=True):
    """Read a Python file, using the encoding declared inside the file.

    Parameters
    ----------
    filename : str
      The path to the file to read.
    skip_encoding_cookie : bool
      If True (the default), and the encoding declaration is found in the first
      two lines, that line will be excluded from the output - compiling a
      unicode string with an encoding declaration is a SyntaxError in Python 2.

    Returns
    -------
    A unicode string containing the contents of the file.
    """
    # Note: 'open' is this module's PEP-263-aware open, not the builtin.
    with open(filename) as stream:
        if not skip_encoding_cookie:
            return stream.read()
        return "".join(strip_encoding_cookie(stream))
163
def read_py_url(url, errors='replace', skip_encoding_cookie=True):
    """Read a Python file from a URL, using the encoding declared inside the file.

    Parameters
    ----------
    url : str
      The URL from which to fetch the file.
    errors : str
      How to handle decoding errors in the file. Options are the same as for
      bytes.decode(), but here 'replace' is the default.
    skip_encoding_cookie : bool
      If True (the default), and the encoding declaration is found in the first
      two lines, that line will be excluded from the output - compiling a
      unicode string with an encoding declaration is a SyntaxError in Python 2.

    Returns
    -------
    A unicode string containing the contents of the file.
    """
    response = urllib.urlopen(url)
    try:
        # Drain the response into memory so we can seek (HTTP bodies can't),
        # then close the connection promptly instead of leaking the socket.
        buffer = io.BytesIO(response.read())
    finally:
        response.close()
    encoding, lines = detect_encoding(buffer.readline)
    buffer.seek(0)
    text = TextIOWrapper(buffer, encoding, errors=errors, line_buffering=True)
    text.mode = 'r'
    if skip_encoding_cookie:
        return "".join(strip_encoding_cookie(text))
    else:
        return text.read()
@@ -0,0 +1,23 b''
1 import io
2 import os.path
3 import nose.tools as nt
4
5 from IPython.utils import openpy
6
7 mydir = os.path.dirname(__file__)
8 nonascii_path = os.path.join(mydir, '../../core/tests/nonascii.py')
9
def test_detect_encoding():
    """detect_encoding must find the iso-8859-5 cookie in the fixture file."""
    # Use a context manager so the handle is closed even if the test fails
    # (the original left the file open).
    with open(nonascii_path, 'rb') as f:
        enc, lines = openpy.detect_encoding(f.readline)
    nt.assert_equal(enc, 'iso-8859-5')
14
def test_read_file():
    """read_py_file must honour the declared encoding and strip the cookie on request."""
    # Reference decoding: read the fixture with its declared encoding directly,
    # closing the handle deterministically (the original leaked it).
    with io.open(nonascii_path, encoding='iso-8859-5') as f:
        read_specified_enc = f.read()
    read_detected_enc = openpy.read_py_file(nonascii_path, skip_encoding_cookie=False)
    nt.assert_equal(read_detected_enc, read_specified_enc)
    # Cookie line survives when stripping is disabled...
    assert u'encoding: iso-8859-5' in read_detected_enc

    # ...and is removed when stripping is enabled.
    read_strip_enc_cookie = openpy.read_py_file(nonascii_path, skip_encoding_cookie=True)
    assert u'encoding: iso-8859-5' not in read_strip_enc_cookie
23
@@ -55,6 +55,7 b' from IPython.core.prefilter import ESC_MAGIC'
55 from IPython.core.pylabtools import mpl_runner
55 from IPython.core.pylabtools import mpl_runner
56 from IPython.testing.skipdoctest import skip_doctest
56 from IPython.testing.skipdoctest import skip_doctest
57 from IPython.utils import py3compat
57 from IPython.utils import py3compat
58 from IPython.utils import openpy
58 from IPython.utils.io import file_read, nlprint
59 from IPython.utils.io import file_read, nlprint
59 from IPython.utils.module_paths import find_mod
60 from IPython.utils.module_paths import find_mod
60 from IPython.utils.path import get_py_filename, unquote_filename
61 from IPython.utils.path import get_py_filename, unquote_filename
@@ -98,9 +99,6 b' def needs_local_scope(func):'
98 # Used for exception handling in magic_edit
99 # Used for exception handling in magic_edit
99 class MacroToEdit(ValueError): pass
100 class MacroToEdit(ValueError): pass
100
101
101 # Taken from PEP 263, this is the official encoding regexp.
102 _encoding_declaration_re = re.compile(r"^#.*coding[:=]\s*([-\w.]+)")
103
104 #***************************************************************************
102 #***************************************************************************
105 # Main class implementing Magic functionality
103 # Main class implementing Magic functionality
106
104
@@ -2256,28 +2254,15 b' Currently the magic system has the following functions:\\n"""'
2256 # Local files must be .py; for remote URLs it's possible that the
2254 # Local files must be .py; for remote URLs it's possible that the
2257 # fetch URL doesn't have a .py in it (many servers have an opaque
2255 # fetch URL doesn't have a .py in it (many servers have an opaque
2258 # URL, such as scipy-central.org).
2256 # URL, such as scipy-central.org).
2259 raise ValueError('%%load only works with .py files: %s' % arg_s)
2257 raise ValueError('%%loadpy only works with .py files: %s' % arg_s)
2258
2259 # openpy takes care of finding the source encoding (per PEP 263)
2260 if remote_url:
2260 if remote_url:
2261 import urllib2
2261 contents = openpy.read_py_url(arg_s, skip_encoding_cookie=True)
2262 fileobj = urllib2.urlopen(arg_s)
2263 # While responses have a .info().getencoding() way of asking for
2264 # their encoding, in *many* cases the return value is bogus. In
2265 # the wild, servers serving utf-8 but declaring latin-1 are
2266 # extremely common, as the old HTTP standards specify latin-1 as
2267 # the default but many modern filesystems use utf-8. So we can NOT
2268 # rely on the headers. Short of building complex encoding-guessing
2269 # logic, going with utf-8 is a simple solution likely to be right
2270 # in most real-world cases.
2271 linesource = fileobj.read().decode('utf-8', 'replace').splitlines()
2272 fileobj.close()
2273 else:
2262 else:
2274 with open(arg_s) as fileobj:
2263 contents = openpy.read_py_file(arg_s, skip_encoding_cookie=True)
2275 linesource = fileobj.read().splitlines()
2276
2277 # Strip out encoding declarations
2278 lines = [l for l in linesource if not _encoding_declaration_re.match(l)]
2279
2264
2280 self.set_next_input(os.linesep.join(lines))
2265 self.set_next_input(contents)
2281
2266
2282 def _find_edit_target(self, args, opts, last_call):
2267 def _find_edit_target(self, args, opts, last_call):
2283 """Utility method used by magic_edit to find what to edit."""
2268 """Utility method used by magic_edit to find what to edit."""
@@ -1,3 +1,4 b''
1 # encoding: utf-8
1 """Tests for code execution (%run and related), which is particularly tricky.
2 """Tests for code execution (%run and related), which is particularly tricky.
2
3
3 Because of how %run manages namespaces, and the fact that we are trying here to
4 Because of how %run manages namespaces, and the fact that we are trying here to
@@ -240,3 +241,10 b' tclass.py: deleting object: C-third'
240 _ip.run_cell("zz = 23")
241 _ip.run_cell("zz = 23")
241 _ip.magic('run -i %s' % self.fname)
242 _ip.magic('run -i %s' % self.fname)
242 tt.assert_equals(_ip.user_ns['yy'], 23)
243 tt.assert_equals(_ip.user_ns['yy'], 23)
244
    def test_unicode(self):
        """Check that files in odd encodings are accepted."""
        # nonascii.py declares '# encoding: iso-8859-5' and assigns a
        # Cyrillic literal to ``u``; %run must honour that declaration
        # rather than assuming the default source encoding.
        mydir = os.path.dirname(__file__)
        na = os.path.join(mydir, 'nonascii.py')
        _ip.magic('run %s' % na)
        tt.assert_equals(_ip.user_ns['u'], u'Ўт№Ф')
@@ -70,7 +70,7 b' if sys.version_info[0] >= 3:'
70
70
71 def execfile(fname, glob, loc=None):
71 def execfile(fname, glob, loc=None):
72 loc = loc if (loc is not None) else glob
72 loc = loc if (loc is not None) else glob
73 exec compile(open(fname).read(), fname, 'exec') in glob, loc
73 exec compile(open(fname, 'rb').read(), fname, 'exec') in glob, loc
74
74
75 # Refactor print statements in doctests.
75 # Refactor print statements in doctests.
76 _print_statement_re = re.compile(r"\bprint (?P<expr>.*)$", re.MULTILINE)
76 _print_statement_re = re.compile(r"\bprint (?P<expr>.*)$", re.MULTILINE)
General Comments 0
You need to be logged in to leave comments. Login now