Show More
@@ -0,0 +1,5 b'' | |||||
|
1 | # encoding: iso-8859-5 | |||
|
2 | # (Unlikely to be the default encoding for most testers.) | |||
|
3 | # ������������������� <- Cyrillic characters | |||
|
4 | from __future__ import unicode_literals | |||
|
5 | u = 'Ўт№Ф' |
@@ -0,0 +1,192 b'' | |||||
|
1 | """ | |||
|
2 | Tools to open .py files as Unicode, using the encoding specified within the file, | |||
|
3 | as per PEP 263. | |||
|
4 | ||||
|
5 | Much of the code is taken from the tokenize module in Python 3.2. | |||
|
6 | """ | |||
|
7 | from __future__ import absolute_import | |||
|
8 | ||||
|
9 | import __builtin__ | |||
|
10 | import io | |||
|
11 | from io import TextIOWrapper | |||
|
12 | import re | |||
|
13 | import urllib | |||
|
14 | ||||
|
# PEP 263 coding cookie, e.g. "# -*- coding: utf-8 -*-".
# Fix: use r"" instead of the ur"" prefix — ur-strings are a SyntaxError on
# Python 3, and since the pattern is pure ASCII the u prefix added nothing;
# re.UNICODE still makes \w/\s match Unicode when scanning unicode strings.
cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)", re.UNICODE)
# Same cookie, but anchored so it only matches a (possibly indented) comment
# line — used to decide whether a whole line is an encoding declaration.
cookie_comment_re = re.compile(r"^\s*#.*coding[:=]\s*([-\w.]+)", re.UNICODE)
|
17 | ||||
|
try:
    # Available in Python 3
    from tokenize import detect_encoding
except ImportError:
    # Python 2: backport detect_encoding from the Python 3.2 tokenize module.
    from codecs import lookup, BOM_UTF8

    # Copied from Python 3.2 tokenize
    def _get_normal_name(orig_enc):
        """Imitates get_normal_name in tokenizer.c."""
        # Only care about the first 12 characters.
        enc = orig_enc[:12].lower().replace("_", "-")
        if enc == "utf-8" or enc.startswith("utf-8-"):
            return "utf-8"
        if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
           enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
            return "iso-8859-1"
        return orig_enc

    # Copied from Python 3.2 tokenize
    def detect_encoding(readline):
        """
        The detect_encoding() function is used to detect the encoding that should
        be used to decode a Python source file. It requires one argument, readline,
        in the same way as the tokenize() generator.

        It will call readline a maximum of twice, and return the encoding used
        (as a string) and a list of any lines (left as bytes) it has read in.

        It detects the encoding from the presence of a utf-8 bom or an encoding
        cookie as specified in pep-0263. If both a bom and a cookie are present,
        but disagree, a SyntaxError will be raised. If the encoding cookie is an
        invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,
        'utf-8-sig' is returned.

        If no encoding is specified, then the default of 'utf-8' will be returned.
        """
        bom_found = False
        encoding = None
        default = 'utf-8'

        def read_or_stop():
            # Treat an exhausted readline as end-of-file rather than letting
            # StopIteration escape.
            try:
                return readline()
            except StopIteration:
                return b''

        def find_cookie(line):
            # Return the declared encoding from a line, or None if the line
            # carries no (decodable) PEP 263 cookie.
            try:
                # A valid declaration line must be pure ASCII.
                line_string = line.decode('ascii')
            except UnicodeDecodeError:
                return None

            matches = cookie_re.findall(line_string)
            if not matches:
                return None
            encoding = _get_normal_name(matches[0])
            try:
                # Validate the declared name against the codec registry.
                codec = lookup(encoding)
            except LookupError:
                # This behaviour mimics the Python interpreter
                raise SyntaxError("unknown encoding: " + encoding)

            if bom_found:
                # A BOM plus a non-utf-8 cookie is contradictory.
                if codec.name != 'utf-8':
                    # This behaviour mimics the Python interpreter
                    raise SyntaxError('encoding problem: utf-8')
                encoding += '-sig'
            return encoding

        # First line: check for a UTF-8 BOM before looking for a cookie.
        first = read_or_stop()
        if first.startswith(BOM_UTF8):
            bom_found = True
            first = first[3:]
            default = 'utf-8-sig'
        if not first:
            return default, []

        encoding = find_cookie(first)
        if encoding:
            return encoding, [first]

        # PEP 263 also allows the cookie on the second line.
        second = read_or_stop()
        if not second:
            return default, [first]

        encoding = find_cookie(second)
        if encoding:
            return encoding, [first, second]

        return default, [first, second]
|
107 | ||||
|
try:
    # Available in Python 3.2 and above.
    from tokenize import open
except ImportError:
    # Copied from Python 3.2 tokenize, with error handling added.
    def open(filename):
        """Open a file in read only mode using the encoding detected by
        detect_encoding().

        Returns a TextIOWrapper (mode 'r') that decodes with the encoding
        declared in the file per PEP 263, or UTF-8 by default.
        """
        buffer = io.open(filename, 'rb')  # Tweaked to use io.open for Python 2
        try:
            encoding, lines = detect_encoding(buffer.readline)
            buffer.seek(0)
            text = TextIOWrapper(buffer, encoding, line_buffering=True)
            text.mode = 'r'
            return text
        except:
            # Fix: don't leak the underlying file handle when encoding
            # detection fails (e.g. SyntaxError from a bad coding cookie).
            buffer.close()
            raise
|
123 | ||||
|
def strip_encoding_cookie(filelike):
    """Generator yielding lines from a text-mode file, dropping any line in
    the first two that is a PEP 263 encoding-cookie comment.
    """
    lines = iter(filelike)
    # Only the first two lines can legally carry the coding declaration.
    try:
        for _ in (0, 1):
            line = next(lines)
            if not cookie_comment_re.match(line):
                yield line
    except StopIteration:
        # Fewer than two lines in total — nothing more to emit.
        return

    # Everything after line two passes through untouched.
    for line in lines:
        yield line
|
141 | ||||
|
def read_py_file(filename, skip_encoding_cookie=True):
    """Read a Python file, decoding it with the encoding it declares inside.

    Parameters
    ----------
    filename : str
      The path to the file to read.
    skip_encoding_cookie : bool
      If True (the default), and the encoding declaration is found in the first
      two lines, that line will be excluded from the output - compiling a
      unicode string with an encoding declaration is a SyntaxError in Python 2.

    Returns
    -------
    A unicode string containing the contents of the file.
    """
    # 'open' here is this module's encoding-detecting open, not the builtin.
    with open(filename) as f:
        if not skip_encoding_cookie:
            return f.read()
        return "".join(strip_encoding_cookie(f))
|
163 | ||||
|
def read_py_url(url, errors='replace', skip_encoding_cookie=True):
    """Read a Python file from a URL, using the encoding declared inside the file.

    Parameters
    ----------
    url : str
      The URL from which to fetch the file.
    errors : str
      How to handle decoding errors in the file. Options are the same as for
      bytes.decode(), but here 'replace' is the default.
    skip_encoding_cookie : bool
      If True (the default), and the encoding declaration is found in the first
      two lines, that line will be excluded from the output - compiling a
      unicode string with an encoding declaration is a SyntaxError in Python 2.

    Returns
    -------
    A unicode string containing the contents of the file.
    """
    response = urllib.urlopen(url)
    try:
        # Buffer the whole payload in memory so detect_encoding can rewind.
        buffer = io.BytesIO(response.read())
    finally:
        # Fix: always release the network connection; the original never
        # closed the urlopen response object.
        response.close()
    encoding, lines = detect_encoding(buffer.readline)
    buffer.seek(0)
    text = TextIOWrapper(buffer, encoding, errors=errors, line_buffering=True)
    text.mode = 'r'
    if skip_encoding_cookie:
        return "".join(strip_encoding_cookie(text))
    else:
        return text.read()
@@ -0,0 +1,23 b'' | |||||
|
1 | import io | |||
|
2 | import os.path | |||
|
3 | import nose.tools as nt | |||
|
4 | ||||
|
5 | from IPython.utils import openpy | |||
|
6 | ||||
|
7 | mydir = os.path.dirname(__file__) | |||
|
8 | nonascii_path = os.path.join(mydir, '../../core/tests/nonascii.py') | |||
|
9 | ||||
|
def test_detect_encoding():
    # Use a context manager so the file handle is closed even if the
    # assertion fails — the original leaked the open file object.
    with open(nonascii_path, 'rb') as f:
        enc, lines = openpy.detect_encoding(f.readline)
    nt.assert_equal(enc, 'iso-8859-5')
|
14 | ||||
|
def test_read_file():
    # Read the reference copy with an explicit encoding, closing the handle
    # promptly — the original io.open call was never closed.
    with io.open(nonascii_path, encoding='iso-8859-5') as f:
        read_specified_enc = f.read()
    read_detected_enc = openpy.read_py_file(nonascii_path, skip_encoding_cookie=False)
    nt.assert_equal(read_detected_enc, read_specified_enc)
    assert u'encoding: iso-8859-5' in read_detected_enc

    # With the cookie stripped, the declaration line must be gone.
    read_strip_enc_cookie = openpy.read_py_file(nonascii_path, skip_encoding_cookie=True)
    assert u'encoding: iso-8859-5' not in read_strip_enc_cookie
|
23 |
@@ -55,6 +55,7 b' from IPython.core.prefilter import ESC_MAGIC' | |||||
55 | from IPython.core.pylabtools import mpl_runner |
|
55 | from IPython.core.pylabtools import mpl_runner | |
56 | from IPython.testing.skipdoctest import skip_doctest |
|
56 | from IPython.testing.skipdoctest import skip_doctest | |
57 | from IPython.utils import py3compat |
|
57 | from IPython.utils import py3compat | |
|
58 | from IPython.utils import openpy | |||
58 | from IPython.utils.io import file_read, nlprint |
|
59 | from IPython.utils.io import file_read, nlprint | |
59 | from IPython.utils.module_paths import find_mod |
|
60 | from IPython.utils.module_paths import find_mod | |
60 | from IPython.utils.path import get_py_filename, unquote_filename |
|
61 | from IPython.utils.path import get_py_filename, unquote_filename | |
@@ -98,9 +99,6 b' def needs_local_scope(func):' | |||||
# Used for exception handling in magic_edit
class MacroToEdit(ValueError):
    """Raised to signal that the requested edit target is a macro
    (caught and handled inside magic_edit)."""
    pass
|
101 | |||
101 | # Taken from PEP 263, this is the official encoding regexp. |
|
|||
102 | _encoding_declaration_re = re.compile(r"^#.*coding[:=]\s*([-\w.]+)") |
|
|||
103 |
|
||||
104 | #*************************************************************************** |
|
102 | #*************************************************************************** | |
105 | # Main class implementing Magic functionality |
|
103 | # Main class implementing Magic functionality | |
106 |
|
104 | |||
@@ -2256,28 +2254,15 b' Currently the magic system has the following functions:\\n"""' | |||||
2256 | # Local files must be .py; for remote URLs it's possible that the |
|
2254 | # Local files must be .py; for remote URLs it's possible that the | |
2257 | # fetch URL doesn't have a .py in it (many servers have an opaque |
|
2255 | # fetch URL doesn't have a .py in it (many servers have an opaque | |
2258 | # URL, such as scipy-central.org). |
|
2256 | # URL, such as scipy-central.org). | |
2259 | raise ValueError('%%load only works with .py files: %s' % arg_s) |
|
2257 | raise ValueError('%%loadpy only works with .py files: %s' % arg_s) | |
|
2258 | ||||
|
2259 | # openpy takes care of finding the source encoding (per PEP 263) | |||
2260 | if remote_url: |
|
2260 | if remote_url: | |
2261 | import urllib2 |
|
2261 | contents = openpy.read_py_url(arg_s, skip_encoding_cookie=True) | |
2262 | fileobj = urllib2.urlopen(arg_s) |
|
|||
2263 | # While responses have a .info().getencoding() way of asking for |
|
|||
2264 | # their encoding, in *many* cases the return value is bogus. In |
|
|||
2265 | # the wild, servers serving utf-8 but declaring latin-1 are |
|
|||
2266 | # extremely common, as the old HTTP standards specify latin-1 as |
|
|||
2267 | # the default but many modern filesystems use utf-8. So we can NOT |
|
|||
2268 | # rely on the headers. Short of building complex encoding-guessing |
|
|||
2269 | # logic, going with utf-8 is a simple solution likely to be right |
|
|||
2270 | # in most real-world cases. |
|
|||
2271 | linesource = fileobj.read().decode('utf-8', 'replace').splitlines() |
|
|||
2272 | fileobj.close() |
|
|||
2273 | else: |
|
2262 | else: | |
2274 | with open(arg_s) as fileobj: |
|
2263 | contents = openpy.read_py_file(arg_s, skip_encoding_cookie=True) | |
2275 | linesource = fileobj.read().splitlines() |
|
|||
2276 |
|
||||
2277 | # Strip out encoding declarations |
|
|||
2278 | lines = [l for l in linesource if not _encoding_declaration_re.match(l)] |
|
|||
2279 |
|
2264 | |||
2280 |
self.set_next_input( |
|
2265 | self.set_next_input(contents) | |
2281 |
|
2266 | |||
2282 | def _find_edit_target(self, args, opts, last_call): |
|
2267 | def _find_edit_target(self, args, opts, last_call): | |
2283 | """Utility method used by magic_edit to find what to edit.""" |
|
2268 | """Utility method used by magic_edit to find what to edit.""" |
@@ -1,3 +1,4 b'' | |||||
|
1 | # encoding: utf-8 | |||
1 | """Tests for code execution (%run and related), which is particularly tricky. |
|
2 | """Tests for code execution (%run and related), which is particularly tricky. | |
2 |
|
3 | |||
3 | Because of how %run manages namespaces, and the fact that we are trying here to |
|
4 | Because of how %run manages namespaces, and the fact that we are trying here to | |
@@ -240,3 +241,10 b' tclass.py: deleting object: C-third' | |||||
240 | _ip.run_cell("zz = 23") |
|
241 | _ip.run_cell("zz = 23") | |
241 | _ip.magic('run -i %s' % self.fname) |
|
242 | _ip.magic('run -i %s' % self.fname) | |
242 | tt.assert_equals(_ip.user_ns['yy'], 23) |
|
243 | tt.assert_equals(_ip.user_ns['yy'], 23) | |
|
244 | ||||
|
245 | def test_unicode(self): | |||
|
246 | """Check that files in odd encodings are accepted.""" | |||
|
247 | mydir = os.path.dirname(__file__) | |||
|
248 | na = os.path.join(mydir, 'nonascii.py') | |||
|
249 | _ip.magic('run %s' % na) | |||
|
250 | tt.assert_equals(_ip.user_ns['u'], u'Ўт№Ф') |
@@ -70,7 +70,7 b' if sys.version_info[0] >= 3:' | |||||
70 |
|
70 | |||
71 | def execfile(fname, glob, loc=None): |
|
71 | def execfile(fname, glob, loc=None): | |
72 | loc = loc if (loc is not None) else glob |
|
72 | loc = loc if (loc is not None) else glob | |
73 | exec compile(open(fname).read(), fname, 'exec') in glob, loc |
|
73 | exec compile(open(fname, 'rb').read(), fname, 'exec') in glob, loc | |
74 |
|
74 | |||
75 | # Refactor print statements in doctests. |
|
75 | # Refactor print statements in doctests. | |
76 | _print_statement_re = re.compile(r"\bprint (?P<expr>.*)$", re.MULTILINE) |
|
76 | _print_statement_re = re.compile(r"\bprint (?P<expr>.*)$", re.MULTILINE) |
General Comments 0
You need to be logged in to leave comments.
Login now