Add docstrings for read_py_file and read_py_url.
Thomas Kluyver
@@ -1,162 +1,192 @@
 """
 Tools to open .py files as Unicode, using the encoding specified within the file,
 as per PEP 263.

 Much of the code is taken from the tokenize module in Python 3.2.
 """
 from __future__ import absolute_import

 import __builtin__
 import io
 from io import TextIOWrapper
 import re
 import urllib

 cookie_re = re.compile(ur"coding[:=]\s*([-\w.]+)", re.UNICODE)
 cookie_comment_re = re.compile(ur"^\s*#.*coding[:=]\s*([-\w.]+)", re.UNICODE)

 try:
     # Available in Python 3
     from tokenize import detect_encoding
 except ImportError:
     from codecs import lookup, BOM_UTF8

     # Copied from Python 3.2 tokenize
     def _get_normal_name(orig_enc):
         """Imitates get_normal_name in tokenizer.c."""
         # Only care about the first 12 characters.
         enc = orig_enc[:12].lower().replace("_", "-")
         if enc == "utf-8" or enc.startswith("utf-8-"):
             return "utf-8"
         if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
            enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
             return "iso-8859-1"
         return orig_enc

     # Copied from Python 3.2 tokenize
     def detect_encoding(readline):
         """
         The detect_encoding() function is used to detect the encoding that should
         be used to decode a Python source file. It requires one argment, readline,
         in the same way as the tokenize() generator.

         It will call readline a maximum of twice, and return the encoding used
         (as a string) and a list of any lines (left as bytes) it has read in.

         It detects the encoding from the presence of a utf-8 bom or an encoding
         cookie as specified in pep-0263. If both a bom and a cookie are present,
         but disagree, a SyntaxError will be raised. If the encoding cookie is an
         invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,
         'utf-8-sig' is returned.

         If no encoding is specified, then the default of 'utf-8' will be returned.
         """
         bom_found = False
         encoding = None
         default = 'utf-8'
         def read_or_stop():
             try:
                 return readline()
             except StopIteration:
                 return b''

         def find_cookie(line):
             try:
                 line_string = line.decode('ascii')
             except UnicodeDecodeError:
                 return None

             matches = cookie_re.findall(line_string)
             if not matches:
                 return None
             encoding = _get_normal_name(matches[0])
             try:
                 codec = lookup(encoding)
             except LookupError:
                 # This behaviour mimics the Python interpreter
                 raise SyntaxError("unknown encoding: " + encoding)

             if bom_found:
                 if codec.name != 'utf-8':
                     # This behaviour mimics the Python interpreter
                     raise SyntaxError('encoding problem: utf-8')
                 encoding += '-sig'
             return encoding

         first = read_or_stop()
         if first.startswith(BOM_UTF8):
             bom_found = True
             first = first[3:]
             default = 'utf-8-sig'
         if not first:
             return default, []

         encoding = find_cookie(first)
         if encoding:
             return encoding, [first]

         second = read_or_stop()
         if not second:
             return default, [first]

         encoding = find_cookie(second)
         if encoding:
             return encoding, [first, second]

         return default, [first, second]

 try:
     # Available in Python 3.2 and above.
     from tokenize import open
 except ImportError:
     # Copied from Python 3.2 tokenize
     def open(filename):
         """Open a file in read only mode using the encoding detected by
         detect_encoding().
         """
         buffer = io.open(filename, 'rb')  # Tweaked to use io.open for Python 2
         encoding, lines = detect_encoding(buffer.readline)
         buffer.seek(0)
         text = TextIOWrapper(buffer, encoding, line_buffering=True)
         text.mode = 'r'
         return text

 def strip_encoding_cookie(filelike):
     """Generator to pull lines from a text-mode file, skipping the encoding
     cookie if it is found in the first two lines.
     """
     it = iter(filelike)
     try:
         first = next(it)
         if not cookie_comment_re.match(first):
             yield first
         second = next(it)
         if not cookie_comment_re.match(second):
             yield second
     except StopIteration:
         return

     for line in it:
         yield line

-def read_py_file(filename, errors='replace', skip_encoding_cookie=True):
+def read_py_file(filename, skip_encoding_cookie=True):
+    """Read a Python file, using the encoding declared inside the file.
+
+    Parameters
+    ----------
+    filename : str
+      The path to the file to read.
+    skip_encoding_cookie : bool
+      If True (the default), and the encoding declaration is found in the first
+      two lines, that line will be excluded from the output - compiling a
+      unicode string with an encoding declaration is a SyntaxError in Python 2.
+
+    Returns
+    -------
+    A unicode string containing the contents of the file.
+    """
     with open(filename) as f:  # the open function defined in this module.
         if skip_encoding_cookie:
             return "".join(strip_encoding_cookie(f))
         else:
             return f.read()

 def read_py_url(url, errors='replace', skip_encoding_cookie=True):
-    """Open a URL to a raw Python file, using the encoding detected by
-    detect_encoding().
+    """Read a Python file from a URL, using the encoding declared inside the file.
+
+    Parameters
+    ----------
+    url : str
+      The URL from which to fetch the file.
+    errors : str
+      How to handle decoding errors in the file. Options are the same as for
+      bytes.decode(), but here 'replace' is the default.
+    skip_encoding_cookie : bool
+      If True (the default), and the encoding declaration is found in the first
+      two lines, that line will be excluded from the output - compiling a
+      unicode string with an encoding declaration is a SyntaxError in Python 2.
+
+    Returns
+    -------
+    A unicode string containing the contents of the file.
     """
     response = urllib.urlopen(url)
     buffer = io.BytesIO(response.read())
     encoding, lines = detect_encoding(buffer.readline)
     buffer.seek(0)
     text = TextIOWrapper(buffer, encoding, errors=errors, line_buffering=True)
     text.mode = 'r'
     if skip_encoding_cookie:
         return "".join(strip_encoding_cookie(text))
     else:
         return text.read()
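
For context, a minimal usage sketch of the functions this commit documents. The diff does not name the file, so the import path below (IPython.utils.openpy) is an assumption; adjust it to wherever the module actually lives. Everything else uses only the functions shown above, under Python 2.

import io
from IPython.utils import openpy   # assumed import path; not stated in the diff

# Write a small Latin-1 source file carrying a PEP 263 coding cookie.
source = u'# -*- coding: latin-1 -*-\nname = u"caf\xe9"\n'
with io.open('example_latin1.py', 'w', encoding='latin-1') as f:
    f.write(source)

# detect_encoding reads at most two lines and reports the declared codec.
with io.open('example_latin1.py', 'rb') as f:
    encoding, consumed = openpy.detect_encoding(f.readline)
print(encoding)                              # -> 'iso-8859-1'

# read_py_file returns the decoded text; with skip_encoding_cookie=True (the
# default) the coding comment is dropped, so the resulting unicode string can
# be compiled under Python 2 without the SyntaxError the new docstring warns about.
text = openpy.read_py_file('example_latin1.py')
compile(text, 'example_latin1.py', 'exec')   # no encoding declaration left in text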
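
A second sketch mirrors what read_py_url does internally, run against an in-memory buffer so it works without network access; the byte payload is invented for illustration, standing in for response.read(). It also shows that strip_encoding_cookie only ever drops a coding comment found in the first two lines.

import io
from io import TextIOWrapper
from IPython.utils import openpy   # assumed import path, as above

raw = b'# coding: iso-8859-1\nname = "caf\xe9"\n'   # made-up payload
buffer = io.BytesIO(raw)

# Same sequence as read_py_url: sniff the encoding, rewind, wrap, strip.
encoding, consumed = openpy.detect_encoding(buffer.readline)
buffer.seek(0)
text = TextIOWrapper(buffer, encoding, errors='replace', line_buffering=True)
decoded = "".join(openpy.strip_encoding_cookie(text))
# decoded == u'name = "caf\xe9"\n'; with errors='replace', bytes that do not
# fit the declared codec would become U+FFFD instead of raising.

# A coding comment after the first two lines is passed through untouched.
sample = [u'# -*- coding: utf-8 -*-\n', u'x = 1\n', u'# coding: utf-8\n']
list(openpy.strip_encoding_cookie(sample))   # -> [u'x = 1\n', u'# coding: utf-8\n']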