##// END OF EJS Templates
make source_to_unicode use BytesIO and refactor
Jörgen Stenarson -
Show More
@@ -1,204 +1,210 b''
1 """
1 """
2 Tools to open .py files as Unicode, using the encoding specified within the file,
2 Tools to open .py files as Unicode, using the encoding specified within the file,
3 as per PEP 263.
3 as per PEP 263.
4
4
5 Much of the code is taken from the tokenize module in Python 3.2.
5 Much of the code is taken from the tokenize module in Python 3.2.
6 """
6 """
7 from __future__ import absolute_import
7 from __future__ import absolute_import
8
8
9 import io
9 import io
10 from io import TextIOWrapper
10 from io import TextIOWrapper, BytesIO
11 import re
11 import re
12 from StringIO import StringIO
13 import urllib
12 import urllib
14
13
15 cookie_re = re.compile(ur"coding[:=]\s*([-\w.]+)", re.UNICODE)
14 cookie_re = re.compile(ur"coding[:=]\s*([-\w.]+)", re.UNICODE)
16 cookie_comment_re = re.compile(ur"^\s*#.*coding[:=]\s*([-\w.]+)", re.UNICODE)
15 cookie_comment_re = re.compile(ur"^\s*#.*coding[:=]\s*([-\w.]+)", re.UNICODE)
17
16
18 try:
17 try:
19 # Available in Python 3
18 # Available in Python 3
20 from tokenize import detect_encoding
19 from tokenize import detect_encoding
21 except ImportError:
20 except ImportError:
22 from codecs import lookup, BOM_UTF8
21 from codecs import lookup, BOM_UTF8
23
22
24 # Copied from Python 3.2 tokenize
23 # Copied from Python 3.2 tokenize
25 def _get_normal_name(orig_enc):
24 def _get_normal_name(orig_enc):
26 """Imitates get_normal_name in tokenizer.c."""
25 """Imitates get_normal_name in tokenizer.c."""
27 # Only care about the first 12 characters.
26 # Only care about the first 12 characters.
28 enc = orig_enc[:12].lower().replace("_", "-")
27 enc = orig_enc[:12].lower().replace("_", "-")
29 if enc == "utf-8" or enc.startswith("utf-8-"):
28 if enc == "utf-8" or enc.startswith("utf-8-"):
30 return "utf-8"
29 return "utf-8"
31 if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
30 if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
32 enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
31 enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
33 return "iso-8859-1"
32 return "iso-8859-1"
34 return orig_enc
33 return orig_enc
35
34
36 # Copied from Python 3.2 tokenize
35 # Copied from Python 3.2 tokenize
37 def detect_encoding(readline):
36 def detect_encoding(readline):
38 """
37 """
39 The detect_encoding() function is used to detect the encoding that should
38 The detect_encoding() function is used to detect the encoding that should
40 be used to decode a Python source file. It requires one argument, readline,
39 be used to decode a Python source file. It requires one argument, readline,
41 in the same way as the tokenize() generator.
40 in the same way as the tokenize() generator.
42
41
43 It will call readline a maximum of twice, and return the encoding used
42 It will call readline a maximum of twice, and return the encoding used
44 (as a string) and a list of any lines (left as bytes) it has read in.
43 (as a string) and a list of any lines (left as bytes) it has read in.
45
44
46 It detects the encoding from the presence of a utf-8 bom or an encoding
45 It detects the encoding from the presence of a utf-8 bom or an encoding
47 cookie as specified in pep-0263. If both a bom and a cookie are present,
46 cookie as specified in pep-0263. If both a bom and a cookie are present,
48 but disagree, a SyntaxError will be raised. If the encoding cookie is an
47 but disagree, a SyntaxError will be raised. If the encoding cookie is an
49 invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,
48 invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,
50 'utf-8-sig' is returned.
49 'utf-8-sig' is returned.
51
50
52 If no encoding is specified, then the default of 'utf-8' will be returned.
51 If no encoding is specified, then the default of 'utf-8' will be returned.
53 """
52 """
54 bom_found = False
53 bom_found = False
55 encoding = None
54 encoding = None
56 default = 'utf-8'
55 default = 'utf-8'
57 def read_or_stop():
56 def read_or_stop():
58 try:
57 try:
59 return readline()
58 return readline()
60 except StopIteration:
59 except StopIteration:
61 return b''
60 return b''
62
61
63 def find_cookie(line):
62 def find_cookie(line):
64 try:
63 try:
65 line_string = line.decode('ascii')
64 line_string = line.decode('ascii')
66 except UnicodeDecodeError:
65 except UnicodeDecodeError:
67 return None
66 return None
68
67
69 matches = cookie_re.findall(line_string)
68 matches = cookie_re.findall(line_string)
70 if not matches:
69 if not matches:
71 return None
70 return None
72 encoding = _get_normal_name(matches[0])
71 encoding = _get_normal_name(matches[0])
73 try:
72 try:
74 codec = lookup(encoding)
73 codec = lookup(encoding)
75 except LookupError:
74 except LookupError:
76 # This behaviour mimics the Python interpreter
75 # This behaviour mimics the Python interpreter
77 raise SyntaxError("unknown encoding: " + encoding)
76 raise SyntaxError("unknown encoding: " + encoding)
78
77
79 if bom_found:
78 if bom_found:
80 if codec.name != 'utf-8':
79 if codec.name != 'utf-8':
81 # This behaviour mimics the Python interpreter
80 # This behaviour mimics the Python interpreter
82 raise SyntaxError('encoding problem: utf-8')
81 raise SyntaxError('encoding problem: utf-8')
83 encoding += '-sig'
82 encoding += '-sig'
84 return encoding
83 return encoding
85
84
86 first = read_or_stop()
85 first = read_or_stop()
87 if first.startswith(BOM_UTF8):
86 if first.startswith(BOM_UTF8):
88 bom_found = True
87 bom_found = True
89 first = first[3:]
88 first = first[3:]
90 default = 'utf-8-sig'
89 default = 'utf-8-sig'
91 if not first:
90 if not first:
92 return default, []
91 return default, []
93
92
94 encoding = find_cookie(first)
93 encoding = find_cookie(first)
95 if encoding:
94 if encoding:
96 return encoding, [first]
95 return encoding, [first]
97
96
98 second = read_or_stop()
97 second = read_or_stop()
99 if not second:
98 if not second:
100 return default, [first]
99 return default, [first]
101
100
102 encoding = find_cookie(second)
101 encoding = find_cookie(second)
103 if encoding:
102 if encoding:
104 return encoding, [first, second]
103 return encoding, [first, second]
105
104
106 return default, [first, second]
105 return default, [first, second]
107
106
108 try:
107 try:
109 # Available in Python 3.2 and above.
108 # Available in Python 3.2 and above.
110 from tokenize import open
109 from tokenize import open
111 except ImportError:
110 except ImportError:
112 # Copied from Python 3.2 tokenize
111 # Copied from Python 3.2 tokenize
113 def open(filename):
112 def open(filename):
114 """Open a file in read only mode using the encoding detected by
113 """Open a file in read only mode using the encoding detected by
115 detect_encoding().
114 detect_encoding().
116 """
115 """
117 buffer = io.open(filename, 'rb') # Tweaked to use io.open for Python 2
116 buffer = io.open(filename, 'rb') # Tweaked to use io.open for Python 2
118 encoding, lines = detect_encoding(buffer.readline)
117 encoding, lines = detect_encoding(buffer.readline)
119 buffer.seek(0)
118 buffer.seek(0)
120 text = TextIOWrapper(buffer, encoding, line_buffering=True)
119 text = TextIOWrapper(buffer, encoding, line_buffering=True)
121 text.mode = 'r'
120 text.mode = 'r'
122 return text
121 return text
123
122
124 def source_to_unicode(txt):
123 def source_to_unicode(txt, errors='replace', skip_encoding_cookie=True):
125 """Converts string with python source code to unicode
124 """Converts a bytes string with python source code to unicode.
125
126 Unicode strings are passed through unchanged. Byte strings are checked
127 for the python source file encoding cookie to determine encoding.
128 txt can be either a bytes buffer or a string containing the source
129 code.
126 """
130 """
127 if isinstance(txt, unicode):
131 if isinstance(txt, unicode):
128 return txt
132 return txt
133 if isinstance(txt, str):
134 buffer = BytesIO(txt)
135 else:
136 buffer = txt
129 try:
137 try:
130 coding, _ = detect_encoding(StringIO(txt).readline)
138 encoding, _ = detect_encoding(buffer.readline)
131 except SyntaxError:
139 except SyntaxError:
132 coding = "ascii"
140 encoding = "ascii"
133 return txt.decode(coding, errors="replace")
141 buffer.seek(0)
142 text = TextIOWrapper(buffer, encoding, errors=errors, line_buffering=True)
143 text.mode = 'r'
144 if skip_encoding_cookie:
145 return u"".join(strip_encoding_cookie(text))
146 else:
147 return text.read()
134
148
135 def strip_encoding_cookie(filelike):
149 def strip_encoding_cookie(filelike):
136 """Generator to pull lines from a text-mode file, skipping the encoding
150 """Generator to pull lines from a text-mode file, skipping the encoding
137 cookie if it is found in the first two lines.
151 cookie if it is found in the first two lines.
138 """
152 """
139 it = iter(filelike)
153 it = iter(filelike)
140 try:
154 try:
141 first = next(it)
155 first = next(it)
142 if not cookie_comment_re.match(first):
156 if not cookie_comment_re.match(first):
143 yield first
157 yield first
144 second = next(it)
158 second = next(it)
145 if not cookie_comment_re.match(second):
159 if not cookie_comment_re.match(second):
146 yield second
160 yield second
147 except StopIteration:
161 except StopIteration:
148 return
162 return
149
163
150 for line in it:
164 for line in it:
151 yield line
165 yield line
152
166
153 def read_py_file(filename, skip_encoding_cookie=True):
167 def read_py_file(filename, skip_encoding_cookie=True):
154 """Read a Python file, using the encoding declared inside the file.
168 """Read a Python file, using the encoding declared inside the file.
155
169
156 Parameters
170 Parameters
157 ----------
171 ----------
158 filename : str
172 filename : str
159 The path to the file to read.
173 The path to the file to read.
160 skip_encoding_cookie : bool
174 skip_encoding_cookie : bool
161 If True (the default), and the encoding declaration is found in the first
175 If True (the default), and the encoding declaration is found in the first
162 two lines, that line will be excluded from the output - compiling a
176 two lines, that line will be excluded from the output - compiling a
163 unicode string with an encoding declaration is a SyntaxError in Python 2.
177 unicode string with an encoding declaration is a SyntaxError in Python 2.
164
178
165 Returns
179 Returns
166 -------
180 -------
167 A unicode string containing the contents of the file.
181 A unicode string containing the contents of the file.
168 """
182 """
169 with open(filename) as f: # the open function defined in this module.
183 with open(filename) as f: # the open function defined in this module.
170 if skip_encoding_cookie:
184 if skip_encoding_cookie:
171 return "".join(strip_encoding_cookie(f))
185 return "".join(strip_encoding_cookie(f))
172 else:
186 else:
173 return f.read()
187 return f.read()
174
188
175 def read_py_url(url, errors='replace', skip_encoding_cookie=True):
189 def read_py_url(url, errors='replace', skip_encoding_cookie=True):
176 """Read a Python file from a URL, using the encoding declared inside the file.
190 """Read a Python file from a URL, using the encoding declared inside the file.
177
191
178 Parameters
192 Parameters
179 ----------
193 ----------
180 url : str
194 url : str
181 The URL from which to fetch the file.
195 The URL from which to fetch the file.
182 errors : str
196 errors : str
183 How to handle decoding errors in the file. Options are the same as for
197 How to handle decoding errors in the file. Options are the same as for
184 bytes.decode(), but here 'replace' is the default.
198 bytes.decode(), but here 'replace' is the default.
185 skip_encoding_cookie : bool
199 skip_encoding_cookie : bool
186 If True (the default), and the encoding declaration is found in the first
200 If True (the default), and the encoding declaration is found in the first
187 two lines, that line will be excluded from the output - compiling a
201 two lines, that line will be excluded from the output - compiling a
188 unicode string with an encoding declaration is a SyntaxError in Python 2.
202 unicode string with an encoding declaration is a SyntaxError in Python 2.
189
203
190 Returns
204 Returns
191 -------
205 -------
192 A unicode string containing the contents of the file.
206 A unicode string containing the contents of the file.
193 """
207 """
194 response = urllib.urlopen(url)
208 response = urllib.urlopen(url)
195 buffer = io.BytesIO(response.read())
209 buffer = io.BytesIO(response.read())
196 encoding, lines = detect_encoding(buffer.readline)
210 return source_to_unicode(buffer, errors, skip_encoding_cookie)
197 buffer.seek(0)
198 text = TextIOWrapper(buffer, encoding, errors=errors, line_buffering=True)
199 text.mode = 'r'
200 if skip_encoding_cookie:
201 return "".join(strip_encoding_cookie(text))
202 else:
203 return text.read()
204
General Comments 0
You need to be logged in to leave comments. Login now