##// END OF EJS Templates
check for bytes instead of str for python3 compatibility
Jörgen Stenarson -
Show More
@@ -1,210 +1,210 b''
1 """
1 """
2 Tools to open .py files as Unicode, using the encoding specified within the file,
2 Tools to open .py files as Unicode, using the encoding specified within the file,
3 as per PEP 263.
3 as per PEP 263.
4
4
5 Much of the code is taken from the tokenize module in Python 3.2.
5 Much of the code is taken from the tokenize module in Python 3.2.
6 """
6 """
7 from __future__ import absolute_import
7 from __future__ import absolute_import
8
8
9 import io
9 import io
10 from io import TextIOWrapper, BytesIO
10 from io import TextIOWrapper, BytesIO
11 import re
11 import re
12 import urllib
12 import urllib
13
13
# Patterns for the PEP 263 encoding declaration.  Plain raw-string literals are
# used because the ``ur""`` prefix is a syntax error on Python 3, while a raw
# ``str`` pattern still matches unicode text on Python 2.
cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)", re.UNICODE)
# Same declaration, but only matched when the line is a comment line
# (optional leading whitespace followed by ``#``).
cookie_comment_re = re.compile(r"^\s*#.*coding[:=]\s*([-\w.]+)", re.UNICODE)
16
16
try:
    # Available in Python 3
    from tokenize import detect_encoding
except ImportError:
    # Fallback for interpreters whose tokenize module lacks detect_encoding.
    from codecs import lookup, BOM_UTF8

    # Copied from Python 3.2 tokenize
    def _get_normal_name(orig_enc):
        """Imitates get_normal_name in tokenizer.c.

        Returns "utf-8" or "iso-8859-1" for the recognised spellings of those
        encodings, and ``orig_enc`` unchanged for anything else.
        """
        # Only care about the first 12 characters.
        enc = orig_enc[:12].lower().replace("_", "-")
        if enc == "utf-8" or enc.startswith("utf-8-"):
            return "utf-8"
        if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
           enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
            return "iso-8859-1"
        return orig_enc

    # Copied from Python 3.2 tokenize
    def detect_encoding(readline):
        """
        The detect_encoding() function is used to detect the encoding that should
        be used to decode a Python source file.  It requires one argument, readline,
        in the same way as the tokenize() generator.

        It will call readline a maximum of twice, and return the encoding used
        (as a string) and a list of any lines (left as bytes) it has read in.

        It detects the encoding from the presence of a utf-8 bom or an encoding
        cookie as specified in pep-0263.  If both a bom and a cookie are present,
        but disagree, a SyntaxError will be raised.  If the encoding cookie is an
        invalid charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
        'utf-8-sig' is returned.

        If no encoding is specified, then the default of 'utf-8' will be returned.
        """
        bom_found = False
        encoding = None
        default = 'utf-8'

        def read_or_stop():
            # Treat an exhausted readline as an empty line.
            try:
                return readline()
            except StopIteration:
                return b''

        def find_cookie(line):
            # A line that is not pure ASCII cannot carry a cookie.
            try:
                line_string = line.decode('ascii')
            except UnicodeDecodeError:
                return None

            matches = cookie_re.findall(line_string)
            if not matches:
                return None
            encoding = _get_normal_name(matches[0])
            try:
                codec = lookup(encoding)
            except LookupError:
                # This behaviour mimics the Python interpreter
                raise SyntaxError("unknown encoding: " + encoding)

            if bom_found:
                if codec.name != 'utf-8':
                    # This behaviour mimics the Python interpreter
                    raise SyntaxError('encoding problem: utf-8')
                encoding += '-sig'
            return encoding

        first = read_or_stop()
        if first.startswith(BOM_UTF8):
            bom_found = True
            # Drop the three BOM bytes before looking for a cookie.
            first = first[3:]
            default = 'utf-8-sig'
        if not first:
            return default, []

        encoding = find_cookie(first)
        if encoding:
            return encoding, [first]

        second = read_or_stop()
        if not second:
            return default, [first]

        encoding = find_cookie(second)
        if encoding:
            return encoding, [first, second]

        return default, [first, second]
106
106
try:
    # Available in Python 3.2 and above.
    from tokenize import open
except ImportError:
    # Copied from Python 3.2 tokenize
    def open(filename):
        """Open a file in read only mode using the encoding detected by
        detect_encoding().

        Returns a TextIOWrapper around the file.  If detecting the encoding or
        building the wrapper fails, the underlying binary file is closed
        before the exception propagates.
        """
        buffer = io.open(filename, 'rb')   # Tweaked to use io.open for Python 2
        try:
            encoding, _ = detect_encoding(buffer.readline)
            # Rewind so the wrapper decodes the file from the beginning.
            buffer.seek(0)
            text = TextIOWrapper(buffer, encoding, line_buffering=True)
            text.mode = 'r'
            return text
        except BaseException:
            # Do not leak the file handle, e.g. on a bad encoding cookie.
            buffer.close()
            raise
122
122
def source_to_unicode(txt, errors='replace', skip_encoding_cookie=True):
    """Converts a bytes string with python source code to unicode.

    Unicode strings are passed through unchanged. Byte strings are checked
    for the python source file encoding cookie to determine encoding.
    txt can be either a bytes buffer or a string containing the source
    code.

    Parameters
    ----------
    txt : unicode, bytes or binary file-like object
        The source code. A file-like object must provide readline() and seek().
    errors : str
        Error handler passed to the text decoder; 'replace' is the default.
    skip_encoding_cookie : bool
        If True (the default), the lines are passed through
        strip_encoding_cookie() before being joined.

    Returns
    -------
    A unicode string containing the decoded source.
    """
    # type(u"") is `unicode` on Python 2 and `str` on Python 3, so the
    # pass-through check works on both without referring to the `unicode`
    # builtin, which does not exist on Python 3.
    if isinstance(txt, type(u"")):
        return txt
    if isinstance(txt, bytes):
        buffer = BytesIO(txt)
    else:
        buffer = txt
    try:
        encoding, _ = detect_encoding(buffer.readline)
    except SyntaxError:
        # Invalid or conflicting declaration: fall back to ASCII and let
        # `errors` decide what happens to undecodable bytes.
        encoding = "ascii"
    # detect_encoding consumed up to two lines; decode from the start.
    buffer.seek(0)
    text = TextIOWrapper(buffer, encoding, errors=errors, line_buffering=True)
    text.mode = 'r'
    if skip_encoding_cookie:
        return u"".join(strip_encoding_cookie(text))
    return text.read()
148
148
def strip_encoding_cookie(filelike):
    """Yield the lines of a text-mode file, leaving out any of the first two
    lines that is an encoding-cookie comment.

    Lines after the second are always yielded unchanged.  If the input ends
    within the first two lines, the generator simply finishes.
    """
    lines = iter(filelike)
    # Only the first two lines may carry the encoding declaration.
    for _ in range(2):
        try:
            candidate = next(lines)
        except StopIteration:
            return
        if cookie_comment_re.match(candidate) is None:
            yield candidate

    for remaining in lines:
        yield remaining
166
166
def read_py_file(filename, skip_encoding_cookie=True):
    """Read a Python file, using the encoding declared inside the file.

    Parameters
    ----------
    filename : str
        The path to the file to read.
    skip_encoding_cookie : bool
        If True (the default), and the encoding declaration is found in the first
        two lines, that line will be excluded from the output - compiling a
        unicode string with an encoding declaration is a SyntaxError in Python 2.

    Returns
    -------
    A unicode string containing the contents of the file.
    """
    with open(filename) as f:   # the open function defined in this module.
        if skip_encoding_cookie:
            # Join with a unicode separator so the result is unicode even when
            # no lines are left to join (consistent with source_to_unicode).
            return u"".join(strip_encoding_cookie(f))
        return f.read()
188
188
def read_py_url(url, errors='replace', skip_encoding_cookie=True):
    """Read a Python file from a URL, using the encoding declared inside the file.

    Parameters
    ----------
    url : str
        The URL from which to fetch the file.
    errors : str
        How to handle decoding errors in the file. Options are the same as for
        bytes.decode(), but here 'replace' is the default.
    skip_encoding_cookie : bool
        If True (the default), and the encoding declaration is found in the first
        two lines, that line will be excluded from the output - compiling a
        unicode string with an encoding declaration is a SyntaxError in Python 2.

    Returns
    -------
    A unicode string containing the contents of the file.
    """
    # urlopen lives in urllib.request on Python 3 and in urllib on Python 2.
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen

    response = urlopen(url)
    try:
        # Read everything up front: source_to_unicode needs a seekable buffer.
        buffer = io.BytesIO(response.read())
    finally:
        # Release the connection even if reading fails.
        response.close()
    return source_to_unicode(buffer, errors, skip_encoding_cookie)
General Comments 0
You need to be logged in to leave comments. Login now