##// END OF EJS Templates
check for bytes instead of str for python3 compatibility
Jörgen Stenarson -
Show More
@@ -1,210 +1,210 b''
1 1 """
2 2 Tools to open .py files as Unicode, using the encoding specified within the file,
3 3 as per PEP 263.
4 4
5 5 Much of the code is taken from the tokenize module in Python 3.2.
6 6 """
7 7 from __future__ import absolute_import
8 8
9 9 import io
10 10 from io import TextIOWrapper, BytesIO
11 11 import re
12 12 import urllib
13 13
# Patterns for the PEP 263 source-encoding declaration ("coding cookie").
# Plain raw strings are used instead of the ``ur"..."`` prefix, which is a
# SyntaxError on Python 3; re.UNICODE keeps ``\w``/``\s`` Unicode-aware on
# Python 2 as well.
#
# cookie_re finds ``coding:<name>`` / ``coding=<name>`` anywhere in a line and
# captures the encoding name.
cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)", re.UNICODE)
# cookie_comment_re only matches (from the start of the line) when the
# declaration sits inside a comment that is preceded by nothing but whitespace.
cookie_comment_re = re.compile(r"^\s*#.*coding[:=]\s*([-\w.]+)", re.UNICODE)
16 16
try:
    # Available in Python 3
    from tokenize import detect_encoding
except ImportError:
    from codecs import lookup, BOM_UTF8

    # Copied from Python 3.2 tokenize
    def _get_normal_name(orig_enc):
        """Imitates get_normal_name in tokenizer.c.

        Returns "utf-8" or "iso-8859-1" when *orig_enc* is one of the known
        spellings of those encodings (case-insensitive, with "_" treated as
        "-"); any other name is returned unchanged.
        """
        # Only care about the first 12 characters.
        enc = orig_enc[:12].lower().replace("_", "-")
        if enc == "utf-8" or enc.startswith("utf-8-"):
            return "utf-8"
        if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
                enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
            return "iso-8859-1"
        return orig_enc

    # Copied from Python 3.2 tokenize
    def detect_encoding(readline):
        """
        The detect_encoding() function is used to detect the encoding that should
        be used to decode a Python source file. It requires one argument, readline,
        in the same way as the tokenize() generator.

        It will call readline a maximum of twice, and return the encoding used
        (as a string) and a list of any lines (left as bytes) it has read in.

        It detects the encoding from the presence of a utf-8 bom or an encoding
        cookie as specified in pep-0263. If both a bom and a cookie are present,
        but disagree, a SyntaxError will be raised. If the encoding cookie is an
        invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,
        'utf-8-sig' is returned.

        The cookie is only honoured when it appears in a comment line, as
        pep-0263 requires; "coding:" text elsewhere (e.g. inside a string
        literal) is ignored.

        If no encoding is specified, then the default of 'utf-8' will be returned.
        """
        bom_found = False
        encoding = None
        default = 'utf-8'

        def read_or_stop():
            # readline may signal end of input by raising StopIteration
            # (e.g. a generator's next method); treat that as an empty read.
            try:
                return readline()
            except StopIteration:
                return b''

        def find_cookie(line):
            # A line that is not pure ASCII cannot carry a cookie we accept.
            try:
                line_string = line.decode('ascii')
            except UnicodeDecodeError:
                return None

            # The declaration must live in a comment; without this gate a
            # line such as  x = "coding: foo"  was mistaken for a cookie and
            # raised SyntaxError("unknown encoding: foo").
            if not cookie_comment_re.match(line_string):
                return None
            matches = cookie_re.findall(line_string)
            if not matches:
                return None
            encoding = _get_normal_name(matches[0])
            try:
                codec = lookup(encoding)
            except LookupError:
                # This behaviour mimics the Python interpreter
                raise SyntaxError("unknown encoding: " + encoding)

            if bom_found:
                if codec.name != 'utf-8':
                    # This behaviour mimics the Python interpreter
                    raise SyntaxError('encoding problem: utf-8')
                encoding += '-sig'
            return encoding

        first = read_or_stop()
        if first.startswith(BOM_UTF8):
            bom_found = True
            first = first[3:]  # drop the 3-byte BOM before looking for a cookie
            default = 'utf-8-sig'
        if not first:
            return default, []

        encoding = find_cookie(first)
        if encoding:
            return encoding, [first]

        second = read_or_stop()
        if not second:
            return default, [first]

        encoding = find_cookie(second)
        if encoding:
            return encoding, [first, second]

        return default, [first, second]
106 106
try:
    # Available in Python 3.2 and above.
    from tokenize import open
except ImportError:
    # Copied from Python 3.2 tokenize
    def open(filename):
        """Open a file in read only mode using the encoding detected by
        detect_encoding().

        Returns a TextIOWrapper positioned at the start of the file. If the
        encoding cannot be determined (detect_encoding() raises), the
        underlying binary file is closed before the exception propagates.
        """
        buffer = io.open(filename, 'rb')  # Tweaked to use io.open for Python 2
        try:
            encoding, _ = detect_encoding(buffer.readline)
            # detect_encoding consumed up to two lines; rewind so the text
            # wrapper starts from the beginning of the file.
            buffer.seek(0)
            text = TextIOWrapper(buffer, encoding, line_buffering=True)
            text.mode = 'r'
            return text
        except BaseException:
            # Don't leak the file handle when detection/wrapping fails.
            buffer.close()
            raise
122 122
def source_to_unicode(txt, errors='replace', skip_encoding_cookie=True):
    """Converts a bytes string with python source code to unicode.

    Unicode strings are passed through unchanged. Byte strings are checked
    for the python source file encoding cookie to determine encoding.
    txt can be either a bytes buffer or a string containing the source
    code.

    Parameters
    ----------
    txt : unicode, bytes or binary file-like object
      The source code. A file-like object must provide readline() and seek().
    errors : str
      Error handling scheme passed to the decoder; 'replace' by default.
    skip_encoding_cookie : bool
      If True (the default), an encoding declaration found in the first two
      lines is left out of the result.

    Returns
    -------
    A unicode string containing the decoded source.
    """
    # type(u"") is `unicode` on Python 2 and `str` on Python 3; the bare name
    # `unicode` does not exist on Python 3 and would raise NameError.
    if isinstance(txt, type(u"")):
        return txt
    if isinstance(txt, bytes):
        stream = BytesIO(txt)
    else:
        stream = txt
    try:
        encoding, _ = detect_encoding(stream.readline)
    except SyntaxError:
        # Bad or conflicting cookie: fall back to ASCII and let `errors`
        # decide what happens to undecodable bytes.
        encoding = "ascii"
    # detect_encoding read ahead; rewind so decoding starts at the beginning.
    stream.seek(0)
    text = TextIOWrapper(stream, encoding, errors=errors, line_buffering=True)
    text.mode = 'r'
    if skip_encoding_cookie:
        return u"".join(strip_encoding_cookie(text))
    else:
        return text.read()
148 148
def strip_encoding_cookie(filelike):
    """Yield the lines of a text-mode file, leaving out any of the first two
    lines that carries an encoding cookie.

    Lines after the second are always passed through untouched.
    """
    lines = iter(filelike)
    # Only the first two lines may hold the declaration, so inspect just those.
    for _ in range(2):
        try:
            candidate = next(lines)
        except StopIteration:
            # Fewer than two lines in the input: nothing more to yield.
            return
        if cookie_comment_re.match(candidate) is None:
            yield candidate

    for remaining in lines:
        yield remaining
166 166
def read_py_file(filename, skip_encoding_cookie=True):
    """Read a Python source file, decoding it with the encoding it declares.

    Parameters
    ----------
    filename : str
      The path to the file to read.
    skip_encoding_cookie : bool
      If True (the default), and the encoding declaration is found in the first
      two lines, that line will be excluded from the output - compiling a
      unicode string with an encoding declaration is a SyntaxError in Python 2.

    Returns
    -------
    A unicode string containing the contents of the file.
    """
    # `open` here is the encoding-aware open defined in this module.
    with open(filename) as source:
        if not skip_encoding_cookie:
            return source.read()
        kept_lines = strip_encoding_cookie(source)
        return "".join(kept_lines)
188 188
def read_py_url(url, errors='replace', skip_encoding_cookie=True):
    """Read a Python file from a URL, using the encoding declared inside the file.

    Parameters
    ----------
    url : str
      The URL from which to fetch the file.
    errors : str
      How to handle decoding errors in the file. Options are the same as for
      bytes.decode(), but here 'replace' is the default.
    skip_encoding_cookie : bool
      If True (the default), and the encoding declaration is found in the first
      two lines, that line will be excluded from the output - compiling a
      unicode string with an encoding declaration is a SyntaxError in Python 2.

    Returns
    -------
    A unicode string containing the contents of the file.
    """
    # Deferred imports: urlopen lives in urllib.request on Python 3 and
    # directly in urllib on Python 2.
    from contextlib import closing
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2

    # closing() releases the connection even if read() fails; the Python 2
    # response object is not a context manager by itself.
    with closing(urlopen(url)) as response:
        payload = io.BytesIO(response.read())
    return source_to_unicode(payload, errors, skip_encoding_cookie)
General Comments 0
You need to be logged in to leave comments. Login now