make source_to_unicode use BytesIO and refactor
Jörgen Stenarson
@@ -1,204 +1,210 @@
1 1 """
2 2 Tools to open .py files as Unicode, using the encoding specified within the file,
3 3 as per PEP 263.
4 4
5 5 Much of the code is taken from the tokenize module in Python 3.2.
6 6 """
7 7 from __future__ import absolute_import
8 8
9 9 import io
10 from io import TextIOWrapper
10 from io import TextIOWrapper, BytesIO
11 11 import re
12 from StringIO import StringIO
13 12 import urllib
14 13
15 14 cookie_re = re.compile(ur"coding[:=]\s*([-\w.]+)", re.UNICODE)
16 15 cookie_comment_re = re.compile(ur"^\s*#.*coding[:=]\s*([-\w.]+)", re.UNICODE)
17 16
18 17 try:
19 18 # Available in Python 3
20 19 from tokenize import detect_encoding
21 20 except ImportError:
22 21 from codecs import lookup, BOM_UTF8
23 22
24 23 # Copied from Python 3.2 tokenize
25 24 def _get_normal_name(orig_enc):
26 25 """Imitates get_normal_name in tokenizer.c."""
27 26 # Only care about the first 12 characters.
28 27 enc = orig_enc[:12].lower().replace("_", "-")
29 28 if enc == "utf-8" or enc.startswith("utf-8-"):
30 29 return "utf-8"
31 30 if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
32 31 enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
33 32 return "iso-8859-1"
34 33 return orig_enc
35 34
36 35 # Copied from Python 3.2 tokenize
37 36 def detect_encoding(readline):
38 37 """
39 38 The detect_encoding() function is used to detect the encoding that should
 40 39 be used to decode a Python source file. It requires one argument, readline,
41 40 in the same way as the tokenize() generator.
42 41
43 42 It will call readline a maximum of twice, and return the encoding used
44 43 (as a string) and a list of any lines (left as bytes) it has read in.
45 44
46 45 It detects the encoding from the presence of a utf-8 bom or an encoding
47 46 cookie as specified in pep-0263. If both a bom and a cookie are present,
48 47 but disagree, a SyntaxError will be raised. If the encoding cookie is an
49 48 invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,
50 49 'utf-8-sig' is returned.
51 50
52 51 If no encoding is specified, then the default of 'utf-8' will be returned.
53 52 """
54 53 bom_found = False
55 54 encoding = None
56 55 default = 'utf-8'
57 56 def read_or_stop():
58 57 try:
59 58 return readline()
60 59 except StopIteration:
61 60 return b''
62 61
63 62 def find_cookie(line):
64 63 try:
65 64 line_string = line.decode('ascii')
66 65 except UnicodeDecodeError:
67 66 return None
68 67
69 68 matches = cookie_re.findall(line_string)
70 69 if not matches:
71 70 return None
72 71 encoding = _get_normal_name(matches[0])
73 72 try:
74 73 codec = lookup(encoding)
75 74 except LookupError:
76 75 # This behaviour mimics the Python interpreter
77 76 raise SyntaxError("unknown encoding: " + encoding)
78 77
79 78 if bom_found:
80 79 if codec.name != 'utf-8':
81 80 # This behaviour mimics the Python interpreter
82 81 raise SyntaxError('encoding problem: utf-8')
83 82 encoding += '-sig'
84 83 return encoding
85 84
86 85 first = read_or_stop()
87 86 if first.startswith(BOM_UTF8):
88 87 bom_found = True
89 88 first = first[3:]
90 89 default = 'utf-8-sig'
91 90 if not first:
92 91 return default, []
93 92
94 93 encoding = find_cookie(first)
95 94 if encoding:
96 95 return encoding, [first]
97 96
98 97 second = read_or_stop()
99 98 if not second:
100 99 return default, [first]
101 100
102 101 encoding = find_cookie(second)
103 102 if encoding:
104 103 return encoding, [first, second]
105 104
106 105 return default, [first, second]
107 106
108 107 try:
109 108 # Available in Python 3.2 and above.
110 109 from tokenize import open
111 110 except ImportError:
112 111 # Copied from Python 3.2 tokenize
113 112 def open(filename):
114 113 """Open a file in read only mode using the encoding detected by
115 114 detect_encoding().
116 115 """
117 116 buffer = io.open(filename, 'rb') # Tweaked to use io.open for Python 2
118 117 encoding, lines = detect_encoding(buffer.readline)
119 118 buffer.seek(0)
120 119 text = TextIOWrapper(buffer, encoding, line_buffering=True)
121 120 text.mode = 'r'
122 121 return text
123 122
124 def source_to_unicode(txt):
125 """Converts string with python source code to unicode
123 def source_to_unicode(txt, errors='replace', skip_encoding_cookie=True):
124 """Converts a bytes string with python source code to unicode.
125
126 Unicode strings are passed through unchanged. Byte strings are checked
127 for the python source file encoding cookie to determine encoding.
128 txt can be either a bytes buffer or a string containing the source
129 code.
126 130 """
127 131 if isinstance(txt, unicode):
128 132 return txt
133 if isinstance(txt, str):
134 buffer = BytesIO(txt)
135 else:
136 buffer = txt
129 137 try:
130 coding, _ = detect_encoding(StringIO(txt).readline)
138 encoding, _ = detect_encoding(buffer.readline)
131 139 except SyntaxError:
132 coding = "ascii"
133 return txt.decode(coding, errors="replace")
140 encoding = "ascii"
141 buffer.seek(0)
142 text = TextIOWrapper(buffer, encoding, errors=errors, line_buffering=True)
143 text.mode = 'r'
144 if skip_encoding_cookie:
145 return u"".join(strip_encoding_cookie(text))
146 else:
147 return text.read()
134 148
135 149 def strip_encoding_cookie(filelike):
136 150 """Generator to pull lines from a text-mode file, skipping the encoding
137 151 cookie if it is found in the first two lines.
138 152 """
139 153 it = iter(filelike)
140 154 try:
141 155 first = next(it)
142 156 if not cookie_comment_re.match(first):
143 157 yield first
144 158 second = next(it)
145 159 if not cookie_comment_re.match(second):
146 160 yield second
147 161 except StopIteration:
148 162 return
149 163
150 164 for line in it:
151 165 yield line
152 166
153 167 def read_py_file(filename, skip_encoding_cookie=True):
154 168 """Read a Python file, using the encoding declared inside the file.
155 169
156 170 Parameters
157 171 ----------
158 172 filename : str
159 173 The path to the file to read.
160 174 skip_encoding_cookie : bool
161 175 If True (the default), and the encoding declaration is found in the first
162 176 two lines, that line will be excluded from the output - compiling a
163 177 unicode string with an encoding declaration is a SyntaxError in Python 2.
164 178
165 179 Returns
166 180 -------
167 181 A unicode string containing the contents of the file.
168 182 """
169 183 with open(filename) as f: # the open function defined in this module.
170 184 if skip_encoding_cookie:
171 185 return "".join(strip_encoding_cookie(f))
172 186 else:
173 187 return f.read()
174 188
175 189 def read_py_url(url, errors='replace', skip_encoding_cookie=True):
176 190 """Read a Python file from a URL, using the encoding declared inside the file.
177 191
178 192 Parameters
179 193 ----------
180 194 url : str
181 195 The URL from which to fetch the file.
182 196 errors : str
183 197 How to handle decoding errors in the file. Options are the same as for
184 198 bytes.decode(), but here 'replace' is the default.
185 199 skip_encoding_cookie : bool
186 200 If True (the default), and the encoding declaration is found in the first
187 201 two lines, that line will be excluded from the output - compiling a
188 202 unicode string with an encoding declaration is a SyntaxError in Python 2.
189 203
190 204 Returns
191 205 -------
192 206 A unicode string containing the contents of the file.
193 207 """
194 208 response = urllib.urlopen(url)
195 209 buffer = io.BytesIO(response.read())
196 encoding, lines = detect_encoding(buffer.readline)
197 buffer.seek(0)
198 text = TextIOWrapper(buffer, encoding, errors=errors, line_buffering=True)
199 text.mode = 'r'
200 if skip_encoding_cookie:
201 return "".join(strip_encoding_cookie(text))
202 else:
203 return text.read()
204
210 return source_to_unicode(buffer, errors, skip_encoding_cookie)
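After this change, source_to_unicode accepts either a plain byte string of source code or a readable bytes buffer: a byte string is wrapped in BytesIO, the PEP 263 cookie is detected, and the decoded unicode is returned (with the cookie line stripped by default). A minimal usage sketch in Python 2, matching the code above; the import path openpy is an assumption, not stated in the diff:

    from io import BytesIO
    from openpy import source_to_unicode  # assumed import path for the module shown above

    src = b"# -*- coding: latin-1 -*-\nname = 'caf\xe9'\n"

    # Raw byte string: wrapped in BytesIO internally, the latin-1 cookie is
    # detected, and (by default) the cookie line is stripped from the result.
    text = source_to_unicode(src)

    # An existing bytes buffer is used as-is, no extra wrapping.
    text = source_to_unicode(BytesIO(src), errors='replace')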