Add docstrings for read_py_file and read_py_url.
Thomas Kluyver
@@ -1,162 +1,192 @@
1 1 """
2 2 Tools to open .py files as Unicode, using the encoding specified within the file,
3 3 as per PEP 263.
4 4
5 5 Much of the code is taken from the tokenize module in Python 3.2.
6 6 """
7 7 from __future__ import absolute_import
8 8
9 9 import __builtin__
10 10 import io
11 11 from io import TextIOWrapper
12 12 import re
13 13 import urllib
14 14
15 15 cookie_re = re.compile(ur"coding[:=]\s*([-\w.]+)", re.UNICODE)
16 16 cookie_comment_re = re.compile(ur"^\s*#.*coding[:=]\s*([-\w.]+)", re.UNICODE)
17 17
18 18 try:
19 19 # Available in Python 3
20 20 from tokenize import detect_encoding
21 21 except ImportError:
22 22 from codecs import lookup, BOM_UTF8
23 23
24 24 # Copied from Python 3.2 tokenize
25 25 def _get_normal_name(orig_enc):
26 26 """Imitates get_normal_name in tokenizer.c."""
27 27 # Only care about the first 12 characters.
28 28 enc = orig_enc[:12].lower().replace("_", "-")
29 29 if enc == "utf-8" or enc.startswith("utf-8-"):
30 30 return "utf-8"
31 31 if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
32 32 enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
33 33 return "iso-8859-1"
34 34 return orig_enc
35 35
36 36 # Copied from Python 3.2 tokenize
37 37 def detect_encoding(readline):
38 38 """
39 39 The detect_encoding() function is used to detect the encoding that should
40 40 be used to decode a Python source file. It requires one argument, readline,
41 41 in the same way as the tokenize() generator.
42 42
43 43 It will call readline a maximum of twice, and return the encoding used
44 44 (as a string) and a list of any lines (left as bytes) it has read in.
45 45
46 46 It detects the encoding from the presence of a utf-8 bom or an encoding
47 47 cookie as specified in pep-0263. If both a bom and a cookie are present,
48 48 but disagree, a SyntaxError will be raised. If the encoding cookie is an
49 49 invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,
50 50 'utf-8-sig' is returned.
51 51
52 52 If no encoding is specified, then the default of 'utf-8' will be returned.
53 53 """
54 54 bom_found = False
55 55 encoding = None
56 56 default = 'utf-8'
57 57 def read_or_stop():
58 58 try:
59 59 return readline()
60 60 except StopIteration:
61 61 return b''
62 62
63 63 def find_cookie(line):
64 64 try:
65 65 line_string = line.decode('ascii')
66 66 except UnicodeDecodeError:
67 67 return None
68 68
69 69 matches = cookie_re.findall(line_string)
70 70 if not matches:
71 71 return None
72 72 encoding = _get_normal_name(matches[0])
73 73 try:
74 74 codec = lookup(encoding)
75 75 except LookupError:
76 76 # This behaviour mimics the Python interpreter
77 77 raise SyntaxError("unknown encoding: " + encoding)
78 78
79 79 if bom_found:
80 80 if codec.name != 'utf-8':
81 81 # This behaviour mimics the Python interpreter
82 82 raise SyntaxError('encoding problem: utf-8')
83 83 encoding += '-sig'
84 84 return encoding
85 85
86 86 first = read_or_stop()
87 87 if first.startswith(BOM_UTF8):
88 88 bom_found = True
89 89 first = first[3:]
90 90 default = 'utf-8-sig'
91 91 if not first:
92 92 return default, []
93 93
94 94 encoding = find_cookie(first)
95 95 if encoding:
96 96 return encoding, [first]
97 97
98 98 second = read_or_stop()
99 99 if not second:
100 100 return default, [first]
101 101
102 102 encoding = find_cookie(second)
103 103 if encoding:
104 104 return encoding, [first, second]
105 105
106 106 return default, [first, second]
107 107
108 108 try:
109 109 # Available in Python 3.2 and above.
110 110 from tokenize import open
111 111 except ImportError:
112 112 # Copied from Python 3.2 tokenize
113 113 def open(filename):
114 114 """Open a file in read only mode using the encoding detected by
115 115 detect_encoding().
116 116 """
117 117 buffer = io.open(filename, 'rb') # Tweaked to use io.open for Python 2
118 118 encoding, lines = detect_encoding(buffer.readline)
119 119 buffer.seek(0)
120 120 text = TextIOWrapper(buffer, encoding, line_buffering=True)
121 121 text.mode = 'r'
122 122 return text
123 123
124 124 def strip_encoding_cookie(filelike):
125 125 """Generator to pull lines from a text-mode file, skipping the encoding
126 126 cookie if it is found in the first two lines.
127 127 """
128 128 it = iter(filelike)
129 129 try:
130 130 first = next(it)
131 131 if not cookie_comment_re.match(first):
132 132 yield first
133 133 second = next(it)
134 134 if not cookie_comment_re.match(second):
135 135 yield second
136 136 except StopIteration:
137 137 return
138 138
139 139 for line in it:
140 140 yield line
141 141
142 def read_py_file(filename, errors='replace', skip_encoding_cookie=True):
142 def read_py_file(filename, skip_encoding_cookie=True):
143 """Read a Python file, using the encoding declared inside the file.
144
145 Parameters
146 ----------
147 filename : str
148 The path to the file to read.
149 skip_encoding_cookie : bool
150 If True (the default), and the encoding declaration is found in the first
151 two lines, that line will be excluded from the output - compiling a
152 unicode string with an encoding declaration is a SyntaxError in Python 2.
153
154 Returns
155 -------
156 A unicode string containing the contents of the file.
157 """
143 158 with open(filename) as f: # the open function defined in this module.
144 159 if skip_encoding_cookie:
145 160 return "".join(strip_encoding_cookie(f))
146 161 else:
147 162 return f.read()
148 163
149 164 def read_py_url(url, errors='replace', skip_encoding_cookie=True):
150 """Open a URL to a raw Python file, using the encoding detected by
151 detect_encoding().
165 """Read a Python file from a URL, using the encoding declared inside the file.
166
167 Parameters
168 ----------
169 url : str
170 The URL from which to fetch the file.
171 errors : str
172 How to handle decoding errors in the file. Options are the same as for
173 bytes.decode(), but here 'replace' is the default.
174 skip_encoding_cookie : bool
175 If True (the default), and the encoding declaration is found in the first
176 two lines, that line will be excluded from the output - compiling a
177 unicode string with an encoding declaration is a SyntaxError in Python 2.
178
179 Returns
180 -------
181 A unicode string containing the contents of the file.
152 182 """
153 183 response = urllib.urlopen(url)
154 184 buffer = io.BytesIO(response.read())
155 185 encoding, lines = detect_encoding(buffer.readline)
156 186 buffer.seek(0)
157 187 text = TextIOWrapper(buffer, encoding, errors=errors, line_buffering=True)
158 188 text.mode = 'r'
159 189 if skip_encoding_cookie:
160 190 return "".join(strip_encoding_cookie(text))
161 191 else:
162 192 return text.read()
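
A minimal usage sketch of the two helpers documented above, assuming the module shown in this diff is importable as openpy (the file path is not visible in the diff) and using placeholder file and URL names. Python 2 syntax, to match the code in the diff.

    import openpy  # assumed import name for the module in this diff

    # 'example_module.py' and the URL below are placeholders, not part of the change.
    # With skip_encoding_cookie=True (the default) the coding declaration line is
    # dropped, so the resulting unicode string compiles without a SyntaxError on Python 2.
    source = openpy.read_py_file('example_module.py')
    code = compile(source, 'example_module.py', 'exec')

    # Read a raw .py file over HTTP; undecodable bytes are replaced by default.
    remote = openpy.read_py_url('http://example.com/example_module.py',
                                errors='replace', skip_encoding_cookie=True)
    print remote[:80]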
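The lower-level pieces can be used directly as well; a short sketch under the same assumptions (importable as openpy, 'some_module.py' is a placeholder file on disk):

    import io
    import openpy  # assumed import name for the module in this diff

    # detect_encoding() reads at most two lines and reports the declared
    # encoding (or the utf-8 default), plus the raw byte lines it consumed.
    with io.open('some_module.py', 'rb') as f:   # 'some_module.py' is a placeholder
        encoding, consumed = openpy.detect_encoding(f.readline)
    print encoding   # e.g. 'utf-8', 'iso-8859-1', or 'utf-8-sig'

    # openpy.open() reopens the file in text mode with the detected encoding;
    # strip_encoding_cookie() then skips the coding declaration if it appears
    # in the first two lines.
    with openpy.open('some_module.py') as f:
        text = u''.join(openpy.strip_encoding_cookie(f))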