openpy.py
105 lines
| 3.4 KiB
| text/x-python
|
PythonLexer
Thomas Kluyver
|
r6247 | """ | ||
Tools to open .py files as Unicode, using the encoding specified within the file, | ||||
as per PEP 263. | ||||
Much of the code is taken from the tokenize module in Python 3.2. | ||||
""" | ||||
import io | ||||
Jörgen Stenarson
|
r8309 | from io import TextIOWrapper, BytesIO | ||
Thomas Kluyver
|
r6247 | import re | ||
Srinivas Reddy Thatiparthy
|
r23073 | from tokenize import open, detect_encoding | ||
Thomas Kluyver
|
r13353 | |||
# PEP 263 encoding declaration, searched for anywhere in a string.
# Group 1 captures the declared codec name.
cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)", flags=re.UNICODE)

# The same declaration, but anchored so that it only matches when the
# declaration sits inside a comment that starts the line (optionally after
# leading whitespace). Group 1 captures the declared codec name.
cookie_comment_re = re.compile(
    r"^\s*#.*coding[:=]\s*([-\w.]+)",
    flags=re.UNICODE,
)
Thomas Kluyver
|
r6247 | |||
Jörgen Stenarson
|
def source_to_unicode(txt, errors='replace', skip_encoding_cookie=True):
    """Convert Python source code given as bytes into a unicode string.

    The encoding is determined with ``tokenize.detect_encoding`` (PEP 263
    cookie or BOM). If detection raises ``SyntaxError``, the source is
    decoded as ASCII instead.

    Parameters
    ----------
    txt : str, bytes, or binary file-like object
        ``str`` input is returned unchanged. ``bytes`` input is wrapped in a
        ``BytesIO``. Any other object is used directly as the buffer; it must
        provide ``readline`` and ``seek`` and be acceptable to
        ``io.TextIOWrapper``. The buffer is closed when this function returns.
    errors : str
        Decoding error handler passed to ``TextIOWrapper``; ``'replace'`` by
        default.
    skip_encoding_cookie : bool
        If True (the default), the text is passed through
        ``strip_encoding_cookie`` so that an encoding declaration in the
        first two lines is left out of the result.

    Returns
    -------
    str
        The decoded source code.
    """
    if isinstance(txt, str):
        return txt

    buffer = BytesIO(txt) if isinstance(txt, bytes) else txt

    try:
        encoding, _ = detect_encoding(buffer.readline)
    except SyntaxError:
        encoding = "ascii"
    # detect_encoding consumed up to two lines; rewind before decoding.
    buffer.seek(0)

    # Use the wrapper as a context manager so that it (and the underlying
    # buffer) is closed deterministically instead of whenever the garbage
    # collector finalizes it.
    with TextIOWrapper(buffer, encoding, errors=errors,
                       line_buffering=True) as text:
        text.mode = 'r'
        if skip_encoding_cookie:
            return "".join(strip_encoding_cookie(text))
        return text.read()
Jörgen Stenarson
|
r8304 | |||
Thomas Kluyver
|
def strip_encoding_cookie(filelike):
    """Yield the lines of a text-mode file, omitting encoding cookies.

    Each of the first two lines is dropped if it matches
    ``cookie_comment_re``; every later line is yielded unchanged. Input with
    fewer than two lines simply ends early.
    """
    lines = iter(filelike)
    # Only the first two lines are candidates for an encoding declaration.
    for _ in range(2):
        try:
            candidate = next(lines)
        except StopIteration:
            return
        if cookie_comment_re.match(candidate) is None:
            yield candidate
    yield from lines
Thomas Kluyver
|
def read_py_file(filename, skip_encoding_cookie=True):
    """Read a Python file, using the encoding declared inside the file.

    Parameters
    ----------
    filename : str
        The path to the file to read.
    skip_encoding_cookie : bool
        If True (the default), and the encoding declaration is found in the
        first two lines, that line will be excluded from the output.

    Returns
    -------
    A unicode string containing the contents of the file.
    """
    # ``open`` is the one imported from ``tokenize`` at the top of this
    # module, not the builtin.
    with open(filename) as source:
        if not skip_encoding_cookie:
            return source.read()
        return "".join(strip_encoding_cookie(source))
def read_py_url(url, errors='replace', skip_encoding_cookie=True):
    """Read a Python file from a URL, using the encoding declared inside the file.

    Parameters
    ----------
    url : str
        The URL from which to fetch the file.
    errors : str
        How to handle decoding errors in the file. Options are the same as for
        bytes.decode(), but here 'replace' is the default.
    skip_encoding_cookie : bool
        If True (the default), and the encoding declaration is found in the
        first two lines, that line will be excluded from the output.

    Returns
    -------
    A unicode string containing the contents of the file.
    """
    # Deferred import for faster start
    from urllib.request import urlopen

    # Read the whole payload, then release the connection immediately
    # instead of leaving the response open until it is garbage collected.
    with urlopen(url) as response:
        buffer = io.BytesIO(response.read())
    return source_to_unicode(buffer, errors, skip_encoding_cookie)