openpy.py
105 lines
| 3.3 KiB
| text/x-python
|
PythonLexer
Thomas Kluyver
|
"""
Tools to open .py files as Unicode, using the encoding specified within the file,
as per PEP 263.

Much of the code is taken from the tokenize module in Python 3.2.
"""
import io | ||||
Jörgen Stenarson
|
r8309 | from io import TextIOWrapper, BytesIO | ||
rushabh-v
|
r26026 | from pathlib import Path | ||
Thomas Kluyver
|
r6247 | import re | ||
Srinivas Reddy Thatiparthy
|
r23073 | from tokenize import open, detect_encoding | ||
Thomas Kluyver
|
r13353 | |||
# Matches an encoding declaration ("coding:" or "coding=") anywhere in a line;
# group 1 captures the encoding name.
cookie_re = re.compile(
    r"coding[:=]\s*([-\w.]+)",
    flags=re.UNICODE,
)
# Same declaration, but only when it sits inside a ``#`` comment that is
# preceded by nothing except whitespace.
cookie_comment_re = re.compile(
    r"^\s*#.*coding[:=]\s*([-\w.]+)",
    flags=re.UNICODE,
)
Thomas Kluyver
|
r6247 | |||
Jörgen Stenarson
|
def source_to_unicode(txt, errors='replace', skip_encoding_cookie=True):
    """Convert Python source held as bytes into a unicode string.

    Unicode strings are passed through unchanged. Binary input is checked
    for the Python source file encoding cookie (via
    ``tokenize.detect_encoding``) to determine the encoding; if detection
    raises ``SyntaxError`` the data is decoded as ASCII instead.

    Parameters
    ----------
    txt : str, bytes-like, or binary file-like object
        The source code. ``bytes``, ``bytearray`` and ``memoryview`` values
        are wrapped in a ``BytesIO``. Any other non-``str`` value is used
        directly as a binary buffer (its ``readline`` and ``seek`` are
        called); such a buffer is rewound to position 0 and is closed when
        this function returns.
    errors : str
        Decoding error handler passed to ``TextIOWrapper``; 'replace' is
        the default.
    skip_encoding_cookie : bool
        If True (the default), and the encoding declaration is found in the
        first two lines, that line will be excluded from the output.

    Returns
    -------
    A unicode string containing the source code.
    """
    if isinstance(txt, str):
        return txt
    # Accept every in-memory bytes-like object, not only exact ``bytes``;
    # BytesIO copies the data, so the caller's object is left untouched.
    if isinstance(txt, (bytes, bytearray, memoryview)):
        buffer = BytesIO(txt)
    else:
        buffer = txt
    try:
        encoding, _ = detect_encoding(buffer.readline)
    except SyntaxError:
        # Invalid or unknown encoding declaration: fall back to ASCII and
        # let ``errors`` decide what happens to undecodable bytes.
        encoding = "ascii"
    # detect_encoding consumed up to two lines; decode from the start again.
    buffer.seek(0)
    with TextIOWrapper(buffer, encoding, errors=errors, line_buffering=True) as text:
        # NOTE(review): presumably kept for consumers that inspect ``mode``
        # on file-like objects -- confirm before removing.
        text.mode = 'r'
        if skip_encoding_cookie:
            return "".join(strip_encoding_cookie(text))
        return text.read()
Jörgen Stenarson
|
r8304 | |||
Thomas Kluyver
|
def strip_encoding_cookie(filelike):
    """Generator yielding the lines of a text-mode file, leaving out any of
    the first two lines that match ``cookie_comment_re`` (an encoding
    declaration inside a comment).

    Every line after the second is yielded unchanged. Input with fewer than
    two lines simply ends the generator early.
    """
    lines = iter(filelike)
    # Only the first two lines are candidates for the encoding declaration.
    for _ in range(2):
        try:
            candidate = next(lines)
            if cookie_comment_re.match(candidate) is None:
                yield candidate
        except StopIteration:
            return
    for remaining in lines:
        yield remaining
Thomas Kluyver
|
def read_py_file(filename, skip_encoding_cookie=True):
    """Return the text of a Python file, decoded with the encoding declared
    inside the file.

    Parameters
    ----------
    filename : str
        Path of the file to read.
    skip_encoding_cookie : bool
        When True (the default), an encoding declaration found within the
        first two lines is left out of the returned text.

    Returns
    -------
    A unicode string holding the contents of the file.
    """
    # ``open`` is the function imported from ``tokenize`` at module level,
    # not the builtin.
    with open(Path(filename)) as source:
        if not skip_encoding_cookie:
            return source.read()
        return "".join(strip_encoding_cookie(source))
Thomas Kluyver
|
r6301 | |||
def read_py_url(url, errors='replace', skip_encoding_cookie=True):
    """Read a Python file from a URL, using the encoding declared inside the file.

    Parameters
    ----------
    url : str
        The URL from which to fetch the file.
    errors : str
        How to handle decoding errors in the file. Options are the same as for
        bytes.decode(), but here 'replace' is the default.
    skip_encoding_cookie : bool
        If True (the default), and the encoding declaration is found in the first
        two lines, that line will be excluded from the output.

    Returns
    -------
    A unicode string containing the contents of the file.
    """
    # Deferred import for faster start
    from urllib.request import urlopen

    # Read the whole payload, then release the response (and its underlying
    # connection or file handle) deterministically instead of leaving it
    # for the garbage collector.
    with urlopen(url) as response:
        buffer = io.BytesIO(response.read())
    return source_to_unicode(buffer, errors, skip_encoding_cookie)