r6247 | """ | ||
Tools to open .py files as Unicode, using the encoding specified within the file, | ||||
as per PEP 263. | ||||
Much of the code is taken from the tokenize module in Python 3.2. | ||||
""" | ||||
Thomas Kluyver
|
r6301 | from __future__ import absolute_import | ||
Thomas Kluyver
|
r6247 | |||
import io | ||||
Jörgen Stenarson
|
r8309 | from io import TextIOWrapper, BytesIO | ||
Thomas Kluyver
|
r6247 | import re | ||
import urllib | ||||
cookie_re = re.compile(ur"coding[:=]\s*([-\w.]+)", re.UNICODE) | ||||
cookie_comment_re = re.compile(ur"^\s*#.*coding[:=]\s*([-\w.]+)", re.UNICODE) | ||||
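# For reference, a typical PEP 263 encoding declaration these patterns match:
#
#     # -*- coding: utf-8 -*-
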
try:
    # Available in Python 3
    from tokenize import detect_encoding
except ImportError:
    from codecs import lookup, BOM_UTF8

    # Copied from Python 3.2 tokenize
    def _get_normal_name(orig_enc):
        """Imitates get_normal_name in tokenizer.c."""
        # Only care about the first 12 characters.
        enc = orig_enc[:12].lower().replace("_", "-")
        if enc == "utf-8" or enc.startswith("utf-8-"):
            return "utf-8"
        if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
           enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
            return "iso-8859-1"
        return orig_enc

    # Copied from Python 3.2 tokenize
    def detect_encoding(readline):
        """
        The detect_encoding() function is used to detect the encoding that should
        be used to decode a Python source file. It requires one argument, readline,
        in the same way as the tokenize() generator.

        It will call readline a maximum of twice, and return the encoding used
        (as a string) and a list of any lines (left as bytes) it has read in.

        It detects the encoding from the presence of a utf-8 bom or an encoding
        cookie as specified in pep-0263. If both a bom and a cookie are present,
        but disagree, a SyntaxError will be raised. If the encoding cookie is an
        invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,
        'utf-8-sig' is returned.

        If no encoding is specified, then the default of 'utf-8' will be returned.
        """
        bom_found = False
        encoding = None
        default = 'utf-8'

        def read_or_stop():
            try:
                return readline()
            except StopIteration:
                return b''

        def find_cookie(line):
            try:
                line_string = line.decode('ascii')
            except UnicodeDecodeError:
                return None

            matches = cookie_re.findall(line_string)
            if not matches:
                return None
            encoding = _get_normal_name(matches[0])
            try:
                codec = lookup(encoding)
            except LookupError:
                # This behaviour mimics the Python interpreter
                raise SyntaxError("unknown encoding: " + encoding)

            if bom_found:
                if codec.name != 'utf-8':
                    # This behaviour mimics the Python interpreter
                    raise SyntaxError('encoding problem: utf-8')
                encoding += '-sig'
            return encoding

        first = read_or_stop()
        if first.startswith(BOM_UTF8):
            bom_found = True
            first = first[3:]
            default = 'utf-8-sig'
        if not first:
            return default, []

        encoding = find_cookie(first)
        if encoding:
            return encoding, [first]

        second = read_or_stop()
        if not second:
            return default, [first]

        encoding = find_cookie(second)
        if encoding:
            return encoding, [first, second]

        return default, [first, second]

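# A minimal usage sketch (illustrative, not part of the original module):
# detect_encoding takes a readline callable returning bytes, e.g. from a BytesIO:
#
#     enc, consumed = detect_encoding(BytesIO(b"# -*- coding: latin-1 -*-\nx = 1\n").readline)
#     # enc == 'iso-8859-1'; consumed holds the raw byte line(s) already read
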
try:
    # Available in Python 3.2 and above.
    from tokenize import open
except ImportError:
    # Copied from Python 3.2 tokenize
    def open(filename):
        """Open a file in read only mode using the encoding detected by
        detect_encoding().
        """
        buffer = io.open(filename, 'rb')   # Tweaked to use io.open for Python 2
        encoding, lines = detect_encoding(buffer.readline)
        buffer.seek(0)
        text = TextIOWrapper(buffer, encoding, line_buffering=True)
        text.mode = 'r'
        return text

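# Usage sketch (assumes 'example.py' is an existing file path): unlike the
# builtin open, this returns a text stream decoded with the file's own
# declared encoding:
#
#     f = open('example.py')   # the open defined/imported just above
#     source = f.read()        # unicode text, decoded per the PEP 263 cookie
#     f.close()
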
def source_to_unicode(txt, errors='replace', skip_encoding_cookie=True):
    """Converts a bytes string with python source code to unicode.

    Unicode strings are passed through unchanged. Byte strings are checked
    for the python source file encoding cookie to determine encoding.
    txt can be either a bytes buffer or a string containing the source
    code.
    """
    if isinstance(txt, unicode):
        return txt
    if isinstance(txt, bytes):
        buffer = BytesIO(txt)
    else:
        buffer = txt
    try:
        encoding, _ = detect_encoding(buffer.readline)
    except SyntaxError:
        encoding = "ascii"
    buffer.seek(0)
    text = TextIOWrapper(buffer, encoding, errors=errors, line_buffering=True)
    text.mode = 'r'
    if skip_encoding_cookie:
        return u"".join(strip_encoding_cookie(text))
    else:
        return text.read()

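# A small usage sketch (illustrative only): byte strings are decoded according
# to their PEP 263 cookie, and unicode input is returned unchanged:
#
#     source_to_unicode(b"# -*- coding: utf-8 -*-\nx = 1\n")
#     # -> u'x = 1\n' (the cookie line is stripped by default)
#     source_to_unicode(u"x = 1\n")
#     # -> u'x = 1\n'
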
def strip_encoding_cookie(filelike):
    """Generator to pull lines from a text-mode file, skipping the encoding
    cookie if it is found in the first two lines.
    """
    it = iter(filelike)
    try:
        first = next(it)
        if not cookie_comment_re.match(first):
            yield first
        second = next(it)
        if not cookie_comment_re.match(second):
            yield second
    except StopIteration:
        return

    for line in it:
        yield line

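# Illustrative sketch: a coding cookie in the first two lines is dropped and
# everything else passes through unchanged:
#
#     lines = [u"#!/usr/bin/env python\n", u"# -*- coding: utf-8 -*-\n", u"x = 1\n"]
#     list(strip_encoding_cookie(lines))
#     # -> [u'#!/usr/bin/env python\n', u'x = 1\n']
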
def read_py_file(filename, skip_encoding_cookie=True):
    """Read a Python file, using the encoding declared inside the file.

    Parameters
    ----------
    filename : str
      The path to the file to read.
    skip_encoding_cookie : bool
      If True (the default), and the encoding declaration is found in the first
      two lines, that line will be excluded from the output - compiling a
      unicode string with an encoding declaration is a SyntaxError in Python 2.

    Returns
    -------
    A unicode string containing the contents of the file.
    """
    with open(filename) as f:   # the open function defined in this module.
        if skip_encoding_cookie:
            return "".join(strip_encoding_cookie(f))
        else:
            return f.read()

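# Usage sketch (the path is a placeholder): read a source file as unicode with
# the cookie removed, so the result can be compiled directly on Python 2:
#
#     src = read_py_file('some_module.py')
#     code = compile(src, 'some_module.py', 'exec')
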
def read_py_url(url, errors='replace', skip_encoding_cookie=True):
    """Read a Python file from a URL, using the encoding declared inside the file.

    Parameters
    ----------
    url : str
      The URL from which to fetch the file.
    errors : str
      How to handle decoding errors in the file. Options are the same as for
      bytes.decode(), but here 'replace' is the default.
    skip_encoding_cookie : bool
      If True (the default), and the encoding declaration is found in the first
      two lines, that line will be excluded from the output - compiling a
      unicode string with an encoding declaration is a SyntaxError in Python 2.

    Returns
    -------
    A unicode string containing the contents of the file.
    """
    response = urllib.urlopen(url)
    buffer = io.BytesIO(response.read())
    return source_to_unicode(buffer, errors, skip_encoding_cookie)

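# Usage sketch (the URL is a placeholder, not a real resource):
#
#     src = read_py_url('http://example.com/some_script.py')
#     # src is a unicode string, decoded per the file's own encoding cookie
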
def _list_readline(x):
    """Given a list, returns a readline() function that returns the next element
    with each call.
    """
    x = iter(x)
    def readline():
        return next(x)
    return readline

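# Illustrative sketch: adapt a list of byte lines to the readline-style
# interface that detect_encoding() expects:
#
#     enc, consumed = detect_encoding(_list_readline([b"# coding: utf-8\n", b"x = 1\n"]))
#     # enc == 'utf-8'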