# Origin: commit 8ec813a4f6c00cd1aa90181c8a079ce0a8408c61 (Thomas Kluyver,
# 2012-03-14) "Add IPython.utils.openpy to decode Python files."
# Target path: IPython/utils/openpy.py
"""
Tools to open .py files as Unicode, using the encoding specified within the file,
as per PEP 263.

Much of the code is taken from the tokenize module in Python 3.2.
"""

try:
    import __builtin__
except ImportError:
    # Python 3 renamed the module; keep the historical name bound.
    import builtins as __builtin__
import io
from io import TextIOWrapper
import re
import urllib

try:
    # Python 3 location of urlopen.
    from urllib.request import urlopen
except ImportError:
    # Python 2 location of urlopen.
    from urllib import urlopen

# Plain raw strings (rather than ``ur"..."``) so the module also parses on
# Python 3; the patterns are pure ASCII, so matching is unchanged.
cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)", re.UNICODE)
cookie_comment_re = re.compile(r"^\s*#.*coding[:=]\s*([-\w.]+)", re.UNICODE)

try:
    # Available in Python 3
    from tokenize import detect_encoding
except ImportError:
    from codecs import lookup, BOM_UTF8

    # Copied from Python 3.2 tokenize
    def _get_normal_name(orig_enc):
        """Imitates get_normal_name in tokenizer.c."""
        # Only care about the first 12 characters.
        enc = orig_enc[:12].lower().replace("_", "-")
        if enc == "utf-8" or enc.startswith("utf-8-"):
            return "utf-8"
        if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
           enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
            return "iso-8859-1"
        return orig_enc

    # Copied from Python 3.2 tokenize
    def detect_encoding(readline):
        """
        The detect_encoding() function is used to detect the encoding that should
        be used to decode a Python source file.  It requires one argument, readline,
        in the same way as the tokenize() generator.

        It will call readline a maximum of twice, and return the encoding used
        (as a string) and a list of any lines (left as bytes) it has read in.

        It detects the encoding from the presence of a utf-8 bom or an encoding
        cookie as specified in pep-0263.  If both a bom and a cookie are present,
        but disagree, a SyntaxError will be raised.  If the encoding cookie is an
        invalid charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
        'utf-8-sig' is returned.

        If no encoding is specified, then the default of 'utf-8' will be returned.
        """
        bom_found = False
        encoding = None
        default = 'utf-8'

        def read_or_stop():
            # An exhausted readline is treated the same as an empty line.
            try:
                return readline()
            except StopIteration:
                return b''

        def find_cookie(line):
            # A line that is not pure ASCII cannot carry a valid cookie.
            try:
                line_string = line.decode('ascii')
            except UnicodeDecodeError:
                return None

            matches = cookie_re.findall(line_string)
            if not matches:
                return None
            encoding = _get_normal_name(matches[0])
            try:
                codec = lookup(encoding)
            except LookupError:
                # This behaviour mimics the Python interpreter
                raise SyntaxError("unknown encoding: " + encoding)

            if bom_found:
                if codec.name != 'utf-8':
                    # This behaviour mimics the Python interpreter
                    raise SyntaxError('encoding problem: utf-8')
                encoding += '-sig'
            return encoding

        first = read_or_stop()
        if first.startswith(BOM_UTF8):
            bom_found = True
            first = first[3:]
            default = 'utf-8-sig'
        if not first:
            return default, []

        encoding = find_cookie(first)
        if encoding:
            return encoding, [first]

        second = read_or_stop()
        if not second:
            return default, [first]

        encoding = find_cookie(second)
        if encoding:
            return encoding, [first, second]

        return default, [first, second]

try:
    # Available in Python 3.2 and above.
    from tokenize import open
except ImportError:
    # Copied from Python 3.2 tokenize
    def open(filename):
        """Open a file in read only mode using the encoding detected by
        detect_encoding().

        Returns a text-mode wrapper positioned at the start of the file.
        Any error raised while detecting the encoding (for example the
        SyntaxError for an unknown cookie) propagates to the caller after
        the underlying binary handle has been closed.
        """
        buffer = io.open(filename, 'rb')   # Tweaked to use io.open for Python 2
        try:
            encoding, lines = detect_encoding(buffer.readline)
            # detect_encoding consumed up to two lines; rewind so the
            # caller sees the whole file.
            buffer.seek(0)
            text = TextIOWrapper(buffer, encoding, line_buffering=True)
            text.mode = 'r'
            return text
        except BaseException:
            # Do not leak the binary handle when detection/wrapping fails.
            buffer.close()
            raise


def open_url(url, errors='replace'):
    """Open a URL to a raw Python file, using the encoding detected by
    detect_encoding().

    url : location of the Python source to fetch.
    errors : decoding error policy handed to the text wrapper
        (default 'replace').

    Returns a text-mode stream over the complete downloaded source,
    positioned at the start.
    """
    response = urlopen(url)
    try:
        # URL responses are not seekable, so the payload is read into an
        # in-memory buffer that can be rewound after encoding detection.
        buffer = io.BytesIO(response.read())
    finally:
        response.close()
    encoding, lines = detect_encoding(buffer.readline)
    buffer.seek(0)
    text = TextIOWrapper(buffer, encoding, errors=errors, line_buffering=True)
    text.mode = 'r'
    return text


def strip_encoding_cookie(filelike):
    """Generator to pull lines from a text-mode file, skipping the encoding
    cookie if it is found in the first two lines.
    """
    it = iter(filelike)
    try:
        first = next(it)
        if not cookie_comment_re.match(first):
            yield first
        second = next(it)
        if not cookie_comment_re.match(second):
            yield second
    except StopIteration:
        # Fewer than two lines available: nothing more to yield.
        return

    for line in it:
        yield line


def read_py_file(filename, skip_encoding_cookie=True):
    """Read a Python source file and return its decoded text.

    filename : path of the file to read; it is opened with this module's
        encoding-aware open().
    skip_encoding_cookie : when true (the default), an encoding cookie
        comment found in the first two lines is left out of the result.

    The file is closed before returning.
    """
    with open(filename) as f:   # the open function defined in this module.
        if skip_encoding_cookie:
            return "".join(strip_encoding_cookie(f))
        return f.read()