From 8ec813a4f6c00cd1aa90181c8a079ce0a8408c61 2012-03-14 00:24:09
From: Thomas Kluyver <takowl@gmail.com>
Date: 2012-03-14 00:24:09
Subject: [PATCH] Add IPython.utils.openpy to decode Python files.

---

diff --git a/IPython/utils/openpy.py b/IPython/utils/openpy.py
new file mode 100644
index 0000000..49457cf
--- /dev/null
+++ b/IPython/utils/openpy.py
@@ -0,0 +1,158 @@
+"""
+Tools to open .py files as Unicode, using the encoding specified within the file,
+as per PEP 263.
+
+Much of the code is taken from the tokenize module in Python 3.2.
+"""
+
+import __builtin__
+import io
+from io import TextIOWrapper
+import re
+import urllib
+
+cookie_re = re.compile(ur"coding[:=]\s*([-\w.]+)", re.UNICODE)   # searched by detect_encoding()
+cookie_comment_re = re.compile(ur"^\s*#.*coding[:=]\s*([-\w.]+)", re.UNICODE)   # matched by strip_encoding_cookie()
+
+try:
+    # Available in Python 3
+    from tokenize import detect_encoding
+except ImportError:
+    from codecs import lookup, BOM_UTF8
+    
+    # Copied from Python 3.2 tokenize
+    def _get_normal_name(orig_enc):
+        """Canonicalise utf-8/latin-1 spellings like get_normal_name in tokenizer.c."""
+        # Only the first 12 characters are examined.
+        head = orig_enc[:12].lower().replace("_", "-")
+        latin1_names = ("latin-1", "iso-8859-1", "iso-latin-1")
+        if head == "utf-8" or head.startswith("utf-8-"):
+            return "utf-8"
+        if head in latin1_names or head.startswith(tuple(n + "-" for n in latin1_names)):
+            return "iso-8859-1"
+        return orig_enc
+    
+    # Copied from Python 3.2 tokenize
+    def detect_encoding(readline):
+        """
+        The detect_encoding() function is used to detect the encoding that should
+        be used to decode a Python source file.  It requires one argument, readline,
+        in the same way as the tokenize() generator.
+
+        It will call readline a maximum of twice, and return the encoding used
+        (as a string) and a list of any lines (left as bytes) it has read in.
+
+        It detects the encoding from the presence of a UTF-8 BOM or an encoding
+        cookie as specified in PEP 263.  If both a BOM and a cookie are present,
+        but disagree, a SyntaxError will be raised.  If the encoding cookie is an
+        invalid charset, raise a SyntaxError.  Note that if a UTF-8 BOM is found,
+        'utf-8-sig' is returned.
+
+        If no encoding is specified, then the default of 'utf-8' will be returned.
+        """
+        bom_found = False   # read by find_cookie() below through the closure
+        encoding = None
+        default = 'utf-8'
+        def read_or_stop():
+            try:
+                return readline()
+            except StopIteration:
+                return b''   # an exhausted readline is treated like an empty line
+
+        def find_cookie(line):
+            try:
+                line_string = line.decode('ascii')
+            except UnicodeDecodeError:
+                return None   # a non-ASCII line is treated as having no cookie
+
+            matches = cookie_re.findall(line_string)
+            if not matches:
+                return None
+            encoding = _get_normal_name(matches[0])
+            try:
+                codec = lookup(encoding)
+            except LookupError:
+                # This behaviour mimics the Python interpreter
+                raise SyntaxError("unknown encoding: " + encoding)
+
+            if bom_found:
+                if codec.name != 'utf-8':
+                    # This behaviour mimics the Python interpreter
+                    raise SyntaxError('encoding problem: utf-8')
+                encoding += '-sig'   # BOM plus utf-8 cookie is reported as 'utf-8-sig'
+            return encoding
+
+        first = read_or_stop()
+        if first.startswith(BOM_UTF8):
+            bom_found = True
+            first = first[3:]   # drop the 3-byte BOM before looking for a cookie
+            default = 'utf-8-sig'
+        if not first:
+            return default, []
+
+        encoding = find_cookie(first)
+        if encoding:
+            return encoding, [first]
+
+        second = read_or_stop()
+        if not second:
+            return default, [first]
+
+        encoding = find_cookie(second)
+        if encoding:
+            return encoding, [first, second]
+
+        return default, [first, second]
+
+try:
+    from tokenize import open   # Available in Python 3.2 and above.
+except ImportError:
+    def open(filename):
+        """Open filename read-only, decoded per detect_encoding() (after Python 3.2 tokenize)."""
+        buffer = io.open(filename, 'rb')   # Tweaked to use io.open for Python 2
+        try:
+            encoding, lines = detect_encoding(buffer.readline)
+            buffer.seek(0)   # rewind so the text layer re-reads the sniffed lines
+            text = TextIOWrapper(buffer, encoding, line_buffering=True)
+        except Exception:
+            buffer.close()   # e.g. SyntaxError from a bad cookie: don't leak the handle
+            raise
+        text.mode = 'r'
+        return text
+
+def open_url(url, errors='replace'):
+    """Return a text stream over the Python source at url, decoded with the
+    encoding found by detect_encoding(); errors is the decode error handler."""
+    response = urllib.urlopen(url)
+    buffer = io.BytesIO(response.read())   # responses aren't seekable, so buffer in memory
+    response.close()
+    encoding, lines = detect_encoding(buffer.readline)
+    buffer.seek(0)   # rewind so the text layer re-reads the sniffed lines
+    text = TextIOWrapper(buffer, encoding, errors=errors, line_buffering=True)
+    text.mode = 'r'
+    return text
+
+def strip_encoding_cookie(filelike):
+    """Yield the lines of a text-mode file, leaving out any encoding cookie
+    comment that appears within the first two lines.
+    """
+    lines = iter(filelike)
+    # Only the first two lines are checked for a cookie; everything after
+    # them is passed through untouched.
+    for _ in range(2):
+        try:
+            candidate = next(lines)
+        except StopIteration:
+            return
+        if cookie_comment_re.match(candidate) is None:
+            yield candidate
+
+    for remaining in lines:
+        yield remaining
+
+def read_py_file(filename, skip_encoding_cookie=True):
+    """Return the decoded text of filename, minus its encoding cookie if skip_encoding_cookie."""
+    with open(filename) as f:   # the open function defined in this module; closed on exit
+        if skip_encoding_cookie:
+            return "".join(strip_encoding_cookie(f))
+        return f.read()