##// END OF EJS Templates
Add IPython.utils.openpy to decode Python files.
Thomas Kluyver -
Show More
@@ -0,0 +1,158 b''
1 """
2 Tools to open .py files as Unicode, using the encoding specified within the file,
3 as per PEP 263.
4
5 Much of the code is taken from the tokenize module in Python 3.2.
6 """
7
8 import __builtin__
9 import io
10 from io import TextIOWrapper
11 import re
12 import urllib
13
# PEP 263 encoding declaration, matched anywhere in a line; group 1 is the
# declared encoding name. Plain raw strings are used instead of ur"..."
# literals because the ``ur`` prefix is a syntax error on Python 3, while an
# ASCII-only raw pattern matches identically on Python 2 and Python 3.
cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)", re.UNICODE)
# Same declaration, but only when it appears inside a comment that is the
# first non-whitespace content of the line; group 1 is the encoding name.
cookie_comment_re = re.compile(r"^\s*#.*coding[:=]\s*([-\w.]+)", re.UNICODE)
16
17 try:
18 # Available in Python 3
19 from tokenize import detect_encoding
20 except ImportError:
21 from codecs import lookup, BOM_UTF8
22
23 # Copied from Python 3.2 tokenize
24 def _get_normal_name(orig_enc):
25 """Imitates get_normal_name in tokenizer.c."""
26 # Only care about the first 12 characters.
27 enc = orig_enc[:12].lower().replace("_", "-")
28 if enc == "utf-8" or enc.startswith("utf-8-"):
29 return "utf-8"
30 if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
31 enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
32 return "iso-8859-1"
33 return orig_enc
34
35 # Copied from Python 3.2 tokenize
def detect_encoding(readline):
    """Work out which encoding should be used to decode a Python source file.

    *readline* is a callable returning one line of bytes per call, as for the
    tokenize() generator.  It is called at most twice; a StopIteration raised
    by it is treated as end of input.

    Returns a tuple ``(encoding, lines)`` where *encoding* is a string and
    *lines* is the list of lines (still bytes) that were consumed.  A leading
    UTF-8 BOM is removed from the first line before it is stored.

    The encoding comes from a UTF-8 BOM and/or an encoding cookie as
    specified in PEP 263.  When a BOM is present the result is 'utf-8-sig'.
    When neither is present the default 'utf-8' is returned.

    Raises SyntaxError if the cookie names an unknown codec, or if a BOM is
    present and the cookie names something other than UTF-8.
    """
    def next_line():
        # An exhausted source counts as an empty line.
        try:
            return readline()
        except StopIteration:
            return b''

    def cookie_encoding(raw_line, has_bom):
        # A line that is not pure ASCII is treated as carrying no cookie.
        try:
            text = raw_line.decode('ascii')
        except UnicodeDecodeError:
            return None

        found = cookie_re.findall(text)
        if not found:
            return None

        name = _get_normal_name(found[0])
        try:
            codec = lookup(name)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + name)

        if not has_bom:
            return name
        if codec.name != 'utf-8':
            # This behaviour mimics the Python interpreter
            raise SyntaxError('encoding problem: utf-8')
        return name + '-sig'

    line = next_line()
    has_bom = line.startswith(BOM_UTF8)
    if has_bom:
        # Drop the three BOM bytes so they are not handed back to the caller.
        line = line[3:]
    fallback = 'utf-8-sig' if has_bom else 'utf-8'

    consumed = []
    # A cookie is only looked for on the first two lines.
    for attempt in range(2):
        if attempt:
            line = next_line()
        if not line:
            return fallback, consumed
        consumed.append(line)
        declared = cookie_encoding(line, has_bom)
        if declared:
            return declared, consumed

    return fallback, consumed
106
try:
    # Available in Python 3.2 and above.
    from tokenize import open
except ImportError:
    # Only a missing tokenize.open should trigger the fallback; a bare
    # ``except:`` here would also hide unrelated errors.

    # Copied from Python 3.2 tokenize
    def open(filename):
        """Open a file in read only mode using the encoding detected by
        detect_encoding().

        Returns a line-buffered text wrapper positioned at the start of the
        file, with its ``mode`` attribute set to 'r'.  If detecting the
        encoding or building the wrapper fails, the underlying binary file
        is closed before the exception propagates.
        """
        raw = io.open(filename, 'rb')   # Tweaked to use io.open for Python 2
        try:
            encoding = detect_encoding(raw.readline)[0]
            # Detection consumed up to two lines; rewind so the caller sees
            # the whole file.
            raw.seek(0)
            text = TextIOWrapper(raw, encoding, line_buffering=True)
            text.mode = 'r'
            return text
        except BaseException:
            # Do not leak the file handle when detection fails.
            raw.close()
            raise
122
def open_url(url, errors='replace'):
    """Open a URL to a raw Python file, using the encoding detected by
    detect_encoding().

    The whole response body is read into memory first, because the encoding
    detection has to rewind the stream afterwards and a URL response cannot
    be rewound (io.BufferedRandom rejects non-seekable streams).

    *errors* is passed to the text wrapper as its decoding error handler.

    Returns a line-buffered text wrapper over the downloaded bytes,
    positioned at the start, with its ``mode`` attribute set to 'r'.
    """
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2

    response = urlopen(url)
    try:
        buffer = io.BytesIO(response.read())
    finally:
        # Everything needed has been copied; release the connection.
        response.close()

    encoding = detect_encoding(buffer.readline)[0]
    # Detection consumed up to two lines; rewind so the caller sees it all.
    buffer.seek(0)
    text = TextIOWrapper(buffer, encoding, errors=errors, line_buffering=True)
    text.mode = 'r'
    return text
134
def strip_encoding_cookie(filelike):
    """Yield the lines of a text-mode file, leaving out any encoding cookie.

    Each of the first two lines is dropped if it matches cookie_comment_re
    (a comment carrying a coding declaration).  Every later line is passed
    through untouched.  Input shorter than two lines simply ends early.
    """
    lines = iter(filelike)
    try:
        # Only the first two lines are checked for a cookie.
        for _ in range(2):
            candidate = next(lines)
            if cookie_comment_re.match(candidate) is None:
                yield candidate
    except StopIteration:
        return

    for remaining in lines:
        yield remaining
152
def read_py_file(filename, skip_encoding_cookie=True):
    """Read a Python source file and return its contents as text.

    The file is opened with this module's open(), so it is decoded with the
    encoding declared in the file, and it is closed again before returning.

    If *skip_encoding_cookie* is true (the default), an encoding cookie found
    in the first two lines is left out of the result; otherwise the text is
    returned as is.
    """
    # The context manager guarantees the handle is released even if reading
    # or decoding raises.
    with open(filename) as f:   # the open function defined in this module.
        if skip_encoding_cookie:
            return "".join(strip_encoding_cookie(f))
        return f.read()
General Comments 0
You need to be logged in to leave comments. Login now