##// END OF EJS Templates
Merge pull request #12559 from rushabh-v/pathlib_op
Matthias Bussonnier -
r26099:fc4e583d merge
parent child Browse files
Show More
@@ -1,103 +1,105 b''
1 """
1 """
2 Tools to open .py files as Unicode, using the encoding specified within the file,
2 Tools to open .py files as Unicode, using the encoding specified within the file,
3 as per PEP 263.
3 as per PEP 263.
4
4
5 Much of the code is taken from the tokenize module in Python 3.2.
5 Much of the code is taken from the tokenize module in Python 3.2.
6 """
6 """
7
7
8 import io
8 import io
9 from io import TextIOWrapper, BytesIO
9 from io import TextIOWrapper, BytesIO
10 from pathlib import Path
10 import re
11 import re
11 from tokenize import open, detect_encoding
12 from tokenize import open, detect_encoding
12
13
# Matches a PEP 263 style "coding:" / "coding=" declaration anywhere in a
# string and captures the declared encoding name.
cookie_re = re.compile(
    r"coding[:=]\s*([-\w.]+)",
    flags=re.UNICODE,
)

# Same declaration, but only when it sits inside a comment that is the first
# non-whitespace content of the line.
cookie_comment_re = re.compile(
    r"^\s*#.*coding[:=]\s*([-\w.]+)",
    flags=re.UNICODE,
)
15
16
def source_to_unicode(txt, errors='replace', skip_encoding_cookie=True):
    """Convert Python source given as bytes into a unicode string.

    Unicode strings are passed through unchanged. Byte input is inspected
    with ``tokenize.detect_encoding`` to determine how it should be decoded;
    if detection raises ``SyntaxError`` the input is decoded as ASCII instead.

    Parameters
    ----------
    txt : str, bytes or seekable binary file-like object
        The source code. A file-like object is rewound to its start before
        decoding and is closed by the time this function returns, because
        the decoding wrapper around it is used as a context manager.
    errors : str
        How to handle decoding errors. Options are the same as for
        bytes.decode(), but here 'replace' is the default.
    skip_encoding_cookie : bool
        If True (the default), and the encoding declaration is found in the
        first two lines, that line will be excluded from the output.

    Returns
    -------
    A unicode string containing the decoded source.
    """
    if isinstance(txt, str):
        return txt

    buffer = BytesIO(txt) if isinstance(txt, bytes) else txt

    try:
        encoding, _ = detect_encoding(buffer.readline)
    except SyntaxError:
        # Unusable or contradictory encoding declaration: fall back to ASCII
        # and let ``errors`` decide what happens to any non-ASCII bytes.
        encoding = "ascii"

    # detect_encoding consumed up to two lines; decode from the beginning.
    buffer.seek(0)
    with TextIOWrapper(buffer, encoding, errors=errors, line_buffering=True) as text:
        if skip_encoding_cookie:
            return "".join(strip_encoding_cookie(text))
        return text.read()
41
42
def strip_encoding_cookie(filelike):
    """Yield the lines of a text-mode file, leaving out encoding cookies.

    Only the first two lines are checked against ``cookie_comment_re``; any
    of them that matches is dropped. Every later line is yielded unchanged.
    """
    lines = iter(filelike)

    # An encoding declaration is only looked for on the first two lines.
    for _ in range(2):
        try:
            candidate = next(lines)
        except StopIteration:
            # Fewer than two lines in the input: nothing more to produce.
            return
        if cookie_comment_re.match(candidate) is None:
            yield candidate

    yield from lines
59
60
def read_py_file(filename, skip_encoding_cookie=True):
    """Return the text of a Python file, decoded with its declared encoding.

    Parameters
    ----------
    filename : str
        The path to the file to read.
    skip_encoding_cookie : bool
        If True (the default), and the encoding declaration is found in the first
        two lines, that line will be excluded from the output.

    Returns
    -------
    A unicode string containing the contents of the file.
    """
    source_path = Path(filename)
    # ``open`` is the one imported from ``tokenize`` at the top of the module,
    # not the builtin.
    with open(source_path) as handle:
        if not skip_encoding_cookie:
            return handle.read()
        return "".join(strip_encoding_cookie(handle))
80
82
def read_py_url(url, errors='replace', skip_encoding_cookie=True):
    """Read a Python file from a URL, using the encoding declared inside the file.

    Parameters
    ----------
    url : str
        The URL from which to fetch the file.
    errors : str
        How to handle decoding errors in the file. Options are the same as for
        bytes.decode(), but here 'replace' is the default.
    skip_encoding_cookie : bool
        If True (the default), and the encoding declaration is found in the first
        two lines, that line will be excluded from the output.

    Returns
    -------
    A unicode string containing the contents of the file.
    """
    # Deferred import for faster start
    from urllib.request import urlopen

    # Close the response as soon as the payload has been read instead of
    # leaving the connection open until garbage collection.
    with urlopen(url) as response:
        payload = response.read()

    buffer = io.BytesIO(payload)
    return source_to_unicode(buffer, errors, skip_encoding_cookie)
General Comments 0
You need to be logged in to leave comments. Login now