Show More
@@ -1,175 +1,209 b'' | |||||
#!/usr/bin/env python3
#
# byteify-strings.py - transform string literals to be Python 3 safe
#
# Copyright 2015 Gregory Szorc <gregory.szorc@gmail.com>
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.
9 |
|
9 | |||
from __future__ import absolute_import

import argparse
import contextlib
import errno
import io
import os
import shutil
import sys
import tempfile
import token
import tokenize
17 |
|
21 | |||
# NOTE(review): the no-op ``if True:`` wrapper is preserved from the original;
# presumably it keeps this function's indentation aligned with another copy of
# the code -- confirm before removing.
if True:
    def replacetokens(tokens, fullname):
        """Transform a stream of tokens from raw to Python 3.

        The following rewrites are applied:

        - unprefixed, single-quoted-style string literals gain a ``b``
          prefix; triple-quoted strings and strings that already carry a
          prefix are emitted untouched;
        - the second argument of ``getattr()``, ``setattr()``, ``hasattr()``
          and ``safehasattr()`` calls (when not used as a method) and the
          first two arguments of ``.encode()`` / ``.decode()`` method calls
          gain a ``u`` prefix if they are unprefixed string literals;
        - ``iteritems()`` / ``itervalues()`` calls are renamed to
          ``items()`` / ``values()``;
        - a ``from mercurial.pycompat import ...`` statement is appended to
          every ``from __future__ import`` line, unless ``fullname`` is
          ``'mercurial.pycompat'``.

        ``tokens`` is a list of tokens as produced by ``tokenize.tokenize()``
        and ``fullname`` is the name of the module being transformed.

        Returns a generator of possibly rewritten tokens.

        The input token list may be mutated as part of processing. However,
        its changes do not necessarily match the output token stream.
        """
        futureimpline = False

        # The following utility functions access the tokens list and the i
        # index of the "for i, t in enumerate(tokens)" loop below.
        def _isop(j, *o):
            """Return True if tokens[j] is an OP with one of the given values

            An index past the end of the token list yields False.
            """
            try:
                return tokens[j].type == token.OP and tokens[j].string in o
            except IndexError:
                return False

        def _findargnofcall(n):
            """Find arg n of a call expression (start at 0)

            Returns index of the first token of that argument, or None if
            there are not that many arguments.

            Assumes that tokens[i + 1] is '('.

            """
            nested = 0
            for j in range(i + 2, len(tokens)):
                if _isop(j, ')', ']', '}'):
                    # end of call, tuple, subscription or dict / set
                    nested -= 1
                    if nested < 0:
                        # closed the call itself before reaching arg n
                        return None
                elif n == 0:
                    # this is the starting position of arg
                    return j
                elif _isop(j, '(', '[', '{'):
                    nested += 1
                elif _isop(j, ',') and nested == 0:
                    # a top-level comma separates two arguments of the call
                    n -= 1

            return None

        def _ensureunicode(j):
            """Make sure the token at j is a unicode string

            This rewrites a string token to include the unicode literal prefix
            so the string transformer won't add the byte prefix.

            Ignores tokens that are not strings. Assumes bounds checking has
            already been done.

            """
            st = tokens[j]
            if st.type == token.STRING and st.string.startswith(("'", '"')):
                tokens[j] = st._replace(string='u%s' % st.string)

        for i, t in enumerate(tokens):
            # Convert most string literals to byte literals. String literals
            # in Python 2 are bytes. String literals in Python 3 are unicode.
            # Most strings in Mercurial are bytes and unicode strings are rare.
            # Rather than rewrite all string literals to use ``b''`` to indicate
            # byte strings, we apply this token transformer to insert the ``b``
            # prefix nearly everywhere.
            if t.type == token.STRING:
                s = t.string

                # Preserve docstrings as string literals. This is inconsistent
                # with regular unprefixed strings. However, the
                # "from __future__" parsing (which allows a module docstring to
                # exist before it) doesn't properly handle the docstring if it
                # is b''' prefixed, leading to a SyntaxError. We leave all
                # docstrings as unprefixed to avoid this. This means Mercurial
                # components touching docstrings need to handle unicode,
                # unfortunately.
                if s[0:3] in ("'''", '"""'):
                    yield t
                    continue

                # If the first character isn't a quote, it is likely a string
                # prefixing character (such as 'b', 'u', or 'r'). Ignore.
                if s[0] not in ("'", '"'):
                    yield t
                    continue

                # String literal. Prefix to make a b'' string.
                yield t._replace(string='b%s' % t.string)
                continue

            # Insert compatibility imports at "from __future__ import" line.
            # No '\n' should be added to preserve line numbers.
            if (t.type == token.NAME and t.string == 'import' and
                all(u.type == token.NAME for u in tokens[i - 2:i]) and
                [u.string for u in tokens[i - 2:i]] == ['from', '__future__']):
                futureimpline = True
            if t.type == token.NEWLINE and futureimpline:
                futureimpline = False
                if fullname == 'mercurial.pycompat':
                    yield t
                    continue
                r, c = t.start
                compatline = (b'; from mercurial.pycompat import '
                              b'delattr, getattr, hasattr, setattr, xrange, '
                              b'open, unicode\n')
                # The injected statement carries its own NEWLINE token, which
                # takes the place of the original one (t is not emitted).
                for u in tokenize.tokenize(io.BytesIO(compatline).readline):
                    if u.type in (tokenize.ENCODING, token.ENDMARKER):
                        continue
                    # shift the injected tokens to the end of the import line
                    yield u._replace(
                        start=(r, c + u.start[1]), end=(r, c + u.end[1]))
                continue

            # This looks like a function call.
            if t.type == token.NAME and _isop(i + 1, '('):
                fn = t.string

                # *attr() builtins don't accept byte strings to 2nd argument.
                if (fn in ('getattr', 'setattr', 'hasattr', 'safehasattr') and
                        not _isop(i - 1, '.')):
                    arg1idx = _findargnofcall(1)
                    if arg1idx is not None:
                        _ensureunicode(arg1idx)

                # .encode() and .decode() on str/bytes/unicode don't accept
                # byte strings on Python 3.
                elif fn in ('encode', 'decode') and _isop(i - 1, '.'):
                    for argn in range(2):
                        argidx = _findargnofcall(argn)
                        if argidx is not None:
                            _ensureunicode(argidx)

                # Rename iteritems/itervalues to items/values as the former
                # are not present in the Python 3 world.
                elif fn in ('iteritems', 'itervalues'):
                    yield t._replace(string=fn[4:])
                    continue

            # Emit unmodified token.
            yield t
159 |
|
163 | |||
def process(fin, fout):
    """Byteify the Python source read from fin and write the result to fout.

    fin is a file object opened in binary mode (its ``readline`` is handed to
    ``tokenize.tokenize()``); fout is a writable binary file object that
    receives the output of ``tokenize.untokenize()``.
    """
    tokens = tokenize.tokenize(fin.readline)
    # replacetokens() indexes into and mutates its input, so it needs a list
    # rather than the token generator.
    tokens = replacetokens(list(tokens), fullname='<dummy>')
    fout.write(tokenize.untokenize(tokens))
164 |
|
168 | |||
|
def tryunlink(fname):
    """Remove the file fname, silently doing nothing if it does not exist.

    Any other OSError raised by ``os.unlink()`` (permission denied, fname is
    a directory, ...) is propagated to the caller.
    """
    # On Python 3 an ENOENT failure is raised as FileNotFoundError.
    with contextlib.suppress(FileNotFoundError):
        os.unlink(fname)
|
175 | ||||
|
@contextlib.contextmanager
def editinplace(fname):
    """Context manager to rewrite the file fname in place.

    Yields a binary temporary file created in the same directory as fname.
    When the ``with`` body completes normally, the temporary file is closed,
    given the permission bits of fname (if fname exists) and renamed over
    fname. If the body raises, fname is left untouched. In both cases the
    temporary file is closed and any leftover is removed on exit.
    """
    n = os.path.basename(fname)
    d = os.path.dirname(fname)
    # delete=False: the file must survive close() so it can be renamed.
    fp = tempfile.NamedTemporaryFile(prefix='.%s-' % n, suffix='~', dir=d,
                                     delete=False)
    try:
        yield fp
        fp.close()
        try:
            # The temporary file is created with a restrictive mode; carry
            # over the target's permission bits (e.g. the exec bit) so an
            # in-place edit does not change them.
            shutil.copymode(fname, fp.name)
        except FileNotFoundError:
            # fname does not exist yet, so there is no mode to preserve
            pass
        # os.replace() overwrites an existing destination on every platform,
        # so no separate unlink of fname is needed on Windows.
        os.replace(fp.name, fname)
    finally:
        # Closing twice is harmless; after a successful rename the temporary
        # name no longer exists and tryunlink() ignores that.
        fp.close()
        tryunlink(fp.name)
|
191 | ||||
def main():
    """Command-line entry point: byteify every FILE given on the command line.

    With ``-i`` / ``--inplace`` each file is rewritten in place; otherwise the
    transformed source is written to standard output.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('-i', '--inplace', action='store_true', default=False,
                    help='edit files in place')
    ap.add_argument('files', metavar='FILE', nargs='+', help='source file')
    args = ap.parse_args()
    for fname in args.files:
        if args.inplace:
            # editinplace() must stay the outer context manager: the input
            # file is then closed before the temporary file is renamed over
            # it on exit.
            with editinplace(fname) as fout:
                with open(fname, 'rb') as fin:
                    process(fin, fout)
        else:
            with open(fname, 'rb') as fin:
                # process() writes bytes, hence the binary buffer of stdout
                fout = sys.stdout.buffer
                process(fin, fout)
173 |
|
207 | |||
# Run the command-line interface only when executed as a script.
if __name__ == '__main__':
    main()
General Comments 0
You need to be logged in to leave comments.
Login now