@@ -1,303 +1,301 @@
 #!/usr/bin/env python3
 #
 # byteify-strings.py - transform string literals to be Python 3 safe
 #
 # Copyright 2015 Gregory Szorc <gregory.szorc@gmail.com>
 #
 # This software may be used and distributed according to the terms of the
 # GNU General Public License version 2 or any later version.

 from __future__ import absolute_import, print_function

 import argparse
 import contextlib
 import errno
 import os
 import sys
 import tempfile
 import token
 import tokenize

 def adjusttokenpos(t, ofs):
     """Adjust start/end column of the given token"""
     return t._replace(start=(t.start[0], t.start[1] + ofs),
                       end=(t.end[0], t.end[1] + ofs))

 def replacetokens(tokens, opts):
     """Transform a stream of tokens from raw to Python 3.

     Returns a generator of possibly rewritten tokens.

     The input token list may be mutated as part of processing. However,
     its changes do not necessarily match the output token stream.
     """
     sysstrtokens = set()

     # The following utility functions access the tokens list and i index of
     # the for i, t enumerate(tokens) loop below
     def _isop(j, *o):
         """Assert that tokens[j] is an OP with one of the given values"""
         try:
             return tokens[j].type == token.OP and tokens[j].string in o
         except IndexError:
             return False

     def _findargnofcall(n):
         """Find arg n of a call expression (start at 0)

         Returns index of the first token of that argument, or None if
         there is not that many arguments.

         Assumes that token[i + 1] is '('.

         """
         nested = 0
         for j in range(i + 2, len(tokens)):
             if _isop(j, ')', ']', '}'):
                 # end of call, tuple, subscription or dict / set
                 nested -= 1
                 if nested < 0:
                     return None
             elif n == 0:
                 # this is the starting position of arg
                 return j
             elif _isop(j, '(', '[', '{'):
                 nested += 1
             elif _isop(j, ',') and nested == 0:
                 n -= 1

         return None

     def _ensuresysstr(j):
         """Make sure the token at j is a system string

         Remember the given token so the string transformer won't add
         the byte prefix.

         Ignores tokens that are not strings. Assumes bounds checking has
         already been done.

         """
         k = j
         currtoken = tokens[k]
         while currtoken.type in (token.STRING, token.NEWLINE, tokenize.NL):
             k += 1
             if (
                 currtoken.type == token.STRING
                 and currtoken.string.startswith(("'", '"'))
             ):
                 sysstrtokens.add(currtoken)
             try:
                 currtoken = tokens[k]
             except IndexError:
                 break

     def _isitemaccess(j):
         """Assert the next tokens form an item access on `tokens[j]` and that
         `tokens[j]` is a name.
         """
         try:
             return (
                 tokens[j].type == token.NAME
                 and _isop(j + 1, '[')
                 and tokens[j + 2].type == token.STRING
                 and _isop(j + 3, ']')
             )
         except IndexError:
             return False

     def _ismethodcall(j, *methodnames):
         """Assert the next tokens form a call to `methodname` with a string
         as first argument on `tokens[j]` and that `tokens[j]` is a name.
         """
         try:
             return (
                 tokens[j].type == token.NAME
                 and _isop(j + 1, '.')
                 and tokens[j + 2].type == token.NAME
                 and tokens[j + 2].string in methodnames
                 and _isop(j + 3, '(')
                 and tokens[j + 4].type == token.STRING
             )
         except IndexError:
             return False

     coldelta = 0  # column increment for new opening parens
     coloffset = -1  # column offset for the current line (-1: TBD)
     parens = [(0, 0, 0)]  # stack of (line, end-column, column-offset)
     ignorenextline = False  # don't transform the next line
     insideignoreblock = False  # don't transform until turned off
     for i, t in enumerate(tokens):
         # Compute the column offset for the current line, such that
         # the current line will be aligned to the last opening paren
         # as before.
         if coloffset < 0:
             if t.start[1] == parens[-1][1]:
                 coloffset = parens[-1][2]
             elif t.start[1] + 1 == parens[-1][1]:
                 # fix misaligned indent of s/util.Abort/error.Abort/
                 coloffset = parens[-1][2] + (parens[-1][1] - t.start[1])
             else:
                 coloffset = 0

         # Reset per-line attributes at EOL.
         if t.type in (token.NEWLINE, tokenize.NL):
             yield adjusttokenpos(t, coloffset)
             coldelta = 0
             coloffset = -1
             if not insideignoreblock:
                 ignorenextline = (
                     tokens[i - 1].type == token.COMMENT
                     and tokens[i - 1].string == "#no-py3-transform"
                 )
             continue

         if t.type == token.COMMENT:
             if t.string == "#py3-transform: off":
                 insideignoreblock = True
             if t.string == "#py3-transform: on":
                 insideignoreblock = False

         if ignorenextline or insideignoreblock:
             yield adjusttokenpos(t, coloffset)
             continue

         # Remember the last paren position.
         if _isop(i, '(', '[', '{'):
             parens.append(t.end + (coloffset + coldelta,))
         elif _isop(i, ')', ']', '}'):
             parens.pop()

         # Convert most string literals to byte literals. String literals
         # in Python 2 are bytes. String literals in Python 3 are unicode.
         # Most strings in Mercurial are bytes and unicode strings are rare.
         # Rather than rewrite all string literals to use ``b''`` to indicate
         # byte strings, we apply this token transformer to insert the ``b``
         # prefix nearly everywhere.
         if t.type == token.STRING and t not in sysstrtokens:
             s = t.string

             # Preserve docstrings as string literals. This is inconsistent
             # with regular unprefixed strings. However, the
             # "from __future__" parsing (which allows a module docstring to
             # exist before it) doesn't properly handle the docstring if it
             # is b''' prefixed, leading to a SyntaxError. We leave all
             # docstrings as unprefixed to avoid this. This means Mercurial
             # components touching docstrings need to handle unicode,
             # unfortunately.
             if s[0:3] in ("'''", '"""'):
                 # If it's assigned to something, it's not a docstring
                 if not _isop(i - 1, '='):
                     yield adjusttokenpos(t, coloffset)
                     continue

             # If the first character isn't a quote, it is likely a string
             # prefixing character (such as 'b', 'u', or 'r'. Ignore.
             if s[0] not in ("'", '"'):
                 yield adjusttokenpos(t, coloffset)
                 continue

             # String literal. Prefix to make a b'' string.
             yield adjusttokenpos(t._replace(string='b%s' % t.string),
                                  coloffset)
             coldelta += 1
             continue

         # This looks like a function call.
         if t.type == token.NAME and _isop(i + 1, '('):
             fn = t.string

             # *attr() builtins don't accept byte strings to 2nd argument.
             if (fn in ('getattr', 'setattr', 'hasattr', 'safehasattr') and
                 not _isop(i - 1, '.')):
                 arg1idx = _findargnofcall(1)
                 if arg1idx is not None:
                     _ensuresysstr(arg1idx)

             # .encode() and .decode() on str/bytes/unicode don't accept
             # byte strings on Python 3.
             elif fn in ('encode', 'decode') and _isop(i - 1, '.'):
                 for argn in range(2):
                     argidx = _findargnofcall(argn)
                     if argidx is not None:
                         _ensuresysstr(argidx)

             # It changes iteritems/values to items/values as they are not
             # present in Python 3 world.
             elif opts['dictiter'] and fn in ('iteritems', 'itervalues'):
                 yield adjusttokenpos(t._replace(string=fn[4:]), coloffset)
                 continue

         if t.type == token.NAME and t.string in opts['treat-as-kwargs']:
             if _isitemaccess(i):
                 _ensuresysstr(i + 2)
             if _ismethodcall(i, 'get', 'pop', 'setdefault', 'popitem'):
                 _ensuresysstr(i + 4)

         # Looks like "if __name__ == '__main__'".
         if (t.type == token.NAME and t.string == '__name__'
             and _isop(i + 1, '==')):
             _ensuresysstr(i + 2)

         # Emit unmodified token.
         yield adjusttokenpos(t, coloffset)

 def process(fin, fout, opts):
     tokens = tokenize.tokenize(fin.readline)
     tokens = replacetokens(list(tokens), opts)
     fout.write(tokenize.untokenize(tokens))

 def tryunlink(fname):
     try:
         os.unlink(fname)
     except OSError as err:
         if err.errno != errno.ENOENT:
             raise

 @contextlib.contextmanager
 def editinplace(fname):
     n = os.path.basename(fname)
     d = os.path.dirname(fname)
     fp = tempfile.NamedTemporaryFile(prefix='.%s-' % n, suffix='~', dir=d,
                                      delete=False)
     try:
         yield fp
         fp.close()
         if os.name == 'nt':
             tryunlink(fname)
         os.rename(fp.name, fname)
     finally:
         fp.close()
         tryunlink(fp.name)

 def main():
     ap = argparse.ArgumentParser()
     ap.add_argument('-i', '--inplace', action='store_true', default=False,
                     help='edit files in place')
     ap.add_argument('--dictiter', action='store_true', default=False,
                     help='rewrite iteritems() and itervalues()'),
-    ap.add_argument('--treat-as-kwargs', nargs="+",
+    ap.add_argument('--treat-as-kwargs', nargs="+", default=[],
                     help="ignore kwargs-like objects"),
     ap.add_argument('files', metavar='FILE', nargs='+', help='source file')
     args = ap.parse_args()
     opts = {
         'dictiter': args.dictiter,
-        'treat-as-kwargs': set(
-            args.treat_as_kwargs
-        ) if args.treat_as_kwargs else set()
+        'treat-as-kwargs': set(args.treat_as_kwargs),
     }
     for fname in args.files:
         if args.inplace:
             with editinplace(fname) as fout:
                 with open(fname, 'rb') as fin:
                     process(fin, fout, opts)
         else:
             with open(fname, 'rb') as fin:
                 fout = sys.stdout.buffer
                 process(fin, fout, opts)

 if __name__ == '__main__':
     if sys.version_info.major < 3:
         print('This script must be run under Python 3.')
         sys.exit(3)
     main()
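
The point of the new default=[] can be seen with a short stand-alone argparse sketch (not part of the patch, purely illustrative): with nargs="+" and no default, an omitted --treat-as-kwargs leaves args.treat_as_kwargs as None, which is why the old code needed the "... if args.treat_as_kwargs else set()" guard; with default=[], the plain set(args.treat_as_kwargs) is always safe.

    import argparse

    ap = argparse.ArgumentParser()
    ap.add_argument('--treat-as-kwargs', nargs="+", default=[])

    # Flag omitted: default=[] means set() never sees None.
    args = ap.parse_args([])
    assert set(args.treat_as_kwargs) == set()

    # Flag given: nargs="+" collects the names into a list.
    args = ap.parse_args(['--treat-as-kwargs', 'opts', 'kwargs'])
    assert set(args.treat_as_kwargs) == {'opts', 'kwargs'}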