byteify-strings: add cli argument to handle `attr*()` when they are methods...
Raphaël Gomès
r42910:bbb002b3 default
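The new --allow-attr-methods flag extends the attr*()-style handling (getattr, setattr, hasattr, safehasattr, wrapfunction, wrapclass, addattr) to method-style calls, so their string name argument is kept as a native string instead of receiving a b'' prefix. A possible invocation combining it with the existing options (the file name and the --treat-as-kwargs values are only illustrations):

$ python3 byteify-strings.py --dictiter --treat-as-kwargs kwargs opts \
      --allow-attr-methods -i hgext/example.py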
@@ -1,301 +1,307 @@
1 1 #!/usr/bin/env python3
2 2 #
3 3 # byteify-strings.py - transform string literals to be Python 3 safe
4 4 #
5 5 # Copyright 2015 Gregory Szorc <gregory.szorc@gmail.com>
6 6 #
7 7 # This software may be used and distributed according to the terms of the
8 8 # GNU General Public License version 2 or any later version.
9 9
10 10 from __future__ import absolute_import, print_function
11 11
12 12 import argparse
13 13 import contextlib
14 14 import errno
15 15 import os
16 16 import sys
17 17 import tempfile
18 18 import token
19 19 import tokenize
20 20
21 21 def adjusttokenpos(t, ofs):
22 22 """Adjust start/end column of the given token"""
23 23 return t._replace(start=(t.start[0], t.start[1] + ofs),
24 24 end=(t.end[0], t.end[1] + ofs))
25 25
26 26 def replacetokens(tokens, opts):
27 27 """Transform a stream of tokens from raw to Python 3.
28 28
29 29 Returns a generator of possibly rewritten tokens.
30 30
31 31 The input token list may be mutated as part of processing. However,
32 32 its changes do not necessarily match the output token stream.
33 33 """
34 34 sysstrtokens = set()
35 35
36 36 # The following utility functions access the tokens list and i index of
37 37 # the for i, t in enumerate(tokens) loop below
38 38 def _isop(j, *o):
39 39 """Assert that tokens[j] is an OP with one of the given values"""
40 40 try:
41 41 return tokens[j].type == token.OP and tokens[j].string in o
42 42 except IndexError:
43 43 return False
44 44
45 45 def _findargnofcall(n):
46 46 """Find arg n of a call expression (start at 0)
47 47
48 48 Returns index of the first token of that argument, or None if
49 49 there are not that many arguments.
50 50
51 51 Assumes that token[i + 1] is '('.
52 52
53 53 """
54 54 nested = 0
55 55 for j in range(i + 2, len(tokens)):
56 56 if _isop(j, ')', ']', '}'):
57 57 # end of call, tuple, subscription or dict / set
58 58 nested -= 1
59 59 if nested < 0:
60 60 return None
61 61 elif n == 0:
62 62 # this is the starting position of arg
63 63 return j
64 64 elif _isop(j, '(', '[', '{'):
65 65 nested += 1
66 66 elif _isop(j, ',') and nested == 0:
67 67 n -= 1
68 68
69 69 return None
70 70
71 71 def _ensuresysstr(j):
72 72 """Make sure the token at j is a system string
73 73
74 74 Remember the given token so the string transformer won't add
75 75 the byte prefix.
76 76
77 77 Ignores tokens that are not strings. Assumes bounds checking has
78 78 already been done.
79 79
80 80 """
81 81 k = j
82 82 currtoken = tokens[k]
83 83 while currtoken.type in (token.STRING, token.NEWLINE, tokenize.NL):
84 84 k += 1
85 85 if (
86 86 currtoken.type == token.STRING
87 87 and currtoken.string.startswith(("'", '"'))
88 88 ):
89 89 sysstrtokens.add(currtoken)
90 90 try:
91 91 currtoken = tokens[k]
92 92 except IndexError:
93 93 break
94 94
95 95 def _isitemaccess(j):
96 96 """Assert the next tokens form an item access on `tokens[j]` and that
97 97 `tokens[j]` is a name.
98 98 """
99 99 try:
100 100 return (
101 101 tokens[j].type == token.NAME
102 102 and _isop(j + 1, '[')
103 103 and tokens[j + 2].type == token.STRING
104 104 and _isop(j + 3, ']')
105 105 )
106 106 except IndexError:
107 107 return False
108 108
109 109 def _ismethodcall(j, *methodnames):
110 110 """Assert the next tokens form a call to `methodname` with a string
111 111 as first argument on `tokens[j]` and that `tokens[j]` is a name.
112 112 """
113 113 try:
114 114 return (
115 115 tokens[j].type == token.NAME
116 116 and _isop(j + 1, '.')
117 117 and tokens[j + 2].type == token.NAME
118 118 and tokens[j + 2].string in methodnames
119 119 and _isop(j + 3, '(')
120 120 and tokens[j + 4].type == token.STRING
121 121 )
122 122 except IndexError:
123 123 return False
124 124
125 125 coldelta = 0 # column increment for new opening parens
126 126 coloffset = -1 # column offset for the current line (-1: TBD)
127 127 parens = [(0, 0, 0)] # stack of (line, end-column, column-offset)
128 128 ignorenextline = False # don't transform the next line
129 129 insideignoreblock = False # don't transform until turned off
130 130 for i, t in enumerate(tokens):
131 131 # Compute the column offset for the current line, such that
132 132 # the current line will be aligned to the last opening paren
133 133 # as before.
134 134 if coloffset < 0:
135 135 if t.start[1] == parens[-1][1]:
136 136 coloffset = parens[-1][2]
137 137 elif t.start[1] + 1 == parens[-1][1]:
138 138 # fix misaligned indent of s/util.Abort/error.Abort/
139 139 coloffset = parens[-1][2] + (parens[-1][1] - t.start[1])
140 140 else:
141 141 coloffset = 0
142 142
143 143 # Reset per-line attributes at EOL.
144 144 if t.type in (token.NEWLINE, tokenize.NL):
145 145 yield adjusttokenpos(t, coloffset)
146 146 coldelta = 0
147 147 coloffset = -1
148 148 if not insideignoreblock:
149 149 ignorenextline = (
150 150 tokens[i - 1].type == token.COMMENT
151 151 and tokens[i - 1].string == "#no-py3-transform"
152 152 )
153 153 continue
154 154
155 155 if t.type == token.COMMENT:
156 156 if t.string == "#py3-transform: off":
157 157 insideignoreblock = True
158 158 if t.string == "#py3-transform: on":
159 159 insideignoreblock = False
160 160
161 161 if ignorenextline or insideignoreblock:
162 162 yield adjusttokenpos(t, coloffset)
163 163 continue
164 164
165 165 # Remember the last paren position.
166 166 if _isop(i, '(', '[', '{'):
167 167 parens.append(t.end + (coloffset + coldelta,))
168 168 elif _isop(i, ')', ']', '}'):
169 169 parens.pop()
170 170
171 171 # Convert most string literals to byte literals. String literals
172 172 # in Python 2 are bytes. String literals in Python 3 are unicode.
173 173 # Most strings in Mercurial are bytes and unicode strings are rare.
174 174 # Rather than rewrite all string literals to use ``b''`` to indicate
175 175 # byte strings, we apply this token transformer to insert the ``b``
176 176 # prefix nearly everywhere.
177 177 if t.type == token.STRING and t not in sysstrtokens:
178 178 s = t.string
179 179
180 180 # Preserve docstrings as string literals. This is inconsistent
181 181 # with regular unprefixed strings. However, the
182 182 # "from __future__" parsing (which allows a module docstring to
183 183 # exist before it) doesn't properly handle the docstring if it
184 184 # is b''' prefixed, leading to a SyntaxError. We leave all
185 185 # docstrings as unprefixed to avoid this. This means Mercurial
186 186 # components touching docstrings need to handle unicode,
187 187 # unfortunately.
188 188 if s[0:3] in ("'''", '"""'):
189 189 # If it's assigned to something, it's not a docstring
190 190 if not _isop(i - 1, '='):
191 191 yield adjusttokenpos(t, coloffset)
192 192 continue
193 193
194 194 # If the first character isn't a quote, it is likely a string
195 195 # prefixing character (such as 'b', 'u', or 'r'. Ignore.
196 196 if s[0] not in ("'", '"'):
197 197 yield adjusttokenpos(t, coloffset)
198 198 continue
199 199
200 200 # String literal. Prefix to make a b'' string.
201 201 yield adjusttokenpos(t._replace(string='b%s' % t.string),
202 202 coloffset)
203 203 coldelta += 1
204 204 continue
205 205
206 206 # This looks like a function call.
207 207 if t.type == token.NAME and _isop(i + 1, '('):
208 208 fn = t.string
209 209
210 210 # *attr() builtins don't accept byte strings as their 2nd argument.
211 if (fn in ('getattr', 'setattr', 'hasattr', 'safehasattr') and
212 not _isop(i - 1, '.')):
211 if fn in (
212 'getattr', 'setattr', 'hasattr', 'safehasattr', 'wrapfunction',
213 'wrapclass', 'addattr'
214 ) and (opts['allow-attr-methods'] or not _isop(i - 1, '.')):
213 215 arg1idx = _findargnofcall(1)
214 216 if arg1idx is not None:
215 217 _ensuresysstr(arg1idx)
216 218
217 219 # .encode() and .decode() on str/bytes/unicode don't accept
218 220 # byte strings on Python 3.
219 221 elif fn in ('encode', 'decode') and _isop(i - 1, '.'):
220 222 for argn in range(2):
221 223 argidx = _findargnofcall(argn)
222 224 if argidx is not None:
223 225 _ensuresysstr(argidx)
224 226
225 227 # It changes iteritems/itervalues to items/values as they are not
226 228 # present in the Python 3 world.
227 229 elif opts['dictiter'] and fn in ('iteritems', 'itervalues'):
228 230 yield adjusttokenpos(t._replace(string=fn[4:]), coloffset)
229 231 continue
230 232
231 233 if t.type == token.NAME and t.string in opts['treat-as-kwargs']:
232 234 if _isitemaccess(i):
233 235 _ensuresysstr(i + 2)
234 236 if _ismethodcall(i, 'get', 'pop', 'setdefault', 'popitem'):
235 237 _ensuresysstr(i + 4)
236 238
237 239 # Looks like "if __name__ == '__main__'".
238 240 if (t.type == token.NAME and t.string == '__name__'
239 241 and _isop(i + 1, '==')):
240 242 _ensuresysstr(i + 2)
241 243
242 244 # Emit unmodified token.
243 245 yield adjusttokenpos(t, coloffset)
244 246
245 247 def process(fin, fout, opts):
246 248 tokens = tokenize.tokenize(fin.readline)
247 249 tokens = replacetokens(list(tokens), opts)
248 250 fout.write(tokenize.untokenize(tokens))
249 251
250 252 def tryunlink(fname):
251 253 try:
252 254 os.unlink(fname)
253 255 except OSError as err:
254 256 if err.errno != errno.ENOENT:
255 257 raise
256 258
257 259 @contextlib.contextmanager
258 260 def editinplace(fname):
259 261 n = os.path.basename(fname)
260 262 d = os.path.dirname(fname)
261 263 fp = tempfile.NamedTemporaryFile(prefix='.%s-' % n, suffix='~', dir=d,
262 264 delete=False)
263 265 try:
264 266 yield fp
265 267 fp.close()
266 268 if os.name == 'nt':
267 269 tryunlink(fname)
268 270 os.rename(fp.name, fname)
269 271 finally:
270 272 fp.close()
271 273 tryunlink(fp.name)
272 274
273 275 def main():
274 276 ap = argparse.ArgumentParser()
275 277 ap.add_argument('-i', '--inplace', action='store_true', default=False,
276 278 help='edit files in place')
277 279 ap.add_argument('--dictiter', action='store_true', default=False,
278 280 help='rewrite iteritems() and itervalues()'),
281 ap.add_argument('--allow-attr-methods', action='store_true',
282 default=False,
283 help='also handle attr*() when they are methods'),
279 284 ap.add_argument('--treat-as-kwargs', nargs="+", default=[],
280 285 help="ignore kwargs-like objects"),
281 286 ap.add_argument('files', metavar='FILE', nargs='+', help='source file')
282 287 args = ap.parse_args()
283 288 opts = {
284 289 'dictiter': args.dictiter,
285 290 'treat-as-kwargs': set(args.treat_as_kwargs),
291 'allow-attr-methods': args.allow_attr_methods,
286 292 }
287 293 for fname in args.files:
288 294 if args.inplace:
289 295 with editinplace(fname) as fout:
290 296 with open(fname, 'rb') as fin:
291 297 process(fin, fout, opts)
292 298 else:
293 299 with open(fname, 'rb') as fin:
294 300 fout = sys.stdout.buffer
295 301 process(fin, fout, opts)
296 302
297 303 if __name__ == '__main__':
298 304 if sys.version_info.major < 3:
299 305 print('This script must be run under Python 3.')
300 306 sys.exit(3)
301 307 main()
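A rough before/after illustration of the new behaviour (hypothetical input, not part of this change): by default, calls reached through an attribute lookup are skipped, so their string argument is byteified like any other literal; with --allow-attr-methods the second argument is kept as a native string, matching the existing treatment of the bare builtins.

# input
extensions.wrapfunction(ui, 'write', wrapped)
util.safehasattr(repo, 'dirstate')

# default output (method calls are skipped, literals become bytes)
extensions.wrapfunction(ui, b'write', wrapped)
util.safehasattr(repo, b'dirstate')

# output with --allow-attr-methods (the name argument stays a native str)
extensions.wrapfunction(ui, 'write', wrapped)
util.safehasattr(repo, 'dirstate')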