##// END OF EJS Templates
byteify-strings: add cli argument to handle `attr*()` when they are methods...
Raphaël Gomès -
r42910:bbb002b3 default
parent child Browse files
Show More
@@ -1,301 +1,307 b''
1 #!/usr/bin/env python3
1 #!/usr/bin/env python3
2 #
2 #
3 # byteify-strings.py - transform string literals to be Python 3 safe
3 # byteify-strings.py - transform string literals to be Python 3 safe
4 #
4 #
5 # Copyright 2015 Gregory Szorc <gregory.szorc@gmail.com>
5 # Copyright 2015 Gregory Szorc <gregory.szorc@gmail.com>
6 #
6 #
7 # This software may be used and distributed according to the terms of the
7 # This software may be used and distributed according to the terms of the
8 # GNU General Public License version 2 or any later version.
8 # GNU General Public License version 2 or any later version.
9
9
10 from __future__ import absolute_import, print_function
10 from __future__ import absolute_import, print_function
11
11
12 import argparse
12 import argparse
13 import contextlib
13 import contextlib
14 import errno
14 import errno
15 import os
15 import os
16 import sys
16 import sys
17 import tempfile
17 import tempfile
18 import token
18 import token
19 import tokenize
19 import tokenize
20
20
def adjusttokenpos(t, ofs):
    """Return a copy of token *t* with its start/end columns shifted by *ofs*.

    Only the column components move; the line numbers are untouched.
    """
    (srow, scol), (erow, ecol) = t.start, t.end
    return t._replace(start=(srow, scol + ofs), end=(erow, ecol + ofs))
25
25
def replacetokens(tokens, opts):
    """Transform a stream of tokens from raw to Python 3.

    Returns a generator of possibly rewritten tokens.

    The input token list may be mutated as part of processing. However,
    its changes do not necessarily match the output token stream.

    tokens: a list (not a bare iterator -- it is indexed by position) of
        tokenize.TokenInfo namedtuples.
    opts: dict with keys 'dictiter', 'treat-as-kwargs' and
        'allow-attr-methods' enabling the optional rewrites below.
    """
    # Tokens recorded here are left as native (unicode) strings instead of
    # being given a b'' prefix.
    sysstrtokens = set()

    # The following utility functions access the tokens list and i index of
    # the for i, t enumerate(tokens) loop below
    def _isop(j, *o):
        """Assert that tokens[j] is an OP with one of the given values"""
        try:
            return tokens[j].type == token.OP and tokens[j].string in o
        except IndexError:
            return False

    def _findargnofcall(n):
        """Find arg n of a call expression (start at 0)

        Returns index of the first token of that argument, or None if
        there is not that many arguments.

        Assumes that token[i + 1] is '('.

        """
        nested = 0
        for j in range(i + 2, len(tokens)):
            if _isop(j, ')', ']', '}'):
                # end of call, tuple, subscription or dict / set
                nested -= 1
                if nested < 0:
                    return None
            elif n == 0:
                # this is the starting position of arg
                return j
            elif _isop(j, '(', '[', '{'):
                nested += 1
            elif _isop(j, ',') and nested == 0:
                n -= 1

        return None

    def _ensuresysstr(j):
        """Make sure the token at j is a system string

        Remember the given token so the string transformer won't add
        the byte prefix.

        Ignores tokens that are not strings. Assumes bounds checking has
        already been done.

        """
        k = j
        currtoken = tokens[k]
        # Walk forward over adjacent STRING/newline tokens so that implicit
        # string concatenation ("a" "b" across lines) is handled as a whole.
        while currtoken.type in (token.STRING, token.NEWLINE, tokenize.NL):
            k += 1
            if (
                currtoken.type == token.STRING
                and currtoken.string.startswith(("'", '"'))
            ):
                sysstrtokens.add(currtoken)
            try:
                currtoken = tokens[k]
            except IndexError:
                break

    def _isitemaccess(j):
        """Assert the next tokens form an item access on `tokens[j]` and that
        `tokens[j]` is a name.
        """
        try:
            return (
                tokens[j].type == token.NAME
                and _isop(j + 1, '[')
                and tokens[j + 2].type == token.STRING
                and _isop(j + 3, ']')
            )
        except IndexError:
            return False

    def _ismethodcall(j, *methodnames):
        """Assert the next tokens form a call to `methodname` with a string
        as first argument on `tokens[j]` and that `tokens[j]` is a name.
        """
        try:
            return (
                tokens[j].type == token.NAME
                and _isop(j + 1, '.')
                and tokens[j + 2].type == token.NAME
                and tokens[j + 2].string in methodnames
                and _isop(j + 3, '(')
                and tokens[j + 4].type == token.STRING
            )
        except IndexError:
            return False

    coldelta = 0  # column increment for new opening parens
    coloffset = -1  # column offset for the current line (-1: TBD)
    # Sentinel entry keeps parens[-1] valid before any paren is seen.
    parens = [(0, 0, 0)]  # stack of (line, end-column, column-offset)
    ignorenextline = False  # don't transform the next line
    insideignoreblock = False  # don't transform until turned off
    for i, t in enumerate(tokens):
        # Compute the column offset for the current line, such that
        # the current line will be aligned to the last opening paren
        # as before.
        if coloffset < 0:
            if t.start[1] == parens[-1][1]:
                coloffset = parens[-1][2]
            elif t.start[1] + 1 == parens[-1][1]:
                # fix misaligned indent of s/util.Abort/error.Abort/
                coloffset = parens[-1][2] + (parens[-1][1] - t.start[1])
            else:
                coloffset = 0

        # Reset per-line attributes at EOL.
        if t.type in (token.NEWLINE, tokenize.NL):
            yield adjusttokenpos(t, coloffset)
            coldelta = 0
            coloffset = -1
            if not insideignoreblock:
                # A "#no-py3-transform" comment ending this line disables
                # rewriting for the following line only.
                ignorenextline = (
                    tokens[i - 1].type == token.COMMENT
                    and tokens[i - 1].string == "#no-py3-transform"
                )
            continue

        if t.type == token.COMMENT:
            if t.string == "#py3-transform: off":
                insideignoreblock = True
            if t.string == "#py3-transform: on":
                insideignoreblock = False

        if ignorenextline or insideignoreblock:
            yield adjusttokenpos(t, coloffset)
            continue

        # Remember the last paren position.
        if _isop(i, '(', '[', '{'):
            parens.append(t.end + (coloffset + coldelta,))
        elif _isop(i, ')', ']', '}'):
            parens.pop()

        # Convert most string literals to byte literals. String literals
        # in Python 2 are bytes. String literals in Python 3 are unicode.
        # Most strings in Mercurial are bytes and unicode strings are rare.
        # Rather than rewrite all string literals to use ``b''`` to indicate
        # byte strings, we apply this token transformer to insert the ``b``
        # prefix nearly everywhere.
        if t.type == token.STRING and t not in sysstrtokens:
            s = t.string

            # Preserve docstrings as string literals. This is inconsistent
            # with regular unprefixed strings. However, the
            # "from __future__" parsing (which allows a module docstring to
            # exist before it) doesn't properly handle the docstring if it
            # is b''' prefixed, leading to a SyntaxError. We leave all
            # docstrings as unprefixed to avoid this. This means Mercurial
            # components touching docstrings need to handle unicode,
            # unfortunately.
            if s[0:3] in ("'''", '"""'):
                # If it's assigned to something, it's not a docstring
                if not _isop(i - 1, '='):
                    yield adjusttokenpos(t, coloffset)
                    continue

            # If the first character isn't a quote, it is likely a string
            # prefixing character (such as 'b', 'u', or 'r'. Ignore.
            if s[0] not in ("'", '"'):
                yield adjusttokenpos(t, coloffset)
                continue

            # String literal. Prefix to make a b'' string.
            yield adjusttokenpos(t._replace(string='b%s' % t.string),
                                 coloffset)
            coldelta += 1
            continue

        # This looks like a function call.
        if t.type == token.NAME and _isop(i + 1, '('):
            fn = t.string

            # *attr() builtins don't accept byte strings to 2nd argument.
            # With 'allow-attr-methods' these are rewritten even when they
            # appear as method calls (e.g. extensions.wrapfunction).
            if fn in (
                'getattr', 'setattr', 'hasattr', 'safehasattr', 'wrapfunction',
                'wrapclass', 'addattr'
            ) and (opts['allow-attr-methods'] or not _isop(i - 1, '.')):
                arg1idx = _findargnofcall(1)
                if arg1idx is not None:
                    _ensuresysstr(arg1idx)

            # .encode() and .decode() on str/bytes/unicode don't accept
            # byte strings on Python 3.
            elif fn in ('encode', 'decode') and _isop(i - 1, '.'):
                for argn in range(2):
                    argidx = _findargnofcall(argn)
                    if argidx is not None:
                        _ensuresysstr(argidx)

            # It changes iteritems/values to items/values as they are not
            # present in Python 3 world.
            elif opts['dictiter'] and fn in ('iteritems', 'itervalues'):
                yield adjusttokenpos(t._replace(string=fn[4:]), coloffset)
                continue

        # Names listed in 'treat-as-kwargs' behave like **kwargs dicts whose
        # keys must stay native strings.
        if t.type == token.NAME and t.string in opts['treat-as-kwargs']:
            if _isitemaccess(i):
                _ensuresysstr(i + 2)
            if _ismethodcall(i, 'get', 'pop', 'setdefault', 'popitem'):
                _ensuresysstr(i + 4)

        # Looks like "if __name__ == '__main__'".
        if (t.type == token.NAME and t.string == '__name__'
            and _isop(i + 1, '==')):
            _ensuresysstr(i + 2)

        # Emit unmodified token.
        yield adjusttokenpos(t, coloffset)
244
246
def process(fin, fout, opts):
    """Byteify the source read from *fin* (binary) and write it to *fout*.

    *opts* is the options dict consumed by ``replacetokens``.
    """
    raw = list(tokenize.tokenize(fin.readline))
    rewritten = replacetokens(raw, opts)
    fout.write(tokenize.untokenize(rewritten))
249
251
def tryunlink(fname):
    """Best-effort os.unlink: a nonexistent *fname* is not an error."""
    try:
        os.unlink(fname)
    except OSError as err:
        if err.errno == errno.ENOENT:
            return  # already gone -- nothing to do
        raise
256
258
@contextlib.contextmanager
def editinplace(fname):
    """Context manager yielding a temp file that atomically replaces *fname*.

    The temporary file is created in the same directory as *fname* so the
    final os.rename cannot cross filesystems.  On a clean exit the target
    is replaced by the temporary file; on an exception the temporary file
    is removed and *fname* is left untouched.
    """
    basename = os.path.basename(fname)
    dirname = os.path.dirname(fname)
    tmp = tempfile.NamedTemporaryFile(prefix='.%s-' % basename, suffix='~',
                                      dir=dirname, delete=False)
    try:
        yield tmp
        tmp.close()
        if os.name == 'nt':
            # Windows cannot rename over an existing file.
            tryunlink(fname)
        os.rename(tmp.name, fname)
    finally:
        tmp.close()
        tryunlink(tmp.name)
272
274
def main():
    """Command-line entry point: parse arguments and byteify each FILE.

    With --inplace each file is rewritten atomically via editinplace();
    otherwise the transformed source is written to stdout.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('-i', '--inplace', action='store_true', default=False,
                    help='edit files in place')
    # NOTE: the original had stray trailing commas after some
    # add_argument() calls, turning the statements into throwaway tuple
    # expressions; they have been removed.
    ap.add_argument('--dictiter', action='store_true', default=False,
                    help='rewrite iteritems() and itervalues()')
    ap.add_argument('--allow-attr-methods', action='store_true',
                    default=False,
                    help='also handle attr*() when they are methods')
    ap.add_argument('--treat-as-kwargs', nargs="+", default=[],
                    help="ignore kwargs-like objects")
    ap.add_argument('files', metavar='FILE', nargs='+', help='source file')
    args = ap.parse_args()
    # Options consumed by replacetokens(); keys use dashes to mirror the
    # command-line flag spelling.
    opts = {
        'dictiter': args.dictiter,
        'treat-as-kwargs': set(args.treat_as_kwargs),
        'allow-attr-methods': args.allow_attr_methods,
    }
    for fname in args.files:
        if args.inplace:
            with editinplace(fname) as fout:
                with open(fname, 'rb') as fin:
                    process(fin, fout, opts)
        else:
            with open(fname, 'rb') as fin:
                # tokenize emits bytes, so write to the binary stdout buffer.
                fout = sys.stdout.buffer
                process(fin, fout, opts)
296
302
if __name__ == '__main__':
    # The transform relies on Python 3 tokenize/str semantics; refuse to
    # run under Python 2 with a distinctive exit code.
    if sys.version_info.major < 3:
        print('This script must be run under Python 3.')
        sys.exit(3)
    main()
General Comments 0
You need to be logged in to leave comments. Login now