byteify-strings: simplify default value for `--treat-as-kwargs`
Raphaël Gomès
r42909:5e296f61 default
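The simplification relies on standard argparse behavior: without a default, an omitted `--treat-as-kwargs` yields `None`, and `set(None)` raises `TypeError`, which is what the old `set(args.treat_as_kwargs) if args.treat_as_kwargs else set()` guard worked around. With `default=[]` the attribute is always a list, so the guard collapses to a plain `set(args.treat_as_kwargs)`. A minimal standalone sketch of that behavior (illustration only, not part of the change):

    import argparse

    ap = argparse.ArgumentParser()
    ap.add_argument('--treat-as-kwargs', nargs="+", default=[])

    # Flag omitted: default=[] keeps the value iterable, so set() always works.
    assert ap.parse_args([]).treat_as_kwargs == []
    assert set(ap.parse_args([]).treat_as_kwargs) == set()

    # Flag given: the same expression covers this case too.
    ns = ap.parse_args(['--treat-as-kwargs', 'kwargs', 'opts'])
    assert set(ns.treat_as_kwargs) == {'kwargs', 'opts'}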
@@ -1,303 +1,301 @@
 #!/usr/bin/env python3
 #
 # byteify-strings.py - transform string literals to be Python 3 safe
 #
 # Copyright 2015 Gregory Szorc <gregory.szorc@gmail.com>
 #
 # This software may be used and distributed according to the terms of the
 # GNU General Public License version 2 or any later version.

 from __future__ import absolute_import, print_function

 import argparse
 import contextlib
 import errno
 import os
 import sys
 import tempfile
 import token
 import tokenize

 def adjusttokenpos(t, ofs):
     """Adjust start/end column of the given token"""
     return t._replace(start=(t.start[0], t.start[1] + ofs),
                       end=(t.end[0], t.end[1] + ofs))

 def replacetokens(tokens, opts):
     """Transform a stream of tokens from raw to Python 3.

     Returns a generator of possibly rewritten tokens.

     The input token list may be mutated as part of processing. However,
     its changes do not necessarily match the output token stream.
     """
     sysstrtokens = set()

     # The following utility functions access the tokens list and i index of
     # the for i, t enumerate(tokens) loop below
     def _isop(j, *o):
         """Assert that tokens[j] is an OP with one of the given values"""
         try:
             return tokens[j].type == token.OP and tokens[j].string in o
         except IndexError:
             return False

     def _findargnofcall(n):
         """Find arg n of a call expression (start at 0)

         Returns index of the first token of that argument, or None if
         there is not that many arguments.

         Assumes that token[i + 1] is '('.

         """
         nested = 0
         for j in range(i + 2, len(tokens)):
             if _isop(j, ')', ']', '}'):
                 # end of call, tuple, subscription or dict / set
                 nested -= 1
                 if nested < 0:
                     return None
             elif n == 0:
                 # this is the starting position of arg
                 return j
             elif _isop(j, '(', '[', '{'):
                 nested += 1
             elif _isop(j, ',') and nested == 0:
                 n -= 1

         return None

     def _ensuresysstr(j):
         """Make sure the token at j is a system string

         Remember the given token so the string transformer won't add
         the byte prefix.

         Ignores tokens that are not strings. Assumes bounds checking has
         already been done.

         """
         k = j
         currtoken = tokens[k]
         while currtoken.type in (token.STRING, token.NEWLINE, tokenize.NL):
             k += 1
             if (
                 currtoken.type == token.STRING
                 and currtoken.string.startswith(("'", '"'))
             ):
                 sysstrtokens.add(currtoken)
             try:
                 currtoken = tokens[k]
             except IndexError:
                 break

     def _isitemaccess(j):
         """Assert the next tokens form an item access on `tokens[j]` and that
         `tokens[j]` is a name.
         """
         try:
             return (
                 tokens[j].type == token.NAME
                 and _isop(j + 1, '[')
                 and tokens[j + 2].type == token.STRING
                 and _isop(j + 3, ']')
             )
         except IndexError:
             return False

     def _ismethodcall(j, *methodnames):
         """Assert the next tokens form a call to `methodname` with a string
         as first argument on `tokens[j]` and that `tokens[j]` is a name.
         """
         try:
             return (
                 tokens[j].type == token.NAME
                 and _isop(j + 1, '.')
                 and tokens[j + 2].type == token.NAME
                 and tokens[j + 2].string in methodnames
                 and _isop(j + 3, '(')
                 and tokens[j + 4].type == token.STRING
             )
         except IndexError:
             return False

     coldelta = 0  # column increment for new opening parens
     coloffset = -1  # column offset for the current line (-1: TBD)
     parens = [(0, 0, 0)]  # stack of (line, end-column, column-offset)
     ignorenextline = False  # don't transform the next line
     insideignoreblock = False  # don't transform until turned off
     for i, t in enumerate(tokens):
         # Compute the column offset for the current line, such that
         # the current line will be aligned to the last opening paren
         # as before.
         if coloffset < 0:
             if t.start[1] == parens[-1][1]:
                 coloffset = parens[-1][2]
             elif t.start[1] + 1 == parens[-1][1]:
                 # fix misaligned indent of s/util.Abort/error.Abort/
                 coloffset = parens[-1][2] + (parens[-1][1] - t.start[1])
             else:
                 coloffset = 0

         # Reset per-line attributes at EOL.
         if t.type in (token.NEWLINE, tokenize.NL):
             yield adjusttokenpos(t, coloffset)
             coldelta = 0
             coloffset = -1
             if not insideignoreblock:
                 ignorenextline = (
                     tokens[i - 1].type == token.COMMENT
                     and tokens[i - 1].string == "#no-py3-transform"
                 )
             continue

         if t.type == token.COMMENT:
             if t.string == "#py3-transform: off":
                 insideignoreblock = True
             if t.string == "#py3-transform: on":
                 insideignoreblock = False

         if ignorenextline or insideignoreblock:
             yield adjusttokenpos(t, coloffset)
             continue

         # Remember the last paren position.
         if _isop(i, '(', '[', '{'):
             parens.append(t.end + (coloffset + coldelta,))
         elif _isop(i, ')', ']', '}'):
             parens.pop()

         # Convert most string literals to byte literals. String literals
         # in Python 2 are bytes. String literals in Python 3 are unicode.
         # Most strings in Mercurial are bytes and unicode strings are rare.
         # Rather than rewrite all string literals to use ``b''`` to indicate
         # byte strings, we apply this token transformer to insert the ``b``
         # prefix nearly everywhere.
         if t.type == token.STRING and t not in sysstrtokens:
             s = t.string

             # Preserve docstrings as string literals. This is inconsistent
             # with regular unprefixed strings. However, the
             # "from __future__" parsing (which allows a module docstring to
             # exist before it) doesn't properly handle the docstring if it
             # is b''' prefixed, leading to a SyntaxError. We leave all
             # docstrings as unprefixed to avoid this. This means Mercurial
             # components touching docstrings need to handle unicode,
             # unfortunately.
             if s[0:3] in ("'''", '"""'):
                 # If it's assigned to something, it's not a docstring
                 if not _isop(i - 1, '='):
                     yield adjusttokenpos(t, coloffset)
                     continue

             # If the first character isn't a quote, it is likely a string
             # prefixing character (such as 'b', 'u', or 'r'. Ignore.
             if s[0] not in ("'", '"'):
                 yield adjusttokenpos(t, coloffset)
                 continue

             # String literal. Prefix to make a b'' string.
             yield adjusttokenpos(t._replace(string='b%s' % t.string),
                                  coloffset)
             coldelta += 1
             continue

         # This looks like a function call.
         if t.type == token.NAME and _isop(i + 1, '('):
             fn = t.string

             # *attr() builtins don't accept byte strings to 2nd argument.
             if (fn in ('getattr', 'setattr', 'hasattr', 'safehasattr') and
                     not _isop(i - 1, '.')):
                 arg1idx = _findargnofcall(1)
                 if arg1idx is not None:
                     _ensuresysstr(arg1idx)

             # .encode() and .decode() on str/bytes/unicode don't accept
             # byte strings on Python 3.
             elif fn in ('encode', 'decode') and _isop(i - 1, '.'):
                 for argn in range(2):
                     argidx = _findargnofcall(argn)
                     if argidx is not None:
                         _ensuresysstr(argidx)

             # It changes iteritems/values to items/values as they are not
             # present in Python 3 world.
             elif opts['dictiter'] and fn in ('iteritems', 'itervalues'):
                 yield adjusttokenpos(t._replace(string=fn[4:]), coloffset)
                 continue

         if t.type == token.NAME and t.string in opts['treat-as-kwargs']:
             if _isitemaccess(i):
                 _ensuresysstr(i + 2)
             if _ismethodcall(i, 'get', 'pop', 'setdefault', 'popitem'):
                 _ensuresysstr(i + 4)

         # Looks like "if __name__ == '__main__'".
         if (t.type == token.NAME and t.string == '__name__'
             and _isop(i + 1, '==')):
             _ensuresysstr(i + 2)

         # Emit unmodified token.
         yield adjusttokenpos(t, coloffset)

 def process(fin, fout, opts):
     tokens = tokenize.tokenize(fin.readline)
     tokens = replacetokens(list(tokens), opts)
     fout.write(tokenize.untokenize(tokens))

 def tryunlink(fname):
     try:
         os.unlink(fname)
     except OSError as err:
         if err.errno != errno.ENOENT:
             raise

 @contextlib.contextmanager
 def editinplace(fname):
     n = os.path.basename(fname)
     d = os.path.dirname(fname)
     fp = tempfile.NamedTemporaryFile(prefix='.%s-' % n, suffix='~', dir=d,
                                      delete=False)
     try:
         yield fp
         fp.close()
         if os.name == 'nt':
             tryunlink(fname)
         os.rename(fp.name, fname)
     finally:
         fp.close()
         tryunlink(fp.name)

 def main():
     ap = argparse.ArgumentParser()
     ap.add_argument('-i', '--inplace', action='store_true', default=False,
                     help='edit files in place')
     ap.add_argument('--dictiter', action='store_true', default=False,
                     help='rewrite iteritems() and itervalues()'),
-    ap.add_argument('--treat-as-kwargs', nargs="+",
+    ap.add_argument('--treat-as-kwargs', nargs="+", default=[],
                     help="ignore kwargs-like objects"),
     ap.add_argument('files', metavar='FILE', nargs='+', help='source file')
     args = ap.parse_args()
     opts = {
         'dictiter': args.dictiter,
-        'treat-as-kwargs': set(
-            args.treat_as_kwargs
-        ) if args.treat_as_kwargs else set()
+        'treat-as-kwargs': set(args.treat_as_kwargs),
     }
     for fname in args.files:
         if args.inplace:
             with editinplace(fname) as fout:
                 with open(fname, 'rb') as fin:
                     process(fin, fout, opts)
         else:
             with open(fname, 'rb') as fin:
                 fout = sys.stdout.buffer
                 process(fin, fout, opts)

 if __name__ == '__main__':
     if sys.version_info.major < 3:
         print('This script must be run under Python 3.')
         sys.exit(3)
     main()
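As an end-to-end illustration of the option, here is a hypothetical input file and the approximate result; `example.py` and its contents are invented for this sketch, the flag matches the argparse definition above, and exact whitespace in the real output can differ slightly since it is produced by tokenize.untokenize:

    $ cat example.py
    opts = {}
    opts['verbose'] = 'yes'
    print(opts.get('verbose'))

    $ python3 byteify-strings.py --treat-as-kwargs opts example.py
    opts = {}
    opts['verbose'] = b'yes'
    print(opts.get('verbose'))

Without `--treat-as-kwargs opts`, the key strings would be rewritten as well, giving `opts[b'verbose']` and `opts.get(b'verbose')`.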