##// END OF EJS Templates
byteify-strings: simplify default value for `--treat-as-kwargs`
Raphaël Gomès -
r42909:5e296f61 default
parent child Browse files
Show More
@@ -1,303 +1,301 b''
1 1 #!/usr/bin/env python3
2 2 #
3 3 # byteify-strings.py - transform string literals to be Python 3 safe
4 4 #
5 5 # Copyright 2015 Gregory Szorc <gregory.szorc@gmail.com>
6 6 #
7 7 # This software may be used and distributed according to the terms of the
8 8 # GNU General Public License version 2 or any later version.
9 9
10 10 from __future__ import absolute_import, print_function
11 11
12 12 import argparse
13 13 import contextlib
14 14 import errno
15 15 import os
16 16 import sys
17 17 import tempfile
18 18 import token
19 19 import tokenize
20 20
def adjusttokenpos(t, ofs):
    """Adjust start/end column of the given token.

    ``t`` is a token tuple exposing ``start`` and ``end`` as
    ``(row, column)`` pairs and supporting ``_replace()`` (as
    ``tokenize.TokenInfo`` does). ``ofs`` is added to both columns; the
    rows and every other field are left untouched.

    Returns a new token; ``t`` itself is not modified.
    """
    return t._replace(start=(t.start[0], t.start[1] + ofs),
                      end=(t.end[0], t.end[1] + ofs))
25 25
def replacetokens(tokens, opts):
    """Transform a stream of tokens from raw to Python 3.

    ``tokens`` is an indexable sequence of tokenize tokens (the helpers
    below look ahead and behind by index, so a one-shot iterator will not
    do). ``opts`` is a dict with two keys: ``'dictiter'`` (truthy to
    rewrite ``iteritems``/``itervalues`` calls) and ``'treat-as-kwargs'``
    (a container of names whose string keys must stay unprefixed).

    Returns a generator of possibly rewritten tokens.

    The input token list may be mutated as part of processing. However,
    its changes do not necessarily match the output token stream.
    """
    # Tokens that must be left as native (unprefixed) strings.
    sysstrtokens = set()

    # The following utility functions access the tokens list and i index of
    # the ``for i, t in enumerate(tokens)`` loop below
    def _isop(j, *o):
        """Assert that tokens[j] is an OP with one of the given values"""
        try:
            return tokens[j].type == token.OP and tokens[j].string in o
        except IndexError:
            return False

    def _findargnofcall(n):
        """Find arg n of a call expression (start at 0)

        Returns index of the first token of that argument, or None if
        there is not that many arguments.

        Assumes that token[i + 1] is '('.

        """
        nested = 0
        for j in range(i + 2, len(tokens)):
            if _isop(j, ')', ']', '}'):
                # end of call, tuple, subscription or dict / set
                nested -= 1
                if nested < 0:
                    return None
            elif n == 0:
                # this is the starting position of arg
                return j
            elif _isop(j, '(', '[', '{'):
                nested += 1
            elif _isop(j, ',') and nested == 0:
                n -= 1

        return None

    def _ensuresysstr(j):
        """Make sure the token at j is a system string

        Remember the given token so the string transformer won't add
        the byte prefix.

        Ignores tokens that are not strings. Assumes bounds checking has
        already been done.

        """
        k = j
        currtoken = tokens[k]
        # Walk over the run of string tokens starting at j (line breaks
        # between them are skipped) so every piece of an implicitly
        # concatenated literal is remembered.
        while currtoken.type in (token.STRING, token.NEWLINE, tokenize.NL):
            k += 1
            if (
                currtoken.type == token.STRING
                and currtoken.string.startswith(("'", '"'))
            ):
                sysstrtokens.add(currtoken)
            try:
                currtoken = tokens[k]
            except IndexError:
                break

    def _isitemaccess(j):
        """Assert the next tokens form an item access on `tokens[j]` and that
        `tokens[j]` is a name.
        """
        try:
            return (
                tokens[j].type == token.NAME
                and _isop(j + 1, '[')
                and tokens[j + 2].type == token.STRING
                and _isop(j + 3, ']')
            )
        except IndexError:
            return False

    def _ismethodcall(j, *methodnames):
        """Assert the next tokens form a call to one of `methodnames` with a
        string as first argument on `tokens[j]` and that `tokens[j]` is a
        name.
        """
        try:
            return (
                tokens[j].type == token.NAME
                and _isop(j + 1, '.')
                and tokens[j + 2].type == token.NAME
                and tokens[j + 2].string in methodnames
                and _isop(j + 3, '(')
                and tokens[j + 4].type == token.STRING
            )
        except IndexError:
            return False

    coldelta = 0  # column increment for new opening parens
    coloffset = -1  # column offset for the current line (-1: TBD)
    parens = [(0, 0, 0)]  # stack of (line, end-column, column-offset)
    ignorenextline = False  # don't transform the next line
    insideignoreblock = False  # don't transform until turned off
    for i, t in enumerate(tokens):
        # Compute the column offset for the current line, such that
        # the current line will be aligned to the last opening paren
        # as before.
        if coloffset < 0:
            if t.start[1] == parens[-1][1]:
                coloffset = parens[-1][2]
            elif t.start[1] + 1 == parens[-1][1]:
                # fix misaligned indent of s/util.Abort/error.Abort/
                coloffset = parens[-1][2] + (parens[-1][1] - t.start[1])
            else:
                coloffset = 0

        # Reset per-line attributes at EOL.
        if t.type in (token.NEWLINE, tokenize.NL):
            yield adjusttokenpos(t, coloffset)
            coldelta = 0
            coloffset = -1
            if not insideignoreblock:
                # A line ending in the marker comment protects the line
                # that follows it.
                ignorenextline = (
                    tokens[i - 1].type == token.COMMENT
                    and tokens[i - 1].string == "#no-py3-transform"
                )
            continue

        if t.type == token.COMMENT:
            if t.string == "#py3-transform: off":
                insideignoreblock = True
            if t.string == "#py3-transform: on":
                insideignoreblock = False

        if ignorenextline or insideignoreblock:
            yield adjusttokenpos(t, coloffset)
            continue

        # Remember the last paren position.
        if _isop(i, '(', '[', '{'):
            parens.append(t.end + (coloffset + coldelta,))
        elif _isop(i, ')', ']', '}'):
            parens.pop()

        # Convert most string literals to byte literals. String literals
        # in Python 2 are bytes. String literals in Python 3 are unicode.
        # Most strings in Mercurial are bytes and unicode strings are rare.
        # Rather than rewrite all string literals to use ``b''`` to indicate
        # byte strings, we apply this token transformer to insert the ``b``
        # prefix nearly everywhere.
        if t.type == token.STRING and t not in sysstrtokens:
            s = t.string

            # Preserve docstrings as string literals. This is inconsistent
            # with regular unprefixed strings. However, the
            # "from __future__" parsing (which allows a module docstring to
            # exist before it) doesn't properly handle the docstring if it
            # is b''' prefixed, leading to a SyntaxError. We leave all
            # docstrings as unprefixed to avoid this. This means Mercurial
            # components touching docstrings need to handle unicode,
            # unfortunately.
            if s[0:3] in ("'''", '"""'):
                # If it's assigned to something, it's not a docstring
                if not _isop(i - 1, '='):
                    yield adjusttokenpos(t, coloffset)
                    continue

            # If the first character isn't a quote, it is likely a string
            # prefixing character (such as 'b', 'u', or 'r'). Ignore.
            if s[0] not in ("'", '"'):
                yield adjusttokenpos(t, coloffset)
                continue

            # String literal. Prefix to make a b'' string.
            yield adjusttokenpos(t._replace(string='b%s' % t.string),
                                 coloffset)
            coldelta += 1
            continue

        # This looks like a function call.
        if t.type == token.NAME and _isop(i + 1, '('):
            fn = t.string

            # *attr() builtins don't accept byte strings to 2nd argument.
            if (fn in ('getattr', 'setattr', 'hasattr', 'safehasattr') and
                    not _isop(i - 1, '.')):
                arg1idx = _findargnofcall(1)
                if arg1idx is not None:
                    _ensuresysstr(arg1idx)

            # .encode() and .decode() on str/bytes/unicode don't accept
            # byte strings on Python 3.
            elif fn in ('encode', 'decode') and _isop(i - 1, '.'):
                for argn in range(2):
                    argidx = _findargnofcall(argn)
                    if argidx is not None:
                        _ensuresysstr(argidx)

            # It changes iteritems/values to items/values as they are not
            # present in Python 3 world.
            elif opts['dictiter'] and fn in ('iteritems', 'itervalues'):
                yield adjusttokenpos(t._replace(string=fn[4:]), coloffset)
                continue

        # Names the caller declared kwargs-like keep native string keys,
        # both for ``name['key']`` and for ``name.get('key', ...)`` etc.
        if t.type == token.NAME and t.string in opts['treat-as-kwargs']:
            if _isitemaccess(i):
                _ensuresysstr(i + 2)
            if _ismethodcall(i, 'get', 'pop', 'setdefault', 'popitem'):
                _ensuresysstr(i + 4)

        # Looks like "if __name__ == '__main__'".
        if (t.type == token.NAME and t.string == '__name__'
                and _isop(i + 1, '==')):
            _ensuresysstr(i + 2)

        # Emit unmodified token.
        yield adjusttokenpos(t, coloffset)
244 244
def process(fin, fout, opts):
    """Rewrite the Python source read from ``fin`` and write it to ``fout``.

    ``fin`` must provide a ``readline`` suitable for
    ``tokenize.tokenize()`` (i.e. a file opened in binary mode), and
    ``fout`` receives whatever ``tokenize.untokenize()`` returns for the
    transformed token stream. ``opts`` is passed through to
    ``replacetokens()``.
    """
    tokens = tokenize.tokenize(fin.readline)
    # replacetokens() indexes into its input, so materialize the stream.
    tokens = replacetokens(list(tokens), opts)
    fout.write(tokenize.untokenize(tokens))
249 249
def tryunlink(fname):
    """Remove the file ``fname``, ignoring the case where it does not exist.

    Any other ``OSError`` raised by ``os.unlink()`` is propagated.
    """
    try:
        os.unlink(fname)
    except OSError as err:
        if err.errno != errno.ENOENT:
            raise
256 256
@contextlib.contextmanager
def editinplace(fname):
    """Context manager yielding a temporary file that replaces ``fname``.

    The temporary file is created next to ``fname`` (same directory, name
    derived from its basename) and is yielded for writing. If the ``with``
    body finishes without raising, the temporary file is closed and
    renamed over ``fname``. In every case the temporary file is closed and
    any leftover is removed on exit, so a failed edit leaves ``fname``
    untouched.
    """
    n = os.path.basename(fname)
    d = os.path.dirname(fname)
    # delete=False: the file must survive close() so it can be renamed.
    fp = tempfile.NamedTemporaryFile(prefix='.%s-' % n, suffix='~', dir=d,
                                     delete=False)
    try:
        yield fp
        fp.close()
        if os.name == 'nt':
            # os.rename() refuses to overwrite an existing file on Windows.
            tryunlink(fname)
        os.rename(fp.name, fname)
    finally:
        fp.close()
        # After a successful rename the temp name is gone; tryunlink()
        # tolerates that.
        tryunlink(fp.name)
272 272
def main():
    """Parse the command line and byteify each listed source file.

    With ``-i``/``--inplace`` every file is rewritten through
    ``editinplace()``; otherwise the transformed source is written to
    ``sys.stdout.buffer``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('-i', '--inplace', action='store_true', default=False,
                    help='edit files in place')
    ap.add_argument('--dictiter', action='store_true', default=False,
                    help='rewrite iteritems() and itervalues()')
    # default=[] keeps args.treat_as_kwargs iterable when the option is
    # absent, so it can be fed to set() unconditionally below.
    ap.add_argument('--treat-as-kwargs', nargs="+", default=[],
                    help="ignore kwargs-like objects")
    ap.add_argument('files', metavar='FILE', nargs='+', help='source file')
    args = ap.parse_args()
    opts = {
        'dictiter': args.dictiter,
        'treat-as-kwargs': set(args.treat_as_kwargs),
    }
    for fname in args.files:
        if args.inplace:
            with editinplace(fname) as fout:
                with open(fname, 'rb') as fin:
                    process(fin, fout, opts)
        else:
            with open(fname, 'rb') as fin:
                # process() writes the bytes untokenize() produces, hence
                # the binary stdout stream.
                fout = sys.stdout.buffer
                process(fin, fout, opts)
298 296
if __name__ == '__main__':
    # The script depends on Python 3 only features (e.g.
    # tokenize.tokenize() and sys.stdout.buffer), so refuse to start on
    # anything older.
    if sys.version_info.major < 3:
        print('This script must be run under Python 3.')
        sys.exit(3)
    main()
General Comments 0
You need to be logged in to leave comments. Login now