byteify-strings: add helpers to check for item access or method call...
Raphaël Gomès
r42907:c9fd8163 default
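
For illustration only (not part of the commit): a minimal sketch of the token shapes the two new helpers are written to recognize. The dump() helper and the sample snippets are hypothetical, not code from the patch.

    import io
    import token
    import tokenize

    def dump(source):
        # Print (token type, string) pairs for a small source snippet.
        for t in tokenize.tokenize(io.BytesIO(source.encode()).readline):
            print(token.tok_name[t.type], repr(t.string))

    # _isitemaccess matches NAME '[' STRING ']':
    dump("d['key']")
    # _ismethodcall matches NAME '.' NAME '(' STRING ...:
    dump("s.encode('utf-8')")
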
@@ -1,262 +1,292 @@
 #!/usr/bin/env python3
 #
 # byteify-strings.py - transform string literals to be Python 3 safe
 #
 # Copyright 2015 Gregory Szorc <gregory.szorc@gmail.com>
 #
 # This software may be used and distributed according to the terms of the
 # GNU General Public License version 2 or any later version.
 
 from __future__ import absolute_import, print_function
 
 import argparse
 import contextlib
 import errno
 import os
 import sys
 import tempfile
 import token
 import tokenize
 
 def adjusttokenpos(t, ofs):
     """Adjust start/end column of the given token"""
     return t._replace(start=(t.start[0], t.start[1] + ofs),
                       end=(t.end[0], t.end[1] + ofs))
 
 def replacetokens(tokens, opts):
     """Transform a stream of tokens from raw to Python 3.
 
     Returns a generator of possibly rewritten tokens.
 
     The input token list may be mutated as part of processing. However,
     its changes do not necessarily match the output token stream.
     """
     sysstrtokens = set()
 
     # The following utility functions access the tokens list and the i index
     # of the "for i, t in enumerate(tokens)" loop below
     def _isop(j, *o):
         """Assert that tokens[j] is an OP with one of the given values"""
         try:
             return tokens[j].type == token.OP and tokens[j].string in o
         except IndexError:
             return False
 
     def _findargnofcall(n):
         """Find arg n of a call expression (start at 0)
 
         Returns index of the first token of that argument, or None if
         there are not that many arguments.
 
         Assumes that tokens[i + 1] is '('.
 
         """
         nested = 0
         for j in range(i + 2, len(tokens)):
             if _isop(j, ')', ']', '}'):
                 # end of call, tuple, subscription or dict / set
                 nested -= 1
                 if nested < 0:
                     return None
             elif n == 0:
                 # this is the starting position of arg
                 return j
             elif _isop(j, '(', '[', '{'):
                 nested += 1
             elif _isop(j, ',') and nested == 0:
                 n -= 1
 
         return None
 
     def _ensuresysstr(j):
         """Make sure the token at j is a system string
 
         Remember the given token so the string transformer won't add
         the byte prefix.
 
         Ignores tokens that are not strings. Assumes bounds checking has
         already been done.
 
         """
         k = j
         currtoken = tokens[k]
         while currtoken.type in (token.STRING, token.NEWLINE, tokenize.NL):
             k += 1
             if (
                 currtoken.type == token.STRING
                 and currtoken.string.startswith(("'", '"'))
             ):
                 sysstrtokens.add(currtoken)
             try:
                 currtoken = tokens[k]
             except IndexError:
                 break
 
+    def _isitemaccess(j):
+        """Assert the next tokens form an item access on `tokens[j]` and that
+        `tokens[j]` is a name.
+        """
+        try:
+            return (
+                tokens[j].type == token.NAME
+                and _isop(j + 1, '[')
+                and tokens[j + 2].type == token.STRING
+                and _isop(j + 3, ']')
+            )
+        except IndexError:
+            return False
+
+    def _ismethodcall(j, *methodnames):
+        """Assert the next tokens form a call to one of `methodnames` with a
+        string as first argument on `tokens[j]` and that `tokens[j]` is a name.
+        """
+        try:
+            return (
+                tokens[j].type == token.NAME
+                and _isop(j + 1, '.')
+                and tokens[j + 2].type == token.NAME
+                and tokens[j + 2].string in methodnames
+                and _isop(j + 3, '(')
+                and tokens[j + 4].type == token.STRING
+            )
+        except IndexError:
+            return False
+
     coldelta = 0  # column increment for new opening parens
     coloffset = -1  # column offset for the current line (-1: TBD)
     parens = [(0, 0, 0)]  # stack of (line, end-column, column-offset)
     ignorenextline = False  # don't transform the next line
     insideignoreblock = False  # don't transform until turned off
     for i, t in enumerate(tokens):
         # Compute the column offset for the current line, such that
         # the current line will be aligned to the last opening paren
         # as before.
         if coloffset < 0:
             if t.start[1] == parens[-1][1]:
                 coloffset = parens[-1][2]
             elif t.start[1] + 1 == parens[-1][1]:
                 # fix misaligned indent of s/util.Abort/error.Abort/
                 coloffset = parens[-1][2] + (parens[-1][1] - t.start[1])
             else:
                 coloffset = 0
 
         # Reset per-line attributes at EOL.
         if t.type in (token.NEWLINE, tokenize.NL):
             yield adjusttokenpos(t, coloffset)
             coldelta = 0
             coloffset = -1
             if not insideignoreblock:
                 ignorenextline = (
                     tokens[i - 1].type == token.COMMENT
                     and tokens[i - 1].string == "#no-py3-transform"
                 )
             continue
 
         if t.type == token.COMMENT:
             if t.string == "#py3-transform: off":
                 insideignoreblock = True
             if t.string == "#py3-transform: on":
                 insideignoreblock = False
 
         if ignorenextline or insideignoreblock:
             yield adjusttokenpos(t, coloffset)
             continue
 
         # Remember the last paren position.
         if _isop(i, '(', '[', '{'):
             parens.append(t.end + (coloffset + coldelta,))
         elif _isop(i, ')', ']', '}'):
             parens.pop()
 
         # Convert most string literals to byte literals. String literals
         # in Python 2 are bytes. String literals in Python 3 are unicode.
         # Most strings in Mercurial are bytes and unicode strings are rare.
         # Rather than rewrite all string literals to use ``b''`` to indicate
         # byte strings, we apply this token transformer to insert the ``b``
         # prefix nearly everywhere.
         if t.type == token.STRING and t not in sysstrtokens:
             s = t.string
 
             # Preserve docstrings as string literals. This is inconsistent
             # with regular unprefixed strings. However, the
             # "from __future__" parsing (which allows a module docstring to
             # exist before it) doesn't properly handle the docstring if it
             # is b''' prefixed, leading to a SyntaxError. We leave all
             # docstrings as unprefixed to avoid this. This means Mercurial
             # components touching docstrings need to handle unicode,
             # unfortunately.
             if s[0:3] in ("'''", '"""'):
                 # If it's assigned to something, it's not a docstring
                 if not _isop(i - 1, '='):
                     yield adjusttokenpos(t, coloffset)
                     continue
 
             # If the first character isn't a quote, it is likely a string
             # prefixing character (such as 'b', 'u', or 'r'). Ignore.
             if s[0] not in ("'", '"'):
                 yield adjusttokenpos(t, coloffset)
                 continue
 
             # String literal. Prefix to make a b'' string.
             yield adjusttokenpos(t._replace(string='b%s' % t.string),
                                  coloffset)
             coldelta += 1
             continue
 
         # This looks like a function call.
         if t.type == token.NAME and _isop(i + 1, '('):
             fn = t.string
 
             # *attr() builtins don't accept byte strings as 2nd argument.
             if (fn in ('getattr', 'setattr', 'hasattr', 'safehasattr') and
                     not _isop(i - 1, '.')):
                 arg1idx = _findargnofcall(1)
                 if arg1idx is not None:
                     _ensuresysstr(arg1idx)
 
             # .encode() and .decode() on str/bytes/unicode don't accept
             # byte strings on Python 3.
             elif fn in ('encode', 'decode') and _isop(i - 1, '.'):
                 for argn in range(2):
                     argidx = _findargnofcall(argn)
                     if argidx is not None:
                         _ensuresysstr(argidx)
 
             # Rewrite iteritems/itervalues to items/values, since the
             # former do not exist in Python 3.
             elif opts['dictiter'] and fn in ('iteritems', 'itervalues'):
                 yield adjusttokenpos(t._replace(string=fn[4:]), coloffset)
                 continue
 
         # Looks like "if __name__ == '__main__'".
         if (t.type == token.NAME and t.string == '__name__'
                 and _isop(i + 1, '==')):
             _ensuresysstr(i + 2)
 
         # Emit unmodified token.
         yield adjusttokenpos(t, coloffset)
 
 def process(fin, fout, opts):
     tokens = tokenize.tokenize(fin.readline)
     tokens = replacetokens(list(tokens), opts)
     fout.write(tokenize.untokenize(tokens))
 
 def tryunlink(fname):
     try:
         os.unlink(fname)
     except OSError as err:
         if err.errno != errno.ENOENT:
             raise
 
 @contextlib.contextmanager
 def editinplace(fname):
     n = os.path.basename(fname)
     d = os.path.dirname(fname)
     fp = tempfile.NamedTemporaryFile(prefix='.%s-' % n, suffix='~', dir=d,
                                      delete=False)
     try:
         yield fp
         fp.close()
         if os.name == 'nt':
             tryunlink(fname)
         os.rename(fp.name, fname)
     finally:
         fp.close()
         tryunlink(fp.name)
 
 def main():
     ap = argparse.ArgumentParser()
     ap.add_argument('-i', '--inplace', action='store_true', default=False,
                     help='edit files in place')
     ap.add_argument('--dictiter', action='store_true', default=False,
                     help='rewrite iteritems() and itervalues()')
     ap.add_argument('files', metavar='FILE', nargs='+', help='source file')
     args = ap.parse_args()
     opts = {
         'dictiter': args.dictiter,
     }
     for fname in args.files:
         if args.inplace:
             with editinplace(fname) as fout:
                 with open(fname, 'rb') as fin:
                     process(fin, fout, opts)
         else:
             with open(fname, 'rb') as fin:
                 fout = sys.stdout.buffer
                 process(fin, fout, opts)
 
 if __name__ == '__main__':
     if sys.version_info.major < 3:
         print('This script must be run under Python 3.')
         sys.exit(3)
     main()