##// END OF EJS Templates
byteify-strings: fix misalignment with multi-line parenthesis...
Raphaël Gomès -
r42914:26a31c88 default
parent child Browse files
Show More
@@ -1,307 +1,311
1 1 #!/usr/bin/env python3
2 2 #
3 3 # byteify-strings.py - transform string literals to be Python 3 safe
4 4 #
5 5 # Copyright 2015 Gregory Szorc <gregory.szorc@gmail.com>
6 6 #
7 7 # This software may be used and distributed according to the terms of the
8 8 # GNU General Public License version 2 or any later version.
9 9
10 10 from __future__ import absolute_import, print_function
11 11
12 12 import argparse
13 13 import contextlib
14 14 import errno
15 15 import os
16 16 import sys
17 17 import tempfile
18 18 import token
19 19 import tokenize
20 20
def adjusttokenpos(t, ofs):
    """Return token *t* with its start/end columns shifted by *ofs*.

    Rows are preserved; only the column components move. The token is
    not mutated - a new namedtuple is returned via ``_replace``.
    """
    (srow, scol), (erow, ecol) = t.start, t.end
    return t._replace(start=(srow, scol + ofs), end=(erow, ecol + ofs))
25 25
def replacetokens(tokens, opts):
    """Transform a stream of tokens from raw to Python 3.

    Returns a generator of possibly rewritten tokens.

    The input token list may be mutated as part of processing. However,
    its changes do not necessarily match the output token stream.
    """
    # String tokens recorded here stay "system" (unicode) strings and
    # will not be given a b'' prefix by the rewriting pass below.
    sysstrtokens = set()

    def _adjust(tok, ofs):
        # Shift a token's start/end columns by ``ofs`` (local equivalent
        # of the module-level adjusttokenpos helper).
        return tok._replace(start=(tok.start[0], tok.start[1] + ofs),
                            end=(tok.end[0], tok.end[1] + ofs))

    # The helpers below close over ``tokens`` and the index ``i`` of the
    # ``for i, t in enumerate(tokens)`` loop further down.
    def _isop(j, *o):
        """True if tokens[j] is an OP token with one of the given values."""
        try:
            return tokens[j].type == token.OP and tokens[j].string in o
        except IndexError:
            return False

    def _findargnofcall(n):
        """Find arg n of a call expression (start at 0).

        Returns the index of the first token of that argument, or None
        if there are not that many arguments.

        Assumes that tokens[i + 1] is '('.
        """
        depth = 0
        for j in range(i + 2, len(tokens)):
            if _isop(j, ')', ']', '}'):
                # end of call, tuple, subscription or dict / set
                depth -= 1
                if depth < 0:
                    return None
            elif n == 0:
                # this is the starting position of arg
                return j
            elif _isop(j, '(', '[', '{'):
                depth += 1
            elif _isop(j, ',') and depth == 0:
                n -= 1
        return None

    def _ensuresysstr(j):
        """Make sure the string starting at tokens[j] stays a system string.

        Remembers the tokens so the string transformer won't add the
        byte prefix. Implicitly concatenated string parts - possibly
        spread over several (continuation) lines - are all marked.

        Ignores tokens that are not strings. Assumes bounds checking has
        already been done.
        """
        scan = j
        cur = tokens[scan]
        while cur.type in (token.STRING, token.NEWLINE, tokenize.NL):
            scan += 1
            if (cur.type == token.STRING
                    and cur.string.startswith(("'", '"'))):
                sysstrtokens.add(cur)
            try:
                cur = tokens[scan]
            except IndexError:
                break

    def _isitemaccess(j):
        """True if tokens[j] is a name subscripted by a string literal,
        i.e. the next tokens form ``name[<string>]``."""
        try:
            return (tokens[j].type == token.NAME
                    and _isop(j + 1, '[')
                    and tokens[j + 2].type == token.STRING
                    and _isop(j + 3, ']'))
        except IndexError:
            return False

    def _ismethodcall(j, *methodnames):
        """True if tokens[j] is a name calling one of ``methodnames``
        with a string literal as first argument."""
        try:
            return (tokens[j].type == token.NAME
                    and _isop(j + 1, '.')
                    and tokens[j + 2].type == token.NAME
                    and tokens[j + 2].string in methodnames
                    and _isop(j + 3, '(')
                    and tokens[j + 4].type == token.STRING)
        except IndexError:
            return False

    coldelta = 0  # column increment for new opening parens
    coloffset = -1  # column offset for the current line (-1: TBD)
    parens = [(0, 0, 0, -1)]  # stack of (line, end-column, column-offset, type)
    ignorenextline = False  # don't transform the next line
    insideignoreblock = False  # don't transform until turned off
    for i, t in enumerate(tokens):
        # Compute the column offset for the current line, such that
        # the current line will be aligned to the last opening paren
        # as before.
        if coloffset < 0:
            lastparen = parens[-1]
            if t.start[1] == lastparen[1]:
                coloffset = lastparen[2]
            elif (t.start[1] + 1 == lastparen[1]
                  and lastparen[3] not in (token.NEWLINE, tokenize.NL)):
                # fix misaligned indent of s/util.Abort/error.Abort/
                coloffset = lastparen[2] + (lastparen[1] - t.start[1])
            else:
                coloffset = 0

        # Reset per-line attributes at EOL.
        if t.type in (token.NEWLINE, tokenize.NL):
            yield _adjust(t, coloffset)
            coldelta = 0
            coloffset = -1
            if not insideignoreblock:
                # A "#no-py3-transform" comment disables rewriting for
                # the next line only.
                ignorenextline = (
                    tokens[i - 1].type == token.COMMENT
                    and tokens[i - 1].string == "#no-py3-transform"
                )
            continue

        if t.type == token.COMMENT:
            if t.string == "#py3-transform: off":
                insideignoreblock = True
            if t.string == "#py3-transform: on":
                insideignoreblock = False

        if ignorenextline or insideignoreblock:
            yield _adjust(t, coloffset)
            continue

        # Remember the last paren position, plus the type of the token
        # right after the paren so continuation-line alignment can tell
        # "open paren at end of line" apart from "open paren mid-line".
        if _isop(i, '(', '[', '{'):
            parens.append(t.end + (coloffset + coldelta, tokens[i + 1].type))
        elif _isop(i, ')', ']', '}'):
            parens.pop()

        # Convert most string literals to byte literals. String literals
        # in Python 2 are bytes. String literals in Python 3 are unicode.
        # Most strings in Mercurial are bytes and unicode strings are rare.
        # Rather than rewrite all string literals to use ``b''`` to indicate
        # byte strings, we apply this token transformer to insert the ``b``
        # prefix nearly everywhere.
        if t.type == token.STRING and t not in sysstrtokens:
            s = t.string

            # Preserve docstrings as string literals. This is inconsistent
            # with regular unprefixed strings. However, the
            # "from __future__" parsing (which allows a module docstring to
            # exist before it) doesn't properly handle the docstring if it
            # is b''' prefixed, leading to a SyntaxError. We leave all
            # docstrings as unprefixed to avoid this. This means Mercurial
            # components touching docstrings need to handle unicode,
            # unfortunately.
            if s[0:3] in ("'''", '"""'):
                # If it's assigned to something, it's not a docstring
                if not _isop(i - 1, '='):
                    yield _adjust(t, coloffset)
                    continue

            # If the first character isn't a quote, it is likely a string
            # prefixing character (such as 'b', 'u', or 'r'. Ignore.
            if s[0] not in ("'", '"'):
                yield _adjust(t, coloffset)
                continue

            # String literal. Prefix to make a b'' string.
            yield _adjust(t._replace(string='b%s' % t.string), coloffset)
            coldelta += 1
            continue

        # This looks like a function call.
        if t.type == token.NAME and _isop(i + 1, '('):
            fn = t.string

            # *attr() builtins don't accept byte strings to 2nd argument.
            if fn in ('getattr', 'setattr', 'hasattr', 'safehasattr',
                      'wrapfunction', 'wrapclass', 'addattr') and (
                          opts['allow-attr-methods']
                          or not _isop(i - 1, '.')):
                arg1idx = _findargnofcall(1)
                if arg1idx is not None:
                    _ensuresysstr(arg1idx)

            # .encode() and .decode() on str/bytes/unicode don't accept
            # byte strings on Python 3.
            elif fn in ('encode', 'decode') and _isop(i - 1, '.'):
                for argn in range(2):
                    argidx = _findargnofcall(argn)
                    if argidx is not None:
                        _ensuresysstr(argidx)

            # It changes iteritems/values to items/values as they are not
            # present in Python 3 world.
            elif opts['dictiter'] and fn in ('iteritems', 'itervalues'):
                yield _adjust(t._replace(string=fn[4:]), coloffset)
                continue

        # kwargs-like objects are keyed by system strings.
        if t.type == token.NAME and t.string in opts['treat-as-kwargs']:
            if _isitemaccess(i):
                _ensuresysstr(i + 2)
            if _ismethodcall(i, 'get', 'pop', 'setdefault', 'popitem'):
                _ensuresysstr(i + 4)

        # Looks like "if __name__ == '__main__'".
        if (t.type == token.NAME and t.string == '__name__'
                and _isop(i + 1, '==')):
            _ensuresysstr(i + 2)

        # Emit unmodified token.
        yield _adjust(t, coloffset)
246 250
def process(fin, fout, opts):
    """Byteify one source file.

    Reads Python source from the binary stream ``fin``, rewrites its
    token stream according to ``opts`` and writes the regenerated
    source bytes to ``fout``.
    """
    instream = tokenize.tokenize(fin.readline)
    rewritten = replacetokens(list(instream), opts)
    fout.write(tokenize.untokenize(rewritten))
251 255
def tryunlink(fname):
    """Best-effort removal of ``fname``.

    A file that is already gone (ENOENT) is not an error; any other
    OSError - permissions, directory, ... - propagates to the caller.
    """
    try:
        os.unlink(fname)
    except FileNotFoundError:
        # Already removed; that is exactly the outcome we wanted.
        pass
258 262
@contextlib.contextmanager
def editinplace(fname):
    """Context manager yielding a temp file that replaces ``fname`` on exit.

    The temporary file is created in the same directory as ``fname`` so
    the final rename cannot cross filesystems. On a clean exit the temp
    file atomically becomes ``fname``; on error it is deleted and the
    original file is left untouched.
    """
    def _silentunlink(path):
        # Remove ``path``; a missing file is fine, other errors propagate.
        try:
            os.unlink(path)
        except OSError as err:
            if err.errno != errno.ENOENT:
                raise

    basename = os.path.basename(fname)
    dirname = os.path.dirname(fname)
    fp = tempfile.NamedTemporaryFile(prefix='.%s-' % basename, suffix='~',
                                     dir=dirname, delete=False)
    try:
        yield fp
        fp.close()
        if os.name == 'nt':
            # Windows refuses to rename over an existing file.
            _silentunlink(fname)
        os.rename(fp.name, fname)
    finally:
        fp.close()
        _silentunlink(fp.name)
274 278
def main():
    """Command-line entry point.

    Parses options, then byteifies each named file either in place
    (``-i``) or onto stdout.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('-i', '--inplace', action='store_true', default=False,
                    help='edit files in place')
    ap.add_argument('--dictiter', action='store_true', default=False,
                    help='rewrite iteritems() and itervalues()')
    ap.add_argument('--allow-attr-methods', action='store_true',
                    default=False,
                    help='also handle attr*() when they are methods')
    ap.add_argument('--treat-as-kwargs', nargs="+", default=[],
                    help="ignore kwargs-like objects")
    ap.add_argument('files', metavar='FILE', nargs='+', help='source file')
    args = ap.parse_args()

    opts = {
        'dictiter': args.dictiter,
        'treat-as-kwargs': set(args.treat_as_kwargs),
        'allow-attr-methods': args.allow_attr_methods,
    }

    for fname in args.files:
        if args.inplace:
            # Write through a same-directory temp file, swapped in on
            # success (see editinplace).
            with editinplace(fname) as fout:
                with open(fname, 'rb') as fin:
                    process(fin, fout, opts)
        else:
            with open(fname, 'rb') as fin:
                process(fin, sys.stdout.buffer, opts)
302 306
if __name__ == '__main__':
    # The tool targets Python 3 token semantics and must run under 3.
    if sys.version_info.major < 3:
        print('This script must be run under Python 3.')
        sys.exit(3)
    main()
@@ -1,217 +1,261
1 1 #require py3
2 2
3 3 $ byteify_strings () {
4 4 > $PYTHON "$TESTDIR/../contrib/byteify-strings.py" "$@"
5 5 > }
6 6
7 7 Test in-place
8 8
9 9 $ cat > testfile.py <<EOF
10 10 > obj['test'] = b"1234"
11 11 > mydict.iteritems()
12 12 > EOF
13 13 $ byteify_strings testfile.py -i
14 14 $ cat testfile.py
15 15 obj[b'test'] = b"1234"
16 16 mydict.iteritems()
17 17
18 18 Test with dictiter
19 19
20 20 $ cat > testfile.py <<EOF
21 21 > obj['test'] = b"1234"
22 22 > mydict.iteritems()
23 23 > EOF
24 24 $ byteify_strings testfile.py --dictiter
25 25 obj[b'test'] = b"1234"
26 26 mydict.items()
27 27
28 28 Test kwargs-like objects
29 29
30 30 $ cat > testfile.py <<EOF
31 31 > kwargs['test'] = "123"
32 32 > kwargs[test['testing']]
33 33 > kwargs[test[[['testing']]]]
34 34 > kwargs[kwargs['testing']]
35 35 > kwargs.get('test')
36 36 > kwargs.pop('test')
37 37 > kwargs.get('test', 'testing')
38 38 > kwargs.pop('test', 'testing')
39 39 > kwargs.setdefault('test', 'testing')
40 40 >
41 41 > opts['test'] = "123"
42 42 > opts[test['testing']]
43 43 > opts[test[[['testing']]]]
44 44 > opts[opts['testing']]
45 45 > opts.get('test')
46 46 > opts.pop('test')
47 47 > opts.get('test', 'testing')
48 48 > opts.pop('test', 'testing')
49 49 > opts.setdefault('test', 'testing')
50 50 >
51 51 > commitopts['test'] = "123"
52 52 > commitopts[test['testing']]
53 53 > commitopts[test[[['testing']]]]
54 54 > commitopts[commitopts['testing']]
55 55 > commitopts.get('test')
56 56 > commitopts.pop('test')
57 57 > commitopts.get('test', 'testing')
58 58 > commitopts.pop('test', 'testing')
59 59 > commitopts.setdefault('test', 'testing')
60 60 > EOF
61 61 $ byteify_strings testfile.py --treat-as-kwargs kwargs opts commitopts
62 62 kwargs['test'] = b"123"
63 63 kwargs[test[b'testing']]
64 64 kwargs[test[[[b'testing']]]]
65 65 kwargs[kwargs['testing']]
66 66 kwargs.get('test')
67 67 kwargs.pop('test')
68 68 kwargs.get('test', b'testing')
69 69 kwargs.pop('test', b'testing')
70 70 kwargs.setdefault('test', b'testing')
71 71
72 72 opts['test'] = b"123"
73 73 opts[test[b'testing']]
74 74 opts[test[[[b'testing']]]]
75 75 opts[opts['testing']]
76 76 opts.get('test')
77 77 opts.pop('test')
78 78 opts.get('test', b'testing')
79 79 opts.pop('test', b'testing')
80 80 opts.setdefault('test', b'testing')
81 81
82 82 commitopts['test'] = b"123"
83 83 commitopts[test[b'testing']]
84 84 commitopts[test[[[b'testing']]]]
85 85 commitopts[commitopts['testing']]
86 86 commitopts.get('test')
87 87 commitopts.pop('test')
88 88 commitopts.get('test', b'testing')
89 89 commitopts.pop('test', b'testing')
90 90 commitopts.setdefault('test', b'testing')
91 91
92 92 Test attr*() as methods
93 93
94 94 $ cat > testfile.py <<EOF
95 95 > setattr(o, 'a', 1)
96 96 > util.setattr(o, 'ae', 1)
97 97 > util.getattr(o, 'alksjdf', 'default')
98 98 > util.addattr(o, 'asdf')
99 99 > util.hasattr(o, 'lksjdf', 'default')
100 100 > util.safehasattr(o, 'lksjdf', 'default')
101 101 > @eh.wrapfunction(func, 'lksjdf')
102 102 > def f():
103 103 > pass
104 104 > @eh.wrapclass(klass, 'lksjdf')
105 105 > def f():
106 106 > pass
107 107 > EOF
108 108 $ byteify_strings testfile.py --allow-attr-methods
109 109 setattr(o, 'a', 1)
110 110 util.setattr(o, 'ae', 1)
111 111 util.getattr(o, 'alksjdf', b'default')
112 112 util.addattr(o, 'asdf')
113 113 util.hasattr(o, 'lksjdf', b'default')
114 114 util.safehasattr(o, 'lksjdf', b'default')
115 115 @eh.wrapfunction(func, 'lksjdf')
116 116 def f():
117 117 pass
118 118 @eh.wrapclass(klass, 'lksjdf')
119 119 def f():
120 120 pass
121 121
122 122 Test without attr*() as methods
123 123
124 124 $ cat > testfile.py <<EOF
125 125 > setattr(o, 'a', 1)
126 126 > util.setattr(o, 'ae', 1)
127 127 > util.getattr(o, 'alksjdf', 'default')
128 128 > util.addattr(o, 'asdf')
129 129 > util.hasattr(o, 'lksjdf', 'default')
130 130 > util.safehasattr(o, 'lksjdf', 'default')
131 131 > @eh.wrapfunction(func, 'lksjdf')
132 132 > def f():
133 133 > pass
134 134 > @eh.wrapclass(klass, 'lksjdf')
135 135 > def f():
136 136 > pass
137 137 > EOF
138 138 $ byteify_strings testfile.py
139 139 setattr(o, 'a', 1)
140 140 util.setattr(o, b'ae', 1)
141 141 util.getattr(o, b'alksjdf', b'default')
142 142 util.addattr(o, b'asdf')
143 143 util.hasattr(o, b'lksjdf', b'default')
144 144 util.safehasattr(o, b'lksjdf', b'default')
145 145 @eh.wrapfunction(func, b'lksjdf')
146 146 def f():
147 147 pass
148 148 @eh.wrapclass(klass, b'lksjdf')
149 149 def f():
150 150 pass
151 151
152 152 Test ignore comments
153 153
154 154 $ cat > testfile.py <<EOF
155 155 > #py3-transform: off
156 156 > "none"
157 157 > "of"
158 158 > 'these'
159 159 > s = """should"""
160 160 > d = '''be'''
161 161 > #py3-transform: on
162 162 > "this should"
163 163 > 'and this also'
164 164 >
165 165 > #no-py3-transform
166 166 > l = "this should be ignored"
167 167 > l2 = "this shouldn't"
168 168 >
169 169 > EOF
170 170 $ byteify_strings testfile.py
171 171 #py3-transform: off
172 172 "none"
173 173 "of"
174 174 'these'
175 175 s = """should"""
176 176 d = '''be'''
177 177 #py3-transform: on
178 178 b"this should"
179 179 b'and this also'
180 180
181 181 #no-py3-transform
182 182 l = "this should be ignored"
183 183 l2 = b"this shouldn't"
184 184
185 185 Test triple-quoted strings
186 186
187 187 $ cat > testfile.py <<EOF
188 188 > """This is ignored
189 189 > """
190 190 >
191 191 > line = """
192 192 > This should not be
193 193 > """
194 194 > line = '''
195 195 > Neither should this
196 196 > '''
197 197 > EOF
198 198 $ byteify_strings testfile.py
199 199 """This is ignored
200 200 """
201 201
202 202 line = b"""
203 203 This should not be
204 204 """
205 205 line = b'''
206 206 Neither should this
207 207 '''
208 208
209 209 Test prefixed strings
210 210
211 211 $ cat > testfile.py <<EOF
212 212 > obj['test'] = b"1234"
213 213 > obj[r'test'] = u"1234"
214 214 > EOF
215 215 $ byteify_strings testfile.py
216 216 obj[b'test'] = b"1234"
217 217 obj[r'test'] = u"1234"
218
219 Test multi-line alignment
220
221 $ cat > testfile.py <<'EOF'
222 > def foo():
223 > error.Abort(_("foo"
224 > "bar"
225 > "%s")
226 > % parameter)
227 > {
228 > 'test': dict,
229 > 'test2': dict,
230 > }
231 > [
232 > "thing",
233 > "thing2"
234 > ]
235 > (
236 > "tuple",
237 > "tuple2",
238 > )
239 > {"thing",
240 > }
241 > EOF
242 $ byteify_strings testfile.py
243 def foo():
244 error.Abort(_(b"foo"
245 b"bar"
246 b"%s")
247 % parameter)
248 {
249 b'test': dict,
250 b'test2': dict,
251 }
252 [
253 b"thing",
254 b"thing2"
255 ]
256 (
257 b"tuple",
258 b"tuple2",
259 )
260 {b"thing",
261 }
General Comments 0
You need to be logged in to leave comments. Login now