byteify-strings: add support for ignore comments...
Raphaël Gomès
r42906:b9a20047 default
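
The patch teaches the transformer to honour three special comments: #no-py3-transform skips byteification of the next line, while #py3-transform: off and #py3-transform: on bracket a region that is emitted unchanged. The comment spellings below are taken from the diff; the surrounding statements are only illustrative:

    #no-py3-transform
    setting = 'ui.username'        # next-line form: stays a plain str literal

    #py3-transform: off
    BANNER = 'left exactly as written'
    PROMPT = 'also untouched'
    #py3-transform: on

    greeting = 'hello'             # outside the markers: becomes b'hello'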
@@ -1,245 +1,262 @@
 #!/usr/bin/env python3
 #
 # byteify-strings.py - transform string literals to be Python 3 safe
 #
 # Copyright 2015 Gregory Szorc <gregory.szorc@gmail.com>
 #
 # This software may be used and distributed according to the terms of the
 # GNU General Public License version 2 or any later version.
 
 from __future__ import absolute_import, print_function
 
 import argparse
 import contextlib
 import errno
 import os
 import sys
 import tempfile
 import token
 import tokenize
 
 def adjusttokenpos(t, ofs):
     """Adjust start/end column of the given token"""
     return t._replace(start=(t.start[0], t.start[1] + ofs),
                       end=(t.end[0], t.end[1] + ofs))
 
 def replacetokens(tokens, opts):
     """Transform a stream of tokens from raw to Python 3.
 
     Returns a generator of possibly rewritten tokens.
 
     The input token list may be mutated as part of processing. However,
     its changes do not necessarily match the output token stream.
     """
     sysstrtokens = set()
 
     # The following utility functions access the tokens list and i index of
     # the for i, t enumerate(tokens) loop below
     def _isop(j, *o):
         """Assert that tokens[j] is an OP with one of the given values"""
         try:
             return tokens[j].type == token.OP and tokens[j].string in o
         except IndexError:
             return False
 
     def _findargnofcall(n):
         """Find arg n of a call expression (start at 0)
 
         Returns index of the first token of that argument, or None if
         there is not that many arguments.
 
         Assumes that token[i + 1] is '('.
 
         """
         nested = 0
         for j in range(i + 2, len(tokens)):
             if _isop(j, ')', ']', '}'):
                 # end of call, tuple, subscription or dict / set
                 nested -= 1
                 if nested < 0:
                     return None
             elif n == 0:
                 # this is the starting position of arg
                 return j
             elif _isop(j, '(', '[', '{'):
                 nested += 1
             elif _isop(j, ',') and nested == 0:
                 n -= 1
 
         return None
 
     def _ensuresysstr(j):
         """Make sure the token at j is a system string
 
         Remember the given token so the string transformer won't add
         the byte prefix.
 
         Ignores tokens that are not strings. Assumes bounds checking has
         already been done.
 
         """
         k = j
         currtoken = tokens[k]
         while currtoken.type in (token.STRING, token.NEWLINE, tokenize.NL):
             k += 1
             if (
                 currtoken.type == token.STRING
                 and currtoken.string.startswith(("'", '"'))
             ):
                 sysstrtokens.add(currtoken)
             try:
                 currtoken = tokens[k]
             except IndexError:
                 break
 
     coldelta = 0  # column increment for new opening parens
     coloffset = -1  # column offset for the current line (-1: TBD)
     parens = [(0, 0, 0)]  # stack of (line, end-column, column-offset)
+    ignorenextline = False  # don't transform the next line
+    insideignoreblock = False  # don't transform until turned off
     for i, t in enumerate(tokens):
         # Compute the column offset for the current line, such that
         # the current line will be aligned to the last opening paren
         # as before.
         if coloffset < 0:
             if t.start[1] == parens[-1][1]:
                 coloffset = parens[-1][2]
             elif t.start[1] + 1 == parens[-1][1]:
                 # fix misaligned indent of s/util.Abort/error.Abort/
                 coloffset = parens[-1][2] + (parens[-1][1] - t.start[1])
             else:
                 coloffset = 0
 
         # Reset per-line attributes at EOL.
         if t.type in (token.NEWLINE, tokenize.NL):
             yield adjusttokenpos(t, coloffset)
             coldelta = 0
             coloffset = -1
+            if not insideignoreblock:
+                ignorenextline = (
+                    tokens[i - 1].type == token.COMMENT
+                    and tokens[i - 1].string == "#no-py3-transform"
+                )
+            continue
+
+        if t.type == token.COMMENT:
+            if t.string == "#py3-transform: off":
+                insideignoreblock = True
+            if t.string == "#py3-transform: on":
+                insideignoreblock = False
+
+        if ignorenextline or insideignoreblock:
+            yield adjusttokenpos(t, coloffset)
             continue
 
         # Remember the last paren position.
         if _isop(i, '(', '[', '{'):
             parens.append(t.end + (coloffset + coldelta,))
         elif _isop(i, ')', ']', '}'):
             parens.pop()
 
         # Convert most string literals to byte literals. String literals
         # in Python 2 are bytes. String literals in Python 3 are unicode.
         # Most strings in Mercurial are bytes and unicode strings are rare.
         # Rather than rewrite all string literals to use ``b''`` to indicate
         # byte strings, we apply this token transformer to insert the ``b``
         # prefix nearly everywhere.
         if t.type == token.STRING and t not in sysstrtokens:
             s = t.string
 
             # Preserve docstrings as string literals. This is inconsistent
             # with regular unprefixed strings. However, the
             # "from __future__" parsing (which allows a module docstring to
             # exist before it) doesn't properly handle the docstring if it
             # is b''' prefixed, leading to a SyntaxError. We leave all
             # docstrings as unprefixed to avoid this. This means Mercurial
             # components touching docstrings need to handle unicode,
             # unfortunately.
             if s[0:3] in ("'''", '"""'):
                 # If it's assigned to something, it's not a docstring
                 if not _isop(i - 1, '='):
                     yield adjusttokenpos(t, coloffset)
                     continue
 
             # If the first character isn't a quote, it is likely a string
             # prefixing character (such as 'b', 'u', or 'r'. Ignore.
             if s[0] not in ("'", '"'):
                 yield adjusttokenpos(t, coloffset)
                 continue
 
             # String literal. Prefix to make a b'' string.
             yield adjusttokenpos(t._replace(string='b%s' % t.string),
                                  coloffset)
             coldelta += 1
             continue
 
         # This looks like a function call.
         if t.type == token.NAME and _isop(i + 1, '('):
             fn = t.string
 
             # *attr() builtins don't accept byte strings to 2nd argument.
             if (fn in ('getattr', 'setattr', 'hasattr', 'safehasattr') and
                 not _isop(i - 1, '.')):
                 arg1idx = _findargnofcall(1)
                 if arg1idx is not None:
                     _ensuresysstr(arg1idx)
 
             # .encode() and .decode() on str/bytes/unicode don't accept
             # byte strings on Python 3.
             elif fn in ('encode', 'decode') and _isop(i - 1, '.'):
                 for argn in range(2):
                     argidx = _findargnofcall(argn)
                     if argidx is not None:
                         _ensuresysstr(argidx)
 
             # It changes iteritems/values to items/values as they are not
             # present in Python 3 world.
             elif opts['dictiter'] and fn in ('iteritems', 'itervalues'):
                 yield adjusttokenpos(t._replace(string=fn[4:]), coloffset)
                 continue
 
         # Looks like "if __name__ == '__main__'".
         if (t.type == token.NAME and t.string == '__name__'
             and _isop(i + 1, '==')):
             _ensuresysstr(i + 2)
 
         # Emit unmodified token.
         yield adjusttokenpos(t, coloffset)
 
 def process(fin, fout, opts):
     tokens = tokenize.tokenize(fin.readline)
     tokens = replacetokens(list(tokens), opts)
     fout.write(tokenize.untokenize(tokens))
 
 def tryunlink(fname):
     try:
         os.unlink(fname)
     except OSError as err:
         if err.errno != errno.ENOENT:
             raise
 
 @contextlib.contextmanager
 def editinplace(fname):
     n = os.path.basename(fname)
     d = os.path.dirname(fname)
     fp = tempfile.NamedTemporaryFile(prefix='.%s-' % n, suffix='~', dir=d,
                                      delete=False)
     try:
         yield fp
         fp.close()
         if os.name == 'nt':
             tryunlink(fname)
         os.rename(fp.name, fname)
     finally:
         fp.close()
         tryunlink(fp.name)
 
 def main():
     ap = argparse.ArgumentParser()
     ap.add_argument('-i', '--inplace', action='store_true', default=False,
                     help='edit files in place')
     ap.add_argument('--dictiter', action='store_true', default=False,
                     help='rewrite iteritems() and itervalues()'),
     ap.add_argument('files', metavar='FILE', nargs='+', help='source file')
     args = ap.parse_args()
     opts = {
         'dictiter': args.dictiter,
     }
     for fname in args.files:
         if args.inplace:
             with editinplace(fname) as fout:
                 with open(fname, 'rb') as fin:
                     process(fin, fout, opts)
         else:
             with open(fname, 'rb') as fin:
                 fout = sys.stdout.buffer
                 process(fin, fout, opts)
 
 if __name__ == '__main__':
     if sys.version_info.major < 3:
         print('This script must be run under Python 3.')
         sys.exit(3)
     main()
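
For a quick check of the new behaviour, the transformer can be driven on an in-memory buffer. This is a minimal sketch, not part of the changeset: it assumes replacetokens() from this script is in scope (for example, pasted into the module or imported from it), and the sample source is made up.

    import io
    import tokenize

    # Hypothetical input: the second assignment is shielded by the new
    # ignore comment, so only the first string literal gains a b'' prefix.
    source = (
        b"x = 'rewritten'\n"
        b"#no-py3-transform\n"
        b"y = 'left alone'\n"
    )
    toks = tokenize.tokenize(io.BytesIO(source).readline)
    out = tokenize.untokenize(replacetokens(list(toks), {'dictiter': False}))
    print(out.decode('utf-8'))
    # Expected output:
    #   x = b'rewritten'
    #   #no-py3-transform
    #   y = 'left alone'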