byteify-strings: handle triple quoted strings if they are not docstrings...
Raphaël Gomès
r42905:e9592e11 default
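As a quick illustration of the change (a hypothetical input, not part of the commit): a triple-quoted string that is assigned to a name is no longer treated as a docstring, so it now receives the b'' prefix like any other literal, while real docstrings stay unprefixed:

    # input
    QUERY = '''SELECT 1'''

    def f():
        '''this docstring is left unprefixed'''

    # output after this revision
    QUERY = b'''SELECT 1'''

    def f():
        '''this docstring is left unprefixed'''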
@@ -1,243 +1,245 @@
#!/usr/bin/env python3
#
# byteify-strings.py - transform string literals to be Python 3 safe
#
# Copyright 2015 Gregory Szorc <gregory.szorc@gmail.com>
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.

from __future__ import absolute_import, print_function

import argparse
import contextlib
import errno
import os
import sys
import tempfile
import token
import tokenize

def adjusttokenpos(t, ofs):
    """Adjust start/end column of the given token"""
    return t._replace(start=(t.start[0], t.start[1] + ofs),
                      end=(t.end[0], t.end[1] + ofs))

def replacetokens(tokens, opts):
    """Transform a stream of tokens from raw to Python 3.

    Returns a generator of possibly rewritten tokens.

    The input token list may be mutated as part of processing. However,
    its changes do not necessarily match the output token stream.
    """
    sysstrtokens = set()

    # The following utility functions access the tokens list and i index of
    # the for i, t in enumerate(tokens) loop below
    def _isop(j, *o):
        """Assert that tokens[j] is an OP with one of the given values"""
        try:
            return tokens[j].type == token.OP and tokens[j].string in o
        except IndexError:
            return False

    def _findargnofcall(n):
        """Find arg n of a call expression (start at 0)

        Returns index of the first token of that argument, or None if
        there are not that many arguments.

        Assumes that token[i + 1] is '('.

        """
        nested = 0
        for j in range(i + 2, len(tokens)):
            if _isop(j, ')', ']', '}'):
                # end of call, tuple, subscription or dict / set
                nested -= 1
                if nested < 0:
                    return None
            elif n == 0:
                # this is the starting position of arg
                return j
            elif _isop(j, '(', '[', '{'):
                nested += 1
            elif _isop(j, ',') and nested == 0:
                n -= 1

        return None

    def _ensuresysstr(j):
        """Make sure the token at j is a system string

        Remember the given token so the string transformer won't add
        the byte prefix.

        Ignores tokens that are not strings. Assumes bounds checking has
        already been done.

        """
        k = j
        currtoken = tokens[k]
        while currtoken.type in (token.STRING, token.NEWLINE, tokenize.NL):
            k += 1
            if (
                currtoken.type == token.STRING
                and currtoken.string.startswith(("'", '"'))
            ):
                sysstrtokens.add(currtoken)
            try:
                currtoken = tokens[k]
            except IndexError:
                break

    coldelta = 0  # column increment for new opening parens
    coloffset = -1  # column offset for the current line (-1: TBD)
    parens = [(0, 0, 0)]  # stack of (line, end-column, column-offset)
    for i, t in enumerate(tokens):
        # Compute the column offset for the current line, such that
        # the current line will be aligned to the last opening paren
        # as before.
        if coloffset < 0:
            if t.start[1] == parens[-1][1]:
                coloffset = parens[-1][2]
            elif t.start[1] + 1 == parens[-1][1]:
                # fix misaligned indent of s/util.Abort/error.Abort/
                coloffset = parens[-1][2] + (parens[-1][1] - t.start[1])
            else:
                coloffset = 0

        # Reset per-line attributes at EOL.
        if t.type in (token.NEWLINE, tokenize.NL):
            yield adjusttokenpos(t, coloffset)
            coldelta = 0
            coloffset = -1
            continue

        # Remember the last paren position.
        if _isop(i, '(', '[', '{'):
            parens.append(t.end + (coloffset + coldelta,))
        elif _isop(i, ')', ']', '}'):
            parens.pop()

        # Convert most string literals to byte literals. String literals
        # in Python 2 are bytes. String literals in Python 3 are unicode.
        # Most strings in Mercurial are bytes and unicode strings are rare.
        # Rather than rewrite all string literals to use ``b''`` to indicate
        # byte strings, we apply this token transformer to insert the ``b``
        # prefix nearly everywhere.
        if t.type == token.STRING and t not in sysstrtokens:
            s = t.string

            # Preserve docstrings as string literals. This is inconsistent
            # with regular unprefixed strings. However, the
            # "from __future__" parsing (which allows a module docstring to
            # exist before it) doesn't properly handle the docstring if it
            # is b''' prefixed, leading to a SyntaxError. We leave all
            # docstrings as unprefixed to avoid this. This means Mercurial
            # components touching docstrings need to handle unicode,
            # unfortunately.
            if s[0:3] in ("'''", '"""'):
                # If it's assigned to something, it's not a docstring
                if not _isop(i - 1, '='):
                    yield adjusttokenpos(t, coloffset)
                    continue

            # If the first character isn't a quote, it is likely a string
            # prefixing character (such as 'b', 'u', or 'r'). Ignore.
            if s[0] not in ("'", '"'):
                yield adjusttokenpos(t, coloffset)
                continue

            # String literal. Prefix to make a b'' string.
            yield adjusttokenpos(t._replace(string='b%s' % t.string),
                                 coloffset)
            coldelta += 1
            continue

        # This looks like a function call.
        if t.type == token.NAME and _isop(i + 1, '('):
            fn = t.string

            # *attr() builtins don't accept byte strings as 2nd argument.
            if (fn in ('getattr', 'setattr', 'hasattr', 'safehasattr') and
                not _isop(i - 1, '.')):
                arg1idx = _findargnofcall(1)
                if arg1idx is not None:
                    _ensuresysstr(arg1idx)

            # .encode() and .decode() on str/bytes/unicode don't accept
            # byte strings on Python 3.
            elif fn in ('encode', 'decode') and _isop(i - 1, '.'):
                for argn in range(2):
                    argidx = _findargnofcall(argn)
                    if argidx is not None:
                        _ensuresysstr(argidx)

            # It changes iteritems/itervalues to items/values as they are
            # not present in the Python 3 world.
            elif opts['dictiter'] and fn in ('iteritems', 'itervalues'):
                yield adjusttokenpos(t._replace(string=fn[4:]), coloffset)
                continue

        # Looks like "if __name__ == '__main__'".
        if (t.type == token.NAME and t.string == '__name__'
            and _isop(i + 1, '==')):
            _ensuresysstr(i + 2)

        # Emit unmodified token.
        yield adjusttokenpos(t, coloffset)

def process(fin, fout, opts):
    tokens = tokenize.tokenize(fin.readline)
    tokens = replacetokens(list(tokens), opts)
    fout.write(tokenize.untokenize(tokens))

def tryunlink(fname):
    try:
        os.unlink(fname)
    except OSError as err:
        if err.errno != errno.ENOENT:
            raise

@contextlib.contextmanager
def editinplace(fname):
    n = os.path.basename(fname)
    d = os.path.dirname(fname)
    fp = tempfile.NamedTemporaryFile(prefix='.%s-' % n, suffix='~', dir=d,
                                     delete=False)
    try:
        yield fp
        fp.close()
        if os.name == 'nt':
            tryunlink(fname)
        os.rename(fp.name, fname)
    finally:
        fp.close()
        tryunlink(fp.name)

def main():
    ap = argparse.ArgumentParser()
    ap.add_argument('-i', '--inplace', action='store_true', default=False,
                    help='edit files in place')
    ap.add_argument('--dictiter', action='store_true', default=False,
                    help='rewrite iteritems() and itervalues()')
    ap.add_argument('files', metavar='FILE', nargs='+', help='source file')
    args = ap.parse_args()
    opts = {
        'dictiter': args.dictiter,
    }
    for fname in args.files:
        if args.inplace:
            with editinplace(fname) as fout:
                with open(fname, 'rb') as fin:
                    process(fin, fout, opts)
        else:
            with open(fname, 'rb') as fin:
                fout = sys.stdout.buffer
                process(fin, fout, opts)

if __name__ == '__main__':
    if sys.version_info.major < 3:
        print('This script must be run under Python 3.')
        sys.exit(3)
    main()
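A minimal sketch of driving the transformer programmatically, assuming the functions above are in scope (e.g. the script has been saved under an importable name or pasted into a session); process() only needs binary file-like objects and the opts dict mirrors the --dictiter flag:

import io

# Hypothetical demo input: an assigned triple-quoted string, a docstring,
# and a plain literal.
src = (
    b"QUERY = '''SELECT 1'''\n"
    b"def f():\n"
    b"    '''docstring'''\n"
    b"    return 'ok'\n"
)
fin = io.BytesIO(src)
fout = io.BytesIO()
process(fin, fout, {'dictiter': False})
print(fout.getvalue().decode('utf-8'))
# With this revision, QUERY's triple-quoted value and 'ok' gain the b
# prefix, while the docstring is emitted unchanged.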