byteify-strings: add --inplace option to write back result
Yuya Nishihara
r38405:9f42e4a8 default
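This change teaches the script to rewrite files in place: python3 byteify-strings.py --inplace FILE ... (short form -i) replaces each FILE with the transformed result instead of writing it to stdout, using the new editinplace helper below to make the replacement safe against partial writes.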
@@ -1,175 +1,209 @@
 #!/usr/bin/env python3
 #
 # byteify-strings.py - transform string literals to be Python 3 safe
 #
 # Copyright 2015 Gregory Szorc <gregory.szorc@gmail.com>
 #
 # This software may be used and distributed according to the terms of the
 # GNU General Public License version 2 or any later version.
 
 from __future__ import absolute_import
 
 import argparse
+import contextlib
+import errno
 import io
+import os
 import sys
+import tempfile
 import token
 import tokenize
 
 if True:
     def replacetokens(tokens, fullname):
         """Transform a stream of tokens from raw to Python 3.
 
         Returns a generator of possibly rewritten tokens.
 
         The input token list may be mutated as part of processing. However,
         its changes do not necessarily match the output token stream.
         """
         futureimpline = False
 
         # The following utility functions access the tokens list and the i
         # index of the "for i, t in enumerate(tokens)" loop below.
         def _isop(j, *o):
             """Assert that tokens[j] is an OP with one of the given values"""
             try:
                 return tokens[j].type == token.OP and tokens[j].string in o
             except IndexError:
                 return False
 
         def _findargnofcall(n):
             """Find arg n of a call expression (start at 0)
 
             Returns index of the first token of that argument, or None if
             there are not that many arguments.
 
             Assumes that token[i + 1] is '('.
 
             """
             nested = 0
             for j in range(i + 2, len(tokens)):
                 if _isop(j, ')', ']', '}'):
                     # end of call, tuple, subscription or dict / set
                     nested -= 1
                     if nested < 0:
                         return None
                 elif n == 0:
                     # this is the starting position of arg
                     return j
                 elif _isop(j, '(', '[', '{'):
                     nested += 1
                 elif _isop(j, ',') and nested == 0:
                     n -= 1
 
             return None
 
         def _ensureunicode(j):
             """Make sure the token at j is a unicode string
 
             This rewrites a string token to include the unicode literal prefix
             so the string transformer won't add the byte prefix.
 
             Ignores tokens that are not strings. Assumes bounds checking has
             already been done.
 
             """
             st = tokens[j]
             if st.type == token.STRING and st.string.startswith(("'", '"')):
                 tokens[j] = st._replace(string='u%s' % st.string)
 
         for i, t in enumerate(tokens):
             # Convert most string literals to byte literals. String literals
             # in Python 2 are bytes. String literals in Python 3 are unicode.
             # Most strings in Mercurial are bytes and unicode strings are rare.
             # Rather than rewrite all string literals to use ``b''`` to indicate
             # byte strings, we apply this token transformer to insert the ``b``
             # prefix nearly everywhere.
             if t.type == token.STRING:
                 s = t.string
 
                 # Preserve docstrings as string literals. This is inconsistent
                 # with regular unprefixed strings. However, the
                 # "from __future__" parsing (which allows a module docstring to
                 # exist before it) doesn't properly handle the docstring if it
                 # is b''' prefixed, leading to a SyntaxError. We leave all
                 # docstrings as unprefixed to avoid this. This means Mercurial
                 # components touching docstrings need to handle unicode,
                 # unfortunately.
                 if s[0:3] in ("'''", '"""'):
                     yield t
                     continue
 
                 # If the first character isn't a quote, it is likely a string
                 # prefixing character (such as 'b', 'u', or 'r'). Ignore.
                 if s[0] not in ("'", '"'):
                     yield t
                     continue
 
                 # String literal. Prefix to make a b'' string.
                 yield t._replace(string='b%s' % t.string)
                 continue
 
             # Insert compatibility imports at "from __future__ import" line.
             # No '\n' should be added to preserve line numbers.
             if (t.type == token.NAME and t.string == 'import' and
                     all(u.type == token.NAME for u in tokens[i - 2:i]) and
                     [u.string for u in tokens[i - 2:i]] == ['from', '__future__']):
                 futureimpline = True
             if t.type == token.NEWLINE and futureimpline:
                 futureimpline = False
                 if fullname == 'mercurial.pycompat':
                     yield t
                     continue
                 r, c = t.start
                 l = (b'; from mercurial.pycompat import '
                      b'delattr, getattr, hasattr, setattr, xrange, '
                      b'open, unicode\n')
                 for u in tokenize.tokenize(io.BytesIO(l).readline):
                     if u.type in (tokenize.ENCODING, token.ENDMARKER):
                         continue
                     yield u._replace(
                         start=(r, c + u.start[1]), end=(r, c + u.end[1]))
                 continue
 
             # This looks like a function call.
             if t.type == token.NAME and _isop(i + 1, '('):
                 fn = t.string
 
                 # *attr() builtins don't accept byte strings as 2nd argument.
                 if (fn in ('getattr', 'setattr', 'hasattr', 'safehasattr') and
                         not _isop(i - 1, '.')):
                     arg1idx = _findargnofcall(1)
                     if arg1idx is not None:
                         _ensureunicode(arg1idx)
 
                 # .encode() and .decode() on str/bytes/unicode don't accept
                 # byte strings on Python 3.
                 elif fn in ('encode', 'decode') and _isop(i - 1, '.'):
                     for argn in range(2):
                         argidx = _findargnofcall(argn)
                         if argidx is not None:
                             _ensureunicode(argidx)
 
                 # Rewrite iteritems/itervalues to items/values, as the former
                 # are not present in Python 3.
                 elif fn in ('iteritems', 'itervalues'):
                     yield t._replace(string=fn[4:])
                     continue
 
             # Emit unmodified token.
             yield t
 
 def process(fin, fout):
     tokens = tokenize.tokenize(fin.readline)
     tokens = replacetokens(list(tokens), fullname='<dummy>')
     fout.write(tokenize.untokenize(tokens))
 
+def tryunlink(fname):
+    try:
+        os.unlink(fname)
+    except OSError as err:
+        if err.errno != errno.ENOENT:
+            raise
+
+@contextlib.contextmanager
+def editinplace(fname):
+    n = os.path.basename(fname)
+    d = os.path.dirname(fname)
+    fp = tempfile.NamedTemporaryFile(prefix='.%s-' % n, suffix='~', dir=d,
+                                     delete=False)
+    try:
+        yield fp
+        fp.close()
+        if os.name == 'nt':
+            tryunlink(fname)
+        os.rename(fp.name, fname)
+    finally:
+        fp.close()
+        tryunlink(fp.name)
+
 def main():
     ap = argparse.ArgumentParser()
+    ap.add_argument('-i', '--inplace', action='store_true', default=False,
+                    help='edit files in place')
     ap.add_argument('files', metavar='FILE', nargs='+', help='source file')
     args = ap.parse_args()
     for fname in args.files:
-        with open(fname, 'rb') as fin:
-            fout = sys.stdout.buffer
-            process(fin, fout)
+        if args.inplace:
+            with editinplace(fname) as fout:
+                with open(fname, 'rb') as fin:
+                    process(fin, fout)
+        else:
+            with open(fname, 'rb') as fin:
+                fout = sys.stdout.buffer
+                process(fin, fout)
 
 if __name__ == '__main__':
     main()
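The new editinplace helper is the heart of the change: it hands its caller a NamedTemporaryFile created next to the target file and renames it over the original only if the caller's block completes, so an exception never leaves a half-written file behind. A minimal sketch of driving that code path by hand, assuming the script has been saved under the importable name byteify_strings.py (the shipped file is byteify-strings.py, whose hyphen makes it unimportable, so the module name here is an assumption):

    # Sketch of the --inplace code path, invoked manually. Assumes the
    # script was saved as byteify_strings.py so it can be imported;
    # 'example.py' is a hypothetical input file.
    from byteify_strings import editinplace, process

    fname = 'example.py'
    with editinplace(fname) as fout:
        with open(fname, 'rb') as fin:
            process(fin, fout)
    # On success, the temporary file (named like '.example.py-XXXX~' in
    # the same directory) is renamed over example.py; on Windows the old
    # file is unlinked first because os.rename() there cannot replace an
    # existing file.
    # On failure, the finally clause closes and removes the temporary
    # file, leaving example.py exactly as it was.

Creating the temporary file in the target's own directory, rather than in the system temp directory, is what makes the final os.rename() safe: a rename is only atomic within a single filesystem.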