##// END OF EJS Templates
byteify-strings: do not rewrite iteritems() and itervalues() by default...
Yuya Nishihara -
r38407:f701bc93 default
parent child Browse files
Show More
@@ -1,184 +1,189 b''
1 #!/usr/bin/env python3
1 #!/usr/bin/env python3
2 #
2 #
3 # byteify-strings.py - transform string literals to be Python 3 safe
3 # byteify-strings.py - transform string literals to be Python 3 safe
4 #
4 #
5 # Copyright 2015 Gregory Szorc <gregory.szorc@gmail.com>
5 # Copyright 2015 Gregory Szorc <gregory.szorc@gmail.com>
6 #
6 #
7 # This software may be used and distributed according to the terms of the
7 # This software may be used and distributed according to the terms of the
8 # GNU General Public License version 2 or any later version.
8 # GNU General Public License version 2 or any later version.
9
9
10 from __future__ import absolute_import
10 from __future__ import absolute_import
11
11
12 import argparse
12 import argparse
13 import contextlib
13 import contextlib
14 import errno
14 import errno
15 import os
15 import os
16 import sys
16 import sys
17 import tempfile
17 import tempfile
18 import token
18 import token
19 import tokenize
19 import tokenize
20
20
if True:
    def replacetokens(tokens, opts):
        """Transform a stream of tokens from raw to Python 3.

        Returns a generator of possibly rewritten tokens.

        The input token list may be mutated as part of processing. However,
        its changes do not necessarily match the output token stream.
        """
        # The helpers below close over ``tokens`` and the index ``pos`` of
        # the ``for pos, tok in enumerate(tokens)`` loop further down.
        def _isop(j, *o):
            """Assert that tokens[j] is an OP with one of the given values"""
            try:
                return tokens[j].type == token.OP and tokens[j].string in o
            except IndexError:
                return False

        def _findargnofcall(n):
            """Find arg n of a call expression (start at 0)

            Returns index of the first token of that argument, or None if
            there is not that many arguments.

            Assumes that token[pos + 1] is '('.
            """
            nested = 0
            for j in range(pos + 2, len(tokens)):
                if _isop(j, ')', ']', '}'):
                    # end of call, tuple, subscription or dict / set
                    nested -= 1
                    if nested < 0:
                        return None
                elif n == 0:
                    # this is the starting position of arg
                    return j
                elif _isop(j, '(', '[', '{'):
                    nested += 1
                elif _isop(j, ',') and nested == 0:
                    n -= 1
            return None

        def _ensureunicode(j):
            """Make sure the token at j is a unicode string

            This rewrites a string token to include the unicode literal
            prefix so the string transformer won't add the byte prefix.

            Ignores tokens that are not strings. Assumes bounds checking has
            already been done.
            """
            st = tokens[j]
            if st.type == token.STRING and st.string.startswith(("'", '"')):
                tokens[j] = st._replace(string='u%s' % st.string)

        for pos, tok in enumerate(tokens):
            # Convert most string literals to byte literals. String literals
            # in Python 2 are bytes. String literals in Python 3 are unicode.
            # Most strings in Mercurial are bytes and unicode strings are
            # rare. Rather than rewrite all string literals to use ``b''``
            # to indicate byte strings, we apply this token transformer to
            # insert the ``b`` prefix nearly everywhere.
            if tok.type == token.STRING:
                text = tok.string

                # Preserve docstrings as string literals. This is
                # inconsistent with regular unprefixed strings. However, the
                # "from __future__" parsing (which allows a module docstring
                # to exist before it) doesn't properly handle the docstring
                # if it is b''' prefixed, leading to a SyntaxError. We leave
                # all docstrings as unprefixed to avoid this. This means
                # Mercurial components touching docstrings need to handle
                # unicode, unfortunately.
                isdocstring = text[0:3] in ("'''", '"""')

                # If the first character isn't a quote, it is likely a
                # string prefixing character (such as 'b', 'u', or 'r');
                # leave those alone as well.
                isprefixed = text[0] not in ("'", '"')

                if isdocstring or isprefixed:
                    yield tok
                else:
                    # Plain string literal. Prefix to make a b'' string.
                    yield tok._replace(string='b%s' % text)
                continue

            # This looks like a function call.
            if tok.type == token.NAME and _isop(pos + 1, '('):
                fn = tok.string

                # *attr() builtins don't accept byte strings to 2nd argument.
                if (fn in ('getattr', 'setattr', 'hasattr', 'safehasattr')
                        and not _isop(pos - 1, '.')):
                    arg1idx = _findargnofcall(1)
                    if arg1idx is not None:
                        _ensureunicode(arg1idx)

                # .encode() and .decode() on str/bytes/unicode don't accept
                # byte strings on Python 3.
                elif fn in ('encode', 'decode') and _isop(pos - 1, '.'):
                    for argn in range(2):
                        argidx = _findargnofcall(argn)
                        if argidx is not None:
                            _ensureunicode(argidx)

                # iteritems()/itervalues() don't exist in the Python 3
                # world; rename them to items()/values(), but only when
                # explicitly requested via the dictiter option.
                elif opts['dictiter'] and fn in ('iteritems', 'itervalues'):
                    yield tok._replace(string=fn[4:])
                    continue

            # Emit unmodified token.
            yield tok
def process(fin, fout, opts):
    """Tokenize fin, byteify the token stream, and write it to fout.

    fin must be opened in binary mode; fout receives bytes.
    """
    raw = tokenize.tokenize(fin.readline)
    rewritten = replacetokens(list(raw), opts)
    fout.write(tokenize.untokenize(rewritten))
def tryunlink(fname):
    """Remove fname, silently ignoring a file that does not exist."""
    try:
        os.unlink(fname)
    except OSError as err:
        # A missing file is fine; anything else (permissions, ...) is a
        # real failure and must propagate.
        if err.errno == errno.ENOENT:
            return
        raise
150
150
@contextlib.contextmanager
def editinplace(fname):
    """Yield a temporary file that replaces fname when the block succeeds.

    The temporary file lives in fname's directory so the final rename is
    atomic on POSIX; on failure the original file is left untouched and the
    temporary file is cleaned up.
    """
    base = os.path.basename(fname)
    dirpath = os.path.dirname(fname)
    fp = tempfile.NamedTemporaryFile(prefix='.%s-' % base, suffix='~',
                                     dir=dirpath, delete=False)
    try:
        yield fp
        fp.close()
        if os.name == 'nt':
            # Windows cannot rename over an existing file.
            tryunlink(fname)
        os.rename(fp.name, fname)
    finally:
        fp.close()
        tryunlink(fp.name)
166
166
def main():
    """Command line entry point: byteify the given source files.

    With --inplace the files are rewritten atomically; otherwise the
    transformed source is written to stdout. --dictiter additionally
    renames iteritems()/itervalues() calls to items()/values().
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('-i', '--inplace', action='store_true', default=False,
                    help='edit files in place')
    # NOTE: the original had a stray trailing comma after this call,
    # creating a useless discarded one-element tuple; removed.
    ap.add_argument('--dictiter', action='store_true', default=False,
                    help='rewrite iteritems() and itervalues()')
    ap.add_argument('files', metavar='FILE', nargs='+', help='source file')
    args = ap.parse_args()
    # Options forwarded down to the token transformer.
    opts = {
        'dictiter': args.dictiter,
    }
    for fname in args.files:
        if args.inplace:
            with editinplace(fname) as fout:
                with open(fname, 'rb') as fin:
                    process(fin, fout, opts)
        else:
            with open(fname, 'rb') as fin:
                fout = sys.stdout.buffer
                process(fin, fout, opts)

if __name__ == '__main__':
    main()
General Comments 0
You need to be logged in to leave comments. Login now