byteify-strings: do not rewrite system string literals to u''...
Yuya Nishihara
r38408:1d68fd5f default
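In effect, string literals passed where Python 3 requires a native str (the attribute name given to the *attr() builtins, the arguments of .encode()/.decode()) are no longer rewritten to u'' literals; they are remembered in a sysstrtokens set and left unprefixed, while ordinary literals still gain the b'' prefix. A before/after sketch on a hypothetical input (obj and the literal values are illustrative only):

    # input source
    setattr(obj, 'name', 'value')

    # transformer output before this change: attribute name forced to u''
    setattr(obj, u'name', b'value')

    # transformer output after this change: attribute name left as native str
    setattr(obj, 'name', b'value')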
@@ -1,189 +1,191 @@
 #!/usr/bin/env python3
 #
 # byteify-strings.py - transform string literals to be Python 3 safe
 #
 # Copyright 2015 Gregory Szorc <gregory.szorc@gmail.com>
 #
 # This software may be used and distributed according to the terms of the
 # GNU General Public License version 2 or any later version.
 
 from __future__ import absolute_import
 
 import argparse
 import contextlib
 import errno
 import os
 import sys
 import tempfile
 import token
 import tokenize
 
 if True:
     def replacetokens(tokens, opts):
         """Transform a stream of tokens from raw to Python 3.
 
         Returns a generator of possibly rewritten tokens.
 
         The input token list may be mutated as part of processing. However,
         its changes do not necessarily match the output token stream.
         """
+        sysstrtokens = set()
+
         # The following utility functions access the tokens list and i index of
         # the for i, t enumerate(tokens) loop below
         def _isop(j, *o):
             """Assert that tokens[j] is an OP with one of the given values"""
             try:
                 return tokens[j].type == token.OP and tokens[j].string in o
             except IndexError:
                 return False
 
         def _findargnofcall(n):
             """Find arg n of a call expression (start at 0)
 
             Returns index of the first token of that argument, or None if
             there is not that many arguments.
 
             Assumes that token[i + 1] is '('.
 
             """
             nested = 0
             for j in range(i + 2, len(tokens)):
                 if _isop(j, ')', ']', '}'):
                     # end of call, tuple, subscription or dict / set
                     nested -= 1
                     if nested < 0:
                         return None
                 elif n == 0:
                     # this is the starting position of arg
                     return j
                 elif _isop(j, '(', '[', '{'):
                     nested += 1
                 elif _isop(j, ',') and nested == 0:
                     n -= 1
 
             return None
 
-        def _ensureunicode(j):
-            """Make sure the token at j is a unicode string
+        def _ensuresysstr(j):
+            """Make sure the token at j is a system string
 
-            This rewrites a string token to include the unicode literal prefix
-            so the string transformer won't add the byte prefix.
+            Remember the given token so the string transformer won't add
+            the byte prefix.
 
             Ignores tokens that are not strings. Assumes bounds checking has
             already been done.
 
             """
             st = tokens[j]
             if st.type == token.STRING and st.string.startswith(("'", '"')):
-                tokens[j] = st._replace(string='u%s' % st.string)
+                sysstrtokens.add(st)
 
         for i, t in enumerate(tokens):
             # Convert most string literals to byte literals. String literals
             # in Python 2 are bytes. String literals in Python 3 are unicode.
             # Most strings in Mercurial are bytes and unicode strings are rare.
             # Rather than rewrite all string literals to use ``b''`` to indicate
             # byte strings, we apply this token transformer to insert the ``b``
             # prefix nearly everywhere.
-            if t.type == token.STRING:
+            if t.type == token.STRING and t not in sysstrtokens:
                 s = t.string
 
                 # Preserve docstrings as string literals. This is inconsistent
                 # with regular unprefixed strings. However, the
                 # "from __future__" parsing (which allows a module docstring to
                 # exist before it) doesn't properly handle the docstring if it
                 # is b''' prefixed, leading to a SyntaxError. We leave all
                 # docstrings as unprefixed to avoid this. This means Mercurial
                 # components touching docstrings need to handle unicode,
                 # unfortunately.
                 if s[0:3] in ("'''", '"""'):
                     yield t
                     continue
 
                 # If the first character isn't a quote, it is likely a string
                 # prefixing character (such as 'b', 'u', or 'r'. Ignore.
                 if s[0] not in ("'", '"'):
                     yield t
                     continue
 
                 # String literal. Prefix to make a b'' string.
                 yield t._replace(string='b%s' % t.string)
                 continue
 
             # This looks like a function call.
             if t.type == token.NAME and _isop(i + 1, '('):
                 fn = t.string
 
                 # *attr() builtins don't accept byte strings to 2nd argument.
                 if (fn in ('getattr', 'setattr', 'hasattr', 'safehasattr') and
                         not _isop(i - 1, '.')):
                     arg1idx = _findargnofcall(1)
                     if arg1idx is not None:
-                        _ensureunicode(arg1idx)
+                        _ensuresysstr(arg1idx)
 
                 # .encode() and .decode() on str/bytes/unicode don't accept
                 # byte strings on Python 3.
                 elif fn in ('encode', 'decode') and _isop(i - 1, '.'):
                     for argn in range(2):
                         argidx = _findargnofcall(argn)
                         if argidx is not None:
-                            _ensureunicode(argidx)
+                            _ensuresysstr(argidx)
 
                 # It changes iteritems/values to items/values as they are not
                 # present in Python 3 world.
                 elif opts['dictiter'] and fn in ('iteritems', 'itervalues'):
                     yield t._replace(string=fn[4:])
                     continue
 
             # Emit unmodified token.
             yield t
 
 def process(fin, fout, opts):
     tokens = tokenize.tokenize(fin.readline)
     tokens = replacetokens(list(tokens), opts)
     fout.write(tokenize.untokenize(tokens))
 
 def tryunlink(fname):
     try:
         os.unlink(fname)
     except OSError as err:
         if err.errno != errno.ENOENT:
             raise
 
 @contextlib.contextmanager
 def editinplace(fname):
     n = os.path.basename(fname)
     d = os.path.dirname(fname)
     fp = tempfile.NamedTemporaryFile(prefix='.%s-' % n, suffix='~', dir=d,
                                      delete=False)
     try:
         yield fp
         fp.close()
         if os.name == 'nt':
             tryunlink(fname)
         os.rename(fp.name, fname)
     finally:
         fp.close()
         tryunlink(fp.name)
 
 def main():
     ap = argparse.ArgumentParser()
     ap.add_argument('-i', '--inplace', action='store_true', default=False,
                     help='edit files in place')
     ap.add_argument('--dictiter', action='store_true', default=False,
                     help='rewrite iteritems() and itervalues()'),
     ap.add_argument('files', metavar='FILE', nargs='+', help='source file')
     args = ap.parse_args()
     opts = {
         'dictiter': args.dictiter,
     }
     for fname in args.files:
         if args.inplace:
             with editinplace(fname) as fout:
                 with open(fname, 'rb') as fin:
                     process(fin, fout, opts)
         else:
             with open(fname, 'rb') as fin:
                 fout = sys.stdout.buffer
                 process(fin, fout, opts)
 
 if __name__ == '__main__':
     main()
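For reference, an invocation sketch based on the argument parser in main() above (module.py is a hypothetical file name):

    $ python3 byteify-strings.py module.py
    $ python3 byteify-strings.py -i --dictiter module.py

The first form writes the transformed source to stdout; the second edits the file in place and also rewrites iteritems()/itervalues() to items()/values().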
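And a minimal sketch of the process() pipeline driven by hand through the tokenize module, assuming replacetokens() from the listing above is in scope (the hyphenated script name is not directly importable, so this is illustrative only):

    import io
    import tokenize

    source = b"x = 'value'\nsetattr(obj, 'name', 'other')\n"
    # tokenize.tokenize() expects a readline callable over a bytes stream
    tokens = list(tokenize.tokenize(io.BytesIO(source).readline))
    # replacetokens() yields rewritten tokens; untokenize() uses the leading
    # ENCODING token to encode the result back to bytes
    result = tokenize.untokenize(replacetokens(tokens, {'dictiter': False}))
    print(result.decode('utf-8'))
    # expected:
    #   x = b'value'
    #   setattr(obj, 'name', b'other')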