byteify-strings: try to preserve column alignment
Yuya Nishihara
r38409:47dd23e6 default
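Inserting the one-character b prefix lengthens string tokens, so every token
to the right of a rewritten literal drifts one column, and continuation lines
that were aligned to an opening paren fall out of alignment in the
untokenized output. This change therefore tracks a per-line column offset
(coloffset), the number of columns added on the current line (coldelta), and
a stack of open-paren positions (parens), and shifts each emitted token's
start/end columns to compensate. A hypothetical before/after (illustrative
input, not from the patch):

    warn('foo',          warn(b'foo',
         'bar')    =>         b'bar')

The continuation line's first token starts at the recorded end column of '('
on the stack, so it inherits that paren's stored offset and stays aligned.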
@@ -1,191 +1,226 @@
 #!/usr/bin/env python3
 #
 # byteify-strings.py - transform string literals to be Python 3 safe
 #
 # Copyright 2015 Gregory Szorc <gregory.szorc@gmail.com>
 #
 # This software may be used and distributed according to the terms of the
 # GNU General Public License version 2 or any later version.
 
 from __future__ import absolute_import
 
 import argparse
 import contextlib
 import errno
 import os
 import sys
 import tempfile
 import token
 import tokenize
 
+def adjusttokenpos(t, ofs):
+    """Adjust start/end column of the given token"""
+    return t._replace(start=(t.start[0], t.start[1] + ofs),
+                      end=(t.end[0], t.end[1] + ofs))
+
 if True:
     def replacetokens(tokens, opts):
         """Transform a stream of tokens from raw to Python 3.
 
         Returns a generator of possibly rewritten tokens.
 
         The input token list may be mutated as part of processing. However,
         its changes do not necessarily match the output token stream.
         """
         sysstrtokens = set()
 
         # The following utility functions access the tokens list and i index of
         # the for i, t in enumerate(tokens) loop below
         def _isop(j, *o):
             """Assert that tokens[j] is an OP with one of the given values"""
             try:
                 return tokens[j].type == token.OP and tokens[j].string in o
             except IndexError:
                 return False
 
         def _findargnofcall(n):
             """Find arg n of a call expression (start at 0)
 
             Returns index of the first token of that argument, or None if
             there are not that many arguments.
 
             Assumes that token[i + 1] is '('.
 
             """
             nested = 0
             for j in range(i + 2, len(tokens)):
                 if _isop(j, ')', ']', '}'):
                     # end of call, tuple, subscription or dict / set
                     nested -= 1
                     if nested < 0:
                         return None
                 elif n == 0:
                     # this is the starting position of arg
                     return j
                 elif _isop(j, '(', '[', '{'):
                     nested += 1
                 elif _isop(j, ',') and nested == 0:
                     n -= 1
 
             return None
 
         def _ensuresysstr(j):
             """Make sure the token at j is a system string
 
             Remember the given token so the string transformer won't add
             the byte prefix.
 
             Ignores tokens that are not strings. Assumes bounds checking has
             already been done.
 
             """
             st = tokens[j]
             if st.type == token.STRING and st.string.startswith(("'", '"')):
                 sysstrtokens.add(st)
 
+        coldelta = 0  # column increment for new opening parens
+        coloffset = -1  # column offset for the current line (-1: TBD)
+        parens = [(0, 0, 0)]  # stack of (line, end-column, column-offset)
         for i, t in enumerate(tokens):
+            # Compute the column offset for the current line, such that
+            # the current line will be aligned to the last opening paren
+            # as before.
+            if coloffset < 0:
+                if t.start[1] == parens[-1][1]:
+                    coloffset = parens[-1][2]
+                elif t.start[1] + 1 == parens[-1][1]:
+                    # fix misaligned indent of s/util.Abort/error.Abort/
+                    coloffset = parens[-1][2] + (parens[-1][1] - t.start[1])
+                else:
+                    coloffset = 0
+
+            # Reset per-line attributes at EOL.
+            if t.type in (token.NEWLINE, tokenize.NL):
+                yield adjusttokenpos(t, coloffset)
+                coldelta = 0
+                coloffset = -1
+                continue
+
+            # Remember the last paren position.
+            if _isop(i, '(', '[', '{'):
+                parens.append(t.end + (coloffset + coldelta,))
+            elif _isop(i, ')', ']', '}'):
+                parens.pop()
+
             # Convert most string literals to byte literals. String literals
             # in Python 2 are bytes. String literals in Python 3 are unicode.
             # Most strings in Mercurial are bytes and unicode strings are rare.
             # Rather than rewrite all string literals to use ``b''`` to indicate
             # byte strings, we apply this token transformer to insert the ``b``
             # prefix nearly everywhere.
             if t.type == token.STRING and t not in sysstrtokens:
                 s = t.string
 
                 # Preserve docstrings as string literals. This is inconsistent
                 # with regular unprefixed strings. However, the
                 # "from __future__" parsing (which allows a module docstring to
                 # exist before it) doesn't properly handle the docstring if it
                 # is b''' prefixed, leading to a SyntaxError. We leave all
                 # docstrings as unprefixed to avoid this. This means Mercurial
                 # components touching docstrings need to handle unicode,
                 # unfortunately.
                 if s[0:3] in ("'''", '"""'):
-                    yield t
+                    yield adjusttokenpos(t, coloffset)
                     continue
 
                 # If the first character isn't a quote, it is likely a string
                 # prefixing character (such as 'b', 'u', or 'r'). Ignore.
                 if s[0] not in ("'", '"'):
-                    yield t
+                    yield adjusttokenpos(t, coloffset)
                     continue
 
                 # String literal. Prefix to make a b'' string.
-                yield t._replace(string='b%s' % t.string)
+                yield adjusttokenpos(t._replace(string='b%s' % t.string),
+                                     coloffset)
+                coldelta += 1
                 continue
 
             # This looks like a function call.
             if t.type == token.NAME and _isop(i + 1, '('):
                 fn = t.string
 
                 # *attr() builtins don't accept byte strings as their 2nd argument.
                 if (fn in ('getattr', 'setattr', 'hasattr', 'safehasattr') and
                     not _isop(i - 1, '.')):
                     arg1idx = _findargnofcall(1)
                     if arg1idx is not None:
                         _ensuresysstr(arg1idx)
 
                 # .encode() and .decode() on str/bytes/unicode don't accept
                 # byte strings on Python 3.
                 elif fn in ('encode', 'decode') and _isop(i - 1, '.'):
                     for argn in range(2):
                         argidx = _findargnofcall(argn)
                         if argidx is not None:
                             _ensuresysstr(argidx)
 
                 # Rename iteritems/itervalues to items/values, as the former
                 # are not present in the Python 3 world.
                 elif opts['dictiter'] and fn in ('iteritems', 'itervalues'):
-                    yield t._replace(string=fn[4:])
+                    yield adjusttokenpos(t._replace(string=fn[4:]), coloffset)
                     continue
 
             # Emit unmodified token.
-            yield t
+            yield adjusttokenpos(t, coloffset)
 
 def process(fin, fout, opts):
     tokens = tokenize.tokenize(fin.readline)
     tokens = replacetokens(list(tokens), opts)
     fout.write(tokenize.untokenize(tokens))
 
 def tryunlink(fname):
     try:
         os.unlink(fname)
     except OSError as err:
         if err.errno != errno.ENOENT:
             raise
 
 @contextlib.contextmanager
 def editinplace(fname):
     n = os.path.basename(fname)
     d = os.path.dirname(fname)
     fp = tempfile.NamedTemporaryFile(prefix='.%s-' % n, suffix='~', dir=d,
                                      delete=False)
     try:
         yield fp
         fp.close()
         if os.name == 'nt':
             tryunlink(fname)
         os.rename(fp.name, fname)
     finally:
         fp.close()
         tryunlink(fp.name)
 
 def main():
     ap = argparse.ArgumentParser()
     ap.add_argument('-i', '--inplace', action='store_true', default=False,
                     help='edit files in place')
     ap.add_argument('--dictiter', action='store_true', default=False,
                     help='rewrite iteritems() and itervalues()')
     ap.add_argument('files', metavar='FILE', nargs='+', help='source file')
     args = ap.parse_args()
     opts = {
         'dictiter': args.dictiter,
     }
     for fname in args.files:
         if args.inplace:
             with editinplace(fname) as fout:
                 with open(fname, 'rb') as fin:
                     process(fin, fout, opts)
         else:
             with open(fname, 'rb') as fin:
                 fout = sys.stdout.buffer
                 process(fin, fout, opts)
 
 if __name__ == '__main__':
     main()
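For readers experimenting outside the Mercurial tree, here is a minimal
standalone sketch of the same technique (the input, the adjust() helper, and
the simplified offset rule are illustrative only; adjusttokenpos() and the
extra elif branch for misaligned indents in the patch above are the
authoritative version):

    import io
    import token
    import tokenize

    def adjust(t, ofs):
        # Shift a token's start/end columns by ofs, like adjusttokenpos().
        return t._replace(start=(t.start[0], t.start[1] + ofs),
                          end=(t.end[0], t.end[1] + ofs))

    def byteify(src):
        out = []
        coldelta = 0           # columns added so far on the current line
        coloffset = -1         # offset for the current line (-1: unknown)
        parens = [(0, 0, 0)]   # stack of (line, end-column, column-offset)
        for t in tokenize.tokenize(io.BytesIO(src).readline):
            if coloffset < 0:
                # First token of a line: if it lines up with the innermost
                # open paren, reuse that paren's stored offset.
                coloffset = parens[-1][2] if t.start[1] == parens[-1][1] else 0
            if t.type in (token.NEWLINE, tokenize.NL):
                out.append(adjust(t, coloffset))
                coldelta, coloffset = 0, -1  # offsets are per-line
                continue
            if t.type == token.OP and t.string in ('(', '[', '{'):
                parens.append(t.end + (coloffset + coldelta,))
            elif t.type == token.OP and t.string in (')', ']', '}'):
                parens.pop()
            if t.type == token.STRING and t.string[0] in "'\"":
                t = t._replace(string='b' + t.string)
                coldelta += 1  # everything to the right shifts one column
            out.append(adjust(t, coloffset))
        return tokenize.untokenize(out)

    print(byteify(b"f('a', g('b',\n         'c'))\n").decode('utf-8'))
    # f(b'a', g(b'b',
    #           b'c'))

Without the adjust() calls the second line would keep its old column and b'c'
would land one column left of b'b'. The script itself is invoked as defined
in main() above, e.g. "python3 byteify-strings.py --dictiter -i FILE" to
rewrite files in place, or without -i to write the transformed source to
stdout.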