byteify-strings: add support for ignore comments...
Raphaël Gomès - r42906:b9a20047 default
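The feature is driven by three magic comments recognized by the transformer (the exact strings appear in the hunk below): #no-py3-transform skips transformation of the line that follows it, while #py3-transform: off and #py3-transform: on disable and re-enable transformation for a whole region. A minimal sketch of an input file (variable names are hypothetical):

    greeting = 'hello'     # rewritten to b'hello'
    #no-py3-transform
    flag = 'native str'    # line after the marker: left untouched
    #py3-transform: off
    a = 'untouched'
    b = 'also untouched'
    #py3-transform: on
    c = 'bytes again'      # rewritten to b'bytes again'
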
@@ -1,245 +1,262 @@
 #!/usr/bin/env python3
 #
 # byteify-strings.py - transform string literals to be Python 3 safe
 #
 # Copyright 2015 Gregory Szorc <gregory.szorc@gmail.com>
 #
 # This software may be used and distributed according to the terms of the
 # GNU General Public License version 2 or any later version.
 
 from __future__ import absolute_import, print_function
 
 import argparse
 import contextlib
 import errno
 import os
 import sys
 import tempfile
 import token
 import tokenize
 
 def adjusttokenpos(t, ofs):
     """Adjust start/end column of the given token"""
     return t._replace(start=(t.start[0], t.start[1] + ofs),
                       end=(t.end[0], t.end[1] + ofs))
 
 def replacetokens(tokens, opts):
     """Transform a stream of tokens from raw to Python 3.
 
     Returns a generator of possibly rewritten tokens.
 
     The input token list may be mutated as part of processing. However,
     its changes do not necessarily match the output token stream.
     """
     sysstrtokens = set()
 
     # The following utility functions access the tokens list and the i index
     # of the for i, t in enumerate(tokens) loop below
     def _isop(j, *o):
         """Assert that tokens[j] is an OP with one of the given values"""
         try:
             return tokens[j].type == token.OP and tokens[j].string in o
         except IndexError:
             return False
 
     def _findargnofcall(n):
         """Find arg n of a call expression (start at 0)
 
         Returns index of the first token of that argument, or None if
         there are not that many arguments.
 
         Assumes that token[i + 1] is '('.
 
         """
         nested = 0
         for j in range(i + 2, len(tokens)):
             if _isop(j, ')', ']', '}'):
                 # end of call, tuple, subscription or dict / set
                 nested -= 1
                 if nested < 0:
                     return None
             elif n == 0:
                 # this is the starting position of arg
                 return j
             elif _isop(j, '(', '[', '{'):
                 nested += 1
             elif _isop(j, ',') and nested == 0:
                 n -= 1
 
         return None
 
     def _ensuresysstr(j):
         """Make sure the token at j is a system string
 
         Remember the given token so the string transformer won't add
         the byte prefix.
 
         Ignores tokens that are not strings. Assumes bounds checking has
         already been done.
 
         """
         k = j
         currtoken = tokens[k]
         while currtoken.type in (token.STRING, token.NEWLINE, tokenize.NL):
             k += 1
             if (
                 currtoken.type == token.STRING
                 and currtoken.string.startswith(("'", '"'))
             ):
                 sysstrtokens.add(currtoken)
             try:
                 currtoken = tokens[k]
             except IndexError:
                 break
 
     coldelta = 0  # column increment for new opening parens
     coloffset = -1  # column offset for the current line (-1: TBD)
     parens = [(0, 0, 0)]  # stack of (line, end-column, column-offset)
+    ignorenextline = False  # don't transform the next line
+    insideignoreblock = False  # don't transform until turned off
     for i, t in enumerate(tokens):
         # Compute the column offset for the current line, such that
         # the current line will be aligned to the last opening paren
         # as before.
         if coloffset < 0:
             if t.start[1] == parens[-1][1]:
                 coloffset = parens[-1][2]
             elif t.start[1] + 1 == parens[-1][1]:
                 # fix misaligned indent of s/util.Abort/error.Abort/
                 coloffset = parens[-1][2] + (parens[-1][1] - t.start[1])
             else:
                 coloffset = 0
 
         # Reset per-line attributes at EOL.
         if t.type in (token.NEWLINE, tokenize.NL):
             yield adjusttokenpos(t, coloffset)
             coldelta = 0
             coloffset = -1
+            if not insideignoreblock:
+                ignorenextline = (
+                    tokens[i - 1].type == token.COMMENT
+                    and tokens[i - 1].string == "#no-py3-transform"
+                )
+            continue
+
+        if t.type == token.COMMENT:
+            if t.string == "#py3-transform: off":
+                insideignoreblock = True
+            if t.string == "#py3-transform: on":
+                insideignoreblock = False
+
+        if ignorenextline or insideignoreblock:
+            yield adjusttokenpos(t, coloffset)
             continue
 
         # Remember the last paren position.
         if _isop(i, '(', '[', '{'):
             parens.append(t.end + (coloffset + coldelta,))
         elif _isop(i, ')', ']', '}'):
             parens.pop()
 
         # Convert most string literals to byte literals. String literals
         # in Python 2 are bytes. String literals in Python 3 are unicode.
         # Most strings in Mercurial are bytes and unicode strings are rare.
         # Rather than rewrite all string literals to use ``b''`` to indicate
         # byte strings, we apply this token transformer to insert the ``b``
         # prefix nearly everywhere.
         if t.type == token.STRING and t not in sysstrtokens:
             s = t.string
 
             # Preserve docstrings as string literals. This is inconsistent
             # with regular unprefixed strings. However, the
             # "from __future__" parsing (which allows a module docstring to
             # exist before it) doesn't properly handle the docstring if it
             # is b''' prefixed, leading to a SyntaxError. We leave all
             # docstrings as unprefixed to avoid this. This means Mercurial
             # components touching docstrings need to handle unicode,
             # unfortunately.
             if s[0:3] in ("'''", '"""'):
                 # If it's assigned to something, it's not a docstring
                 if not _isop(i - 1, '='):
                     yield adjusttokenpos(t, coloffset)
                     continue
 
             # If the first character isn't a quote, it is likely a string
             # prefixing character (such as 'b', 'u', or 'r'). Ignore.
             if s[0] not in ("'", '"'):
                 yield adjusttokenpos(t, coloffset)
                 continue
 
             # String literal. Prefix to make a b'' string.
             yield adjusttokenpos(t._replace(string='b%s' % t.string),
                                  coloffset)
             coldelta += 1
             continue
 
         # This looks like a function call.
         if t.type == token.NAME and _isop(i + 1, '('):
             fn = t.string
 
             # *attr() builtins don't accept byte strings to 2nd argument.
             if (fn in ('getattr', 'setattr', 'hasattr', 'safehasattr') and
                     not _isop(i - 1, '.')):
                 arg1idx = _findargnofcall(1)
                 if arg1idx is not None:
                     _ensuresysstr(arg1idx)
 
             # .encode() and .decode() on str/bytes/unicode don't accept
             # byte strings on Python 3.
             elif fn in ('encode', 'decode') and _isop(i - 1, '.'):
                 for argn in range(2):
                     argidx = _findargnofcall(argn)
                     if argidx is not None:
                         _ensuresysstr(argidx)
 
             # Rewrite iteritems/itervalues to items/values, as the former
             # are not present in the Python 3 world.
             elif opts['dictiter'] and fn in ('iteritems', 'itervalues'):
                 yield adjusttokenpos(t._replace(string=fn[4:]), coloffset)
                 continue
 
         # Looks like "if __name__ == '__main__'".
         if (t.type == token.NAME and t.string == '__name__'
             and _isop(i + 1, '==')):
             _ensuresysstr(i + 2)
 
         # Emit unmodified token.
         yield adjusttokenpos(t, coloffset)
 
 def process(fin, fout, opts):
     tokens = tokenize.tokenize(fin.readline)
     tokens = replacetokens(list(tokens), opts)
     fout.write(tokenize.untokenize(tokens))
 
 def tryunlink(fname):
     try:
         os.unlink(fname)
     except OSError as err:
         if err.errno != errno.ENOENT:
             raise
 
 @contextlib.contextmanager
 def editinplace(fname):
     n = os.path.basename(fname)
     d = os.path.dirname(fname)
     fp = tempfile.NamedTemporaryFile(prefix='.%s-' % n, suffix='~', dir=d,
                                      delete=False)
     try:
         yield fp
         fp.close()
         if os.name == 'nt':
             tryunlink(fname)
         os.rename(fp.name, fname)
     finally:
         fp.close()
         tryunlink(fp.name)
 
 def main():
     ap = argparse.ArgumentParser()
     ap.add_argument('-i', '--inplace', action='store_true', default=False,
                     help='edit files in place')
     ap.add_argument('--dictiter', action='store_true', default=False,
                     help='rewrite iteritems() and itervalues()')
     ap.add_argument('files', metavar='FILE', nargs='+', help='source file')
     args = ap.parse_args()
     opts = {
         'dictiter': args.dictiter,
     }
     for fname in args.files:
         if args.inplace:
             with editinplace(fname) as fout:
                 with open(fname, 'rb') as fin:
                     process(fin, fout, opts)
         else:
             with open(fname, 'rb') as fin:
                 fout = sys.stdout.buffer
                 process(fin, fout, opts)
 
 if __name__ == '__main__':
     if sys.version_info.major < 3:
         print('This script must be run under Python 3.')
         sys.exit(3)
     main()
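For reference, the script's command-line interface is unchanged by this commit. An invocation sketch based on the argparse setup above (file paths are illustrative):

    $ python3 byteify-strings.py --dictiter -i mercurial/example.py   # rewrite in place
    $ python3 byteify-strings.py mercurial/example.py                 # print result to stdout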
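And a before/after sketch of what replacetokens does to ordinary code when no ignore comment applies, assuming --dictiter is passed (the inputs are hypothetical one-liners):

    msg = 'hi'                     ->  msg = b'hi'
    val = getattr(obj, 'attr')     ->  val = getattr(obj, 'attr')    # 2nd arg kept a sysstr
    for k, v in d.iteritems():     ->  for k, v in d.items():        # --dictiter rewrite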