byteify-strings: handle triple quoted strings if they are not docstrings...
Raphaël Gomès
r42905:e9592e11 default
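This revision changes how the transformer treats triple-quoted strings: docstrings are still emitted unprefixed, but a triple-quoted literal that is assigned to a name (i.e. immediately preceded by '=') now falls through to the normal byteification path and gains a b prefix. A minimal before/after sketch of the intended effect on a hypothetical input module (the TEMPLATE name below is illustrative, not part of the commit):

# Hypothetical input fed to byteify-strings.py:
"""A module docstring; left unprefixed, as before."""

TEMPLATE = '''line one
line two'''

# Expected output after this change: the docstring is still untouched,
# but the assigned triple-quoted literal is now byteified.
"""A module docstring; left unprefixed, as before."""

TEMPLATE = b'''line one
line two'''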
@@ -1,243 +1,245
1 1 #!/usr/bin/env python3
2 2 #
3 3 # byteify-strings.py - transform string literals to be Python 3 safe
4 4 #
5 5 # Copyright 2015 Gregory Szorc <gregory.szorc@gmail.com>
6 6 #
7 7 # This software may be used and distributed according to the terms of the
8 8 # GNU General Public License version 2 or any later version.
9 9
10 10 from __future__ import absolute_import, print_function
11 11
12 12 import argparse
13 13 import contextlib
14 14 import errno
15 15 import os
16 16 import sys
17 17 import tempfile
18 18 import token
19 19 import tokenize
20 20
21 21 def adjusttokenpos(t, ofs):
22 22 """Adjust start/end column of the given token"""
23 23 return t._replace(start=(t.start[0], t.start[1] + ofs),
24 24 end=(t.end[0], t.end[1] + ofs))
25 25
26 26 def replacetokens(tokens, opts):
27 27 """Transform a stream of tokens from raw to Python 3.
28 28
29 29 Returns a generator of possibly rewritten tokens.
30 30
31 31 The input token list may be mutated as part of processing. However,
32 32 its changes do not necessarily match the output token stream.
33 33 """
34 34 sysstrtokens = set()
35 35
36 36 # The following utility functions access the tokens list and i index of
37 37 # the for i, t in enumerate(tokens) loop below
38 38 def _isop(j, *o):
39 39 """Assert that tokens[j] is an OP with one of the given values"""
40 40 try:
41 41 return tokens[j].type == token.OP and tokens[j].string in o
42 42 except IndexError:
43 43 return False
44 44
45 45 def _findargnofcall(n):
46 46 """Find arg n of a call expression (start at 0)
47 47
48 48 Returns index of the first token of that argument, or None if
49 49 there are not that many arguments.
50 50
51 51 Assumes that token[i + 1] is '('.
52 52
53 53 """
54 54 nested = 0
55 55 for j in range(i + 2, len(tokens)):
56 56 if _isop(j, ')', ']', '}'):
57 57 # end of call, tuple, subscription or dict / set
58 58 nested -= 1
59 59 if nested < 0:
60 60 return None
61 61 elif n == 0:
62 62 # this is the starting position of arg
63 63 return j
64 64 elif _isop(j, '(', '[', '{'):
65 65 nested += 1
66 66 elif _isop(j, ',') and nested == 0:
67 67 n -= 1
68 68
69 69 return None
70 70
71 71 def _ensuresysstr(j):
72 72 """Make sure the token at j is a system string
73 73
74 74 Remember the given token so the string transformer won't add
75 75 the byte prefix.
76 76
77 77 Ignores tokens that are not strings. Assumes bounds checking has
78 78 already been done.
79 79
80 80 """
81 81 k = j
82 82 currtoken = tokens[k]
83 83 while currtoken.type in (token.STRING, token.NEWLINE, tokenize.NL):
84 84 k += 1
85 85 if (
86 86 currtoken.type == token.STRING
87 87 and currtoken.string.startswith(("'", '"'))
88 88 ):
89 89 sysstrtokens.add(currtoken)
90 90 try:
91 91 currtoken = tokens[k]
92 92 except IndexError:
93 93 break
94 94
95 95 coldelta = 0 # column increment for new opening parens
96 96 coloffset = -1 # column offset for the current line (-1: TBD)
97 97 parens = [(0, 0, 0)] # stack of (line, end-column, column-offset)
98 98 for i, t in enumerate(tokens):
99 99 # Compute the column offset for the current line, such that
100 100 # the current line will be aligned to the last opening paren
101 101 # as before.
102 102 if coloffset < 0:
103 103 if t.start[1] == parens[-1][1]:
104 104 coloffset = parens[-1][2]
105 105 elif t.start[1] + 1 == parens[-1][1]:
106 106 # fix misaligned indent of s/util.Abort/error.Abort/
107 107 coloffset = parens[-1][2] + (parens[-1][1] - t.start[1])
108 108 else:
109 109 coloffset = 0
110 110
111 111 # Reset per-line attributes at EOL.
112 112 if t.type in (token.NEWLINE, tokenize.NL):
113 113 yield adjusttokenpos(t, coloffset)
114 114 coldelta = 0
115 115 coloffset = -1
116 116 continue
117 117
118 118 # Remember the last paren position.
119 119 if _isop(i, '(', '[', '{'):
120 120 parens.append(t.end + (coloffset + coldelta,))
121 121 elif _isop(i, ')', ']', '}'):
122 122 parens.pop()
123 123
124 124 # Convert most string literals to byte literals. String literals
125 125 # in Python 2 are bytes. String literals in Python 3 are unicode.
126 126 # Most strings in Mercurial are bytes and unicode strings are rare.
127 127 # Rather than rewrite all string literals to use ``b''`` to indicate
128 128 # byte strings, we apply this token transformer to insert the ``b``
129 129 # prefix nearly everywhere.
130 130 if t.type == token.STRING and t not in sysstrtokens:
131 131 s = t.string
132 132
133 133 # Preserve docstrings as string literals. This is inconsistent
134 134 # with regular unprefixed strings. However, the
135 135 # "from __future__" parsing (which allows a module docstring to
136 136 # exist before it) doesn't properly handle the docstring if it
137 137 # is b''' prefixed, leading to a SyntaxError. We leave all
138 138 # docstrings as unprefixed to avoid this. This means Mercurial
139 139 # components touching docstrings need to handle unicode,
140 140 # unfortunately.
141 141 if s[0:3] in ("'''", '"""'):
142 yield adjusttokenpos(t, coloffset)
143 continue
142 # If it's assigned to something, it's not a docstring
143 if not _isop(i - 1, '='):
144 yield adjusttokenpos(t, coloffset)
145 continue
144 146
145 147 # If the first character isn't a quote, it is likely a string
146 148 # prefixing character (such as 'b', 'u', or 'r'). Ignore.
147 149 if s[0] not in ("'", '"'):
148 150 yield adjusttokenpos(t, coloffset)
149 151 continue
150 152
151 153 # String literal. Prefix to make a b'' string.
152 154 yield adjusttokenpos(t._replace(string='b%s' % t.string),
153 155 coloffset)
154 156 coldelta += 1
155 157 continue
156 158
157 159 # This looks like a function call.
158 160 if t.type == token.NAME and _isop(i + 1, '('):
159 161 fn = t.string
160 162
161 163 # *attr() builtins don't accept byte strings as their 2nd argument.
162 164 if (fn in ('getattr', 'setattr', 'hasattr', 'safehasattr') and
163 165 not _isop(i - 1, '.')):
164 166 arg1idx = _findargnofcall(1)
165 167 if arg1idx is not None:
166 168 _ensuresysstr(arg1idx)
167 169
168 170 # .encode() and .decode() on str/bytes/unicode don't accept
169 171 # byte strings on Python 3.
170 172 elif fn in ('encode', 'decode') and _isop(i - 1, '.'):
171 173 for argn in range(2):
172 174 argidx = _findargnofcall(argn)
173 175 if argidx is not None:
174 176 _ensuresysstr(argidx)
175 177
176 178 # Rewrite iteritems/itervalues to items/values, as the former are
177 179 # not present in the Python 3 world.
178 180 elif opts['dictiter'] and fn in ('iteritems', 'itervalues'):
179 181 yield adjusttokenpos(t._replace(string=fn[4:]), coloffset)
180 182 continue
181 183
182 184 # Looks like "if __name__ == '__main__'".
183 185 if (t.type == token.NAME and t.string == '__name__'
184 186 and _isop(i + 1, '==')):
185 187 _ensuresysstr(i + 2)
186 188
187 189 # Emit unmodified token.
188 190 yield adjusttokenpos(t, coloffset)
189 191
190 192 def process(fin, fout, opts):
191 193 tokens = tokenize.tokenize(fin.readline)
192 194 tokens = replacetokens(list(tokens), opts)
193 195 fout.write(tokenize.untokenize(tokens))
194 196
195 197 def tryunlink(fname):
196 198 try:
197 199 os.unlink(fname)
198 200 except OSError as err:
199 201 if err.errno != errno.ENOENT:
200 202 raise
201 203
202 204 @contextlib.contextmanager
203 205 def editinplace(fname):
204 206 n = os.path.basename(fname)
205 207 d = os.path.dirname(fname)
206 208 fp = tempfile.NamedTemporaryFile(prefix='.%s-' % n, suffix='~', dir=d,
207 209 delete=False)
208 210 try:
209 211 yield fp
210 212 fp.close()
211 213 if os.name == 'nt':
212 214 tryunlink(fname)
213 215 os.rename(fp.name, fname)
214 216 finally:
215 217 fp.close()
216 218 tryunlink(fp.name)
217 219
218 220 def main():
219 221 ap = argparse.ArgumentParser()
220 222 ap.add_argument('-i', '--inplace', action='store_true', default=False,
221 223 help='edit files in place')
222 224 ap.add_argument('--dictiter', action='store_true', default=False,
223 225 help='rewrite iteritems() and itervalues()')
224 226 ap.add_argument('files', metavar='FILE', nargs='+', help='source file')
225 227 args = ap.parse_args()
226 228 opts = {
227 229 'dictiter': args.dictiter,
228 230 }
229 231 for fname in args.files:
230 232 if args.inplace:
231 233 with editinplace(fname) as fout:
232 234 with open(fname, 'rb') as fin:
233 235 process(fin, fout, opts)
234 236 else:
235 237 with open(fname, 'rb') as fin:
236 238 fout = sys.stdout.buffer
237 239 process(fin, fout, opts)
238 240
239 241 if __name__ == '__main__':
240 242 if sys.version_info.major < 3:
241 243 print('This script must be run under Python 3.')
242 244 sys.exit(3)
243 245 main()
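As the argparse setup above indicates, the script itself is run under Python 3 with one or more FILE arguments, plus the optional -i/--inplace and --dictiter flags. For completeness, a minimal sketch of driving the same transformation from Python, mirroring the non-inplace branch of main() (assumes process() from this script is in scope; the input path is illustrative):

import sys

# Equivalent to: python3 byteify-strings.py somemodule.py
opts = {'dictiter': False}  # True would also rewrite iteritems()/itervalues()
with open('somemodule.py', 'rb') as fin:
    # tokenize/untokenize work on bytes, so the file is opened in binary
    # mode and the result is written to the binary stdout buffer.
    process(fin, sys.stdout.buffer, opts)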