##// END OF EJS Templates
byteify-strings: do not rewrite system string literals to u''...
Yuya Nishihara -
r38408:1d68fd5f default
parent child Browse files
Show More
@@ -1,189 +1,191 b''
1 1 #!/usr/bin/env python3
2 2 #
3 3 # byteify-strings.py - transform string literals to be Python 3 safe
4 4 #
5 5 # Copyright 2015 Gregory Szorc <gregory.szorc@gmail.com>
6 6 #
7 7 # This software may be used and distributed according to the terms of the
8 8 # GNU General Public License version 2 or any later version.
9 9
10 10 from __future__ import absolute_import
11 11
12 12 import argparse
13 13 import contextlib
14 14 import errno
15 15 import os
16 16 import sys
17 17 import tempfile
18 18 import token
19 19 import tokenize
20 20
def replacetokens(tokens, opts):
    """Transform a stream of tokens from raw to Python 3.

    ``tokens`` must be a list of ``tokenize.TokenInfo`` entries; it is
    indexed for look-ahead and look-behind, so a one-shot iterator will not
    do. ``opts`` is a mapping; only ``opts['dictiter']`` is read.

    Returns a generator of possibly rewritten tokens:

    - unprefixed, non-triple-quoted string literals gain a ``b`` prefix,
      except the ones that have to stay system strings (see
      ``_ensuresysstr``);
    - when ``opts['dictiter']`` is true, called names ``iteritems`` and
      ``itervalues`` are rewritten to ``items`` and ``values``.

    The input token list is not modified.
    """
    # String tokens that must be emitted untouched (no b'' prefix).
    sysstrtokens = set()

    # Token types that can make up one (possibly implicitly concatenated and
    # line-wrapped) string argument inside a call's parentheses.
    stringrun = (token.STRING, tokenize.NL, tokenize.COMMENT)

    # The following utility functions access the tokens list and i index of
    # the "for i, t in enumerate(tokens)" loop below
    def _isop(j, *o):
        """Return True if tokens[j] is an OP with one of the given values"""
        try:
            return tokens[j].type == token.OP and tokens[j].string in o
        except IndexError:
            return False

    def _findargnofcall(n):
        """Find arg n of a call expression (start at 0)

        Returns index of the first token of that argument, or None if
        there are not that many arguments.

        Assumes that tokens[i + 1] is '('.

        """
        nested = 0
        for j in range(i + 2, len(tokens)):
            if _isop(j, ')', ']', '}'):
                # end of call, tuple, subscription or dict / set
                nested -= 1
                if nested < 0:
                    return None
            elif n == 0:
                # this is the starting position of arg
                return j
            elif _isop(j, '(', '[', '{'):
                nested += 1
            elif _isop(j, ',') and nested == 0:
                n -= 1

        return None

    def _ensuresysstr(j):
        """Make sure the string literal starting at j is a system string

        Remember the string tokens so the string transformer won't add
        the byte prefix.

        The argument may begin with line-continuation (NL) or comment
        tokens and may consist of several implicitly concatenated string
        tokens; every unprefixed string token in that run is remembered.
        Prefixing only some pieces would yield a mixed str/bytes literal,
        which is a SyntaxError on Python 3.

        Ignores tokens that are not strings. Assumes bounds checking has
        already been done.

        """
        for k in range(j, len(tokens)):
            st = tokens[k]
            if st.type not in stringrun:
                break
            if st.type == token.STRING and st.string.startswith(("'", '"')):
                sysstrtokens.add(st)

    for i, t in enumerate(tokens):
        # Convert most string literals to byte literals. String literals
        # in Python 2 are bytes. String literals in Python 3 are unicode.
        # Most strings in Mercurial are bytes and unicode strings are rare.
        # Rather than rewrite all string literals to use ``b''`` to indicate
        # byte strings, we apply this token transformer to insert the ``b``
        # prefix nearly everywhere.
        if t.type == token.STRING and t not in sysstrtokens:
            s = t.string

            # Preserve docstrings as string literals. This is inconsistent
            # with regular unprefixed strings. However, the
            # "from __future__" parsing (which allows a module docstring to
            # exist before it) doesn't properly handle the docstring if it
            # is b''' prefixed, leading to a SyntaxError. We leave all
            # docstrings as unprefixed to avoid this. This means Mercurial
            # components touching docstrings need to handle unicode,
            # unfortunately.
            if s[0:3] in ("'''", '"""'):
                yield t
                continue

            # If the first character isn't a quote, it is likely a string
            # prefixing character (such as 'b', 'u', or 'r'). Ignore.
            if s[0] not in ("'", '"'):
                yield t
                continue

            # String literal. Prefix to make a b'' string.
            yield t._replace(string='b%s' % t.string)
            continue

        # This looks like a function call.
        if t.type == token.NAME and _isop(i + 1, '('):
            fn = t.string

            # *attr() builtins don't accept byte strings to 2nd argument.
            if (fn in ('getattr', 'setattr', 'hasattr', 'safehasattr') and
                    not _isop(i - 1, '.')):
                arg1idx = _findargnofcall(1)
                if arg1idx is not None:
                    _ensuresysstr(arg1idx)

            # .encode() and .decode() on str/bytes/unicode don't accept
            # byte strings on Python 3.
            elif fn in ('encode', 'decode') and _isop(i - 1, '.'):
                for argn in range(2):
                    argidx = _findargnofcall(argn)
                    if argidx is not None:
                        _ensuresysstr(argidx)

            # It changes iteritems/values to items/values as they are not
            # present in Python 3 world.
            elif opts['dictiter'] and fn in ('iteritems', 'itervalues'):
                yield t._replace(string=fn[4:])
                continue

        # Emit unmodified token.
        yield t
138 140
def process(fin, fout, opts):
    """Rewrite the Python source read from ``fin`` and write it to ``fout``.

    ``fin`` is tokenized through its ``readline`` method, so it has to be
    opened in binary mode. The whole token stream is materialized into a
    list because ``replacetokens`` indexes into it, and the untokenized
    result is written to ``fout`` in a single call. ``opts`` is passed
    through to ``replacetokens`` unchanged.
    """
    source_tokens = list(tokenize.tokenize(fin.readline))
    rewritten = replacetokens(source_tokens, opts)
    fout.write(tokenize.untokenize(rewritten))
143 145
def tryunlink(fname):
    """Remove ``fname``, treating an already-missing file as success.

    Any ``OSError`` other than ENOENT is propagated to the caller.
    """
    try:
        os.unlink(fname)
    except OSError as exc:
        if exc.errno == errno.ENOENT:
            # Nothing to remove; that is the outcome we wanted anyway.
            return
        raise
150 152
@contextlib.contextmanager
def editinplace(fname):
    """Context manager yielding a temporary file that replaces ``fname``.

    The temporary file is created next to ``fname`` (same directory, name
    derived from its basename). If the ``with`` body finishes without
    raising, the temporary file is closed and renamed over ``fname``. In
    every case the temporary file is closed and any leftover is removed.
    """
    dirname, basename = os.path.split(fname)
    tmp = tempfile.NamedTemporaryFile(
        prefix='.%s-' % basename, suffix='~', dir=dirname, delete=False)
    try:
        yield tmp
        # Only reached when the body succeeded: publish the new content.
        tmp.close()
        if os.name == 'nt':
            # The target is removed first on Windows before renaming.
            tryunlink(fname)
        os.rename(tmp.name, fname)
    finally:
        # After a successful rename the temp name no longer exists;
        # tryunlink tolerates that.
        tmp.close()
        tryunlink(tmp.name)
166 168
def main(argv=None):
    """Command-line entry point.

    Parses ``argv`` (``sys.argv[1:]`` when it is None) and rewrites every
    FILE given. With ``-i/--inplace`` each file is replaced through
    ``editinplace``; otherwise the rewritten source is written to
    ``sys.stdout.buffer``. ``--dictiter`` is forwarded to ``process`` as
    ``opts['dictiter']``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('-i', '--inplace', action='store_true', default=False,
                    help='edit files in place')
    ap.add_argument('--dictiter', action='store_true', default=False,
                    help='rewrite iteritems() and itervalues()')
    ap.add_argument('files', metavar='FILE', nargs='+', help='source file')
    args = ap.parse_args(argv)
    opts = {
        'dictiter': args.dictiter,
    }
    for fname in args.files:
        if args.inplace:
            with editinplace(fname) as fout:
                with open(fname, 'rb') as fin:
                    process(fin, fout, opts)
        else:
            with open(fname, 'rb') as fin:
                # Binary stdout: process() writes the bytes produced by
                # tokenize.untokenize().
                fout = sys.stdout.buffer
                process(fin, fout, opts)

if __name__ == '__main__':
    main()
General Comments 0
You need to be logged in to leave comments. Login now