##// END OF EJS Templates
mangler: stop rewriting string constants to be bytes literals...
Augie Fackler -
r43348:88eba710 default
parent child Browse files
Show More
@@ -1,329 +1,298 b''
1 1 # __init__.py - Startup and module loading logic for Mercurial.
2 2 #
3 3 # Copyright 2015 Gregory Szorc <gregory.szorc@gmail.com>
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 from __future__ import absolute_import
9 9
10 10 import sys
11 11
# Keep the historical 'from mercurial import demandimport' spelling working
# by re-exporting the standalone hgdemandimport package under the old name.
import hgdemandimport

demandimport = hgdemandimport

__all__ = []
19 19 # Python 3 uses a custom module loader that transforms source code between
20 20 # source file reading and compilation. This is done by registering a custom
21 21 # finder that changes the spec for Mercurial modules to use a custom loader.
22 22 if sys.version_info[0] >= 3:
23 23 import importlib
24 24 import importlib.abc
25 25 import io
26 26 import token
27 27 import tokenize
28 28
29 29 class hgpathentryfinder(importlib.abc.MetaPathFinder):
30 30 """A sys.meta_path finder that uses a custom module loader."""
31 31
32 32 def find_spec(self, fullname, path, target=None):
33 33 # Only handle Mercurial-related modules.
34 34 if not fullname.startswith(('mercurial.', 'hgext.')):
35 35 return None
36 36 # don't try to parse binary
37 37 if fullname.startswith('mercurial.cext.'):
38 38 return None
39 39 # third-party packages are expected to be dual-version clean
40 40 if fullname.startswith('mercurial.thirdparty'):
41 41 return None
42 42 # zstd is already dual-version clean, don't try and mangle it
43 43 if fullname.startswith('mercurial.zstd'):
44 44 return None
45 45 # rustext is built for the right python version,
46 46 # don't try and mangle it
47 47 if fullname.startswith('mercurial.rustext'):
48 48 return None
49 49 # pywatchman is already dual-version clean, don't try and mangle it
50 50 if fullname.startswith('hgext.fsmonitor.pywatchman'):
51 51 return None
52 52
53 53 # Try to find the module using other registered finders.
54 54 spec = None
55 55 for finder in sys.meta_path:
56 56 if finder == self:
57 57 continue
58 58
59 59 # Originally the API was a `find_module` method, but it was
60 60 # renamed to `find_spec` in python 3.4, with a new `target`
61 61 # argument.
62 62 find_spec_method = getattr(finder, 'find_spec', None)
63 63 if find_spec_method:
64 64 spec = find_spec_method(fullname, path, target=target)
65 65 else:
66 66 spec = finder.find_module(fullname)
67 67 if spec is not None:
68 68 spec = importlib.util.spec_from_loader(fullname, spec)
69 69 if spec:
70 70 break
71 71
72 72 # This is a Mercurial-related module but we couldn't find it
73 73 # using the previously-registered finders. This likely means
74 74 # the module doesn't exist.
75 75 if not spec:
76 76 return None
77 77
78 78 # TODO need to support loaders from alternate specs, like zip
79 79 # loaders.
80 80 loader = hgloader(spec.name, spec.origin)
81 81 # Can't use util.safehasattr here because that would require
82 82 # importing util, and we're in import code.
83 83 if hasattr(spec.loader, 'loader'): # hasattr-py3-only
84 84 # This is a nested loader (maybe a lazy loader?)
85 85 spec.loader.loader = loader
86 86 else:
87 87 spec.loader = loader
88 88 return spec
89 89
90 90 def replacetokens(tokens, fullname):
91 91 """Transform a stream of tokens from raw to Python 3.
92 92
93 93 It is called by the custom module loading machinery to rewrite
94 94 source/tokens between source decoding and compilation.
95 95
96 96 Returns a generator of possibly rewritten tokens.
97 97
98 98 The input token list may be mutated as part of processing. However,
99 99 its changes do not necessarily match the output token stream.
100 100
101 101 REMEMBER TO CHANGE ``BYTECODEHEADER`` WHEN CHANGING THIS FUNCTION
102 102 OR CACHED FILES WON'T GET INVALIDATED PROPERLY.
103 103 """
104 104 futureimpline = False
105 105
106 106 # The following utility functions access the tokens list and i index of
107 107 # the for i, t enumerate(tokens) loop below
108 108 def _isop(j, *o):
109 109 """Assert that tokens[j] is an OP with one of the given values"""
110 110 try:
111 111 return tokens[j].type == token.OP and tokens[j].string in o
112 112 except IndexError:
113 113 return False
114 114
115 115 def _findargnofcall(n):
116 116 """Find arg n of a call expression (start at 0)
117 117
118 118 Returns index of the first token of that argument, or None if
119 119 there is not that many arguments.
120 120
121 121 Assumes that token[i + 1] is '('.
122 122
123 123 """
124 124 nested = 0
125 125 for j in range(i + 2, len(tokens)):
126 126 if _isop(j, ')', ']', '}'):
127 127 # end of call, tuple, subscription or dict / set
128 128 nested -= 1
129 129 if nested < 0:
130 130 return None
131 131 elif n == 0:
132 132 # this is the starting position of arg
133 133 return j
134 134 elif _isop(j, '(', '[', '{'):
135 135 nested += 1
136 136 elif _isop(j, ',') and nested == 0:
137 137 n -= 1
138 138
139 139 return None
140 140
141 141 def _ensureunicode(j):
142 142 """Make sure the token at j is a unicode string
143 143
144 144 This rewrites a string token to include the unicode literal prefix
145 145 so the string transformer won't add the byte prefix.
146 146
147 147 Ignores tokens that are not strings. Assumes bounds checking has
148 148 already been done.
149 149
150 150 """
151 151 st = tokens[j]
152 152 if st.type == token.STRING and st.string.startswith(("'", '"')):
153 153 tokens[j] = st._replace(string='u%s' % st.string)
154 154
155 155 for i, t in enumerate(tokens):
156 # Convert most string literals to byte literals. String literals
157 # in Python 2 are bytes. String literals in Python 3 are unicode.
158 # Most strings in Mercurial are bytes and unicode strings are rare.
159 # Rather than rewrite all string literals to use ``b''`` to indicate
160 # byte strings, we apply this token transformer to insert the ``b``
161 # prefix nearly everywhere.
162 if t.type == token.STRING:
163 s = t.string
164
165 # Preserve docstrings as string literals. This is inconsistent
166 # with regular unprefixed strings. However, the
167 # "from __future__" parsing (which allows a module docstring to
168 # exist before it) doesn't properly handle the docstring if it
169 # is b''' prefixed, leading to a SyntaxError. We leave all
170 # docstrings as unprefixed to avoid this. This means Mercurial
171 # components touching docstrings need to handle unicode,
172 # unfortunately.
173 if s[0:3] in ("'''", '"""'):
174 yield t
175 continue
176
177 # If the first character isn't a quote, it is likely a string
178 # prefixing character (such as 'b', 'u', or 'r'. Ignore.
179 if s[0] not in ("'", '"'):
180 yield t
181 continue
182
183 # String literal. Prefix to make a b'' string.
184 yield t._replace(string='b%s' % t.string)
185 continue
186
187 156 # Insert compatibility imports at "from __future__ import" line.
188 157 # No '\n' should be added to preserve line numbers.
189 158 if (
190 159 t.type == token.NAME
191 160 and t.string == 'import'
192 161 and all(u.type == token.NAME for u in tokens[i - 2 : i])
193 162 and [u.string for u in tokens[i - 2 : i]]
194 163 == ['from', '__future__']
195 164 ):
196 165 futureimpline = True
197 166 if t.type == token.NEWLINE and futureimpline:
198 167 futureimpline = False
199 168 if fullname == 'mercurial.pycompat':
200 169 yield t
201 170 continue
202 171 r, c = t.start
203 172 l = (
204 173 b'; from mercurial.pycompat import '
205 174 b'delattr, getattr, hasattr, setattr, '
206 175 b'open, unicode\n'
207 176 )
208 177 for u in tokenize.tokenize(io.BytesIO(l).readline):
209 178 if u.type in (tokenize.ENCODING, token.ENDMARKER):
210 179 continue
211 180 yield u._replace(
212 181 start=(r, c + u.start[1]), end=(r, c + u.end[1])
213 182 )
214 183 continue
215 184
216 185 # This looks like a function call.
217 186 if t.type == token.NAME and _isop(i + 1, '('):
218 187 fn = t.string
219 188
220 189 # *attr() builtins don't accept byte strings to 2nd argument.
221 190 if fn in (
222 191 'getattr',
223 192 'setattr',
224 193 'hasattr',
225 194 'safehasattr',
226 195 ) and not _isop(i - 1, '.'):
227 196 arg1idx = _findargnofcall(1)
228 197 if arg1idx is not None:
229 198 _ensureunicode(arg1idx)
230 199
231 200 # .encode() and .decode() on str/bytes/unicode don't accept
232 201 # byte strings on Python 3.
233 202 elif fn in ('encode', 'decode') and _isop(i - 1, '.'):
234 203 for argn in range(2):
235 204 argidx = _findargnofcall(argn)
236 205 if argidx is not None:
237 206 _ensureunicode(argidx)
238 207
239 208 # It changes iteritems/values to items/values as they are not
240 209 # present in Python 3 world.
241 210 elif fn in ('iteritems', 'itervalues') and not (
242 211 tokens[i - 1].type == token.NAME
243 212 and tokens[i - 1].string == 'def'
244 213 ):
245 214 yield t._replace(string=fn[4:])
246 215 continue
247 216
248 217 # Emit unmodified token.
249 218 yield t
250 219
251 220 # Header to add to bytecode files. This MUST be changed when
252 221 # ``replacetoken`` or any mechanism that changes semantics of module
253 222 # loading is changed. Otherwise cached bytecode may get loaded without
254 223 # the new transformation mechanisms applied.
255 224 BYTECODEHEADER = b'HG\x00\x0c'
256 225
257 226 class hgloader(importlib.machinery.SourceFileLoader):
258 227 """Custom module loader that transforms source code.
259 228
260 229 When the source code is converted to a code object, we transform
261 230 certain patterns to be Python 3 compatible. This allows us to write code
262 231 that is natively Python 2 and compatible with Python 3 without
263 232 making the code excessively ugly.
264 233
265 234 We do this by transforming the token stream between parse and compile.
266 235
267 236 Implementing transformations invalidates caching assumptions made
268 237 by the built-in importer. The built-in importer stores a header on
269 238 saved bytecode files indicating the Python/bytecode version. If the
270 239 version changes, the cached bytecode is ignored. The Mercurial
271 240 transformations could change at any time. This means we need to check
272 241 that cached bytecode was generated with the current transformation
273 242 code or there could be a mismatch between cached bytecode and what
274 243 would be generated from this class.
275 244
276 245 We supplement the bytecode caching layer by wrapping ``get_data``
277 246 and ``set_data``. These functions are called when the
278 247 ``SourceFileLoader`` retrieves and saves bytecode cache files,
279 248 respectively. We simply add an additional header on the file. As
280 249 long as the version in this file is changed when semantics change,
281 250 cached bytecode should be invalidated when transformations change.
282 251
283 252 The added header has the form ``HG<VERSION>``. That is a literal
284 253 ``HG`` with 2 binary bytes indicating the transformation version.
285 254 """
286 255
287 256 def get_data(self, path):
288 257 data = super(hgloader, self).get_data(path)
289 258
290 259 if not path.endswith(tuple(importlib.machinery.BYTECODE_SUFFIXES)):
291 260 return data
292 261
293 262 # There should be a header indicating the Mercurial transformation
294 263 # version. If it doesn't exist or doesn't match the current version,
295 264 # we raise an OSError because that is what
296 265 # ``SourceFileLoader.get_code()`` expects when loading bytecode
297 266 # paths to indicate the cached file is "bad."
298 267 if data[0:2] != b'HG':
299 268 raise OSError('no hg header')
300 269 if data[0:4] != BYTECODEHEADER:
301 270 raise OSError('hg header version mismatch')
302 271
303 272 return data[4:]
304 273
305 274 def set_data(self, path, data, *args, **kwargs):
306 275 if path.endswith(tuple(importlib.machinery.BYTECODE_SUFFIXES)):
307 276 data = BYTECODEHEADER + data
308 277
309 278 return super(hgloader, self).set_data(path, data, *args, **kwargs)
310 279
311 280 def source_to_code(self, data, path):
312 281 """Perform token transformation before compilation."""
313 282 buf = io.BytesIO(data)
314 283 tokens = tokenize.tokenize(buf.readline)
315 284 data = tokenize.untokenize(replacetokens(list(tokens), self.name))
316 285 # Python's built-in importer strips frames from exceptions raised
317 286 # for this code. Unfortunately, that mechanism isn't extensible
318 287 # and our frame will be blamed for the import failure. There
319 288 # are extremely hacky ways to do frame stripping. We haven't
320 289 # implemented them because they are very ugly.
321 290 return super(hgloader, self).source_to_code(data, path)
322 291
323 292 # We automagically register our custom importer as a side-effect of
324 293 # loading. This is necessary to ensure that any entry points are able
325 294 # to import mercurial.* modules without having to perform this
326 295 # registration themselves.
327 296 if not any(isinstance(x, hgpathentryfinder) for x in sys.meta_path):
328 297 # meta_path is used before any implicit finders and before sys.path.
329 298 sys.meta_path.insert(0, hgpathentryfinder())
General Comments 0
You need to be logged in to leave comments. Login now