mangler: stop rewriting string constants to be bytes literals...
Augie Fackler
r43348:88eba710 default
@@ -1,329 +1,298 @@
1 1 # __init__.py - Startup and module loading logic for Mercurial.
2 2 #
3 3 # Copyright 2015 Gregory Szorc <gregory.szorc@gmail.com>
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 from __future__ import absolute_import
9 9
10 10 import sys
11 11
12 12 # Allow 'from mercurial import demandimport' to keep working.
13 13 import hgdemandimport
14 14
15 15 demandimport = hgdemandimport
16 16
17 17 __all__ = []
18 18
19 19 # Python 3 uses a custom module loader that transforms source code between
20 20 # source file reading and compilation. This is done by registering a custom
21 21 # finder that changes the spec for Mercurial modules to use a custom loader.
22 22 if sys.version_info[0] >= 3:
23 23     import importlib
24 24     import importlib.abc
25 25     import io
26 26     import token
27 27     import tokenize
28 28
29 29     class hgpathentryfinder(importlib.abc.MetaPathFinder):
30 30         """A sys.meta_path finder that uses a custom module loader."""
31 31
32 32         def find_spec(self, fullname, path, target=None):
33 33             # Only handle Mercurial-related modules.
34 34             if not fullname.startswith(('mercurial.', 'hgext.')):
35 35                 return None
36 36             # don't try to parse binary
37 37             if fullname.startswith('mercurial.cext.'):
38 38                 return None
39 39             # third-party packages are expected to be dual-version clean
40 40             if fullname.startswith('mercurial.thirdparty'):
41 41                 return None
42 42             # zstd is already dual-version clean, don't try and mangle it
43 43             if fullname.startswith('mercurial.zstd'):
44 44                 return None
45 45             # rustext is built for the right python version,
46 46             # don't try and mangle it
47 47             if fullname.startswith('mercurial.rustext'):
48 48                 return None
49 49             # pywatchman is already dual-version clean, don't try and mangle it
50 50             if fullname.startswith('hgext.fsmonitor.pywatchman'):
51 51                 return None
52 52
53 53             # Try to find the module using other registered finders.
54 54             spec = None
55 55             for finder in sys.meta_path:
56 56                 if finder == self:
57 57                     continue
58 58
59 59                 # Originally the API was a `find_module` method, but it was
60 60                 # renamed to `find_spec` in python 3.4, with a new `target`
61 61                 # argument.
62 62                 find_spec_method = getattr(finder, 'find_spec', None)
63 63                 if find_spec_method:
64 64                     spec = find_spec_method(fullname, path, target=target)
65 65                 else:
66 66                     spec = finder.find_module(fullname)
67 67                     if spec is not None:
68 68                         spec = importlib.util.spec_from_loader(fullname, spec)
69 69                 if spec:
70 70                     break
71 71
72 72             # This is a Mercurial-related module but we couldn't find it
73 73             # using the previously-registered finders. This likely means
74 74             # the module doesn't exist.
75 75             if not spec:
76 76                 return None
77 77
78 78             # TODO need to support loaders from alternate specs, like zip
79 79             # loaders.
80 80             loader = hgloader(spec.name, spec.origin)
81 81             # Can't use util.safehasattr here because that would require
82 82             # importing util, and we're in import code.
83 83             if hasattr(spec.loader, 'loader'): # hasattr-py3-only
84 84                 # This is a nested loader (maybe a lazy loader?)
85 85                 spec.loader.loader = loader
86 86             else:
87 87                 spec.loader = loader
88 88             return spec
89 89
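# --- Illustrative sketch (editor's note, not part of __init__.py) ----------
# The class above follows the standard importlib pattern: a MetaPathFinder
# delegates spec resolution to the other finders on sys.meta_path and then
# swaps in its own loader. A minimal stand-alone version of that pattern,
# using the hypothetical names _examplefinder / _exampleloader and the
# hypothetical package prefix 'mypkg.':

import importlib.abc
import importlib.machinery
import sys

class _exampleloader(importlib.machinery.SourceFileLoader):
    pass  # a real loader would override source_to_code(), as hgloader does

class _examplefinder(importlib.abc.MetaPathFinder):
    def find_spec(self, fullname, path, target=None):
        if not fullname.startswith('mypkg.'):
            return None  # only intercept our own modules
        for finder in sys.meta_path:
            if finder is self:
                continue
            find_spec = getattr(finder, 'find_spec', None)
            if find_spec is None:
                continue
            spec = find_spec(fullname, path, target=target)
            if spec and spec.origin:
                # Keep the spec another finder built, but load it ourselves.
                spec.loader = _exampleloader(spec.name, spec.origin)
                return spec
        return None

sys.meta_path.insert(0, _examplefinder())  # consulted before default finders
# --- end of sketch ----------------------------------------------------------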
90 90     def replacetokens(tokens, fullname):
91 91         """Transform a stream of tokens from raw to Python 3.
92 92
93 93         It is called by the custom module loading machinery to rewrite
94 94         source/tokens between source decoding and compilation.
95 95
96 96         Returns a generator of possibly rewritten tokens.
97 97
98 98         The input token list may be mutated as part of processing. However,
99 99         its changes do not necessarily match the output token stream.
100 100
101 101         REMEMBER TO CHANGE ``BYTECODEHEADER`` WHEN CHANGING THIS FUNCTION
102 102         OR CACHED FILES WON'T GET INVALIDATED PROPERLY.
103 103         """
104 104         futureimpline = False
105 105
106 106         # The following utility functions access the tokens list and i index of
107 107         # the for i, t enumerate(tokens) loop below
108 108         def _isop(j, *o):
109 109             """Assert that tokens[j] is an OP with one of the given values"""
110 110             try:
111 111                 return tokens[j].type == token.OP and tokens[j].string in o
112 112             except IndexError:
113 113                 return False
114 114
115 115         def _findargnofcall(n):
116 116             """Find arg n of a call expression (start at 0)
117 117
118 118             Returns index of the first token of that argument, or None if
119 119             there is not that many arguments.
120 120
121 121             Assumes that token[i + 1] is '('.
122 122
123 123             """
124 124             nested = 0
125 125             for j in range(i + 2, len(tokens)):
126 126                 if _isop(j, ')', ']', '}'):
127 127                     # end of call, tuple, subscription or dict / set
128 128                     nested -= 1
129 129                     if nested < 0:
130 130                         return None
131 131                 elif n == 0:
132 132                     # this is the starting position of arg
133 133                     return j
134 134                 elif _isop(j, '(', '[', '{'):
135 135                     nested += 1
136 136                 elif _isop(j, ',') and nested == 0:
137 137                     n -= 1
138 138
139 139             return None
140 140
141 141         def _ensureunicode(j):
142 142             """Make sure the token at j is a unicode string
143 143
144 144             This rewrites a string token to include the unicode literal prefix
145 145             so the string transformer won't add the byte prefix.
146 146
147 147             Ignores tokens that are not strings. Assumes bounds checking has
148 148             already been done.
149 149
150 150             """
151 151             st = tokens[j]
152 152             if st.type == token.STRING and st.string.startswith(("'", '"')):
153 153                 tokens[j] = st._replace(string='u%s' % st.string)
154 154
155 155         for i, t in enumerate(tokens):
156             # Convert most string literals to byte literals. String literals
157             # in Python 2 are bytes. String literals in Python 3 are unicode.
158             # Most strings in Mercurial are bytes and unicode strings are rare.
159             # Rather than rewrite all string literals to use ``b''`` to indicate
160             # byte strings, we apply this token transformer to insert the ``b``
161             # prefix nearly everywhere.
162             if t.type == token.STRING:
163                 s = t.string
164
165                 # Preserve docstrings as string literals. This is inconsistent
166                 # with regular unprefixed strings. However, the
167                 # "from __future__" parsing (which allows a module docstring to
168                 # exist before it) doesn't properly handle the docstring if it
169                 # is b''' prefixed, leading to a SyntaxError. We leave all
170                 # docstrings as unprefixed to avoid this. This means Mercurial
171                 # components touching docstrings need to handle unicode,
172                 # unfortunately.
173                 if s[0:3] in ("'''", '"""'):
174                     yield t
175                     continue
176
177                 # If the first character isn't a quote, it is likely a string
178                 # prefixing character (such as 'b', 'u', or 'r'. Ignore.
179                 if s[0] not in ("'", '"'):
180                     yield t
181                     continue
182
183                 # String literal. Prefix to make a b'' string.
184                 yield t._replace(string='b%s' % t.string)
185                 continue
186
187 156             # Insert compatibility imports at "from __future__ import" line.
188 157             # No '\n' should be added to preserve line numbers.
189 158             if (
190 159                 t.type == token.NAME
191 160                 and t.string == 'import'
192 161                 and all(u.type == token.NAME for u in tokens[i - 2 : i])
193 162                 and [u.string for u in tokens[i - 2 : i]]
194 163                 == ['from', '__future__']
195 164             ):
196 165                 futureimpline = True
197 166             if t.type == token.NEWLINE and futureimpline:
198 167                 futureimpline = False
199 168                 if fullname == 'mercurial.pycompat':
200 169                     yield t
201 170                     continue
202 171                 r, c = t.start
203 172                 l = (
204 173                     b'; from mercurial.pycompat import '
205 174                     b'delattr, getattr, hasattr, setattr, '
206 175                     b'open, unicode\n'
207 176                 )
208 177                 for u in tokenize.tokenize(io.BytesIO(l).readline):
209 178                     if u.type in (tokenize.ENCODING, token.ENDMARKER):
210 179                         continue
211 180                     yield u._replace(
212 181                         start=(r, c + u.start[1]), end=(r, c + u.end[1])
213 182                     )
214 183                 continue
215 184
216 185             # This looks like a function call.
217 186             if t.type == token.NAME and _isop(i + 1, '('):
218 187                 fn = t.string
219 188
220 189                 # *attr() builtins don't accept byte strings to 2nd argument.
221 190                 if fn in (
222 191                     'getattr',
223 192                     'setattr',
224 193                     'hasattr',
225 194                     'safehasattr',
226 195                 ) and not _isop(i - 1, '.'):
227 196                     arg1idx = _findargnofcall(1)
228 197                     if arg1idx is not None:
229 198                         _ensureunicode(arg1idx)
230 199
231 200                 # .encode() and .decode() on str/bytes/unicode don't accept
232 201                 # byte strings on Python 3.
233 202                 elif fn in ('encode', 'decode') and _isop(i - 1, '.'):
234 203                     for argn in range(2):
235 204                         argidx = _findargnofcall(argn)
236 205                         if argidx is not None:
237 206                             _ensureunicode(argidx)
238 207
239 208                 # It changes iteritems/values to items/values as they are not
240 209                 # present in Python 3 world.
241 210                 elif fn in ('iteritems', 'itervalues') and not (
242 211                     tokens[i - 1].type == token.NAME
243 212                     and tokens[i - 1].string == 'def'
244 213                 ):
245 214                     yield t._replace(string=fn[4:])
246 215                     continue
247 216
248 217             # Emit unmodified token.
249 218             yield t
250 219
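# --- Illustrative sketch (editor's note, not part of __init__.py) ----------
# replacetokens() consumes tokenize.TokenInfo tuples and yields (possibly
# rewritten) ones that tokenize.untokenize() turns back into source bytes.
# A toy rewriter in the same style, renaming a hypothetical d.iteritems()
# call to d.items() the way the last branch above does:

import io
import token
import tokenize

def _toyrewrite(source):
    tokens = list(tokenize.tokenize(io.BytesIO(source).readline))
    out = []
    for t in tokens:
        if t.type == token.NAME and t.string == 'iteritems':
            t = t._replace(string='items')  # keep the position, change the text
        out.append(t)
    # untokenize() pads with spaces to honour the recorded columns, so the
    # result stays valid Python even though 'items' is shorter.
    return tokenize.untokenize(out)

print(_toyrewrite(b"for k, v in d.iteritems(): pass\n"))
# --- end of sketch ----------------------------------------------------------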
251 220     # Header to add to bytecode files. This MUST be changed when
252 221     # ``replacetoken`` or any mechanism that changes semantics of module
253 222     # loading is changed. Otherwise cached bytecode may get loaded without
254 223     # the new transformation mechanisms applied.
255 224     BYTECODEHEADER = b'HG\x00\x0c'
256 225
257 226     class hgloader(importlib.machinery.SourceFileLoader):
258 227         """Custom module loader that transforms source code.
259 228
260 229         When the source code is converted to a code object, we transform
261 230         certain patterns to be Python 3 compatible. This allows us to write code
262 231         that is natively Python 2 and compatible with Python 3 without
263 232         making the code excessively ugly.
264 233
265 234         We do this by transforming the token stream between parse and compile.
266 235
267 236         Implementing transformations invalidates caching assumptions made
268 237         by the built-in importer. The built-in importer stores a header on
269 238         saved bytecode files indicating the Python/bytecode version. If the
270 239         version changes, the cached bytecode is ignored. The Mercurial
271 240         transformations could change at any time. This means we need to check
272 241         that cached bytecode was generated with the current transformation
273 242         code or there could be a mismatch between cached bytecode and what
274 243         would be generated from this class.
275 244
276 245         We supplement the bytecode caching layer by wrapping ``get_data``
277 246         and ``set_data``. These functions are called when the
278 247         ``SourceFileLoader`` retrieves and saves bytecode cache files,
279 248         respectively. We simply add an additional header on the file. As
280 249         long as the version in this file is changed when semantics change,
281 250         cached bytecode should be invalidated when transformations change.
282 251
283 252         The added header has the form ``HG<VERSION>``. That is a literal
284 253         ``HG`` with 2 binary bytes indicating the transformation version.
285 254         """
286 255
287 256         def get_data(self, path):
288 257             data = super(hgloader, self).get_data(path)
289 258
290 259             if not path.endswith(tuple(importlib.machinery.BYTECODE_SUFFIXES)):
291 260                 return data
292 261
293 262             # There should be a header indicating the Mercurial transformation
294 263             # version. If it doesn't exist or doesn't match the current version,
295 264             # we raise an OSError because that is what
296 265             # ``SourceFileLoader.get_code()`` expects when loading bytecode
297 266             # paths to indicate the cached file is "bad."
298 267             if data[0:2] != b'HG':
299 268                 raise OSError('no hg header')
300 269             if data[0:4] != BYTECODEHEADER:
301 270                 raise OSError('hg header version mismatch')
302 271
303 272             return data[4:]
304 273
305 274         def set_data(self, path, data, *args, **kwargs):
306 275             if path.endswith(tuple(importlib.machinery.BYTECODE_SUFFIXES)):
307 276                 data = BYTECODEHEADER + data
308 277
309 278             return super(hgloader, self).set_data(path, data, *args, **kwargs)
310 279
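# --- Illustrative sketch (editor's note, not part of __init__.py) ----------
# get_data()/set_data() above sandwich the regular .pyc contents behind an
# extra 4-byte header: the literal b'HG' plus two binary version bytes
# (b'\x00\x0c' in BYTECODEHEADER). A mismatch makes get_data() raise OSError,
# which, per the comment above, SourceFileLoader.get_code() treats as a bad
# cache and recompiles from source. The round trip in isolation, with
# hypothetical stand-alone helper names:

_HEADER = b'HG\x00\x0c'

def _wrap(bytecode):
    # what set_data() does before writing the cache file
    return _HEADER + bytecode

def _unwrap(data):
    # what get_data() does after reading the cache file
    if data[0:2] != b'HG':
        raise OSError('no hg header')
    if data[0:4] != _HEADER:
        raise OSError('hg header version mismatch')
    return data[4:]

assert _unwrap(_wrap(b'\x00' * 16)) == b'\x00' * 16
# --- end of sketch ----------------------------------------------------------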
311 280         def source_to_code(self, data, path):
312 281             """Perform token transformation before compilation."""
313 282             buf = io.BytesIO(data)
314 283             tokens = tokenize.tokenize(buf.readline)
315 284             data = tokenize.untokenize(replacetokens(list(tokens), self.name))
316 285             # Python's built-in importer strips frames from exceptions raised
317 286             # for this code. Unfortunately, that mechanism isn't extensible
318 287             # and our frame will be blamed for the import failure. There
319 288             # are extremely hacky ways to do frame stripping. We haven't
320 289             # implemented them because they are very ugly.
321 290             return super(hgloader, self).source_to_code(data, path)
322 291
323 292     # We automagically register our custom importer as a side-effect of
324 293     # loading. This is necessary to ensure that any entry points are able
325 294     # to import mercurial.* modules without having to perform this
326 295     # registration themselves.
327 296     if not any(isinstance(x, hgpathentryfinder) for x in sys.meta_path):
328 297         # meta_path is used before any implicit finders and before sys.path.
329 298         sys.meta_path.insert(0, hgpathentryfinder())
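The finder registration at the bottom of the file runs as a plain import side effect, so on Python 3 any entry point that merely does `import mercurial` ends up with the custom finder installed ahead of the default import machinery. A minimal way to observe that, assuming Mercurial is installed (the check itself is illustrative and not part of this changeset):

import sys

import mercurial  # registration happens here on Python 3

# The finder is inserted at position 0 of sys.meta_path, so it normally
# sits in front of the stock finders once the import completes.
print(type(sys.meta_path[0]).__name__)
print(any(type(f).__name__ == 'hgpathentryfinder' for f in sys.meta_path))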