byteify-strings.py
338 lines
| 11.0 KiB
| text/x-python
|
PythonLexer
/ contrib / byteify-strings.py
Yuya Nishihara
|
#!/usr/bin/env python3
#
# byteify-strings.py - transform string literals to be Python 3 safe
#
# Copyright 2015 Gregory Szorc <gregory.szorc@gmail.com>
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.
Yuya Nishihara
|
r38404 | import argparse | ||
Yuya Nishihara
|
r38405 | import contextlib | ||
import errno | ||||
import os | ||||
Yuya Nishihara
|
r38404 | import sys | ||
Yuya Nishihara
|
r38405 | import tempfile | ||
Yuya Nishihara
|
r38403 | import token | ||
import tokenize | ||||
Augie Fackler
|
r43346 | |||
Yuya Nishihara
|
def adjusttokenpos(t, ofs):
    """Adjust start/end column of the given token

    Returns a copy of ``t`` (via ``_replace``) whose ``start`` and ``end``
    columns are both shifted by ``ofs``; the line numbers are kept as they
    are. ``t`` itself is not modified.
    """
    return t._replace(
        start=(t.start[0], t.start[1] + ofs), end=(t.end[0], t.end[1] + ofs)
    )
Yuya Nishihara
|
r38409 | |||
Yuya Nishihara
|
def replacetokens(tokens, opts):
    """Transform a stream of tokens from raw to Python 3.

    Returns a generator of possibly rewritten tokens.

    The input token list may be mutated as part of processing. However,
    its changes do not necessarily match the output token stream.

    ``tokens`` must be an indexable sequence, because the helpers below look
    ahead of and behind the current position. ``opts`` is a mapping read
    with the keys ``'dictiter'`` (if true, calls to ``iteritems()`` and
    ``itervalues()`` are renamed to ``items()`` and ``values()``) and
    ``'treat-as-kwargs'`` (a container of names; string subscripts and the
    first string argument of ``get``/``pop``/``setdefault``/``popitem`` on
    those names are kept unprefixed).
    """
    sysstrtokens = set()

    # The following utility functions access the tokens list and i index of
    # the "for i, t in enumerate(tokens)" loop below
    def _isop(j, *o):
        """Assert that tokens[j] is an OP with one of the given values"""
        try:
            return tokens[j].type == token.OP and tokens[j].string in o
        except IndexError:
            return False

    def _findargnofcall(n):
        """Find arg n of a call expression (start at 0)

        Returns index of the first token of that argument, or None if
        there are not that many arguments.

        Assumes that token[i + 1] is '('.
        """
        nested = 0
        for j in range(i + 2, len(tokens)):
            if _isop(j, ')', ']', '}'):
                # end of call, tuple, subscription or dict / set
                nested -= 1
                if nested < 0:
                    return None
            elif n == 0:
                # this is the starting position of arg
                return j
            elif _isop(j, '(', '[', '{'):
                nested += 1
            elif _isop(j, ',') and nested == 0:
                n -= 1

        return None

    def _ensuresysstr(j):
        """Make sure the token at j is a system string

        Remember the given token so the string transformer won't add
        the byte prefix.

        Ignores tokens that are not strings. Assumes bounds checking has
        already been done.
        """
        k = j
        currtoken = tokens[k]
        # Walk over the run of adjacent string literals (implicit
        # concatenation, possibly spread over several lines).
        while currtoken.type in (token.STRING, token.NEWLINE, tokenize.NL):
            k += 1
            if currtoken.type == token.STRING and currtoken.string.startswith(
                ("'", '"')
            ):
                sysstrtokens.add(currtoken)
            try:
                currtoken = tokens[k]
            except IndexError:
                break

    def _isitemaccess(j):
        """Assert the next tokens form an item access on `tokens[j]` and that
        `tokens[j]` is a name.
        """
        try:
            return (
                tokens[j].type == token.NAME
                and _isop(j + 1, '[')
                and tokens[j + 2].type == token.STRING
                and _isop(j + 3, ']')
            )
        except IndexError:
            return False

    def _ismethodcall(j, *methodnames):
        """Assert the next tokens form a call to one of `methodnames` with a
        string as first argument on `tokens[j]` and that `tokens[j]` is a
        name.
        """
        try:
            return (
                tokens[j].type == token.NAME
                and _isop(j + 1, '.')
                and tokens[j + 2].type == token.NAME
                and tokens[j + 2].string in methodnames
                and _isop(j + 3, '(')
                and tokens[j + 4].type == token.STRING
            )
        except IndexError:
            return False

    coldelta = 0  # column increment for new opening parens
    coloffset = -1  # column offset for the current line (-1: TBD)
    parens = [(0, 0, 0, -1)]  # stack of (line, end-column, column-offset, type)
    ignorenextline = False  # don't transform the next line
    insideignoreblock = False  # don't transform until turned off
    for i, t in enumerate(tokens):
        # Compute the column offset for the current line, such that
        # the current line will be aligned to the last opening paren
        # as before.
        if coloffset < 0:
            lastparen = parens[-1]
            if t.start[1] == lastparen[1]:
                coloffset = lastparen[2]
            elif t.start[1] + 1 == lastparen[1] and lastparen[3] not in (
                token.NEWLINE,
                tokenize.NL,
            ):
                # fix misaligned indent of s/util.Abort/error.Abort/
                coloffset = lastparen[2] + (lastparen[1] - t.start[1])
            else:
                coloffset = 0

        # Reset per-line attributes at EOL.
        if t.type in (token.NEWLINE, tokenize.NL):
            yield adjusttokenpos(t, coloffset)
            coldelta = 0
            coloffset = -1
            if not insideignoreblock:
                ignorenextline = (
                    tokens[i - 1].type == token.COMMENT
                    and tokens[i - 1].string == "# no-py3-transform"
                )
            continue

        if t.type == token.COMMENT:
            if t.string == "# py3-transform: off":
                insideignoreblock = True
            if t.string == "# py3-transform: on":
                insideignoreblock = False

        ignored = ignorenextline or insideignoreblock

        # Remember the last paren position. The stack is maintained for
        # ignored tokens too; otherwise a bracket opened on an ignored line
        # and closed on a transformed one (or the other way around) would
        # unbalance it and eventually pop the bottom sentinel.
        if _isop(i, '(', '[', '{'):
            if ignored:
                # Nothing is shifted in an ignored region: repeat the
                # current top so alignment is computed exactly as if this
                # bracket had not been seen.
                parens.append(parens[-1])
            else:
                parens.append(
                    t.end + (coloffset + coldelta, tokens[i + 1].type)
                )
        elif _isop(i, ')', ']', '}'):
            # Never pop the bottom sentinel, even on unbalanced input.
            if len(parens) > 1:
                parens.pop()

        if ignored:
            yield adjusttokenpos(t, coloffset)
            continue

        # Convert most string literals to byte literals. String literals
        # in Python 2 are bytes. String literals in Python 3 are unicode.
        # Most strings in Mercurial are bytes and unicode strings are rare.
        # Rather than rewrite all string literals to use ``b''`` to indicate
        # byte strings, we apply this token transformer to insert the ``b``
        # prefix nearly everywhere.
        if t.type == token.STRING and t not in sysstrtokens:
            s = t.string

            # Preserve docstrings as string literals. This is inconsistent
            # with regular unprefixed strings. However, the
            # "from __future__" parsing (which allows a module docstring to
            # exist before it) doesn't properly handle the docstring if it
            # is b''' prefixed, leading to a SyntaxError. We leave all
            # docstrings as unprefixed to avoid this. This means Mercurial
            # components touching docstrings need to handle unicode,
            # unfortunately.
            if s[0:3] in ("'''", '"""'):
                # If it's assigned to something, it's not a docstring
                if not _isop(i - 1, '='):
                    yield adjusttokenpos(t, coloffset)
                    continue

            # If the first character isn't a quote, it is likely a string
            # prefixing character (such as 'b', 'u', or 'r'). Ignore.
            if s[0] not in ("'", '"'):
                yield adjusttokenpos(t, coloffset)
                continue

            # String literal. Prefix to make a b'' string.
            yield adjusttokenpos(t._replace(string='b%s' % t.string), coloffset)
            coldelta += 1
            continue

        # This looks like a function call.
        if t.type == token.NAME and _isop(i + 1, '('):
            fn = t.string

            # *attr() builtins don't accept byte strings to 2nd argument.
            if fn in (
                'getattr',
                'setattr',
                'hasattr',
                'safehasattr',
                'wrapfunction',
                'wrapclass',
                'addattr',
            ):
                arg1idx = _findargnofcall(1)
                if arg1idx is not None:
                    _ensuresysstr(arg1idx)

            # .encode() and .decode() on str/bytes/unicode don't accept
            # byte strings on Python 3.
            elif fn in ('encode', 'decode') and _isop(i - 1, '.'):
                for argn in range(2):
                    argidx = _findargnofcall(argn)
                    if argidx is not None:
                        _ensuresysstr(argidx)

            # It changes iteritems/itervalues to items/values as they are
            # not present in Python 3 world.
            elif opts['dictiter'] and fn in ('iteritems', 'itervalues'):
                yield adjusttokenpos(t._replace(string=fn[4:]), coloffset)
                continue

        if t.type == token.NAME and t.string in opts['treat-as-kwargs']:
            if _isitemaccess(i):
                _ensuresysstr(i + 2)
            if _ismethodcall(i, 'get', 'pop', 'setdefault', 'popitem'):
                _ensuresysstr(i + 4)

        # Looks like "if __name__ == '__main__'".
        if (
            t.type == token.NAME
            and t.string == '__name__'
            and _isop(i + 1, '==')
        ):
            _ensuresysstr(i + 2)

        # Emit unmodified token.
        yield adjusttokenpos(t, coloffset)
Yuya Nishihara
|
r38404 | |||
Augie Fackler
|
r43346 | |||
Yuya Nishihara
|
def process(fin, fout, opts):
    """Rewrite the Python source read from ``fin`` and write it to ``fout``

    ``fin`` is tokenized with ``tokenize.tokenize()``, which reads lines as
    bytes through ``fin.readline``. The whole token stream is materialized
    as a list before being handed to ``replacetokens()`` together with
    ``opts``, and the rewritten stream is written to ``fout`` with a single
    ``write()`` call.
    """
    tokens = tokenize.tokenize(fin.readline)
    # replacetokens() indexes into the token sequence, so it needs a list
    # rather than the tokenizer's generator.
    tokens = replacetokens(list(tokens), opts)
    fout.write(tokenize.untokenize(tokens))
Augie Fackler
|
r43346 | |||
Yuya Nishihara
|
def tryunlink(fname):
    """Remove ``fname``, doing nothing if it does not exist

    Any other failure of ``os.unlink()`` (permission problem, ``fname`` is a
    directory, ...) is propagated to the caller.
    """
    try:
        os.unlink(fname)
    except FileNotFoundError:
        # ENOENT: already gone, which is the state we wanted
        pass
Augie Fackler
|
r43346 | |||
Yuya Nishihara
|
@contextlib.contextmanager
def editinplace(fname):
    """Context manager yielding a temporary file that replaces ``fname``

    A named temporary file is created next to ``fname`` (same directory,
    name ``.<basename>-...~``) and yielded, opened for binary writing. If
    the ``with`` body finishes without raising, the temporary file is
    closed, given the permission bits of the existing ``fname`` (if there is
    one) and renamed over ``fname``. If the body raises, ``fname`` is left
    alone. In both cases the temporary file is closed and any leftover
    temporary path is removed.
    """
    n = os.path.basename(fname)
    d = os.path.dirname(fname)
    fp = tempfile.NamedTemporaryFile(
        prefix='.%s-' % n, suffix='~', dir=d, delete=False
    )
    try:
        yield fp
        fp.close()
        # The temporary file is created with restrictive permissions; carry
        # over the target's mode so that e.g. executable scripts stay
        # executable after being rewritten.
        try:
            mode = os.stat(fname).st_mode & 0o7777
        except FileNotFoundError:
            pass
        else:
            os.chmod(fp.name, mode)
        if os.name == 'nt':
            # the target is unlinked first on Windows before renaming
            tryunlink(fname)
        os.rename(fp.name, fname)
    finally:
        # closing twice is harmless; after a successful rename the unlink
        # finds nothing and is ignored by tryunlink()
        fp.close()
        tryunlink(fp.name)
Augie Fackler
|
r43346 | |||
Yuya Nishihara
|
def main():
    """Command line entry point

    Parses ``sys.argv`` and runs ``process()`` on every FILE argument, after
    resolving it with ``os.path.realpath()``. With ``-i``/``--inplace`` each
    file is rewritten through ``editinplace()``; otherwise the result is
    written to ``sys.stdout.buffer``.

    Note that ``--treat-as-kwargs`` takes one or more values, so it consumes
    the plain arguments that follow it; put it after the FILE arguments or
    separate them with ``--``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument(
        '--version', action='version', version='Byteify strings 1.0'
    )
    ap.add_argument(
        '-i',
        '--inplace',
        action='store_true',
        default=False,
        help='edit files in place',
    )
    ap.add_argument(
        '--dictiter',
        action='store_true',
        default=False,
        help='rewrite iteritems() and itervalues()',
    )
    ap.add_argument(
        '--treat-as-kwargs',
        nargs="+",
        default=[],
        help="ignore kwargs-like objects",
    )
    ap.add_argument('files', metavar='FILE', nargs='+', help='source file')
    args = ap.parse_args()
    opts = {
        'dictiter': args.dictiter,
        'treat-as-kwargs': set(args.treat_as_kwargs),
    }
    for fname in args.files:
        fname = os.path.realpath(fname)
        if args.inplace:
            with editinplace(fname) as fout:
                with open(fname, 'rb') as fin:
                    process(fin, fout, opts)
        else:
            with open(fname, 'rb') as fin:
                # the rewritten source is bytes, hence the binary stream
                fout = sys.stdout.buffer
                process(fin, fout, opts)
Yuya Nishihara
|
r38404 | |||
Augie Fackler
|
r43346 | |||
Yuya Nishihara
|
if __name__ == '__main__':
    # Refuse to run on interpreters older than 3.7: report it and exit
    # with status 3.
    if sys.version_info[0:2] < (3, 7):
        print('This script must be run under Python 3.7+')
        sys.exit(3)
    main()