upstream/mercurial-mirror Commit - r38409:47dd23e6

1

#!/usr/bin/env python3

1

#!/usr/bin/env python3

2

#

2

#

3

# byteify-strings.py - transform string literals to be Python 3 safe

3

# byteify-strings.py - transform string literals to be Python 3 safe

4

#

4

#

5

6

#

6

#

7

# This software may be used and distributed according to the terms of the

7

# This software may be used and distributed according to the terms of the

8

# GNU General Public License version 2 or any later version.

8

# GNU General Public License version 2 or any later version.

9

10

from __future__ import absolute_import

10

from __future__ import absolute_import

11

12

import argparse

12

import argparse

13

import contextlib

13

import contextlib

14

import errno

14

import errno

15

import os

15

import os

16

import sys

16

import sys

17

import tempfile

17

import tempfile

18

import token

18

import token

19

import tokenize

19

import tokenize

20

21

def adjusttokenpos(t, ofs):

22

"""Adjust start/end column of the given token"""

23

return t._replace(start=(t.start[0], t.start[1] + ofs),

24

end=(t.end[0], t.end[1] + ofs))

25

21

if True:

26

if True:

22

def replacetokens(tokens, opts):

27

def replacetokens(tokens, opts):

23

"""Transform a stream of tokens from raw to Python 3.

28

"""Transform a stream of tokens from raw to Python 3.

24

29

25

Returns a generator of possibly rewritten tokens.

30

Returns a generator of possibly rewritten tokens.

26

31

27

The input token list may be mutated as part of processing. However,

32

The input token list may be mutated as part of processing. However,

28

its changes do not necessarily match the output token stream.

33

its changes do not necessarily match the output token stream.

29

"""

34

"""

30

sysstrtokens = set()

35

sysstrtokens = set()

31

36

32

# The following utility functions access the tokens list and i index of

37

# The following utility functions access the tokens list and i index of

33

# the for i, t enumerate(tokens) loop below

38

# the for i, t enumerate(tokens) loop below

34

def _isop(j, *o):

39

def _isop(j, *o):

35

"""Assert that tokens[j] is an OP with one of the given values"""

40

"""Assert that tokens[j] is an OP with one of the given values"""

36

try:

41

try:

37

return tokens[j].type == token.OP and tokens[j].string in o

42

return tokens[j].type == token.OP and tokens[j].string in o

38

except IndexError:

43

except IndexError:

39

return False

44

return False

40

45

41

def _findargnofcall(n):

46

def _findargnofcall(n):

42

"""Find arg n of a call expression (start at 0)

47

"""Find arg n of a call expression (start at 0)

43

48

44

Returns index of the first token of that argument, or None if

49

Returns index of the first token of that argument, or None if

45

there is not that many arguments.

50

there is not that many arguments.

46

51

47

Assumes that token[i + 1] is '('.

52

Assumes that token[i + 1] is '('.

48

53

49

"""

54

"""

50

nested = 0

55

nested = 0

51

for j in range(i + 2, len(tokens)):

56

for j in range(i + 2, len(tokens)):

52

if _isop(j, ')', ']', '}'):

57

if _isop(j, ')', ']', '}'):

53

# end of call, tuple, subscription or dict / set

58

# end of call, tuple, subscription or dict / set

54

nested -= 1

59

nested -= 1

55

if nested < 0:

60

if nested < 0:

56

return None

61

return None

57

elif n == 0:

62

elif n == 0:

58

# this is the starting position of arg

63

# this is the starting position of arg

59

return j

64

return j

60

elif _isop(j, '(', '[', '{'):

65

elif _isop(j, '(', '[', '{'):

61

nested += 1

66

nested += 1

62

elif _isop(j, ',') and nested == 0:

67

elif _isop(j, ',') and nested == 0:

63

n -= 1

68

n -= 1

64

69

65

return None

70

return None

66

71

67

def _ensuresysstr(j):

72

def _ensuresysstr(j):

68

"""Make sure the token at j is a system string

73

"""Make sure the token at j is a system string

69

74

70

Remember the given token so the string transformer won't add

75

Remember the given token so the string transformer won't add

71

the byte prefix.

76

the byte prefix.

72

77

73

Ignores tokens that are not strings. Assumes bounds checking has

78

Ignores tokens that are not strings. Assumes bounds checking has

74

already been done.

79

already been done.

75

80

76

"""

81

"""

77

st = tokens[j]

82

st = tokens[j]

78

if st.type == token.STRING and st.string.startswith(("'", '"')):

83

if st.type == token.STRING and st.string.startswith(("'", '"')):

79

sysstrtokens.add(st)

84

sysstrtokens.add(st)

80

85

86

coldelta = 0 # column increment for new opening parens

87

coloffset = -1 # column offset for the current line (-1: TBD)

88

parens = [(0, 0, 0)] # stack of (line, end-column, column-offset)

81

for i, t in enumerate(tokens):

89

for i, t in enumerate(tokens):

90

# Compute the column offset for the current line, such that

91

# the current line will be aligned to the last opening paren

92

# as before.

93

if coloffset < 0:

94

if t.start[1] == parens[-1][1]:

95

coloffset = parens[-1][2]

96

elif t.start[1] + 1 == parens[-1][1]:

97

# fix misaligned indent of s/util.Abort/error.Abort/

98

coloffset = parens[-1][2] + (parens[-1][1] - t.start[1])

99

else:

100

coloffset = 0

101

102

# Reset per-line attributes at EOL.

103

if t.type in (token.NEWLINE, tokenize.NL):

104

yield adjusttokenpos(t, coloffset)

105

coldelta = 0

106

coloffset = -1

107

continue

108

109

# Remember the last paren position.

110

if _isop(i, '(', '[', '{'):

111

parens.append(t.end + (coloffset + coldelta,))

112

elif _isop(i, ')', ']', '}'):

113

parens.pop()

114

82

# Convert most string literals to byte literals. String literals

115

# Convert most string literals to byte literals. String literals

83

# in Python 2 are bytes. String literals in Python 3 are unicode.

116

# in Python 2 are bytes. String literals in Python 3 are unicode.

84

# Most strings in Mercurial are bytes and unicode strings are rare.

117

# Most strings in Mercurial are bytes and unicode strings are rare.

85

# Rather than rewrite all string literals to use ``b''`` to indicate

118

# Rather than rewrite all string literals to use ``b''`` to indicate

86

# byte strings, we apply this token transformer to insert the ``b``

119

# byte strings, we apply this token transformer to insert the ``b``

87

# prefix nearly everywhere.

120

# prefix nearly everywhere.

88

if t.type == token.STRING and t not in sysstrtokens:

121

if t.type == token.STRING and t not in sysstrtokens:

89

s = t.string

122

s = t.string

90

123

91

# Preserve docstrings as string literals. This is inconsistent

124

# Preserve docstrings as string literals. This is inconsistent

92

# with regular unprefixed strings. However, the

125

# with regular unprefixed strings. However, the

93

# "from __future__" parsing (which allows a module docstring to

126

# "from __future__" parsing (which allows a module docstring to

94

# exist before it) doesn't properly handle the docstring if it

127

# exist before it) doesn't properly handle the docstring if it

95

# is b''' prefixed, leading to a SyntaxError. We leave all

128

# is b''' prefixed, leading to a SyntaxError. We leave all

96

# docstrings as unprefixed to avoid this. This means Mercurial

129

# docstrings as unprefixed to avoid this. This means Mercurial

97

# components touching docstrings need to handle unicode,

130

# components touching docstrings need to handle unicode,

98

# unfortunately.

131

# unfortunately.

99

if s[0:3] in ("'''", '"""'):

132

if s[0:3] in ("'''", '"""'):

100

yield t

133

yield adjusttokenpos(t, coloffset)

101

continue

134

continue

102

135

103

# If the first character isn't a quote, it is likely a string

136

# If the first character isn't a quote, it is likely a string

104

# prefixing character (such as 'b', 'u', or 'r'. Ignore.

137

# prefixing character (such as 'b', 'u', or 'r'. Ignore.

105

if s[0] not in ("'", '"'):

138

if s[0] not in ("'", '"'):

106

yield t

139

yield adjusttokenpos(t, coloffset)

107

continue

140

continue

108

141

109

# String literal. Prefix to make a b'' string.

142

# String literal. Prefix to make a b'' string.

110

yield t._replace(string='b%s' % t.string)

143

yield adjusttokenpos(t._replace(string='b%s' % t.string),

144

coloffset)

145

coldelta += 1

111

continue

146

continue

112

147

113

# This looks like a function call.

148

# This looks like a function call.

114

if t.type == token.NAME and _isop(i + 1, '('):

149

if t.type == token.NAME and _isop(i + 1, '('):

115

fn = t.string

150

fn = t.string

116

151

117

# *attr() builtins don't accept byte strings to 2nd argument.

152

# *attr() builtins don't accept byte strings to 2nd argument.

118

if (fn in ('getattr', 'setattr', 'hasattr', 'safehasattr') and

153

if (fn in ('getattr', 'setattr', 'hasattr', 'safehasattr') and

119

not _isop(i - 1, '.')):

154

not _isop(i - 1, '.')):

120

arg1idx = _findargnofcall(1)

155

arg1idx = _findargnofcall(1)

121

if arg1idx is not None:

156

if arg1idx is not None:

122

_ensuresysstr(arg1idx)

157

_ensuresysstr(arg1idx)

123

158

124

# .encode() and .decode() on str/bytes/unicode don't accept

159

# .encode() and .decode() on str/bytes/unicode don't accept

125

# byte strings on Python 3.

160

# byte strings on Python 3.

126

elif fn in ('encode', 'decode') and _isop(i - 1, '.'):

161

elif fn in ('encode', 'decode') and _isop(i - 1, '.'):

127

for argn in range(2):

162

for argn in range(2):

128

argidx = _findargnofcall(argn)

163

argidx = _findargnofcall(argn)

129

if argidx is not None:

164

if argidx is not None:

130

_ensuresysstr(argidx)

165

_ensuresysstr(argidx)

131

166

132

# It changes iteritems/values to items/values as they are not

167

# It changes iteritems/values to items/values as they are not

133

# present in Python 3 world.

168

# present in Python 3 world.

134

elif opts['dictiter'] and fn in ('iteritems', 'itervalues'):

169

elif opts['dictiter'] and fn in ('iteritems', 'itervalues'):

135

yield t._replace(string=fn[4:])

170

yield adjusttokenpos(t._replace(string=fn[4:]), coloffset)

136

continue

171

continue

137

172

138

# Emit unmodified token.

173

# Emit unmodified token.

139

yield t

174

yield adjusttokenpos(t, coloffset)

140

175

141

def process(fin, fout, opts):

176

def process(fin, fout, opts):

142

tokens = tokenize.tokenize(fin.readline)

177

tokens = tokenize.tokenize(fin.readline)

143

tokens = replacetokens(list(tokens), opts)

178

tokens = replacetokens(list(tokens), opts)

144

fout.write(tokenize.untokenize(tokens))

179

fout.write(tokenize.untokenize(tokens))

145

180

146

def tryunlink(fname):

181

def tryunlink(fname):

147

try:

182

try:

148

os.unlink(fname)

183

os.unlink(fname)

149

except OSError as err:

184

except OSError as err:

150

if err.errno != errno.ENOENT:

185

if err.errno != errno.ENOENT:

151

raise

186

raise

152

187

153

@contextlib.contextmanager

188

@contextlib.contextmanager

154

def editinplace(fname):

189

def editinplace(fname):

155

n = os.path.basename(fname)

190

n = os.path.basename(fname)

156

d = os.path.dirname(fname)

191

d = os.path.dirname(fname)

157

fp = tempfile.NamedTemporaryFile(prefix='.%s-' % n, suffix='~', dir=d,

192

fp = tempfile.NamedTemporaryFile(prefix='.%s-' % n, suffix='~', dir=d,

158

delete=False)

193

delete=False)

159

try:

194

try:

160

yield fp

195

yield fp

161

fp.close()

196

fp.close()

162

if os.name == 'nt':

197

if os.name == 'nt':

163

tryunlink(fname)

198

tryunlink(fname)

164

os.rename(fp.name, fname)

199

os.rename(fp.name, fname)

165

finally:

200

finally:

166

fp.close()

201

fp.close()

167

tryunlink(fp.name)

202

tryunlink(fp.name)

168

203

169

def main():

204

def main():

170

ap = argparse.ArgumentParser()

205

ap = argparse.ArgumentParser()

171

ap.add_argument('-i', '--inplace', action='store_true', default=False,

206

ap.add_argument('-i', '--inplace', action='store_true', default=False,

172

help='edit files in place')

207

help='edit files in place')

173

ap.add_argument('--dictiter', action='store_true', default=False,

208

ap.add_argument('--dictiter', action='store_true', default=False,

174

help='rewrite iteritems() and itervalues()'),

209

help='rewrite iteritems() and itervalues()'),

175

ap.add_argument('files', metavar='FILE', nargs='+', help='source file')

210

ap.add_argument('files', metavar='FILE', nargs='+', help='source file')

176

args = ap.parse_args()

211

args = ap.parse_args()

177

opts = {

212

opts = {

178

'dictiter': args.dictiter,

213

'dictiter': args.dictiter,

179

}

214

}

180

for fname in args.files:

215

for fname in args.files:

181

if args.inplace:

216

if args.inplace:

182

with editinplace(fname) as fout:

217

with editinplace(fname) as fout:

183

with open(fname, 'rb') as fin:

218

with open(fname, 'rb') as fin:

184

process(fin, fout, opts)

219

process(fin, fout, opts)

185

else:

220

else:

186

with open(fname, 'rb') as fin:

221

with open(fname, 'rb') as fin:

187

fout = sys.stdout.buffer

222

fout = sys.stdout.buffer

188

process(fin, fout, opts)

223

process(fin, fout, opts)

189

224

190

if __name__ == '__main__':

225

if __name__ == '__main__':

191

main()

226

main()

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

             #!/usr/bin/env python3
             #
             # byteify-strings.py - transform string literals to be Python 3 safe
             #
             # Copyright 2015 Gregory Szorc <gregory.szorc@gmail.com>
             #
             # This software may be used and distributed according to the terms of the
             # GNU General Public License version 2 or any later version.
             from __future__ import absolute_import
             import argparse
             import contextlib
             import errno
             import os
             import sys
             import tempfile
             import token
             import tokenize
+            def adjusttokenpos(t, ofs):
+                """Adjust start/end column of the given token"""
+                return t._replace(start=(t.start[0], t.start[1] + ofs),
+                                  end=(t.end[0], t.end[1] + ofs))
             if True:
                 def replacetokens(tokens, opts):
                     """Transform a stream of tokens from raw to Python 3.
                     Returns a generator of possibly rewritten tokens.
                     The input token list may be mutated as part of processing. However,
                     its changes do not necessarily match the output token stream.
                     """
                     sysstrtokens = set()
                     # The following utility functions access the tokens list and i index of
                     # the "for i, t in enumerate(tokens)" loop below
                     def _isop(j, *o):
                         """Assert that tokens[j] is an OP with one of the given values"""
                         try:
                             return tokens[j].type == token.OP and tokens[j].string in o
                         except IndexError:
                             return False
                     def _findargnofcall(n):
                         """Find arg n of a call expression (start at 0)
                         Returns index of the first token of that argument, or None if
                         there are not that many arguments.
                         Assumes that tokens[i + 1] is '('.
                         """
                         nested = 0
                         for j in range(i + 2, len(tokens)):
                             if _isop(j, ')', ']', '}'):
                                 # end of call, tuple, subscription or dict / set
                                 nested -= 1
                                 if nested < 0:
                                     return None
                             elif n == 0:
                                 # this is the starting position of arg
                                 return j
                             elif _isop(j, '(', '[', '{'):
                                 nested += 1
                             elif _isop(j, ',') and nested == 0:
                                 n -= 1
                         return None
                     def _ensuresysstr(j):
                         """Make sure the token at j is a system string
                         Remember the given token so the string transformer won't add
                         the byte prefix.
                         Ignores tokens that are not strings. Assumes bounds checking has
                         already been done.
                         """
                         st = tokens[j]
                         if st.type == token.STRING and st.string.startswith(("'", '"')):
                             sysstrtokens.add(st)
+                    coldelta = 0  # column increment for new opening parens
+                    coloffset = -1  # column offset for the current line (-1: TBD)
+                    parens = [(0, 0, 0)]  # stack of (line, end-column, column-offset)
                     for i, t in enumerate(tokens):
+                        # Compute the column offset for the current line, such that
+                        # the current line will be aligned to the last opening paren
+                        # as before.
+                        if coloffset < 0:
+                            if t.start[1] == parens[-1][1]:
+                                coloffset = parens[-1][2]
+                            elif t.start[1] + 1 == parens[-1][1]:
+                                # fix misaligned indent of s/util.Abort/error.Abort/
+                                coloffset = parens[-1][2] + (parens[-1][1] - t.start[1])
+                            else:
+                                coloffset = 0
+                        # Reset per-line attributes at EOL.
+                        if t.type in (token.NEWLINE, tokenize.NL):
+                            yield adjusttokenpos(t, coloffset)
+                            coldelta = 0
+                            coloffset = -1
+                            continue
+                        # Remember the last paren position.
+                        if _isop(i, '(', '[', '{'):
+                            parens.append(t.end + (coloffset + coldelta,))
+                        elif _isop(i, ')', ']', '}'):
+                            parens.pop()
                         # Convert most string literals to byte literals. String literals
                         # in Python 2 are bytes. String literals in Python 3 are unicode.
                         # Most strings in Mercurial are bytes and unicode strings are rare.
                         # Rather than rewrite all string literals to use ``b''`` to indicate
                         # byte strings, we apply this token transformer to insert the ``b``
                         # prefix nearly everywhere.
                         if t.type == token.STRING and t not in sysstrtokens:
                             s = t.string
                             # Preserve docstrings as string literals. This is inconsistent
                             # with regular unprefixed strings. However, the
                             # "from __future__" parsing (which allows a module docstring to
                             # exist before it) doesn't properly handle the docstring if it
                             # is b''' prefixed, leading to a SyntaxError. We leave all
                             # docstrings as unprefixed to avoid this. This means Mercurial
                             # components touching docstrings need to handle unicode,
                             # unfortunately.
                             if s[0:3] in ("'''", '"""'):
-                                yield t
+                                yield adjusttokenpos(t, coloffset)
                                 continue
                             # If the first character isn't a quote, it is likely a string
                             # prefixing character (such as 'b', 'u', or 'r'). Ignore.
                             if s[0] not in ("'", '"'):
-                                yield t
+                                yield adjusttokenpos(t, coloffset)
                                 continue
                             # String literal. Prefix to make a b'' string.
-                            yield t._replace(string='b%s' % t.string)
+                            yield adjusttokenpos(t._replace(string='b%s' % t.string),
+                                                 coloffset)
+                            coldelta += 1
                             continue
                         # This looks like a function call.
                         if t.type == token.NAME and _isop(i + 1, '('):
                             fn = t.string
                             # *attr() builtins don't accept byte strings to 2nd argument.
                             if (fn in ('getattr', 'setattr', 'hasattr', 'safehasattr') and
                                     not _isop(i - 1, '.')):
                                 arg1idx = _findargnofcall(1)
                                 if arg1idx is not None:
                                     _ensuresysstr(arg1idx)
                             # .encode() and .decode() on str/bytes/unicode don't accept
                             # byte strings on Python 3.
                             elif fn in ('encode', 'decode') and _isop(i - 1, '.'):
                                 for argn in range(2):
                                     argidx = _findargnofcall(argn)
                                     if argidx is not None:
                                         _ensuresysstr(argidx)
                             # Rewrite iteritems()/itervalues() to items()/values(), as
                             # the iter* variants are not present in Python 3.
                             elif opts['dictiter'] and fn in ('iteritems', 'itervalues'):
-                                yield t._replace(string=fn[4:])
+                                yield adjusttokenpos(t._replace(string=fn[4:]), coloffset)
                                 continue
                         # Emit unmodified token.
-                        yield t
+                        yield adjusttokenpos(t, coloffset)
             def process(fin, fout, opts):
                 """Rewrite the Python source read from fin and write it to fout.

                 fin supplies the ``readline`` callable handed to
                 ``tokenize.tokenize``; the untokenized result is written to fout
                 with a single ``write`` call.  opts is passed through unchanged to
                 replacetokens().
                 """
                 # replacetokens() indexes into the token stream, so give it a list
                 rawtokens = list(tokenize.tokenize(fin.readline))
                 rewritten = replacetokens(rawtokens, opts)
                 fout.write(tokenize.untokenize(rewritten))
             def tryunlink(fname):
                 """Remove fname, treating an already-missing file as success.

                 Any OSError other than ENOENT is re-raised to the caller.
                 """
                 try:
                     os.unlink(fname)
                 except OSError as exc:
                     if exc.errno == errno.ENOENT:
                         # nothing to remove; that is the state we wanted anyway
                         return
                     raise
             @contextlib.contextmanager
             def editinplace(fname):
                 """Yield a temporary file object that replaces fname on success.

                 The temporary file is created in the same directory as fname, named
                 after it ('.<basename>-' prefix, '~' suffix).  When the with-body
                 finishes without raising, the temporary file is closed and moved
                 over fname.  If the body (or the move) raises, fname is left as it
                 was and the temporary file is removed.
                 """
                 n = os.path.basename(fname)
                 d = os.path.dirname(fname)
                 fp = tempfile.NamedTemporaryFile(prefix='.%s-' % n, suffix='~', dir=d,
                                                  delete=False)
                 try:
                     yield fp
                     fp.close()
                     # os.replace() overwrites an existing destination on every
                     # platform, so fname never has to be deleted before the move
                     # (deleting first would lose the original if the move failed).
                     os.replace(fp.name, fname)
                 finally:
                     # A second close() is a no-op after success; on failure it
                     # releases the handle so the leftover file can be unlinked.
                     fp.close()
                     # After a successful move the temporary name is already gone,
                     # which tryunlink() tolerates.
                     tryunlink(fp.name)
             def main():
                 """Parse the command line and process every FILE argument.

                 With -i/--inplace each file is rewritten through editinplace();
                 otherwise the transformed source is written to sys.stdout.buffer.
                 --dictiter is forwarded to process() as opts['dictiter'].
                 """
                 ap = argparse.ArgumentParser()
                 ap.add_argument('-i', '--inplace', action='store_true', default=False,
                                 help='edit files in place')
                 ap.add_argument('--dictiter', action='store_true', default=False,
                                 help='rewrite iteritems() and itervalues()')
                 ap.add_argument('files', metavar='FILE', nargs='+', help='source file')
                 args = ap.parse_args()
                 opts = {
                     'dictiter': args.dictiter,
                 }
                 for fname in args.files:
                     if args.inplace:
                         # output goes to a temporary file that replaces fname
                         # only after processing succeeded
                         with editinplace(fname) as fout:
                             with open(fname, 'rb') as fin:
                                 process(fin, fout, opts)
                     else:
                         with open(fname, 'rb') as fin:
                             # process() writes what tokenize.untokenize() returns,
                             # so use the binary layer of stdout
                             fout = sys.stdout.buffer
                             process(fin, fout, opts)
             if __name__ == '__main__':
                 main()