upstream/mercurial-mirror Commit - r38405:9f42e4a8

1

#!/usr/bin/env python3

1

#!/usr/bin/env python3

2

#

2

#

3

# byteify-strings.py - transform string literals to be Python 3 safe

3

# byteify-strings.py - transform string literals to be Python 3 safe

4

#

4

#

5

6

#

6

#

7

# This software may be used and distributed according to the terms of the

7

# This software may be used and distributed according to the terms of the

8

# GNU General Public License version 2 or any later version.

8

# GNU General Public License version 2 or any later version.

9

10

from __future__ import absolute_import

10

from __future__ import absolute_import

11

12

import argparse

12

import argparse

13

import contextlib

14

import errno

13

import io

15

import io

16

import os

14

import sys

17

import sys

18

import tempfile

15

import token

19

import token

16

import tokenize

20

import tokenize

17

21

18

if True:

22

if True:

19

def replacetokens(tokens, fullname):

23

def replacetokens(tokens, fullname):

20

"""Transform a stream of tokens from raw to Python 3.

24

"""Transform a stream of tokens from raw to Python 3.

21

25

22

Returns a generator of possibly rewritten tokens.

26

Returns a generator of possibly rewritten tokens.

23

27

24

The input token list may be mutated as part of processing. However,

28

The input token list may be mutated as part of processing. However,

25

its changes do not necessarily match the output token stream.

29

its changes do not necessarily match the output token stream.

26

"""

30

"""

27

futureimpline = False

31

futureimpline = False

28

32

29

# The following utility functions access the tokens list and i index of

33

# The following utility functions access the tokens list and i index of

30

# the for i, t enumerate(tokens) loop below

34

# the for i, t enumerate(tokens) loop below

31

def _isop(j, *o):

35

def _isop(j, *o):

32

"""Assert that tokens[j] is an OP with one of the given values"""

36

"""Assert that tokens[j] is an OP with one of the given values"""

33

try:

37

try:

34

return tokens[j].type == token.OP and tokens[j].string in o

38

return tokens[j].type == token.OP and tokens[j].string in o

35

except IndexError:

39

except IndexError:

36

return False

40

return False

37

41

38

def _findargnofcall(n):

42

def _findargnofcall(n):

39

"""Find arg n of a call expression (start at 0)

43

"""Find arg n of a call expression (start at 0)

40

44

41

Returns index of the first token of that argument, or None if

45

Returns index of the first token of that argument, or None if

42

there is not that many arguments.

46

there is not that many arguments.

43

47

44

Assumes that token[i + 1] is '('.

48

Assumes that token[i + 1] is '('.

45

49

46

"""

50

"""

47

nested = 0

51

nested = 0

48

for j in range(i + 2, len(tokens)):

52

for j in range(i + 2, len(tokens)):

49

if _isop(j, ')', ']', '}'):

53

if _isop(j, ')', ']', '}'):

50

# end of call, tuple, subscription or dict / set

54

# end of call, tuple, subscription or dict / set

51

nested -= 1

55

nested -= 1

52

if nested < 0:

56

if nested < 0:

53

return None

57

return None

54

elif n == 0:

58

elif n == 0:

55

# this is the starting position of arg

59

# this is the starting position of arg

56

return j

60

return j

57

elif _isop(j, '(', '[', '{'):

61

elif _isop(j, '(', '[', '{'):

58

nested += 1

62

nested += 1

59

elif _isop(j, ',') and nested == 0:

63

elif _isop(j, ',') and nested == 0:

60

n -= 1

64

n -= 1

61

65

62

return None

66

return None

63

67

64

def _ensureunicode(j):

68

def _ensureunicode(j):

65

"""Make sure the token at j is a unicode string

69

"""Make sure the token at j is a unicode string

66

70

67

This rewrites a string token to include the unicode literal prefix

71

This rewrites a string token to include the unicode literal prefix

68

so the string transformer won't add the byte prefix.

72

so the string transformer won't add the byte prefix.

69

73

70

Ignores tokens that are not strings. Assumes bounds checking has

74

Ignores tokens that are not strings. Assumes bounds checking has

71

already been done.

75

already been done.

72

76

73

"""

77

"""

74

st = tokens[j]

78

st = tokens[j]

75

if st.type == token.STRING and st.string.startswith(("'", '"')):

79

if st.type == token.STRING and st.string.startswith(("'", '"')):

76

tokens[j] = st._replace(string='u%s' % st.string)

80

tokens[j] = st._replace(string='u%s' % st.string)

77

81

78

for i, t in enumerate(tokens):

82

for i, t in enumerate(tokens):

79

# Convert most string literals to byte literals. String literals

83

# Convert most string literals to byte literals. String literals

80

# in Python 2 are bytes. String literals in Python 3 are unicode.

84

# in Python 2 are bytes. String literals in Python 3 are unicode.

81

# Most strings in Mercurial are bytes and unicode strings are rare.

85

# Most strings in Mercurial are bytes and unicode strings are rare.

82

# Rather than rewrite all string literals to use ``b''`` to indicate

86

# Rather than rewrite all string literals to use ``b''`` to indicate

83

# byte strings, we apply this token transformer to insert the ``b``

87

# byte strings, we apply this token transformer to insert the ``b``

84

# prefix nearly everywhere.

88

# prefix nearly everywhere.

85

if t.type == token.STRING:

89

if t.type == token.STRING:

86

s = t.string

90

s = t.string

87

91

88

# Preserve docstrings as string literals. This is inconsistent

92

# Preserve docstrings as string literals. This is inconsistent

89

# with regular unprefixed strings. However, the

93

# with regular unprefixed strings. However, the

90

# "from __future__" parsing (which allows a module docstring to

94

# "from __future__" parsing (which allows a module docstring to

91

# exist before it) doesn't properly handle the docstring if it

95

# exist before it) doesn't properly handle the docstring if it

92

# is b''' prefixed, leading to a SyntaxError. We leave all

96

# is b''' prefixed, leading to a SyntaxError. We leave all

93

# docstrings as unprefixed to avoid this. This means Mercurial

97

# docstrings as unprefixed to avoid this. This means Mercurial

94

# components touching docstrings need to handle unicode,

98

# components touching docstrings need to handle unicode,

95

# unfortunately.

99

# unfortunately.

96

if s[0:3] in ("'''", '"""'):

100

if s[0:3] in ("'''", '"""'):

97

yield t

101

yield t

98

continue

102

continue

99

103

100

# If the first character isn't a quote, it is likely a string

104

# If the first character isn't a quote, it is likely a string

101

# prefixing character (such as 'b', 'u', or 'r'. Ignore.

105

# prefixing character (such as 'b', 'u', or 'r'. Ignore.

102

if s[0] not in ("'", '"'):

106

if s[0] not in ("'", '"'):

103

yield t

107

yield t

104

continue

108

continue

105

109

106

# String literal. Prefix to make a b'' string.

110

# String literal. Prefix to make a b'' string.

107

yield t._replace(string='b%s' % t.string)

111

yield t._replace(string='b%s' % t.string)

108

continue

112

continue

109

113

110

# Insert compatibility imports at "from __future__ import" line.

114

# Insert compatibility imports at "from __future__ import" line.

111

# No '\n' should be added to preserve line numbers.

115

# No '\n' should be added to preserve line numbers.

112

if (t.type == token.NAME and t.string == 'import' and

116

if (t.type == token.NAME and t.string == 'import' and

113

all(u.type == token.NAME for u in tokens[i - 2:i]) and

117

all(u.type == token.NAME for u in tokens[i - 2:i]) and

114

[u.string for u in tokens[i - 2:i]] == ['from', '__future__']):

118

[u.string for u in tokens[i - 2:i]] == ['from', '__future__']):

115

futureimpline = True

119

futureimpline = True

116

if t.type == token.NEWLINE and futureimpline:

120

if t.type == token.NEWLINE and futureimpline:

117

futureimpline = False

121

futureimpline = False

118

if fullname == 'mercurial.pycompat':

122

if fullname == 'mercurial.pycompat':

119

yield t

123

yield t

120

continue

124

continue

121

r, c = t.start

125

r, c = t.start

122

l = (b'; from mercurial.pycompat import '

126

l = (b'; from mercurial.pycompat import '

123

b'delattr, getattr, hasattr, setattr, xrange, '

127

b'delattr, getattr, hasattr, setattr, xrange, '

124

b'open, unicode\n')

128

b'open, unicode\n')

125

for u in tokenize.tokenize(io.BytesIO(l).readline):

129

for u in tokenize.tokenize(io.BytesIO(l).readline):

126

if u.type in (tokenize.ENCODING, token.ENDMARKER):

130

if u.type in (tokenize.ENCODING, token.ENDMARKER):

127

continue

131

continue

128

yield u._replace(

132

yield u._replace(

129

start=(r, c + u.start[1]), end=(r, c + u.end[1]))

133

start=(r, c + u.start[1]), end=(r, c + u.end[1]))

130

continue

134

continue

131

135

132

# This looks like a function call.

136

# This looks like a function call.

133

if t.type == token.NAME and _isop(i + 1, '('):

137

if t.type == token.NAME and _isop(i + 1, '('):

134

fn = t.string

138

fn = t.string

135

139

136

# *attr() builtins don't accept byte strings to 2nd argument.

140

# *attr() builtins don't accept byte strings to 2nd argument.

137

if (fn in ('getattr', 'setattr', 'hasattr', 'safehasattr') and

141

if (fn in ('getattr', 'setattr', 'hasattr', 'safehasattr') and

138

not _isop(i - 1, '.')):

142

not _isop(i - 1, '.')):

139

arg1idx = _findargnofcall(1)

143

arg1idx = _findargnofcall(1)

140

if arg1idx is not None:

144

if arg1idx is not None:

141

_ensureunicode(arg1idx)

145

_ensureunicode(arg1idx)

142

146

143

# .encode() and .decode() on str/bytes/unicode don't accept

147

# .encode() and .decode() on str/bytes/unicode don't accept

144

# byte strings on Python 3.

148

# byte strings on Python 3.

145

elif fn in ('encode', 'decode') and _isop(i - 1, '.'):

149

elif fn in ('encode', 'decode') and _isop(i - 1, '.'):

146

for argn in range(2):

150

for argn in range(2):

147

argidx = _findargnofcall(argn)

151

argidx = _findargnofcall(argn)

148

if argidx is not None:

152

if argidx is not None:

149

_ensureunicode(argidx)

153

_ensureunicode(argidx)

150

154

151

# It changes iteritems/values to items/values as they are not

155

# It changes iteritems/values to items/values as they are not

152

# present in Python 3 world.

156

# present in Python 3 world.

153

elif fn in ('iteritems', 'itervalues'):

157

elif fn in ('iteritems', 'itervalues'):

154

yield t._replace(string=fn[4:])

158

yield t._replace(string=fn[4:])

155

continue

159

continue

156

160

157

# Emit unmodified token.

161

# Emit unmodified token.

158

yield t

162

yield t

159

163

160

def process(fin, fout):

164

def process(fin, fout):

161

tokens = tokenize.tokenize(fin.readline)

165

tokens = tokenize.tokenize(fin.readline)

162

tokens = replacetokens(list(tokens), fullname='<dummy>')

166

tokens = replacetokens(list(tokens), fullname='<dummy>')

163

fout.write(tokenize.untokenize(tokens))

167

fout.write(tokenize.untokenize(tokens))

164

168

169

def tryunlink(fname):

170

try:

171

os.unlink(fname)

172

except OSError as err:

173

if err.errno != errno.ENOENT:

174

raise

175

176

@contextlib.contextmanager

177

def editinplace(fname):

178

n = os.path.basename(fname)

179

d = os.path.dirname(fname)

180

fp = tempfile.NamedTemporaryFile(prefix='.%s-' % n, suffix='~', dir=d,

181

delete=False)

182

try:

183

yield fp

184

fp.close()

185

if os.name == 'nt':

186

tryunlink(fname)

187

os.rename(fp.name, fname)

188

finally:

189

fp.close()

190

tryunlink(fp.name)

191

165

def main():

192

def main():

166

ap = argparse.ArgumentParser()

193

ap = argparse.ArgumentParser()

194

ap.add_argument('-i', '--inplace', action='store_true', default=False,

195

help='edit files in place')

167

ap.add_argument('files', metavar='FILE', nargs='+', help='source file')

196

ap.add_argument('files', metavar='FILE', nargs='+', help='source file')

168

args = ap.parse_args()

197

args = ap.parse_args()

169

for fname in args.files:

198

for fname in args.files:

199

if args.inplace:

200

with editinplace(fname) as fout:

201

with open(fname, 'rb') as fin:

202

process(fin, fout)

203

else:

170

with open(fname, 'rb') as fin:

204

with open(fname, 'rb') as fin:

171

fout = sys.stdout.buffer

205

fout = sys.stdout.buffer

172

process(fin, fout)

206

process(fin, fout)

173

207

174

if __name__ == '__main__':

208

if __name__ == '__main__':

175

main()

209

main()

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

             #!/usr/bin/env python3
             #
             # byteify-strings.py - transform string literals to be Python 3 safe
             #
             # Copyright 2015 Gregory Szorc <gregory.szorc@gmail.com>
             #
             # This software may be used and distributed according to the terms of the
             # GNU General Public License version 2 or any later version.
             from __future__ import absolute_import
             import argparse
+            import contextlib
+            import errno
             import io
+            import os
             import sys
+            import tempfile
             import token
             import tokenize
             if True:
                 # NOTE(review): this always-true guard only adds one indentation
                 # level; presumably it is kept so the body lines up with the code it
                 # was derived from -- confirm before flattening.
                 def replacetokens(tokens, fullname):
                     """Transform a stream of tokens from raw to Python 3.

                     ``tokens`` is an indexable list of ``tokenize.TokenInfo`` tuples;
                     it is looked ahead into and modified in place. ``fullname`` is the
                     dotted name of the module being transformed; the compatibility
                     import is not injected when it is ``'mercurial.pycompat'``.

                     Returns a generator of possibly rewritten tokens.

                     The input token list may be mutated as part of processing. However,
                     its changes do not necessarily match the output token stream.
                     """
                     futureimpline = False

                     # The following utility functions access the tokens list and the i
                     # index of the ``for i, t in enumerate(tokens)`` loop below.
                     def _isop(j, *o):
                         """Return True if tokens[j] is an OP with one of the given values

                         An index past the end of the token list yields False.
                         """
                         try:
                             return tokens[j].type == token.OP and tokens[j].string in o
                         except IndexError:
                             return False

                     def _findargnofcall(n):
                         """Find arg n of a call expression (start at 0)

                         Returns index of the first token of that argument, or None if
                         there are not that many arguments.

                         Assumes that tokens[i + 1] is '('.

                         """
                         nested = 0
                         for j in range(i + 2, len(tokens)):
                             if _isop(j, ')', ']', '}'):
                                 # end of call, tuple, subscription or dict / set
                                 nested -= 1
                                 if nested < 0:
                                     return None
                             elif tokens[j].type in (tokenize.NL, tokenize.COMMENT):
                                 # Line breaks and comments inside the parentheses are
                                 # layout only. An argument never starts on one, so
                                 # skip them; otherwise an argument written on a
                                 # continuation line would be reported as the NL
                                 # token and its string literal would be missed.
                                 continue
                             elif n == 0:
                                 # this is the starting position of arg
                                 return j
                             elif _isop(j, '(', '[', '{'):
                                 nested += 1
                             elif _isop(j, ',') and nested == 0:
                                 n -= 1

                         return None

                     def _ensureunicode(j):
                         """Make sure the token at j is a unicode string

                         This rewrites a string token to include the unicode literal prefix
                         so the string transformer won't add the byte prefix.

                         Ignores tokens that are not strings. Assumes bounds checking has
                         already been done.

                         """
                         st = tokens[j]
                         if st.type == token.STRING and st.string.startswith(("'", '"')):
                             # Stored back into the list so the main loop sees the
                             # prefixed token when it reaches index j.
                             tokens[j] = st._replace(string='u%s' % st.string)

                     for i, t in enumerate(tokens):
                         # Convert most string literals to byte literals. String literals
                         # in Python 2 are bytes. String literals in Python 3 are unicode.
                         # Most strings in Mercurial are bytes and unicode strings are rare.
                         # Rather than rewrite all string literals to use ``b''`` to indicate
                         # byte strings, we apply this token transformer to insert the ``b``
                         # prefix nearly everywhere.
                         if t.type == token.STRING:
                             s = t.string

                             # Preserve docstrings as string literals. This is inconsistent
                             # with regular unprefixed strings. However, the
                             # "from __future__" parsing (which allows a module docstring to
                             # exist before it) doesn't properly handle the docstring if it
                             # is b''' prefixed, leading to a SyntaxError. We leave all
                             # docstrings as unprefixed to avoid this. This means Mercurial
                             # components touching docstrings need to handle unicode,
                             # unfortunately.
                             if s[0:3] in ("'''", '"""'):
                                 yield t
                                 continue

                             # If the first character isn't a quote, it is likely a string
                             # prefixing character (such as 'b', 'u', or 'r'). Ignore.
                             if s[0] not in ("'", '"'):
                                 yield t
                                 continue

                             # String literal. Prefix to make a b'' string.
                             yield t._replace(string='b%s' % t.string)
                             continue

                         # Insert compatibility imports at "from __future__ import" line.
                         # No '\n' should be added to preserve line numbers.
                         if (t.type == token.NAME and t.string == 'import' and
                             all(u.type == token.NAME for u in tokens[i - 2:i]) and
                             [u.string for u in tokens[i - 2:i]] == ['from', '__future__']):
                             futureimpline = True
                         if t.type == token.NEWLINE and futureimpline:
                             futureimpline = False
                             if fullname == 'mercurial.pycompat':
                                 yield t
                                 continue
                             r, c = t.start
                             compatline = (b'; from mercurial.pycompat import '
                                           b'delattr, getattr, hasattr, setattr, xrange, '
                                           b'open, unicode\n')
                             # The tokens of the extra statement (including its own
                             # NEWLINE) take the place of the original NEWLINE token,
                             # shifted so they continue the same physical line.
                             for u in tokenize.tokenize(io.BytesIO(compatline).readline):
                                 if u.type in (tokenize.ENCODING, token.ENDMARKER):
                                     continue
                                 yield u._replace(
                                     start=(r, c + u.start[1]), end=(r, c + u.end[1]))
                             continue

                         # This looks like a function call.
                         if t.type == token.NAME and _isop(i + 1, '('):
                             fn = t.string

                             # *attr() builtins don't accept byte strings to 2nd argument.
                             if (fn in ('getattr', 'setattr', 'hasattr', 'safehasattr') and
                                     not _isop(i - 1, '.')):
                                 arg1idx = _findargnofcall(1)
                                 if arg1idx is not None:
                                     _ensureunicode(arg1idx)

                             # .encode() and .decode() on str/bytes/unicode don't accept
                             # byte strings on Python 3.
                             elif fn in ('encode', 'decode') and _isop(i - 1, '.'):
                                 for argn in range(2):
                                     argidx = _findargnofcall(argn)
                                     if argidx is not None:
                                         _ensureunicode(argidx)

                             # It changes iteritems/values to items/values as they are not
                             # present in Python 3 world.
                             elif fn in ('iteritems', 'itervalues'):
                                 yield t._replace(string=fn[4:])
                                 continue

                         # Emit unmodified token.
                         yield t
             def process(fin, fout):
                 """Byteify the Python source read from ``fin`` and write it to ``fout``.

                 ``fin`` is read through its ``readline`` method by the tokenizer and
                 ``fout`` receives the rewritten source in a single ``write`` call.
                 """
                 # The token stream is materialized because replacetokens() indexes
                 # into it and modifies entries ahead of its current position.
                 rawtokens = list(tokenize.tokenize(fin.readline))
                 rewritten = replacetokens(rawtokens, fullname='<dummy>')
                 fout.write(tokenize.untokenize(rewritten))
+            def tryunlink(fname):
+                try:
+                    os.unlink(fname)
+                except OSError as err:
+                    if err.errno != errno.ENOENT:
+                        raise
+            @contextlib.contextmanager
+            def editinplace(fname):
+                n = os.path.basename(fname)
+                d = os.path.dirname(fname)
+                fp = tempfile.NamedTemporaryFile(prefix='.%s-' % n, suffix='~', dir=d,
+                                                 delete=False)
+                try:
+                    yield fp
+                    fp.close()
+                    if os.name == 'nt':
+                        tryunlink(fname)
+                    os.rename(fp.name, fname)
+                finally:
+                    fp.close()
+                    tryunlink(fp.name)
             def main():
                 """Command-line entry point: byteify every FILE given as argument.

                 With ``-i``/``--inplace`` each file is rewritten on disk through
                 editinplace(); otherwise the transformed source is written to the
                 binary standard output stream.
                 """
                 ap = argparse.ArgumentParser()
                 ap.add_argument('-i', '--inplace', action='store_true', default=False,
                                 help='edit files in place')
                 ap.add_argument('files', metavar='FILE', nargs='+', help='source file')
                 args = ap.parse_args()
                 for fname in args.files:
                     if args.inplace:
                         with editinplace(fname) as fout:
                             with open(fname, 'rb') as fin:
                                 process(fin, fout)
                     else:
                         # The body of this branch must be indented under ``else:``.
                         with open(fname, 'rb') as fin:
                             # process() writes bytes, hence the underlying buffer.
                             fout = sys.stdout.buffer
                             process(fin, fout)
             # Run the command-line interface only when executed as a script.
             if __name__ == '__main__':
                 main()