##// END OF EJS Templates
byteify-strings: do not rewrite iteritems() and itervalues() by default...
Yuya Nishihara -
r38407:f701bc93 default
parent child Browse files
Show More
@@ -1,184 +1,189 b''
1 #!/usr/bin/env python3
1 #!/usr/bin/env python3
2 #
2 #
3 # byteify-strings.py - transform string literals to be Python 3 safe
3 # byteify-strings.py - transform string literals to be Python 3 safe
4 #
4 #
5 # Copyright 2015 Gregory Szorc <gregory.szorc@gmail.com>
5 # Copyright 2015 Gregory Szorc <gregory.szorc@gmail.com>
6 #
6 #
7 # This software may be used and distributed according to the terms of the
7 # This software may be used and distributed according to the terms of the
8 # GNU General Public License version 2 or any later version.
8 # GNU General Public License version 2 or any later version.
9
9
10 from __future__ import absolute_import
10 from __future__ import absolute_import
11
11
12 import argparse
12 import argparse
13 import contextlib
13 import contextlib
14 import errno
14 import errno
15 import os
15 import os
16 import sys
16 import sys
17 import tempfile
17 import tempfile
18 import token
18 import token
19 import tokenize
19 import tokenize
20
20
if True:
    def replacetokens(tokens, opts):
        """Transform a stream of tokens from raw to Python 3.

        Returns a generator of possibly rewritten tokens.

        The input token list may be mutated as part of processing. However,
        its changes do not necessarily match the output token stream.
        """
        # The helpers below close over ``tokens`` and the index ``pos`` of
        # the ``for pos, tok in enumerate(tokens)`` loop further down.
        def _isop(j, *o):
            """Assert that tokens[j] is an OP with one of the given values"""
            try:
                return tokens[j].type == token.OP and tokens[j].string in o
            except IndexError:
                return False

        def _findargnofcall(n):
            """Find arg n of a call expression (start at 0)

            Returns index of the first token of that argument, or None if
            there is not that many arguments.

            Assumes that token[pos + 1] is '('.
            """
            nested = 0
            for j in range(pos + 2, len(tokens)):
                if _isop(j, ')', ']', '}'):
                    # end of call, tuple, subscription or dict / set
                    nested -= 1
                    if nested < 0:
                        return None
                elif n == 0:
                    # this is the starting position of arg
                    return j
                elif _isop(j, '(', '[', '{'):
                    nested += 1
                elif _isop(j, ',') and nested == 0:
                    n -= 1
            return None

        def _ensureunicode(j):
            """Make sure the token at j is a unicode string

            This rewrites a string token to include the unicode literal
            prefix so the string transformer won't add the byte prefix.

            Ignores tokens that are not strings. Assumes bounds checking has
            already been done.
            """
            st = tokens[j]
            if st.type == token.STRING and st.string.startswith(("'", '"')):
                tokens[j] = st._replace(string='u%s' % st.string)

        for pos, tok in enumerate(tokens):
            # Convert most string literals to byte literals. String literals
            # in Python 2 are bytes. String literals in Python 3 are unicode.
            # Most strings in Mercurial are bytes and unicode strings are
            # rare. Rather than rewrite all string literals to use ``b''``
            # to indicate byte strings, we apply this token transformer to
            # insert the ``b`` prefix nearly everywhere.
            if tok.type == token.STRING:
                text = tok.string

                # Preserve docstrings as string literals. This is
                # inconsistent with regular unprefixed strings. However, the
                # "from __future__" parsing (which allows a module docstring
                # to exist before it) doesn't properly handle the docstring
                # if it is b''' prefixed, leading to a SyntaxError. We leave
                # all docstrings as unprefixed to avoid this. This means
                # Mercurial components touching docstrings need to handle
                # unicode, unfortunately.
                isdocstring = text[0:3] in ("'''", '"""')

                # If the first character isn't a quote, it is likely a
                # string prefixing character (such as 'b', 'u', or 'r');
                # leave those alone as well.
                isprefixed = text[0] not in ("'", '"')

                if isdocstring or isprefixed:
                    yield tok
                else:
                    # Plain string literal. Prefix to make a b'' string.
                    yield tok._replace(string='b%s' % text)
                continue

            # This looks like a function call.
            if tok.type == token.NAME and _isop(pos + 1, '('):
                fn = tok.string

                # *attr() builtins don't accept byte strings to 2nd argument.
                if (fn in ('getattr', 'setattr', 'hasattr', 'safehasattr')
                        and not _isop(pos - 1, '.')):
                    arg1idx = _findargnofcall(1)
                    if arg1idx is not None:
                        _ensureunicode(arg1idx)

                # .encode() and .decode() on str/bytes/unicode don't accept
                # byte strings on Python 3.
                elif fn in ('encode', 'decode') and _isop(pos - 1, '.'):
                    for argn in range(2):
                        argidx = _findargnofcall(argn)
                        if argidx is not None:
                            _ensureunicode(argidx)

                # iteritems()/itervalues() don't exist in the Python 3
                # world; rename them to items()/values(), but only when
                # explicitly requested via the dictiter option.
                elif opts['dictiter'] and fn in ('iteritems', 'itervalues'):
                    yield tok._replace(string=fn[4:])
                    continue

            # Emit unmodified token.
            yield tok
def process(fin, fout, opts):
    """Tokenize fin, byteify the token stream, and write it to fout.

    fin must be opened in binary mode; fout receives bytes.
    """
    raw = tokenize.tokenize(fin.readline)
    rewritten = replacetokens(list(raw), opts)
    fout.write(tokenize.untokenize(rewritten))
def tryunlink(fname):
    """Remove fname, silently ignoring a file that does not exist."""
    try:
        os.unlink(fname)
    except OSError as err:
        # A missing file is fine; anything else (permissions, ...) is a
        # real failure and must propagate.
        if err.errno == errno.ENOENT:
            return
        raise
150
150
@contextlib.contextmanager
def editinplace(fname):
    """Yield a temporary file that replaces fname when the block succeeds.

    The temporary file lives in fname's directory so the final rename is
    atomic on POSIX; on failure the original file is left untouched and the
    temporary file is cleaned up.
    """
    base = os.path.basename(fname)
    dirpath = os.path.dirname(fname)
    fp = tempfile.NamedTemporaryFile(prefix='.%s-' % base, suffix='~',
                                     dir=dirpath, delete=False)
    try:
        yield fp
        fp.close()
        if os.name == 'nt':
            # Windows cannot rename over an existing file.
            tryunlink(fname)
        os.rename(fp.name, fname)
    finally:
        fp.close()
        tryunlink(fp.name)
166
166
def main():
    """Command line entry point: byteify the given source files.

    With --inplace the files are rewritten atomically; otherwise the
    transformed source is written to stdout. --dictiter additionally
    renames iteritems()/itervalues() calls to items()/values().
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('-i', '--inplace', action='store_true', default=False,
                    help='edit files in place')
    # NOTE: the original had a stray trailing comma after this call,
    # creating a useless discarded one-element tuple; removed.
    ap.add_argument('--dictiter', action='store_true', default=False,
                    help='rewrite iteritems() and itervalues()')
    ap.add_argument('files', metavar='FILE', nargs='+', help='source file')
    args = ap.parse_args()
    # Options forwarded down to the token transformer.
    opts = {
        'dictiter': args.dictiter,
    }
    for fname in args.files:
        if args.inplace:
            with editinplace(fname) as fout:
                with open(fname, 'rb') as fin:
                    process(fin, fout, opts)
        else:
            with open(fname, 'rb') as fin:
                fout = sys.stdout.buffer
                process(fin, fout, opts)

if __name__ == '__main__':
    main()
General Comments 0
You need to be logged in to leave comments. Login now