byteify-strings: do not rewrite system string literals to u''...
Yuya Nishihara
r38408:1d68fd5f default
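In effect, string literals passed where Python 3 requires a native str (the attribute name given to the *attr() builtins, the arguments of .encode()/.decode()) are no longer rewritten to u'' literals; they are remembered in a sysstrtokens set and left unprefixed, while ordinary literals still gain the b'' prefix. A before/after sketch on a hypothetical input (obj and the literal values are illustrative only):

    # input source
    setattr(obj, 'name', 'value')

    # transformer output before this change: attribute name forced to u''
    setattr(obj, u'name', b'value')

    # transformer output after this change: attribute name left as native str
    setattr(obj, 'name', b'value')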
@@ -1,189 +1,191 @@
 #!/usr/bin/env python3
 #
 # byteify-strings.py - transform string literals to be Python 3 safe
 #
 # Copyright 2015 Gregory Szorc <gregory.szorc@gmail.com>
 #
 # This software may be used and distributed according to the terms of the
 # GNU General Public License version 2 or any later version.
 
 from __future__ import absolute_import
 
 import argparse
 import contextlib
 import errno
 import os
 import sys
 import tempfile
 import token
 import tokenize
 
 if True:
     def replacetokens(tokens, opts):
         """Transform a stream of tokens from raw to Python 3.
 
         Returns a generator of possibly rewritten tokens.
 
         The input token list may be mutated as part of processing. However,
         its changes do not necessarily match the output token stream.
         """
+        sysstrtokens = set()
+
         # The following utility functions access the tokens list and i index of
         # the for i, t enumerate(tokens) loop below
         def _isop(j, *o):
             """Assert that tokens[j] is an OP with one of the given values"""
             try:
                 return tokens[j].type == token.OP and tokens[j].string in o
             except IndexError:
                 return False
 
         def _findargnofcall(n):
             """Find arg n of a call expression (start at 0)
 
             Returns index of the first token of that argument, or None if
             there is not that many arguments.
 
             Assumes that token[i + 1] is '('.
 
             """
             nested = 0
             for j in range(i + 2, len(tokens)):
                 if _isop(j, ')', ']', '}'):
                     # end of call, tuple, subscription or dict / set
                     nested -= 1
                     if nested < 0:
                         return None
                 elif n == 0:
                     # this is the starting position of arg
                     return j
                 elif _isop(j, '(', '[', '{'):
                     nested += 1
                 elif _isop(j, ',') and nested == 0:
                     n -= 1
 
             return None
 
-        def _ensureunicode(j):
-            """Make sure the token at j is a unicode string
+        def _ensuresysstr(j):
+            """Make sure the token at j is a system string
 
-            This rewrites a string token to include the unicode literal prefix
-            so the string transformer won't add the byte prefix.
+            Remember the given token so the string transformer won't add
+            the byte prefix.
 
             Ignores tokens that are not strings. Assumes bounds checking has
             already been done.
 
             """
             st = tokens[j]
             if st.type == token.STRING and st.string.startswith(("'", '"')):
-                tokens[j] = st._replace(string='u%s' % st.string)
+                sysstrtokens.add(st)
 
         for i, t in enumerate(tokens):
             # Convert most string literals to byte literals. String literals
             # in Python 2 are bytes. String literals in Python 3 are unicode.
             # Most strings in Mercurial are bytes and unicode strings are rare.
             # Rather than rewrite all string literals to use ``b''`` to indicate
             # byte strings, we apply this token transformer to insert the ``b``
             # prefix nearly everywhere.
-            if t.type == token.STRING:
+            if t.type == token.STRING and t not in sysstrtokens:
                 s = t.string
 
                 # Preserve docstrings as string literals. This is inconsistent
                 # with regular unprefixed strings. However, the
                 # "from __future__" parsing (which allows a module docstring to
                 # exist before it) doesn't properly handle the docstring if it
                 # is b''' prefixed, leading to a SyntaxError. We leave all
                 # docstrings as unprefixed to avoid this. This means Mercurial
                 # components touching docstrings need to handle unicode,
                 # unfortunately.
                 if s[0:3] in ("'''", '"""'):
                     yield t
                     continue
 
                 # If the first character isn't a quote, it is likely a string
                 # prefixing character (such as 'b', 'u', or 'r'. Ignore.
                 if s[0] not in ("'", '"'):
                     yield t
                     continue
 
                 # String literal. Prefix to make a b'' string.
                 yield t._replace(string='b%s' % t.string)
                 continue
 
             # This looks like a function call.
             if t.type == token.NAME and _isop(i + 1, '('):
                 fn = t.string
 
                 # *attr() builtins don't accept byte strings to 2nd argument.
                 if (fn in ('getattr', 'setattr', 'hasattr', 'safehasattr') and
                         not _isop(i - 1, '.')):
                     arg1idx = _findargnofcall(1)
                     if arg1idx is not None:
-                        _ensureunicode(arg1idx)
+                        _ensuresysstr(arg1idx)
 
                 # .encode() and .decode() on str/bytes/unicode don't accept
                 # byte strings on Python 3.
                 elif fn in ('encode', 'decode') and _isop(i - 1, '.'):
                     for argn in range(2):
                         argidx = _findargnofcall(argn)
                         if argidx is not None:
-                            _ensureunicode(argidx)
+                            _ensuresysstr(argidx)
 
                 # It changes iteritems/values to items/values as they are not
                 # present in Python 3 world.
                 elif opts['dictiter'] and fn in ('iteritems', 'itervalues'):
                     yield t._replace(string=fn[4:])
                     continue
 
             # Emit unmodified token.
             yield t
 
 def process(fin, fout, opts):
     tokens = tokenize.tokenize(fin.readline)
     tokens = replacetokens(list(tokens), opts)
     fout.write(tokenize.untokenize(tokens))
 
 def tryunlink(fname):
     try:
         os.unlink(fname)
     except OSError as err:
         if err.errno != errno.ENOENT:
             raise
 
 @contextlib.contextmanager
 def editinplace(fname):
     n = os.path.basename(fname)
     d = os.path.dirname(fname)
     fp = tempfile.NamedTemporaryFile(prefix='.%s-' % n, suffix='~', dir=d,
                                      delete=False)
     try:
         yield fp
         fp.close()
         if os.name == 'nt':
             tryunlink(fname)
         os.rename(fp.name, fname)
     finally:
         fp.close()
         tryunlink(fp.name)
 
 def main():
     ap = argparse.ArgumentParser()
     ap.add_argument('-i', '--inplace', action='store_true', default=False,
                     help='edit files in place')
     ap.add_argument('--dictiter', action='store_true', default=False,
                     help='rewrite iteritems() and itervalues()'),
     ap.add_argument('files', metavar='FILE', nargs='+', help='source file')
     args = ap.parse_args()
     opts = {
         'dictiter': args.dictiter,
     }
     for fname in args.files:
         if args.inplace:
             with editinplace(fname) as fout:
                 with open(fname, 'rb') as fin:
                     process(fin, fout, opts)
         else:
             with open(fname, 'rb') as fin:
                 fout = sys.stdout.buffer
                 process(fin, fout, opts)
 
 if __name__ == '__main__':
     main()
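For reference, an invocation sketch based on the argument parser in main() above (module.py is a hypothetical file name):

    $ python3 byteify-strings.py module.py
    $ python3 byteify-strings.py -i --dictiter module.py

The first form writes the transformed source to stdout; the second edits the file in place and also rewrites iteritems()/itervalues() to items()/values().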
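And a minimal sketch of the process() pipeline driven by hand through the tokenize module, assuming replacetokens() from the listing above is in scope (the hyphenated script name is not directly importable, so this is illustrative only):

    import io
    import tokenize

    source = b"x = 'value'\nsetattr(obj, 'name', 'other')\n"
    # tokenize.tokenize() expects a readline callable over a bytes stream
    tokens = list(tokenize.tokenize(io.BytesIO(source).readline))
    # replacetokens() yields rewritten tokens; untokenize() uses the leading
    # ENCODING token to encode the result back to bytes
    result = tokenize.untokenize(replacetokens(tokens, {'dictiter': False}))
    print(result.decode('utf-8'))
    # expected:
    #   x = b'value'
    #   setattr(obj, 'name', b'other')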