Fixes for tokenize in Python 3.3
Thomas Kluyver
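The patch replaces two references to the undefined name `prefix` with the loop variable `_prefix` (lines 172 and 175 of the file) and turns the stray `+` at line 179 into the intended assignment, so the table of `u`/`rb` string prefixes added for Python 3.3 is actually populated instead of raising NameError when the module is imported on 3.3. For orientation, here is a minimal usage sketch that follows the module docstring in the diff below; the import name `patched_tokenize` is a placeholder, since this view does not show the file's path.

    import io
    import patched_tokenize as tokenize  # placeholder name; the real module path is not shown in this view

    source = b"x = 1  # comment\nif x:\n    y = u'text'\n"

    # tokenize() takes a readline callable that returns bytes (b'' at EOF) and
    # yields TokenInfo 5-tuples: type, string, (row, col) start, (row, col) end,
    # and the source line. The first token is always ENCODING.
    for tok in tokenize.tokenize(io.BytesIO(source).readline):
        print(tok)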
@@ -1,595 +1,595 @@
1 """Patched version of standard library tokenize, to deal with various bugs.
1 """Patched version of standard library tokenize, to deal with various bugs.
2
2
3 Based on Python 3.2 code.
3 Based on Python 3.2 code.
4
4
5 Patches:
5 Patches:
6
6
7 - Gareth Rees' patch for Python issue #12691 (untokenizing)
7 - Gareth Rees' patch for Python issue #12691 (untokenizing)
8 - Except we don't encode the output of untokenize
8 - Except we don't encode the output of untokenize
9 - Python 2 compatible syntax, so that it can be byte-compiled at installation
9 - Python 2 compatible syntax, so that it can be byte-compiled at installation
10 - Newlines in comments and blank lines should be either NL or NEWLINE, depending
10 - Newlines in comments and blank lines should be either NL or NEWLINE, depending
11 on whether they are in a multi-line statement. Filed as Python issue #17061.
11 on whether they are in a multi-line statement. Filed as Python issue #17061.
12 - Export generate_tokens & TokenError
12 - Export generate_tokens & TokenError
13 - u and rb literals are allowed under Python 3.3 and above.
13 - u and rb literals are allowed under Python 3.3 and above.
14
14
15 ------------------------------------------------------------------------------
15 ------------------------------------------------------------------------------
16 Tokenization help for Python programs.
16 Tokenization help for Python programs.
17
17
18 tokenize(readline) is a generator that breaks a stream of bytes into
18 tokenize(readline) is a generator that breaks a stream of bytes into
19 Python tokens. It decodes the bytes according to PEP-0263 for
19 Python tokens. It decodes the bytes according to PEP-0263 for
20 determining source file encoding.
20 determining source file encoding.
21
21
22 It accepts a readline-like method which is called repeatedly to get the
22 It accepts a readline-like method which is called repeatedly to get the
23 next line of input (or b"" for EOF). It generates 5-tuples with these
23 next line of input (or b"" for EOF). It generates 5-tuples with these
24 members:
24 members:
25
25
26 the token type (see token.py)
26 the token type (see token.py)
27 the token (a string)
27 the token (a string)
28 the starting (row, column) indices of the token (a 2-tuple of ints)
28 the starting (row, column) indices of the token (a 2-tuple of ints)
29 the ending (row, column) indices of the token (a 2-tuple of ints)
29 the ending (row, column) indices of the token (a 2-tuple of ints)
30 the original line (string)
30 the original line (string)
31
31
32 It is designed to match the working of the Python tokenizer exactly, except
32 It is designed to match the working of the Python tokenizer exactly, except
33 that it produces COMMENT tokens for comments and gives type OP for all
33 that it produces COMMENT tokens for comments and gives type OP for all
34 operators. Additionally, all token lists start with an ENCODING token
34 operators. Additionally, all token lists start with an ENCODING token
35 which tells you which encoding was used to decode the bytes stream.
35 which tells you which encoding was used to decode the bytes stream.
36 """
36 """
37 from __future__ import absolute_import
37 from __future__ import absolute_import
38
38
39 __author__ = 'Ka-Ping Yee <ping@lfw.org>'
39 __author__ = 'Ka-Ping Yee <ping@lfw.org>'
40 __credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
40 __credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
41 'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
41 'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
42 'Michael Foord')
42 'Michael Foord')
43 import builtins
43 import builtins
44 import re
44 import re
45 import sys
45 import sys
46 from token import *
46 from token import *
47 from codecs import lookup, BOM_UTF8
47 from codecs import lookup, BOM_UTF8
48 import collections
48 import collections
49 from io import TextIOWrapper
49 from io import TextIOWrapper
50 cookie_re = re.compile("coding[:=]\s*([-\w.]+)")
50 cookie_re = re.compile("coding[:=]\s*([-\w.]+)")
51
51
52 import token
52 import token
53 __all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
53 __all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
54 "NL", "untokenize", "ENCODING", "TokenInfo"]
54 "NL", "untokenize", "ENCODING", "TokenInfo"]
55 del token
55 del token
56
56
57 __all__ += ["generate_tokens", "TokenError"]
57 __all__ += ["generate_tokens", "TokenError"]
58
58
59 COMMENT = N_TOKENS
59 COMMENT = N_TOKENS
60 tok_name[COMMENT] = 'COMMENT'
60 tok_name[COMMENT] = 'COMMENT'
61 NL = N_TOKENS + 1
61 NL = N_TOKENS + 1
62 tok_name[NL] = 'NL'
62 tok_name[NL] = 'NL'
63 ENCODING = N_TOKENS + 2
63 ENCODING = N_TOKENS + 2
64 tok_name[ENCODING] = 'ENCODING'
64 tok_name[ENCODING] = 'ENCODING'
65 N_TOKENS += 3
65 N_TOKENS += 3
66
66
67 class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
67 class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
68 def __repr__(self):
68 def __repr__(self):
69 annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
69 annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
70 return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
70 return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
71 self._replace(type=annotated_type))
71 self._replace(type=annotated_type))
72
72
73 def group(*choices): return '(' + '|'.join(choices) + ')'
73 def group(*choices): return '(' + '|'.join(choices) + ')'
74 def any(*choices): return group(*choices) + '*'
74 def any(*choices): return group(*choices) + '*'
75 def maybe(*choices): return group(*choices) + '?'
75 def maybe(*choices): return group(*choices) + '?'
76
76
77 # Note: we use unicode matching for names ("\w") but ascii matching for
77 # Note: we use unicode matching for names ("\w") but ascii matching for
78 # number literals.
78 # number literals.
79 Whitespace = r'[ \f\t]*'
79 Whitespace = r'[ \f\t]*'
80 Comment = r'#[^\r\n]*'
80 Comment = r'#[^\r\n]*'
81 Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
81 Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
82 Name = r'\w+'
82 Name = r'\w+'
83
83
84 Hexnumber = r'0[xX][0-9a-fA-F]+'
84 Hexnumber = r'0[xX][0-9a-fA-F]+'
85 Binnumber = r'0[bB][01]+'
85 Binnumber = r'0[bB][01]+'
86 Octnumber = r'0[oO][0-7]+'
86 Octnumber = r'0[oO][0-7]+'
87 Decnumber = r'(?:0+|[1-9][0-9]*)'
87 Decnumber = r'(?:0+|[1-9][0-9]*)'
88 Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
88 Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
89 Exponent = r'[eE][-+]?[0-9]+'
89 Exponent = r'[eE][-+]?[0-9]+'
90 Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
90 Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
91 Expfloat = r'[0-9]+' + Exponent
91 Expfloat = r'[0-9]+' + Exponent
92 Floatnumber = group(Pointfloat, Expfloat)
92 Floatnumber = group(Pointfloat, Expfloat)
93 Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
93 Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
94 Number = group(Imagnumber, Floatnumber, Intnumber)
94 Number = group(Imagnumber, Floatnumber, Intnumber)
95
95
96 if sys.version_info.minor >= 3:
96 if sys.version_info.minor >= 3:
97 StringPrefix = r'(?:[bB][rR]?|[rR][bB]?|[uU])?'
97 StringPrefix = r'(?:[bB][rR]?|[rR][bB]?|[uU])?'
98 else:
98 else:
99 StringPrefix = r'(?:[bB]?[rR]?)?'
99 StringPrefix = r'(?:[bB]?[rR]?)?'
100
100
101 # Tail end of ' string.
101 # Tail end of ' string.
102 Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
102 Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
103 # Tail end of " string.
103 # Tail end of " string.
104 Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
104 Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
105 # Tail end of ''' string.
105 # Tail end of ''' string.
106 Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
106 Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
107 # Tail end of """ string.
107 # Tail end of """ string.
108 Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
108 Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
109 Triple = group(StringPrefix + "'''", StringPrefix + '"""')
109 Triple = group(StringPrefix + "'''", StringPrefix + '"""')
110 # Single-line ' or " string.
110 # Single-line ' or " string.
111 String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
111 String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
112 StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
112 StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
113
113
114 # Because of leftmost-then-longest match semantics, be sure to put the
114 # Because of leftmost-then-longest match semantics, be sure to put the
115 # longest operators first (e.g., if = came before ==, == would get
115 # longest operators first (e.g., if = came before ==, == would get
116 # recognized as two instances of =).
116 # recognized as two instances of =).
117 Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
117 Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
118 r"//=?", r"->",
118 r"//=?", r"->",
119 r"[+\-*/%&|^=<>]=?",
119 r"[+\-*/%&|^=<>]=?",
120 r"~")
120 r"~")
121
121
122 Bracket = '[][(){}]'
122 Bracket = '[][(){}]'
123 Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
123 Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
124 Funny = group(Operator, Bracket, Special)
124 Funny = group(Operator, Bracket, Special)
125
125
126 PlainToken = group(Number, Funny, String, Name)
126 PlainToken = group(Number, Funny, String, Name)
127 Token = Ignore + PlainToken
127 Token = Ignore + PlainToken
128
128
129 # First (or only) line of ' or " string.
129 # First (or only) line of ' or " string.
130 ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
130 ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
131 group("'", r'\\\r?\n'),
131 group("'", r'\\\r?\n'),
132 StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
132 StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
133 group('"', r'\\\r?\n'))
133 group('"', r'\\\r?\n'))
134 PseudoExtras = group(r'\\\r?\n', Comment, Triple)
134 PseudoExtras = group(r'\\\r?\n', Comment, Triple)
135 PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
135 PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
136
136
137 def _compile(expr):
137 def _compile(expr):
138 return re.compile(expr, re.UNICODE)
138 return re.compile(expr, re.UNICODE)
139
139
140 tokenprog, pseudoprog, single3prog, double3prog = map(
140 tokenprog, pseudoprog, single3prog, double3prog = map(
141 _compile, (Token, PseudoToken, Single3, Double3))
141 _compile, (Token, PseudoToken, Single3, Double3))
142 endprogs = {"'": _compile(Single), '"': _compile(Double),
142 endprogs = {"'": _compile(Single), '"': _compile(Double),
143 "'''": single3prog, '"""': double3prog,
143 "'''": single3prog, '"""': double3prog,
144 "r'''": single3prog, 'r"""': double3prog,
144 "r'''": single3prog, 'r"""': double3prog,
145 "b'''": single3prog, 'b"""': double3prog,
145 "b'''": single3prog, 'b"""': double3prog,
146 "R'''": single3prog, 'R"""': double3prog,
146 "R'''": single3prog, 'R"""': double3prog,
147 "B'''": single3prog, 'B"""': double3prog,
147 "B'''": single3prog, 'B"""': double3prog,
148 "br'''": single3prog, 'br"""': double3prog,
148 "br'''": single3prog, 'br"""': double3prog,
149 "bR'''": single3prog, 'bR"""': double3prog,
149 "bR'''": single3prog, 'bR"""': double3prog,
150 "Br'''": single3prog, 'Br"""': double3prog,
150 "Br'''": single3prog, 'Br"""': double3prog,
151 "BR'''": single3prog, 'BR"""': double3prog,
151 "BR'''": single3prog, 'BR"""': double3prog,
152 'r': None, 'R': None, 'b': None, 'B': None}
152 'r': None, 'R': None, 'b': None, 'B': None}
153
153
154 triple_quoted = {}
154 triple_quoted = {}
155 for t in ("'''", '"""',
155 for t in ("'''", '"""',
156 "r'''", 'r"""', "R'''", 'R"""',
156 "r'''", 'r"""', "R'''", 'R"""',
157 "b'''", 'b"""', "B'''", 'B"""',
157 "b'''", 'b"""', "B'''", 'B"""',
158 "br'''", 'br"""', "Br'''", 'Br"""',
158 "br'''", 'br"""', "Br'''", 'Br"""',
159 "bR'''", 'bR"""', "BR'''", 'BR"""'):
159 "bR'''", 'bR"""', "BR'''", 'BR"""'):
160 triple_quoted[t] = t
160 triple_quoted[t] = t
161 single_quoted = {}
161 single_quoted = {}
162 for t in ("'", '"',
162 for t in ("'", '"',
163 "r'", 'r"', "R'", 'R"',
163 "r'", 'r"', "R'", 'R"',
164 "b'", 'b"', "B'", 'B"',
164 "b'", 'b"', "B'", 'B"',
165 "br'", 'br"', "Br'", 'Br"',
165 "br'", 'br"', "Br'", 'Br"',
166 "bR'", 'bR"', "BR'", 'BR"' ):
166 "bR'", 'bR"', "BR'", 'BR"' ):
167 single_quoted[t] = t
167 single_quoted[t] = t
168
168
169 if sys.version_info.minor >= 3:
169 if sys.version_info.minor >= 3:
170 # Python 3.3
170 # Python 3.3
171 for _prefix in ['rb', 'rB', 'Rb', 'RB', 'u', 'U']:
171 for _prefix in ['rb', 'rB', 'Rb', 'RB', 'u', 'U']:
172 _t2 = prefix+'"""'
172 _t2 = _prefix+'"""'
173 endprogs[_t2] = double3prog
173 endprogs[_t2] = double3prog
174 triple_quoted[_t2] = _t2
174 triple_quoted[_t2] = _t2
175 _t1 = prefix + "'''"
175 _t1 = _prefix + "'''"
176 endprogs[_t1] = single3prog
176 endprogs[_t1] = single3prog
177 triple_quoted[_t1] = _t1
177 triple_quoted[_t1] = _t1
178 single_quoted[_prefix+'"'] = _prefix+'"'
178 single_quoted[_prefix+'"'] = _prefix+'"'
179 single_quoted[_prefix+"'"] + _prefix+"'"
179 single_quoted[_prefix+"'"] = _prefix+"'"
180 del _prefix, _t2, _t1
180 del _prefix, _t2, _t1
181 endprogs['u'] = None
181 endprogs['u'] = None
182 endprogs['U'] = None
182 endprogs['U'] = None
183
183
184 del _compile
184 del _compile
185
185
186 tabsize = 8
186 tabsize = 8
187
187
188 class TokenError(Exception): pass
188 class TokenError(Exception): pass
189
189
190 class StopTokenizing(Exception): pass
190 class StopTokenizing(Exception): pass
191
191
192
192
193 class Untokenizer:
193 class Untokenizer:
194
194
195 def __init__(self):
195 def __init__(self):
196 self.tokens = []
196 self.tokens = []
197 self.prev_row = 1
197 self.prev_row = 1
198 self.prev_col = 0
198 self.prev_col = 0
199 self.encoding = 'utf-8'
199 self.encoding = 'utf-8'
200
200
201 def add_whitespace(self, tok_type, start):
201 def add_whitespace(self, tok_type, start):
202 row, col = start
202 row, col = start
203 assert row >= self.prev_row
203 assert row >= self.prev_row
204 col_offset = col - self.prev_col
204 col_offset = col - self.prev_col
205 if col_offset > 0:
205 if col_offset > 0:
206 self.tokens.append(" " * col_offset)
206 self.tokens.append(" " * col_offset)
207 elif row > self.prev_row and tok_type not in (NEWLINE, NL, ENDMARKER):
207 elif row > self.prev_row and tok_type not in (NEWLINE, NL, ENDMARKER):
208 # Line was backslash-continued.
208 # Line was backslash-continued.
209 self.tokens.append(" ")
209 self.tokens.append(" ")
210
210
211 def untokenize(self, tokens):
211 def untokenize(self, tokens):
212 iterable = iter(tokens)
212 iterable = iter(tokens)
213 for t in iterable:
213 for t in iterable:
214 if len(t) == 2:
214 if len(t) == 2:
215 self.compat(t, iterable)
215 self.compat(t, iterable)
216 break
216 break
217 tok_type, token, start, end = t[:4]
217 tok_type, token, start, end = t[:4]
218 if tok_type == ENCODING:
218 if tok_type == ENCODING:
219 self.encoding = token
219 self.encoding = token
220 continue
220 continue
221 self.add_whitespace(tok_type, start)
221 self.add_whitespace(tok_type, start)
222 self.tokens.append(token)
222 self.tokens.append(token)
223 self.prev_row, self.prev_col = end
223 self.prev_row, self.prev_col = end
224 if tok_type in (NEWLINE, NL):
224 if tok_type in (NEWLINE, NL):
225 self.prev_row += 1
225 self.prev_row += 1
226 self.prev_col = 0
226 self.prev_col = 0
227 return "".join(self.tokens)
227 return "".join(self.tokens)
228
228
229 def compat(self, token, iterable):
229 def compat(self, token, iterable):
230 # This import is here to avoid problems when the itertools
230 # This import is here to avoid problems when the itertools
231 # module is not built yet and tokenize is imported.
231 # module is not built yet and tokenize is imported.
232 from itertools import chain
232 from itertools import chain
233 startline = False
233 startline = False
234 prevstring = False
234 prevstring = False
235 indents = []
235 indents = []
236 toks_append = self.tokens.append
236 toks_append = self.tokens.append
237
237
238 for tok in chain([token], iterable):
238 for tok in chain([token], iterable):
239 toknum, tokval = tok[:2]
239 toknum, tokval = tok[:2]
240 if toknum == ENCODING:
240 if toknum == ENCODING:
241 self.encoding = tokval
241 self.encoding = tokval
242 continue
242 continue
243
243
244 if toknum in (NAME, NUMBER):
244 if toknum in (NAME, NUMBER):
245 tokval += ' '
245 tokval += ' '
246
246
247 # Insert a space between two consecutive strings
247 # Insert a space between two consecutive strings
248 if toknum == STRING:
248 if toknum == STRING:
249 if prevstring:
249 if prevstring:
250 tokval = ' ' + tokval
250 tokval = ' ' + tokval
251 prevstring = True
251 prevstring = True
252 else:
252 else:
253 prevstring = False
253 prevstring = False
254
254
255 if toknum == INDENT:
255 if toknum == INDENT:
256 indents.append(tokval)
256 indents.append(tokval)
257 continue
257 continue
258 elif toknum == DEDENT:
258 elif toknum == DEDENT:
259 indents.pop()
259 indents.pop()
260 continue
260 continue
261 elif toknum in (NEWLINE, NL):
261 elif toknum in (NEWLINE, NL):
262 startline = True
262 startline = True
263 elif startline and indents:
263 elif startline and indents:
264 toks_append(indents[-1])
264 toks_append(indents[-1])
265 startline = False
265 startline = False
266 toks_append(tokval)
266 toks_append(tokval)
267
267
268
268
269 def untokenize(tokens):
269 def untokenize(tokens):
270 """
270 """
271 Convert ``tokens`` (an iterable) back into Python source code. Return
271 Convert ``tokens`` (an iterable) back into Python source code. Return
272 a bytes object, encoded using the encoding specified by the last
272 a bytes object, encoded using the encoding specified by the last
273 ENCODING token in ``tokens``, or UTF-8 if no ENCODING token is found.
273 ENCODING token in ``tokens``, or UTF-8 if no ENCODING token is found.
274
274
275 The result is guaranteed to tokenize back to match the input so that
275 The result is guaranteed to tokenize back to match the input so that
276 the conversion is lossless and round-trips are assured. The
276 the conversion is lossless and round-trips are assured. The
277 guarantee applies only to the token type and token string as the
277 guarantee applies only to the token type and token string as the
278 spacing between tokens (column positions) may change.
278 spacing between tokens (column positions) may change.
279
279
280 :func:`untokenize` has two modes. If the input tokens are sequences
280 :func:`untokenize` has two modes. If the input tokens are sequences
281 of length 2 (``type``, ``string``) then spaces are added as necessary to
281 of length 2 (``type``, ``string``) then spaces are added as necessary to
282 preserve the round-trip property.
282 preserve the round-trip property.
283
283
284 If the input tokens are sequences of length 4 or more (``type``,
284 If the input tokens are sequences of length 4 or more (``type``,
285 ``string``, ``start``, ``end``), as returned by :func:`tokenize`, then
285 ``string``, ``start``, ``end``), as returned by :func:`tokenize`, then
286 spaces are added so that each token appears in the result at the
286 spaces are added so that each token appears in the result at the
287 position indicated by ``start`` and ``end``, if possible.
287 position indicated by ``start`` and ``end``, if possible.
288 """
288 """
289 return Untokenizer().untokenize(tokens)
289 return Untokenizer().untokenize(tokens)
290
290
291
291
292 def _get_normal_name(orig_enc):
292 def _get_normal_name(orig_enc):
293 """Imitates get_normal_name in tokenizer.c."""
293 """Imitates get_normal_name in tokenizer.c."""
294 # Only care about the first 12 characters.
294 # Only care about the first 12 characters.
295 enc = orig_enc[:12].lower().replace("_", "-")
295 enc = orig_enc[:12].lower().replace("_", "-")
296 if enc == "utf-8" or enc.startswith("utf-8-"):
296 if enc == "utf-8" or enc.startswith("utf-8-"):
297 return "utf-8"
297 return "utf-8"
298 if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
298 if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
299 enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
299 enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
300 return "iso-8859-1"
300 return "iso-8859-1"
301 return orig_enc
301 return orig_enc
302
302
303 def detect_encoding(readline):
303 def detect_encoding(readline):
304 """
304 """
305 The detect_encoding() function is used to detect the encoding that should
305 The detect_encoding() function is used to detect the encoding that should
306 be used to decode a Python source file. It requires one argment, readline,
306 be used to decode a Python source file. It requires one argment, readline,
307 in the same way as the tokenize() generator.
307 in the same way as the tokenize() generator.
308
308
309 It will call readline a maximum of twice, and return the encoding used
309 It will call readline a maximum of twice, and return the encoding used
310 (as a string) and a list of any lines (left as bytes) it has read in.
310 (as a string) and a list of any lines (left as bytes) it has read in.
311
311
312 It detects the encoding from the presence of a utf-8 bom or an encoding
312 It detects the encoding from the presence of a utf-8 bom or an encoding
313 cookie as specified in pep-0263. If both a bom and a cookie are present,
313 cookie as specified in pep-0263. If both a bom and a cookie are present,
314 but disagree, a SyntaxError will be raised. If the encoding cookie is an
314 but disagree, a SyntaxError will be raised. If the encoding cookie is an
315 invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,
315 invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,
316 'utf-8-sig' is returned.
316 'utf-8-sig' is returned.
317
317
318 If no encoding is specified, then the default of 'utf-8' will be returned.
318 If no encoding is specified, then the default of 'utf-8' will be returned.
319 """
319 """
320 bom_found = False
320 bom_found = False
321 encoding = None
321 encoding = None
322 default = 'utf-8'
322 default = 'utf-8'
323 def read_or_stop():
323 def read_or_stop():
324 try:
324 try:
325 return readline()
325 return readline()
326 except StopIteration:
326 except StopIteration:
327 return b''
327 return b''
328
328
329 def find_cookie(line):
329 def find_cookie(line):
330 try:
330 try:
331 # Decode as UTF-8. Either the line is an encoding declaration,
331 # Decode as UTF-8. Either the line is an encoding declaration,
332 # in which case it should be pure ASCII, or it must be UTF-8
332 # in which case it should be pure ASCII, or it must be UTF-8
333 # per default encoding.
333 # per default encoding.
334 line_string = line.decode('utf-8')
334 line_string = line.decode('utf-8')
335 except UnicodeDecodeError:
335 except UnicodeDecodeError:
336 raise SyntaxError("invalid or missing encoding declaration")
336 raise SyntaxError("invalid or missing encoding declaration")
337
337
338 matches = cookie_re.findall(line_string)
338 matches = cookie_re.findall(line_string)
339 if not matches:
339 if not matches:
340 return None
340 return None
341 encoding = _get_normal_name(matches[0])
341 encoding = _get_normal_name(matches[0])
342 try:
342 try:
343 codec = lookup(encoding)
343 codec = lookup(encoding)
344 except LookupError:
344 except LookupError:
345 # This behaviour mimics the Python interpreter
345 # This behaviour mimics the Python interpreter
346 raise SyntaxError("unknown encoding: " + encoding)
346 raise SyntaxError("unknown encoding: " + encoding)
347
347
348 if bom_found:
348 if bom_found:
349 if encoding != 'utf-8':
349 if encoding != 'utf-8':
350 # This behaviour mimics the Python interpreter
350 # This behaviour mimics the Python interpreter
351 raise SyntaxError('encoding problem: utf-8')
351 raise SyntaxError('encoding problem: utf-8')
352 encoding += '-sig'
352 encoding += '-sig'
353 return encoding
353 return encoding
354
354
355 first = read_or_stop()
355 first = read_or_stop()
356 if first.startswith(BOM_UTF8):
356 if first.startswith(BOM_UTF8):
357 bom_found = True
357 bom_found = True
358 first = first[3:]
358 first = first[3:]
359 default = 'utf-8-sig'
359 default = 'utf-8-sig'
360 if not first:
360 if not first:
361 return default, []
361 return default, []
362
362
363 encoding = find_cookie(first)
363 encoding = find_cookie(first)
364 if encoding:
364 if encoding:
365 return encoding, [first]
365 return encoding, [first]
366
366
367 second = read_or_stop()
367 second = read_or_stop()
368 if not second:
368 if not second:
369 return default, [first]
369 return default, [first]
370
370
371 encoding = find_cookie(second)
371 encoding = find_cookie(second)
372 if encoding:
372 if encoding:
373 return encoding, [first, second]
373 return encoding, [first, second]
374
374
375 return default, [first, second]
375 return default, [first, second]
376
376
377
377
378 def open(filename):
378 def open(filename):
379 """Open a file in read only mode using the encoding detected by
379 """Open a file in read only mode using the encoding detected by
380 detect_encoding().
380 detect_encoding().
381 """
381 """
382 buffer = builtins.open(filename, 'rb')
382 buffer = builtins.open(filename, 'rb')
383 encoding, lines = detect_encoding(buffer.readline)
383 encoding, lines = detect_encoding(buffer.readline)
384 buffer.seek(0)
384 buffer.seek(0)
385 text = TextIOWrapper(buffer, encoding, line_buffering=True)
385 text = TextIOWrapper(buffer, encoding, line_buffering=True)
386 text.mode = 'r'
386 text.mode = 'r'
387 return text
387 return text
388
388
389
389
390 def tokenize(readline):
390 def tokenize(readline):
391 """
391 """
392 The tokenize() generator requires one argment, readline, which
392 The tokenize() generator requires one argment, readline, which
393 must be a callable object which provides the same interface as the
393 must be a callable object which provides the same interface as the
394 readline() method of built-in file objects. Each call to the function
394 readline() method of built-in file objects. Each call to the function
395 should return one line of input as bytes. Alternately, readline
395 should return one line of input as bytes. Alternately, readline
396 can be a callable function terminating with StopIteration:
396 can be a callable function terminating with StopIteration:
397 readline = open(myfile, 'rb').__next__ # Example of alternate readline
397 readline = open(myfile, 'rb').__next__ # Example of alternate readline
398
398
399 The generator produces 5-tuples with these members: the token type; the
399 The generator produces 5-tuples with these members: the token type; the
400 token string; a 2-tuple (srow, scol) of ints specifying the row and
400 token string; a 2-tuple (srow, scol) of ints specifying the row and
401 column where the token begins in the source; a 2-tuple (erow, ecol) of
401 column where the token begins in the source; a 2-tuple (erow, ecol) of
402 ints specifying the row and column where the token ends in the source;
402 ints specifying the row and column where the token ends in the source;
403 and the line on which the token was found. The line passed is the
403 and the line on which the token was found. The line passed is the
404 logical line; continuation lines are included.
404 logical line; continuation lines are included.
405
405
406 The first token sequence will always be an ENCODING token
406 The first token sequence will always be an ENCODING token
407 which tells you which encoding was used to decode the bytes stream.
407 which tells you which encoding was used to decode the bytes stream.
408 """
408 """
409 # This import is here to avoid problems when the itertools module is not
409 # This import is here to avoid problems when the itertools module is not
410 # built yet and tokenize is imported.
410 # built yet and tokenize is imported.
411 from itertools import chain, repeat
411 from itertools import chain, repeat
412 encoding, consumed = detect_encoding(readline)
412 encoding, consumed = detect_encoding(readline)
413 rl_gen = iter(readline, b"")
413 rl_gen = iter(readline, b"")
414 empty = repeat(b"")
414 empty = repeat(b"")
415 return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding)
415 return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding)
416
416
417
417
418 def _tokenize(readline, encoding):
418 def _tokenize(readline, encoding):
419 lnum = parenlev = continued = 0
419 lnum = parenlev = continued = 0
420 numchars = '0123456789'
420 numchars = '0123456789'
421 contstr, needcont = '', 0
421 contstr, needcont = '', 0
422 contline = None
422 contline = None
423 indents = [0]
423 indents = [0]
424
424
425 if encoding is not None:
425 if encoding is not None:
426 if encoding == "utf-8-sig":
426 if encoding == "utf-8-sig":
427 # BOM will already have been stripped.
427 # BOM will already have been stripped.
428 encoding = "utf-8"
428 encoding = "utf-8"
429 yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
429 yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
430 while True: # loop over lines in stream
430 while True: # loop over lines in stream
431 try:
431 try:
432 line = readline()
432 line = readline()
433 except StopIteration:
433 except StopIteration:
434 line = b''
434 line = b''
435
435
436 if encoding is not None:
436 if encoding is not None:
437 line = line.decode(encoding)
437 line = line.decode(encoding)
438 lnum += 1
438 lnum += 1
439 pos, max = 0, len(line)
439 pos, max = 0, len(line)
440
440
441 if contstr: # continued string
441 if contstr: # continued string
442 if not line:
442 if not line:
443 raise TokenError("EOF in multi-line string", strstart)
443 raise TokenError("EOF in multi-line string", strstart)
444 endmatch = endprog.match(line)
444 endmatch = endprog.match(line)
445 if endmatch:
445 if endmatch:
446 pos = end = endmatch.end(0)
446 pos = end = endmatch.end(0)
447 yield TokenInfo(STRING, contstr + line[:end],
447 yield TokenInfo(STRING, contstr + line[:end],
448 strstart, (lnum, end), contline + line)
448 strstart, (lnum, end), contline + line)
449 contstr, needcont = '', 0
449 contstr, needcont = '', 0
450 contline = None
450 contline = None
451 elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
451 elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
452 yield TokenInfo(ERRORTOKEN, contstr + line,
452 yield TokenInfo(ERRORTOKEN, contstr + line,
453 strstart, (lnum, len(line)), contline)
453 strstart, (lnum, len(line)), contline)
454 contstr = ''
454 contstr = ''
455 contline = None
455 contline = None
456 continue
456 continue
457 else:
457 else:
458 contstr = contstr + line
458 contstr = contstr + line
459 contline = contline + line
459 contline = contline + line
460 continue
460 continue
461
461
462 elif parenlev == 0 and not continued: # new statement
462 elif parenlev == 0 and not continued: # new statement
463 if not line: break
463 if not line: break
464 column = 0
464 column = 0
465 while pos < max: # measure leading whitespace
465 while pos < max: # measure leading whitespace
466 if line[pos] == ' ':
466 if line[pos] == ' ':
467 column += 1
467 column += 1
468 elif line[pos] == '\t':
468 elif line[pos] == '\t':
469 column = (column//tabsize + 1)*tabsize
469 column = (column//tabsize + 1)*tabsize
470 elif line[pos] == '\f':
470 elif line[pos] == '\f':
471 column = 0
471 column = 0
472 else:
472 else:
473 break
473 break
474 pos += 1
474 pos += 1
475 if pos == max:
475 if pos == max:
476 break
476 break
477
477
478 if line[pos] in '#\r\n': # skip comments or blank lines
478 if line[pos] in '#\r\n': # skip comments or blank lines
479 if line[pos] == '#':
479 if line[pos] == '#':
480 comment_token = line[pos:].rstrip('\r\n')
480 comment_token = line[pos:].rstrip('\r\n')
481 nl_pos = pos + len(comment_token)
481 nl_pos = pos + len(comment_token)
482 yield TokenInfo(COMMENT, comment_token,
482 yield TokenInfo(COMMENT, comment_token,
483 (lnum, pos), (lnum, pos + len(comment_token)), line)
483 (lnum, pos), (lnum, pos + len(comment_token)), line)
484 yield TokenInfo(NEWLINE, line[nl_pos:],
484 yield TokenInfo(NEWLINE, line[nl_pos:],
485 (lnum, nl_pos), (lnum, len(line)), line)
485 (lnum, nl_pos), (lnum, len(line)), line)
486 else:
486 else:
487 yield TokenInfo(NEWLINE, line[pos:],
487 yield TokenInfo(NEWLINE, line[pos:],
488 (lnum, pos), (lnum, len(line)), line)
488 (lnum, pos), (lnum, len(line)), line)
489 continue
489 continue
490
490
491 if column > indents[-1]: # count indents or dedents
491 if column > indents[-1]: # count indents or dedents
492 indents.append(column)
492 indents.append(column)
493 yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
493 yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
494 while column < indents[-1]:
494 while column < indents[-1]:
495 if column not in indents:
495 if column not in indents:
496 raise IndentationError(
496 raise IndentationError(
497 "unindent does not match any outer indentation level",
497 "unindent does not match any outer indentation level",
498 ("<tokenize>", lnum, pos, line))
498 ("<tokenize>", lnum, pos, line))
499 indents = indents[:-1]
499 indents = indents[:-1]
500 yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)
500 yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)
501
501
502 else: # continued statement
502 else: # continued statement
503 if not line:
503 if not line:
504 raise TokenError("EOF in multi-line statement", (lnum, 0))
504 raise TokenError("EOF in multi-line statement", (lnum, 0))
505 continued = 0
505 continued = 0
506
506
507 while pos < max:
507 while pos < max:
508 pseudomatch = pseudoprog.match(line, pos)
508 pseudomatch = pseudoprog.match(line, pos)
509 if pseudomatch: # scan for tokens
509 if pseudomatch: # scan for tokens
510 start, end = pseudomatch.span(1)
510 start, end = pseudomatch.span(1)
511 spos, epos, pos = (lnum, start), (lnum, end), end
511 spos, epos, pos = (lnum, start), (lnum, end), end
512 token, initial = line[start:end], line[start]
512 token, initial = line[start:end], line[start]
513
513
514 if (initial in numchars or # ordinary number
514 if (initial in numchars or # ordinary number
515 (initial == '.' and token != '.' and token != '...')):
515 (initial == '.' and token != '.' and token != '...')):
516 yield TokenInfo(NUMBER, token, spos, epos, line)
516 yield TokenInfo(NUMBER, token, spos, epos, line)
517 elif initial in '\r\n':
517 elif initial in '\r\n':
518 yield TokenInfo(NL if parenlev > 0 else NEWLINE,
518 yield TokenInfo(NL if parenlev > 0 else NEWLINE,
519 token, spos, epos, line)
519 token, spos, epos, line)
520 elif initial == '#':
520 elif initial == '#':
521 assert not token.endswith("\n")
521 assert not token.endswith("\n")
522 yield TokenInfo(COMMENT, token, spos, epos, line)
522 yield TokenInfo(COMMENT, token, spos, epos, line)
523 elif token in triple_quoted:
523 elif token in triple_quoted:
524 endprog = endprogs[token]
524 endprog = endprogs[token]
525 endmatch = endprog.match(line, pos)
525 endmatch = endprog.match(line, pos)
526 if endmatch: # all on one line
526 if endmatch: # all on one line
527 pos = endmatch.end(0)
527 pos = endmatch.end(0)
528 token = line[start:pos]
528 token = line[start:pos]
529 yield TokenInfo(STRING, token, spos, (lnum, pos), line)
529 yield TokenInfo(STRING, token, spos, (lnum, pos), line)
530 else:
530 else:
531 strstart = (lnum, start) # multiple lines
531 strstart = (lnum, start) # multiple lines
532 contstr = line[start:]
532 contstr = line[start:]
533 contline = line
533 contline = line
534 break
534 break
535 elif initial in single_quoted or \
535 elif initial in single_quoted or \
536 token[:2] in single_quoted or \
536 token[:2] in single_quoted or \
537 token[:3] in single_quoted:
537 token[:3] in single_quoted:
538 if token[-1] == '\n': # continued string
538 if token[-1] == '\n': # continued string
539 strstart = (lnum, start)
539 strstart = (lnum, start)
540 endprog = (endprogs[initial] or endprogs[token[1]] or
540 endprog = (endprogs[initial] or endprogs[token[1]] or
541 endprogs[token[2]])
541 endprogs[token[2]])
542 contstr, needcont = line[start:], 1
542 contstr, needcont = line[start:], 1
543 contline = line
543 contline = line
544 break
544 break
545 else: # ordinary string
545 else: # ordinary string
546 yield TokenInfo(STRING, token, spos, epos, line)
546 yield TokenInfo(STRING, token, spos, epos, line)
547 elif initial.isidentifier(): # ordinary name
547 elif initial.isidentifier(): # ordinary name
548 yield TokenInfo(NAME, token, spos, epos, line)
548 yield TokenInfo(NAME, token, spos, epos, line)
549 elif initial == '\\': # continued stmt
549 elif initial == '\\': # continued stmt
550 continued = 1
550 continued = 1
551 else:
551 else:
552 if initial in '([{':
552 if initial in '([{':
553 parenlev += 1
553 parenlev += 1
554 elif initial in ')]}':
554 elif initial in ')]}':
555 parenlev -= 1
555 parenlev -= 1
556 yield TokenInfo(OP, token, spos, epos, line)
556 yield TokenInfo(OP, token, spos, epos, line)
557 else:
557 else:
558 yield TokenInfo(ERRORTOKEN, line[pos],
558 yield TokenInfo(ERRORTOKEN, line[pos],
559 (lnum, pos), (lnum, pos+1), line)
559 (lnum, pos), (lnum, pos+1), line)
560 pos += 1
560 pos += 1
561
561
562 for indent in indents[1:]: # pop remaining indent levels
562 for indent in indents[1:]: # pop remaining indent levels
563 yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
563 yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
564 yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
564 yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
565
565
566
566
567 # An undocumented, backwards compatible, API for all the places in the standard
567 # An undocumented, backwards compatible, API for all the places in the standard
568 # library that expect to be able to use tokenize with strings
568 # library that expect to be able to use tokenize with strings
569 def generate_tokens(readline):
569 def generate_tokens(readline):
570 return _tokenize(readline, None)
570 return _tokenize(readline, None)
571
571
572 if __name__ == "__main__":
572 if __name__ == "__main__":
573 # Quick sanity check
573 # Quick sanity check
574 s = b'''def parseline(self, line):
574 s = b'''def parseline(self, line):
575 """Parse the line into a command name and a string containing
575 """Parse the line into a command name and a string containing
576 the arguments. Returns a tuple containing (command, args, line).
576 the arguments. Returns a tuple containing (command, args, line).
577 'command' and 'args' may be None if the line couldn't be parsed.
577 'command' and 'args' may be None if the line couldn't be parsed.
578 """
578 """
579 line = line.strip()
579 line = line.strip()
580 if not line:
580 if not line:
581 return None, None, line
581 return None, None, line
582 elif line[0] == '?':
582 elif line[0] == '?':
583 line = 'help ' + line[1:]
583 line = 'help ' + line[1:]
584 elif line[0] == '!':
584 elif line[0] == '!':
585 if hasattr(self, 'do_shell'):
585 if hasattr(self, 'do_shell'):
586 line = 'shell ' + line[1:]
586 line = 'shell ' + line[1:]
587 else:
587 else:
588 return None, None, line
588 return None, None, line
589 i, n = 0, len(line)
589 i, n = 0, len(line)
590 while i < n and line[i] in self.identchars: i = i+1
590 while i < n and line[i] in self.identchars: i = i+1
591 cmd, arg = line[:i], line[i:].strip()
591 cmd, arg = line[:i], line[i:].strip()
592 return cmd, arg, line
592 return cmd, arg, line
593 '''
593 '''
594 for tok in tokenize(iter(s.splitlines()).__next__):
594 for tok in tokenize(iter(s.splitlines()).__next__):
595 print(tok)
595 print(tok)
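For reference, a short sketch of the two helpers documented above, detect_encoding() and untokenize(), again assuming the patched module is importable under the placeholder name `patched_tokenize`:

    import io
    import patched_tokenize as tokenize  # placeholder name; the real module path is not shown in this view

    source = b"# -*- coding: latin-1 -*-\nval = rb'abc'\n"

    # detect_encoding() reads at most two lines and returns the declared encoding
    # plus the raw byte lines it consumed; the cookie is normalized by
    # _get_normal_name(), so 'latin-1' comes back as 'iso-8859-1'.
    encoding, consumed = tokenize.detect_encoding(io.BytesIO(source).readline)
    print(encoding, consumed)

    # untokenize() accepts the 5-tuples produced by tokenize() and, unlike the
    # stdlib version, returns a str rather than encoded bytes (see the patch
    # notes at the top of the file).
    tokens = list(tokenize.tokenize(io.BytesIO(source).readline))
    print(tokenize.untokenize(tokens))

On Python 3.3 the rb'abc' literal above comes back as a single STRING token, which is exactly what the string-prefix table fixed in this changeset enables.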