Add linebreak to docstring to fix markup and silence Sphinx warning...
Doug Latornell
@@ -1,589 +1,590 @@
"""Patched version of standard library tokenize, to deal with various bugs.

Based on Python 3.2 code.

Patches:

- Gareth Rees' patch for Python issue #12691 (untokenizing)
  - Except we don't encode the output of untokenize
- Python 2 compatible syntax, so that it can be byte-compiled at installation
- Newlines in comments and blank lines should be either NL or NEWLINE, depending
  on whether they are in a multi-line statement. Filed as Python issue #17061.
- Export generate_tokens & TokenError
- u and rb literals are allowed under Python 3.3 and above.

------------------------------------------------------------------------------

Tokenization help for Python programs.

tokenize(readline) is a generator that breaks a stream of bytes into
Python tokens. It decodes the bytes according to PEP-0263 for
determining source file encoding.

It accepts a readline-like method which is called repeatedly to get the
next line of input (or b"" for EOF). It generates 5-tuples with these
members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators. Additionally, all token lists start with an ENCODING token
which tells you which encoding was used to decode the bytes stream.
"""
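
# A quick usage sketch (an added illustration; io.BytesIO stands in for any
# object whose readline() returns bytes):
#
#     import io
#     for tok in tokenize(io.BytesIO(b"x = 1\n").readline):
#         print(tok_name[tok.type], repr(tok.string))
#
# The first tuple printed is always the ENCODING pseudo-token described above.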

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
               'Michael Foord')
import builtins
import re
import sys
from token import *
from codecs import lookup, BOM_UTF8
import collections
from io import TextIOWrapper
cookie_re = re.compile("coding[:=]\s*([-\w.]+)")

import token
__all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
                           "NL", "untokenize", "ENCODING", "TokenInfo"]
del token

__all__ += ["generate_tokens", "TokenError"]

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
ENCODING = N_TOKENS + 2
tok_name[ENCODING] = 'ENCODING'
N_TOKENS += 3

class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
    def __repr__(self):
        annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
        return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
                self._replace(type=annotated_type))

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'

# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

Hexnumber = r'0[xX][0-9a-fA-F]+'
Binnumber = r'0[bB][01]+'
Octnumber = r'0[oO][0-7]+'
Decnumber = r'(?:0+|[1-9][0-9]*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?[0-9]+'
Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
Expfloat = r'[0-9]+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
StringPrefix = r'(?:[bB][rR]?|[rR][bB]?|[uU])?'

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group(StringPrefix + "'''", StringPrefix + '"""')
# Single-line ' or " string.
String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")
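
# For example, '**=' is matched above as a single operator token; if the
# one-character alternatives were listed first, it would be split into '*'
# followed by '*='.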

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

def _compile(expr):
    return re.compile(expr, re.UNICODE)

tokenprog, pseudoprog, single3prog, double3prog = map(
    _compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": _compile(Single), '"': _compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None, 'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"' ):
    single_quoted[t] = t

for _prefix in ['rb', 'rB', 'Rb', 'RB', 'u', 'U']:
    _t2 = _prefix+'"""'
    endprogs[_t2] = double3prog
    triple_quoted[_t2] = _t2
    _t1 = _prefix + "'''"
    endprogs[_t1] = single3prog
    triple_quoted[_t1] = _t1
    single_quoted[_prefix+'"'] = _prefix+'"'
    single_quoted[_prefix+"'"] = _prefix+"'"
del _prefix, _t2, _t1
endprogs['u'] = None
endprogs['U'] = None

del _compile

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass


class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.encoding = 'utf-8'

    def add_whitespace(self, tok_type, start):
        row, col = start
        assert row >= self.prev_row
        col_offset = col - self.prev_col
        if col_offset > 0:
            self.tokens.append(" " * col_offset)
        elif row > self.prev_row and tok_type not in (NEWLINE, NL, ENDMARKER):
            # Line was backslash-continued.
            self.tokens.append(" ")

    def untokenize(self, tokens):
        iterable = iter(tokens)
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end = t[:4]
            if tok_type == ENCODING:
                self.encoding = token
                continue
            self.add_whitespace(tok_type, start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        # This import is here to avoid problems when the itertools
        # module is not built yet and tokenize is imported.
        from itertools import chain
        startline = False
        prevstring = False
        indents = []
        toks_append = self.tokens.append

        for tok in chain([token], iterable):
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


def untokenize(tokens):
    """
    Convert ``tokens`` (an iterable) back into Python source code. Return
    a bytes object, encoded using the encoding specified by the last
    ENCODING token in ``tokens``, or UTF-8 if no ENCODING token is found.

    The result is guaranteed to tokenize back to match the input so that
    the conversion is lossless and round-trips are assured. The
    guarantee applies only to the token type and token string as the
    spacing between tokens (column positions) may change.

    :func:`untokenize` has two modes. If the input tokens are sequences
    of length 2 (``type``, ``string``) then spaces are added as necessary to
    preserve the round-trip property.

    If the input tokens are sequences of length 4 or more (``type``,
    ``string``, ``start``, ``end``), as returned by :func:`tokenize`, then
    spaces are added so that each token appears in the result at the
    position indicated by ``start`` and ``end``, if possible.
    """
    return Untokenizer().untokenize(tokens)
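
# Illustrative sketch of the 2-tuple ("compat") mode described in the docstring
# above; the helper name _untokenize_roundtrip_example is hypothetical and not
# part of the stdlib API.
def _untokenize_roundtrip_example(source=b"x = 1\n"):
    import io
    # Keep only (type, string) pairs; untokenize() then re-inserts spacing so
    # that re-tokenizing the result yields matching token types and strings.
    pairs = [(tok.type, tok.string)
             for tok in tokenize(io.BytesIO(source).readline)]
    return untokenize(pairs)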


def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a utf-8 bom or an encoding
    cookie as specified in pep-0263. If both a bom and a cookie are present,
    but disagree, a SyntaxError will be raised. If the encoding cookie is an
    invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            # Decode as UTF-8. Either the line is an encoding declaration,
            # in which case it should be pure ASCII, or it must be UTF-8
            # per default encoding.
            line_string = line.decode('utf-8')
        except UnicodeDecodeError:
            raise SyntaxError("invalid or missing encoding declaration")

        matches = cookie_re.findall(line_string)
        if not matches:
            return None
        encoding = _get_normal_name(matches[0])
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if encoding != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
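
# Illustrative sketch (the helper name is hypothetical): detect_encoding() reads
# at most two lines and returns them, so a caller can replay them ahead of the
# rest of the stream, which is what tokenize() below does.
def _detect_encoding_example():
    import io
    buf = io.BytesIO(b"# -*- coding: latin-1 -*-\nprint('hi')\n")
    encoding, consumed = detect_encoding(buf.readline)
    # encoding == 'iso-8859-1'; consumed == [b"# -*- coding: latin-1 -*-\n"]
    return encoding, consumed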


def open(filename):
    """Open a file in read only mode using the encoding detected by
    detect_encoding().
    """
    buffer = builtins.open(filename, 'rb')
    encoding, lines = detect_encoding(buffer.readline)
    buffer.seek(0)
    text = TextIOWrapper(buffer, encoding, line_buffering=True)
    text.mode = 'r'
    return text


def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as bytes. Alternately, readline
    can be a callable function terminating with :class:`StopIteration`::

        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
    """
    # This import is here to avoid problems when the itertools module is not
    # built yet and tokenize is imported.
    from itertools import chain, repeat
    encoding, consumed = detect_encoding(readline)
    rl_gen = iter(readline, b"")
    empty = repeat(b"")
    return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding)
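
# Illustrative sketch (names are hypothetical) of the "alternate readline" form
# from the docstring above: any callable that returns bytes lines and raises
# StopIteration at end of input will do, e.g. the __next__ of a line iterator.
def _tokenize_lines_example(lines=(b"if True:\n", b"    pass\n")):
    readline = iter(lines).__next__
    # Expected sequence: ENCODING, NAME, NAME, OP, NEWLINE, INDENT, NAME,
    # NEWLINE, DEDENT, ENDMARKER (keywords are reported as NAME tokens).
    return [tok_name[tok.type] for tok in tokenize(readline)]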


def _tokenize(readline, encoding):
    lnum = parenlev = continued = 0
    numchars = '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    if encoding is not None:
        if encoding == "utf-8-sig":
            # BOM will already have been stripped.
            encoding = "utf-8"
        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
    while True:             # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = b''

        if encoding is not None:
            line = line.decode(encoding)
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield TokenInfo(STRING, contstr + line[:end],
                                strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield TokenInfo(ERRORTOKEN, contstr + line,
                                strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield TokenInfo(COMMENT, comment_token,
                                    (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield TokenInfo(NEWLINE, line[nl_pos:],
                                    (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield TokenInfo(NEWLINE, line[pos:],
                                    (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if (initial in numchars or                 # ordinary number
                    (initial == '.' and token != '.' and token != '...')):
                    yield TokenInfo(NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield TokenInfo(NL if parenlev > 0 else NEWLINE,
                                    token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield TokenInfo(COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield TokenInfo(STRING, token, spos, epos, line)
                elif initial.isidentifier():               # ordinary name
                    yield TokenInfo(NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield TokenInfo(OP, token, spos, epos, line)
            else:
                yield TokenInfo(ERRORTOKEN, line[pos],
                                (lnum, pos), (lnum, pos+1), line)
                pos += 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')


# An undocumented, backwards compatible, API for all the places in the standard
# library that expect to be able to use tokenize with strings
def generate_tokens(readline):
    return _tokenize(readline, None)
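
# Illustrative sketch (the helper name is hypothetical): generate_tokens() is
# the str-based variant -- readline returns str lines, no encoding detection is
# performed, and no leading ENCODING token is emitted.
def _generate_tokens_example(source="x = 1\n"):
    import io
    return [(tok_name[tok.type], tok.string)
            for tok in generate_tokens(io.StringIO(source).readline)]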

if __name__ == "__main__":
    # Quick sanity check
    s = b'''def parseline(self, line):
        """Parse the line into a command name and a string containing
        the arguments. Returns a tuple containing (command, args, line).
        'command' and 'args' may be None if the line couldn't be parsed.
        """
        line = line.strip()
        if not line:
            return None, None, line
        elif line[0] == '?':
            line = 'help ' + line[1:]
        elif line[0] == '!':
            if hasattr(self, 'do_shell'):
                line = 'shell ' + line[1:]
            else:
                return None, None, line
        i, n = 0, len(line)
        while i < n and line[i] in self.identchars: i = i+1
        cmd, arg = line[:i], line[i:].strip()
        return cmd, arg, line
    '''
    for tok in tokenize(iter(s.splitlines()).__next__):
        print(tok)