upstream/ipython Commit - r8314:a58f4cc5

1

"""

1

"""

2

Tools to open .py files as Unicode, using the encoding specified within the file,

2

Tools to open .py files as Unicode, using the encoding specified within the file,

3

as per PEP 263.

3

as per PEP 263.

4

5

Much of the code is taken from the tokenize module in Python 3.2.

5

Much of the code is taken from the tokenize module in Python 3.2.

6

"""

6

"""

7

from __future__ import absolute_import

7

from __future__ import absolute_import

8

9

import io

9

import io

10

from io import TextIOWrapper, BytesIO

10

from io import TextIOWrapper, BytesIO

11

import re

11

import re

12

import urllib

12

import urllib

13

14

cookie_re = re.compile(ur"coding[:=]\s*([-\w.]+)", re.UNICODE)

14

cookie_re = re.compile(ur"coding[:=]\s*([-\w.]+)", re.UNICODE)

15

cookie_comment_re = re.compile(ur"^\s*#.*coding[:=]\s*([-\w.]+)", re.UNICODE)

15

cookie_comment_re = re.compile(ur"^\s*#.*coding[:=]\s*([-\w.]+)", re.UNICODE)

16

17

try:

17

try:

18

# Available in Python 3

18

# Available in Python 3

19

from tokenize import detect_encoding

19

from tokenize import detect_encoding

20

except ImportError:

20

except ImportError:

21

from codecs import lookup, BOM_UTF8

21

from codecs import lookup, BOM_UTF8

22

23

# Copied from Python 3.2 tokenize

23

# Copied from Python 3.2 tokenize

24

def _get_normal_name(orig_enc):

24

def _get_normal_name(orig_enc):

25

"""Imitates get_normal_name in tokenizer.c."""

25

"""Imitates get_normal_name in tokenizer.c."""

26

# Only care about the first 12 characters.

26

# Only care about the first 12 characters.

27

enc = orig_enc[:12].lower().replace("_", "-")

27

enc = orig_enc[:12].lower().replace("_", "-")

28

if enc == "utf-8" or enc.startswith("utf-8-"):

28

if enc == "utf-8" or enc.startswith("utf-8-"):

29

return "utf-8"

29

return "utf-8"

30

if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \

30

if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \

31

enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):

31

enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):

32

return "iso-8859-1"

32

return "iso-8859-1"

33

return orig_enc

33

return orig_enc

34

35

# Copied from Python 3.2 tokenize

35

# Copied from Python 3.2 tokenize

36

def detect_encoding(readline):

36

def detect_encoding(readline):

37

"""

37

"""

38

The detect_encoding() function is used to detect the encoding that should

38

The detect_encoding() function is used to detect the encoding that should

39

be used to decode a Python source file. It requires one argument, readline,

39

be used to decode a Python source file. It requires one argument, readline,

40

in the same way as the tokenize() generator.

40

in the same way as the tokenize() generator.

41

42

It will call readline a maximum of twice, and return the encoding used

42

It will call readline a maximum of twice, and return the encoding used

43

(as a string) and a list of any lines (left as bytes) it has read in.

43

(as a string) and a list of any lines (left as bytes) it has read in.

44

45

It detects the encoding from the presence of a utf-8 bom or an encoding

45

It detects the encoding from the presence of a utf-8 bom or an encoding

46

cookie as specified in pep-0263. If both a bom and a cookie are present,

46

cookie as specified in pep-0263. If both a bom and a cookie are present,

47

but disagree, a SyntaxError will be raised. If the encoding cookie is an

47

but disagree, a SyntaxError will be raised. If the encoding cookie is an

48

invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,

48

invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,

49

'utf-8-sig' is returned.

49

'utf-8-sig' is returned.

50

51

If no encoding is specified, then the default of 'utf-8' will be returned.

51

If no encoding is specified, then the default of 'utf-8' will be returned.

52

"""

52

"""

53

bom_found = False

53

bom_found = False

54

encoding = None

54

encoding = None

55

default = 'utf-8'

55

default = 'utf-8'

56

def read_or_stop():

56

def read_or_stop():

57

try:

57

try:

58

return readline()

58

return readline()

59

except StopIteration:

59

except StopIteration:

60

return b''

60

return b''

61

62

def find_cookie(line):

62

def find_cookie(line):

63

try:

63

try:

64

line_string = line.decode('ascii')

64

line_string = line.decode('ascii')

65

except UnicodeDecodeError:

65

except UnicodeDecodeError:

66

return None

66

return None

67

68

matches = cookie_re.findall(line_string)

68

matches = cookie_re.findall(line_string)

69

if not matches:

69

if not matches:

70

return None

70

return None

71

encoding = _get_normal_name(matches[0])

71

encoding = _get_normal_name(matches[0])

72

try:

72

try:

73

codec = lookup(encoding)

73

codec = lookup(encoding)

74

except LookupError:

74

except LookupError:

75

# This behaviour mimics the Python interpreter

75

# This behaviour mimics the Python interpreter

76

raise SyntaxError("unknown encoding: " + encoding)

76

raise SyntaxError("unknown encoding: " + encoding)

77

78

if bom_found:

78

if bom_found:

79

if codec.name != 'utf-8':

79

if codec.name != 'utf-8':

80

# This behaviour mimics the Python interpreter

80

# This behaviour mimics the Python interpreter

81

raise SyntaxError('encoding problem: utf-8')

81

raise SyntaxError('encoding problem: utf-8')

82

encoding += '-sig'

82

encoding += '-sig'

83

return encoding

83

return encoding

84

85

first = read_or_stop()

85

first = read_or_stop()

86

if first.startswith(BOM_UTF8):

86

if first.startswith(BOM_UTF8):

87

bom_found = True

87

bom_found = True

88

first = first[3:]

88

first = first[3:]

89

default = 'utf-8-sig'

89

default = 'utf-8-sig'

90

if not first:

90

if not first:

91

return default, []

91

return default, []

92

93

encoding = find_cookie(first)

93

encoding = find_cookie(first)

94

if encoding:

94

if encoding:

95

return encoding, [first]

95

return encoding, [first]

96

97

second = read_or_stop()

97

second = read_or_stop()

98

if not second:

98

if not second:

99

return default, [first]

99

return default, [first]

100

101

encoding = find_cookie(second)

101

encoding = find_cookie(second)

102

if encoding:

102

if encoding:

103

return encoding, [first, second]

103

return encoding, [first, second]

104

105

return default, [first, second]

105

return default, [first, second]

106

107

try:

107

try:

108

# Available in Python 3.2 and above.

108

# Available in Python 3.2 and above.

109

from tokenize import open

109

from tokenize import open

110

except ImportError:

110

except ImportError:

111

# Copied from Python 3.2 tokenize

111

# Copied from Python 3.2 tokenize

112

def open(filename):

112

def open(filename):

113

"""Open a file in read only mode using the encoding detected by

113

"""Open a file in read only mode using the encoding detected by

114

detect_encoding().

114

detect_encoding().

115

"""

115

"""

116

buffer = io.open(filename, 'rb') # Tweaked to use io.open for Python 2

116

buffer = io.open(filename, 'rb') # Tweaked to use io.open for Python 2

117

encoding, lines = detect_encoding(buffer.readline)

117

encoding, lines = detect_encoding(buffer.readline)

118

buffer.seek(0)

118

buffer.seek(0)

119

text = TextIOWrapper(buffer, encoding, line_buffering=True)

119

text = TextIOWrapper(buffer, encoding, line_buffering=True)

120

text.mode = 'r'

120

text.mode = 'r'

121

return text

121

return text

122

123

def source_to_unicode(txt, errors='replace', skip_encoding_cookie=True):

123

def source_to_unicode(txt, errors='replace', skip_encoding_cookie=True):

124

"""Converts a bytes string with python source code to unicode.

124

"""Converts a bytes string with python source code to unicode.

125

126

Unicode strings are passed through unchanged. Byte strings are checked

126

Unicode strings are passed through unchanged. Byte strings are checked

127

for the python source file encoding cookie to determine encoding.

127

for the python source file encoding cookie to determine encoding.

128

txt can be either a bytes buffer or a string containing the source

128

txt can be either a bytes buffer or a string containing the source

129

code.

129

code.

130

"""

130

"""

131

if isinstance(txt, unicode):

131

if isinstance(txt, unicode):

132

return txt

132

return txt

133

if isinstance(txt, ~~str~~):

133

if isinstance(txt, bytes):

134

buffer = BytesIO(txt)

134

buffer = BytesIO(txt)

135

else:

135

else:

136

buffer = txt

136

buffer = txt

137

try:

137

try:

138

encoding, _ = detect_encoding(buffer.readline)

138

encoding, _ = detect_encoding(buffer.readline)

139

except SyntaxError:

139

except SyntaxError:

140

encoding = "ascii"

140

encoding = "ascii"

141

buffer.seek(0)

141

buffer.seek(0)

142

text = TextIOWrapper(buffer, encoding, errors=errors, line_buffering=True)

142

text = TextIOWrapper(buffer, encoding, errors=errors, line_buffering=True)

143

text.mode = 'r'

143

text.mode = 'r'

144

if skip_encoding_cookie:

144

if skip_encoding_cookie:

145

return u"".join(strip_encoding_cookie(text))

145

return u"".join(strip_encoding_cookie(text))

146

else:

146

else:

147

return text.read()

147

return text.read()

148

149

def strip_encoding_cookie(filelike):

149

def strip_encoding_cookie(filelike):

150

"""Generator to pull lines from a text-mode file, skipping the encoding

150

"""Generator to pull lines from a text-mode file, skipping the encoding

151

cookie if it is found in the first two lines.

151

cookie if it is found in the first two lines.

152

"""

152

"""

153

it = iter(filelike)

153

it = iter(filelike)

154

try:

154

try:

155

first = next(it)

155

first = next(it)

156

if not cookie_comment_re.match(first):

156

if not cookie_comment_re.match(first):

157

yield first

157

yield first

158

second = next(it)

158

second = next(it)

159

if not cookie_comment_re.match(second):

159

if not cookie_comment_re.match(second):

160

yield second

160

yield second

161

except StopIteration:

161

except StopIteration:

162

return

162

return

163

164

for line in it:

164

for line in it:

165

yield line

165

yield line

166

167

def read_py_file(filename, skip_encoding_cookie=True):

167

def read_py_file(filename, skip_encoding_cookie=True):

168

"""Read a Python file, using the encoding declared inside the file.

168

"""Read a Python file, using the encoding declared inside the file.

169

170

Parameters

170

Parameters

171

----------

171

----------

172

filename : str

172

filename : str

173

The path to the file to read.

173

The path to the file to read.

174

skip_encoding_cookie : bool

174

skip_encoding_cookie : bool

175

If True (the default), and the encoding declaration is found in the first

175

If True (the default), and the encoding declaration is found in the first

176

two lines, that line will be excluded from the output - compiling a

176

two lines, that line will be excluded from the output - compiling a

177

unicode string with an encoding declaration is a SyntaxError in Python 2.

177

unicode string with an encoding declaration is a SyntaxError in Python 2.

178

179

Returns

179

Returns

180

-------

180

-------

181

A unicode string containing the contents of the file.

181

A unicode string containing the contents of the file.

182

"""

182

"""

183

with open(filename) as f: # the open function defined in this module.

183

with open(filename) as f: # the open function defined in this module.

184

if skip_encoding_cookie:

184

if skip_encoding_cookie:

185

return "".join(strip_encoding_cookie(f))

185

return "".join(strip_encoding_cookie(f))

186

else:

186

else:

187

return f.read()

187

return f.read()

188

189

def read_py_url(url, errors='replace', skip_encoding_cookie=True):

189

def read_py_url(url, errors='replace', skip_encoding_cookie=True):

190

"""Read a Python file from a URL, using the encoding declared inside the file.

190

"""Read a Python file from a URL, using the encoding declared inside the file.

191

192

Parameters

192

Parameters

193

----------

193

----------

194

url : str

194

url : str

195

The URL from which to fetch the file.

195

The URL from which to fetch the file.

196

errors : str

196

errors : str

197

How to handle decoding errors in the file. Options are the same as for

197

How to handle decoding errors in the file. Options are the same as for

198

bytes.decode(), but here 'replace' is the default.

198

bytes.decode(), but here 'replace' is the default.

199

skip_encoding_cookie : bool

199

skip_encoding_cookie : bool

200

If True (the default), and the encoding declaration is found in the first

200

If True (the default), and the encoding declaration is found in the first

201

two lines, that line will be excluded from the output - compiling a

201

two lines, that line will be excluded from the output - compiling a

202

unicode string with an encoding declaration is a SyntaxError in Python 2.

202

unicode string with an encoding declaration is a SyntaxError in Python 2.

203

204

Returns

204

Returns

205

-------

205

-------

206

A unicode string containing the contents of the file.

206

A unicode string containing the contents of the file.

207

"""

207

"""

208

response = urllib.urlopen(url)

208

response = urllib.urlopen(url)

209

buffer = io.BytesIO(response.read())

209

buffer = io.BytesIO(response.read())

210

return source_to_unicode(buffer, errors, skip_encoding_cookie)

210

return source_to_unicode(buffer, errors, skip_encoding_cookie)

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

             """
             Tools to open .py files as Unicode, using the encoding specified within the file,
             as per PEP 263.
             Much of the code is taken from the tokenize module in Python 3.2.
             """
             from __future__ import absolute_import
             import io
             from io import TextIOWrapper, BytesIO
             import re
             import urllib
             # Matches a PEP 263 style "coding:" / "coding=" declaration anywhere in a
             # line; group 1 is the declared codec name.  Plain raw strings are used
             # because the ``ur""`` prefix is a syntax error on Python 3, while an
             # ASCII-only pattern matches unicode text identically on Python 2.
             cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)", re.UNICODE)
             # Same declaration, but only when the line is a comment (optional leading
             # whitespace followed by '#').
             cookie_comment_re = re.compile(r"^\s*#.*coding[:=]\s*([-\w.]+)", re.UNICODE)
             try:
                 # Available in Python 3
                 from tokenize import detect_encoding
             except ImportError:
                 from codecs import lookup, BOM_UTF8
                 # Copied from Python 3.2 tokenize
                 def _get_normal_name(orig_enc):
                     """Normalise common spellings of UTF-8 and Latin-1 codec names.

                     Mirrors get_normal_name in CPython's tokenizer.c: the first 12
                     characters are lower-cased with underscores turned into hyphens and
                     compared against the known aliases.  Returns "utf-8" or
                     "iso-8859-1" for a recognised alias (optionally followed by a
                     "-suffix"); any other name is returned unchanged.
                     """
                     prefix = orig_enc[:12].lower().replace("_", "-")
                     if prefix == "utf-8" or prefix.startswith("utf-8-"):
                         return "utf-8"
                     for alias in ("latin-1", "iso-8859-1", "iso-latin-1"):
                         # Accept the bare alias or the alias followed by a variant suffix.
                         if prefix == alias or prefix.startswith(alias + "-"):
                             return "iso-8859-1"
                     return orig_enc
                 # Copied from Python 3.2 tokenize
                 def detect_encoding(readline):
                     """
                     Detect the encoding that should be used to decode a Python source
                     file.  It requires one argument, readline, in the same way as the
                     tokenize() generator.

                     It will call readline a maximum of twice, and return the encoding
                     used (as a string) and a list of any lines (left as bytes) it has
                     read in.  A leading UTF-8 BOM is stripped from the first returned
                     line.

                     It detects the encoding from the presence of a utf-8 bom or an
                     encoding cookie as specified in pep-0263.  The cookie is only
                     honoured on a comment line, so text such as "coding: x" inside
                     code or a string literal is ignored.  If both a bom and a cookie
                     are present, but disagree, a SyntaxError will be raised.  If the
                     encoding cookie is an invalid charset, raise a SyntaxError.  Note
                     that if a utf-8 bom is found, 'utf-8-sig' is returned.

                     If no encoding is specified, then the default of 'utf-8' will be
                     returned.
                     """
                     bom_found = False
                     encoding = None
                     default = 'utf-8'

                     def read_or_stop():
                         # readline may signal end of input by raising StopIteration
                         # (e.g. a generator's next method); treat that as EOF.
                         try:
                             return readline()
                         except StopIteration:
                             return b''

                     def find_cookie(line):
                         # A line that is not pure ASCII cannot carry a usable cookie.
                         try:
                             line_string = line.decode('ascii')
                         except UnicodeDecodeError:
                             return None

                         # PEP 263 only recognises the declaration inside a comment
                         # line; this also keeps detection in step with
                         # strip_encoding_cookie(), which uses the same pattern.
                         if not cookie_comment_re.match(line_string):
                             return None

                         matches = cookie_re.findall(line_string)
                         if not matches:
                             return None
                         encoding = _get_normal_name(matches[0])
                         try:
                             codec = lookup(encoding)
                         except LookupError:
                             # This behaviour mimics the Python interpreter
                             raise SyntaxError("unknown encoding: " + encoding)

                         if bom_found:
                             if codec.name != 'utf-8':
                                 # This behaviour mimics the Python interpreter
                                 raise SyntaxError('encoding problem: utf-8')
                             encoding += '-sig'
                         return encoding

                     first = read_or_stop()
                     if first.startswith(BOM_UTF8):
                         bom_found = True
                         first = first[3:]   # drop the three BOM bytes
                         default = 'utf-8-sig'
                     if not first:
                         return default, []

                     encoding = find_cookie(first)
                     if encoding:
                         return encoding, [first]

                     second = read_or_stop()
                     if not second:
                         return default, [first]

                     encoding = find_cookie(second)
                     if encoding:
                         return encoding, [first, second]

                     return default, [first, second]
             try:
                 # Available in Python 3.2 and above.
                 from tokenize import open
             except ImportError:
                 # Copied from Python 3.2 tokenize
                 def open(filename):
                     """Open a file in read only mode using the encoding detected by
                     detect_encoding().

                     Returns a text-mode TextIOWrapper positioned at the start of the
                     file, with its ``mode`` attribute set to 'r'.  Any exception raised
                     while detecting the encoding or building the wrapper (for example
                     the SyntaxError for an unknown encoding cookie) is propagated after
                     the underlying binary file has been closed.
                     """
                     raw = io.open(filename, 'rb')   # Tweaked to use io.open for Python 2
                     try:
                         encoding, _ = detect_encoding(raw.readline)
                         # Rewind so the wrapper decodes from the first byte again.
                         raw.seek(0)
                         text = TextIOWrapper(raw, encoding, line_buffering=True)
                         text.mode = 'r'
                         return text
                     except BaseException:
                         # Do not leak the file handle when detection fails.
                         raw.close()
                         raise
             def source_to_unicode(txt, errors='replace', skip_encoding_cookie=True):
                 """Converts a bytes string with python source code to unicode.

                 Unicode strings are passed through unchanged. Byte strings are checked
                 for the python source file encoding cookie to determine encoding.
                 txt can be either a bytes buffer or a string containing the source
                 code.

                 Parameters
                 ----------
                 txt : unicode, bytes or binary file-like object
                   The source code. Anything that is neither unicode nor bytes is used
                   directly as a buffer and must provide readline() and seek().
                 errors : str
                   Error handler passed to the decoder; 'replace' is the default.
                 skip_encoding_cookie : bool
                   If True (the default), an encoding declaration found in the first
                   two lines is left out of the result.

                 Returns
                 -------
                 A unicode string containing the decoded source.
                 """
                 if isinstance(txt, unicode):
                     return txt
                 if isinstance(txt, bytes):
                     buffer = BytesIO(txt)
                 else:
                     buffer = txt
                 try:
                     encoding, _ = detect_encoding(buffer.readline)
                 except SyntaxError:
                     # Unknown or contradictory encoding declaration: fall back to
                     # ASCII and let the `errors` handler deal with other bytes.
                     encoding = "ascii"
                 # detect_encoding consumed up to two lines; decode from the start.
                 buffer.seek(0)
                 text = TextIOWrapper(buffer, encoding, errors=errors, line_buffering=True)
                 text.mode = 'r'
                 if skip_encoding_cookie:
                     return u"".join(strip_encoding_cookie(text))
                 else:
                     return text.read()
             def strip_encoding_cookie(filelike):
                 """Yield the lines of a text-mode file, leaving out any of the first
                 two lines that is an encoding-cookie comment.

                 Every line after the second is yielded unchanged.
                 """
                 lines = iter(filelike)
                 # Only the first two lines may carry the encoding declaration.
                 for _ in range(2):
                     try:
                         candidate = next(lines)
                     except StopIteration:
                         # Fewer than two lines: nothing left to yield.
                         return
                     if cookie_comment_re.match(candidate) is None:
                         yield candidate

                 for remaining in lines:
                     yield remaining
             def read_py_file(filename, skip_encoding_cookie=True):
                 """Read a Python file, using the encoding declared inside the file.

                 Parameters
                 ----------
                 filename : str
                   The path to the file to read.
                 skip_encoding_cookie : bool
                   If True (the default), and the encoding declaration is found in the first
                   two lines, that line will be excluded from the output - compiling a
                   unicode string with an encoding declaration is a SyntaxError in Python 2.

                 Returns
                 -------
                 A unicode string containing the contents of the file.
                 """
                 with open(filename) as f:   # the open function defined in this module.
                     if skip_encoding_cookie:
                         # Join on a unicode literal so the result is unicode even
                         # when no lines remain (a byte-string "" would be returned
                         # as-is on Python 2).
                         return u"".join(strip_encoding_cookie(f))
                     else:
                         return f.read()
             def read_py_url(url, errors='replace', skip_encoding_cookie=True):
                 """Read a Python file from a URL, using the encoding declared inside the file.

                 Parameters
                 ----------
                 url : str
                   The URL from which to fetch the file.
                 errors : str
                   How to handle decoding errors in the file. Options are the same as for
                   bytes.decode(), but here 'replace' is the default.
                 skip_encoding_cookie : bool
                   If True (the default), and the encoding declaration is found in the first
                   two lines, that line will be excluded from the output - compiling a
                   unicode string with an encoding declaration is a SyntaxError in Python 2.

                 Returns
                 -------
                 A unicode string containing the contents of the file.
                 """
                 response = urllib.urlopen(url)
                 try:
                     # Read the whole body into memory so it can be rewound after
                     # the encoding has been detected.
                     buffer = io.BytesIO(response.read())
                 finally:
                     # Release the connection even if the read fails.
                     response.close()
                 return source_to_unicode(buffer, errors, skip_encoding_cookie)