upstream/ipython Commit - r8309:aca66064

1

"""

1

"""

2

Tools to open .py files as Unicode, using the encoding specified within the file,

2

Tools to open .py files as Unicode, using the encoding specified within the file,

3

as per PEP 263.

3

as per PEP 263.

4

5

Much of the code is taken from the tokenize module in Python 3.2.

5

Much of the code is taken from the tokenize module in Python 3.2.

6

"""

6

"""

7

from __future__ import absolute_import

7

from __future__ import absolute_import

8

9

import io

9

import io

10

from io import TextIOWrapper

10

from io import TextIOWrapper, BytesIO

11

import re

11

import re

12

from StringIO import StringIO

13

import urllib

12

import urllib

14

13

15

cookie_re = re.compile(ur"coding[:=]\s*([-\w.]+)", re.UNICODE)

14

cookie_re = re.compile(ur"coding[:=]\s*([-\w.]+)", re.UNICODE)

16

cookie_comment_re = re.compile(ur"^\s*#.*coding[:=]\s*([-\w.]+)", re.UNICODE)

15

cookie_comment_re = re.compile(ur"^\s*#.*coding[:=]\s*([-\w.]+)", re.UNICODE)

17

16

18

try:

17

try:

19

# Available in Python 3

18

# Available in Python 3

20

from tokenize import detect_encoding

19

from tokenize import detect_encoding

21

except ImportError:

20

except ImportError:

22

from codecs import lookup, BOM_UTF8

21

from codecs import lookup, BOM_UTF8

23

22

24

# Copied from Python 3.2 tokenize

23

# Copied from Python 3.2 tokenize

25

def _get_normal_name(orig_enc):

24

def _get_normal_name(orig_enc):

26

"""Imitates get_normal_name in tokenizer.c."""

25

"""Imitates get_normal_name in tokenizer.c."""

27

# Only care about the first 12 characters.

26

# Only care about the first 12 characters.

28

enc = orig_enc[:12].lower().replace("_", "-")

27

enc = orig_enc[:12].lower().replace("_", "-")

29

if enc == "utf-8" or enc.startswith("utf-8-"):

28

if enc == "utf-8" or enc.startswith("utf-8-"):

30

return "utf-8"

29

return "utf-8"

31

if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \

30

if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \

32

enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):

31

enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):

33

return "iso-8859-1"

32

return "iso-8859-1"

34

return orig_enc

33

return orig_enc

35

34

36

# Copied from Python 3.2 tokenize

35

# Copied from Python 3.2 tokenize

37

def detect_encoding(readline):

36

def detect_encoding(readline):

38

"""

37

"""

39

The detect_encoding() function is used to detect the encoding that should

38

The detect_encoding() function is used to detect the encoding that should

40

be used to decode a Python source file. It requires one argument, readline,

39

be used to decode a Python source file. It requires one argument, readline,

41

in the same way as the tokenize() generator.

40

in the same way as the tokenize() generator.

42

41

43

It will call readline a maximum of twice, and return the encoding used

42

It will call readline a maximum of twice, and return the encoding used

44

(as a string) and a list of any lines (left as bytes) it has read in.

43

(as a string) and a list of any lines (left as bytes) it has read in.

45

44

46

It detects the encoding from the presence of a utf-8 bom or an encoding

45

It detects the encoding from the presence of a utf-8 bom or an encoding

47

cookie as specified in pep-0263. If both a bom and a cookie are present,

46

cookie as specified in pep-0263. If both a bom and a cookie are present,

48

but disagree, a SyntaxError will be raised. If the encoding cookie is an

47

but disagree, a SyntaxError will be raised. If the encoding cookie is an

49

invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,

48

invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,

50

'utf-8-sig' is returned.

49

'utf-8-sig' is returned.

51

50

52

If no encoding is specified, then the default of 'utf-8' will be returned.

51

If no encoding is specified, then the default of 'utf-8' will be returned.

53

"""

52

"""

54

bom_found = False

53

bom_found = False

55

encoding = None

54

encoding = None

56

default = 'utf-8'

55

default = 'utf-8'

57

def read_or_stop():

56

def read_or_stop():

58

try:

57

try:

59

return readline()

58

return readline()

60

except StopIteration:

59

except StopIteration:

61

return b''

60

return b''

62

61

63

def find_cookie(line):

62

def find_cookie(line):

64

try:

63

try:

65

line_string = line.decode('ascii')

64

line_string = line.decode('ascii')

66

except UnicodeDecodeError:

65

except UnicodeDecodeError:

67

return None

66

return None

68

67

69

matches = cookie_re.findall(line_string)

68

matches = cookie_re.findall(line_string)

70

if not matches:

69

if not matches:

71

return None

70

return None

72

encoding = _get_normal_name(matches[0])

71

encoding = _get_normal_name(matches[0])

73

try:

72

try:

74

codec = lookup(encoding)

73

codec = lookup(encoding)

75

except LookupError:

74

except LookupError:

76

# This behaviour mimics the Python interpreter

75

# This behaviour mimics the Python interpreter

77

raise SyntaxError("unknown encoding: " + encoding)

76

raise SyntaxError("unknown encoding: " + encoding)

78

77

79

if bom_found:

78

if bom_found:

80

if codec.name != 'utf-8':

79

if codec.name != 'utf-8':

81

# This behaviour mimics the Python interpreter

80

# This behaviour mimics the Python interpreter

82

raise SyntaxError('encoding problem: utf-8')

81

raise SyntaxError('encoding problem: utf-8')

83

encoding += '-sig'

82

encoding += '-sig'

84

return encoding

83

return encoding

85

84

86

first = read_or_stop()

85

first = read_or_stop()

87

if first.startswith(BOM_UTF8):

86

if first.startswith(BOM_UTF8):

88

bom_found = True

87

bom_found = True

89

first = first[3:]

88

first = first[3:]

90

default = 'utf-8-sig'

89

default = 'utf-8-sig'

91

if not first:

90

if not first:

92

return default, []

91

return default, []

93

92

94

encoding = find_cookie(first)

93

encoding = find_cookie(first)

95

if encoding:

94

if encoding:

96

return encoding, [first]

95

return encoding, [first]

97

96

98

second = read_or_stop()

97

second = read_or_stop()

99

if not second:

98

if not second:

100

return default, [first]

99

return default, [first]

101

100

102

encoding = find_cookie(second)

101

encoding = find_cookie(second)

103

if encoding:

102

if encoding:

104

return encoding, [first, second]

103

return encoding, [first, second]

105

104

106

return default, [first, second]

105

return default, [first, second]

107

106

108

try:

107

try:

109

# Available in Python 3.2 and above.

108

# Available in Python 3.2 and above.

110

from tokenize import open

109

from tokenize import open

111

except ImportError:

110

except ImportError:

112

# Copied from Python 3.2 tokenize

111

# Copied from Python 3.2 tokenize

113

def open(filename):

112

def open(filename):

114

"""Open a file in read only mode using the encoding detected by

113

"""Open a file in read only mode using the encoding detected by

115

detect_encoding().

114

detect_encoding().

116

"""

115

"""

117

buffer = io.open(filename, 'rb') # Tweaked to use io.open for Python 2

116

buffer = io.open(filename, 'rb') # Tweaked to use io.open for Python 2

118

encoding, lines = detect_encoding(buffer.readline)

117

encoding, lines = detect_encoding(buffer.readline)

119

buffer.seek(0)

118

buffer.seek(0)

120

text = TextIOWrapper(buffer, encoding, line_buffering=True)

119

text = TextIOWrapper(buffer, encoding, line_buffering=True)

121

text.mode = 'r'

120

text.mode = 'r'

122

return text

121

return text

123

122

124

def source_to_unicode(txt):

123

def source_to_unicode(txt, errors='replace', skip_encoding_cookie=True):

125

"""Converts string with python source code to unicode

124

"""Converts a bytes string with python source code to unicode.

125

126

Unicode strings are passed through unchanged. Byte strings are checked

127

for the python source file encoding cookie to determine encoding.

128

txt can be either a bytes buffer or a string containing the source

129

code.

126

"""

130

"""

127

if isinstance(txt, unicode):

131

if isinstance(txt, unicode):

128

return txt

132

return txt

133

if isinstance(txt, str):

134

buffer = BytesIO(txt)

135

else:

136

buffer = txt

129

try:

137

try:

130

coding, _ = detect_encoding(~~StringIO~~(~~txt~~).readline)

138

encoding, _ = detect_encoding(buffer.readline)

131

except SyntaxError:

139

except SyntaxError:

132

coding = "ascii"

140

encoding = "ascii"

133

return txt.decode(coding, errors="replace")

141

buffer.seek(0)

142

text = TextIOWrapper(buffer, encoding, errors=errors, line_buffering=True)

143

text.mode = 'r'

144

if skip_encoding_cookie:

145

return u"".join(strip_encoding_cookie(text))

146

else:

147

return text.read()

134

148

135

def strip_encoding_cookie(filelike):

149

def strip_encoding_cookie(filelike):

136

"""Generator to pull lines from a text-mode file, skipping the encoding

150

"""Generator to pull lines from a text-mode file, skipping the encoding

137

cookie if it is found in the first two lines.

151

cookie if it is found in the first two lines.

138

"""

152

"""

139

it = iter(filelike)

153

it = iter(filelike)

140

try:

154

try:

141

first = next(it)

155

first = next(it)

142

if not cookie_comment_re.match(first):

156

if not cookie_comment_re.match(first):

143

yield first

157

yield first

144

second = next(it)

158

second = next(it)

145

if not cookie_comment_re.match(second):

159

if not cookie_comment_re.match(second):

146

yield second

160

yield second

147

except StopIteration:

161

except StopIteration:

148

return

162

return

149

163

150

for line in it:

164

for line in it:

151

yield line

165

yield line

152

166

153

def read_py_file(filename, skip_encoding_cookie=True):

167

def read_py_file(filename, skip_encoding_cookie=True):

154

"""Read a Python file, using the encoding declared inside the file.

168

"""Read a Python file, using the encoding declared inside the file.

155

169

156

Parameters

170

Parameters

157

----------

171

----------

158

filename : str

172

filename : str

159

The path to the file to read.

173

The path to the file to read.

160

skip_encoding_cookie : bool

174

skip_encoding_cookie : bool

161

If True (the default), and the encoding declaration is found in the first

175

If True (the default), and the encoding declaration is found in the first

162

two lines, that line will be excluded from the output - compiling a

176

two lines, that line will be excluded from the output - compiling a

163

unicode string with an encoding declaration is a SyntaxError in Python 2.

177

unicode string with an encoding declaration is a SyntaxError in Python 2.

164

178

165

Returns

179

Returns

166

-------

180

-------

167

A unicode string containing the contents of the file.

181

A unicode string containing the contents of the file.

168

"""

182

"""

169

with open(filename) as f: # the open function defined in this module.

183

with open(filename) as f: # the open function defined in this module.

170

if skip_encoding_cookie:

184

if skip_encoding_cookie:

171

return "".join(strip_encoding_cookie(f))

185

return "".join(strip_encoding_cookie(f))

172

else:

186

else:

173

return f.read()

187

return f.read()

174

188

175

def read_py_url(url, errors='replace', skip_encoding_cookie=True):

189

def read_py_url(url, errors='replace', skip_encoding_cookie=True):

176

"""Read a Python file from a URL, using the encoding declared inside the file.

190

"""Read a Python file from a URL, using the encoding declared inside the file.

177

191

178

Parameters

192

Parameters

179

----------

193

----------

180

url : str

194

url : str

181

The URL from which to fetch the file.

195

The URL from which to fetch the file.

182

errors : str

196

errors : str

183

How to handle decoding errors in the file. Options are the same as for

197

How to handle decoding errors in the file. Options are the same as for

184

bytes.decode(), but here 'replace' is the default.

198

bytes.decode(), but here 'replace' is the default.

185

skip_encoding_cookie : bool

199

skip_encoding_cookie : bool

186

If True (the default), and the encoding declaration is found in the first

200

If True (the default), and the encoding declaration is found in the first

187

two lines, that line will be excluded from the output - compiling a

201

two lines, that line will be excluded from the output - compiling a

188

unicode string with an encoding declaration is a SyntaxError in Python 2.

202

unicode string with an encoding declaration is a SyntaxError in Python 2.

189

203

190

Returns

204

Returns

191

-------

205

-------

192

A unicode string containing the contents of the file.

206

A unicode string containing the contents of the file.

193

"""

207

"""

194

response = urllib.urlopen(url)

208

response = urllib.urlopen(url)

195

buffer = io.BytesIO(response.read())

209

buffer = io.BytesIO(response.read())

196

encoding, lines = detect_encoding(buffer.readline)

210

return source_to_unicode(buffer, errors, skip_encoding_cookie)

197

buffer.seek(0)

198

text = TextIOWrapper(buffer, encoding, errors=errors, line_buffering=True)

199

text.mode = 'r'

200

if skip_encoding_cookie:

201

return "".join(strip_encoding_cookie(text))

202

else:

203

return text.read()

204

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

             """
             Tools to open .py files as Unicode, using the encoding specified within the file,
             as per PEP 263.
             Much of the code is taken from the tokenize module in Python 3.2.
             """
             from __future__ import absolute_import
             import io
-            from io import TextIOWrapper
+            from io import TextIOWrapper, BytesIO
             import re
-            from StringIO import StringIO
             import urllib
             cookie_re = re.compile(ur"coding[:=]\s*([-\w.]+)", re.UNICODE)
             cookie_comment_re = re.compile(ur"^\s*#.*coding[:=]\s*([-\w.]+)", re.UNICODE)
             try:
                 # Available in Python 3
                 from tokenize import detect_encoding
             except ImportError:
                 from codecs import lookup, BOM_UTF8
                 # Copied from Python 3.2 tokenize
                 def _get_normal_name(orig_enc):
                     """Imitates get_normal_name in tokenizer.c."""
                     # Only care about the first 12 characters.
                     enc = orig_enc[:12].lower().replace("_", "-")
                     if enc == "utf-8" or enc.startswith("utf-8-"):
                         return "utf-8"
                     if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
                        enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
                         return "iso-8859-1"
                     return orig_enc
                 # Copied from Python 3.2 tokenize
                 def detect_encoding(readline):
                     """
                     The detect_encoding() function is used to detect the encoding that should
                     be used to decode a Python source file.  It requires one argument, readline,
                     in the same way as the tokenize() generator.
                     It will call readline a maximum of twice, and return the encoding used
                     (as a string) and a list of any lines (left as bytes) it has read in.
                     It detects the encoding from the presence of a utf-8 bom or an encoding
                     cookie as specified in pep-0263.  If both a bom and a cookie are present,
                     but disagree, a SyntaxError will be raised.  If the encoding cookie is an
                     invalid charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
                     'utf-8-sig' is returned.
                     If no encoding is specified, then the default of 'utf-8' will be returned.
                     """
                     bom_found = False
                     encoding = None
                     default = 'utf-8'
                     def read_or_stop():
                         # A readline that signals exhaustion by raising StopIteration
                         # is treated the same as one that returns an empty bytes string.
                         try:
                             return readline()
                         except StopIteration:
                             return b''
                     def find_cookie(line):
                         # Return the normalised encoding named by a cookie on this line,
                         # or None if the line is not pure ASCII or has no cookie.
                         try:
                             line_string = line.decode('ascii')
                         except UnicodeDecodeError:
                             return None
                         matches = cookie_re.findall(line_string)
                         if not matches:
                             return None
                         encoding = _get_normal_name(matches[0])
                         try:
                             codec = lookup(encoding)
                         except LookupError:
                             # This behaviour mimics the Python interpreter
                             raise SyntaxError("unknown encoding: " + encoding)
                         # bom_found is read from the enclosing scope; it is set below,
                         # before find_cookie is first called.
                         if bom_found:
                             if codec.name != 'utf-8':
                                 # This behaviour mimics the Python interpreter
                                 raise SyntaxError('encoding problem: utf-8')
                             encoding += '-sig'
                         return encoding
                     first = read_or_stop()
                     if first.startswith(BOM_UTF8):
                         bom_found = True
                         # Drop the BOM bytes; the returned first line no longer has them.
                         first = first[3:]
                         default = 'utf-8-sig'
                     if not first:
                         return default, []
                     encoding = find_cookie(first)
                     if encoding:
                         return encoding, [first]
                     # No cookie on the first line: the second line is the last place
                     # that is checked.
                     second = read_or_stop()
                     if not second:
                         return default, [first]
                     encoding = find_cookie(second)
                     if encoding:
                         return encoding, [first, second]
                     return default, [first, second]
             try:
                 # Available in Python 3.2 and above.
                 from tokenize import open
             except ImportError:
                 # Copied from Python 3.2 tokenize
                 def open(filename):
                     """Open a file in read only mode using the encoding detected by
                     detect_encoding().

                     Parameters
                     ----------
                     filename : str
                       The path to the file to open.

                     Returns
                     -------
                     A text-mode wrapper around the file, positioned at the start and
                     decoding with the detected encoding.

                     Raises
                     ------
                     SyntaxError
                       Propagated from detect_encoding(), e.g. for an unknown encoding
                       cookie. The underlying file is closed before the error is
                       re-raised.
                     """
                     buffer = io.open(filename, 'rb')   # Tweaked to use io.open for Python 2
                     try:
                         encoding, lines = detect_encoding(buffer.readline)
                         # detect_encoding consumed up to two lines; rewind so the
                         # text wrapper starts from the beginning of the file.
                         buffer.seek(0)
                         text = TextIOWrapper(buffer, encoding, line_buffering=True)
                         text.mode = 'r'
                         return text
                     except BaseException:
                         # Do not leak the binary handle when detection or wrapping fails.
                         buffer.close()
                         raise
-            def source_to_unicode(txt):
+            def source_to_unicode(txt, errors='replace', skip_encoding_cookie=True):
-                """Converts string with python source code to unicode
+                """Converts a bytes string with python source code to unicode.
+                Unicode strings are passed through unchanged. Byte strings are checked
+                for the python source file encoding cookie to determine encoding.
+                txt can be either a bytes buffer or a string containing the source
+                code.
                 """
                 if isinstance(txt, unicode):
                     return txt
+                if isinstance(txt, str):
+                    buffer = BytesIO(txt)
+                else:
+                    buffer = txt
                 try:
-                    coding, _ = detect_encoding(StringIO(txt).readline)
+                    encoding, _ = detect_encoding(buffer.readline)
                 except SyntaxError:
-                    coding = "ascii"
+                    encoding = "ascii"
-                return txt.decode(coding, errors="replace")
+                buffer.seek(0)
+                text = TextIOWrapper(buffer, encoding, errors=errors, line_buffering=True)
+                text.mode = 'r'
+                if skip_encoding_cookie:
+                    return u"".join(strip_encoding_cookie(text))
+                else:
+                    return text.read()
             def strip_encoding_cookie(filelike):
                 """Yield the lines of a text-mode file, leaving out any of the first
                 two lines that carries an encoding cookie.

                 Every line after the second is passed through untouched.
                 """
                 lines = iter(filelike)
                 # Only the first two lines are tested against the cookie pattern.
                 for _ in range(2):
                     try:
                         candidate = next(lines)
                     except StopIteration:
                         # Fewer than two lines in the input: nothing more to yield.
                         return
                     if not cookie_comment_re.match(candidate):
                         yield candidate
                 for remaining in lines:
                     yield remaining
             def read_py_file(filename, skip_encoding_cookie=True):
                 """Read a Python file, using the encoding declared inside the file.

                 Parameters
                 ----------
                 filename : str
                   The path to the file to read.
                 skip_encoding_cookie : bool
                   If True (the default), and the encoding declaration is found in the first
                   two lines, that line will be excluded from the output - compiling a
                   unicode string with an encoding declaration is a SyntaxError in Python 2.

                 Returns
                 -------
                 A unicode string containing the contents of the file.
                 """
                 with open(filename) as f:   # the open function defined in this module.
                     if skip_encoding_cookie:
                         # Join on a unicode literal so the result is unicode even when
                         # no lines remain (on Python 2, "".join([]) is a byte string).
                         return u"".join(strip_encoding_cookie(f))
                     return f.read()
             def read_py_url(url, errors='replace', skip_encoding_cookie=True):
                 """Read a Python file from a URL, using the encoding declared inside the file.
                 Parameters
                 ----------
                 url : str
                   The URL from which to fetch the file.
                 errors : str
                   How to handle decoding errors in the file. Options are the same as for
                   bytes.decode(), but here 'replace' is the default.
                 skip_encoding_cookie : bool
                   If True (the default), and the encoding declaration is found in the first
                   two lines, that line will be excluded from the output - compiling a
                   unicode string with an encoding declaration is a SyntaxError in Python 2.
                 Returns
                 -------
                 A unicode string containing the contents of the file.
                 """
                 response = urllib.urlopen(url)
                 buffer = io.BytesIO(response.read())
-                encoding, lines = detect_encoding(buffer.readline)
+                return source_to_unicode(buffer, errors, skip_encoding_cookie)
-                buffer.seek(0)
-                text = TextIOWrapper(buffer, encoding, errors=errors, line_buffering=True)
-                text.mode = 'r'
-                if skip_encoding_cookie:
-                    return "".join(strip_encoding_cookie(text))
-                else:
-                    return text.read()