upstream/mercurial-mirror Commit - r17700:5b1b0e4e

1

# This library is free software; you can redistribute it and/or

1

# This library is free software; you can redistribute it and/or

2

# modify it under the terms of the GNU Lesser General Public

2

# modify it under the terms of the GNU Lesser General Public

3

# License as published by the Free Software Foundation; either

3

# License as published by the Free Software Foundation; either

4

# version 2.1 of the License, or (at your option) any later version.

4

# version 2.1 of the License, or (at your option) any later version.

5

#

5

#

6

# This library is distributed in the hope that it will be useful,

6

# This library is distributed in the hope that it will be useful,

7

# but WITHOUT ANY WARRANTY; without even the implied warranty of

7

# but WITHOUT ANY WARRANTY; without even the implied warranty of

8

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

8

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

9

# Lesser General Public License for more details.

9

# Lesser General Public License for more details.

10

#

10

#

11

# You should have received a copy of the GNU Lesser General Public

11

# You should have received a copy of the GNU Lesser General Public

12

# License along with this library; if not, see

12

# License along with this library; if not, see

13

# <http://www.gnu.org/licenses/>.

13

# <http://www.gnu.org/licenses/>.

14

15

# This file is part of urlgrabber, a high-level cross-protocol url-grabber

15

# This file is part of urlgrabber, a high-level cross-protocol url-grabber

16

17

18

# Modified by Benoit Boissinot:

18

# Modified by Benoit Boissinot:

19

# - fix for digest auth (inspired from urllib2.py @ Python v2.4)

19

# - fix for digest auth (inspired from urllib2.py @ Python v2.4)

20

# Modified by Dirkjan Ochtman:

20

# Modified by Dirkjan Ochtman:

21

# - import md5 function from a local util module

21

# - import md5 function from a local util module

22

# Modified by Martin Geisler:

22

# Modified by Martin Geisler:

23

# - moved md5 function from local util module to this module

23

# - moved md5 function from local util module to this module

24

# Modified by Augie Fackler:

24

# Modified by Augie Fackler:

25

# - add safesend method and use it to prevent broken pipe errors

25

# - add safesend method and use it to prevent broken pipe errors

26

# on large POST requests

26

# on large POST requests

27

28

"""An HTTP handler for urllib2 that supports HTTP 1.1 and keepalive.

28

"""An HTTP handler for urllib2 that supports HTTP 1.1 and keepalive.

29

30

>>> import urllib2

30

>>> import urllib2

31

>>> from keepalive import HTTPHandler

31

>>> from keepalive import HTTPHandler

32

>>> keepalive_handler = HTTPHandler()

32

>>> keepalive_handler = HTTPHandler()

33

>>> opener = urllib2.build_opener(keepalive_handler)

33

>>> opener = urllib2.build_opener(keepalive_handler)

34

>>> urllib2.install_opener(opener)

34

>>> urllib2.install_opener(opener)

35

>>>

35

>>>

36

>>> fo = urllib2.urlopen('http://www.python.org')

36

>>> fo = urllib2.urlopen('http://www.python.org')

37

38

If a connection to a given host is requested, and all of the existing

38

If a connection to a given host is requested, and all of the existing

39

connections are still in use, another connection will be opened. If

39

connections are still in use, another connection will be opened. If

40

the handler tries to use an existing connection but it fails in some

40

the handler tries to use an existing connection but it fails in some

41

way, it will be closed and removed from the pool.

41

way, it will be closed and removed from the pool.

42

43

To remove the handler, simply re-run build_opener with no arguments, and

43

To remove the handler, simply re-run build_opener with no arguments, and

44

install that opener.

44

install that opener.

45

46

You can explicitly close connections by using the close_connection()

46

You can explicitly close connections by using the close_connection()

47

method of the returned file-like object (described below) or you can

47

method of the returned file-like object (described below) or you can

48

use the handler methods:

48

use the handler methods:

49

50

close_connection(host)

50

close_connection(host)

51

close_all()

51

close_all()

52

open_connections()

52

open_connections()

53

54

NOTE: using the close_connection and close_all methods of the handler

54

NOTE: using the close_connection and close_all methods of the handler

55

should be done with care when using multiple threads.

55

should be done with care when using multiple threads.

56

* there is nothing that prevents another thread from creating new

56

* there is nothing that prevents another thread from creating new

57

connections immediately after connections are closed

57

connections immediately after connections are closed

58

* no checks are done to prevent in-use connections from being closed

58

* no checks are done to prevent in-use connections from being closed

59

60

>>> keepalive_handler.close_all()

60

>>> keepalive_handler.close_all()

61

62

EXTRA ATTRIBUTES AND METHODS

62

EXTRA ATTRIBUTES AND METHODS

63

64

Upon a status of 200, the object returned has a few additional

64

Upon a status of 200, the object returned has a few additional

65

attributes and methods, which should not be used if you want to

65

attributes and methods, which should not be used if you want to

66

remain consistent with the normal urllib2-returned objects:

66

remain consistent with the normal urllib2-returned objects:

67

68

close_connection() - close the connection to the host

68

close_connection() - close the connection to the host

69

readlines() - you know, readlines()

69

readlines() - you know, readlines()

70

status - the return status (i.e. 404)

70

status - the return status (i.e. 404)

71

reason - english translation of status (i.e. 'File not found')

71

reason - english translation of status (i.e. 'File not found')

72

73

If you want the best of both worlds, use this inside an

73

If you want the best of both worlds, use this inside an

74

AttributeError-catching try:

74

AttributeError-catching try:

75

76

>>> try: status = fo.status

76

>>> try: status = fo.status

77

>>> except AttributeError: status = None

77

>>> except AttributeError: status = None

78

79

Unfortunately, these are ONLY there if status == 200, so it's not

79

Unfortunately, these are ONLY there if status == 200, so it's not

80

easy to distinguish between non-200 responses. The reason is that

80

easy to distinguish between non-200 responses. The reason is that

81

urllib2 tries to do clever things with error codes 301, 302, 401,

81

urllib2 tries to do clever things with error codes 301, 302, 401,

82

and 407, and it wraps the object upon return.

82

and 407, and it wraps the object upon return.

83

84

For python versions earlier than 2.4, you can avoid this fancy error

84

For python versions earlier than 2.4, you can avoid this fancy error

85

handling by setting the module-level global HANDLE_ERRORS to zero.

85

handling by setting the module-level global HANDLE_ERRORS to zero.

86

You see, prior to 2.4, it's the HTTP Handler's job to determine what

86

You see, prior to 2.4, it's the HTTP Handler's job to determine what

87

to handle specially, and what to just pass up. HANDLE_ERRORS == 0

87

to handle specially, and what to just pass up. HANDLE_ERRORS == 0

88

means "pass everything up". In python 2.4, however, this job no

88

means "pass everything up". In python 2.4, however, this job no

89

longer belongs to the HTTP Handler and is now done by a NEW handler,

89

longer belongs to the HTTP Handler and is now done by a NEW handler,

90

HTTPErrorProcessor. Here's the bottom line:

90

HTTPErrorProcessor. Here's the bottom line:

91

92

python version < 2.4

92

python version < 2.4

93

HANDLE_ERRORS == 1 (default) pass up 200, treat the rest as

93

HANDLE_ERRORS == 1 (default) pass up 200, treat the rest as

94

errors

94

errors

95

HANDLE_ERRORS == 0 pass everything up, error processing is

95

HANDLE_ERRORS == 0 pass everything up, error processing is

96

left to the calling code

96

left to the calling code

97

python version >= 2.4

97

python version >= 2.4

98

HANDLE_ERRORS == 1 pass up 200, treat the rest as errors

98

HANDLE_ERRORS == 1 pass up 200, treat the rest as errors

99

HANDLE_ERRORS == 0 (default) pass everything up, let the

99

HANDLE_ERRORS == 0 (default) pass everything up, let the

100

other handlers (specifically,

100

other handlers (specifically,

101

HTTPErrorProcessor) decide what to do

101

HTTPErrorProcessor) decide what to do

102

103

In practice, setting the variable either way makes little difference

103

In practice, setting the variable either way makes little difference

104

in python 2.4, so for the most consistent behavior across versions,

104

in python 2.4, so for the most consistent behavior across versions,

105

you probably just want to use the defaults, which will give you

105

you probably just want to use the defaults, which will give you

106

exceptions on errors.

106

exceptions on errors.

107

108

"""

108

"""

109

110

# $Id: keepalive.py,v 1.14 2006/04/04 21:00:32 mstenner Exp $

110

# $Id: keepalive.py,v 1.14 2006/04/04 21:00:32 mstenner Exp $

111

112

import errno

112

import errno

113

import httplib

113

import httplib

114

import socket

114

import socket

115

import thread

115

import thread

116

import urllib2

116

import urllib2

117

118

DEBUG = None

118

DEBUG = None

119

120

import sys

120

import sys

121

# Default for the module-level error-handling switch: 1 on interpreters
# older than Python 2.4 (only status 200 is passed up, everything else is
# treated as an error), 0 on 2.4 and later (every response is passed up).
HANDLE_ERRORS = int(sys.version_info < (2, 4))

124

125

class ConnectionManager(object):
    """Registry of the connections opened for each host.

    Three maps are kept in step with each other:

    * ``_hostmap``  - host -> list of connections to that host
    * ``_connmap``  - connection -> host it was registered for
    * ``_readymap`` - connection -> ready flag (true while the connection
      is idle and may be handed out by get_ready_conn())

    add(), remove() and get_ready_conn() run under ``_lock``.
    """
    def __init__(self):
        self._lock = thread.allocate_lock()
        self._hostmap = {} # map hosts to a list of connections
        self._connmap = {} # map connections to host
        self._readymap = {} # map connection to ready state

    def add(self, host, connection, ready):
        """start tracking connection for host with the given ready flag"""
        self._lock.acquire()
        try:
            self._hostmap.setdefault(host, []).append(connection)
            self._connmap[connection] = host
            self._readymap[connection] = ready
        finally:
            self._lock.release()

    def remove(self, connection):
        """stop tracking connection; unknown connections are ignored"""
        self._lock.acquire()
        try:
            try:
                host = self._connmap[connection]
            except KeyError:
                pass
            else:
                del self._connmap[connection]
                del self._readymap[connection]
                self._hostmap[host].remove(connection)
                # drop the host entry once its last connection is gone
                if not self._hostmap[host]:
                    del self._hostmap[host]
        finally:
            self._lock.release()

    def set_ready(self, connection, ready):
        """update the ready flag of a connection that is still tracked

        Item assignment on a dict never raises KeyError, so an explicit
        membership test is needed to avoid re-adding (and thereby keeping
        alive) a connection that has already been removed.
        """
        if connection in self._readymap:
            self._readymap[connection] = ready

    def get_ready_conn(self, host):
        """return an idle connection to host and mark it busy

        Returns None when host has no connection whose ready flag is set.
        """
        conn = None
        self._lock.acquire()
        try:
            for c in self._hostmap.get(host, ()):
                if self._readymap[c]:
                    self._readymap[c] = 0
                    conn = c
                    break
        finally:
            self._lock.release()
        return conn

    def get_all(self, host=None):
        """return a snapshot of the tracked connections

        With a (non-empty) host: a list of the connections to that host,
        empty if there are none.  Otherwise: a dict mapping every host to
        a list of its connections.  All returned lists are copies, so the
        caller can call remove() while iterating over them.
        """
        if host:
            return list(self._hostmap.get(host, []))
        else:
            return dict([(h, list(conns))
                         for h, conns in self._hostmap.items()])

187

188

class KeepAliveHandler(object):
    """Handler mix-in that sends requests over pooled, reusable connections.

    Connections are tracked in a ConnectionManager; do_open() reuses an
    idle connection to the requested host when one works and opens a new
    one otherwise.
    """
    def __init__(self):
        self._cm = ConnectionManager()

    #### Connection Management
    def open_connections(self):
        """return a list of connected hosts and the number of connections
        to each.  [('foo.com:80', 2), ('bar.org', 1)]"""
        return [(host, len(li)) for (host, li) in self._cm.get_all().items()]

    def close_connection(self, host):
        """close connection(s) to <host>
        host is the host:port spec, as in 'www.cnn.com:8080' as passed in.
        no error occurs if there is no connection to that host."""
        for h in self._cm.get_all(host):
            self._cm.remove(h)
            h.close()

    def close_all(self):
        """close all open connections"""
        for host, conns in self._cm.get_all().items():
            # remove() edits the manager's per-host list, and get_all() may
            # hand back that very list; iterate over a copy so that no
            # connection is skipped (and left open) while we remove.
            for h in list(conns):
                self._cm.remove(h)
                h.close()

    def _request_closed(self, request, host, connection):
        """tells us that this request is now closed and that the
        connection is ready for another request"""
        self._cm.set_ready(connection, 1)

    def _remove_connection(self, host, connection, close=0):
        """stop tracking connection, closing it first if close is true"""
        if close:
            connection.close()
        self._cm.remove(connection)

    #### Transaction Execution
    def http_open(self, req):
        """open req over a (possibly reused) HTTPConnection"""
        return self.do_open(HTTPConnection, req)

    def do_open(self, http_class, req):
        """send req and return the response

        An idle connection to the request's host is reused when one
        works; otherwise a new http_class(host) connection is created and
        registered.  Raises urllib2.URLError when no host is given or on
        socket/HTTP failures.  Non-200 responses are handed to
        self.parent.error() when HANDLE_ERRORS is set.
        """
        host = req.get_host()
        if not host:
            raise urllib2.URLError('no host given')

        try:
            h = self._cm.get_ready_conn(host)
            while h:
                r = self._reuse_connection(h, req, host)

                # if this response is non-None, then it worked and we're
                # done.  Break out, skipping the else block.
                if r:
                    break

                # connection is bad - possibly closed by server
                # discard it and ask for the next free connection
                h.close()
                self._cm.remove(h)
                h = self._cm.get_ready_conn(host)
            else:
                # no (working) free connections were found.  Create a new one.
                h = http_class(host)
                if DEBUG:
                    DEBUG.info("creating new connection to %s (%d)",
                               host, id(h))
                self._cm.add(host, h, 0)
                self._start_transaction(h, req)
                r = h.getresponse()
        except (socket.error, httplib.HTTPException):
            # sys.exc_info() is used instead of binding the exception in
            # the except clause so the syntax is version-independent
            raise urllib2.URLError(sys.exc_info()[1])

        # if not a persistent connection, don't try to reuse it
        if r.will_close:
            self._cm.remove(h)

        if DEBUG:
            DEBUG.info("STATUS: %s, %s", r.status, r.reason)
        r._handler = self
        r._host = host
        r._url = req.get_full_url()
        r._connection = h
        # expose the attribute names urllib2 responses carry; msg must be
        # copied to headers before it is overwritten with the reason
        r.code = r.status
        r.headers = r.msg
        r.msg = r.reason

        if r.status == 200 or not HANDLE_ERRORS:
            return r
        else:
            return self.parent.error('http', req, r,
                                     r.status, r.msg, r.headers)

    def _reuse_connection(self, h, req, host):
        """start the transaction with a re-used connection
        return a response object (r) upon success or None on failure.
        This DOES not close or remove bad connections in cases where
        it returns.  However, if an unexpected exception occurs, it
        will close and remove the connection before re-raising.
        """
        try:
            self._start_transaction(h, req)
            r = h.getresponse()
            # note: just because we got something back doesn't mean it
            # worked.  We'll check the version below, too.
        except (socket.error, httplib.HTTPException):
            r = None
        except: # re-raises
            # adding this block just in case we've missed
            # something we will still raise the exception, but
            # lets try and close the connection and remove it
            # first.  We previously got into a nasty loop
            # where an exception was uncaught, and so the
            # connection stayed open.  On the next try, the
            # same exception was raised, etc.  The trade-off is
            # that it's now possible this call will raise
            # a DIFFERENT exception
            if DEBUG:
                DEBUG.error("unexpected exception - closing "
                            "connection to %s (%d)", host, id(h))
            self._cm.remove(h)
            h.close()
            raise

        if r is None or r.version == 9:
            # httplib falls back to assuming HTTP 0.9 if it gets a
            # bad header back.  This is most likely to happen if
            # the socket has been closed by the server since we
            # last used the connection.
            if DEBUG:
                DEBUG.info("failed to re-use connection to %s (%d)",
                           host, id(h))
            r = None
        else:
            if DEBUG:
                DEBUG.info("re-using connection to %s (%d)", host, id(h))

        return r

    def _start_transaction(self, h, req):
        """write the request line, headers and body of req to connection h

        Raises urllib2.URLError if writing the request line or the
        content headers fails with socket.error.
        """
        # What follows mostly reimplements HTTPConnection.request()
        # except it adds self.parent.addheaders in the mix.
        headers = req.headers.copy()
        if sys.version_info >= (2, 4):
            headers.update(req.unredirected_hdrs)
        headers.update(self.parent.addheaders)
        headers = dict((n.lower(), v) for n, v in headers.items())
        # tell putrequest() not to emit its own Host / Accept-Encoding
        # headers when the request already supplies them
        skipheaders = {}
        for n in ('host', 'accept-encoding'):
            if n in headers:
                skipheaders['skip_' + n.replace('-', '_')] = 1
        try:
            if req.has_data():
                data = req.get_data()
                h.putrequest('POST', req.get_selector(), **skipheaders)
                if 'content-type' not in headers:
                    h.putheader('Content-type',
                                'application/x-www-form-urlencoded')
                if 'content-length' not in headers:
                    h.putheader('Content-length', '%d' % len(data))
            else:
                h.putrequest('GET', req.get_selector(), **skipheaders)
        except socket.error:
            raise urllib2.URLError(sys.exc_info()[1])
        for k, v in headers.items():
            h.putheader(k, v)
        h.endheaders()
        if req.has_data():
            h.send(data)

355

356

class HTTPHandler(KeepAliveHandler, urllib2.HTTPHandler):
    """urllib2 HTTP handler combined with KeepAliveHandler.

    KeepAliveHandler is listed first, so its methods take precedence over
    the urllib2.HTTPHandler ones of the same name.
    """

358

359

class HTTPResponse(httplib.HTTPResponse):

359

class HTTPResponse(httplib.HTTPResponse):

360

# we need to subclass HTTPResponse in order to

360

# we need to subclass HTTPResponse in order to

361

# 1) add readline() and readlines() methods

361

# 1) add readline() and readlines() methods

362

# 2) add close_connection() methods

362

# 2) add close_connection() methods

363

# 3) add info() and geturl() methods

363

# 3) add info() and geturl() methods

364

365

# in order to add readline(), read must be modified to deal with a

365

# in order to add readline(), read must be modified to deal with a

366

# buffer. example: readline must read a buffer and then spit back

366

# buffer. example: readline must read a buffer and then spit back

367

# one line at a time. The only real alternative is to read one

367

# one line at a time. The only real alternative is to read one

368

# BYTE at a time (ick). Once something has been read, it can't be

368

# BYTE at a time (ick). Once something has been read, it can't be

369

# put back (ok, maybe it can, but that's even uglier than this),

369

# put back (ok, maybe it can, but that's even uglier than this),

370

# so if you THEN do a normal read, you must first take stuff from

370

# so if you THEN do a normal read, you must first take stuff from

371

# the buffer.

371

# the buffer.

372

373

# the read method wraps the original to accommodate buffering,

373

# the read method wraps the original to accommodate buffering,

374

# although read() never adds to the buffer.

374

# although read() never adds to the buffer.

375

# Both readline and readlines have been stolen with almost no

375

# Both readline and readlines have been stolen with almost no

376

# modification from socket.py

376

# modification from socket.py

377

378

379

def __init__(self, sock, debuglevel=0, strict=0, method=None):

379

def __init__(self, sock, debuglevel=0, strict=0, method=None):

380

if method: # the httplib in python 2.3 uses the method arg

380

httplib.HTTPResponse.__init__(self, sock, debuglevel, method)

381

httplib.HTTPResponse.__init__(self, sock, debuglevel, method)

382

else: # 2.2 doesn't

383

httplib.HTTPResponse.__init__(self, sock, debuglevel)

384

self.fileno = sock.fileno

381

self.fileno = sock.fileno

385

self.code = None

382

self.code = None

386

self._rbuf = ''

383

self._rbuf = ''

387

self._rbufsize = 8096

384

self._rbufsize = 8096

388

self._handler = None # inserted by the handler later

385

self._handler = None # inserted by the handler later

389

self._host = None # (same)

386

self._host = None # (same)

390

self._url = None # (same)

387

self._url = None # (same)

391

self._connection = None # (same)

388

self._connection = None # (same)

392

389

393

_raw_read = httplib.HTTPResponse.read

390

_raw_read = httplib.HTTPResponse.read

394

391

395

def close(self):

392

def close(self):

396

if self.fp:

393

if self.fp:

397

self.fp.close()

394

self.fp.close()

398

self.fp = None

395

self.fp = None

399

if self._handler:

396

if self._handler:

400

self._handler._request_closed(self, self._host,

397

self._handler._request_closed(self, self._host,

401

self._connection)

398

self._connection)

402

399

403

def close_connection(self):

400

def close_connection(self):

404

self._handler._remove_connection(self._host, self._connection, close=1)

401

self._handler._remove_connection(self._host, self._connection, close=1)

405

self.close()

402

self.close()

406

403

407

def info(self):

404

def info(self):

408

return self.headers

405

return self.headers

409

406

410

def geturl(self):

407

def geturl(self):

411

return self._url

408

return self._url

412

409

413

def read(self, amt=None):

410

def read(self, amt=None):

414

# the _rbuf test is only in this first if for speed. It's not

411

# the _rbuf test is only in this first if for speed. It's not

415

# logically necessary

412

# logically necessary

416

if self._rbuf and not amt is None:

413

if self._rbuf and not amt is None:

417

L = len(self._rbuf)

414

L = len(self._rbuf)

418

if amt > L:

415

if amt > L:

419

amt -= L

416

amt -= L

420

else:

417

else:

421

s = self._rbuf[:amt]

418

s = self._rbuf[:amt]

422

self._rbuf = self._rbuf[amt:]

419

self._rbuf = self._rbuf[amt:]

423

return s

420

return s

424

421

425

s = self._rbuf + self._raw_read(amt)

422

s = self._rbuf + self._raw_read(amt)

426

self._rbuf = ''

423

self._rbuf = ''

427

return s

424

return s

428

425

429

# stolen from Python SVN #68532 to fix issue1088

426

# stolen from Python SVN #68532 to fix issue1088

430

def _read_chunked(self, amt):

427

def _read_chunked(self, amt):

431

chunk_left = self.chunk_left

428

chunk_left = self.chunk_left

432

value = ''

429

value = ''

433

430

434

# XXX This accumulates chunks by repeated string concatenation,

431

# XXX This accumulates chunks by repeated string concatenation,

435

# which is not efficient as the number or size of chunks gets big.

432

# which is not efficient as the number or size of chunks gets big.

436

while True:

433

while True:

437

if chunk_left is None:

434

if chunk_left is None:

438

line = self.fp.readline()

435

line = self.fp.readline()

439

i = line.find(';')

436

i = line.find(';')

440

if i >= 0:

437

if i >= 0:

441

line = line[:i] # strip chunk-extensions

438

line = line[:i] # strip chunk-extensions

442

try:

439

try:

443

chunk_left = int(line, 16)

440

chunk_left = int(line, 16)

444

except ValueError:

441

except ValueError:

445

# close the connection as protocol synchronization is

442

# close the connection as protocol synchronization is

446

# probably lost

443

# probably lost

447

self.close()

444

self.close()

448

raise httplib.IncompleteRead(value)

445

raise httplib.IncompleteRead(value)

449

if chunk_left == 0:

446

if chunk_left == 0:

450

break

447

break

451

if amt is None:

448

if amt is None:

452

value += self._safe_read(chunk_left)

449

value += self._safe_read(chunk_left)

453

elif amt < chunk_left:

450

elif amt < chunk_left:

454

value += self._safe_read(amt)

451

value += self._safe_read(amt)

455

self.chunk_left = chunk_left - amt

452

self.chunk_left = chunk_left - amt

456

return value

453

return value

457

elif amt == chunk_left:

454

elif amt == chunk_left:

458

value += self._safe_read(amt)

455

value += self._safe_read(amt)

459

self._safe_read(2) # toss the CRLF at the end of the chunk

456

self._safe_read(2) # toss the CRLF at the end of the chunk

460

self.chunk_left = None

457

self.chunk_left = None

461

return value

458

return value

462

else:

459

else:

463

value += self._safe_read(chunk_left)

460

value += self._safe_read(chunk_left)

464

amt -= chunk_left

461

amt -= chunk_left

465

462

466

# we read the whole chunk, get another

463

# we read the whole chunk, get another

467

self._safe_read(2) # toss the CRLF at the end of the chunk

464

self._safe_read(2) # toss the CRLF at the end of the chunk

468

chunk_left = None

465

chunk_left = None

469

466

470

# read and discard trailer up to the CRLF terminator

467

# read and discard trailer up to the CRLF terminator

471

### note: we shouldn't have any trailers!

468

### note: we shouldn't have any trailers!

472

while True:

469

while True:

473

line = self.fp.readline()

470

line = self.fp.readline()

474

if not line:

471

if not line:

475

# a vanishingly small number of sites EOF without

472

# a vanishingly small number of sites EOF without

476

# sending the trailer

473

# sending the trailer

477

break

474

break

478

if line == '\r\n':

475

if line == '\r\n':

479

break

476

break

480

477

481

# we read everything; close the "file"

478

# we read everything; close the "file"

482

self.close()

479

self.close()

483

480

484

return value

481

return value

485

482

486

def readline(self, limit=-1):
    """Read and return one line from the buffered response.

    Data is pulled from self._raw_read() in pieces of self._rbufsize
    and accumulated in self._rbuf until a newline has been buffered,
    the underlying read returns nothing, or (for a positive limit) at
    least `limit' characters are buffered.

    limit -- maximum number of characters to return; a negative value
             (the default) means "no limit".

    Returns the line including its trailing newline when one was
    found, otherwise whatever is buffered, in both cases cut down to
    `limit' characters if that is shorter.  Anything not returned is
    kept in self._rbuf for the next call.
    """
    i = self._rbuf.find('\n')
    while i < 0 and not (0 < limit <= len(self._rbuf)):
        new = self._raw_read(self._rbufsize)
        if not new:
            break
        # only the fresh data can contain the first newline; translate
        # its position into an offset within the combined buffer
        i = new.find('\n')
        if i >= 0:
            i = i + len(self._rbuf)
        self._rbuf = self._rbuf + new
    if i < 0:
        # no newline at all: hand back everything that is buffered
        i = len(self._rbuf)
    else:
        # include the newline itself
        i = i + 1
    # The limit may only shorten the result.  Comparing it against the
    # buffer length instead of the line end would move the cut point
    # past the newline and return more than one line.
    if 0 <= limit < i:
        i = limit
    data, self._rbuf = self._rbuf[:i], self._rbuf[i:]
    return data

504

501

505

def readlines(self, sizehint=0):
    """Collect lines from self.readline() into a list.

    Reading stops at the first empty line returned by readline() or,
    when `sizehint' is non-zero, as soon as the combined length of the
    collected lines reaches `sizehint'.
    """
    lines = []
    consumed = 0
    line = self.readline()
    while line:
        lines.append(line)
        consumed += len(line)
        if sizehint and consumed >= sizehint:
            break
        line = self.readline()
    return lines

517

514

518

def safesend(self, str):

515

def safesend(self, str):

519

"""Send `str' to the server.

516

"""Send `str' to the server.

520

517

521

Shamelessly ripped off from httplib to patch a bad behavior.

518

Shamelessly ripped off from httplib to patch a bad behavior.

522

"""

519

"""

523

# _broken_pipe_resp is an attribute we set in this function

520

# _broken_pipe_resp is an attribute we set in this function

524

# if the socket is closed while we're sending data but

521

# if the socket is closed while we're sending data but

525

# the server sent us a response before hanging up.

522

# the server sent us a response before hanging up.

526

# In that case, we want to pretend to send the rest of the

523

# In that case, we want to pretend to send the rest of the

527

# outgoing data, and then let the user use getresponse()

524

# outgoing data, and then let the user use getresponse()

528

# (which we wrap) to get this last response before

525

# (which we wrap) to get this last response before

529

# opening a new socket.

526

# opening a new socket.

530

if getattr(self, '_broken_pipe_resp', None) is not None:

527

if getattr(self, '_broken_pipe_resp', None) is not None:

531

return

528

return

532

529

533

if self.sock is None:

530

if self.sock is None:

534

if self.auto_open:

531

if self.auto_open:

535

self.connect()

532

self.connect()

536

else:

533

else:

537

raise httplib.NotConnected

534

raise httplib.NotConnected

538

535

539

# send the data to the server. if we get a broken pipe, then close

536

# send the data to the server. if we get a broken pipe, then close

540

# the socket. we want to reconnect when somebody tries to send again.

537

# the socket. we want to reconnect when somebody tries to send again.

541

#

538

#

542

# NOTE: we DO propagate the error, though, because we cannot simply

539

# NOTE: we DO propagate the error, though, because we cannot simply

543

# ignore the error... the caller will know if they can retry.

540

# ignore the error... the caller will know if they can retry.

544

if self.debuglevel > 0:

541

if self.debuglevel > 0:

545

print "send:", repr(str)

542

print "send:", repr(str)

546

try:

543

try:

547

blocksize = 8192

544

blocksize = 8192

548

read = getattr(str, 'read', None)

545

read = getattr(str, 'read', None)

549

if read is not None:

546

if read is not None:

550

if self.debuglevel > 0:

547

if self.debuglevel > 0:

551

print "sending a read()able"

548

print "sending a read()able"

552

data = read(blocksize)

549

data = read(blocksize)

553

while data:

550

while data:

554

self.sock.sendall(data)

551

self.sock.sendall(data)

555

data = read(blocksize)

552

data = read(blocksize)

556

else:

553

else:

557

self.sock.sendall(str)

554

self.sock.sendall(str)

558

except socket.error, v:

555

except socket.error, v:

559

reraise = True

556

reraise = True

560

if v[0] == errno.EPIPE: # Broken pipe

557

if v[0] == errno.EPIPE: # Broken pipe

561

if self._HTTPConnection__state == httplib._CS_REQ_SENT:

558

if self._HTTPConnection__state == httplib._CS_REQ_SENT:

562

self._broken_pipe_resp = None

559

self._broken_pipe_resp = None

563

self._broken_pipe_resp = self.getresponse()

560

self._broken_pipe_resp = self.getresponse()

564

reraise = False

561

reraise = False

565

self.close()

562

self.close()

566

if reraise:

563

if reraise:

567

raise

564

raise

568

565

569

def wrapgetresponse(cls):
    """Wraps getresponse in cls with a broken-pipe sane version.

    The returned function hands back the instance's _broken_pipe_resp
    attribute when that is set to something other than None, and
    otherwise defers to cls.getresponse().  It carries the docstring of
    cls.getresponse.
    """
    def safegetresponse(self):
        # safesend() may have stored a response it salvaged after a
        # broken pipe; in that case the socket has already been closed
        # and the stored response is all there is to return.
        pending = getattr(self, '_broken_pipe_resp', None)
        if pending is None:
            return cls.getresponse(self)
        return pending
    safegetresponse.__doc__ = cls.getresponse.__doc__
    return safegetresponse

583

580

584

class HTTPConnection(httplib.HTTPConnection):
    # httplib.HTTPConnection with this module's response class and the
    # broken-pipe aware send/getresponse pair plugged in.

    # use the modified response class
    response_class = HTTPResponse
    # safesend() stores a response on the connection when the server
    # answered before hanging up on us mid-send
    send = safesend
    # ... and the wrapped getresponse() returns that stored response
    # instead of going through the normal response path
    getresponse = wrapgetresponse(httplib.HTTPConnection)

589

586

590

587

591

#########################################################################

588

#########################################################################

592

##### TEST FUNCTIONS

589

##### TEST FUNCTIONS

593

#########################################################################

590

#########################################################################

594

591

595

def error_handler(url):

592

def error_handler(url):

596

global HANDLE_ERRORS

593

global HANDLE_ERRORS

597

orig = HANDLE_ERRORS

594

orig = HANDLE_ERRORS

598

keepalive_handler = HTTPHandler()

595

keepalive_handler = HTTPHandler()

599

opener = urllib2.build_opener(keepalive_handler)

596

opener = urllib2.build_opener(keepalive_handler)

600

urllib2.install_opener(opener)

597

urllib2.install_opener(opener)

601

pos = {0: 'off', 1: 'on'}

598

pos = {0: 'off', 1: 'on'}

602

for i in (0, 1):

599

for i in (0, 1):

603

print " fancy error handling %s (HANDLE_ERRORS = %i)" % (pos[i], i)

600

print " fancy error handling %s (HANDLE_ERRORS = %i)" % (pos[i], i)

604

HANDLE_ERRORS = i

601

HANDLE_ERRORS = i

605

try:

602

try:

606

fo = urllib2.urlopen(url)

603

fo = urllib2.urlopen(url)

607

fo.read()

604

fo.read()

608

fo.close()

605

fo.close()

609

try:

606

try:

610

status, reason = fo.status, fo.reason

607

status, reason = fo.status, fo.reason

611

except AttributeError:

608

except AttributeError:

612

status, reason = None, None

609

status, reason = None, None

613

except IOError, e:

610

except IOError, e:

614

print " EXCEPTION: %s" % e

611

print " EXCEPTION: %s" % e

615

raise

612

raise

616

else:

613

else:

617

print " status = %s, reason = %s" % (status, reason)

614

print " status = %s, reason = %s" % (status, reason)

618

HANDLE_ERRORS = orig

615

HANDLE_ERRORS = orig

619

hosts = keepalive_handler.open_connections()

616

hosts = keepalive_handler.open_connections()

620

print "open connections:", hosts

617

print "open connections:", hosts

621

keepalive_handler.close_all()

618

keepalive_handler.close_all()

622

619

623

def md5(s):
    """Return an MD5 hash object for `s'.

    The constructor is imported on the first call, from hashlib when
    that is available and from the old md5 module otherwise.  The
    module-level name md5 is then rebound to that constructor, so
    later lookups of md5 skip this wrapper.
    """
    global md5
    try:
        from hashlib import md5 as constructor
    except ImportError:
        from md5 import md5 as constructor
    # replace ourselves with the real thing
    md5 = constructor
    return constructor(s)

631

628

632

def continuity(url):

629

def continuity(url):

633

format = '%25s: %s'

630

format = '%25s: %s'

634

631

635

# first fetch the file with the normal http handler

632

# first fetch the file with the normal http handler

636

opener = urllib2.build_opener()

633

opener = urllib2.build_opener()

637

urllib2.install_opener(opener)

634

urllib2.install_opener(opener)

638

fo = urllib2.urlopen(url)

635

fo = urllib2.urlopen(url)

639

foo = fo.read()

636

foo = fo.read()

640

fo.close()

637

fo.close()

641

m = md5.new(foo)

638

m = md5.new(foo)

642

print format % ('normal urllib', m.hexdigest())

639

print format % ('normal urllib', m.hexdigest())

643

640

644

# now install the keepalive handler and try again

641

# now install the keepalive handler and try again

645

opener = urllib2.build_opener(HTTPHandler())

642

opener = urllib2.build_opener(HTTPHandler())

646

urllib2.install_opener(opener)

643

urllib2.install_opener(opener)

647

644

648

fo = urllib2.urlopen(url)

645

fo = urllib2.urlopen(url)

649

foo = fo.read()

646

foo = fo.read()

650

fo.close()

647

fo.close()

651

m = md5.new(foo)

648

m = md5.new(foo)

652

print format % ('keepalive read', m.hexdigest())

649

print format % ('keepalive read', m.hexdigest())

653

650

654

fo = urllib2.urlopen(url)

651

fo = urllib2.urlopen(url)

655

foo = ''

652

foo = ''

656

while True:

653

while True:

657

f = fo.readline()

654

f = fo.readline()

658

if f:

655

if f:

659

foo = foo + f

656

foo = foo + f

660

else: break

657

else: break

661

fo.close()

658

fo.close()

662

m = md5.new(foo)

659

m = md5.new(foo)

663

print format % ('keepalive readline', m.hexdigest())

660

print format % ('keepalive readline', m.hexdigest())

664

661

665

def comp(N, url):

662

def comp(N, url):

666

print ' making %i connections to:\n %s' % (N, url)

663

print ' making %i connections to:\n %s' % (N, url)

667

664

668

sys.stdout.write(' first using the normal urllib handlers')

665

sys.stdout.write(' first using the normal urllib handlers')

669

# first use normal opener

666

# first use normal opener

670

opener = urllib2.build_opener()

667

opener = urllib2.build_opener()

671

urllib2.install_opener(opener)

668

urllib2.install_opener(opener)

672

t1 = fetch(N, url)

669

t1 = fetch(N, url)

673

print ' TIME: %.3f s' % t1

670

print ' TIME: %.3f s' % t1

674

671

675

sys.stdout.write(' now using the keepalive handler ')

672

sys.stdout.write(' now using the keepalive handler ')

676

# now install the keepalive handler and try again

673

# now install the keepalive handler and try again

677

opener = urllib2.build_opener(HTTPHandler())

674

opener = urllib2.build_opener(HTTPHandler())

678

urllib2.install_opener(opener)

675

urllib2.install_opener(opener)

679

t2 = fetch(N, url)

676

t2 = fetch(N, url)

680

print ' TIME: %.3f s' % t2

677

print ' TIME: %.3f s' % t2

681

print ' improvement factor: %.2f' % (t1 / t2)

678

print ' improvement factor: %.2f' % (t1 / t2)

682

679

683

def fetch(N, url, delay=0):

680

def fetch(N, url, delay=0):

684

import time

681

import time

685

lens = []

682

lens = []

686

starttime = time.time()

683

starttime = time.time()

687

for i in range(N):

684

for i in range(N):

688

if delay and i > 0:

685

if delay and i > 0:

689

time.sleep(delay)

686

time.sleep(delay)

690

fo = urllib2.urlopen(url)

687

fo = urllib2.urlopen(url)

691

foo = fo.read()

688

foo = fo.read()

692

fo.close()

689

fo.close()

693

lens.append(len(foo))

690

lens.append(len(foo))

694

diff = time.time() - starttime

691

diff = time.time() - starttime

695

692

696

j = 0

693

j = 0

697

for i in lens[1:]:

694

for i in lens[1:]:

698

j = j + 1

695

j = j + 1

699

if not i == lens[0]:

696

if not i == lens[0]:

700

print "WARNING: inconsistent length on read %i: %i" % (j, i)

697

print "WARNING: inconsistent length on read %i: %i" % (j, i)

701

698

702

return diff

699

return diff

703

700

704

def test_timeout(url):

701

def test_timeout(url):

705

global DEBUG

702

global DEBUG

706

dbbackup = DEBUG

703

dbbackup = DEBUG

707

class FakeLogger(object):

704

class FakeLogger(object):

708

def debug(self, msg, *args):

705

def debug(self, msg, *args):

709

print msg % args

706

print msg % args

710

info = warning = error = debug

707

info = warning = error = debug

711

DEBUG = FakeLogger()

708

DEBUG = FakeLogger()

712

print " fetching the file to establish a connection"

709

print " fetching the file to establish a connection"

713

fo = urllib2.urlopen(url)

710

fo = urllib2.urlopen(url)

714

data1 = fo.read()

711

data1 = fo.read()

715

fo.close()

712

fo.close()

716

713

717

i = 20

714

i = 20

718

print " waiting %i seconds for the server to close the connection" % i

715

print " waiting %i seconds for the server to close the connection" % i

719

while i > 0:

716

while i > 0:

720

sys.stdout.write('\r %2i' % i)

717

sys.stdout.write('\r %2i' % i)

721

sys.stdout.flush()

718

sys.stdout.flush()

722

time.sleep(1)

719

time.sleep(1)

723

i -= 1

720

i -= 1

724

sys.stderr.write('\r')

721

sys.stderr.write('\r')

725

722

726

print " fetching the file a second time"

723

print " fetching the file a second time"

727

fo = urllib2.urlopen(url)

724

fo = urllib2.urlopen(url)

728

data2 = fo.read()

725

data2 = fo.read()

729

fo.close()

726

fo.close()

730

727

731

if data1 == data2:

728

if data1 == data2:

732

print ' data are identical'

729

print ' data are identical'

733

else:

730

else:

734

print ' ERROR: DATA DIFFER'

731

print ' ERROR: DATA DIFFER'

735

732

736

DEBUG = dbbackup

733

DEBUG = dbbackup

737

734

738

735

739

def test(url, N=10):

736

def test(url, N=10):

740

print "checking error handler (do this on a non-200)"

737

print "checking error handler (do this on a non-200)"

741

try: error_handler(url)

738

try: error_handler(url)

742

except IOError:

739

except IOError:

743

print "exiting - exception will prevent further tests"

740

print "exiting - exception will prevent further tests"

744

sys.exit()

741

sys.exit()

745

print

742

print

746

print "performing continuity test (making sure stuff isn't corrupted)"

743

print "performing continuity test (making sure stuff isn't corrupted)"

747

continuity(url)

744

continuity(url)

748

print

745

print

749

print "performing speed comparison"

746

print "performing speed comparison"

750

comp(N, url)

747

comp(N, url)

751

print

748

print

752

print "performing dropped-connection check"

749

print "performing dropped-connection check"

753

test_timeout(url)

750

test_timeout(url)

754

751

755

if __name__ == '__main__':
    # Command-line entry point: <integer> <url>.
    # time and sys are bound at module level here; comp(), test() and
    # test_timeout() look them up as globals.
    import time
    import sys
    try:
        N = int(sys.argv[1])
        url = sys.argv[2]
    except (IndexError, ValueError):
        # missing argument or non-integer count: print the usage line
        print "%s <integer> <url>" % sys.argv[0]
    else:
        test(url, N)

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

             #   This library is free software; you can redistribute it and/or
             #   modify it under the terms of the GNU Lesser General Public
             #   License as published by the Free Software Foundation; either
             #   version 2.1 of the License, or (at your option) any later version.
             #
             #   This library is distributed in the hope that it will be useful,
             #   but WITHOUT ANY WARRANTY; without even the implied warranty of
             #   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
             #   Lesser General Public License for more details.
             #
             #   You should have received a copy of the GNU Lesser General Public
             #   License along with this library; if not, see
             #   <http://www.gnu.org/licenses/>.
             # This file is part of urlgrabber, a high-level cross-protocol url-grabber
             # Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
             # Modified by Benoit Boissinot:
             #  - fix for digest auth (inspired from urllib2.py @ Python v2.4)
             # Modified by Dirkjan Ochtman:
             #  - import md5 function from a local util module
             # Modified by Martin Geisler:
             #  - moved md5 function from local util module to this module
             # Modified by Augie Fackler:
             #  - add safesend method and use it to prevent broken pipe errors
             #    on large POST requests
             """An HTTP handler for urllib2 that supports HTTP 1.1 and keepalive.
             >>> import urllib2
             >>> from keepalive import HTTPHandler
             >>> keepalive_handler = HTTPHandler()
             >>> opener = urllib2.build_opener(keepalive_handler)
             >>> urllib2.install_opener(opener)
             >>>
             >>> fo = urllib2.urlopen('http://www.python.org')
             If a connection to a given host is requested, and all of the existing
             connections are still in use, another connection will be opened.  If
             the handler tries to use an existing connection but it fails in some
             way, it will be closed and removed from the pool.
             To remove the handler, simply re-run build_opener with no arguments, and
             install that opener.
             You can explicitly close connections by using the close_connection()
             method of the returned file-like object (described below) or you can
             use the handler methods:
               close_connection(host)
               close_all()
               open_connections()
             NOTE: using the close_connection and close_all methods of the handler
             should be done with care when using multiple threads.
               * there is nothing that prevents another thread from creating new
                 connections immediately after connections are closed
               * no checks are done to prevent in-use connections from being closed
             >>> keepalive_handler.close_all()
             EXTRA ATTRIBUTES AND METHODS
               Upon a status of 200, the object returned has a few additional
               attributes and methods, which should not be used if you want to
               remain consistent with the normal urllib2-returned objects:
                 close_connection()  -  close the connection to the host
                 readlines()         -  you know, readlines()
                 status              -  the return status (e.g. 404)
                 reason              -  English description of the status (e.g. 'File not found')
               If you want the best of both worlds, use this inside an
               AttributeError-catching try:
               >>> try: status = fo.status
               >>> except AttributeError: status = None
               Unfortunately, these are ONLY there if status == 200, so it's not
               easy to distinguish between non-200 responses.  The reason is that
               urllib2 tries to do clever things with error codes 301, 302, 401,
               and 407, and it wraps the object upon return.
               For python versions earlier than 2.4, you can avoid this fancy error
               handling by setting the module-level global HANDLE_ERRORS to zero.
               You see, prior to 2.4, it's the HTTP Handler's job to determine what
               to handle specially, and what to just pass up.  HANDLE_ERRORS == 0
               means "pass everything up".  In python 2.4, however, this job no
               longer belongs to the HTTP Handler and is now done by a NEW handler,
               HTTPErrorProcessor.  Here's the bottom line:
                 python version < 2.4
                     HANDLE_ERRORS == 1  (default) pass up 200, treat the rest as
                                         errors
                     HANDLE_ERRORS == 0  pass everything up, error processing is
                                         left to the calling code
                 python version >= 2.4
                     HANDLE_ERRORS == 1  pass up 200, treat the rest as errors
                     HANDLE_ERRORS == 0  (default) pass everything up, let the
                                         other handlers (specifically,
                                         HTTPErrorProcessor) decide what to do
               In practice, setting the variable either way makes little difference
               in python 2.4, so for the most consistent behavior across versions,
               you probably just want to use the defaults, which will give you
               exceptions on errors.
             """
             # $Id: keepalive.py,v 1.14 2006/04/04 21:00:32 mstenner Exp $
             import errno
             import httplib
             import socket
             import thread
             import urllib2
             DEBUG = None
             import sys
             # HANDLE_ERRORS is 1 on python < 2.4 (this handler treats non-200
             # responses as errors itself) and 0 on 2.4 and later (everything is
             # passed up to the other handlers).  See the module docstring.
             HANDLE_ERRORS = int(sys.version_info < (2, 4))
             class ConnectionManager(object):
                 """Registry of pooled connections, guarded by a single lock.

                 The connection manager must be able to:
                   * keep track of all existing connections, grouped by host
                   * remember whether each connection is ready for a new request
                   * hand out a ready connection for a host and mark it as busy
                 """
                 def __init__(self):
                     self._lock = thread.allocate_lock()
                     self._hostmap = {} # map hosts to a list of connections
                     self._connmap = {} # map connections to host
                     self._readymap = {} # map connection to ready state
                 def add(self, host, connection, ready):
                     """Start tracking connection for host with the given ready state."""
                     self._lock.acquire()
                     try:
                         if host not in self._hostmap:
                             self._hostmap[host] = []
                         self._hostmap[host].append(connection)
                         self._connmap[connection] = host
                         self._readymap[connection] = ready
                     finally:
                         self._lock.release()
                 def remove(self, connection):
                     """Stop tracking connection; unknown connections are ignored."""
                     self._lock.acquire()
                     try:
                         try:
                             host = self._connmap[connection]
                         except KeyError:
                             pass
                         else:
                             del self._connmap[connection]
                             del self._readymap[connection]
                             self._hostmap[host].remove(connection)
                             # drop the host entry once its last connection is gone
                             if not self._hostmap[host]: del self._hostmap[host]
                     finally:
                         self._lock.release()
                 def set_ready(self, connection, ready):
                     """Update the ready state of a connection that is still tracked.

                     A connection that has already been removed is left alone.  Plain
                     dict assignment never raises KeyError, so an explicit membership
                     test is needed to avoid re-inserting a removed connection into
                     the ready map (where nothing would ever delete it again).
                     """
                     self._lock.acquire()
                     try:
                         if connection in self._readymap:
                             self._readymap[connection] = ready
                     finally:
                         self._lock.release()
                 def get_ready_conn(self, host):
                     """Return a ready connection to host, marked busy, or None."""
                     conn = None
                     self._lock.acquire()
                     try:
                         if host in self._hostmap:
                             for c in self._hostmap[host]:
                                 if self._readymap[c]:
                                     # claim it before releasing the lock
                                     self._readymap[c] = 0
                                     conn = c
                                     break
                     finally:
                         self._lock.release()
                     return conn
                 def get_all(self, host=None):
                     """Return a snapshot of the tracked connections.

                     With a host, return a list of the connections to that host
                     (empty if there are none).  Without one, return a dict mapping
                     each host to a list of its connections.  The returned containers
                     are copies, so callers may call remove() while iterating them.
                     """
                     self._lock.acquire()
                     try:
                         if host:
                             return list(self._hostmap.get(host, []))
                         return dict((h, list(conns))
                                     for h, conns in self._hostmap.items())
                     finally:
                         self._lock.release()
             class KeepAliveHandler(object):
                 """Mix-in that makes a urllib2 HTTP handler reuse open connections.

                 Connections are pooled per host in a ConnectionManager.  A request
                 first tries the ready connections to its host and only opens a new
                 one when none of them works.
                 """
                 def __init__(self):
                     self._cm = ConnectionManager()
                 #### Connection Management
                 def open_connections(self):
                     """return a list of connected hosts and the number of connections
                     to each.  [('foo.com:80', 2), ('bar.org', 1)]"""
                     return [(host, len(li)) for (host, li) in self._cm.get_all().items()]
                 def close_connection(self, host):
                     """close connection(s) to <host>
                     host is the host:port spec, as in 'www.cnn.com:8080' as passed in.
                     no error occurs if there is no connection to that host."""
                     for h in self._cm.get_all(host):
                         self._cm.remove(h)
                         h.close()
                 def close_all(self):
                     """close all open connections"""
                     for conns in self._cm.get_all().values():
                         # Iterate over a copy: remove() may shrink the very list
                         # handed out by the manager, which would make the loop
                         # skip every other connection and leave it open.
                         for h in list(conns):
                             self._cm.remove(h)
                             h.close()
                 def _request_closed(self, request, host, connection):
                     """tells us that this request is now closed and that the
                     connection is ready for another request"""
                     self._cm.set_ready(connection, 1)
                 def _remove_connection(self, host, connection, close=0):
                     """Drop connection from the pool, closing it first if close is
                     true."""
                     if close:
                         connection.close()
                     self._cm.remove(connection)
                 #### Transaction Execution
                 def http_open(self, req):
                     """Open req using the keepalive-aware HTTPConnection class."""
                     return self.do_open(HTTPConnection, req)
                 def do_open(self, http_class, req):
                     """Send req and return the response object.

                     Ready pooled connections to the host are tried first; each one
                     that fails is closed and removed.  If none works, a new
                     http_class connection is created and added to the pool.

                     Raises urllib2.URLError when no host is given or when a socket
                     or httplib error occurs.  Unless the status is 200 or
                     HANDLE_ERRORS is false, the response is passed to
                     self.parent.error() and its result is returned instead.
                     """
                     host = req.get_host()
                     if not host:
                         raise urllib2.URLError('no host given')
                     try:
                         h = self._cm.get_ready_conn(host)
                         while h:
                             r = self._reuse_connection(h, req, host)
                             # if this response is non-None, then it worked and we're
                             # done.  Break out, skipping the else block.
                             if r:
                                 break
                             # connection is bad - possibly closed by server
                             # discard it and ask for the next free connection
                             h.close()
                             self._cm.remove(h)
                             h = self._cm.get_ready_conn(host)
                         else:
                             # no (working) free connections were found.  Create a new one.
                             h = http_class(host)
                             if DEBUG:
                                 DEBUG.info("creating new connection to %s (%d)",
                                            host, id(h))
                             self._cm.add(host, h, 0)
                             self._start_transaction(h, req)
                             r = h.getresponse()
                     except (socket.error, httplib.HTTPException), err:
                         raise urllib2.URLError(err)
                     # if not a persistent connection, don't try to reuse it
                     if r.will_close:
                         self._cm.remove(h)
                     if DEBUG:
                         DEBUG.info("STATUS: %s, %s", r.status, r.reason)
                     # attach what the response needs to report back to this handler
                     r._handler = self
                     r._host = host
                     r._url = req.get_full_url()
                     r._connection = h
                     r.code = r.status
                     # r.msg held the headers; move them to r.headers before
                     # overwriting r.msg with the reason phrase
                     r.headers = r.msg
                     r.msg = r.reason
                     if r.status == 200 or not HANDLE_ERRORS:
                         return r
                     else:
                         return self.parent.error('http', req, r,
                                                  r.status, r.msg, r.headers)
                 def _reuse_connection(self, h, req, host):
                     """start the transaction with a re-used connection
                     return a response object (r) upon success or None on failure.
                     This DOES not close or remove bad connections in cases where
                     it returns.  However, if an unexpected exception occurs, it
                     will close and remove the connection before re-raising.
                     """
                     try:
                         self._start_transaction(h, req)
                         r = h.getresponse()
                         # note: just because we got something back doesn't mean it
                         # worked.  We'll check the version below, too.
                     except (socket.error, httplib.HTTPException):
                         r = None
                     except: # re-raises
                         # adding this block just in case we've missed
                         # something we will still raise the exception, but
                         # lets try and close the connection and remove it
                         # first.  We previously got into a nasty loop
                         # where an exception was uncaught, and so the
                         # connection stayed open.  On the next try, the
                         # same exception was raised, etc.  The trade-off is
                         # that it's now possible this call will raise
                         # a DIFFERENT exception
                         if DEBUG:
                             DEBUG.error("unexpected exception - closing "
                                         "connection to %s (%d)", host, id(h))
                         self._cm.remove(h)
                         h.close()
                         raise
                     if r is None or r.version == 9:
                         # httplib falls back to assuming HTTP 0.9 if it gets a
                         # bad header back.  This is most likely to happen if
                         # the socket has been closed by the server since we
                         # last used the connection.
                         if DEBUG:
                             DEBUG.info("failed to re-use connection to %s (%d)",
                                        host, id(h))
                         r = None
                     else:
                         if DEBUG:
                             DEBUG.info("re-using connection to %s (%d)", host, id(h))
                     return r
                 def _start_transaction(self, h, req):
                     """Send the request line, headers and any body of req over h.

                     Raises urllib2.URLError if a socket error occurs while the
                     request line is being sent.
                     """
                     # What follows mostly reimplements HTTPConnection.request()
                     # except it adds self.parent.addheaders in the mix.
                     headers = req.headers.copy()
                     if sys.version_info >= (2, 4):
                         headers.update(req.unredirected_hdrs)
                     headers.update(self.parent.addheaders)
                     headers = dict((n.lower(), v) for n, v in headers.items())
                     # tell putrequest() not to add headers we supply ourselves
                     skipheaders = {}
                     for n in ('host', 'accept-encoding'):
                         if n in headers:
                             skipheaders['skip_' + n.replace('-', '_')] = 1
                     try:
                         if req.has_data():
                             data = req.get_data()
                             h.putrequest('POST', req.get_selector(), **skipheaders)
                             if 'content-type' not in headers:
                                 h.putheader('Content-type',
                                             'application/x-www-form-urlencoded')
                             if 'content-length' not in headers:
                                 h.putheader('Content-length', '%d' % len(data))
                         else:
                             h.putrequest('GET', req.get_selector(), **skipheaders)
                     except (socket.error), err:
                         raise urllib2.URLError(err)
                     for k, v in headers.items():
                         h.putheader(k, v)
                     h.endheaders()
                     if req.has_data():
                         h.send(data)
             class HTTPHandler(KeepAliveHandler, urllib2.HTTPHandler):
                 """urllib2 HTTP handler with keepalive support.

                 KeepAliveHandler is listed first, so its methods take precedence
                 over those of urllib2.HTTPHandler.
                 """
                 pass
             class HTTPResponse(httplib.HTTPResponse):
                 """httplib.HTTPResponse with buffered line reading and pool hooks."""
                 # we need to subclass HTTPResponse in order to
                 # 1) add readline() and readlines() methods
                 # 2) add close_connection() methods
                 # 3) add info() and geturl() methods
                 # in order to add readline(), read must be modified to deal with a
                 # buffer.  example: readline must read a buffer and then spit back
                 # one line at a time.  The only real alternative is to read one
                 # BYTE at a time (ick).  Once something has been read, it can't be
                 # put back (ok, maybe it can, but that's even uglier than this),
                 # so if you THEN do a normal read, you must first take stuff from
                 # the buffer.
                 # the read method wraps the original to accommodate buffering,
                 # although read() never adds to the buffer.
                 # Both readline and readlines have been stolen with almost no
                 # modification from socket.py
                 def __init__(self, sock, debuglevel=0, strict=0, method=None):
                     # Forward strict and method in their own positions.  Passing
                     # method as the third positional argument would bind it to the
                     # parent's strict parameter and leave the real method unset.
                     httplib.HTTPResponse.__init__(self, sock, debuglevel, strict,
                                                   method)
                     self.fileno = sock.fileno
                     self.code = None
                     self._rbuf = ''
                     self._rbufsize = 8096
                     self._handler = None # inserted by the handler later
                     self._host = None    # (same)
                     self._url = None     # (same)
                     self._connection = None # (same)
                 _raw_read = httplib.HTTPResponse.read
                 def close(self):
                     """Close the response and tell the handler (if one is attached)
                     that the request is finished."""
                     if self.fp:
                         self.fp.close()
                         self.fp = None
                         if self._handler:
                             self._handler._request_closed(self, self._host,
                                                           self._connection)
                 def close_connection(self):
                     """Close the underlying connection, remove it from the handler's
                     pool, then close this response."""
                     self._handler._remove_connection(self._host, self._connection, close=1)
                     self.close()
                 def info(self):
                     """Return the response headers."""
                     return self.headers
                 def geturl(self):
                     """Return the URL recorded for this response."""
                     return self._url
                 def read(self, amt=None):
                     """Read up to amt bytes (everything if amt is None), serving
                     data left in the readline buffer first."""
                     # the _rbuf test is only in this first if for speed.  It's not
                     # logically necessary
                     if self._rbuf and not amt is None:
                         L = len(self._rbuf)
                         if amt > L:
                             amt -= L
                         else:
                             s = self._rbuf[:amt]
                             self._rbuf = self._rbuf[amt:]
                             return s
                     s = self._rbuf + self._raw_read(amt)
                     self._rbuf = ''
                     return s
                 # stolen from Python SVN #68532 to fix issue1088
                 def _read_chunked(self, amt):
                     """Read up to amt bytes (everything if amt is None) of a chunked
                     body and return them as one string.

                     Raises httplib.IncompleteRead, carrying the data read so far,
                     when a chunk size line cannot be parsed.
                     """
                     chunk_left = self.chunk_left
                     # collect the pieces and join once instead of growing a string
                     parts = []
                     while True:
                         if chunk_left is None:
                             line = self.fp.readline()
                             i = line.find(';')
                             if i >= 0:
                                 line = line[:i] # strip chunk-extensions
                             try:
                                 chunk_left = int(line, 16)
                             except ValueError:
                                 # close the connection as protocol synchronization is
                                 # probably lost
                                 self.close()
                                 raise httplib.IncompleteRead(''.join(parts))
                             if chunk_left == 0:
                                 break
                         if amt is None:
                             parts.append(self._safe_read(chunk_left))
                         elif amt < chunk_left:
                             parts.append(self._safe_read(amt))
                             self.chunk_left = chunk_left - amt
                             return ''.join(parts)
                         elif amt == chunk_left:
                             parts.append(self._safe_read(amt))
                             self._safe_read(2)  # toss the CRLF at the end of the chunk
                             self.chunk_left = None
                             return ''.join(parts)
                         else:
                             parts.append(self._safe_read(chunk_left))
                             amt -= chunk_left
                         # we read the whole chunk, get another
                         self._safe_read(2)      # toss the CRLF at the end of the chunk
                         chunk_left = None
                     # read and discard trailer up to the CRLF terminator
                     ### note: we shouldn't have any trailers!
                     while True:
                         line = self.fp.readline()
                         if not line:
                             # a vanishingly small number of sites EOF without
                             # sending the trailer
                             break
                         if line == '\r\n':
                             break
                     # we read everything; close the "file"
                     self.close()
                     return ''.join(parts)
                 def readline(self, limit=-1):
                     """Return the next line, including its newline if present.

                     With a non-negative limit, at most limit bytes are returned and
                     the line is cut short if it is longer.  An empty string means
                     there is no more data.
                     """
                     i = self._rbuf.find('\n')
                     # keep reading until a newline shows up or limit bytes are buffered
                     while i < 0 and not (0 < limit <= len(self._rbuf)):
                         new = self._raw_read(self._rbufsize)
                         if not new:
                             break
                         i = new.find('\n')
                         if i >= 0:
                             i = i + len(self._rbuf)
                         self._rbuf = self._rbuf + new
                     if i < 0:
                         i = len(self._rbuf)
                     else:
                         i = i + 1
                     # Only truncate when the line is longer than limit; a newline
                     # that comes before the limit must still end the line.
                     if 0 <= limit < i:
                         i = limit
                     data, self._rbuf = self._rbuf[:i], self._rbuf[i:]
                     return data
                 def readlines(self, sizehint = 0):
                     """Return the remaining lines as a list, stopping early once
                     at least sizehint bytes have been collected (if sizehint is
                     non-zero)."""
                     total = 0
                     lines = []
                     while True:
                         line = self.readline()
                         if not line:
                             break
                         lines.append(line)
                         total += len(line)
                         if sizehint and total >= sizehint:
                             break
                     return lines
             def safesend(self, str):
                 """Send `str' to the server.
                 Shamelessly ripped off from httplib to patch a bad behavior.

                 `str' is either a string or an object with a read() method, in
                 which case it is sent in blocks of 8192 bytes.

                 If the send fails with a broken pipe after the request has been
                 sent, the response is fetched and stored in _broken_pipe_resp
                 and no error is raised.  Any other socket error is re-raised; a
                 broken pipe always closes the connection.  Raises
                 httplib.NotConnected when there is no socket and auto_open is off.
                 """
                 # _broken_pipe_resp is an attribute we set in this function
                 # if the socket is closed while we're sending data but
                 # the server sent us a response before hanging up.
                 # In that case, we want to pretend to send the rest of the
                 # outgoing data, and then let the user use getresponse()
                 # (which we wrap) to get this last response before
                 # opening a new socket.
                 if getattr(self, '_broken_pipe_resp', None) is not None:
                     return
                 if self.sock is None:
                     if self.auto_open:
                         self.connect()
                     else:
                         raise httplib.NotConnected
                 # send the data to the server. if we get a broken pipe, then close
                 # the socket. we want to reconnect when somebody tries to send again.
                 #
                 # NOTE: we DO propagate the error, though, because we cannot simply
                 #       ignore the error... the caller will know if they can retry.
                 if self.debuglevel > 0:
                     print "send:", repr(str)
                 try:
                     blocksize = 8192
                     read = getattr(str, 'read', None)
                     if read is not None:
                         if self.debuglevel > 0:
                             print "sending a read()able"
                         data = read(blocksize)
                         while data:
                             self.sock.sendall(data)
                             data = read(blocksize)
                     else:
                         self.sock.sendall(str)
                 except socket.error, v:
                     reraise = True
                     if v[0] == errno.EPIPE:      # Broken pipe
                         if self._HTTPConnection__state == httplib._CS_REQ_SENT:
                             # set the attribute to None first, then store the
                             # response that getresponse() returns
                             self._broken_pipe_resp = None
                             self._broken_pipe_resp = self.getresponse()
                             reraise = False
                         self.close()
                     if reraise:
                         raise
             def wrapgetresponse(cls):
                 """Build a getresponse() replacement for cls that is aware of
                 broken pipes.

                 The returned function hands back the response stored in the
                 _broken_pipe_resp attribute when there is one, and otherwise
                 defers to cls.getresponse.  It carries cls.getresponse's docstring.
                 """
                 def safegetresponse(self):
                     # safesend() may have stored a response in _broken_pipe_resp
                     # after closing the socket; when it did, that response is
                     # what the caller gets.  Without one, take the normal path.
                     saved = getattr(self, '_broken_pipe_resp', None)
                     if saved is None:
                         return cls.getresponse(self)
                     return saved
                 safegetresponse.__doc__ = cls.getresponse.__doc__
                 return safegetresponse
             class HTTPConnection(httplib.HTTPConnection):
                 """httplib.HTTPConnection using this module's response class and
                 broken-pipe tolerant send()/getresponse()."""
                 # use the modified response class
                 response_class = HTTPResponse
                 # send() that keeps a response received before a broken pipe
                 send = safesend
                 # getresponse() that returns such a kept response when there is one
                 getresponse = wrapgetresponse(httplib.HTTPConnection)
             #########################################################################
             #####   TEST FUNCTIONS
             #########################################################################
             def error_handler(url):
                 """Fetch url with HANDLE_ERRORS set to 0 and then to 1, printing the
                 status and reason of each response.

                 An IOError raised by a fetch is printed and re-raised.  The
                 original value of HANDLE_ERRORS is restored in either case.
                 On success the open connections are listed and then closed.
                 """
                 global HANDLE_ERRORS
                 orig = HANDLE_ERRORS
                 keepalive_handler = HTTPHandler()
                 opener = urllib2.build_opener(keepalive_handler)
                 urllib2.install_opener(opener)
                 pos = {0: 'off', 1: 'on'}
                 try:
                     for i in (0, 1):
                         print "  fancy error handling %s (HANDLE_ERRORS = %i)" % (pos[i], i)
                         HANDLE_ERRORS = i
                         try:
                             fo = urllib2.urlopen(url)
                             fo.read()
                             fo.close()
                             try:
                                 status, reason = fo.status, fo.reason
                             except AttributeError:
                                 # wrapped responses do not carry these attributes
                                 status, reason = None, None
                         except IOError, e:
                             print "  EXCEPTION: %s" % e
                             raise
                         else:
                             print "  status = %s, reason = %s" % (status, reason)
                 finally:
                     # do not leave the module-level switch modified when a
                     # fetch raised
                     HANDLE_ERRORS = orig
                 hosts = keepalive_handler.open_connections()
                 print "open connections:", hosts
                 keepalive_handler.close_all()
             def md5(s):
                 """Return an md5 hash object for s.

                 The first call locates the md5 constructor (hashlib when it can be
                 imported, the old md5 module otherwise) and rebinds the module-level
                 name md5 to it, so later calls reach the constructor directly.
                 """
                 global md5
                 try:
                     from hashlib import md5 as constructor
                 except ImportError:
                     from md5 import md5 as constructor
                 md5 = constructor
                 return constructor(s)
             def continuity(url):
                 format = '%25s: %s'
                 # first fetch the file with the normal http handler
                 opener = urllib2.build_opener()
                 urllib2.install_opener(opener)
                 fo = urllib2.urlopen(url)
                 foo = fo.read()
                 fo.close()
                 m = md5.new(foo)
                 print format % ('normal urllib', m.hexdigest())
                 # now install the keepalive handler and try again
                 opener = urllib2.build_opener(HTTPHandler())
                 urllib2.install_opener(opener)
                 fo = urllib2.urlopen(url)
                 foo = fo.read()
                 fo.close()
                 m = md5.new(foo)
                 print format % ('keepalive read', m.hexdigest())
                 fo = urllib2.urlopen(url)
                 foo = ''
                 while True:
                     f = fo.readline()
                     if f:
                         foo = foo + f
                     else: break
                 fo.close()
                 m = md5.new(foo)
                 print format % ('keepalive readline', m.hexdigest())
             def comp(N, url):
                 """Compare fetch time with and without the keepalive handler.

                 Runs ``fetch(N, url)`` once with the default urllib2 opener and once
                 with an opener built around ``HTTPHandler``, printing each elapsed
                 time and the ratio of the first to the second.

                 Side effect: replaces the globally installed urllib2 opener; the
                 keepalive opener is the one left installed on return.
                 """
                 print('  making %i connections to:\n  %s' % (N, url))

                 # baseline run with the stock urllib2 opener
                 sys.stdout.write('  first using the normal urllib handlers')
                 urllib2.install_opener(urllib2.build_opener())
                 plain_secs = fetch(N, url)
                 print('  TIME: %.3f s' % plain_secs)

                 # second run with the keepalive handler installed
                 sys.stdout.write('  now using the keepalive handler       ')
                 urllib2.install_opener(urllib2.build_opener(HTTPHandler()))
                 keepalive_secs = fetch(N, url)
                 print('  TIME: %.3f s' % keepalive_secs)

                 print('  improvement factor: %.2f' % (plain_secs / keepalive_secs))
             def fetch(N, url, delay=0):
                 """Fetch ``url`` ``N`` times and return the elapsed wall-clock seconds.

                 When ``delay`` is truthy, sleeps that many seconds before every fetch
                 except the first.  After all fetches, prints a warning for each
                 response whose length differs from the first response's length.
                 """
                 import time
                 sizes = []
                 began = time.time()
                 for attempt in range(N):
                     # pause between requests, but never before the first one
                     if attempt > 0 and delay:
                         time.sleep(delay)
                     response = urllib2.urlopen(url)
                     body = response.read()
                     response.close()
                     sizes.append(len(body))
                 elapsed = time.time() - began

                 # compare every later read against the first; reads are numbered
                 # from 1 starting at the second fetch
                 for offset, size in enumerate(sizes[1:]):
                     if size != sizes[0]:
                         print("WARNING: inconsistent length on read %i: %i"
                               % (offset + 1, size))
                 return elapsed
             def test_timeout(url):
                 """Fetch ``url``, wait 20 seconds, fetch it again and compare the bodies.

                 The wait gives the server time to close the connection, per the
                 message printed before the countdown.  While the check runs, the
                 module-level ``DEBUG`` is replaced by a logger that prints every
                 message; the previous value is restored on exit, including when a
                 fetch raises.

                 Prints whether the two bodies are identical; returns nothing.
                 """
                 # imported locally (as fetch() does) so the countdown does not depend
                 # on the import performed in the __main__ block
                 import time

                 global DEBUG
                 dbbackup = DEBUG

                 class FakeLogger(object):
                     """Stand-in logger that prints every message at every level."""
                     def debug(self, msg, *args):
                         print(msg % args)
                     info = warning = error = debug

                 DEBUG = FakeLogger()
                 try:
                     print("  fetching the file to establish a connection")
                     fo = urllib2.urlopen(url)
                     data1 = fo.read()
                     fo.close()

                     i = 20
                     print("  waiting %i seconds for the server to close the connection" % i)
                     while i > 0:
                         # '\r' rewinds to the start of the line so the counter
                         # overwrites itself in place
                         sys.stdout.write('\r  %2i' % i)
                         sys.stdout.flush()
                         time.sleep(1)
                         i -= 1
                     # NOTE(review): the countdown is written to stdout but this final
                     # carriage return goes to stderr -- confirm that is intended.
                     sys.stderr.write('\r')

                     print("  fetching the file a second time")
                     fo = urllib2.urlopen(url)
                     data2 = fo.read()
                     fo.close()

                     if data1 == data2:
                         print('  data are identical')
                     else:
                         print('  ERROR: DATA DIFFER')
                 finally:
                     # always put the original DEBUG back, even if a fetch failed
                     DEBUG = dbbackup
             def test(url, N=10):
                 """Run the whole self-test sequence against ``url``.

                 Runs the error-handler check first; if it raises ``IOError`` the
                 process exits via ``sys.exit()``.  Otherwise runs, in order, the
                 continuity test, the speed comparison (with ``N`` connections) and
                 the dropped-connection check, printing a banner before each.
                 """
                 print("checking error handler (do this on a non-200)")
                 try:
                     error_handler(url)
                 except IOError:
                     print("exiting - exception will prevent further tests")
                     sys.exit()

                 # remaining stages: (banner, callable), run strictly in this order
                 stages = (
                     ("performing continuity test (making sure stuff isn't corrupted)",
                      lambda: continuity(url)),
                     ("performing speed comparison",
                      lambda: comp(N, url)),
                     ("performing dropped-connection check",
                      lambda: test_timeout(url)),
                 )
                 for banner, run_stage in stages:
                     # blank separator line before each banner
                     print('')
                     print(banner)
                     run_stage()
             if __name__ == '__main__':
                 # Script entry point: "<program> <integer> <url>" runs the self-tests.
                 import sys
                 import time
                 try:
                     # Unpacking raises ValueError when an argument is missing and
                     # int() raises ValueError when the count is not a number; both
                     # fall through to the usage message below.
                     count_arg, url = sys.argv[1:3]
                     N = int(count_arg)
                 except (IndexError, ValueError):
                     print("%s <integer> <url>" % sys.argv[0])
                 else:
                     test(url, N)