upstream/mercurial-mirror Commit - r22505:232d437a

1

# This library is free software; you can redistribute it and/or

1

# This library is free software; you can redistribute it and/or

2

# modify it under the terms of the GNU Lesser General Public

2

# modify it under the terms of the GNU Lesser General Public

3

# License as published by the Free Software Foundation; either

3

# License as published by the Free Software Foundation; either

4

# version 2.1 of the License, or (at your option) any later version.

4

# version 2.1 of the License, or (at your option) any later version.

5

#

5

#

6

# This library is distributed in the hope that it will be useful,

6

# This library is distributed in the hope that it will be useful,

7

# but WITHOUT ANY WARRANTY; without even the implied warranty of

7

# but WITHOUT ANY WARRANTY; without even the implied warranty of

8

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

8

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

9

# Lesser General Public License for more details.

9

# Lesser General Public License for more details.

10

#

10

#

11

# You should have received a copy of the GNU Lesser General Public

11

# You should have received a copy of the GNU Lesser General Public

12

# License along with this library; if not, see

12

# License along with this library; if not, see

13

# <http://www.gnu.org/licenses/>.

13

# <http://www.gnu.org/licenses/>.

14

15

# This file is part of urlgrabber, a high-level cross-protocol url-grabber

15

# This file is part of urlgrabber, a high-level cross-protocol url-grabber

16

17

18

# Modified by Benoit Boissinot:

18

# Modified by Benoit Boissinot:

19

# - fix for digest auth (inspired from urllib2.py @ Python v2.4)

19

# - fix for digest auth (inspired from urllib2.py @ Python v2.4)

20

# Modified by Dirkjan Ochtman:

20

# Modified by Dirkjan Ochtman:

21

# - import md5 function from a local util module

21

# - import md5 function from a local util module

22

# Modified by Martin Geisler:

22

# Modified by Martin Geisler:

23

# - moved md5 function from local util module to this module

23

# - moved md5 function from local util module to this module

24

# Modified by Augie Fackler:

24

# Modified by Augie Fackler:

25

# - add safesend method and use it to prevent broken pipe errors

25

# - add safesend method and use it to prevent broken pipe errors

26

# on large POST requests

26

# on large POST requests

27

28

"""An HTTP handler for urllib2 that supports HTTP 1.1 and keepalive.

28

"""An HTTP handler for urllib2 that supports HTTP 1.1 and keepalive.

29

30

>>> import urllib2

30

>>> import urllib2

31

>>> from keepalive import HTTPHandler

31

>>> from keepalive import HTTPHandler

32

>>> keepalive_handler = HTTPHandler()

32

>>> keepalive_handler = HTTPHandler()

33

>>> opener = urllib2.build_opener(keepalive_handler)

33

>>> opener = urllib2.build_opener(keepalive_handler)

34

>>> urllib2.install_opener(opener)

34

>>> urllib2.install_opener(opener)

35

>>>

35

>>>

36

>>> fo = urllib2.urlopen('http://www.python.org')

36

>>> fo = urllib2.urlopen('http://www.python.org')

37

38

If a connection to a given host is requested, and all of the existing

38

If a connection to a given host is requested, and all of the existing

39

connections are still in use, another connection will be opened. If

39

connections are still in use, another connection will be opened. If

40

the handler tries to use an existing connection but it fails in some

40

the handler tries to use an existing connection but it fails in some

41

way, it will be closed and removed from the pool.

41

way, it will be closed and removed from the pool.

42

43

To remove the handler, simply re-run build_opener with no arguments, and

43

To remove the handler, simply re-run build_opener with no arguments, and

44

install that opener.

44

install that opener.

45

46

You can explicitly close connections by using the close_connection()

46

You can explicitly close connections by using the close_connection()

47

method of the returned file-like object (described below) or you can

47

method of the returned file-like object (described below) or you can

48

use the handler methods:

48

use the handler methods:

49

50

close_connection(host)

50

close_connection(host)

51

close_all()

51

close_all()

52

open_connections()

52

open_connections()

53

54

NOTE: using the close_connection and close_all methods of the handler

54

NOTE: using the close_connection and close_all methods of the handler

55

should be done with care when using multiple threads.

55

should be done with care when using multiple threads.

56

* there is nothing that prevents another thread from creating new

56

* there is nothing that prevents another thread from creating new

57

connections immediately after connections are closed

57

connections immediately after connections are closed

58

* no checks are done to prevent in-use connections from being closed

58

* no checks are done to prevent in-use connections from being closed

59

60

>>> keepalive_handler.close_all()

60

>>> keepalive_handler.close_all()

61

62

EXTRA ATTRIBUTES AND METHODS

62

EXTRA ATTRIBUTES AND METHODS

63

64

Upon a status of 200, the object returned has a few additional

64

Upon a status of 200, the object returned has a few additional

65

attributes and methods, which should not be used if you want to

65

attributes and methods, which should not be used if you want to

66

remain consistent with the normal urllib2-returned objects:

66

remain consistent with the normal urllib2-returned objects:

67

68

close_connection() - close the connection to the host

68

close_connection() - close the connection to the host

69

readlines() - you know, readlines()

69

readlines() - you know, readlines()

70

status - the return status (e.g. 404)

70

status - the return status (e.g. 404)

71

reason - English translation of status (e.g. 'File not found')

71

reason - English translation of status (e.g. 'File not found')

72

73

If you want the best of both worlds, use this inside an

73

If you want the best of both worlds, use this inside an

74

AttributeError-catching try:

74

AttributeError-catching try:

75

76

>>> try: status = fo.status

76

>>> try: status = fo.status

77

>>> except AttributeError: status = None

77

>>> except AttributeError: status = None

78

79

Unfortunately, these are ONLY there if status == 200, so it's not

79

Unfortunately, these are ONLY there if status == 200, so it's not

80

easy to distinguish between non-200 responses. The reason is that

80

easy to distinguish between non-200 responses. The reason is that

81

urllib2 tries to do clever things with error codes 301, 302, 401,

81

urllib2 tries to do clever things with error codes 301, 302, 401,

82

and 407, and it wraps the object upon return.

82

and 407, and it wraps the object upon return.

83

84

For python versions earlier than 2.4, you can avoid this fancy error

84

For python versions earlier than 2.4, you can avoid this fancy error

85

handling by setting the module-level global HANDLE_ERRORS to zero.

85

handling by setting the module-level global HANDLE_ERRORS to zero.

86

You see, prior to 2.4, it's the HTTP Handler's job to determine what

86

You see, prior to 2.4, it's the HTTP Handler's job to determine what

87

to handle specially, and what to just pass up. HANDLE_ERRORS == 0

87

to handle specially, and what to just pass up. HANDLE_ERRORS == 0

88

means "pass everything up". In python 2.4, however, this job no

88

means "pass everything up". In python 2.4, however, this job no

89

longer belongs to the HTTP Handler and is now done by a NEW handler,

89

longer belongs to the HTTP Handler and is now done by a NEW handler,

90

HTTPErrorProcessor. Here's the bottom line:

90

HTTPErrorProcessor. Here's the bottom line:

91

92

python version < 2.4

92

python version < 2.4

93

HANDLE_ERRORS == 1 (default) pass up 200, treat the rest as

93

HANDLE_ERRORS == 1 (default) pass up 200, treat the rest as

94

errors

94

errors

95

HANDLE_ERRORS == 0 pass everything up, error processing is

95

HANDLE_ERRORS == 0 pass everything up, error processing is

96

left to the calling code

96

left to the calling code

97

python version >= 2.4

97

python version >= 2.4

98

HANDLE_ERRORS == 1 pass up 200, treat the rest as errors

98

HANDLE_ERRORS == 1 pass up 200, treat the rest as errors

99

HANDLE_ERRORS == 0 (default) pass everything up, let the

99

HANDLE_ERRORS == 0 (default) pass everything up, let the

100

other handlers (specifically,

100

other handlers (specifically,

101

HTTPErrorProcessor) decide what to do

101

HTTPErrorProcessor) decide what to do

102

103

In practice, setting the variable either way makes little difference

103

In practice, setting the variable either way makes little difference

104

in python 2.4, so for the most consistent behavior across versions,

104

in python 2.4, so for the most consistent behavior across versions,

105

you probably just want to use the defaults, which will give you

105

you probably just want to use the defaults, which will give you

106

exceptions on errors.

106

exceptions on errors.

107

108

"""

108

"""

109

110

# $Id: keepalive.py,v 1.14 2006/04/04 21:00:32 mstenner Exp $

110

# $Id: keepalive.py,v 1.14 2006/04/04 21:00:32 mstenner Exp $

111

112

import errno

112

import errno

113

import httplib

113

import httplib

114

import socket

114

import socket

115

import thread

115

import thread

116

import urllib2

116

import urllib2

117

118

# Optional logger-like object; when set, its info()/error() methods are
# called with %-style format arguments by the handler below.
DEBUG = None

import sys

# From Python 2.4 on, HTTPErrorProcessor decides what to do with non-200
# responses, so the handler passes everything up by default.  Older
# versions leave that job to the HTTP handler itself.
if sys.version_info >= (2, 4):
    HANDLE_ERRORS = 0
else:
    HANDLE_ERRORS = 1

124

125

class ConnectionManager(object):
    """Registry of the connections opened by a keepalive handler.

    The connection manager must be able to:
      * keep track of all existing connections, grouped by host
      * remember, for each connection, whether it is ready for reuse
      * hand out a ready connection for a host and mark it busy

    Connections are used as dictionary keys and must therefore be
    hashable.
    """

    def __init__(self):
        self._lock = thread.allocate_lock()
        self._hostmap = {}   # map hosts to a list of connections
        self._connmap = {}   # map connections to host
        self._readymap = {}  # map connection to ready state

    def add(self, host, connection, ready):
        """Start tracking ``connection`` for ``host`` with state ``ready``."""
        self._lock.acquire()
        try:
            if host not in self._hostmap:
                self._hostmap[host] = []
            self._hostmap[host].append(connection)
            self._connmap[connection] = host
            self._readymap[connection] = ready
        finally:
            self._lock.release()

    def remove(self, connection):
        """Stop tracking ``connection``.

        Removing a connection that is not tracked is a silent no-op.  The
        host entry is dropped once its last connection is gone.
        """
        self._lock.acquire()
        try:
            # nested try blocks: try/except/finally cannot be combined in
            # a single statement on Python 2.4
            try:
                host = self._connmap[connection]
            except KeyError:
                pass
            else:
                del self._connmap[connection]
                del self._readymap[connection]
                self._hostmap[host].remove(connection)
                if not self._hostmap[host]:
                    del self._hostmap[host]
        finally:
            self._lock.release()

    def set_ready(self, connection, ready):
        """Record the ready state of a tracked connection.

        Connections that are not (or no longer) tracked are ignored.  A
        plain assignment would silently re-insert a connection that
        remove() already dropped, leaving a stale entry in the ready map
        forever.
        """
        if connection in self._readymap:
            self._readymap[connection] = ready

    def get_ready_conn(self, host):
        """Return a ready connection to ``host`` and mark it busy.

        Returns None when ``host`` has no ready connection.
        """
        conn = None
        self._lock.acquire()
        try:
            if host in self._hostmap:
                for c in self._hostmap[host]:
                    if self._readymap[c]:
                        self._readymap[c] = 0
                        conn = c
                        break
        finally:
            self._lock.release()
        return conn

    def get_all(self, host=None):
        """Return a snapshot of the tracked connections.

        With a (truthy) ``host``, return a new list of the connections to
        that host, empty if there are none.  Otherwise return a new dict
        mapping every host to a new list of its connections.

        The returned containers are copies, so callers may call remove()
        while iterating over them.
        """
        if host:
            return list(self._hostmap.get(host, []))
        # list(...) takes the item snapshot in one step before the
        # per-host lists are copied
        return dict((h, list(conns))
                    for h, conns in list(self._hostmap.items()))

187

188

class KeepAliveHandler(object):
    """Handler mix-in that sends requests over reusable connections.

    Connections are tracked in a ConnectionManager.  A request first
    tries every ready connection to its host and only opens a new one
    when none of them works.

    The methods use self.parent, which is expected to be the opener the
    handler has been installed in.
    """

    def __init__(self):
        self._cm = ConnectionManager()

    #### Connection Management
    def open_connections(self):
        """return a list of connected hosts and the number of connections
        to each.  [('foo.com:80', 2), ('bar.org', 1)]"""
        return [(host, len(li)) for (host, li) in self._cm.get_all().items()]

    def close_connection(self, host):
        """close connection(s) to <host>
        host is the host:port spec, as in 'www.cnn.com:8080' as passed in.
        no error occurs if there is no connection to that host."""
        for h in self._cm.get_all(host):
            self._cm.remove(h)
            h.close()

    def close_all(self):
        """close all open connections"""
        for conns in self._cm.get_all().values():
            # Iterate over a copy: remove() takes the connection out of
            # the manager's per-host list, and walking that same list
            # while it shrinks would skip every other connection.
            for h in list(conns):
                self._cm.remove(h)
                h.close()

    def _request_closed(self, request, host, connection):
        """tells us that this request is now closed and that the
        connection is ready for another request"""
        self._cm.set_ready(connection, 1)

    def _remove_connection(self, host, connection, close=0):
        """stop tracking <connection>, closing it first if <close> is set"""
        if close:
            connection.close()
        self._cm.remove(connection)

    #### Transaction Execution
    def http_open(self, req):
        return self.do_open(HTTPConnection, req)

    def do_open(self, http_class, req):
        """send <req> and return the response

        A ready connection to the host is reused when one works;
        otherwise a new http_class(host) connection is created and
        registered.

        Returns the response when its status is 200 or HANDLE_ERRORS is
        false, and the result of self.parent.error(...) otherwise.
        Raises urllib2.URLError when the request has no host or when a
        socket.error / httplib.HTTPException occurs.
        """
        host = req.get_host()
        if not host:
            raise urllib2.URLError('no host given')

        try:
            h = self._cm.get_ready_conn(host)
            while h:
                r = self._reuse_connection(h, req, host)

                # if this response is non-None, then it worked and we're
                # done.  Break out, skipping the else block.
                if r:
                    break

                # connection is bad - possibly closed by server
                # discard it and ask for the next free connection
                h.close()
                self._cm.remove(h)
                h = self._cm.get_ready_conn(host)
            else:
                # no (working) free connections were found.  Create a new one.
                h = http_class(host)
                if DEBUG:
                    DEBUG.info("creating new connection to %s (%d)",
                               host, id(h))
                self._cm.add(host, h, 0)
                self._start_transaction(h, req)
                r = h.getresponse()
        except (socket.error, httplib.HTTPException):
            # the exception is fetched through sys.exc_info() so that the
            # clause parses on every Python version
            raise urllib2.URLError(sys.exc_info()[1])

        # if not a persistent connection, don't try to reuse it
        if r.will_close:
            self._cm.remove(h)

        if DEBUG:
            DEBUG.info("STATUS: %s, %s", r.status, r.reason)
        r._handler = self
        r._host = host
        r._url = req.get_full_url()
        r._connection = h
        r.code = r.status
        # order matters: msg holds the headers until it is overwritten
        # with the reason phrase on the next line
        r.headers = r.msg
        r.msg = r.reason

        if r.status == 200 or not HANDLE_ERRORS:
            return r
        else:
            return self.parent.error('http', req, r,
                                     r.status, r.msg, r.headers)

    def _reuse_connection(self, h, req, host):
        """start the transaction with a re-used connection
        return a response object (r) upon success or None on failure.
        This does NOT close or remove bad connections in cases where
        it returns.  However, if an unexpected exception occurs, it
        will close and remove the connection before re-raising.
        """
        try:
            self._start_transaction(h, req)
            r = h.getresponse()
            # note: just because we got something back doesn't mean it
            # worked.  We'll check the version below, too.
        except (socket.error, httplib.HTTPException):
            r = None
        except: # re-raises
            # adding this block just in case we've missed
            # something we will still raise the exception, but
            # lets try and close the connection and remove it
            # first.  We previously got into a nasty loop
            # where an exception was uncaught, and so the
            # connection stayed open.  On the next try, the
            # same exception was raised, etc.  The trade-off is
            # that it's now possible this call will raise
            # a DIFFERENT exception
            if DEBUG:
                DEBUG.error("unexpected exception - closing "
                            "connection to %s (%d)", host, id(h))
            self._cm.remove(h)
            h.close()
            raise

        if r is None or r.version == 9:
            # httplib falls back to assuming HTTP 0.9 if it gets a
            # bad header back.  This is most likely to happen if
            # the socket has been closed by the server since we
            # last used the connection.
            if DEBUG:
                DEBUG.info("failed to re-use connection to %s (%d)",
                           host, id(h))
            r = None
        else:
            if DEBUG:
                DEBUG.info("re-using connection to %s (%d)", host, id(h))

        return r

    def _start_transaction(self, h, req):
        """send the request line, headers and body of <req> over <h>

        Requests with data are sent as POST, all others as GET.  Raises
        urllib2.URLError when sending the request line fails with a
        socket.error.
        """
        # What follows mostly reimplements HTTPConnection.request()
        # except it adds self.parent.addheaders in the mix.
        headers = req.headers.copy()
        if sys.version_info >= (2, 4):
            headers.update(req.unredirected_hdrs)
        headers.update(self.parent.addheaders)
        headers = dict((n.lower(), v) for n, v in headers.items())
        # tell putrequest() not to emit the headers we provide ourselves
        skipheaders = {}
        for n in ('host', 'accept-encoding'):
            if n in headers:
                skipheaders['skip_' + n.replace('-', '_')] = 1
        try:
            if req.has_data():
                data = req.get_data()
                h.putrequest('POST', req.get_selector(), **skipheaders)
                if 'content-type' not in headers:
                    h.putheader('Content-type',
                                'application/x-www-form-urlencoded')
                if 'content-length' not in headers:
                    h.putheader('Content-length', '%d' % len(data))
            else:
                h.putrequest('GET', req.get_selector(), **skipheaders)
        except socket.error:
            raise urllib2.URLError(sys.exc_info()[1])
        for k, v in headers.items():
            h.putheader(k, v)
        h.endheaders()
        if req.has_data():
            h.send(data)

355

356

class HTTPHandler(KeepAliveHandler, urllib2.HTTPHandler):
    """urllib2 HTTP handler whose requests go through KeepAliveHandler."""

358

359

class HTTPResponse(httplib.HTTPResponse):

359

class HTTPResponse(httplib.HTTPResponse):

360

# we need to subclass HTTPResponse in order to

360

# we need to subclass HTTPResponse in order to

361

# 1) add readline() and readlines() methods

361

# 1) add readline() and readlines() methods

362

# 2) add close_connection() methods

362

# 2) add close_connection() methods

363

# 3) add info() and geturl() methods

363

# 3) add info() and geturl() methods

364

365

# in order to add readline(), read must be modified to deal with a

365

# in order to add readline(), read must be modified to deal with a

366

# buffer. example: readline must read a buffer and then spit back

366

# buffer. example: readline must read a buffer and then spit back

367

# one line at a time. The only real alternative is to read one

367

# one line at a time. The only real alternative is to read one

368

# BYTE at a time (ick). Once something has been read, it can't be

368

# BYTE at a time (ick). Once something has been read, it can't be

369

# put back (ok, maybe it can, but that's even uglier than this),

369

# put back (ok, maybe it can, but that's even uglier than this),

370

# so if you THEN do a normal read, you must first take stuff from

370

# so if you THEN do a normal read, you must first take stuff from

371

# the buffer.

371

# the buffer.

372

373

# the read method wraps the original to accommodate buffering,

373

# the read method wraps the original to accommodate buffering,

374

# although read() never adds to the buffer.

374

# although read() never adds to the buffer.

375

# Both readline and readlines have been stolen with almost no

375

# Both readline and readlines have been stolen with almost no

376

# modification from socket.py

376

# modification from socket.py

377

378

379

def __init__(self, sock, debuglevel=0, strict=0, method=None):

379

def __init__(self, sock, debuglevel=0, strict=0, method=None):

380

httplib.HTTPResponse.__init__(self, sock, debuglevel, method)

380

httplib.HTTPResponse.__init__(self, sock, debuglevel, method)

381

self.fileno = sock.fileno

381

self.fileno = sock.fileno

382

self.code = None

382

self.code = None

383

self._rbuf = ''

383

self._rbuf = ''

384

self._rbufsize = 8096

384

self._rbufsize = 8096

385

self._handler = None # inserted by the handler later

385

self._handler = None # inserted by the handler later

386

self._host = None # (same)

386

self._host = None # (same)

387

self._url = None # (same)

387

self._url = None # (same)

388

self._connection = None # (same)

388

self._connection = None # (same)

389

390

_raw_read = httplib.HTTPResponse.read

390

_raw_read = httplib.HTTPResponse.read

391

392

def close(self):

392

def close(self):

393

if self.fp:

393

if self.fp:

394

self.fp.close()

394

self.fp.close()

395

self.fp = None

395

self.fp = None

396

if self._handler:

396

if self._handler:

397

self._handler._request_closed(self, self._host,

397

self._handler._request_closed(self, self._host,

398

self._connection)

398

self._connection)

399

400

def close_connection(self):

400

def close_connection(self):

401

self._handler._remove_connection(self._host, self._connection, close=1)

401

self._handler._remove_connection(self._host, self._connection, close=1)

402

self.close()

402

self.close()

403

404

def info(self):

404

def info(self):

405

return self.headers

405

return self.headers

406

407

def geturl(self):

407

def geturl(self):

408

return self._url

408

return self._url

409

410

def read(self, amt=None):

410

def read(self, amt=None):

411

# the _rbuf test is only in this first if for speed. It's not

411

# the _rbuf test is only in this first if for speed. It's not

412

# logically necessary

412

# logically necessary

413

if self._rbuf and not amt is None:

413

if self._rbuf and not amt is None:

414

L = len(self._rbuf)

414

L = len(self._rbuf)

415

if amt > L:

415

if amt > L:

416

amt -= L

416

amt -= L

417

else:

417

else:

418

s = self._rbuf[:amt]

418

s = self._rbuf[:amt]

419

self._rbuf = self._rbuf[amt:]

419

self._rbuf = self._rbuf[amt:]

420

return s

420

return s

421

422

s = self._rbuf + self._raw_read(amt)

422

s = self._rbuf + self._raw_read(amt)

423

self._rbuf = ''

423

self._rbuf = ''

424

return s

424

return s

425

426

# stolen from Python SVN #68532 to fix issue1088

426

# stolen from Python SVN #68532 to fix issue1088

427

def _read_chunked(self, amt):

427

def _read_chunked(self, amt):

428

chunk_left = self.chunk_left

428

chunk_left = self.chunk_left

429

value = ''

429

value = ''

430

431

# XXX This accumulates chunks by repeated string concatenation,

431

# XXX This accumulates chunks by repeated string concatenation,

432

# which is not efficient as the number or size of chunks gets big.

432

# which is not efficient as the number or size of chunks gets big.

433

while True:

433

while True:

434

if chunk_left is None:

434

if chunk_left is None:

435

line = self.fp.readline()

435

line = self.fp.readline()

436

i = line.find(';')

436

i = line.find(';')

437

if i >= 0:

437

if i >= 0:

438

line = line[:i] # strip chunk-extensions

438

line = line[:i] # strip chunk-extensions

439

try:

439

try:

440

chunk_left = int(line, 16)

440

chunk_left = int(line, 16)

441

except ValueError:

441

except ValueError:

442

# close the connection as protocol synchronization is

442

# close the connection as protocol synchronization is

443

# probably lost

443

# probably lost

444

self.close()

444

self.close()

445

raise httplib.IncompleteRead(value)

445

raise httplib.IncompleteRead(value)

446

if chunk_left == 0:

446

if chunk_left == 0:

447

break

447

break

448

if amt is None:

448

if amt is None:

449

value += self._safe_read(chunk_left)

449

value += self._safe_read(chunk_left)

450

elif amt < chunk_left:

450

elif amt < chunk_left:

451

value += self._safe_read(amt)

451

value += self._safe_read(amt)

452

self.chunk_left = chunk_left - amt

452

self.chunk_left = chunk_left - amt

453

return value

453

return value

454

elif amt == chunk_left:

454

elif amt == chunk_left:

455

value += self._safe_read(amt)

455

value += self._safe_read(amt)

456

self._safe_read(2) # toss the CRLF at the end of the chunk

456

self._safe_read(2) # toss the CRLF at the end of the chunk

457

self.chunk_left = None

457

self.chunk_left = None

458

return value

458

return value

459

else:

459

else:

460

value += self._safe_read(chunk_left)

460

value += self._safe_read(chunk_left)

461

amt -= chunk_left

461

amt -= chunk_left

462

463

# we read the whole chunk, get another

463

# we read the whole chunk, get another

464

self._safe_read(2) # toss the CRLF at the end of the chunk

464

self._safe_read(2) # toss the CRLF at the end of the chunk

465

chunk_left = None

465

chunk_left = None

466

467

# read and discard trailer up to the CRLF terminator

467

# read and discard trailer up to the CRLF terminator

468

### note: we shouldn't have any trailers!

468

### note: we shouldn't have any trailers!

469

while True:

469

while True:

470

line = self.fp.readline()

470

line = self.fp.readline()

471

if not line:

471

if not line:

472

# a vanishingly small number of sites EOF without

472

# a vanishingly small number of sites EOF without

473

# sending the trailer

473

# sending the trailer

474

break

474

break

475

if line == '\r\n':

475

if line == '\r\n':

476

break

476

break

477

478

# we read everything; close the "file"

478

# we read everything; close the "file"

479

self.close()

479

self.close()

480

481

return value

481

return value

482

483

def readline(self, limit=-1):
    """Return the next line from the buffered response.

    Data is taken from self._rbuf, which is refilled through
    self._raw_read(self._rbufsize) until it contains a newline, holds at
    least `limit` bytes (when limit is positive), or the raw read returns
    nothing.

    limit -- maximum number of bytes to return; a negative value (the
             default) means "no limit".

    The returned string ends at the first newline (inclusive), at `limit`
    bytes, or at the end of the available data, whichever comes first.
    Whatever is not returned stays in self._rbuf for the next call.
    """
    nl = self._rbuf.find('\n')
    # Refill until we either see a newline or already hold enough bytes
    # to satisfy a positive limit.
    while nl < 0 and not (0 < limit <= len(self._rbuf)):
        new = self._raw_read(self._rbufsize)
        if not new:
            break
        nl = new.find('\n')
        if nl >= 0:
            # translate the offset inside `new` to an offset in the buffer
            nl = nl + len(self._rbuf)
        self._rbuf = self._rbuf + new
    if nl < 0:
        # no newline available: hand out everything we have
        end = len(self._rbuf)
    else:
        # include the newline itself
        end = nl + 1
    # Only truncate when the limit is shorter than the line.  Comparing
    # against the whole buffer length (as was done before) returned data
    # past the newline whenever limit landed beyond the first line.
    if 0 <= limit < end:
        end = limit
    data, self._rbuf = self._rbuf[:end], self._rbuf[end:]
    return data

501

502

def readlines(self, sizehint=0):
    """Collect lines from self.readline() into a list.

    Reading stops when readline() returns an empty value.  When sizehint
    is non-zero, reading also stops as soon as the combined length of the
    collected lines reaches sizehint (the line that crosses the threshold
    is still included).
    """
    lines = []
    nbytes = 0
    line = self.readline()
    while line:
        lines.append(line)
        nbytes += len(line)
        if sizehint and nbytes >= sizehint:
            # enough data gathered; do not consume another line
            break
        line = self.readline()
    return lines

514

515

def safesend(self, str):

515

def safesend(self, str):

516

"""Send `str' to the server.

516

"""Send `str' to the server.

517

518

Shamelessly ripped off from httplib to patch a bad behavior.

518

Shamelessly ripped off from httplib to patch a bad behavior.

519

"""

519

"""

520

# _broken_pipe_resp is an attribute we set in this function

520

# _broken_pipe_resp is an attribute we set in this function

521

# if the socket is closed while we're sending data but

521

# if the socket is closed while we're sending data but

522

# the server sent us a response before hanging up.

522

# the server sent us a response before hanging up.

523

# In that case, we want to pretend to send the rest of the

523

# In that case, we want to pretend to send the rest of the

524

# outgoing data, and then let the user use getresponse()

524

# outgoing data, and then let the user use getresponse()

525

# (which we wrap) to get this last response before

525

# (which we wrap) to get this last response before

526

# opening a new socket.

526

# opening a new socket.

527

if getattr(self, '_broken_pipe_resp', None) is not None:

527

if getattr(self, '_broken_pipe_resp', None) is not None:

528

return

528

return

529

530

if self.sock is None:

530

if self.sock is None:

531

if self.auto_open:

531

if self.auto_open:

532

self.connect()

532

self.connect()

533

else:

533

else:

534

raise httplib.NotConnected

534

raise httplib.NotConnected

535

536

# send the data to the server. if we get a broken pipe, then close

536

# send the data to the server. if we get a broken pipe, then close

537

# the socket. we want to reconnect when somebody tries to send again.

537

# the socket. we want to reconnect when somebody tries to send again.

538

#

538

#

539

# NOTE: we DO propagate the error, though, because we cannot simply

539

# NOTE: we DO propagate the error, though, because we cannot simply

540

# ignore the error... the caller will know if they can retry.

540

# ignore the error... the caller will know if they can retry.

541

if self.debuglevel > 0:

541

if self.debuglevel > 0:

542

print "send:", repr(str)

542

print "send:", repr(str)

543

try:

543

try:

544

blocksize = 8192

544

blocksize = 8192

545

read = getattr(str, 'read', None)

545

read = getattr(str, 'read', None)

546

if read is not None:

546

if read is not None:

547

if self.debuglevel > 0:

547

if self.debuglevel > 0:

548

print "sending a read()able"

548

print "sending a read()able"

549

data = read(blocksize)

549

data = read(blocksize)

550

while data:

550

while data:

551

self.sock.sendall(data)

551

self.sock.sendall(data)

552

data = read(blocksize)

552

data = read(blocksize)

553

else:

553

else:

554

self.sock.sendall(str)

554

self.sock.sendall(str)

555

except socket.error, v:

555

except socket.error, v:

556

reraise = True

556

reraise = True

557

if v[0] == errno.EPIPE: # Broken pipe

557

if v[0] == errno.EPIPE: # Broken pipe

558

if self._HTTPConnection__state == httplib._CS_REQ_SENT:

558

if self._HTTPConnection__state == httplib._CS_REQ_SENT:

559

self._broken_pipe_resp = None

559

self._broken_pipe_resp = None

560

self._broken_pipe_resp = self.getresponse()

560

self._broken_pipe_resp = self.getresponse()

561

reraise = False

561

reraise = False

562

self.close()

562

self.close()

563

if reraise:

563

if reraise:

564

raise

564

raise

565

566

def wrapgetresponse(cls):
    """Wraps getresponse in cls with a broken-pipe sane version.

    Returns a function suitable for use as a getresponse() method.  It
    hands back the object stored in the instance's _broken_pipe_resp
    attribute when that attribute is set to something other than None,
    and otherwise delegates to cls.getresponse().  The returned function
    carries the docstring of cls.getresponse.
    """
    def safegetresponse(self):
        # safesend() may have stored a response in _broken_pipe_resp; in
        # that case the socket has already been closed and the stored
        # response is all there is to return.
        saved = getattr(self, '_broken_pipe_resp', None)
        if saved is None:
            # normal response path
            return cls.getresponse(self)
        return saved
    safegetresponse.__doc__ = cls.getresponse.__doc__
    return safegetresponse

580

581

class HTTPConnection(httplib.HTTPConnection):
    """httplib.HTTPConnection with the keepalive-specific overrides.

    Sending goes through safesend(), responses are fetched through the
    broken-pipe aware wrapper built by wrapgetresponse(), and responses
    are instances of the modified HTTPResponse class.
    """
    send = safesend
    getresponse = wrapgetresponse(httplib.HTTPConnection)
    # use the modified response class
    response_class = HTTPResponse

586

587

588

#########################################################################

588

#########################################################################

589

##### TEST FUNCTIONS

589

##### TEST FUNCTIONS

590

#########################################################################

590

#########################################################################

591

592

def error_handler(url):

592

def error_handler(url):

593

global HANDLE_ERRORS

593

global HANDLE_ERRORS

594

orig = HANDLE_ERRORS

594

orig = HANDLE_ERRORS

595

keepalive_handler = HTTPHandler()

595

keepalive_handler = HTTPHandler()

596

opener = urllib2.build_opener(keepalive_handler)

596

opener = urllib2.build_opener(keepalive_handler)

597

urllib2.install_opener(opener)

597

urllib2.install_opener(opener)

598

pos = {0: 'off', 1: 'on'}

598

pos = {0: 'off', 1: 'on'}

599

for i in (0, 1):

599

for i in (0, 1):

600

print " fancy error handling %s (HANDLE_ERRORS = %i)" % (pos[i], i)

600

print " fancy error handling %s (HANDLE_ERRORS = %i)" % (pos[i], i)

601

HANDLE_ERRORS = i

601

HANDLE_ERRORS = i

602

try:

602

try:

603

fo = urllib2.urlopen(url)

603

fo = urllib2.urlopen(url)

604

fo.read()

604

fo.read()

605

fo.close()

605

fo.close()

606

try:

606

try:

607

status, reason = fo.status, fo.reason

607

status, reason = fo.status, fo.reason

608

except AttributeError:

608

except AttributeError:

609

status, reason = None, None

609

status, reason = None, None

610

except IOError, e:

610

except IOError, e:

611

print " EXCEPTION: %s" % e

611

print " EXCEPTION: %s" % e

612

raise

612

raise

613

else:

613

else:

614

print " status = %s, reason = %s" % (status, reason)

614

print " status = %s, reason = %s" % (status, reason)

615

HANDLE_ERRORS = orig

615

HANDLE_ERRORS = orig

616

hosts = keepalive_handler.open_connections()

616

hosts = keepalive_handler.open_connections()

617

print "open connections:", hosts

617

print "open connections:", hosts

618

keepalive_handler.close_all()

618

keepalive_handler.close_all()

619

620

def md5(s):
    """Return an md5 hash object initialised with s.

    The first call imports the real constructor (hashlib.md5, falling
    back to the legacy md5 module when hashlib is missing) and rebinds
    the module-level name `md5` to it, so later lookups of that name
    reach the real constructor directly.
    """
    global md5
    try:
        from hashlib import md5 as _impl
    except ImportError:
        from md5 import md5 as _impl
    # replace this stub with the real constructor
    md5 = _impl
    return _impl(s)

628

629

def continuity(url):

629

def continuity(url):

630

format = '%25s: %s'

630

format = '%25s: %s'

631

632

# first fetch the file with the normal http handler

632

# first fetch the file with the normal http handler

633

opener = urllib2.build_opener()

633

opener = urllib2.build_opener()

634

urllib2.install_opener(opener)

634

urllib2.install_opener(opener)

635

fo = urllib2.urlopen(url)

635

fo = urllib2.urlopen(url)

636

foo = fo.read()

636

foo = fo.read()

637

fo.close()

637

fo.close()

638

m = md5.~~new~~(foo)

638

m = md5(foo)

639

print format % ('normal urllib', m.hexdigest())

639

print format % ('normal urllib', m.hexdigest())

640

641

# now install the keepalive handler and try again

641

# now install the keepalive handler and try again

642

opener = urllib2.build_opener(HTTPHandler())

642

opener = urllib2.build_opener(HTTPHandler())

643

urllib2.install_opener(opener)

643

urllib2.install_opener(opener)

644

645

fo = urllib2.urlopen(url)

645

fo = urllib2.urlopen(url)

646

foo = fo.read()

646

foo = fo.read()

647

fo.close()

647

fo.close()

648

m = md5.~~new~~(foo)

648

m = md5(foo)

649

print format % ('keepalive read', m.hexdigest())

649

print format % ('keepalive read', m.hexdigest())

650

651

fo = urllib2.urlopen(url)

651

fo = urllib2.urlopen(url)

652

foo = ''

652

foo = ''

653

while True:

653

while True:

654

f = fo.readline()

654

f = fo.readline()

655

if f:

655

if f:

656

foo = foo + f

656

foo = foo + f

657

else: break

657

else: break

658

fo.close()

658

fo.close()

659

m = md5.~~new~~(foo)

659

m = md5(foo)

660

print format % ('keepalive readline', m.hexdigest())

660

print format % ('keepalive readline', m.hexdigest())

661

662

def comp(N, url):

662

def comp(N, url):

663

print ' making %i connections to:\n %s' % (N, url)

663

print ' making %i connections to:\n %s' % (N, url)

664

665

sys.stdout.write(' first using the normal urllib handlers')

665

sys.stdout.write(' first using the normal urllib handlers')

666

# first use normal opener

666

# first use normal opener

667

opener = urllib2.build_opener()

667

opener = urllib2.build_opener()

668

urllib2.install_opener(opener)

668

urllib2.install_opener(opener)

669

t1 = fetch(N, url)

669

t1 = fetch(N, url)

670

print ' TIME: %.3f s' % t1

670

print ' TIME: %.3f s' % t1

671

672

sys.stdout.write(' now using the keepalive handler ')

672

sys.stdout.write(' now using the keepalive handler ')

673

# now install the keepalive handler and try again

673

# now install the keepalive handler and try again

674

opener = urllib2.build_opener(HTTPHandler())

674

opener = urllib2.build_opener(HTTPHandler())

675

urllib2.install_opener(opener)

675

urllib2.install_opener(opener)

676

t2 = fetch(N, url)

676

t2 = fetch(N, url)

677

print ' TIME: %.3f s' % t2

677

print ' TIME: %.3f s' % t2

678

print ' improvement factor: %.2f' % (t1 / t2)

678

print ' improvement factor: %.2f' % (t1 / t2)

679

680

def fetch(N, url, delay=0):
    """Fetch url N times and return the elapsed wall-clock time.

    N     -- number of fetches to perform
    url   -- address passed to urllib2.urlopen()
    delay -- seconds to sleep before every fetch except the first
             (no sleeping when it is 0)

    After the fetches, a warning is printed for every read whose length
    differs from that of the first read.  The returned value is the
    time.time() difference measured around the fetch loop only.
    """
    import time
    sizes = []
    began = time.time()
    for attempt in range(N):
        if attempt > 0 and delay:
            time.sleep(delay)
        fo = urllib2.urlopen(url)
        body = fo.read()
        fo.close()
        sizes.append(len(body))
    elapsed = time.time() - began

    # compare every later read against the first one
    for idx, size in enumerate(sizes[1:]):
        if size != sizes[0]:
            print("WARNING: inconsistent length on read %i: %i"
                  % (idx + 1, size))

    return elapsed

700

701

def test_timeout(url):

701

def test_timeout(url):

702

global DEBUG

702

global DEBUG

703

dbbackup = DEBUG

703

dbbackup = DEBUG

704

class FakeLogger(object):

704

class FakeLogger(object):

705

def debug(self, msg, *args):

705

def debug(self, msg, *args):

706

print msg % args

706

print msg % args

707

info = warning = error = debug

707

info = warning = error = debug

708

DEBUG = FakeLogger()

708

DEBUG = FakeLogger()

709

print " fetching the file to establish a connection"

709

print " fetching the file to establish a connection"

710

fo = urllib2.urlopen(url)

710

fo = urllib2.urlopen(url)

711

data1 = fo.read()

711

data1 = fo.read()

712

fo.close()

712

fo.close()

713

714

i = 20

714

i = 20

715

print " waiting %i seconds for the server to close the connection" % i

715

print " waiting %i seconds for the server to close the connection" % i

716

while i > 0:

716

while i > 0:

717

sys.stdout.write('\r %2i' % i)

717

sys.stdout.write('\r %2i' % i)

718

sys.stdout.flush()

718

sys.stdout.flush()

719

time.sleep(1)

719

time.sleep(1)

720

i -= 1

720

i -= 1

721

sys.stderr.write('\r')

721

sys.stderr.write('\r')

722

723

print " fetching the file a second time"

723

print " fetching the file a second time"

724

fo = urllib2.urlopen(url)

724

fo = urllib2.urlopen(url)

725

data2 = fo.read()

725

data2 = fo.read()

726

fo.close()

726

fo.close()

727

728

if data1 == data2:

728

if data1 == data2:

729

print ' data are identical'

729

print ' data are identical'

730

else:

730

else:

731

print ' ERROR: DATA DIFFER'

731

print ' ERROR: DATA DIFFER'

732

733

DEBUG = dbbackup

733

DEBUG = dbbackup

734

735

736

def test(url, N=10):

736

def test(url, N=10):

737

print "checking error handler (do this on a non-200)"

737

print "checking error handler (do this on a non-200)"

738

try: error_handler(url)

738

try: error_handler(url)

739

except IOError:

739

except IOError:

740

print "exiting - exception will prevent further tests"

740

print "exiting - exception will prevent further tests"

741

sys.exit()

741

sys.exit()

742

print

742

print

743

print "performing continuity test (making sure stuff isn't corrupted)"

743

print "performing continuity test (making sure stuff isn't corrupted)"

744

continuity(url)

744

continuity(url)

745

print

745

print

746

print "performing speed comparison"

746

print "performing speed comparison"

747

comp(N, url)

747

comp(N, url)

748

print

748

print

749

print "performing dropped-connection check"

749

print "performing dropped-connection check"

750

test_timeout(url)

750

test_timeout(url)

751

752

if __name__ == '__main__':

752

if __name__ == '__main__':

753

import time

753

import time

754

import sys

754

import sys

755

try:

755

try:

756

N = int(sys.argv[1])

756

N = int(sys.argv[1])

757

url = sys.argv[2]

757

url = sys.argv[2]

758

except (IndexError, ValueError):

758

except (IndexError, ValueError):

759

print "%s <integer> <url>" % sys.argv[0]

759

print "%s <integer> <url>" % sys.argv[0]

760

else:

760

else:

761

test(url, N)

761

test(url, N)

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

             #   This library is free software; you can redistribute it and/or
             #   modify it under the terms of the GNU Lesser General Public
             #   License as published by the Free Software Foundation; either
             #   version 2.1 of the License, or (at your option) any later version.
             #
             #   This library is distributed in the hope that it will be useful,
             #   but WITHOUT ANY WARRANTY; without even the implied warranty of
             #   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
             #   Lesser General Public License for more details.
             #
             #   You should have received a copy of the GNU Lesser General Public
             #   License along with this library; if not, see
             #   <http://www.gnu.org/licenses/>.
             # This file is part of urlgrabber, a high-level cross-protocol url-grabber
             # Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
             # Modified by Benoit Boissinot:
             #  - fix for digest auth (inspired from urllib2.py @ Python v2.4)
             # Modified by Dirkjan Ochtman:
             #  - import md5 function from a local util module
             # Modified by Martin Geisler:
             #  - moved md5 function from local util module to this module
             # Modified by Augie Fackler:
             #  - add safesend method and use it to prevent broken pipe errors
             #    on large POST requests
             """An HTTP handler for urllib2 that supports HTTP 1.1 and keepalive.
             >>> import urllib2
             >>> from keepalive import HTTPHandler
             >>> keepalive_handler = HTTPHandler()
             >>> opener = urllib2.build_opener(keepalive_handler)
             >>> urllib2.install_opener(opener)
             >>>
             >>> fo = urllib2.urlopen('http://www.python.org')
             If a connection to a given host is requested, and all of the existing
             connections are still in use, another connection will be opened.  If
             the handler tries to use an existing connection but it fails in some
             way, it will be closed and removed from the pool.
             To remove the handler, simply re-run build_opener with no arguments, and
             install that opener.
             You can explicitly close connections by using the close_connection()
             method of the returned file-like object (described below) or you can
             use the handler methods:
               close_connection(host)
               close_all()
               open_connections()
             NOTE: using the close_connection and close_all methods of the handler
             should be done with care when using multiple threads.
               * there is nothing that prevents another thread from creating new
                 connections immediately after connections are closed
               * no checks are done to prevent in-use connections from being closed
             >>> keepalive_handler.close_all()
             EXTRA ATTRIBUTES AND METHODS
               Upon a status of 200, the object returned has a few additional
               attributes and methods, which should not be used if you want to
               remain consistent with the normal urllib2-returned objects:
                 close_connection()  -  close the connection to the host
                 readlines()         -  you know, readlines()
                 status              -  the return status (i.e. 404)
                 reason              -  english translation of status (i.e. 'File not found')
               If you want the best of both worlds, use this inside an
               AttributeError-catching try:
               >>> try: status = fo.status
               >>> except AttributeError: status = None
               Unfortunately, these are ONLY there if status == 200, so it's not
               easy to distinguish between non-200 responses.  The reason is that
               urllib2 tries to do clever things with error codes 301, 302, 401,
               and 407, and it wraps the object upon return.
               For python versions earlier than 2.4, you can avoid this fancy error
               handling by setting the module-level global HANDLE_ERRORS to zero.
               You see, prior to 2.4, it's the HTTP Handler's job to determine what
               to handle specially, and what to just pass up.  HANDLE_ERRORS == 0
               means "pass everything up".  In python 2.4, however, this job no
               longer belongs to the HTTP Handler and is now done by a NEW handler,
               HTTPErrorProcessor.  Here's the bottom line:
                 python version < 2.4
                     HANDLE_ERRORS == 1  (default) pass up 200, treat the rest as
                                         errors
                     HANDLE_ERRORS == 0  pass everything up, error processing is
                                         left to the calling code
                 python version >= 2.4
                     HANDLE_ERRORS == 1  pass up 200, treat the rest as errors
                     HANDLE_ERRORS == 0  (default) pass everything up, let the
                                         other handlers (specifically,
                                         HTTPErrorProcessor) decide what to do
               In practice, setting the variable either way makes little difference
               in python 2.4, so for the most consistent behavior across versions,
               you probably just want to use the defaults, which will give you
               exceptions on errors.
             """
             # $Id: keepalive.py,v 1.14 2006/04/04 21:00:32 mstenner Exp $
             import errno
             import httplib
             import socket
             import thread
             import urllib2
             # Optional logger-like object.  When it is set, the handler calls its
             # info() and error() methods to trace connection creation and reuse;
             # None disables that tracing.
             DEBUG = None
             import sys
             # The default error-handling mode depends on the interpreter version;
             # see the HANDLE_ERRORS table in the module docstring.
             if sys.version_info < (2, 4):
                 HANDLE_ERRORS = 1
             else: HANDLE_ERRORS = 0
             class ConnectionManager(object):
                 """Registry of connections, grouped by host, with a ready flag each.

                 The connection manager must be able to:
                   * keep track of all existing connections, per host
                   * remember whether each connection is ready (idle) or in use
                   * hand out a ready connection for a host and mark it as in use
                 """

                 def __init__(self):
                     self._lock = thread.allocate_lock()
                     self._hostmap = {} # map hosts to a list of connections
                     self._connmap = {} # map connections to host
                     self._readymap = {} # map connection to ready state

                 def add(self, host, connection, ready):
                     """Register connection under host with the given ready state."""
                     self._lock.acquire()
                     try:
                         if host not in self._hostmap:
                             self._hostmap[host] = []
                         self._hostmap[host].append(connection)
                         self._connmap[connection] = host
                         self._readymap[connection] = ready
                     finally:
                         self._lock.release()

                 def remove(self, connection):
                     """Forget connection; unknown connections are ignored."""
                     self._lock.acquire()
                     try:
                         try:
                             host = self._connmap[connection]
                         except KeyError:
                             pass
                         else:
                             del self._connmap[connection]
                             del self._readymap[connection]
                             self._hostmap[host].remove(connection)
                             if not self._hostmap[host]: del self._hostmap[host]
                     finally:
                         self._lock.release()

                 def set_ready(self, connection, ready):
                     """Update the ready state of a connection that is still tracked.

                     Connections that were already removed are ignored.  A plain
                     dict assignment never raises KeyError, so without the explicit
                     membership test a removed connection would be re-inserted into
                     the ready map and kept alive forever.
                     """
                     self._lock.acquire()
                     try:
                         if connection in self._readymap:
                             self._readymap[connection] = ready
                     finally:
                         self._lock.release()

                 def get_ready_conn(self, host):
                     """Return a ready connection to host, or None if there is none.

                     The returned connection is marked as in use before the lock is
                     released, so it is not handed out twice.
                     """
                     conn = None
                     self._lock.acquire()
                     try:
                         if host in self._hostmap:
                             for c in self._hostmap[host]:
                                 if self._readymap[c]:
                                     self._readymap[c] = 0
                                     conn = c
                                     break
                     finally:
                         self._lock.release()
                     return conn

                 def get_all(self, host=None):
                     """Return a snapshot of the tracked connections.

                     With a host, return a list of that host's connections; without
                     one, return a dict mapping each host to a list of connections.
                     The lists are copies, so callers may call remove() while
                     iterating over them.
                     """
                     if host:
                         return list(self._hostmap.get(host, []))
                     else:
                         return dict((h, list(conns))
                                     for h, conns in self._hostmap.items())
             class KeepAliveHandler(object):
                 """Handler mix-in that reuses open connections on a per-host basis.

                 Connections are tracked by a ConnectionManager.  A request first
                 tries the ready connections to its host and only opens a new one
                 when none of them works.
                 """

                 def __init__(self):
                     self._cm = ConnectionManager()

                 #### Connection Management
                 def open_connections(self):
                     """return a list of connected hosts and the number of connections
                     to each.  [('foo.com:80', 2), ('bar.org', 1)]"""
                     return [(host, len(li)) for (host, li) in self._cm.get_all().items()]

                 def close_connection(self, host):
                     """close connection(s) to <host>
                     host is the host:port spec, as in 'www.cnn.com:8080' as passed in.
                     no error occurs if there is no connection to that host."""
                     for h in self._cm.get_all(host):
                         self._cm.remove(h)
                         h.close()

                 def close_all(self):
                     """close all open connections"""
                     for host, conns in self._cm.get_all().iteritems():
                         # Iterate over a copy: remove() deletes from the manager's
                         # own per-host list, and mutating a list while looping
                         # over it would skip every other connection.
                         for h in list(conns):
                             self._cm.remove(h)
                             h.close()

                 def _request_closed(self, request, host, connection):
                     """tells us that this request is now closed and that the
                     connection is ready for another request"""
                     self._cm.set_ready(connection, 1)

                 def _remove_connection(self, host, connection, close=0):
                     """Stop tracking connection, closing it first if close is true."""
                     if close:
                         connection.close()
                     self._cm.remove(connection)

                 #### Transaction Execution
                 def http_open(self, req):
                     """Open req over HTTP using the keepalive-aware HTTPConnection."""
                     return self.do_open(HTTPConnection, req)

                 def do_open(self, http_class, req):
                     """Send req, reusing a ready connection when possible.

                     Returns the response object, decorated with the urllib2-style
                     attributes code, headers and msg, or whatever
                     self.parent.error() returns when HANDLE_ERRORS is set and the
                     status is not 200.  Raises urllib2.URLError when no host is
                     given or when a new connection fails.
                     """
                     host = req.get_host()
                     if not host:
                         raise urllib2.URLError('no host given')
                     try:
                         h = self._cm.get_ready_conn(host)
                         while h:
                             r = self._reuse_connection(h, req, host)
                             # if this response is non-None, then it worked and we're
                             # done.  Break out, skipping the else block.
                             if r:
                                 break
                             # connection is bad - possibly closed by server
                             # discard it and ask for the next free connection
                             h.close()
                             self._cm.remove(h)
                             h = self._cm.get_ready_conn(host)
                         else:
                             # no (working) free connections were found.  Create a new one.
                             h = http_class(host)
                             if DEBUG:
                                 DEBUG.info("creating new connection to %s (%d)",
                                            host, id(h))
                             self._cm.add(host, h, 0)
                             self._start_transaction(h, req)
                             r = h.getresponse()
                     except (socket.error, httplib.HTTPException), err:
                         raise urllib2.URLError(err)
                     # if not a persistent connection, don't try to reuse it
                     if r.will_close:
                         self._cm.remove(h)
                     if DEBUG:
                         DEBUG.info("STATUS: %s, %s", r.status, r.reason)
                     # give the response what it needs to report back to us on close
                     r._handler = self
                     r._host = host
                     r._url = req.get_full_url()
                     r._connection = h
                     r.code = r.status
                     r.headers = r.msg
                     r.msg = r.reason
                     if r.status == 200 or not HANDLE_ERRORS:
                         return r
                     else:
                         return self.parent.error('http', req, r,
                                                  r.status, r.msg, r.headers)

                 def _reuse_connection(self, h, req, host):
                     """start the transaction with a re-used connection
                     return a response object (r) upon success or None on failure.
                     This DOES not close or remove bad connections in cases where
                     it returns.  However, if an unexpected exception occurs, it
                     will close and remove the connection before re-raising.
                     """
                     try:
                         self._start_transaction(h, req)
                         r = h.getresponse()
                         # note: just because we got something back doesn't mean it
                         # worked.  We'll check the version below, too.
                     except (socket.error, httplib.HTTPException):
                         r = None
                     except: # re-raises
                         # adding this block just in case we've missed
                         # something we will still raise the exception, but
                         # lets try and close the connection and remove it
                         # first.  We previously got into a nasty loop
                         # where an exception was uncaught, and so the
                         # connection stayed open.  On the next try, the
                         # same exception was raised, etc.  The trade-off is
                         # that it's now possible this call will raise
                         # a DIFFERENT exception
                         if DEBUG:
                             DEBUG.error("unexpected exception - closing "
                                         "connection to %s (%d)", host, id(h))
                         self._cm.remove(h)
                         h.close()
                         raise
                     if r is None or r.version == 9:
                         # httplib falls back to assuming HTTP 0.9 if it gets a
                         # bad header back.  This is most likely to happen if
                         # the socket has been closed by the server since we
                         # last used the connection.
                         if DEBUG:
                             DEBUG.info("failed to re-use connection to %s (%d)",
                                        host, id(h))
                         r = None
                     else:
                         if DEBUG:
                             DEBUG.info("re-using connection to %s (%d)", host, id(h))
                     return r

                 def _start_transaction(self, h, req):
                     """Write the request line, headers and any body of req to h.

                     Raises urllib2.URLError if the request line cannot be sent
                     because of a socket error.
                     """
                     # What follows mostly reimplements HTTPConnection.request()
                     # except it adds self.parent.addheaders in the mix.
                     headers = req.headers.copy()
                     if sys.version_info >= (2, 4):
                         headers.update(req.unredirected_hdrs)
                     headers.update(self.parent.addheaders)
                     headers = dict((n.lower(), v) for n, v in headers.items())
                     # ask putrequest not to emit headers that we supply ourselves
                     skipheaders = {}
                     for n in ('host', 'accept-encoding'):
                         if n in headers:
                             skipheaders['skip_' + n.replace('-', '_')] = 1
                     try:
                         if req.has_data():
                             data = req.get_data()
                             h.putrequest('POST', req.get_selector(), **skipheaders)
                             if 'content-type' not in headers:
                                 h.putheader('Content-type',
                                             'application/x-www-form-urlencoded')
                             if 'content-length' not in headers:
                                 h.putheader('Content-length', '%d' % len(data))
                         else:
                             h.putrequest('GET', req.get_selector(), **skipheaders)
                     except (socket.error), err:
                         raise urllib2.URLError(err)
                     for k, v in headers.items():
                         h.putheader(k, v)
                     h.endheaders()
                     if req.has_data():
                         h.send(data)
             # urllib2 HTTP handler with keepalive support.  KeepAliveHandler is listed
             # first among the bases, so its http_open/do_open take precedence over the
             # ones inherited from urllib2.HTTPHandler.
             class HTTPHandler(KeepAliveHandler, urllib2.HTTPHandler):
                 pass
             class HTTPResponse(httplib.HTTPResponse):
                 """httplib response with buffering, readline support and keepalive hooks."""
                 # we need to subclass HTTPResponse in order to
                 # 1) add readline() and readlines() methods
                 # 2) add close_connection() methods
                 # 3) add info() and geturl() methods
                 # in order to add readline(), read must be modified to deal with a
                 # buffer.  example: readline must read a buffer and then spit back
                 # one line at a time.  The only real alternative is to read one
                 # BYTE at a time (ick).  Once something has been read, it can't be
                 # put back (ok, maybe it can, but that's even uglier than this),
                 # so if you THEN do a normal read, you must first take stuff from
                 # the buffer.
                 # the read method wraps the original to accommodate buffering,
                 # although read() never adds to the buffer.
                 # Both readline and readlines have been stolen with almost no
                 # modification from socket.py

                 def __init__(self, sock, debuglevel=0, strict=0, method=None):
                     # Pass everything by keyword.  The third positional parameter
                     # of httplib.HTTPResponse.__init__ is ``strict``, so the old
                     # positional call put ``method`` into that slot and never
                     # forwarded ``method`` at all.  Since the method name is a
                     # non-empty string, that call effectively always enabled
                     # strict parsing; keep that behaviour explicitly.
                     httplib.HTTPResponse.__init__(self, sock, debuglevel=debuglevel,
                                                   strict=True, method=method)
                     self.fileno = sock.fileno
                     self.code = None
                     self._rbuf = ''
                     self._rbufsize = 8096
                     self._handler = None # inserted by the handler later
                     self._host = None    # (same)
                     self._url = None     # (same)
                     self._connection = None # (same)

                 _raw_read = httplib.HTTPResponse.read

                 def close(self):
                     """Close the response and tell the handler the request is done."""
                     if self.fp:
                         self.fp.close()
                         self.fp = None
                         if self._handler:
                             self._handler._request_closed(self, self._host,
                                                           self._connection)

                 def close_connection(self):
                     """Close the underlying connection and stop tracking it."""
                     self._handler._remove_connection(self._host, self._connection, close=1)
                     self.close()

                 def info(self):
                     """Return the response headers."""
                     return self.headers

                 def geturl(self):
                     """Return the URL this response was fetched from."""
                     return self._url

                 def read(self, amt=None):
                     """Read up to amt bytes (everything if amt is None).

                     Data left in the line buffer by readline() is returned first.
                     """
                     # the _rbuf test is only in this first if for speed.  It's not
                     # logically necessary
                     if self._rbuf and amt is not None:
                         L = len(self._rbuf)
                         if amt > L:
                             amt -= L
                         else:
                             s = self._rbuf[:amt]
                             self._rbuf = self._rbuf[amt:]
                             return s
                     s = self._rbuf + self._raw_read(amt)
                     self._rbuf = ''
                     return s

                 # stolen from Python SVN #68532 to fix issue1088
                 def _read_chunked(self, amt):
                     chunk_left = self.chunk_left
                     value = ''
                     # XXX This accumulates chunks by repeated string concatenation,
                     # which is not efficient as the number or size of chunks gets big.
                     while True:
                         if chunk_left is None:
                             line = self.fp.readline()
                             i = line.find(';')
                             if i >= 0:
                                 line = line[:i] # strip chunk-extensions
                             try:
                                 chunk_left = int(line, 16)
                             except ValueError:
                                 # close the connection as protocol synchronization is
                                 # probably lost
                                 self.close()
                                 raise httplib.IncompleteRead(value)
                             if chunk_left == 0:
                                 break
                         if amt is None:
                             value += self._safe_read(chunk_left)
                         elif amt < chunk_left:
                             value += self._safe_read(amt)
                             self.chunk_left = chunk_left - amt
                             return value
                         elif amt == chunk_left:
                             value += self._safe_read(amt)
                             self._safe_read(2)  # toss the CRLF at the end of the chunk
                             self.chunk_left = None
                             return value
                         else:
                             value += self._safe_read(chunk_left)
                             amt -= chunk_left
                         # we read the whole chunk, get another
                         self._safe_read(2)      # toss the CRLF at the end of the chunk
                         chunk_left = None
                     # read and discard trailer up to the CRLF terminator
                     ### note: we shouldn't have any trailers!
                     while True:
                         line = self.fp.readline()
                         if not line:
                             # a vanishingly small number of sites EOF without
                             # sending the trailer
                             break
                         if line == '\r\n':
                             break
                     # we read everything; close the "file"
                     self.close()
                     return value

                 def readline(self, limit=-1):
                     """Return the next line, including its newline if one was read.

                     A non-negative limit caps the number of characters returned;
                     the remainder stays in the buffer for the next read.
                     """
                     i = self._rbuf.find('\n')
                     # fill the buffer until it holds a newline or limit characters
                     while i < 0 and not (0 < limit <= len(self._rbuf)):
                         new = self._raw_read(self._rbufsize)
                         if not new:
                             break
                         i = new.find('\n')
                         if i >= 0:
                             i = i + len(self._rbuf)
                         self._rbuf = self._rbuf + new
                     if i < 0:
                         i = len(self._rbuf)
                     else:
                         i = i + 1
                     # Only truncate when the line is longer than limit.  Comparing
                     # limit with the whole buffer length returned data beyond the
                     # newline whenever the buffer held more than one line.
                     if 0 <= limit < i:
                         i = limit
                     data, self._rbuf = self._rbuf[:i], self._rbuf[i:]
                     return data

                 def readlines(self, sizehint=0):
                     """Return the remaining lines as a list.

                     A non-zero sizehint stops reading once at least that many
                     characters have been collected.
                     """
                     total = 0
                     lines = []
                     while True:
                         line = self.readline()
                         if not line:
                             break
                         lines.append(line)
                         total += len(line)
                         if sizehint and total >= sizehint:
                             break
                     return lines
             def safesend(self, str):
                 """Send `str' to the server.
                 Shamelessly ripped off from httplib to patch a bad behavior.
                 """
                 # _broken_pipe_resp is an attribute we set in this function
                 # if the socket is closed while we're sending data but
                 # the server sent us a response before hanging up.
                 # In that case, we want to pretend to send the rest of the
                 # outgoing data, and then let the user use getresponse()
                 # (which we wrap) to get this last response before
                 # opening a new socket.
                 if getattr(self, '_broken_pipe_resp', None) is not None:
                     return
                 if self.sock is None:
                     if self.auto_open:
                         self.connect()
                     else:
                         raise httplib.NotConnected
                 # send the data to the server. if we get a broken pipe, then close
                 # the socket. we want to reconnect when somebody tries to send again.
                 #
                 # NOTE: we DO propagate the error, though, because we cannot simply
                 #       ignore the error... the caller will know if they can retry.
                 if self.debuglevel > 0:
                     print "send:", repr(str)
                 try:
                     blocksize = 8192
                     read = getattr(str, 'read', None)
                     if read is not None:
                         if self.debuglevel > 0:
                             print "sending a read()able"
                         data = read(blocksize)
                         while data:
                             self.sock.sendall(data)
                             data = read(blocksize)
                     else:
                         self.sock.sendall(str)
                 except socket.error, v:
                     reraise = True
                     if v[0] == errno.EPIPE:      # Broken pipe
                         if self._HTTPConnection__state == httplib._CS_REQ_SENT:
                             self._broken_pipe_resp = None
                             self._broken_pipe_resp = self.getresponse()
                             reraise = False
                         self.close()
                     if reraise:
                         raise
             def wrapgetresponse(cls):
                 """Build a getresponse() for cls that copes with broken pipes.

                 The returned function carries the docstring of cls.getresponse.
                 """
                 def safegetresponse(self):
                     # safesend() may have stored a response in _broken_pipe_resp.
                     # The socket is already closed in that case, so hand the
                     # stored response back; otherwise take the normal path.
                     saved = getattr(self, '_broken_pipe_resp', None)
                     if saved is None:
                         return cls.getresponse(self)
                     return saved
                 safegetresponse.__doc__ = cls.getresponse.__doc__
                 return safegetresponse
             # httplib connection used by the keepalive handler: it produces the
             # buffered HTTPResponse defined in this module and swaps in the
             # broken-pipe tolerant send/getresponse pair.
             class HTTPConnection(httplib.HTTPConnection):
                 # use the modified response class
                 response_class = HTTPResponse
                 # safesend may stash a response in _broken_pipe_resp; the wrapped
                 # getresponse returns that stashed response when it is set
                 send = safesend
                 getresponse = wrapgetresponse(httplib.HTTPConnection)
             #########################################################################
             #####   TEST FUNCTIONS
             #########################################################################
             def error_handler(url):
                 """Fetch url once with HANDLE_ERRORS off and once with it on.

                 Prints the status and reason of each fetch, then the open
                 connections, and closes them.  An IOError raised by a fetch is
                 printed and re-raised.  The module-level HANDLE_ERRORS is put
                 back to its previous value in every case.
                 """
                 global HANDLE_ERRORS
                 orig = HANDLE_ERRORS
                 keepalive_handler = HTTPHandler()
                 opener = urllib2.build_opener(keepalive_handler)
                 urllib2.install_opener(opener)
                 pos = {0: 'off', 1: 'on'}
                 try:
                     for i in (0, 1):
                         print "  fancy error handling %s (HANDLE_ERRORS = %i)" % (pos[i], i)
                         HANDLE_ERRORS = i
                         try:
                             fo = urllib2.urlopen(url)
                             fo.read()
                             fo.close()
                             # status/reason are only present on some responses
                             try:
                                 status, reason = fo.status, fo.reason
                             except AttributeError:
                                 status, reason = None, None
                         except IOError, e:
                             print "  EXCEPTION: %s" % e
                             raise
                         else:
                             print "  status = %s, reason = %s" % (status, reason)
                 finally:
                     # restore the global even when the IOError above is re-raised
                     HANDLE_ERRORS = orig
                 hosts = keepalive_handler.open_connections()
                 print "open connections:", hosts
                 keepalive_handler.close_all()
             def md5(s):
                 """Return an md5 hash object for s.

                 The first call locates the implementation (hashlib when it is
                 available, the old md5 module otherwise) and rebinds the
                 module-level name md5 to it, so later calls go to it directly.
                 """
                 global md5
                 try:
                     from hashlib import md5 as hasher
                 except ImportError:
                     from md5 import md5 as hasher
                 md5 = hasher
                 return hasher(s)
             def continuity(url):
                 format = '%25s: %s'
                 # first fetch the file with the normal http handler
                 opener = urllib2.build_opener()
                 urllib2.install_opener(opener)
                 fo = urllib2.urlopen(url)
                 foo = fo.read()
                 fo.close()
-                m = md5.new(foo)
+                m = md5(foo)
                 print format % ('normal urllib', m.hexdigest())
                 # now install the keepalive handler and try again
                 opener = urllib2.build_opener(HTTPHandler())
                 urllib2.install_opener(opener)
                 fo = urllib2.urlopen(url)
                 foo = fo.read()
                 fo.close()
-                m = md5.new(foo)
+                m = md5(foo)
                 print format % ('keepalive read', m.hexdigest())
                 fo = urllib2.urlopen(url)
                 foo = ''
                 while True:
                     f = fo.readline()
                     if f:
                         foo = foo + f
                     else: break
                 fo.close()
-                m = md5.new(foo)
+                m = md5(foo)
                 print format % ('keepalive readline', m.hexdigest())
             def comp(N, url):
                 """Time ``N`` fetches of ``url`` with and without keepalive.

                 Runs ``fetch`` once under the default urllib2 opener and once
                 under an opener built around ``HTTPHandler``, printing both
                 timings and the ratio of the first to the second.
                 """
                 print('  making %i connections to:\n  %s' % (N, url))

                 # baseline: the stock opener
                 sys.stdout.write('  first using the normal urllib handlers')
                 urllib2.install_opener(urllib2.build_opener())
                 baseline = fetch(N, url)
                 print('  TIME: %.3f s' % baseline)

                 # same workload through the keepalive handler
                 sys.stdout.write('  now using the keepalive handler       ')
                 urllib2.install_opener(urllib2.build_opener(HTTPHandler()))
                 keepalive = fetch(N, url)
                 print('  TIME: %.3f s' % keepalive)

                 print('  improvement factor: %.2f' % (baseline / keepalive))
             def fetch(N, url, delay=0):
                 """Fetch ``url`` ``N`` times and return the elapsed wall time.

                 When ``delay`` is truthy, sleeps that many seconds before every
                 fetch except the first. After all fetches, prints a warning for
                 each body whose length differs from the first body's length.
                 """
                 import time

                 sizes = []
                 began = time.time()
                 for attempt in range(N):
                     if delay and attempt > 0:
                         time.sleep(delay)
                     response = urllib2.urlopen(url)
                     body = response.read()
                     response.close()
                     sizes.append(len(body))
                 elapsed = time.time() - began

                 # compare every later read against the first one
                 for offset, size in enumerate(sizes[1:]):
                     if size != sizes[0]:
                         print("WARNING: inconsistent length on read %i: %i"
                               % (offset + 1, size))
                 return elapsed
             def test_timeout(url):
                 """Check that fetching still works after the server drops the link.

                 Fetches ``url``, counts down 20 seconds so the server has time to
                 close the connection, fetches ``url`` again and prints whether the
                 two bodies are identical.

                 While the check runs, the module-level ``DEBUG`` is replaced by a
                 logger that prints every message. The previous value is restored
                 on the way out, including when a fetch raises.
                 """
                 global DEBUG
                 dbbackup = DEBUG
                 class FakeLogger(object):
                     def debug(self, msg, *args):
                         print(msg % args)
                     info = warning = error = debug
                 DEBUG = FakeLogger()
                 try:
                     print("  fetching the file to establish a connection")
                     fo = urllib2.urlopen(url)
                     try:
                         data1 = fo.read()
                     finally:
                         fo.close()

                     i = 20
                     print("  waiting %i seconds for the server to close the connection" % i)
                     while i > 0:
                         sys.stdout.write('\r  %2i' % i)
                         sys.stdout.flush()
                         time.sleep(1)
                         i -= 1
                     # NOTE(review): the countdown is written to stdout but this
                     # carriage return goes to stderr -- confirm that is intended
                     sys.stderr.write('\r')

                     print("  fetching the file a second time")
                     fo = urllib2.urlopen(url)
                     try:
                         data2 = fo.read()
                     finally:
                         fo.close()

                     if data1 == data2:
                         print('  data are identical')
                     else:
                         print('  ERROR: DATA DIFFER')
                 finally:
                     # never leave the fake logger installed, even on failure
                     DEBUG = dbbackup
             def test(url, N=10):
                 """Run the full manual test sequence against ``url``.

                 Calls ``error_handler`` first and exits the process if it raises
                 ``IOError``. Otherwise runs the continuity test, the speed
                 comparison (with ``N`` connections) and the dropped-connection
                 check, printing a blank line and a banner before each.
                 """
                 print("checking error handler (do this on a non-200)")
                 try:
                     error_handler(url)
                 except IOError:
                     print("exiting - exception will prevent further tests")
                     sys.exit()

                 stages = [
                     ("performing continuity test (making sure stuff isn't corrupted)",
                      lambda: continuity(url)),
                     ("performing speed comparison",
                      lambda: comp(N, url)),
                     ("performing dropped-connection check",
                      lambda: test_timeout(url)),
                 ]
                 for banner, run_stage in stages:
                     # blank separator line, then the stage banner
                     print('')
                     print(banner)
                     run_stage()
             if __name__ == '__main__':
                 # Script entry point: expects "<integer> <url>" on the command line.
                 # sys and time are bound at module level here; test_timeout above
                 # uses both without importing them itself.
                 import sys
                 import time
                 try:
                     N = int(sys.argv[1])
                     url = sys.argv[2]
                 except (IndexError, ValueError):
                     # missing argument or non-integer count: show usage
                     print("%s <integer> <url>" % sys.argv[0])
                 else:
                     test(url, N)