##// END OF EJS Templates
keepalive: attempt to fix issue1003...
Matt Mackall -
r8146:4f13ed6e default
parent child Browse files
Show More
@@ -1,640 +1,653 b''
1 # This library is free software; you can redistribute it and/or
1 # This library is free software; you can redistribute it and/or
2 # modify it under the terms of the GNU Lesser General Public
2 # modify it under the terms of the GNU Lesser General Public
3 # License as published by the Free Software Foundation; either
3 # License as published by the Free Software Foundation; either
4 # version 2.1 of the License, or (at your option) any later version.
4 # version 2.1 of the License, or (at your option) any later version.
5 #
5 #
6 # This library is distributed in the hope that it will be useful,
6 # This library is distributed in the hope that it will be useful,
7 # but WITHOUT ANY WARRANTY; without even the implied warranty of
7 # but WITHOUT ANY WARRANTY; without even the implied warranty of
8 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
8 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
9 # Lesser General Public License for more details.
9 # Lesser General Public License for more details.
10 #
10 #
11 # You should have received a copy of the GNU Lesser General Public
11 # You should have received a copy of the GNU Lesser General Public
12 # License along with this library; if not, write to the
12 # License along with this library; if not, write to the
13 # Free Software Foundation, Inc.,
13 # Free Software Foundation, Inc.,
14 # 59 Temple Place, Suite 330,
14 # 59 Temple Place, Suite 330,
15 # Boston, MA 02111-1307 USA
15 # Boston, MA 02111-1307 USA
16
16
17 # This file is part of urlgrabber, a high-level cross-protocol url-grabber
17 # This file is part of urlgrabber, a high-level cross-protocol url-grabber
18 # Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
18 # Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
19
19
20 # Modified by Benoit Boissinot:
20 # Modified by Benoit Boissinot:
21 # - fix for digest auth (inspired from urllib2.py @ Python v2.4)
21 # - fix for digest auth (inspired from urllib2.py @ Python v2.4)
22 # Modified by Dirkjan Ochtman:
22 # Modified by Dirkjan Ochtman:
23 # - import md5 function from a local util module
23 # - import md5 function from a local util module
24
24
25 """An HTTP handler for urllib2 that supports HTTP 1.1 and keepalive.
25 """An HTTP handler for urllib2 that supports HTTP 1.1 and keepalive.
26
26
27 >>> import urllib2
27 >>> import urllib2
28 >>> from keepalive import HTTPHandler
28 >>> from keepalive import HTTPHandler
29 >>> keepalive_handler = HTTPHandler()
29 >>> keepalive_handler = HTTPHandler()
30 >>> opener = urllib2.build_opener(keepalive_handler)
30 >>> opener = urllib2.build_opener(keepalive_handler)
31 >>> urllib2.install_opener(opener)
31 >>> urllib2.install_opener(opener)
32 >>>
32 >>>
33 >>> fo = urllib2.urlopen('http://www.python.org')
33 >>> fo = urllib2.urlopen('http://www.python.org')
34
34
35 If a connection to a given host is requested, and all of the existing
35 If a connection to a given host is requested, and all of the existing
36 connections are still in use, another connection will be opened. If
36 connections are still in use, another connection will be opened. If
37 the handler tries to use an existing connection but it fails in some
37 the handler tries to use an existing connection but it fails in some
38 way, it will be closed and removed from the pool.
38 way, it will be closed and removed from the pool.
39
39
40 To remove the handler, simply re-run build_opener with no arguments, and
40 To remove the handler, simply re-run build_opener with no arguments, and
41 install that opener.
41 install that opener.
42
42
43 You can explicitly close connections by using the close_connection()
43 You can explicitly close connections by using the close_connection()
44 method of the returned file-like object (described below) or you can
44 method of the returned file-like object (described below) or you can
45 use the handler methods:
45 use the handler methods:
46
46
47 close_connection(host)
47 close_connection(host)
48 close_all()
48 close_all()
49 open_connections()
49 open_connections()
50
50
51 NOTE: using the close_connection and close_all methods of the handler
51 NOTE: using the close_connection and close_all methods of the handler
52 should be done with care when using multiple threads.
52 should be done with care when using multiple threads.
53 * there is nothing that prevents another thread from creating new
53 * there is nothing that prevents another thread from creating new
54 connections immediately after connections are closed
54 connections immediately after connections are closed
55 * no checks are done to prevent in-use connections from being closed
55 * no checks are done to prevent in-use connections from being closed
56
56
57 >>> keepalive_handler.close_all()
57 >>> keepalive_handler.close_all()
58
58
59 EXTRA ATTRIBUTES AND METHODS
59 EXTRA ATTRIBUTES AND METHODS
60
60
61 Upon a status of 200, the object returned has a few additional
61 Upon a status of 200, the object returned has a few additional
62 attributes and methods, which should not be used if you want to
62 attributes and methods, which should not be used if you want to
63 remain consistent with the normal urllib2-returned objects:
63 remain consistent with the normal urllib2-returned objects:
64
64
65 close_connection() - close the connection to the host
65 close_connection() - close the connection to the host
66 readlines() - you know, readlines()
66 readlines() - you know, readlines()
67 status - the return status (ie 404)
67 status - the return status (ie 404)
68 reason - english translation of status (ie 'File not found')
68 reason - english translation of status (ie 'File not found')
69
69
70 If you want the best of both worlds, use this inside an
70 If you want the best of both worlds, use this inside an
71 AttributeError-catching try:
71 AttributeError-catching try:
72
72
73 >>> try: status = fo.status
73 >>> try: status = fo.status
74 >>> except AttributeError: status = None
74 >>> except AttributeError: status = None
75
75
76 Unfortunately, these are ONLY there if status == 200, so it's not
76 Unfortunately, these are ONLY there if status == 200, so it's not
77 easy to distinguish between non-200 responses. The reason is that
77 easy to distinguish between non-200 responses. The reason is that
78 urllib2 tries to do clever things with error codes 301, 302, 401,
78 urllib2 tries to do clever things with error codes 301, 302, 401,
79 and 407, and it wraps the object upon return.
79 and 407, and it wraps the object upon return.
80
80
81 For python versions earlier than 2.4, you can avoid this fancy error
81 For python versions earlier than 2.4, you can avoid this fancy error
82 handling by setting the module-level global HANDLE_ERRORS to zero.
82 handling by setting the module-level global HANDLE_ERRORS to zero.
83 You see, prior to 2.4, it's the HTTP Handler's job to determine what
83 You see, prior to 2.4, it's the HTTP Handler's job to determine what
84 to handle specially, and what to just pass up. HANDLE_ERRORS == 0
84 to handle specially, and what to just pass up. HANDLE_ERRORS == 0
85 means "pass everything up". In python 2.4, however, this job no
85 means "pass everything up". In python 2.4, however, this job no
86 longer belongs to the HTTP Handler and is now done by a NEW handler,
86 longer belongs to the HTTP Handler and is now done by a NEW handler,
87 HTTPErrorProcessor. Here's the bottom line:
87 HTTPErrorProcessor. Here's the bottom line:
88
88
89 python version < 2.4
89 python version < 2.4
90 HANDLE_ERRORS == 1 (default) pass up 200, treat the rest as
90 HANDLE_ERRORS == 1 (default) pass up 200, treat the rest as
91 errors
91 errors
92 HANDLE_ERRORS == 0 pass everything up, error processing is
92 HANDLE_ERRORS == 0 pass everything up, error processing is
93 left to the calling code
93 left to the calling code
94 python version >= 2.4
94 python version >= 2.4
95 HANDLE_ERRORS == 1 pass up 200, treat the rest as errors
95 HANDLE_ERRORS == 1 pass up 200, treat the rest as errors
96 HANDLE_ERRORS == 0 (default) pass everything up, let the
96 HANDLE_ERRORS == 0 (default) pass everything up, let the
97 other handlers (specifically,
97 other handlers (specifically,
98 HTTPErrorProcessor) decide what to do
98 HTTPErrorProcessor) decide what to do
99
99
100 In practice, setting the variable either way makes little difference
100 In practice, setting the variable either way makes little difference
101 in python 2.4, so for the most consistent behavior across versions,
101 in python 2.4, so for the most consistent behavior across versions,
102 you probably just want to use the defaults, which will give you
102 you probably just want to use the defaults, which will give you
103 exceptions on errors.
103 exceptions on errors.
104
104
105 """
105 """
106
106
107 # $Id: keepalive.py,v 1.14 2006/04/04 21:00:32 mstenner Exp $
107 # $Id: keepalive.py,v 1.14 2006/04/04 21:00:32 mstenner Exp $
108
108
# These are all Python 2-only module names (renamed in Python 3); this
# file targets the Python 2.x runtimes Mercurial supported at the time.
import urllib2
import httplib
import socket
import thread

# Set to a logger-like object (with .info()/.error() methods) to enable
# debug output from the handler; None disables it.
DEBUG = None

import sys
# Before Python 2.4 the HTTP handler itself turned non-200 statuses into
# errors; from 2.4 on HTTPErrorProcessor does that, so default differs.
if sys.version_info < (2, 4): HANDLE_ERRORS = 1
else: HANDLE_ERRORS = 0
119
119
class ConnectionManager:
    """
    The connection manager must be able to:
      * keep track of all existing connections
      * hand out an idle ("ready") connection per host, marking it busy
      * forget connections that have gone bad

    Mutating operations are serialized with a single lock so the three
    maps stay mutually consistent when several threads share a handler.
    """
    def __init__(self):
        self._lock = thread.allocate_lock()
        self._hostmap = {} # map hosts to a list of connections
        self._connmap = {} # map connections to host
        self._readymap = {} # map connection to ready state

    def add(self, host, connection, ready):
        """Register connection for host with the given ready state."""
        self._lock.acquire()
        try:
            if host not in self._hostmap: self._hostmap[host] = []
            self._hostmap[host].append(connection)
            self._connmap[connection] = host
            self._readymap[connection] = ready
        finally:
            self._lock.release()

    def remove(self, connection):
        """Forget connection entirely; silently a no-op if unknown."""
        self._lock.acquire()
        try:
            try:
                host = self._connmap[connection]
            except KeyError:
                pass
            else:
                del self._connmap[connection]
                del self._readymap[connection]
                self._hostmap[host].remove(connection)
                if not self._hostmap[host]: del self._hostmap[host]
        finally:
            self._lock.release()

    def set_ready(self, connection, ready):
        # NOTE(review): deliberately lock-free in the original code; a
        # KeyError just means the connection was already removed.
        try: self._readymap[connection] = ready
        except KeyError: pass

    def get_ready_conn(self, host):
        """Return an idle connection to host (marking it busy) or None."""
        conn = None
        self._lock.acquire()
        try:
            if host in self._hostmap:
                for c in self._hostmap[host]:
                    if self._readymap[c]:
                        self._readymap[c] = 0
                        conn = c
                        break
        finally:
            self._lock.release()
        return conn

    def get_all(self, host=None):
        """Return a copy: the connection list for host, or the whole
        host->connections map when host is None."""
        if host:
            return list(self._hostmap.get(host, []))
        else:
            return dict(self._hostmap)
179
179
180 class KeepAliveHandler:
180 class KeepAliveHandler:
181 def __init__(self):
181 def __init__(self):
182 self._cm = ConnectionManager()
182 self._cm = ConnectionManager()
183
183
184 #### Connection Management
184 #### Connection Management
185 def open_connections(self):
185 def open_connections(self):
186 """return a list of connected hosts and the number of connections
186 """return a list of connected hosts and the number of connections
187 to each. [('foo.com:80', 2), ('bar.org', 1)]"""
187 to each. [('foo.com:80', 2), ('bar.org', 1)]"""
188 return [(host, len(li)) for (host, li) in self._cm.get_all().items()]
188 return [(host, len(li)) for (host, li) in self._cm.get_all().items()]
189
189
190 def close_connection(self, host):
190 def close_connection(self, host):
191 """close connection(s) to <host>
191 """close connection(s) to <host>
192 host is the host:port spec, as in 'www.cnn.com:8080' as passed in.
192 host is the host:port spec, as in 'www.cnn.com:8080' as passed in.
193 no error occurs if there is no connection to that host."""
193 no error occurs if there is no connection to that host."""
194 for h in self._cm.get_all(host):
194 for h in self._cm.get_all(host):
195 self._cm.remove(h)
195 self._cm.remove(h)
196 h.close()
196 h.close()
197
197
198 def close_all(self):
198 def close_all(self):
199 """close all open connections"""
199 """close all open connections"""
200 for host, conns in self._cm.get_all().iteritems():
200 for host, conns in self._cm.get_all().iteritems():
201 for h in conns:
201 for h in conns:
202 self._cm.remove(h)
202 self._cm.remove(h)
203 h.close()
203 h.close()
204
204
205 def _request_closed(self, request, host, connection):
205 def _request_closed(self, request, host, connection):
206 """tells us that this request is now closed and the the
206 """tells us that this request is now closed and the the
207 connection is ready for another request"""
207 connection is ready for another request"""
208 self._cm.set_ready(connection, 1)
208 self._cm.set_ready(connection, 1)
209
209
210 def _remove_connection(self, host, connection, close=0):
210 def _remove_connection(self, host, connection, close=0):
211 if close: connection.close()
211 if close: connection.close()
212 self._cm.remove(connection)
212 self._cm.remove(connection)
213
213
214 #### Transaction Execution
214 #### Transaction Execution
215 def http_open(self, req):
215 def http_open(self, req):
216 return self.do_open(HTTPConnection, req)
216 return self.do_open(HTTPConnection, req)
217
217
218 def do_open(self, http_class, req):
218 def do_open(self, http_class, req):
219 host = req.get_host()
219 host = req.get_host()
220 if not host:
220 if not host:
221 raise urllib2.URLError('no host given')
221 raise urllib2.URLError('no host given')
222
222
223 try:
223 try:
224 h = self._cm.get_ready_conn(host)
224 h = self._cm.get_ready_conn(host)
225 while h:
225 while h:
226 r = self._reuse_connection(h, req, host)
226 r = self._reuse_connection(h, req, host)
227
227
228 # if this response is non-None, then it worked and we're
228 # if this response is non-None, then it worked and we're
229 # done. Break out, skipping the else block.
229 # done. Break out, skipping the else block.
230 if r: break
230 if r: break
231
231
232 # connection is bad - possibly closed by server
232 # connection is bad - possibly closed by server
233 # discard it and ask for the next free connection
233 # discard it and ask for the next free connection
234 h.close()
234 h.close()
235 self._cm.remove(h)
235 self._cm.remove(h)
236 h = self._cm.get_ready_conn(host)
236 h = self._cm.get_ready_conn(host)
237 else:
237 else:
238 # no (working) free connections were found. Create a new one.
238 # no (working) free connections were found. Create a new one.
239 h = http_class(host)
239 h = http_class(host)
240 if DEBUG: DEBUG.info("creating new connection to %s (%d)",
240 if DEBUG: DEBUG.info("creating new connection to %s (%d)",
241 host, id(h))
241 host, id(h))
242 self._cm.add(host, h, 0)
242 self._cm.add(host, h, 0)
243 self._start_transaction(h, req)
243 self._start_transaction(h, req)
244 r = h.getresponse()
244 r = h.getresponse()
245 except (socket.error, httplib.HTTPException), err:
245 except (socket.error, httplib.HTTPException), err:
246 raise urllib2.URLError(err)
246 raise urllib2.URLError(err)
247
247
248 # if not a persistent connection, don't try to reuse it
248 # if not a persistent connection, don't try to reuse it
249 if r.will_close: self._cm.remove(h)
249 if r.will_close: self._cm.remove(h)
250
250
251 if DEBUG: DEBUG.info("STATUS: %s, %s", r.status, r.reason)
251 if DEBUG: DEBUG.info("STATUS: %s, %s", r.status, r.reason)
252 r._handler = self
252 r._handler = self
253 r._host = host
253 r._host = host
254 r._url = req.get_full_url()
254 r._url = req.get_full_url()
255 r._connection = h
255 r._connection = h
256 r.code = r.status
256 r.code = r.status
257 r.headers = r.msg
257 r.headers = r.msg
258 r.msg = r.reason
258 r.msg = r.reason
259
259
260 if r.status == 200 or not HANDLE_ERRORS:
260 if r.status == 200 or not HANDLE_ERRORS:
261 return r
261 return r
262 else:
262 else:
263 return self.parent.error('http', req, r,
263 return self.parent.error('http', req, r,
264 r.status, r.msg, r.headers)
264 r.status, r.msg, r.headers)
265
265
266 def _reuse_connection(self, h, req, host):
266 def _reuse_connection(self, h, req, host):
267 """start the transaction with a re-used connection
267 """start the transaction with a re-used connection
268 return a response object (r) upon success or None on failure.
268 return a response object (r) upon success or None on failure.
269 This DOES not close or remove bad connections in cases where
269 This DOES not close or remove bad connections in cases where
270 it returns. However, if an unexpected exception occurs, it
270 it returns. However, if an unexpected exception occurs, it
271 will close and remove the connection before re-raising.
271 will close and remove the connection before re-raising.
272 """
272 """
273 try:
273 try:
274 self._start_transaction(h, req)
274 self._start_transaction(h, req)
275 r = h.getresponse()
275 r = h.getresponse()
276 # note: just because we got something back doesn't mean it
276 # note: just because we got something back doesn't mean it
277 # worked. We'll check the version below, too.
277 # worked. We'll check the version below, too.
278 except (socket.error, httplib.HTTPException):
278 except (socket.error, httplib.HTTPException):
279 r = None
279 r = None
280 except:
280 except:
281 # adding this block just in case we've missed
281 # adding this block just in case we've missed
282 # something we will still raise the exception, but
282 # something we will still raise the exception, but
283 # lets try and close the connection and remove it
283 # lets try and close the connection and remove it
284 # first. We previously got into a nasty loop
284 # first. We previously got into a nasty loop
285 # where an exception was uncaught, and so the
285 # where an exception was uncaught, and so the
286 # connection stayed open. On the next try, the
286 # connection stayed open. On the next try, the
287 # same exception was raised, etc. The tradeoff is
287 # same exception was raised, etc. The tradeoff is
288 # that it's now possible this call will raise
288 # that it's now possible this call will raise
289 # a DIFFERENT exception
289 # a DIFFERENT exception
290 if DEBUG: DEBUG.error("unexpected exception - closing " + \
290 if DEBUG: DEBUG.error("unexpected exception - closing " + \
291 "connection to %s (%d)", host, id(h))
291 "connection to %s (%d)", host, id(h))
292 self._cm.remove(h)
292 self._cm.remove(h)
293 h.close()
293 h.close()
294 raise
294 raise
295
295
296 if r is None or r.version == 9:
296 if r is None or r.version == 9:
297 # httplib falls back to assuming HTTP 0.9 if it gets a
297 # httplib falls back to assuming HTTP 0.9 if it gets a
298 # bad header back. This is most likely to happen if
298 # bad header back. This is most likely to happen if
299 # the socket has been closed by the server since we
299 # the socket has been closed by the server since we
300 # last used the connection.
300 # last used the connection.
301 if DEBUG: DEBUG.info("failed to re-use connection to %s (%d)",
301 if DEBUG: DEBUG.info("failed to re-use connection to %s (%d)",
302 host, id(h))
302 host, id(h))
303 r = None
303 r = None
304 else:
304 else:
305 if DEBUG: DEBUG.info("re-using connection to %s (%d)", host, id(h))
305 if DEBUG: DEBUG.info("re-using connection to %s (%d)", host, id(h))
306
306
307 return r
307 return r
308
308
309 def _start_transaction(self, h, req):
309 def _start_transaction(self, h, req):
310 headers = req.headers.copy()
311 body = req.data
312 if sys.version_info >= (2, 4):
313 headers.update(req.unredirected_hdrs)
314 try:
310 try:
315 h.request(req.get_method(), req.get_selector(), body, headers)
311 if req.has_data():
316 except socket.error, err: # XXX what error?
312 data = req.get_data()
313 h.putrequest('POST', req.get_selector())
314 if 'Content-type' not in req.headers:
315 h.putheader('Content-type',
316 'application/x-www-form-urlencoded')
317 if 'Content-length' not in req.headers:
318 h.putheader('Content-length', '%d' % len(data))
319 else:
320 h.putrequest('GET', req.get_selector())
321 except (socket.error), err:
317 raise urllib2.URLError(err)
322 raise urllib2.URLError(err)
318
323
324 for args in self.parent.addheaders:
325 h.putheader(*args)
326 for k, v in req.headers.items():
327 h.putheader(k, v)
328 h.endheaders()
329 if req.has_data():
330 h.send(data)
331
class HTTPHandler(KeepAliveHandler, urllib2.HTTPHandler):
    # Combines the keepalive machinery with urllib2's standard HTTP
    # handler plumbing; http_open() is inherited from KeepAliveHandler.
    pass
321
334
class HTTPResponse(httplib.HTTPResponse):
    # we need to subclass HTTPResponse in order to
    # 1) add readline() and readlines() methods
    # 2) add close_connection() methods
    # 3) add info() and geturl() methods

    # in order to add readline(), read must be modified to deal with a
    # buffer.  example: readline must read a buffer and then spit back
    # one line at a time.  The only real alternative is to read one
    # BYTE at a time (ick).  Once something has been read, it can't be
    # put back (ok, maybe it can, but that's even uglier than this),
    # so if you THEN do a normal read, you must first take stuff from
    # the buffer.

    # the read method wraps the original to accomodate buffering,
    # although read() never adds to the buffer.
    # Both readline and readlines have been stolen with almost no
    # modification from socket.py

    def __init__(self, sock, debuglevel=0, strict=0, method=None):
        if method: # the httplib in python 2.3 uses the method arg
            httplib.HTTPResponse.__init__(self, sock, debuglevel, method)
        else: # 2.2 doesn't
            httplib.HTTPResponse.__init__(self, sock, debuglevel)
        self.fileno = sock.fileno
        self.code = None
        self._rbuf = ''              # readline()'s lookahead buffer
        self._rbufsize = 8096        # chunk size used to fill the buffer
        self._handler = None # inserted by the handler later
        self._host = None    # (same)
        self._url = None     # (same)
        self._connection = None # (same)

    # keep a handle on the unbuffered base-class read
    _raw_read = httplib.HTTPResponse.read

    def close(self):
        # Release the response and hand the connection back to the pool.
        if self.fp:
            self.fp.close()
            self.fp = None
            if self._handler:
                self._handler._request_closed(self, self._host,
                                              self._connection)

    def close_connection(self):
        """Close this response AND tear down the underlying connection."""
        self._handler._remove_connection(self._host, self._connection, close=1)
        self.close()

    def info(self):
        return self.headers

    def geturl(self):
        return self._url

    def read(self, amt=None):
        # the _rbuf test is only in this first if for speed.  It's not
        # logically necessary
        if self._rbuf and not amt is None:
            L = len(self._rbuf)
            if amt > L:
                amt -= L
            else:
                s = self._rbuf[:amt]
                self._rbuf = self._rbuf[amt:]
                return s

        s = self._rbuf + self._raw_read(amt)
        self._rbuf = ''
        return s

    # stolen from Python SVN #68532 to fix issue1088
    def _read_chunked(self, amt):
        chunk_left = self.chunk_left
        value = ''

        # XXX This accumulates chunks by repeated string concatenation,
        # which is not efficient as the number or size of chunks gets big.
        while True:
            if chunk_left is None:
                line = self.fp.readline()
                i = line.find(';')
                if i >= 0:
                    line = line[:i] # strip chunk-extensions
                try:
                    chunk_left = int(line, 16)
                except ValueError:
                    # close the connection as protocol synchronisation is
                    # probably lost
                    self.close()
                    raise httplib.IncompleteRead(value)
                if chunk_left == 0:
                    break
            if amt is None:
                value += self._safe_read(chunk_left)
            elif amt < chunk_left:
                value += self._safe_read(amt)
                self.chunk_left = chunk_left - amt
                return value
            elif amt == chunk_left:
                value += self._safe_read(amt)
                self._safe_read(2)  # toss the CRLF at the end of the chunk
                self.chunk_left = None
                return value
            else:
                value += self._safe_read(chunk_left)
                amt -= chunk_left

            # we read the whole chunk, get another
            self._safe_read(2)      # toss the CRLF at the end of the chunk
            chunk_left = None

        # read and discard trailer up to the CRLF terminator
        ### note: we shouldn't have any trailers!
        while True:
            line = self.fp.readline()
            if not line:
                # a vanishingly small number of sites EOF without
                # sending the trailer
                break
            if line == '\r\n':
                break

        # we read everything; close the "file"
        self.close()

        return value

    def readline(self, limit=-1):
        i = self._rbuf.find('\n')
        while i < 0 and not (0 < limit <= len(self._rbuf)):
            new = self._raw_read(self._rbufsize)
            if not new: break
            i = new.find('\n')
            if i >= 0: i = i + len(self._rbuf)
            self._rbuf = self._rbuf + new
        if i < 0: i = len(self._rbuf)
        else: i = i+1
        if 0 <= limit < len(self._rbuf): i = limit
        data, self._rbuf = self._rbuf[:i], self._rbuf[i:]
        return data

    def readlines(self, sizehint = 0):
        total = 0
        list = []
        while 1:
            line = self.readline()
            if not line: break
            list.append(line)
            total += len(line)
            if sizehint and total >= sizehint:
                break
        return list
474
487
475
488
class HTTPConnection(httplib.HTTPConnection):
    # use the modified response class so keepalive bookkeeping and the
    # readline()/info()/geturl() extras are available on responses
    response_class = HTTPResponse
479
492
480 #########################################################################
493 #########################################################################
481 ##### TEST FUNCTIONS
494 ##### TEST FUNCTIONS
482 #########################################################################
495 #########################################################################
483
496
484 def error_handler(url):
497 def error_handler(url):
485 global HANDLE_ERRORS
498 global HANDLE_ERRORS
486 orig = HANDLE_ERRORS
499 orig = HANDLE_ERRORS
487 keepalive_handler = HTTPHandler()
500 keepalive_handler = HTTPHandler()
488 opener = urllib2.build_opener(keepalive_handler)
501 opener = urllib2.build_opener(keepalive_handler)
489 urllib2.install_opener(opener)
502 urllib2.install_opener(opener)
490 pos = {0: 'off', 1: 'on'}
503 pos = {0: 'off', 1: 'on'}
491 for i in (0, 1):
504 for i in (0, 1):
492 print " fancy error handling %s (HANDLE_ERRORS = %i)" % (pos[i], i)
505 print " fancy error handling %s (HANDLE_ERRORS = %i)" % (pos[i], i)
493 HANDLE_ERRORS = i
506 HANDLE_ERRORS = i
494 try:
507 try:
495 fo = urllib2.urlopen(url)
508 fo = urllib2.urlopen(url)
496 fo.read()
509 fo.read()
497 fo.close()
510 fo.close()
498 try: status, reason = fo.status, fo.reason
511 try: status, reason = fo.status, fo.reason
499 except AttributeError: status, reason = None, None
512 except AttributeError: status, reason = None, None
500 except IOError, e:
513 except IOError, e:
501 print " EXCEPTION: %s" % e
514 print " EXCEPTION: %s" % e
502 raise
515 raise
503 else:
516 else:
504 print " status = %s, reason = %s" % (status, reason)
517 print " status = %s, reason = %s" % (status, reason)
505 HANDLE_ERRORS = orig
518 HANDLE_ERRORS = orig
506 hosts = keepalive_handler.open_connections()
519 hosts = keepalive_handler.open_connections()
507 print "open connections:", hosts
520 print "open connections:", hosts
508 keepalive_handler.close_all()
521 keepalive_handler.close_all()
509
522
510 def continuity(url):
523 def continuity(url):
511 from util import md5
524 from util import md5
512 format = '%25s: %s'
525 format = '%25s: %s'
513
526
514 # first fetch the file with the normal http handler
527 # first fetch the file with the normal http handler
515 opener = urllib2.build_opener()
528 opener = urllib2.build_opener()
516 urllib2.install_opener(opener)
529 urllib2.install_opener(opener)
517 fo = urllib2.urlopen(url)
530 fo = urllib2.urlopen(url)
518 foo = fo.read()
531 foo = fo.read()
519 fo.close()
532 fo.close()
520 m = md5.new(foo)
533 m = md5.new(foo)
521 print format % ('normal urllib', m.hexdigest())
534 print format % ('normal urllib', m.hexdigest())
522
535
523 # now install the keepalive handler and try again
536 # now install the keepalive handler and try again
524 opener = urllib2.build_opener(HTTPHandler())
537 opener = urllib2.build_opener(HTTPHandler())
525 urllib2.install_opener(opener)
538 urllib2.install_opener(opener)
526
539
527 fo = urllib2.urlopen(url)
540 fo = urllib2.urlopen(url)
528 foo = fo.read()
541 foo = fo.read()
529 fo.close()
542 fo.close()
530 m = md5.new(foo)
543 m = md5.new(foo)
531 print format % ('keepalive read', m.hexdigest())
544 print format % ('keepalive read', m.hexdigest())
532
545
533 fo = urllib2.urlopen(url)
546 fo = urllib2.urlopen(url)
534 foo = ''
547 foo = ''
535 while 1:
548 while 1:
536 f = fo.readline()
549 f = fo.readline()
537 if f: foo = foo + f
550 if f: foo = foo + f
538 else: break
551 else: break
539 fo.close()
552 fo.close()
540 m = md5.new(foo)
553 m = md5.new(foo)
541 print format % ('keepalive readline', m.hexdigest())
554 print format % ('keepalive readline', m.hexdigest())
542
555
543 def comp(N, url):
556 def comp(N, url):
544 print ' making %i connections to:\n %s' % (N, url)
557 print ' making %i connections to:\n %s' % (N, url)
545
558
546 sys.stdout.write(' first using the normal urllib handlers')
559 sys.stdout.write(' first using the normal urllib handlers')
547 # first use normal opener
560 # first use normal opener
548 opener = urllib2.build_opener()
561 opener = urllib2.build_opener()
549 urllib2.install_opener(opener)
562 urllib2.install_opener(opener)
550 t1 = fetch(N, url)
563 t1 = fetch(N, url)
551 print ' TIME: %.3f s' % t1
564 print ' TIME: %.3f s' % t1
552
565
553 sys.stdout.write(' now using the keepalive handler ')
566 sys.stdout.write(' now using the keepalive handler ')
554 # now install the keepalive handler and try again
567 # now install the keepalive handler and try again
555 opener = urllib2.build_opener(HTTPHandler())
568 opener = urllib2.build_opener(HTTPHandler())
556 urllib2.install_opener(opener)
569 urllib2.install_opener(opener)
557 t2 = fetch(N, url)
570 t2 = fetch(N, url)
558 print ' TIME: %.3f s' % t2
571 print ' TIME: %.3f s' % t2
559 print ' improvement factor: %.2f' % (t1/t2, )
572 print ' improvement factor: %.2f' % (t1/t2, )
560
573
561 def fetch(N, url, delay=0):
574 def fetch(N, url, delay=0):
562 import time
575 import time
563 lens = []
576 lens = []
564 starttime = time.time()
577 starttime = time.time()
565 for i in range(N):
578 for i in range(N):
566 if delay and i > 0: time.sleep(delay)
579 if delay and i > 0: time.sleep(delay)
567 fo = urllib2.urlopen(url)
580 fo = urllib2.urlopen(url)
568 foo = fo.read()
581 foo = fo.read()
569 fo.close()
582 fo.close()
570 lens.append(len(foo))
583 lens.append(len(foo))
571 diff = time.time() - starttime
584 diff = time.time() - starttime
572
585
573 j = 0
586 j = 0
574 for i in lens[1:]:
587 for i in lens[1:]:
575 j = j + 1
588 j = j + 1
576 if not i == lens[0]:
589 if not i == lens[0]:
577 print "WARNING: inconsistent length on read %i: %i" % (j, i)
590 print "WARNING: inconsistent length on read %i: %i" % (j, i)
578
591
579 return diff
592 return diff
580
593
581 def test_timeout(url):
594 def test_timeout(url):
582 global DEBUG
595 global DEBUG
583 dbbackup = DEBUG
596 dbbackup = DEBUG
584 class FakeLogger:
597 class FakeLogger:
585 def debug(self, msg, *args): print msg % args
598 def debug(self, msg, *args): print msg % args
586 info = warning = error = debug
599 info = warning = error = debug
587 DEBUG = FakeLogger()
600 DEBUG = FakeLogger()
588 print " fetching the file to establish a connection"
601 print " fetching the file to establish a connection"
589 fo = urllib2.urlopen(url)
602 fo = urllib2.urlopen(url)
590 data1 = fo.read()
603 data1 = fo.read()
591 fo.close()
604 fo.close()
592
605
593 i = 20
606 i = 20
594 print " waiting %i seconds for the server to close the connection" % i
607 print " waiting %i seconds for the server to close the connection" % i
595 while i > 0:
608 while i > 0:
596 sys.stdout.write('\r %2i' % i)
609 sys.stdout.write('\r %2i' % i)
597 sys.stdout.flush()
610 sys.stdout.flush()
598 time.sleep(1)
611 time.sleep(1)
599 i -= 1
612 i -= 1
600 sys.stderr.write('\r')
613 sys.stderr.write('\r')
601
614
602 print " fetching the file a second time"
615 print " fetching the file a second time"
603 fo = urllib2.urlopen(url)
616 fo = urllib2.urlopen(url)
604 data2 = fo.read()
617 data2 = fo.read()
605 fo.close()
618 fo.close()
606
619
607 if data1 == data2:
620 if data1 == data2:
608 print ' data are identical'
621 print ' data are identical'
609 else:
622 else:
610 print ' ERROR: DATA DIFFER'
623 print ' ERROR: DATA DIFFER'
611
624
612 DEBUG = dbbackup
625 DEBUG = dbbackup
613
626
614
627
615 def test(url, N=10):
628 def test(url, N=10):
616 print "checking error hander (do this on a non-200)"
629 print "checking error hander (do this on a non-200)"
617 try: error_handler(url)
630 try: error_handler(url)
618 except IOError:
631 except IOError:
619 print "exiting - exception will prevent further tests"
632 print "exiting - exception will prevent further tests"
620 sys.exit()
633 sys.exit()
621 print
634 print
622 print "performing continuity test (making sure stuff isn't corrupted)"
635 print "performing continuity test (making sure stuff isn't corrupted)"
623 continuity(url)
636 continuity(url)
624 print
637 print
625 print "performing speed comparison"
638 print "performing speed comparison"
626 comp(N, url)
639 comp(N, url)
627 print
640 print
628 print "performing dropped-connection check"
641 print "performing dropped-connection check"
629 test_timeout(url)
642 test_timeout(url)
630
643
631 if __name__ == '__main__':
644 if __name__ == '__main__':
632 import time
645 import time
633 import sys
646 import sys
634 try:
647 try:
635 N = int(sys.argv[1])
648 N = int(sys.argv[1])
636 url = sys.argv[2]
649 url = sys.argv[2]
637 except:
650 except:
638 print "%s <integer> <url>" % sys.argv[0]
651 print "%s <integer> <url>" % sys.argv[0]
639 else:
652 else:
640 test(url, N)
653 test(url, N)
General Comments 0
You need to be logged in to leave comments. Login now