##// END OF EJS Templates
Update keepalive.py to current CVS version of urlgrabber....
Alexis S. L. Carvalho -
r2444:5eb02f9e default
parent child Browse files
Show More
@@ -1,587 +1,589 b''
1 1 # This library is free software; you can redistribute it and/or
2 2 # modify it under the terms of the GNU Lesser General Public
3 3 # License as published by the Free Software Foundation; either
4 4 # version 2.1 of the License, or (at your option) any later version.
5 5 #
6 6 # This library is distributed in the hope that it will be useful,
7 7 # but WITHOUT ANY WARRANTY; without even the implied warranty of
8 8 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
9 9 # Lesser General Public License for more details.
10 10 #
11 11 # You should have received a copy of the GNU Lesser General Public
12 12 # License along with this library; if not, write to the
13 13 # Free Software Foundation, Inc.,
14 14 # 59 Temple Place, Suite 330,
15 15 # Boston, MA 02111-1307 USA
16 16
17 17 # This file is part of urlgrabber, a high-level cross-protocol url-grabber
18 18 # Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
19 19
20 20 """An HTTP handler for urllib2 that supports HTTP 1.1 and keepalive.
21 21
22 22 >>> import urllib2
23 23 >>> from keepalive import HTTPHandler
24 24 >>> keepalive_handler = HTTPHandler()
25 25 >>> opener = urllib2.build_opener(keepalive_handler)
26 26 >>> urllib2.install_opener(opener)
27 27 >>>
28 28 >>> fo = urllib2.urlopen('http://www.python.org')
29 29
30 30 If a connection to a given host is requested, and all of the existing
31 31 connections are still in use, another connection will be opened. If
32 32 the handler tries to use an existing connection but it fails in some
33 33 way, it will be closed and removed from the pool.
34 34
35 35 To remove the handler, simply re-run build_opener with no arguments, and
36 36 install that opener.
37 37
38 38 You can explicitly close connections by using the close_connection()
39 39 method of the returned file-like object (described below) or you can
40 40 use the handler methods:
41 41
42 42 close_connection(host)
43 43 close_all()
44 44 open_connections()
45 45
46 46 NOTE: using the close_connection and close_all methods of the handler
47 47 should be done with care when using multiple threads.
48 48 * there is nothing that prevents another thread from creating new
49 49 connections immediately after connections are closed
50 50 * no checks are done to prevent in-use connections from being closed
51 51
52 52 >>> keepalive_handler.close_all()
53 53
54 54 EXTRA ATTRIBUTES AND METHODS
55 55
56 56 Upon a status of 200, the object returned has a few additional
57 57 attributes and methods, which should not be used if you want to
58 58 remain consistent with the normal urllib2-returned objects:
59 59
60 60 close_connection() - close the connection to the host
61 61 readlines() - you know, readlines()
62 62 status - the return status (ie 404)
63 63 reason - english translation of status (ie 'File not found')
64 64
65 65 If you want the best of both worlds, use this inside an
66 66 AttributeError-catching try:
67 67
68 68 >>> try: status = fo.status
69 69 >>> except AttributeError: status = None
70 70
71 71 Unfortunately, these are ONLY there if status == 200, so it's not
72 72 easy to distinguish between non-200 responses. The reason is that
73 73 urllib2 tries to do clever things with error codes 301, 302, 401,
74 74 and 407, and it wraps the object upon return.
75 75
76 76 For python versions earlier than 2.4, you can avoid this fancy error
77 77 handling by setting the module-level global HANDLE_ERRORS to zero.
78 78 You see, prior to 2.4, it's the HTTP Handler's job to determine what
79 79 to handle specially, and what to just pass up. HANDLE_ERRORS == 0
80 80 means "pass everything up". In python 2.4, however, this job no
81 81 longer belongs to the HTTP Handler and is now done by a NEW handler,
82 82 HTTPErrorProcessor. Here's the bottom line:
83 83
84 84 python version < 2.4
85 85 HANDLE_ERRORS == 1 (default) pass up 200, treat the rest as
86 86 errors
87 87 HANDLE_ERRORS == 0 pass everything up, error processing is
88 88 left to the calling code
89 89 python version >= 2.4
90 90 HANDLE_ERRORS == 1 pass up 200, treat the rest as errors
91 91 HANDLE_ERRORS == 0 (default) pass everything up, let the
92 92 other handlers (specifically,
93 93 HTTPErrorProcessor) decide what to do
94 94
95 95 In practice, setting the variable either way makes little difference
96 96 in python 2.4, so for the most consistent behavior across versions,
97 97 you probably just want to use the defaults, which will give you
98 98 exceptions on errors.
99 99
100 100 """
101 101
102 # $Id: keepalive.py,v 1.13 2005/10/22 21:57:28 mstenner Exp $
102 # $Id: keepalive.py,v 1.14 2006/04/04 21:00:32 mstenner Exp $
103 103
104 104 import urllib2
105 105 import httplib
106 106 import socket
107 107 import thread
108 108
109 109 DEBUG = None
110 110
111 111 import sys
112 112 if sys.version_info < (2, 4): HANDLE_ERRORS = 1
113 113 else: HANDLE_ERRORS = 0
114 114
class ConnectionManager:
    """Thread-safe registry of keepalive connections.

    Tracks which connections exist for each host and whether each
    connection is currently free ("ready") to carry another request.
    """
    def __init__(self):
        self._lock = thread.allocate_lock()
        self._hostmap = {}   # map hosts to a list of connections
        self._connmap = {}   # map connections to host
        self._readymap = {}  # map connection to ready state

    def add(self, host, connection, ready):
        """Register a new connection for <host> with the given ready state."""
        self._lock.acquire()
        try:
            # setdefault replaces the deprecated has_key() check-then-create
            self._hostmap.setdefault(host, []).append(connection)
            self._connmap[connection] = host
            self._readymap[connection] = ready
        finally:
            self._lock.release()

    def remove(self, connection):
        """Forget a connection entirely; a no-op if it is unknown."""
        self._lock.acquire()
        try:
            try:
                host = self._connmap[connection]
            except KeyError:
                pass
            else:
                del self._connmap[connection]
                del self._readymap[connection]
                self._hostmap[host].remove(connection)
                if not self._hostmap[host]: del self._hostmap[host]
        finally:
            self._lock.release()

    def set_ready(self, connection, ready):
        """Mark a connection free/busy.  Deliberately lock-free: a stale
        write is harmless here and this runs on the request hot path."""
        try: self._readymap[connection] = ready
        except KeyError: pass

    def get_ready_conn(self, host):
        """Return a free connection to <host> (marking it busy), or None."""
        conn = None
        self._lock.acquire()
        try:
            if host in self._hostmap:
                for c in self._hostmap[host]:
                    if self._readymap[c]:
                        self._readymap[c] = 0
                        conn = c
                        break
        finally:
            self._lock.release()
        return conn

    def get_all(self, host=None):
        """Return a copy: the list of connections for <host>, or the whole
        host -> connections mapping when host is None."""
        if host:
            return list(self._hostmap.get(host, []))
        else:
            return dict(self._hostmap)
174 174
175 175 class HTTPHandler(urllib2.HTTPHandler):
176 176 def __init__(self):
177 177 self._cm = ConnectionManager()
178 178
179 179 #### Connection Management
180 180 def open_connections(self):
181 181 """return a list of connected hosts and the number of connections
182 182 to each. [('foo.com:80', 2), ('bar.org', 1)]"""
183 183 return [(host, len(li)) for (host, li) in self._cm.get_all().items()]
184 184
185 185 def close_connection(self, host):
186 186 """close connection(s) to <host>
187 187 host is the host:port spec, as in 'www.cnn.com:8080' as passed in.
188 188 no error occurs if there is no connection to that host."""
189 189 for h in self._cm.get_all(host):
190 190 self._cm.remove(h)
191 191 h.close()
192 192
193 193 def close_all(self):
194 194 """close all open connections"""
195 195 for host, conns in self._cm.get_all().items():
196 196 for h in conns:
197 197 self._cm.remove(h)
198 198 h.close()
199 199
200 200 def _request_closed(self, request, host, connection):
201 201 """tells us that this request is now closed and the the
202 202 connection is ready for another request"""
203 203 self._cm.set_ready(connection, 1)
204 204
205 205 def _remove_connection(self, host, connection, close=0):
206 206 if close: connection.close()
207 207 self._cm.remove(connection)
208 208
209 209 #### Transaction Execution
210 210 def http_open(self, req):
211 211 return self.do_open(HTTPConnection, req)
212 212
213 213 def do_open(self, http_class, req):
214 214 host = req.get_host()
215 215 if not host:
216 216 raise urllib2.URLError('no host given')
217 217
218 218 try:
219 219 h = self._cm.get_ready_conn(host)
220 220 while h:
221 221 r = self._reuse_connection(h, req, host)
222 222
223 223 # if this response is non-None, then it worked and we're
224 224 # done. Break out, skipping the else block.
225 225 if r: break
226 226
227 227 # connection is bad - possibly closed by server
228 228 # discard it and ask for the next free connection
229 229 h.close()
230 230 self._cm.remove(h)
231 231 h = self._cm.get_ready_conn(host)
232 232 else:
233 233 # no (working) free connections were found. Create a new one.
234 234 h = http_class(host)
235 235 if DEBUG: DEBUG.info("creating new connection to %s (%d)",
236 236 host, id(h))
237 237 self._cm.add(host, h, 0)
238 238 self._start_transaction(h, req)
239 239 r = h.getresponse()
240 240 except (socket.error, httplib.HTTPException), err:
241 241 raise urllib2.URLError(err)
242 242
243 243 # if not a persistent connection, don't try to reuse it
244 244 if r.will_close: self._cm.remove(h)
245 245
246 246 if DEBUG: DEBUG.info("STATUS: %s, %s", r.status, r.reason)
247 247 r._handler = self
248 248 r._host = host
249 249 r._url = req.get_full_url()
250 250 r._connection = h
251 251 r.code = r.status
252 r.headers = r.msg
253 r.msg = r.reason
252 254
253 255 if r.status == 200 or not HANDLE_ERRORS:
254 256 return r
255 257 else:
256 return self.parent.error('http', req, r, r.status, r.reason, r.msg)
257
258 return self.parent.error('http', req, r,
259 r.status, r.msg, r.headers)
258 260
259 261 def _reuse_connection(self, h, req, host):
260 262 """start the transaction with a re-used connection
261 263 return a response object (r) upon success or None on failure.
262 264 This DOES not close or remove bad connections in cases where
263 265 it returns. However, if an unexpected exception occurs, it
264 266 will close and remove the connection before re-raising.
265 267 """
266 268 try:
267 269 self._start_transaction(h, req)
268 270 r = h.getresponse()
269 271 # note: just because we got something back doesn't mean it
270 272 # worked. We'll check the version below, too.
271 273 except (socket.error, httplib.HTTPException):
272 274 r = None
273 275 except:
274 276 # adding this block just in case we've missed
275 277 # something we will still raise the exception, but
276 278 # lets try and close the connection and remove it
277 279 # first. We previously got into a nasty loop
278 280 # where an exception was uncaught, and so the
279 281 # connection stayed open. On the next try, the
280 282 # same exception was raised, etc. The tradeoff is
281 283 # that it's now possible this call will raise
282 284 # a DIFFERENT exception
283 285 if DEBUG: DEBUG.error("unexpected exception - closing " + \
284 286 "connection to %s (%d)", host, id(h))
285 287 self._cm.remove(h)
286 288 h.close()
287 289 raise
288 290
289 291 if r is None or r.version == 9:
290 292 # httplib falls back to assuming HTTP 0.9 if it gets a
291 293 # bad header back. This is most likely to happen if
292 294 # the socket has been closed by the server since we
293 295 # last used the connection.
294 296 if DEBUG: DEBUG.info("failed to re-use connection to %s (%d)",
295 297 host, id(h))
296 298 r = None
297 299 else:
298 300 if DEBUG: DEBUG.info("re-using connection to %s (%d)", host, id(h))
299 301
300 302 return r
301 303
302 304 def _start_transaction(self, h, req):
303 305 try:
304 306 if req.has_data():
305 307 data = req.get_data()
306 308 h.putrequest('POST', req.get_selector())
307 309 if not req.headers.has_key('Content-type'):
308 310 h.putheader('Content-type',
309 311 'application/x-www-form-urlencoded')
310 312 if not req.headers.has_key('Content-length'):
311 313 h.putheader('Content-length', '%d' % len(data))
312 314 else:
313 315 h.putrequest('GET', req.get_selector())
314 316 except (socket.error, httplib.HTTPException), err:
315 317 raise urllib2.URLError(err)
316 318
317 319 for args in self.parent.addheaders:
318 320 h.putheader(*args)
319 321 for k, v in req.headers.items():
320 322 h.putheader(k, v)
321 323 h.endheaders()
322 324 if req.has_data():
323 325 h.send(data)
324 326
class HTTPResponse(httplib.HTTPResponse):
    # we need to subclass HTTPResponse in order to
    # 1) add readline() and readlines() methods
    # 2) add close_connection() methods
    # 3) add info() and geturl() methods

    # in order to add readline(), read must be modified to deal with a
    # buffer.  example: readline must read a buffer and then spit back
    # one line at a time.  The only real alternative is to read one
    # BYTE at a time (ick).  Once something has been read, it can't be
    # put back (ok, maybe it can, but that's even uglier than this),
    # so if you THEN do a normal read, you must first take stuff from
    # the buffer.

    # the read method wraps the original to accommodate buffering,
    # although read() never adds to the buffer.
    # Both readline and readlines have been stolen with almost no
    # modification from socket.py

    def __init__(self, sock, debuglevel=0, strict=0, method=None):
        if method: # the httplib in python 2.3 uses the method arg
            httplib.HTTPResponse.__init__(self, sock, debuglevel, method)
        else: # 2.2 doesn't
            httplib.HTTPResponse.__init__(self, sock, debuglevel)
        self.fileno = sock.fileno
        self.code = None
        self._rbuf = ''          # look-ahead buffer for readline()
        self._rbufsize = 8096
        self._handler = None     # inserted by the handler later
        self._host = None        # (same)
        self._url = None         # (same)
        self._connection = None  # (same)

    _raw_read = httplib.HTTPResponse.read

    def close(self):
        """Close the response and hand the connection back to the pool."""
        if self.fp:
            self.fp.close()
            self.fp = None
            if self._handler:
                self._handler._request_closed(self, self._host,
                                              self._connection)

    def close_connection(self):
        """Close this response AND its underlying connection."""
        self._handler._remove_connection(self._host, self._connection, close=1)
        self.close()

    def info(self):
        return self.headers

    def geturl(self):
        return self._url

    def read(self, amt=None):
        # the _rbuf test is only in this first if for speed.  It's not
        # logically necessary
        if self._rbuf and amt is not None:
            L = len(self._rbuf)
            if amt > L:
                amt -= L
            else:
                s = self._rbuf[:amt]
                self._rbuf = self._rbuf[amt:]
                return s

        s = self._rbuf + self._raw_read(amt)
        self._rbuf = ''
        return s

    def readline(self, limit=-1):
        """Return one line (respecting <limit>), buffering look-ahead."""
        data = ""
        i = self._rbuf.find('\n')
        while i < 0 and not (0 < limit <= len(self._rbuf)):
            new = self._raw_read(self._rbufsize)
            if not new: break
            i = new.find('\n')
            if i >= 0: i = i + len(self._rbuf)
            self._rbuf = self._rbuf + new
        if i < 0: i = len(self._rbuf)
        else: i = i+1
        if 0 <= limit < len(self._rbuf): i = limit
        data, self._rbuf = self._rbuf[:i], self._rbuf[i:]
        return data

    def readlines(self, sizehint=0):
        """Read lines until EOF, or until roughly <sizehint> bytes read."""
        total = 0
        lines = []  # renamed from 'list': don't shadow the builtin
        while 1:
            line = self.readline()
            if not line: break
            lines.append(line)
            total += len(line)
            if sizehint and total >= sizehint:
                break
        return lines
421 423
422 424
class HTTPConnection(httplib.HTTPConnection):
    # swap in our buffered, keepalive-aware response class
    response_class = HTTPResponse
426 428
427 429 #########################################################################
428 430 ##### TEST FUNCTIONS
429 431 #########################################################################
430 432
431 433 def error_handler(url):
432 434 global HANDLE_ERRORS
433 435 orig = HANDLE_ERRORS
434 436 keepalive_handler = HTTPHandler()
435 437 opener = urllib2.build_opener(keepalive_handler)
436 438 urllib2.install_opener(opener)
437 439 pos = {0: 'off', 1: 'on'}
438 440 for i in (0, 1):
439 441 print " fancy error handling %s (HANDLE_ERRORS = %i)" % (pos[i], i)
440 442 HANDLE_ERRORS = i
441 443 try:
442 444 fo = urllib2.urlopen(url)
443 445 foo = fo.read()
444 446 fo.close()
445 447 try: status, reason = fo.status, fo.reason
446 448 except AttributeError: status, reason = None, None
447 449 except IOError, e:
448 450 print " EXCEPTION: %s" % e
449 451 raise
450 452 else:
451 453 print " status = %s, reason = %s" % (status, reason)
452 454 HANDLE_ERRORS = orig
453 455 hosts = keepalive_handler.open_connections()
454 456 print "open connections:", hosts
455 457 keepalive_handler.close_all()
456 458
457 459 def continuity(url):
458 460 import md5
459 461 format = '%25s: %s'
460 462
461 463 # first fetch the file with the normal http handler
462 464 opener = urllib2.build_opener()
463 465 urllib2.install_opener(opener)
464 466 fo = urllib2.urlopen(url)
465 467 foo = fo.read()
466 468 fo.close()
467 469 m = md5.new(foo)
468 470 print format % ('normal urllib', m.hexdigest())
469 471
470 472 # now install the keepalive handler and try again
471 473 opener = urllib2.build_opener(HTTPHandler())
472 474 urllib2.install_opener(opener)
473 475
474 476 fo = urllib2.urlopen(url)
475 477 foo = fo.read()
476 478 fo.close()
477 479 m = md5.new(foo)
478 480 print format % ('keepalive read', m.hexdigest())
479 481
480 482 fo = urllib2.urlopen(url)
481 483 foo = ''
482 484 while 1:
483 485 f = fo.readline()
484 486 if f: foo = foo + f
485 487 else: break
486 488 fo.close()
487 489 m = md5.new(foo)
488 490 print format % ('keepalive readline', m.hexdigest())
489 491
490 492 def comp(N, url):
491 493 print ' making %i connections to:\n %s' % (N, url)
492 494
493 495 sys.stdout.write(' first using the normal urllib handlers')
494 496 # first use normal opener
495 497 opener = urllib2.build_opener()
496 498 urllib2.install_opener(opener)
497 499 t1 = fetch(N, url)
498 500 print ' TIME: %.3f s' % t1
499 501
500 502 sys.stdout.write(' now using the keepalive handler ')
501 503 # now install the keepalive handler and try again
502 504 opener = urllib2.build_opener(HTTPHandler())
503 505 urllib2.install_opener(opener)
504 506 t2 = fetch(N, url)
505 507 print ' TIME: %.3f s' % t2
506 508 print ' improvement factor: %.2f' % (t1/t2, )
507 509
508 510 def fetch(N, url, delay=0):
509 511 import time
510 512 lens = []
511 513 starttime = time.time()
512 514 for i in range(N):
513 515 if delay and i > 0: time.sleep(delay)
514 516 fo = urllib2.urlopen(url)
515 517 foo = fo.read()
516 518 fo.close()
517 519 lens.append(len(foo))
518 520 diff = time.time() - starttime
519 521
520 522 j = 0
521 523 for i in lens[1:]:
522 524 j = j + 1
523 525 if not i == lens[0]:
524 526 print "WARNING: inconsistent length on read %i: %i" % (j, i)
525 527
526 528 return diff
527 529
528 530 def test_timeout(url):
529 531 global DEBUG
530 532 dbbackup = DEBUG
531 533 class FakeLogger:
532 534 def debug(self, msg, *args): print msg % args
533 535 info = warning = error = debug
534 536 DEBUG = FakeLogger()
535 537 print " fetching the file to establish a connection"
536 538 fo = urllib2.urlopen(url)
537 539 data1 = fo.read()
538 540 fo.close()
539 541
540 542 i = 20
541 543 print " waiting %i seconds for the server to close the connection" % i
542 544 while i > 0:
543 545 sys.stdout.write('\r %2i' % i)
544 546 sys.stdout.flush()
545 547 time.sleep(1)
546 548 i -= 1
547 549 sys.stderr.write('\r')
548 550
549 551 print " fetching the file a second time"
550 552 fo = urllib2.urlopen(url)
551 553 data2 = fo.read()
552 554 fo.close()
553 555
554 556 if data1 == data2:
555 557 print ' data are identical'
556 558 else:
557 559 print ' ERROR: DATA DIFFER'
558 560
559 561 DEBUG = dbbackup
560 562
561 563
562 564 def test(url, N=10):
563 565 print "checking error hander (do this on a non-200)"
564 566 try: error_handler(url)
565 567 except IOError, e:
566 568 print "exiting - exception will prevent further tests"
567 569 sys.exit()
568 570 print
569 571 print "performing continuity test (making sure stuff isn't corrupted)"
570 572 continuity(url)
571 573 print
572 574 print "performing speed comparison"
573 575 comp(N, url)
574 576 print
575 577 print "performing dropped-connection check"
576 578 test_timeout(url)
577 579
578 580 if __name__ == '__main__':
579 581 import time
580 582 import sys
581 583 try:
582 584 N = int(sys.argv[1])
583 585 url = sys.argv[2]
584 586 except:
585 587 print "%s <integer> <url>" % sys.argv[0]
586 588 else:
587 589 test(url, N)
@@ -1,9 +1,5 b''
1 1 abort: error: Connection refused
2 2 255
3 3 copy: No such file or directory
4 4 abort: HTTP Error 404
5 Date:
6 Content-Type: text/html
7 Connection: close
8
9 5 0
General Comments 0
You need to be logged in to leave comments. Login now