##// END OF EJS Templates
keepalive: use safehasattr instead of hasattr
Augie Fackler -
r14958:fd246aef default
parent child Browse files
Show More
@@ -1,765 +1,766 b''
1 # This library is free software; you can redistribute it and/or
1 # This library is free software; you can redistribute it and/or
2 # modify it under the terms of the GNU Lesser General Public
2 # modify it under the terms of the GNU Lesser General Public
3 # License as published by the Free Software Foundation; either
3 # License as published by the Free Software Foundation; either
4 # version 2.1 of the License, or (at your option) any later version.
4 # version 2.1 of the License, or (at your option) any later version.
5 #
5 #
6 # This library is distributed in the hope that it will be useful,
6 # This library is distributed in the hope that it will be useful,
7 # but WITHOUT ANY WARRANTY; without even the implied warranty of
7 # but WITHOUT ANY WARRANTY; without even the implied warranty of
8 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
8 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
9 # Lesser General Public License for more details.
9 # Lesser General Public License for more details.
10 #
10 #
11 # You should have received a copy of the GNU Lesser General Public
11 # You should have received a copy of the GNU Lesser General Public
12 # License along with this library; if not, write to the
12 # License along with this library; if not, write to the
13 # Free Software Foundation, Inc.,
13 # Free Software Foundation, Inc.,
14 # 59 Temple Place, Suite 330,
14 # 59 Temple Place, Suite 330,
15 # Boston, MA 02111-1307 USA
15 # Boston, MA 02111-1307 USA
16
16
17 # This file is part of urlgrabber, a high-level cross-protocol url-grabber
17 # This file is part of urlgrabber, a high-level cross-protocol url-grabber
18 # Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
18 # Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
19
19
20 # Modified by Benoit Boissinot:
20 # Modified by Benoit Boissinot:
21 # - fix for digest auth (inspired from urllib2.py @ Python v2.4)
21 # - fix for digest auth (inspired from urllib2.py @ Python v2.4)
22 # Modified by Dirkjan Ochtman:
22 # Modified by Dirkjan Ochtman:
23 # - import md5 function from a local util module
23 # - import md5 function from a local util module
24 # Modified by Martin Geisler:
24 # Modified by Martin Geisler:
25 # - moved md5 function from local util module to this module
25 # - moved md5 function from local util module to this module
26 # Modified by Augie Fackler:
26 # Modified by Augie Fackler:
27 # - add safesend method and use it to prevent broken pipe errors
27 # - add safesend method and use it to prevent broken pipe errors
28 # on large POST requests
28 # on large POST requests
29
29
30 """An HTTP handler for urllib2 that supports HTTP 1.1 and keepalive.
30 """An HTTP handler for urllib2 that supports HTTP 1.1 and keepalive.
31
31
32 >>> import urllib2
32 >>> import urllib2
33 >>> from keepalive import HTTPHandler
33 >>> from keepalive import HTTPHandler
34 >>> keepalive_handler = HTTPHandler()
34 >>> keepalive_handler = HTTPHandler()
35 >>> opener = urllib2.build_opener(keepalive_handler)
35 >>> opener = urllib2.build_opener(keepalive_handler)
36 >>> urllib2.install_opener(opener)
36 >>> urllib2.install_opener(opener)
37 >>>
37 >>>
38 >>> fo = urllib2.urlopen('http://www.python.org')
38 >>> fo = urllib2.urlopen('http://www.python.org')
39
39
40 If a connection to a given host is requested, and all of the existing
40 If a connection to a given host is requested, and all of the existing
41 connections are still in use, another connection will be opened. If
41 connections are still in use, another connection will be opened. If
42 the handler tries to use an existing connection but it fails in some
42 the handler tries to use an existing connection but it fails in some
43 way, it will be closed and removed from the pool.
43 way, it will be closed and removed from the pool.
44
44
45 To remove the handler, simply re-run build_opener with no arguments, and
45 To remove the handler, simply re-run build_opener with no arguments, and
46 install that opener.
46 install that opener.
47
47
48 You can explicitly close connections by using the close_connection()
48 You can explicitly close connections by using the close_connection()
49 method of the returned file-like object (described below) or you can
49 method of the returned file-like object (described below) or you can
50 use the handler methods:
50 use the handler methods:
51
51
52 close_connection(host)
52 close_connection(host)
53 close_all()
53 close_all()
54 open_connections()
54 open_connections()
55
55
56 NOTE: using the close_connection and close_all methods of the handler
56 NOTE: using the close_connection and close_all methods of the handler
57 should be done with care when using multiple threads.
57 should be done with care when using multiple threads.
58 * there is nothing that prevents another thread from creating new
58 * there is nothing that prevents another thread from creating new
59 connections immediately after connections are closed
59 connections immediately after connections are closed
60 * no checks are done to prevent in-use connections from being closed
60 * no checks are done to prevent in-use connections from being closed
61
61
62 >>> keepalive_handler.close_all()
62 >>> keepalive_handler.close_all()
63
63
64 EXTRA ATTRIBUTES AND METHODS
64 EXTRA ATTRIBUTES AND METHODS
65
65
66 Upon a status of 200, the object returned has a few additional
66 Upon a status of 200, the object returned has a few additional
67 attributes and methods, which should not be used if you want to
67 attributes and methods, which should not be used if you want to
68 remain consistent with the normal urllib2-returned objects:
68 remain consistent with the normal urllib2-returned objects:
69
69
70 close_connection() - close the connection to the host
70 close_connection() - close the connection to the host
71 readlines() - you know, readlines()
71 readlines() - you know, readlines()
72 status - the return status (ie 404)
72 status - the return status (ie 404)
73 reason - english translation of status (ie 'File not found')
73 reason - english translation of status (ie 'File not found')
74
74
75 If you want the best of both worlds, use this inside an
75 If you want the best of both worlds, use this inside an
76 AttributeError-catching try:
76 AttributeError-catching try:
77
77
78 >>> try: status = fo.status
78 >>> try: status = fo.status
79 >>> except AttributeError: status = None
79 >>> except AttributeError: status = None
80
80
81 Unfortunately, these are ONLY there if status == 200, so it's not
81 Unfortunately, these are ONLY there if status == 200, so it's not
82 easy to distinguish between non-200 responses. The reason is that
82 easy to distinguish between non-200 responses. The reason is that
83 urllib2 tries to do clever things with error codes 301, 302, 401,
83 urllib2 tries to do clever things with error codes 301, 302, 401,
84 and 407, and it wraps the object upon return.
84 and 407, and it wraps the object upon return.
85
85
86 For python versions earlier than 2.4, you can avoid this fancy error
86 For python versions earlier than 2.4, you can avoid this fancy error
87 handling by setting the module-level global HANDLE_ERRORS to zero.
87 handling by setting the module-level global HANDLE_ERRORS to zero.
88 You see, prior to 2.4, it's the HTTP Handler's job to determine what
88 You see, prior to 2.4, it's the HTTP Handler's job to determine what
89 to handle specially, and what to just pass up. HANDLE_ERRORS == 0
89 to handle specially, and what to just pass up. HANDLE_ERRORS == 0
90 means "pass everything up". In python 2.4, however, this job no
90 means "pass everything up". In python 2.4, however, this job no
91 longer belongs to the HTTP Handler and is now done by a NEW handler,
91 longer belongs to the HTTP Handler and is now done by a NEW handler,
92 HTTPErrorProcessor. Here's the bottom line:
92 HTTPErrorProcessor. Here's the bottom line:
93
93
94 python version < 2.4
94 python version < 2.4
95 HANDLE_ERRORS == 1 (default) pass up 200, treat the rest as
95 HANDLE_ERRORS == 1 (default) pass up 200, treat the rest as
96 errors
96 errors
97 HANDLE_ERRORS == 0 pass everything up, error processing is
97 HANDLE_ERRORS == 0 pass everything up, error processing is
98 left to the calling code
98 left to the calling code
99 python version >= 2.4
99 python version >= 2.4
100 HANDLE_ERRORS == 1 pass up 200, treat the rest as errors
100 HANDLE_ERRORS == 1 pass up 200, treat the rest as errors
101 HANDLE_ERRORS == 0 (default) pass everything up, let the
101 HANDLE_ERRORS == 0 (default) pass everything up, let the
102 other handlers (specifically,
102 other handlers (specifically,
103 HTTPErrorProcessor) decide what to do
103 HTTPErrorProcessor) decide what to do
104
104
105 In practice, setting the variable either way makes little difference
105 In practice, setting the variable either way makes little difference
106 in python 2.4, so for the most consistent behavior across versions,
106 in python 2.4, so for the most consistent behavior across versions,
107 you probably just want to use the defaults, which will give you
107 you probably just want to use the defaults, which will give you
108 exceptions on errors.
108 exceptions on errors.
109
109
110 """
110 """
111
111
112 # $Id: keepalive.py,v 1.14 2006/04/04 21:00:32 mstenner Exp $
112 # $Id: keepalive.py,v 1.14 2006/04/04 21:00:32 mstenner Exp $
113
113
114 import errno
114 import errno
115 import httplib
115 import httplib
116 import socket
116 import socket
117 import thread
117 import thread
118 import urllib2
118 import urllib2
119
119
# Module-level debug logger; callers may install a logger-like object here.
DEBUG = None

import sys

# Prior to Python 2.4 the HTTP handler itself must turn non-200 responses
# into errors; from 2.4 on, HTTPErrorProcessor does that job instead, so
# the default flips (see the module docstring for the full story).
if sys.version_info < (2, 4):
    HANDLE_ERRORS = 1
else:
    HANDLE_ERRORS = 0
126
126
class ConnectionManager(object):
    """Thread-safe registry of keep-alive connections.

    Tracks every pooled connection, the host it belongs to, and whether
    it is currently free to be reused for another request.
    """
    def __init__(self):
        self._lock = thread.allocate_lock()
        self._hostmap = {}   # host -> list of connections
        self._connmap = {}   # connection -> host
        self._readymap = {}  # connection -> ready state

    def add(self, host, connection, ready):
        """Register *connection* for *host* with the given ready state."""
        self._lock.acquire()
        try:
            self._hostmap.setdefault(host, []).append(connection)
            self._connmap[connection] = host
            self._readymap[connection] = ready
        finally:
            self._lock.release()

    def remove(self, connection):
        """Forget *connection* entirely; silently ignores unknown ones."""
        self._lock.acquire()
        try:
            if connection in self._connmap:
                host = self._connmap.pop(connection)
                del self._readymap[connection]
                self._hostmap[host].remove(connection)
                if not self._hostmap[host]:
                    del self._hostmap[host]
        finally:
            self._lock.release()

    def set_ready(self, connection, ready):
        """Mark *connection* as ready (truthy) or busy (falsy)."""
        try:
            self._readymap[connection] = ready
        except KeyError:
            pass

    def get_ready_conn(self, host):
        """Return a free connection for *host* (marking it busy), or None."""
        conn = None
        self._lock.acquire()
        try:
            for candidate in self._hostmap.get(host, []):
                if self._readymap[candidate]:
                    # claim it before releasing the lock
                    self._readymap[candidate] = 0
                    conn = candidate
                    break
        finally:
            self._lock.release()
        return conn

    def get_all(self, host=None):
        """Return a copy of the connection list for *host*, or, with no
        host, a copy of the whole host -> connections mapping."""
        if host:
            return list(self._hostmap.get(host, []))
        return dict(self._hostmap)
189
189
class KeepAliveHandler(object):
    """urllib2 handler mixin that pools and reuses HTTP connections.

    Concrete handlers (e.g. HTTPHandler below) combine this with a
    urllib2 handler class; ``self.parent`` is supplied by urllib2's
    OpenerDirector machinery when the handler is installed.
    """
    def __init__(self):
        # one shared manager tracks all pooled connections for this handler
        self._cm = ConnectionManager()

    #### Connection Management
    def open_connections(self):
        """return a list of connected hosts and the number of connections
        to each.  [('foo.com:80', 2), ('bar.org', 1)]"""
        return [(host, len(li)) for (host, li) in self._cm.get_all().items()]

    def close_connection(self, host):
        """close connection(s) to <host>
        host is the host:port spec, as in 'www.cnn.com:8080' as passed in.
        no error occurs if there is no connection to that host."""
        for h in self._cm.get_all(host):
            self._cm.remove(h)
            h.close()

    def close_all(self):
        """close all open connections"""
        for host, conns in self._cm.get_all().iteritems():
            for h in conns:
                self._cm.remove(h)
                h.close()

    def _request_closed(self, request, host, connection):
        """tells us that this request is now closed and that the
        connection is ready for another request"""
        self._cm.set_ready(connection, 1)

    def _remove_connection(self, host, connection, close=0):
        # drop the connection from the pool, optionally closing its socket
        if close:
            connection.close()
        self._cm.remove(connection)

    #### Transaction Execution
    def http_open(self, req):
        # NOTE(review): HTTPConnection is a module-level name defined
        # elsewhere in this file (outside this view), presumably a
        # keepalive-aware subclass of httplib.HTTPConnection — confirm.
        return self.do_open(HTTPConnection, req)

    def do_open(self, http_class, req):
        """Serve *req*, preferring a pooled connection to the host.

        Tries each ready pooled connection in turn; a connection that
        fails to produce a response is closed and discarded.  If none
        works (or none exists), a fresh http_class connection is opened
        and added to the pool.  Returns the decorated response, or
        delegates to self.parent.error() per HANDLE_ERRORS.
        """
        host = req.get_host()
        if not host:
            raise urllib2.URLError('no host given')

        try:
            h = self._cm.get_ready_conn(host)
            while h:
                r = self._reuse_connection(h, req, host)

                # if this response is non-None, then it worked and we're
                # done.  Break out, skipping the else block.
                if r:
                    break

                # connection is bad - possibly closed by server
                # discard it and ask for the next free connection
                h.close()
                self._cm.remove(h)
                h = self._cm.get_ready_conn(host)
            else:
                # while/else: reached only when the loop ran out of pooled
                # connections (never after the break above).
                # no (working) free connections were found. Create a new one.
                h = http_class(host)
                if DEBUG:
                    DEBUG.info("creating new connection to %s (%d)",
                               host, id(h))
                self._cm.add(host, h, 0)
                self._start_transaction(h, req)
                r = h.getresponse()
        except (socket.error, httplib.HTTPException), err:
            raise urllib2.URLError(err)

        # if not a persistent connection, don't try to reuse it
        if r.will_close:
            self._cm.remove(h)

        if DEBUG:
            DEBUG.info("STATUS: %s, %s", r.status, r.reason)
        # decorate the response so both our HTTPResponse helpers and
        # plain urllib2 callers find the attributes they expect
        r._handler = self
        r._host = host
        r._url = req.get_full_url()
        r._connection = h
        r.code = r.status
        r.headers = r.msg
        r.msg = r.reason

        if r.status == 200 or not HANDLE_ERRORS:
            return r
        else:
            return self.parent.error('http', req, r,
                                     r.status, r.msg, r.headers)

    def _reuse_connection(self, h, req, host):
        """start the transaction with a re-used connection
        return a response object (r) upon success or None on failure.
        This DOES not close or remove bad connections in cases where
        it returns.  However, if an unexpected exception occurs, it
        will close and remove the connection before re-raising.
        """
        try:
            self._start_transaction(h, req)
            r = h.getresponse()
            # note: just because we got something back doesn't mean it
            # worked.  We'll check the version below, too.
        except (socket.error, httplib.HTTPException):
            r = None
        except:
            # adding this block just in case we've missed
            # something we will still raise the exception, but
            # lets try and close the connection and remove it
            # first.  We previously got into a nasty loop
            # where an exception was uncaught, and so the
            # connection stayed open.  On the next try, the
            # same exception was raised, etc.  The tradeoff is
            # that it's now possible this call will raise
            # a DIFFERENT exception
            if DEBUG:
                DEBUG.error("unexpected exception - closing "
                            "connection to %s (%d)", host, id(h))
            self._cm.remove(h)
            h.close()
            raise

        if r is None or r.version == 9:
            # httplib falls back to assuming HTTP 0.9 if it gets a
            # bad header back.  This is most likely to happen if
            # the socket has been closed by the server since we
            # last used the connection.
            if DEBUG:
                DEBUG.info("failed to re-use connection to %s (%d)",
                           host, id(h))
            r = None
        else:
            if DEBUG:
                DEBUG.info("re-using connection to %s (%d)", host, id(h))

        return r

    def _start_transaction(self, h, req):
        # What follows mostly reimplements HTTPConnection.request()
        # except it adds self.parent.addheaders in the mix.
        headers = req.headers.copy()
        if sys.version_info >= (2, 4):
            headers.update(req.unredirected_hdrs)
        headers.update(self.parent.addheaders)
        # lower-case the names so the duplicate checks below are reliable
        headers = dict((n.lower(), v) for n, v in headers.items())
        skipheaders = {}
        for n in ('host', 'accept-encoding'):
            if n in headers:
                # tell putrequest not to emit its own copy of this header;
                # ours is sent in the loop below
                skipheaders['skip_' + n.replace('-', '_')] = 1
        try:
            if req.has_data():
                data = req.get_data()
                h.putrequest('POST', req.get_selector(), **skipheaders)
                if 'content-type' not in headers:
                    h.putheader('Content-type',
                                'application/x-www-form-urlencoded')
                if 'content-length' not in headers:
                    h.putheader('Content-length', '%d' % len(data))
            else:
                h.putrequest('GET', req.get_selector(), **skipheaders)
        except (socket.error), err:
            raise urllib2.URLError(err)
        for k, v in headers.items():
            h.putheader(k, v)
        h.endheaders()
        if req.has_data():
            h.send(data)
357
357
class HTTPHandler(KeepAliveHandler, urllib2.HTTPHandler):
    # Keep-alive-aware drop-in for urllib2.HTTPHandler: the MRO makes
    # KeepAliveHandler's http_open and pool management take precedence.
    pass
360
360
361 class HTTPResponse(httplib.HTTPResponse):
361 class HTTPResponse(httplib.HTTPResponse):
362 # we need to subclass HTTPResponse in order to
362 # we need to subclass HTTPResponse in order to
363 # 1) add readline() and readlines() methods
363 # 1) add readline() and readlines() methods
364 # 2) add close_connection() methods
364 # 2) add close_connection() methods
365 # 3) add info() and geturl() methods
365 # 3) add info() and geturl() methods
366
366
367 # in order to add readline(), read must be modified to deal with a
367 # in order to add readline(), read must be modified to deal with a
368 # buffer. example: readline must read a buffer and then spit back
368 # buffer. example: readline must read a buffer and then spit back
369 # one line at a time. The only real alternative is to read one
369 # one line at a time. The only real alternative is to read one
370 # BYTE at a time (ick). Once something has been read, it can't be
370 # BYTE at a time (ick). Once something has been read, it can't be
371 # put back (ok, maybe it can, but that's even uglier than this),
371 # put back (ok, maybe it can, but that's even uglier than this),
372 # so if you THEN do a normal read, you must first take stuff from
372 # so if you THEN do a normal read, you must first take stuff from
373 # the buffer.
373 # the buffer.
374
374
 375 # the read method wraps the original to accommodate buffering,
 375 # the read method wraps the original to accommodate buffering,
376 # although read() never adds to the buffer.
376 # although read() never adds to the buffer.
377 # Both readline and readlines have been stolen with almost no
377 # Both readline and readlines have been stolen with almost no
378 # modification from socket.py
378 # modification from socket.py
379
379
380
380
    def __init__(self, sock, debuglevel=0, strict=0, method=None):
        # 'strict' is accepted for interface compatibility but never
        # forwarded: the base-class signature differs across Python
        # versions, as the branches below accommodate.
        # NOTE(review): 'method' is passed positionally as the third
        # argument — on httplib versions with a 'strict' parameter that
        # slot is 'strict'; confirm against the target httplib.
        if method: # the httplib in python 2.3 uses the method arg
            httplib.HTTPResponse.__init__(self, sock, debuglevel, method)
        else: # 2.2 doesn't
            httplib.HTTPResponse.__init__(self, sock, debuglevel)
        self.fileno = sock.fileno
        self.code = None
        self._rbuf = ''        # buffer feeding readline()/readlines()
        self._rbufsize = 8096  # chunk size for buffered raw reads
        self._handler = None # inserted by the handler later
        self._host = None # (same)
        self._url = None # (same)
        self._connection = None # (same)
394
394
    # keep a reference to the unbuffered base-class read(); our read()
    # below wraps it to honor the readline buffer first
    _raw_read = httplib.HTTPResponse.read
396
396
397 def close(self):
397 def close(self):
398 if self.fp:
398 if self.fp:
399 self.fp.close()
399 self.fp.close()
400 self.fp = None
400 self.fp = None
401 if self._handler:
401 if self._handler:
402 self._handler._request_closed(self, self._host,
402 self._handler._request_closed(self, self._host,
403 self._connection)
403 self._connection)
404
404
    def close_connection(self):
        """Hard-close: remove the underlying connection from the pool
        (closing its socket) and then close this response too."""
        self._handler._remove_connection(self._host, self._connection, close=1)
        self.close()
408
408
    def info(self):
        """Return the response headers (urllib2 addinfourl interface)."""
        return self.headers
411
411
    def geturl(self):
        """Return the request URL stored on us by the handler
        (urllib2 addinfourl interface)."""
        return self._url
414
414
415 def read(self, amt=None):
415 def read(self, amt=None):
416 # the _rbuf test is only in this first if for speed. It's not
416 # the _rbuf test is only in this first if for speed. It's not
417 # logically necessary
417 # logically necessary
418 if self._rbuf and not amt is None:
418 if self._rbuf and not amt is None:
419 L = len(self._rbuf)
419 L = len(self._rbuf)
420 if amt > L:
420 if amt > L:
421 amt -= L
421 amt -= L
422 else:
422 else:
423 s = self._rbuf[:amt]
423 s = self._rbuf[:amt]
424 self._rbuf = self._rbuf[amt:]
424 self._rbuf = self._rbuf[amt:]
425 return s
425 return s
426
426
427 s = self._rbuf + self._raw_read(amt)
427 s = self._rbuf + self._raw_read(amt)
428 self._rbuf = ''
428 self._rbuf = ''
429 return s
429 return s
430
430
    # stolen from Python SVN #68532 to fix issue1088
    def _read_chunked(self, amt):
        """Read from a chunked-transfer-encoded body.

        Returns up to *amt* bytes (everything when amt is None) and
        carries partial-chunk state across calls in self.chunk_left.
        Raises httplib.IncompleteRead when a chunk-size line cannot be
        parsed.
        """
        chunk_left = self.chunk_left
        value = ''

        # XXX This accumulates chunks by repeated string concatenation,
        # which is not efficient as the number or size of chunks gets big.
        while True:
            if chunk_left is None:
                # start of a new chunk: parse the "<hex-size>[;ext]" line
                line = self.fp.readline()
                i = line.find(';')
                if i >= 0:
                    line = line[:i] # strip chunk-extensions
                try:
                    chunk_left = int(line, 16)
                except ValueError:
                    # close the connection as protocol synchronisation is
                    # probably lost
                    self.close()
                    raise httplib.IncompleteRead(value)
                if chunk_left == 0:
                    # zero-size chunk terminates the body
                    break
            if amt is None:
                value += self._safe_read(chunk_left)
            elif amt < chunk_left:
                # caller satisfied mid-chunk; remember what remains
                value += self._safe_read(amt)
                self.chunk_left = chunk_left - amt
                return value
            elif amt == chunk_left:
                value += self._safe_read(amt)
                self._safe_read(2)  # toss the CRLF at the end of the chunk
                self.chunk_left = None
                return value
            else:
                value += self._safe_read(chunk_left)
                amt -= chunk_left

            # we read the whole chunk, get another
            self._safe_read(2)      # toss the CRLF at the end of the chunk
            chunk_left = None

        # read and discard trailer up to the CRLF terminator
        ### note: we shouldn't have any trailers!
        while True:
            line = self.fp.readline()
            if not line:
                # a vanishingly small number of sites EOF without
                # sending the trailer
                break
            if line == '\r\n':
                break

        # we read everything; close the "file"
        self.close()

        return value
487
487
    def readline(self, limit=-1):
        """Return the next line from the buffered stream.

        Pulls self._rbufsize pieces from the raw stream until a newline
        appears (or EOF), honoring *limit* as a maximum length when it
        is non-negative.  Adapted from socket.py (see class comment).
        """
        i = self._rbuf.find('\n')
        # keep reading until the buffer holds a newline, EOF is hit, or
        # a positive limit is already satisfiable from the buffer
        while i < 0 and not (0 < limit <= len(self._rbuf)):
            new = self._raw_read(self._rbufsize)
            if not new:
                break
            i = new.find('\n')
            if i >= 0:
                # translate to an index within the combined buffer
                i = i + len(self._rbuf)
            self._rbuf = self._rbuf + new
        if i < 0:
            # no newline found: return everything buffered (EOF case)
            i = len(self._rbuf)
        else:
            # include the newline itself in the returned line
            i = i + 1
        if 0 <= limit < len(self._rbuf):
            i = limit
        data, self._rbuf = self._rbuf[:i], self._rbuf[i:]
        return data
506
506
507 def readlines(self, sizehint = 0):
507 def readlines(self, sizehint = 0):
508 total = 0
508 total = 0
509 list = []
509 list = []
510 while True:
510 while True:
511 line = self.readline()
511 line = self.readline()
512 if not line:
512 if not line:
513 break
513 break
514 list.append(line)
514 list.append(line)
515 total += len(line)
515 total += len(line)
516 if sizehint and total >= sizehint:
516 if sizehint and total >= sizehint:
517 break
517 break
518 return list
518 return list
519
519
def safesend(self, str):
    """Send `str' to the server.

    Shamelessly ripped off from httplib to patch a bad behavior.

    Installed as the send() method of the keepalive connection classes
    (see HTTPConnection below), so `self' is an httplib.HTTPConnection.
    `str' may also be an object with a read() method, in which case its
    contents are streamed to the socket in 8k blocks.
    """
    # _broken_pipe_resp is an attribute we set in this function
    # if the socket is closed while we're sending data but
    # the server sent us a response before hanging up.
    # In that case, we want to pretend to send the rest of the
    # outgoing data, and then let the user use getresponse()
    # (which we wrap) to get this last response before
    # opening a new socket.
    if getattr(self, '_broken_pipe_resp', None) is not None:
        return

    if self.sock is None:
        if self.auto_open:
            # transparently (re)connect, mirroring httplib's behavior
            self.connect()
        else:
            raise httplib.NotConnected()

    # send the data to the server. if we get a broken pipe, then close
    # the socket. we want to reconnect when somebody tries to send again.
    #
    # NOTE: we DO propagate the error, though, because we cannot simply
    # ignore the error... the caller will know if they can retry.
    if self.debuglevel > 0:
        print "send:", repr(str)
    try:
        blocksize = 8192
        # fetch the read method once with getattr (not hasattr) so the
        # bound reference can be reused in the loop below
        read = getattr(str, 'read', None)
        if read is not None:
            if self.debuglevel > 0:
                print "sendIng a read()able"
            data = read(blocksize)
            while data:
                self.sock.sendall(data)
                data = read(blocksize)
        else:
            self.sock.sendall(str)
    except socket.error, v:
        # Python 2 except syntax; v[0] is the errno of the socket error
        reraise = True
        if v[0] == errno.EPIPE: # Broken pipe
            if self._HTTPConnection__state == httplib._CS_REQ_SENT:
                # the request was fully sent before the hangup, so the
                # server may already have answered -- capture that
                # response and suppress the error
                self._broken_pipe_resp = None
                self._broken_pipe_resp = self.getresponse()
                reraise = False
            self.close()
        if reraise:
            raise
569
570
def wrapgetresponse(cls):
    """Return a broken-pipe-sane replacement for cls.getresponse()."""
    def safegetresponse(self):
        # safesend() may have stashed the response it received just
        # before the server hung up; if so, the socket is already
        # closed and we simply hand that response back.  Otherwise
        # defer to the class's normal getresponse().
        cached = getattr(self, '_broken_pipe_resp', None)
        if cached is None:
            return cls.getresponse(self)
        return cached
    safegetresponse.__doc__ = cls.getresponse.__doc__
    return safegetresponse
584
585
class HTTPConnection(httplib.HTTPConnection):
    # use the modified response class
    response_class = HTTPResponse
    # broken-pipe-tolerant send()/getresponse() (see safesend and
    # wrapgetresponse defined earlier in this module)
    send = safesend
    getresponse = wrapgetresponse(httplib.HTTPConnection)
590
591
591
592
592 #########################################################################
593 #########################################################################
593 ##### TEST FUNCTIONS
594 ##### TEST FUNCTIONS
594 #########################################################################
595 #########################################################################
595
596
def error_handler(url):
    """Fetch `url' twice through the keepalive handler -- once with
    fancy error handling off and once on -- printing the status/reason
    each time plus the handler's open connections at the end.

    Flips the module-global HANDLE_ERRORS (restored before returning)
    and installs a new global urllib2 opener as a side effect.
    """
    global HANDLE_ERRORS
    orig = HANDLE_ERRORS
    keepalive_handler = HTTPHandler()
    opener = urllib2.build_opener(keepalive_handler)
    urllib2.install_opener(opener)
    pos = {0: 'off', 1: 'on'}
    for i in (0, 1):
        print " fancy error handling %s (HANDLE_ERRORS = %i)" % (pos[i], i)
        HANDLE_ERRORS = i
        try:
            fo = urllib2.urlopen(url)
            fo.read()
            fo.close()
            try:
                # with fancy error handling off the response object may
                # not expose status/reason attributes
                status, reason = fo.status, fo.reason
            except AttributeError:
                status, reason = None, None
        except IOError, e:
            print " EXCEPTION: %s" % e
            raise
        else:
            print " status = %s, reason = %s" % (status, reason)
    HANDLE_ERRORS = orig
    hosts = keepalive_handler.open_connections()
    print "open connections:", hosts
    keepalive_handler.close_all()
623
624
def md5(s):
    """Hash `s' with md5, using whichever implementation is available.

    Prefers hashlib; falls back to the old md5 module on ancient
    Pythons.  On first use this shim rebinds the global name `md5' to
    the real constructor so later calls skip the import entirely.
    """
    global md5
    try:
        from hashlib import md5 as impl
    except ImportError:
        # pre-hashlib Pythons
        from md5 import md5 as impl
    md5 = impl
    return impl(s)
632
633
633 def continuity(url):
634 def continuity(url):
634 format = '%25s: %s'
635 format = '%25s: %s'
635
636
636 # first fetch the file with the normal http handler
637 # first fetch the file with the normal http handler
637 opener = urllib2.build_opener()
638 opener = urllib2.build_opener()
638 urllib2.install_opener(opener)
639 urllib2.install_opener(opener)
639 fo = urllib2.urlopen(url)
640 fo = urllib2.urlopen(url)
640 foo = fo.read()
641 foo = fo.read()
641 fo.close()
642 fo.close()
642 m = md5.new(foo)
643 m = md5.new(foo)
643 print format % ('normal urllib', m.hexdigest())
644 print format % ('normal urllib', m.hexdigest())
644
645
645 # now install the keepalive handler and try again
646 # now install the keepalive handler and try again
646 opener = urllib2.build_opener(HTTPHandler())
647 opener = urllib2.build_opener(HTTPHandler())
647 urllib2.install_opener(opener)
648 urllib2.install_opener(opener)
648
649
649 fo = urllib2.urlopen(url)
650 fo = urllib2.urlopen(url)
650 foo = fo.read()
651 foo = fo.read()
651 fo.close()
652 fo.close()
652 m = md5.new(foo)
653 m = md5.new(foo)
653 print format % ('keepalive read', m.hexdigest())
654 print format % ('keepalive read', m.hexdigest())
654
655
655 fo = urllib2.urlopen(url)
656 fo = urllib2.urlopen(url)
656 foo = ''
657 foo = ''
657 while True:
658 while True:
658 f = fo.readline()
659 f = fo.readline()
659 if f:
660 if f:
660 foo = foo + f
661 foo = foo + f
661 else: break
662 else: break
662 fo.close()
663 fo.close()
663 m = md5.new(foo)
664 m = md5.new(foo)
664 print format % ('keepalive readline', m.hexdigest())
665 print format % ('keepalive readline', m.hexdigest())
665
666
def comp(N, url):
    """Fetch `url' N times with the stock urllib2 handlers, then N
    times with the keepalive handler, printing both timings and the
    speedup factor.  Installs global openers as a side effect."""
    print ' making %i connections to:\n %s' % (N, url)

    sys.stdout.write(' first using the normal urllib handlers')
    # first use normal opener
    opener = urllib2.build_opener()
    urllib2.install_opener(opener)
    t1 = fetch(N, url)
    print ' TIME: %.3f s' % t1

    sys.stdout.write(' now using the keepalive handler ')
    # now install the keepalive handler and try again
    opener = urllib2.build_opener(HTTPHandler())
    urllib2.install_opener(opener)
    t2 = fetch(N, url)
    print ' TIME: %.3f s' % t2
    print ' improvement factor: %.2f' % (t1 / t2)
683
684
684 def fetch(N, url, delay=0):
685 def fetch(N, url, delay=0):
685 import time
686 import time
686 lens = []
687 lens = []
687 starttime = time.time()
688 starttime = time.time()
688 for i in range(N):
689 for i in range(N):
689 if delay and i > 0:
690 if delay and i > 0:
690 time.sleep(delay)
691 time.sleep(delay)
691 fo = urllib2.urlopen(url)
692 fo = urllib2.urlopen(url)
692 foo = fo.read()
693 foo = fo.read()
693 fo.close()
694 fo.close()
694 lens.append(len(foo))
695 lens.append(len(foo))
695 diff = time.time() - starttime
696 diff = time.time() - starttime
696
697
697 j = 0
698 j = 0
698 for i in lens[1:]:
699 for i in lens[1:]:
699 j = j + 1
700 j = j + 1
700 if not i == lens[0]:
701 if not i == lens[0]:
701 print "WARNING: inconsistent length on read %i: %i" % (j, i)
702 print "WARNING: inconsistent length on read %i: %i" % (j, i)
702
703
703 return diff
704 return diff
704
705
def test_timeout(url):
    """Check that a dropped keepalive connection is transparently
    re-established: fetch `url', wait 20 seconds for the server to
    close the idle connection, fetch again, and compare payloads.

    Temporarily replaces the module-global DEBUG logger with one that
    prints everything, so the reconnect activity is visible; the
    original logger is restored before returning.
    """
    global DEBUG
    dbbackup = DEBUG
    class FakeLogger(object):
        # print-everything stand-in for the real DEBUG logger
        def debug(self, msg, *args):
            print msg % args
        info = warning = error = debug
    DEBUG = FakeLogger()
    print " fetching the file to establish a connection"
    fo = urllib2.urlopen(url)
    data1 = fo.read()
    fo.close()

    i = 20
    print " waiting %i seconds for the server to close the connection" % i
    while i > 0:
        # in-place countdown on a single terminal line
        sys.stdout.write('\r %2i' % i)
        sys.stdout.flush()
        time.sleep(1)
        i -= 1
    sys.stderr.write('\r')

    print " fetching the file a second time"
    fo = urllib2.urlopen(url)
    data2 = fo.read()
    fo.close()

    if data1 == data2:
        print ' data are identical'
    else:
        print ' ERROR: DATA DIFFER'

    DEBUG = dbbackup
738
739
739
740
def test(url, N=10):
    """Run the whole self-test suite against `url': error handling,
    continuity, speed comparison, and dropped-connection recovery.
    Exits early if the error-handler test raises IOError."""
    print "checking error hander (do this on a non-200)"
    try: error_handler(url)
    except IOError:
        print "exiting - exception will prevent further tests"
        sys.exit()
    print
    print "performing continuity test (making sure stuff isn't corrupted)"
    continuity(url)
    print
    print "performing speed comparison"
    comp(N, url)
    print
    print "performing dropped-connection check"
    test_timeout(url)
755
756
756 if __name__ == '__main__':
757 if __name__ == '__main__':
757 import time
758 import time
758 import sys
759 import sys
759 try:
760 try:
760 N = int(sys.argv[1])
761 N = int(sys.argv[1])
761 url = sys.argv[2]
762 url = sys.argv[2]
762 except:
763 except:
763 print "%s <integer> <url>" % sys.argv[0]
764 print "%s <integer> <url>" % sys.argv[0]
764 else:
765 else:
765 test(url, N)
766 test(url, N)
General Comments 0
You need to be logged in to leave comments. Login now