##// END OF EJS Templates
keepalive: fix how md5 is used...
Mike Hommey -
r22505:232d437a stable
parent child Browse files
Show More
@@ -1,761 +1,761 b''
1 # This library is free software; you can redistribute it and/or
1 # This library is free software; you can redistribute it and/or
2 # modify it under the terms of the GNU Lesser General Public
2 # modify it under the terms of the GNU Lesser General Public
3 # License as published by the Free Software Foundation; either
3 # License as published by the Free Software Foundation; either
4 # version 2.1 of the License, or (at your option) any later version.
4 # version 2.1 of the License, or (at your option) any later version.
5 #
5 #
6 # This library is distributed in the hope that it will be useful,
6 # This library is distributed in the hope that it will be useful,
7 # but WITHOUT ANY WARRANTY; without even the implied warranty of
7 # but WITHOUT ANY WARRANTY; without even the implied warranty of
8 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
8 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
9 # Lesser General Public License for more details.
9 # Lesser General Public License for more details.
10 #
10 #
11 # You should have received a copy of the GNU Lesser General Public
11 # You should have received a copy of the GNU Lesser General Public
12 # License along with this library; if not, see
12 # License along with this library; if not, see
13 # <http://www.gnu.org/licenses/>.
13 # <http://www.gnu.org/licenses/>.
14
14
15 # This file is part of urlgrabber, a high-level cross-protocol url-grabber
15 # This file is part of urlgrabber, a high-level cross-protocol url-grabber
16 # Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
16 # Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
17
17
18 # Modified by Benoit Boissinot:
18 # Modified by Benoit Boissinot:
19 # - fix for digest auth (inspired from urllib2.py @ Python v2.4)
19 # - fix for digest auth (inspired from urllib2.py @ Python v2.4)
20 # Modified by Dirkjan Ochtman:
20 # Modified by Dirkjan Ochtman:
21 # - import md5 function from a local util module
21 # - import md5 function from a local util module
22 # Modified by Martin Geisler:
22 # Modified by Martin Geisler:
23 # - moved md5 function from local util module to this module
23 # - moved md5 function from local util module to this module
24 # Modified by Augie Fackler:
24 # Modified by Augie Fackler:
25 # - add safesend method and use it to prevent broken pipe errors
25 # - add safesend method and use it to prevent broken pipe errors
26 # on large POST requests
26 # on large POST requests
27
27
28 """An HTTP handler for urllib2 that supports HTTP 1.1 and keepalive.
28 """An HTTP handler for urllib2 that supports HTTP 1.1 and keepalive.
29
29
30 >>> import urllib2
30 >>> import urllib2
31 >>> from keepalive import HTTPHandler
31 >>> from keepalive import HTTPHandler
32 >>> keepalive_handler = HTTPHandler()
32 >>> keepalive_handler = HTTPHandler()
33 >>> opener = urllib2.build_opener(keepalive_handler)
33 >>> opener = urllib2.build_opener(keepalive_handler)
34 >>> urllib2.install_opener(opener)
34 >>> urllib2.install_opener(opener)
35 >>>
35 >>>
36 >>> fo = urllib2.urlopen('http://www.python.org')
36 >>> fo = urllib2.urlopen('http://www.python.org')
37
37
38 If a connection to a given host is requested, and all of the existing
38 If a connection to a given host is requested, and all of the existing
39 connections are still in use, another connection will be opened. If
39 connections are still in use, another connection will be opened. If
40 the handler tries to use an existing connection but it fails in some
40 the handler tries to use an existing connection but it fails in some
41 way, it will be closed and removed from the pool.
41 way, it will be closed and removed from the pool.
42
42
43 To remove the handler, simply re-run build_opener with no arguments, and
43 To remove the handler, simply re-run build_opener with no arguments, and
44 install that opener.
44 install that opener.
45
45
46 You can explicitly close connections by using the close_connection()
46 You can explicitly close connections by using the close_connection()
47 method of the returned file-like object (described below) or you can
47 method of the returned file-like object (described below) or you can
48 use the handler methods:
48 use the handler methods:
49
49
50 close_connection(host)
50 close_connection(host)
51 close_all()
51 close_all()
52 open_connections()
52 open_connections()
53
53
54 NOTE: using the close_connection and close_all methods of the handler
54 NOTE: using the close_connection and close_all methods of the handler
55 should be done with care when using multiple threads.
55 should be done with care when using multiple threads.
56 * there is nothing that prevents another thread from creating new
56 * there is nothing that prevents another thread from creating new
57 connections immediately after connections are closed
57 connections immediately after connections are closed
58 * no checks are done to prevent in-use connections from being closed
58 * no checks are done to prevent in-use connections from being closed
59
59
60 >>> keepalive_handler.close_all()
60 >>> keepalive_handler.close_all()
61
61
62 EXTRA ATTRIBUTES AND METHODS
62 EXTRA ATTRIBUTES AND METHODS
63
63
64 Upon a status of 200, the object returned has a few additional
64 Upon a status of 200, the object returned has a few additional
65 attributes and methods, which should not be used if you want to
65 attributes and methods, which should not be used if you want to
66 remain consistent with the normal urllib2-returned objects:
66 remain consistent with the normal urllib2-returned objects:
67
67
68 close_connection() - close the connection to the host
68 close_connection() - close the connection to the host
69 readlines() - you know, readlines()
69 readlines() - you know, readlines()
70 status - the return status (i.e. 404)
70 status - the return status (i.e. 404)
71 reason - english translation of status (i.e. 'File not found')
71 reason - english translation of status (i.e. 'File not found')
72
72
73 If you want the best of both worlds, use this inside an
73 If you want the best of both worlds, use this inside an
74 AttributeError-catching try:
74 AttributeError-catching try:
75
75
76 >>> try: status = fo.status
76 >>> try: status = fo.status
77 >>> except AttributeError: status = None
77 >>> except AttributeError: status = None
78
78
79 Unfortunately, these are ONLY there if status == 200, so it's not
79 Unfortunately, these are ONLY there if status == 200, so it's not
80 easy to distinguish between non-200 responses. The reason is that
80 easy to distinguish between non-200 responses. The reason is that
81 urllib2 tries to do clever things with error codes 301, 302, 401,
81 urllib2 tries to do clever things with error codes 301, 302, 401,
82 and 407, and it wraps the object upon return.
82 and 407, and it wraps the object upon return.
83
83
84 For python versions earlier than 2.4, you can avoid this fancy error
84 For python versions earlier than 2.4, you can avoid this fancy error
85 handling by setting the module-level global HANDLE_ERRORS to zero.
85 handling by setting the module-level global HANDLE_ERRORS to zero.
86 You see, prior to 2.4, it's the HTTP Handler's job to determine what
86 You see, prior to 2.4, it's the HTTP Handler's job to determine what
87 to handle specially, and what to just pass up. HANDLE_ERRORS == 0
87 to handle specially, and what to just pass up. HANDLE_ERRORS == 0
88 means "pass everything up". In python 2.4, however, this job no
88 means "pass everything up". In python 2.4, however, this job no
89 longer belongs to the HTTP Handler and is now done by a NEW handler,
89 longer belongs to the HTTP Handler and is now done by a NEW handler,
90 HTTPErrorProcessor. Here's the bottom line:
90 HTTPErrorProcessor. Here's the bottom line:
91
91
92 python version < 2.4
92 python version < 2.4
93 HANDLE_ERRORS == 1 (default) pass up 200, treat the rest as
93 HANDLE_ERRORS == 1 (default) pass up 200, treat the rest as
94 errors
94 errors
95 HANDLE_ERRORS == 0 pass everything up, error processing is
95 HANDLE_ERRORS == 0 pass everything up, error processing is
96 left to the calling code
96 left to the calling code
97 python version >= 2.4
97 python version >= 2.4
98 HANDLE_ERRORS == 1 pass up 200, treat the rest as errors
98 HANDLE_ERRORS == 1 pass up 200, treat the rest as errors
99 HANDLE_ERRORS == 0 (default) pass everything up, let the
99 HANDLE_ERRORS == 0 (default) pass everything up, let the
100 other handlers (specifically,
100 other handlers (specifically,
101 HTTPErrorProcessor) decide what to do
101 HTTPErrorProcessor) decide what to do
102
102
103 In practice, setting the variable either way makes little difference
103 In practice, setting the variable either way makes little difference
104 in python 2.4, so for the most consistent behavior across versions,
104 in python 2.4, so for the most consistent behavior across versions,
105 you probably just want to use the defaults, which will give you
105 you probably just want to use the defaults, which will give you
106 exceptions on errors.
106 exceptions on errors.
107
107
108 """
108 """
109
109
110 # $Id: keepalive.py,v 1.14 2006/04/04 21:00:32 mstenner Exp $
110 # $Id: keepalive.py,v 1.14 2006/04/04 21:00:32 mstenner Exp $
111
111
112 import errno
112 import errno
113 import httplib
113 import httplib
114 import socket
114 import socket
115 import thread
115 import thread
116 import urllib2
116 import urllib2
117
117
118 DEBUG = None
118 DEBUG = None
119
119
120 import sys
120 import sys
121 if sys.version_info < (2, 4):
121 if sys.version_info < (2, 4):
122 HANDLE_ERRORS = 1
122 HANDLE_ERRORS = 1
123 else: HANDLE_ERRORS = 0
123 else: HANDLE_ERRORS = 0
124
124
class ConnectionManager(object):
    """Thread-safe registry of keep-alive connections.

    Tracks, for each host, the pooled connections and whether each one
    is ready (idle) or currently serving a request.
    """
    def __init__(self):
        self._lock = thread.allocate_lock()
        self._hostmap = {} # map hosts to a list of connections
        self._connmap = {} # map connections to host
        self._readymap = {} # map connection to ready state

    def add(self, host, connection, ready):
        """Register `connection` for `host` with the given ready state."""
        self._lock.acquire()
        try:
            if host not in self._hostmap:
                self._hostmap[host] = []
            self._hostmap[host].append(connection)
            self._connmap[connection] = host
            self._readymap[connection] = ready
        finally:
            self._lock.release()

    def remove(self, connection):
        """Forget `connection` entirely; a no-op if it is unknown."""
        self._lock.acquire()
        try:
            try:
                host = self._connmap[connection]
            except KeyError:
                pass
            else:
                del self._connmap[connection]
                del self._readymap[connection]
                self._hostmap[host].remove(connection)
                if not self._hostmap[host]: del self._hostmap[host]
        finally:
            self._lock.release()

    def set_ready(self, connection, ready):
        """Mark a tracked connection as ready (idle) or busy.

        Bug fix: the original wrapped the assignment in try/except
        KeyError, but a dict *assignment* never raises KeyError, so
        marking a connection that had already been removed silently
        re-inserted it into _readymap and leaked the entry.  Test
        membership instead, so unknown connections are ignored as the
        dead except clause clearly intended.
        """
        if connection in self._readymap:
            self._readymap[connection] = ready

    def get_ready_conn(self, host):
        """Return a ready connection for `host`, marking it busy, or
        None if no idle connection exists."""
        conn = None
        self._lock.acquire()
        try:
            if host in self._hostmap:
                for c in self._hostmap[host]:
                    if self._readymap[c]:
                        self._readymap[c] = 0
                        conn = c
                        break
        finally:
            self._lock.release()
        return conn

    def get_all(self, host=None):
        """With `host`: a copy of that host's connection list ([] if
        unknown).  Without: a copy of the host -> connections mapping."""
        if host:
            return list(self._hostmap.get(host, []))
        else:
            return dict(self._hostmap)
187
187
class KeepAliveHandler(object):
    """urllib2-style handler base that pools HTTP connections per host.

    Mixed into a urllib2 handler class (see HTTPHandler below);
    ``self.parent`` is the OpenerDirector supplied by urllib2.
    """
    def __init__(self):
        # shared pool of per-host connections
        self._cm = ConnectionManager()

    #### Connection Management
    def open_connections(self):
        """return a list of connected hosts and the number of connections
        to each.  [('foo.com:80', 2), ('bar.org', 1)]"""
        return [(host, len(li)) for (host, li) in self._cm.get_all().items()]

    def close_connection(self, host):
        """close connection(s) to <host>
        host is the host:port spec, as in 'www.cnn.com:8080' as passed in.
        no error occurs if there is no connection to that host."""
        for h in self._cm.get_all(host):
            self._cm.remove(h)
            h.close()

    def close_all(self):
        """close all open connections"""
        for host, conns in self._cm.get_all().iteritems():
            for h in conns:
                self._cm.remove(h)
                h.close()

    def _request_closed(self, request, host, connection):
        """tells us that this request is now closed and that the
        connection is ready for another request"""
        self._cm.set_ready(connection, 1)

    def _remove_connection(self, host, connection, close=0):
        # drop `connection` from the pool, optionally closing its socket
        if close:
            connection.close()
        self._cm.remove(connection)

    #### Transaction Execution
    def http_open(self, req):
        # NOTE(review): HTTPConnection is presumably defined elsewhere in
        # this module (not visible in this chunk) — confirm.
        return self.do_open(HTTPConnection, req)

    def do_open(self, http_class, req):
        """Open `req`, re-using a pooled connection to the host when one
        is ready, otherwise creating a new `http_class` connection.

        Returns the response object (decorated with handler bookkeeping
        attributes), or delegates non-200 statuses to the parent opener's
        error handling when HANDLE_ERRORS is set.  Raises
        urllib2.URLError on socket/HTTP-level failures.
        """
        host = req.get_host()
        if not host:
            raise urllib2.URLError('no host given')

        try:
            h = self._cm.get_ready_conn(host)
            # Try each ready pooled connection in turn; the while-loop's
            # `else` runs only when no connection could be re-used.
            while h:
                r = self._reuse_connection(h, req, host)

                # if this response is non-None, then it worked and we're
                # done.  Break out, skipping the else block.
                if r:
                    break

                # connection is bad - possibly closed by server
                # discard it and ask for the next free connection
                h.close()
                self._cm.remove(h)
                h = self._cm.get_ready_conn(host)
            else:
                # no (working) free connections were found.  Create a new one.
                h = http_class(host)
                if DEBUG:
                    DEBUG.info("creating new connection to %s (%d)",
                               host, id(h))
                # registered not-ready: in use until the response closes
                self._cm.add(host, h, 0)
                self._start_transaction(h, req)
                r = h.getresponse()
        except (socket.error, httplib.HTTPException), err:
            raise urllib2.URLError(err)

        # if not a persistent connection, don't try to reuse it
        if r.will_close:
            self._cm.remove(h)

        if DEBUG:
            DEBUG.info("STATUS: %s, %s", r.status, r.reason)
        # decorate the response so its close()/close_connection() can
        # notify this handler (see HTTPResponse below)
        r._handler = self
        r._host = host
        r._url = req.get_full_url()
        r._connection = h
        r.code = r.status
        r.headers = r.msg
        r.msg = r.reason

        if r.status == 200 or not HANDLE_ERRORS:
            return r
        else:
            return self.parent.error('http', req, r,
                                     r.status, r.msg, r.headers)

    def _reuse_connection(self, h, req, host):
        """start the transaction with a re-used connection
        return a response object (r) upon success or None on failure.
        This DOES not close or remove bad connections in cases where
        it returns.  However, if an unexpected exception occurs, it
        will close and remove the connection before re-raising.
        """
        try:
            self._start_transaction(h, req)
            r = h.getresponse()
            # note: just because we got something back doesn't mean it
            # worked.  We'll check the version below, too.
        except (socket.error, httplib.HTTPException):
            r = None
        except: # re-raises
            # adding this block just in case we've missed
            # something we will still raise the exception, but
            # lets try and close the connection and remove it
            # first.  We previously got into a nasty loop
            # where an exception was uncaught, and so the
            # connection stayed open.  On the next try, the
            # same exception was raised, etc.  The trade-off is
            # that it's now possible this call will raise
            # a DIFFERENT exception
            if DEBUG:
                DEBUG.error("unexpected exception - closing "
                            "connection to %s (%d)", host, id(h))
            self._cm.remove(h)
            h.close()
            raise

        if r is None or r.version == 9:
            # httplib falls back to assuming HTTP 0.9 if it gets a
            # bad header back.  This is most likely to happen if
            # the socket has been closed by the server since we
            # last used the connection.
            if DEBUG:
                DEBUG.info("failed to re-use connection to %s (%d)",
                           host, id(h))
            r = None
        else:
            if DEBUG:
                DEBUG.info("re-using connection to %s (%d)", host, id(h))

        return r

    def _start_transaction(self, h, req):
        # What follows mostly reimplements HTTPConnection.request()
        # except it adds self.parent.addheaders in the mix.
        headers = req.headers.copy()
        if sys.version_info >= (2, 4):
            headers.update(req.unredirected_hdrs)
        headers.update(self.parent.addheaders)
        # normalize header names for the membership tests below
        headers = dict((n.lower(), v) for n, v in headers.items())
        skipheaders = {}
        for n in ('host', 'accept-encoding'):
            if n in headers:
                # tell httplib's putrequest not to emit its own copy
                skipheaders['skip_' + n.replace('-', '_')] = 1
        try:
            if req.has_data():
                data = req.get_data()
                h.putrequest('POST', req.get_selector(), **skipheaders)
                if 'content-type' not in headers:
                    h.putheader('Content-type',
                                'application/x-www-form-urlencoded')
                if 'content-length' not in headers:
                    h.putheader('Content-length', '%d' % len(data))
            else:
                h.putrequest('GET', req.get_selector(), **skipheaders)
        except (socket.error), err:
            raise urllib2.URLError(err)
        for k, v in headers.items():
            h.putheader(k, v)
        h.endheaders()
        if req.has_data():
            h.send(data)
355
355
class HTTPHandler(KeepAliveHandler, urllib2.HTTPHandler):
    # urllib2-compatible keep-alive handler: KeepAliveHandler supplies
    # http_open() and the connection pool, urllib2.HTTPHandler supplies
    # the standard handler plumbing.
    pass
358
358
359 class HTTPResponse(httplib.HTTPResponse):
359 class HTTPResponse(httplib.HTTPResponse):
360 # we need to subclass HTTPResponse in order to
360 # we need to subclass HTTPResponse in order to
361 # 1) add readline() and readlines() methods
361 # 1) add readline() and readlines() methods
362 # 2) add close_connection() methods
362 # 2) add close_connection() methods
363 # 3) add info() and geturl() methods
363 # 3) add info() and geturl() methods
364
364
365 # in order to add readline(), read must be modified to deal with a
365 # in order to add readline(), read must be modified to deal with a
366 # buffer. example: readline must read a buffer and then spit back
366 # buffer. example: readline must read a buffer and then spit back
367 # one line at a time. The only real alternative is to read one
367 # one line at a time. The only real alternative is to read one
368 # BYTE at a time (ick). Once something has been read, it can't be
368 # BYTE at a time (ick). Once something has been read, it can't be
369 # put back (ok, maybe it can, but that's even uglier than this),
369 # put back (ok, maybe it can, but that's even uglier than this),
370 # so if you THEN do a normal read, you must first take stuff from
370 # so if you THEN do a normal read, you must first take stuff from
371 # the buffer.
371 # the buffer.
372
372
373 # the read method wraps the original to accommodate buffering,
373 # the read method wraps the original to accommodate buffering,
374 # although read() never adds to the buffer.
374 # although read() never adds to the buffer.
375 # Both readline and readlines have been stolen with almost no
375 # Both readline and readlines have been stolen with almost no
376 # modification from socket.py
376 # modification from socket.py
377
377
378
378
379 def __init__(self, sock, debuglevel=0, strict=0, method=None):
379 def __init__(self, sock, debuglevel=0, strict=0, method=None):
380 httplib.HTTPResponse.__init__(self, sock, debuglevel, method)
380 httplib.HTTPResponse.__init__(self, sock, debuglevel, method)
381 self.fileno = sock.fileno
381 self.fileno = sock.fileno
382 self.code = None
382 self.code = None
383 self._rbuf = ''
383 self._rbuf = ''
384 self._rbufsize = 8096
384 self._rbufsize = 8096
385 self._handler = None # inserted by the handler later
385 self._handler = None # inserted by the handler later
386 self._host = None # (same)
386 self._host = None # (same)
387 self._url = None # (same)
387 self._url = None # (same)
388 self._connection = None # (same)
388 self._connection = None # (same)
389
389
390 _raw_read = httplib.HTTPResponse.read
390 _raw_read = httplib.HTTPResponse.read
391
391
    def close(self):
        """Close the response's file object, then tell the owning
        handler (if any) that the connection can serve another request."""
        if self.fp:
            self.fp.close()
            self.fp = None
        if self._handler:
            # hands the connection back to the pool as "ready"
            self._handler._request_closed(self, self._host,
                                          self._connection)
399
399
    def close_connection(self):
        """Remove this response's connection from the handler's pool and
        close it (close=1), then close the response itself."""
        self._handler._remove_connection(self._host, self._connection, close=1)
        self.close()
403
403
    def info(self):
        """Return the response headers (set by the handler in do_open),
        mimicking urllib2's addinfourl interface."""
        return self.headers
406
406
    def geturl(self):
        """Return the full URL this response came from (set by the
        handler in do_open), mimicking urllib2's addinfourl interface."""
        return self._url
409
409
410 def read(self, amt=None):
410 def read(self, amt=None):
411 # the _rbuf test is only in this first if for speed. It's not
411 # the _rbuf test is only in this first if for speed. It's not
412 # logically necessary
412 # logically necessary
413 if self._rbuf and not amt is None:
413 if self._rbuf and not amt is None:
414 L = len(self._rbuf)
414 L = len(self._rbuf)
415 if amt > L:
415 if amt > L:
416 amt -= L
416 amt -= L
417 else:
417 else:
418 s = self._rbuf[:amt]
418 s = self._rbuf[:amt]
419 self._rbuf = self._rbuf[amt:]
419 self._rbuf = self._rbuf[amt:]
420 return s
420 return s
421
421
422 s = self._rbuf + self._raw_read(amt)
422 s = self._rbuf + self._raw_read(amt)
423 self._rbuf = ''
423 self._rbuf = ''
424 return s
424 return s
425
425
    # stolen from Python SVN #68532 to fix issue1088
    def _read_chunked(self, amt):
        """Read `amt` bytes (or the whole body when amt is None) from a
        chunked-transfer-encoded response.

        self.chunk_left carries the unread remainder of the current
        chunk across calls so a partial chunk can be resumed.  Raises
        httplib.IncompleteRead if the chunk framing is corrupt.
        """
        chunk_left = self.chunk_left
        value = ''

        # XXX This accumulates chunks by repeated string concatenation,
        # which is not efficient as the number or size of chunks gets big.
        while True:
            if chunk_left is None:
                # start of a new chunk: parse the hex size line
                line = self.fp.readline()
                i = line.find(';')
                if i >= 0:
                    line = line[:i] # strip chunk-extensions
                try:
                    chunk_left = int(line, 16)
                except ValueError:
                    # close the connection as protocol synchronization is
                    # probably lost
                    self.close()
                    raise httplib.IncompleteRead(value)
                if chunk_left == 0:
                    # zero-length chunk terminates the body
                    break
            if amt is None:
                value += self._safe_read(chunk_left)
            elif amt < chunk_left:
                value += self._safe_read(amt)
                # remember how much of this chunk is still unread
                self.chunk_left = chunk_left - amt
                return value
            elif amt == chunk_left:
                value += self._safe_read(amt)
                self._safe_read(2)  # toss the CRLF at the end of the chunk
                self.chunk_left = None
                return value
            else:
                value += self._safe_read(chunk_left)
                amt -= chunk_left

            # we read the whole chunk, get another
            self._safe_read(2)      # toss the CRLF at the end of the chunk
            chunk_left = None

        # read and discard trailer up to the CRLF terminator
        ### note: we shouldn't have any trailers!
        while True:
            line = self.fp.readline()
            if not line:
                # a vanishingly small number of sites EOF without
                # sending the trailer
                break
            if line == '\r\n':
                break

        # we read everything; close the "file"
        self.close()

        return value
482
482
    def readline(self, limit=-1):
        """Read one line (or up to `limit` bytes) from the response,
        refilling the internal buffer via _raw_read as needed.

        Adapted, like readlines(), from socket.py's buffered readline.
        """
        i = self._rbuf.find('\n')
        # keep reading until the buffer holds a newline, EOF is hit, or
        # the buffer already satisfies a positive `limit`
        while i < 0 and not (0 < limit <= len(self._rbuf)):
            new = self._raw_read(self._rbufsize)
            if not new:
                break
            i = new.find('\n')
            if i >= 0:
                # translate to an index within the combined buffer
                i = i + len(self._rbuf)
            self._rbuf = self._rbuf + new
        if i < 0:
            # no newline found: return everything buffered (EOF case)
            i = len(self._rbuf)
        else:
            # include the newline itself in the returned line
            i = i + 1
        if 0 <= limit < len(self._rbuf):
            i = limit
        data, self._rbuf = self._rbuf[:i], self._rbuf[i:]
        return data
501
501
502 def readlines(self, sizehint=0):
502 def readlines(self, sizehint=0):
503 total = 0
503 total = 0
504 list = []
504 list = []
505 while True:
505 while True:
506 line = self.readline()
506 line = self.readline()
507 if not line:
507 if not line:
508 break
508 break
509 list.append(line)
509 list.append(line)
510 total += len(line)
510 total += len(line)
511 if sizehint and total >= sizehint:
511 if sizehint and total >= sizehint:
512 break
512 break
513 return list
513 return list
514
514
515 def safesend(self, str):
515 def safesend(self, str):
516 """Send `str' to the server.
516 """Send `str' to the server.
517
517
518 Shamelessly ripped off from httplib to patch a bad behavior.
518 Shamelessly ripped off from httplib to patch a bad behavior.
519 """
519 """
520 # _broken_pipe_resp is an attribute we set in this function
520 # _broken_pipe_resp is an attribute we set in this function
521 # if the socket is closed while we're sending data but
521 # if the socket is closed while we're sending data but
522 # the server sent us a response before hanging up.
522 # the server sent us a response before hanging up.
523 # In that case, we want to pretend to send the rest of the
523 # In that case, we want to pretend to send the rest of the
524 # outgoing data, and then let the user use getresponse()
524 # outgoing data, and then let the user use getresponse()
525 # (which we wrap) to get this last response before
525 # (which we wrap) to get this last response before
526 # opening a new socket.
526 # opening a new socket.
527 if getattr(self, '_broken_pipe_resp', None) is not None:
527 if getattr(self, '_broken_pipe_resp', None) is not None:
528 return
528 return
529
529
530 if self.sock is None:
530 if self.sock is None:
531 if self.auto_open:
531 if self.auto_open:
532 self.connect()
532 self.connect()
533 else:
533 else:
534 raise httplib.NotConnected
534 raise httplib.NotConnected
535
535
536 # send the data to the server. if we get a broken pipe, then close
536 # send the data to the server. if we get a broken pipe, then close
537 # the socket. we want to reconnect when somebody tries to send again.
537 # the socket. we want to reconnect when somebody tries to send again.
538 #
538 #
539 # NOTE: we DO propagate the error, though, because we cannot simply
539 # NOTE: we DO propagate the error, though, because we cannot simply
540 # ignore the error... the caller will know if they can retry.
540 # ignore the error... the caller will know if they can retry.
541 if self.debuglevel > 0:
541 if self.debuglevel > 0:
542 print "send:", repr(str)
542 print "send:", repr(str)
543 try:
543 try:
544 blocksize = 8192
544 blocksize = 8192
545 read = getattr(str, 'read', None)
545 read = getattr(str, 'read', None)
546 if read is not None:
546 if read is not None:
547 if self.debuglevel > 0:
547 if self.debuglevel > 0:
548 print "sending a read()able"
548 print "sending a read()able"
549 data = read(blocksize)
549 data = read(blocksize)
550 while data:
550 while data:
551 self.sock.sendall(data)
551 self.sock.sendall(data)
552 data = read(blocksize)
552 data = read(blocksize)
553 else:
553 else:
554 self.sock.sendall(str)
554 self.sock.sendall(str)
555 except socket.error, v:
555 except socket.error, v:
556 reraise = True
556 reraise = True
557 if v[0] == errno.EPIPE: # Broken pipe
557 if v[0] == errno.EPIPE: # Broken pipe
558 if self._HTTPConnection__state == httplib._CS_REQ_SENT:
558 if self._HTTPConnection__state == httplib._CS_REQ_SENT:
559 self._broken_pipe_resp = None
559 self._broken_pipe_resp = None
560 self._broken_pipe_resp = self.getresponse()
560 self._broken_pipe_resp = self.getresponse()
561 reraise = False
561 reraise = False
562 self.close()
562 self.close()
563 if reraise:
563 if reraise:
564 raise
564 raise
565
565
566 def wrapgetresponse(cls):
566 def wrapgetresponse(cls):
567 """Wraps getresponse in cls with a broken-pipe sane version.
567 """Wraps getresponse in cls with a broken-pipe sane version.
568 """
568 """
569 def safegetresponse(self):
569 def safegetresponse(self):
570 # In safesend() we might set the _broken_pipe_resp
570 # In safesend() we might set the _broken_pipe_resp
571 # attribute, in which case the socket has already
571 # attribute, in which case the socket has already
572 # been closed and we just need to give them the response
572 # been closed and we just need to give them the response
573 # back. Otherwise, we use the normal response path.
573 # back. Otherwise, we use the normal response path.
574 r = getattr(self, '_broken_pipe_resp', None)
574 r = getattr(self, '_broken_pipe_resp', None)
575 if r is not None:
575 if r is not None:
576 return r
576 return r
577 return cls.getresponse(self)
577 return cls.getresponse(self)
578 safegetresponse.__doc__ = cls.getresponse.__doc__
578 safegetresponse.__doc__ = cls.getresponse.__doc__
579 return safegetresponse
579 return safegetresponse
580
580
581 class HTTPConnection(httplib.HTTPConnection):
581 class HTTPConnection(httplib.HTTPConnection):
582 # use the modified response class
582 # use the modified response class
583 response_class = HTTPResponse
583 response_class = HTTPResponse
584 send = safesend
584 send = safesend
585 getresponse = wrapgetresponse(httplib.HTTPConnection)
585 getresponse = wrapgetresponse(httplib.HTTPConnection)
586
586
587
587
588 #########################################################################
588 #########################################################################
589 ##### TEST FUNCTIONS
589 ##### TEST FUNCTIONS
590 #########################################################################
590 #########################################################################
591
591
592 def error_handler(url):
592 def error_handler(url):
593 global HANDLE_ERRORS
593 global HANDLE_ERRORS
594 orig = HANDLE_ERRORS
594 orig = HANDLE_ERRORS
595 keepalive_handler = HTTPHandler()
595 keepalive_handler = HTTPHandler()
596 opener = urllib2.build_opener(keepalive_handler)
596 opener = urllib2.build_opener(keepalive_handler)
597 urllib2.install_opener(opener)
597 urllib2.install_opener(opener)
598 pos = {0: 'off', 1: 'on'}
598 pos = {0: 'off', 1: 'on'}
599 for i in (0, 1):
599 for i in (0, 1):
600 print " fancy error handling %s (HANDLE_ERRORS = %i)" % (pos[i], i)
600 print " fancy error handling %s (HANDLE_ERRORS = %i)" % (pos[i], i)
601 HANDLE_ERRORS = i
601 HANDLE_ERRORS = i
602 try:
602 try:
603 fo = urllib2.urlopen(url)
603 fo = urllib2.urlopen(url)
604 fo.read()
604 fo.read()
605 fo.close()
605 fo.close()
606 try:
606 try:
607 status, reason = fo.status, fo.reason
607 status, reason = fo.status, fo.reason
608 except AttributeError:
608 except AttributeError:
609 status, reason = None, None
609 status, reason = None, None
610 except IOError, e:
610 except IOError, e:
611 print " EXCEPTION: %s" % e
611 print " EXCEPTION: %s" % e
612 raise
612 raise
613 else:
613 else:
614 print " status = %s, reason = %s" % (status, reason)
614 print " status = %s, reason = %s" % (status, reason)
615 HANDLE_ERRORS = orig
615 HANDLE_ERRORS = orig
616 hosts = keepalive_handler.open_connections()
616 hosts = keepalive_handler.open_connections()
617 print "open connections:", hosts
617 print "open connections:", hosts
618 keepalive_handler.close_all()
618 keepalive_handler.close_all()
619
619
620 def md5(s):
620 def md5(s):
621 try:
621 try:
622 from hashlib import md5 as _md5
622 from hashlib import md5 as _md5
623 except ImportError:
623 except ImportError:
624 from md5 import md5 as _md5
624 from md5 import md5 as _md5
625 global md5
625 global md5
626 md5 = _md5
626 md5 = _md5
627 return _md5(s)
627 return _md5(s)
628
628
629 def continuity(url):
629 def continuity(url):
630 format = '%25s: %s'
630 format = '%25s: %s'
631
631
632 # first fetch the file with the normal http handler
632 # first fetch the file with the normal http handler
633 opener = urllib2.build_opener()
633 opener = urllib2.build_opener()
634 urllib2.install_opener(opener)
634 urllib2.install_opener(opener)
635 fo = urllib2.urlopen(url)
635 fo = urllib2.urlopen(url)
636 foo = fo.read()
636 foo = fo.read()
637 fo.close()
637 fo.close()
638 m = md5.new(foo)
638 m = md5(foo)
639 print format % ('normal urllib', m.hexdigest())
639 print format % ('normal urllib', m.hexdigest())
640
640
641 # now install the keepalive handler and try again
641 # now install the keepalive handler and try again
642 opener = urllib2.build_opener(HTTPHandler())
642 opener = urllib2.build_opener(HTTPHandler())
643 urllib2.install_opener(opener)
643 urllib2.install_opener(opener)
644
644
645 fo = urllib2.urlopen(url)
645 fo = urllib2.urlopen(url)
646 foo = fo.read()
646 foo = fo.read()
647 fo.close()
647 fo.close()
648 m = md5.new(foo)
648 m = md5(foo)
649 print format % ('keepalive read', m.hexdigest())
649 print format % ('keepalive read', m.hexdigest())
650
650
651 fo = urllib2.urlopen(url)
651 fo = urllib2.urlopen(url)
652 foo = ''
652 foo = ''
653 while True:
653 while True:
654 f = fo.readline()
654 f = fo.readline()
655 if f:
655 if f:
656 foo = foo + f
656 foo = foo + f
657 else: break
657 else: break
658 fo.close()
658 fo.close()
659 m = md5.new(foo)
659 m = md5(foo)
660 print format % ('keepalive readline', m.hexdigest())
660 print format % ('keepalive readline', m.hexdigest())
661
661
662 def comp(N, url):
662 def comp(N, url):
663 print ' making %i connections to:\n %s' % (N, url)
663 print ' making %i connections to:\n %s' % (N, url)
664
664
665 sys.stdout.write(' first using the normal urllib handlers')
665 sys.stdout.write(' first using the normal urllib handlers')
666 # first use normal opener
666 # first use normal opener
667 opener = urllib2.build_opener()
667 opener = urllib2.build_opener()
668 urllib2.install_opener(opener)
668 urllib2.install_opener(opener)
669 t1 = fetch(N, url)
669 t1 = fetch(N, url)
670 print ' TIME: %.3f s' % t1
670 print ' TIME: %.3f s' % t1
671
671
672 sys.stdout.write(' now using the keepalive handler ')
672 sys.stdout.write(' now using the keepalive handler ')
673 # now install the keepalive handler and try again
673 # now install the keepalive handler and try again
674 opener = urllib2.build_opener(HTTPHandler())
674 opener = urllib2.build_opener(HTTPHandler())
675 urllib2.install_opener(opener)
675 urllib2.install_opener(opener)
676 t2 = fetch(N, url)
676 t2 = fetch(N, url)
677 print ' TIME: %.3f s' % t2
677 print ' TIME: %.3f s' % t2
678 print ' improvement factor: %.2f' % (t1 / t2)
678 print ' improvement factor: %.2f' % (t1 / t2)
679
679
680 def fetch(N, url, delay=0):
680 def fetch(N, url, delay=0):
681 import time
681 import time
682 lens = []
682 lens = []
683 starttime = time.time()
683 starttime = time.time()
684 for i in range(N):
684 for i in range(N):
685 if delay and i > 0:
685 if delay and i > 0:
686 time.sleep(delay)
686 time.sleep(delay)
687 fo = urllib2.urlopen(url)
687 fo = urllib2.urlopen(url)
688 foo = fo.read()
688 foo = fo.read()
689 fo.close()
689 fo.close()
690 lens.append(len(foo))
690 lens.append(len(foo))
691 diff = time.time() - starttime
691 diff = time.time() - starttime
692
692
693 j = 0
693 j = 0
694 for i in lens[1:]:
694 for i in lens[1:]:
695 j = j + 1
695 j = j + 1
696 if not i == lens[0]:
696 if not i == lens[0]:
697 print "WARNING: inconsistent length on read %i: %i" % (j, i)
697 print "WARNING: inconsistent length on read %i: %i" % (j, i)
698
698
699 return diff
699 return diff
700
700
701 def test_timeout(url):
701 def test_timeout(url):
702 global DEBUG
702 global DEBUG
703 dbbackup = DEBUG
703 dbbackup = DEBUG
704 class FakeLogger(object):
704 class FakeLogger(object):
705 def debug(self, msg, *args):
705 def debug(self, msg, *args):
706 print msg % args
706 print msg % args
707 info = warning = error = debug
707 info = warning = error = debug
708 DEBUG = FakeLogger()
708 DEBUG = FakeLogger()
709 print " fetching the file to establish a connection"
709 print " fetching the file to establish a connection"
710 fo = urllib2.urlopen(url)
710 fo = urllib2.urlopen(url)
711 data1 = fo.read()
711 data1 = fo.read()
712 fo.close()
712 fo.close()
713
713
714 i = 20
714 i = 20
715 print " waiting %i seconds for the server to close the connection" % i
715 print " waiting %i seconds for the server to close the connection" % i
716 while i > 0:
716 while i > 0:
717 sys.stdout.write('\r %2i' % i)
717 sys.stdout.write('\r %2i' % i)
718 sys.stdout.flush()
718 sys.stdout.flush()
719 time.sleep(1)
719 time.sleep(1)
720 i -= 1
720 i -= 1
721 sys.stderr.write('\r')
721 sys.stderr.write('\r')
722
722
723 print " fetching the file a second time"
723 print " fetching the file a second time"
724 fo = urllib2.urlopen(url)
724 fo = urllib2.urlopen(url)
725 data2 = fo.read()
725 data2 = fo.read()
726 fo.close()
726 fo.close()
727
727
728 if data1 == data2:
728 if data1 == data2:
729 print ' data are identical'
729 print ' data are identical'
730 else:
730 else:
731 print ' ERROR: DATA DIFFER'
731 print ' ERROR: DATA DIFFER'
732
732
733 DEBUG = dbbackup
733 DEBUG = dbbackup
734
734
735
735
736 def test(url, N=10):
736 def test(url, N=10):
737 print "checking error handler (do this on a non-200)"
737 print "checking error handler (do this on a non-200)"
738 try: error_handler(url)
738 try: error_handler(url)
739 except IOError:
739 except IOError:
740 print "exiting - exception will prevent further tests"
740 print "exiting - exception will prevent further tests"
741 sys.exit()
741 sys.exit()
742 print
742 print
743 print "performing continuity test (making sure stuff isn't corrupted)"
743 print "performing continuity test (making sure stuff isn't corrupted)"
744 continuity(url)
744 continuity(url)
745 print
745 print
746 print "performing speed comparison"
746 print "performing speed comparison"
747 comp(N, url)
747 comp(N, url)
748 print
748 print
749 print "performing dropped-connection check"
749 print "performing dropped-connection check"
750 test_timeout(url)
750 test_timeout(url)
751
751
752 if __name__ == '__main__':
752 if __name__ == '__main__':
753 import time
753 import time
754 import sys
754 import sys
755 try:
755 try:
756 N = int(sys.argv[1])
756 N = int(sys.argv[1])
757 url = sys.argv[2]
757 url = sys.argv[2]
758 except (IndexError, ValueError):
758 except (IndexError, ValueError):
759 print "%s <integer> <url>" % sys.argv[0]
759 print "%s <integer> <url>" % sys.argv[0]
760 else:
760 else:
761 test(url, N)
761 test(url, N)
General Comments 0
You need to be logged in to leave comments. Login now