##// END OF EJS Templates
Merge
Bryan O'Sullivan -
r17701:f7b3518c merge default
parent child Browse files
Show More
@@ -1,764 +1,761 b''
1 # This library is free software; you can redistribute it and/or
1 # This library is free software; you can redistribute it and/or
2 # modify it under the terms of the GNU Lesser General Public
2 # modify it under the terms of the GNU Lesser General Public
3 # License as published by the Free Software Foundation; either
3 # License as published by the Free Software Foundation; either
4 # version 2.1 of the License, or (at your option) any later version.
4 # version 2.1 of the License, or (at your option) any later version.
5 #
5 #
6 # This library is distributed in the hope that it will be useful,
6 # This library is distributed in the hope that it will be useful,
7 # but WITHOUT ANY WARRANTY; without even the implied warranty of
7 # but WITHOUT ANY WARRANTY; without even the implied warranty of
8 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
8 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
9 # Lesser General Public License for more details.
9 # Lesser General Public License for more details.
10 #
10 #
11 # You should have received a copy of the GNU Lesser General Public
11 # You should have received a copy of the GNU Lesser General Public
12 # License along with this library; if not, see
12 # License along with this library; if not, see
13 # <http://www.gnu.org/licenses/>.
13 # <http://www.gnu.org/licenses/>.
14
14
15 # This file is part of urlgrabber, a high-level cross-protocol url-grabber
15 # This file is part of urlgrabber, a high-level cross-protocol url-grabber
16 # Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
16 # Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
17
17
18 # Modified by Benoit Boissinot:
18 # Modified by Benoit Boissinot:
19 # - fix for digest auth (inspired from urllib2.py @ Python v2.4)
19 # - fix for digest auth (inspired from urllib2.py @ Python v2.4)
20 # Modified by Dirkjan Ochtman:
20 # Modified by Dirkjan Ochtman:
21 # - import md5 function from a local util module
21 # - import md5 function from a local util module
22 # Modified by Martin Geisler:
22 # Modified by Martin Geisler:
23 # - moved md5 function from local util module to this module
23 # - moved md5 function from local util module to this module
24 # Modified by Augie Fackler:
24 # Modified by Augie Fackler:
25 # - add safesend method and use it to prevent broken pipe errors
25 # - add safesend method and use it to prevent broken pipe errors
26 # on large POST requests
26 # on large POST requests
27
27
28 """An HTTP handler for urllib2 that supports HTTP 1.1 and keepalive.
28 """An HTTP handler for urllib2 that supports HTTP 1.1 and keepalive.
29
29
30 >>> import urllib2
30 >>> import urllib2
31 >>> from keepalive import HTTPHandler
31 >>> from keepalive import HTTPHandler
32 >>> keepalive_handler = HTTPHandler()
32 >>> keepalive_handler = HTTPHandler()
33 >>> opener = urllib2.build_opener(keepalive_handler)
33 >>> opener = urllib2.build_opener(keepalive_handler)
34 >>> urllib2.install_opener(opener)
34 >>> urllib2.install_opener(opener)
35 >>>
35 >>>
36 >>> fo = urllib2.urlopen('http://www.python.org')
36 >>> fo = urllib2.urlopen('http://www.python.org')
37
37
38 If a connection to a given host is requested, and all of the existing
38 If a connection to a given host is requested, and all of the existing
39 connections are still in use, another connection will be opened. If
39 connections are still in use, another connection will be opened. If
40 the handler tries to use an existing connection but it fails in some
40 the handler tries to use an existing connection but it fails in some
41 way, it will be closed and removed from the pool.
41 way, it will be closed and removed from the pool.
42
42
43 To remove the handler, simply re-run build_opener with no arguments, and
43 To remove the handler, simply re-run build_opener with no arguments, and
44 install that opener.
44 install that opener.
45
45
46 You can explicitly close connections by using the close_connection()
46 You can explicitly close connections by using the close_connection()
47 method of the returned file-like object (described below) or you can
47 method of the returned file-like object (described below) or you can
48 use the handler methods:
48 use the handler methods:
49
49
50 close_connection(host)
50 close_connection(host)
51 close_all()
51 close_all()
52 open_connections()
52 open_connections()
53
53
54 NOTE: using the close_connection and close_all methods of the handler
54 NOTE: using the close_connection and close_all methods of the handler
55 should be done with care when using multiple threads.
55 should be done with care when using multiple threads.
56 * there is nothing that prevents another thread from creating new
56 * there is nothing that prevents another thread from creating new
57 connections immediately after connections are closed
57 connections immediately after connections are closed
58 * no checks are done to prevent in-use connections from being closed
58 * no checks are done to prevent in-use connections from being closed
59
59
60 >>> keepalive_handler.close_all()
60 >>> keepalive_handler.close_all()
61
61
62 EXTRA ATTRIBUTES AND METHODS
62 EXTRA ATTRIBUTES AND METHODS
63
63
64 Upon a status of 200, the object returned has a few additional
64 Upon a status of 200, the object returned has a few additional
65 attributes and methods, which should not be used if you want to
65 attributes and methods, which should not be used if you want to
66 remain consistent with the normal urllib2-returned objects:
66 remain consistent with the normal urllib2-returned objects:
67
67
68 close_connection() - close the connection to the host
68 close_connection() - close the connection to the host
69 readlines() - you know, readlines()
69 readlines() - you know, readlines()
70 status - the return status (i.e. 404)
70 status - the return status (i.e. 404)
71 reason - english translation of status (i.e. 'File not found')
71 reason - english translation of status (i.e. 'File not found')
72
72
73 If you want the best of both worlds, use this inside an
73 If you want the best of both worlds, use this inside an
74 AttributeError-catching try:
74 AttributeError-catching try:
75
75
76 >>> try: status = fo.status
76 >>> try: status = fo.status
77 >>> except AttributeError: status = None
77 >>> except AttributeError: status = None
78
78
79 Unfortunately, these are ONLY there if status == 200, so it's not
79 Unfortunately, these are ONLY there if status == 200, so it's not
80 easy to distinguish between non-200 responses. The reason is that
80 easy to distinguish between non-200 responses. The reason is that
81 urllib2 tries to do clever things with error codes 301, 302, 401,
81 urllib2 tries to do clever things with error codes 301, 302, 401,
82 and 407, and it wraps the object upon return.
82 and 407, and it wraps the object upon return.
83
83
84 For python versions earlier than 2.4, you can avoid this fancy error
84 For python versions earlier than 2.4, you can avoid this fancy error
85 handling by setting the module-level global HANDLE_ERRORS to zero.
85 handling by setting the module-level global HANDLE_ERRORS to zero.
86 You see, prior to 2.4, it's the HTTP Handler's job to determine what
86 You see, prior to 2.4, it's the HTTP Handler's job to determine what
87 to handle specially, and what to just pass up. HANDLE_ERRORS == 0
87 to handle specially, and what to just pass up. HANDLE_ERRORS == 0
88 means "pass everything up". In python 2.4, however, this job no
88 means "pass everything up". In python 2.4, however, this job no
89 longer belongs to the HTTP Handler and is now done by a NEW handler,
89 longer belongs to the HTTP Handler and is now done by a NEW handler,
90 HTTPErrorProcessor. Here's the bottom line:
90 HTTPErrorProcessor. Here's the bottom line:
91
91
92 python version < 2.4
92 python version < 2.4
93 HANDLE_ERRORS == 1 (default) pass up 200, treat the rest as
93 HANDLE_ERRORS == 1 (default) pass up 200, treat the rest as
94 errors
94 errors
95 HANDLE_ERRORS == 0 pass everything up, error processing is
95 HANDLE_ERRORS == 0 pass everything up, error processing is
96 left to the calling code
96 left to the calling code
97 python version >= 2.4
97 python version >= 2.4
98 HANDLE_ERRORS == 1 pass up 200, treat the rest as errors
98 HANDLE_ERRORS == 1 pass up 200, treat the rest as errors
99 HANDLE_ERRORS == 0 (default) pass everything up, let the
99 HANDLE_ERRORS == 0 (default) pass everything up, let the
100 other handlers (specifically,
100 other handlers (specifically,
101 HTTPErrorProcessor) decide what to do
101 HTTPErrorProcessor) decide what to do
102
102
103 In practice, setting the variable either way makes little difference
103 In practice, setting the variable either way makes little difference
104 in python 2.4, so for the most consistent behavior across versions,
104 in python 2.4, so for the most consistent behavior across versions,
105 you probably just want to use the defaults, which will give you
105 you probably just want to use the defaults, which will give you
106 exceptions on errors.
106 exceptions on errors.
107
107
108 """
108 """
109
109
110 # $Id: keepalive.py,v 1.14 2006/04/04 21:00:32 mstenner Exp $
110 # $Id: keepalive.py,v 1.14 2006/04/04 21:00:32 mstenner Exp $
111
111
112 import errno
112 import errno
113 import httplib
113 import httplib
114 import socket
114 import socket
115 import thread
115 import thread
116 import urllib2
116 import urllib2
117
117
# Module-level debug hook: assign a logging.Logger-like object (with
# .info()/.error() methods) to enable keepalive diagnostics.
DEBUG = None

import sys
# Before Python 2.4 the HTTP handler itself decides which status codes
# are errors (HANDLE_ERRORS = 1); from 2.4 on HTTPErrorProcessor does
# that job, so by default everything is passed up (see module docstring).
if sys.version_info < (2, 4):
    HANDLE_ERRORS = 1
else: HANDLE_ERRORS = 0
124
124
class ConnectionManager(object):
    """Thread-safe registry of keepalive connections.

    Tracks, per host, every open connection and whether each one is
    currently idle ("ready") and therefore available for reuse.
    """
    def __init__(self):
        self._lock = thread.allocate_lock()
        self._hostmap = {} # map hosts to a list of connections
        self._connmap = {} # map connections to host
        self._readymap = {} # map connection to ready state

    def add(self, host, connection, ready):
        """Register *connection* for *host* with the given ready state."""
        self._lock.acquire()
        try:
            if host not in self._hostmap:
                self._hostmap[host] = []
            self._hostmap[host].append(connection)
            self._connmap[connection] = host
            self._readymap[connection] = ready
        finally:
            self._lock.release()

    def remove(self, connection):
        """Forget *connection* entirely; a no-op if it is unknown."""
        self._lock.acquire()
        try:
            try:
                host = self._connmap[connection]
            except KeyError:
                # never registered (or already removed) - nothing to do
                pass
            else:
                del self._connmap[connection]
                del self._readymap[connection]
                self._hostmap[host].remove(connection)
                if not self._hostmap[host]:
                    # drop empty per-host lists so get_all() stays clean
                    del self._hostmap[host]
        finally:
            self._lock.release()

    def set_ready(self, connection, ready):
        """Mark *connection* idle (truthy) or busy (falsy).

        Deliberately unlocked: a single dict item assignment is atomic
        under the GIL and a momentarily stale flag is harmless because
        callers re-validate reused connections.  Note that the original
        code wrapped this in try/except KeyError, but an item
        *assignment* can never raise KeyError, so that handler was dead
        code and has been removed.
        """
        self._readymap[connection] = ready

    def get_ready_conn(self, host):
        """Return an idle connection to *host*, atomically marking it
        busy, or None when no idle connection exists."""
        conn = None
        self._lock.acquire()
        try:
            if host in self._hostmap:
                for c in self._hostmap[host]:
                    if self._readymap[c]:
                        # claim it before releasing the lock so no other
                        # thread can hand out the same connection
                        self._readymap[c] = 0
                        conn = c
                        break
        finally:
            self._lock.release()
        return conn

    def get_all(self, host=None):
        """Return a copy of the connection list for *host*, or, with no
        host, a shallow copy of the whole host -> connections map."""
        if host:
            return list(self._hostmap.get(host, []))
        else:
            return dict(self._hostmap)
187
187
class KeepAliveHandler(object):
    """urllib2-style handler that keeps HTTP connections alive and
    reuses them across requests via a shared ConnectionManager.

    Subclasses mix this with a urllib2 handler class (see HTTPHandler
    below) so that do_open() is driven by the normal urllib2 machinery;
    self.parent is then the OpenerDirector.
    """
    def __init__(self):
        self._cm = ConnectionManager()

    #### Connection Management
    def open_connections(self):
        """return a list of connected hosts and the number of connections
        to each.  [('foo.com:80', 2), ('bar.org', 1)]"""
        return [(host, len(li)) for (host, li) in self._cm.get_all().items()]

    def close_connection(self, host):
        """close connection(s) to <host>
        host is the host:port spec, as in 'www.cnn.com:8080' as passed in.
        no error occurs if there is no connection to that host."""
        for h in self._cm.get_all(host):
            self._cm.remove(h)
            h.close()

    def close_all(self):
        """close all open connections"""
        for host, conns in self._cm.get_all().iteritems():
            for h in conns:
                self._cm.remove(h)
                h.close()

    def _request_closed(self, request, host, connection):
        """tells us that this request is now closed and that the
        connection is ready for another request"""
        self._cm.set_ready(connection, 1)

    def _remove_connection(self, host, connection, close=0):
        # Drop a connection from the pool, optionally closing its socket
        # first (callers pass close=1 when the socket must not be reused).
        if close:
            connection.close()
        self._cm.remove(connection)

    #### Transaction Execution
    def http_open(self, req):
        # Entry point invoked by urllib2 for http:// URLs.
        # NOTE(review): HTTPConnection is expected to be defined elsewhere
        # in this module (not visible in this chunk) -- confirm.
        return self.do_open(HTTPConnection, req)

    def do_open(self, http_class, req):
        """Perform the request, preferring a pooled idle connection and
        falling back to a brand-new one; return the response object,
        possibly routed through urllib2's error machinery."""
        host = req.get_host()
        if not host:
            raise urllib2.URLError('no host given')

        try:
            # Try each idle pooled connection in turn.  The while/else
            # runs the else block only when NO reusable connection
            # produced a response (i.e. the loop ended without break).
            h = self._cm.get_ready_conn(host)
            while h:
                r = self._reuse_connection(h, req, host)

                # if this response is non-None, then it worked and we're
                # done.  Break out, skipping the else block.
                if r:
                    break

                # connection is bad - possibly closed by server
                # discard it and ask for the next free connection
                h.close()
                self._cm.remove(h)
                h = self._cm.get_ready_conn(host)
            else:
                # no (working) free connections were found.  Create a new one.
                h = http_class(host)
                if DEBUG:
                    DEBUG.info("creating new connection to %s (%d)",
                               host, id(h))
                # registered as busy (ready=0) until the request closes
                self._cm.add(host, h, 0)
                self._start_transaction(h, req)
                r = h.getresponse()
        except (socket.error, httplib.HTTPException), err:
            raise urllib2.URLError(err)

        # if not a persistent connection, don't try to reuse it
        if r.will_close:
            self._cm.remove(h)

        if DEBUG:
            DEBUG.info("STATUS: %s, %s", r.status, r.reason)
        # Decorate the response so its close()/close_connection() can
        # talk back to this handler, and mimic urllib2's addinfourl
        # attribute layout (code/headers/msg).
        r._handler = self
        r._host = host
        r._url = req.get_full_url()
        r._connection = h
        r.code = r.status
        r.headers = r.msg
        r.msg = r.reason

        if r.status == 200 or not HANDLE_ERRORS:
            return r
        else:
            # non-200: hand the response to the opener's error machinery
            return self.parent.error('http', req, r,
                                     r.status, r.msg, r.headers)

    def _reuse_connection(self, h, req, host):
        """start the transaction with a re-used connection
        return a response object (r) upon success or None on failure.
        This DOES not close or remove bad connections in cases where
        it returns.  However, if an unexpected exception occurs, it
        will close and remove the connection before re-raising.
        """
        try:
            self._start_transaction(h, req)
            r = h.getresponse()
            # note: just because we got something back doesn't mean it
            # worked.  We'll check the version below, too.
        except (socket.error, httplib.HTTPException):
            r = None
        except:   # re-raises
            # adding this block just in case we've missed
            # something we will still raise the exception, but
            # lets try and close the connection and remove it
            # first.  We previously got into a nasty loop
            # where an exception was uncaught, and so the
            # connection stayed open.  On the next try, the
            # same exception was raised, etc.  The trade-off is
            # that it's now possible this call will raise
            # a DIFFERENT exception
            if DEBUG:
                DEBUG.error("unexpected exception - closing "
                            "connection to %s (%d)", host, id(h))
            self._cm.remove(h)
            h.close()
            raise

        if r is None or r.version == 9:
            # httplib falls back to assuming HTTP 0.9 if it gets a
            # bad header back.  This is most likely to happen if
            # the socket has been closed by the server since we
            # last used the connection.
            if DEBUG:
                DEBUG.info("failed to re-use connection to %s (%d)",
                           host, id(h))
            r = None
        else:
            if DEBUG:
                DEBUG.info("re-using connection to %s (%d)", host, id(h))

        return r

    def _start_transaction(self, h, req):
        # What follows mostly reimplements HTTPConnection.request()
        # except it adds self.parent.addheaders in the mix.
        headers = req.headers.copy()
        if sys.version_info >= (2, 4):
            headers.update(req.unredirected_hdrs)
        headers.update(self.parent.addheaders)
        # normalize header names so the membership tests below work
        headers = dict((n.lower(), v) for n, v in headers.items())
        skipheaders = {}
        for n in ('host', 'accept-encoding'):
            if n in headers:
                # tell putrequest not to emit its own copy of these
                skipheaders['skip_' + n.replace('-', '_')] = 1
        try:
            if req.has_data():
                data = req.get_data()
                h.putrequest('POST', req.get_selector(), **skipheaders)
                if 'content-type' not in headers:
                    h.putheader('Content-type',
                                'application/x-www-form-urlencoded')
                if 'content-length' not in headers:
                    h.putheader('Content-length', '%d' % len(data))
            else:
                h.putrequest('GET', req.get_selector(), **skipheaders)
        except (socket.error), err:
            raise urllib2.URLError(err)
        for k, v in headers.items():
            h.putheader(k, v)
        h.endheaders()
        if req.has_data():
            h.send(data)
355
355
class HTTPHandler(KeepAliveHandler, urllib2.HTTPHandler):
    # Plain HTTP keepalive handler: all behavior comes from
    # KeepAliveHandler (http_open and connection pooling) plus
    # urllib2.HTTPHandler's scheme registration; no overrides needed.
    pass
358
358
359 class HTTPResponse(httplib.HTTPResponse):
359 class HTTPResponse(httplib.HTTPResponse):
360 # we need to subclass HTTPResponse in order to
360 # we need to subclass HTTPResponse in order to
361 # 1) add readline() and readlines() methods
361 # 1) add readline() and readlines() methods
362 # 2) add close_connection() methods
362 # 2) add close_connection() methods
363 # 3) add info() and geturl() methods
363 # 3) add info() and geturl() methods
364
364
365 # in order to add readline(), read must be modified to deal with a
365 # in order to add readline(), read must be modified to deal with a
366 # buffer. example: readline must read a buffer and then spit back
366 # buffer. example: readline must read a buffer and then spit back
367 # one line at a time. The only real alternative is to read one
367 # one line at a time. The only real alternative is to read one
368 # BYTE at a time (ick). Once something has been read, it can't be
368 # BYTE at a time (ick). Once something has been read, it can't be
369 # put back (ok, maybe it can, but that's even uglier than this),
369 # put back (ok, maybe it can, but that's even uglier than this),
370 # so if you THEN do a normal read, you must first take stuff from
370 # so if you THEN do a normal read, you must first take stuff from
371 # the buffer.
371 # the buffer.
372
372
373 # the read method wraps the original to accommodate buffering,
373 # the read method wraps the original to accommodate buffering,
374 # although read() never adds to the buffer.
374 # although read() never adds to the buffer.
375 # Both readline and readlines have been stolen with almost no
375 # Both readline and readlines have been stolen with almost no
376 # modification from socket.py
376 # modification from socket.py
377
377
378
378
    def __init__(self, sock, debuglevel=0, strict=0, method=None):
        """Buffered HTTPResponse carrying hooks back to the keepalive
        handler (which fills in _handler/_host/_url/_connection)."""
        # NOTE(review): arguments are passed positionally; against an
        # httplib whose signature is (sock, debuglevel, strict, method)
        # this binds *method* to the strict slot -- confirm intended.
        httplib.HTTPResponse.__init__(self, sock, debuglevel, method)
        self.fileno = sock.fileno   # expose the socket's fileno directly
        self.code = None            # status code, set by the handler
        self._rbuf = ''             # local buffer for readline()/read()
        self._rbufsize = 8096       # chunk size for buffered raw reads
        self._handler = None # inserted by the handler later
        self._host = None # (same)
        self._url = None # (same)
        self._connection = None # (same)
392
389
    # Keep a reference to the unbuffered base-class read; read() below
    # wraps it to honor the local buffer (_rbuf).
    _raw_read = httplib.HTTPResponse.read

    def close(self):
        """Close the response file object and, once closed, notify the
        handler that the underlying connection is free for reuse."""
        if self.fp:
            self.fp.close()
            self.fp = None
            if self._handler:
                self._handler._request_closed(self, self._host,
                                              self._connection)
402
399
    def close_connection(self):
        """Close the response AND tear down the underlying connection,
        removing it from the handler's pool instead of recycling it."""
        self._handler._remove_connection(self._host, self._connection, close=1)
        self.close()
406
403
    def info(self):
        """Return the response headers (urllib2 file-object interface);
        self.headers is set by the handler in do_open()."""
        return self.headers
409
406
    def geturl(self):
        """Return the full URL this response came from (set by the
        handler in do_open())."""
        return self._url
412
409
413 def read(self, amt=None):
410 def read(self, amt=None):
414 # the _rbuf test is only in this first if for speed. It's not
411 # the _rbuf test is only in this first if for speed. It's not
415 # logically necessary
412 # logically necessary
416 if self._rbuf and not amt is None:
413 if self._rbuf and not amt is None:
417 L = len(self._rbuf)
414 L = len(self._rbuf)
418 if amt > L:
415 if amt > L:
419 amt -= L
416 amt -= L
420 else:
417 else:
421 s = self._rbuf[:amt]
418 s = self._rbuf[:amt]
422 self._rbuf = self._rbuf[amt:]
419 self._rbuf = self._rbuf[amt:]
423 return s
420 return s
424
421
425 s = self._rbuf + self._raw_read(amt)
422 s = self._rbuf + self._raw_read(amt)
426 self._rbuf = ''
423 self._rbuf = ''
427 return s
424 return s
428
425
    # stolen from Python SVN #68532 to fix issue1088
    def _read_chunked(self, amt):
        """Read from a chunked-transfer-encoded body.

        Returns up to *amt* bytes (everything when amt is None),
        carrying partial-chunk state across calls in self.chunk_left.
        Raises httplib.IncompleteRead when a chunk-size line cannot be
        parsed (protocol synchronization lost).
        """
        chunk_left = self.chunk_left
        value = ''

        # XXX This accumulates chunks by repeated string concatenation,
        # which is not efficient as the number or size of chunks gets big.
        while True:
            if chunk_left is None:
                # start of a new chunk: parse its hex size line
                line = self.fp.readline()
                i = line.find(';')
                if i >= 0:
                    line = line[:i] # strip chunk-extensions
                try:
                    chunk_left = int(line, 16)
                except ValueError:
                    # close the connection as protocol synchronization is
                    # probably lost
                    self.close()
                    raise httplib.IncompleteRead(value)
                if chunk_left == 0:
                    # zero-size chunk terminates the body
                    break
            if amt is None:
                value += self._safe_read(chunk_left)
            elif amt < chunk_left:
                # caller is satisfied mid-chunk; remember the leftover
                value += self._safe_read(amt)
                self.chunk_left = chunk_left - amt
                return value
            elif amt == chunk_left:
                value += self._safe_read(amt)
                self._safe_read(2)  # toss the CRLF at the end of the chunk
                self.chunk_left = None
                return value
            else:
                value += self._safe_read(chunk_left)
                amt -= chunk_left

            # we read the whole chunk, get another
            self._safe_read(2)      # toss the CRLF at the end of the chunk
            chunk_left = None

        # read and discard trailer up to the CRLF terminator
        ### note: we shouldn't have any trailers!
        while True:
            line = self.fp.readline()
            if not line:
                # a vanishingly small number of sites EOF without
                # sending the trailer
                break
            if line == '\r\n':
                break

        # we read everything; close the "file"
        self.close()

        return value
485
482
    def readline(self, limit=-1):
        """Read one line, buffering raw reads in self._rbuf across
        calls (adapted with almost no modification from socket.py)."""
        i = self._rbuf.find('\n')
        # Pull chunks until the buffer holds a newline, at least *limit*
        # bytes (for a positive limit), or EOF is reached.
        while i < 0 and not (0 < limit <= len(self._rbuf)):
            new = self._raw_read(self._rbufsize)
            if not new:
                break  # EOF
            i = new.find('\n')
            if i >= 0:
                # make the newline index relative to the combined buffer
                i = i + len(self._rbuf)
            self._rbuf = self._rbuf + new
        if i < 0:
            i = len(self._rbuf)  # no newline: return all buffered data
        else:
            i = i + 1  # include the newline itself
        if 0 <= limit < len(self._rbuf):
            # a nonnegative limit smaller than the buffered data caps
            # the result at exactly limit bytes
            i = limit
        data, self._rbuf = self._rbuf[:i], self._rbuf[i:]
        return data
504
501
def readlines(self, sizehint=0):
    """Return all remaining lines, like file.readlines().

    sizehint: if nonzero, stop after the line that brings the total
    number of bytes read to at least sizehint.
    """
    # NOTE: accumulator renamed from `list`, which shadowed the builtin.
    lines = []
    total = 0
    while True:
        line = self.readline()
        if not line:
            break
        lines.append(line)
        total += len(line)
        if sizehint and total >= sizehint:
            break
    return lines
517
514
def safesend(self, str):
    """Send `str' to the server.

    Shamelessly ripped off from httplib to patch a bad behavior.

    `str' may be a byte string or any object with a read() method
    (it is sent in 8k chunks in that case). Note: the parameter name
    shadows the builtin; kept as-is for httplib compatibility.
    """
    # _broken_pipe_resp is an attribute we set in this function
    # if the socket is closed while we're sending data but
    # the server sent us a response before hanging up.
    # In that case, we want to pretend to send the rest of the
    # outgoing data, and then let the user use getresponse()
    # (which we wrap) to get this last response before
    # opening a new socket.
    if getattr(self, '_broken_pipe_resp', None) is not None:
        return

    if self.sock is None:
        if self.auto_open:
            self.connect()
        else:
            raise httplib.NotConnected

    # send the data to the server. if we get a broken pipe, then close
    # the socket. we want to reconnect when somebody tries to send again.
    #
    # NOTE: we DO propagate the error, though, because we cannot simply
    # ignore the error... the caller will know if they can retry.
    if self.debuglevel > 0:
        print "send:", repr(str)
    try:
        blocksize = 8192
        # duck-typing: a read() method means a file-like payload that
        # must be streamed rather than sent in one sendall() call
        read = getattr(str, 'read', None)
        if read is not None:
            if self.debuglevel > 0:
                print "sending a read()able"
            data = read(blocksize)
            while data:
                self.sock.sendall(data)
                data = read(blocksize)
        else:
            self.sock.sendall(str)
    except socket.error, v:
        reraise = True
        if v[0] == errno.EPIPE: # Broken pipe
            # The server hung up mid-send. If the request was already
            # fully sent, it may have replied before closing; capture
            # that response now so the wrapped getresponse() can
            # return it instead of raising.
            if self._HTTPConnection__state == httplib._CS_REQ_SENT:
                # pre-create the attribute so it exists even if
                # getresponse() below fails partway through
                self._broken_pipe_resp = None
                self._broken_pipe_resp = self.getresponse()
                reraise = False
            self.close()
        if reraise:
            raise
568
565
def wrapgetresponse(cls):
    """Wraps getresponse in cls with a broken-pipe sane version.
    """
    def safegetresponse(self):
        # safesend() may have stashed a response in _broken_pipe_resp
        # when the server hung up mid-send; in that case the socket is
        # already closed, so hand the stashed response back instead of
        # going through the normal response path.
        stashed = getattr(self, '_broken_pipe_resp', None)
        if stashed is None:
            return cls.getresponse(self)
        return stashed
    safegetresponse.__doc__ = cls.getresponse.__doc__
    return safegetresponse
583
580
class HTTPConnection(httplib.HTTPConnection):
    """httplib.HTTPConnection with keepalive-friendly patches applied."""
    # use the modified response class
    response_class = HTTPResponse
    # broken-pipe-tolerant replacement for httplib's send()
    send = safesend
    # getresponse() that knows about responses stashed by safesend()
    getresponse = wrapgetresponse(httplib.HTTPConnection)
589
586
590
587
591 #########################################################################
588 #########################################################################
592 ##### TEST FUNCTIONS
589 ##### TEST FUNCTIONS
593 #########################################################################
590 #########################################################################
594
591
def error_handler(url):
    """Fetch url twice, once with HANDLE_ERRORS off and once on.

    Prints the status/reason observed in each mode, then the
    connections the keepalive handler still holds open, and finally
    closes them all. Restores the previous HANDLE_ERRORS value.
    """
    global HANDLE_ERRORS
    orig = HANDLE_ERRORS
    keepalive_handler = HTTPHandler()
    opener = urllib2.build_opener(keepalive_handler)
    urllib2.install_opener(opener)
    pos = {0: 'off', 1: 'on'}
    for i in (0, 1):
        print " fancy error handling %s (HANDLE_ERRORS = %i)" % (pos[i], i)
        HANDLE_ERRORS = i
        try:
            fo = urllib2.urlopen(url)
            fo.read()
            fo.close()
            try:
                status, reason = fo.status, fo.reason
            except AttributeError:
                # the returned object may not expose status/reason,
                # depending on the error-handling mode
                status, reason = None, None
        except IOError, e:
            print " EXCEPTION: %s" % e
            raise
        else:
            print " status = %s, reason = %s" % (status, reason)
    HANDLE_ERRORS = orig
    hosts = keepalive_handler.open_connections()
    print "open connections:", hosts
    keepalive_handler.close_all()
622
619
def md5(s):
    """Return an MD5 hash object for s.

    Lazily picks the best available implementation (hashlib on
    modern Pythons, the old md5 module otherwise) and then rebinds
    the module-level name `md5` to it, so this shim only runs once.
    """
    try:
        from hashlib import md5 as _impl
    except ImportError:
        # pre-hashlib Pythons
        from md5 import md5 as _impl
    # replace ourselves with the real constructor for later callers
    global md5
    md5 = _impl
    return _impl(s)
631
628
632 def continuity(url):
629 def continuity(url):
633 format = '%25s: %s'
630 format = '%25s: %s'
634
631
635 # first fetch the file with the normal http handler
632 # first fetch the file with the normal http handler
636 opener = urllib2.build_opener()
633 opener = urllib2.build_opener()
637 urllib2.install_opener(opener)
634 urllib2.install_opener(opener)
638 fo = urllib2.urlopen(url)
635 fo = urllib2.urlopen(url)
639 foo = fo.read()
636 foo = fo.read()
640 fo.close()
637 fo.close()
641 m = md5.new(foo)
638 m = md5.new(foo)
642 print format % ('normal urllib', m.hexdigest())
639 print format % ('normal urllib', m.hexdigest())
643
640
644 # now install the keepalive handler and try again
641 # now install the keepalive handler and try again
645 opener = urllib2.build_opener(HTTPHandler())
642 opener = urllib2.build_opener(HTTPHandler())
646 urllib2.install_opener(opener)
643 urllib2.install_opener(opener)
647
644
648 fo = urllib2.urlopen(url)
645 fo = urllib2.urlopen(url)
649 foo = fo.read()
646 foo = fo.read()
650 fo.close()
647 fo.close()
651 m = md5.new(foo)
648 m = md5.new(foo)
652 print format % ('keepalive read', m.hexdigest())
649 print format % ('keepalive read', m.hexdigest())
653
650
654 fo = urllib2.urlopen(url)
651 fo = urllib2.urlopen(url)
655 foo = ''
652 foo = ''
656 while True:
653 while True:
657 f = fo.readline()
654 f = fo.readline()
658 if f:
655 if f:
659 foo = foo + f
656 foo = foo + f
660 else: break
657 else: break
661 fo.close()
658 fo.close()
662 m = md5.new(foo)
659 m = md5.new(foo)
663 print format % ('keepalive readline', m.hexdigest())
660 print format % ('keepalive readline', m.hexdigest())
664
661
def comp(N, url):
    """Time N fetches of url with and without the keepalive handler.

    Prints both timings and the resulting speedup factor.
    """
    print '  making %i connections to:\n  %s' % (N, url)

    sys.stdout.write('  first using the normal urllib handlers')
    # first use normal opener
    opener = urllib2.build_opener()
    urllib2.install_opener(opener)
    t1 = fetch(N, url)
    print '  TIME: %.3f s' % t1

    sys.stdout.write('  now using the keepalive handler ')
    # now install the keepalive handler and try again
    opener = urllib2.build_opener(HTTPHandler())
    urllib2.install_opener(opener)
    t2 = fetch(N, url)
    print '  TIME: %.3f s' % t2
    print '  improvement factor: %.2f' % (t1 / t2)
682
679
683 def fetch(N, url, delay=0):
680 def fetch(N, url, delay=0):
684 import time
681 import time
685 lens = []
682 lens = []
686 starttime = time.time()
683 starttime = time.time()
687 for i in range(N):
684 for i in range(N):
688 if delay and i > 0:
685 if delay and i > 0:
689 time.sleep(delay)
686 time.sleep(delay)
690 fo = urllib2.urlopen(url)
687 fo = urllib2.urlopen(url)
691 foo = fo.read()
688 foo = fo.read()
692 fo.close()
689 fo.close()
693 lens.append(len(foo))
690 lens.append(len(foo))
694 diff = time.time() - starttime
691 diff = time.time() - starttime
695
692
696 j = 0
693 j = 0
697 for i in lens[1:]:
694 for i in lens[1:]:
698 j = j + 1
695 j = j + 1
699 if not i == lens[0]:
696 if not i == lens[0]:
700 print "WARNING: inconsistent length on read %i: %i" % (j, i)
697 print "WARNING: inconsistent length on read %i: %i" % (j, i)
701
698
702 return diff
699 return diff
703
700
def test_timeout(url):
    """Check that a server-side idle timeout is handled transparently.

    Fetches url to open a keepalive connection, waits 20 seconds so
    the server can close it, fetches again, and compares the two
    payloads. Temporarily swaps the module DEBUG logger for a
    print-based one so the keepalive debug output is visible.
    """
    global DEBUG
    dbbackup = DEBUG
    class FakeLogger(object):
        # minimal logger clone: every level just prints
        def debug(self, msg, *args):
            print msg % args
        info = warning = error = debug
    DEBUG = FakeLogger()
    print "  fetching the file to establish a connection"
    fo = urllib2.urlopen(url)
    data1 = fo.read()
    fo.close()

    i = 20
    print "  waiting %i seconds for the server to close the connection" % i
    while i > 0:
        # live countdown on one terminal line
        sys.stdout.write('\r  %2i' % i)
        sys.stdout.flush()
        time.sleep(1)
        i -= 1
    sys.stderr.write('\r')

    print "  fetching the file a second time"
    fo = urllib2.urlopen(url)
    data2 = fo.read()
    fo.close()

    if data1 == data2:
        print '  data are identical'
    else:
        print '  ERROR: DATA DIFFER'

    DEBUG = dbbackup
737
734
738
735
def test(url, N=10):
    """Run the full keepalive self-test suite against url.

    N: number of fetches used by the speed comparison. Exits early if
    the error-handler check raises IOError, since nothing after it
    could succeed either.
    """
    print "checking error handler (do this on a non-200)"
    try: error_handler(url)
    except IOError:
        print "exiting - exception will prevent further tests"
        sys.exit()
    print
    print "performing continuity test (making sure stuff isn't corrupted)"
    continuity(url)
    print
    print "performing speed comparison"
    comp(N, url)
    print
    print "performing dropped-connection check"
    test_timeout(url)
754
751
if __name__ == '__main__':
    # Command-line driver: keepalive.py <integer> <url>
    import time
    import sys
    try:
        N = int(sys.argv[1])
        url = sys.argv[2]
    except (IndexError, ValueError):
        # wrong or missing arguments: print usage instead of running
        print "%s <integer> <url>" % sys.argv[0]
    else:
        test(url, N)
General Comments 0
You need to be logged in to leave comments. Login now