##// END OF EJS Templates
keepalive: switch from thread to threading module...
Pulkit Goyal -
r29456:e61d384e default
parent child Browse files
Show More
@@ -1,759 +1,759 b''
1 # This library is free software; you can redistribute it and/or
1 # This library is free software; you can redistribute it and/or
2 # modify it under the terms of the GNU Lesser General Public
2 # modify it under the terms of the GNU Lesser General Public
3 # License as published by the Free Software Foundation; either
3 # License as published by the Free Software Foundation; either
4 # version 2.1 of the License, or (at your option) any later version.
4 # version 2.1 of the License, or (at your option) any later version.
5 #
5 #
6 # This library is distributed in the hope that it will be useful,
6 # This library is distributed in the hope that it will be useful,
7 # but WITHOUT ANY WARRANTY; without even the implied warranty of
7 # but WITHOUT ANY WARRANTY; without even the implied warranty of
8 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
8 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
9 # Lesser General Public License for more details.
9 # Lesser General Public License for more details.
10 #
10 #
11 # You should have received a copy of the GNU Lesser General Public
11 # You should have received a copy of the GNU Lesser General Public
12 # License along with this library; if not, see
12 # License along with this library; if not, see
13 # <http://www.gnu.org/licenses/>.
13 # <http://www.gnu.org/licenses/>.
14
14
15 # This file is part of urlgrabber, a high-level cross-protocol url-grabber
15 # This file is part of urlgrabber, a high-level cross-protocol url-grabber
16 # Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
16 # Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
17
17
18 # Modified by Benoit Boissinot:
18 # Modified by Benoit Boissinot:
19 # - fix for digest auth (inspired from urllib2.py @ Python v2.4)
19 # - fix for digest auth (inspired from urllib2.py @ Python v2.4)
20 # Modified by Dirkjan Ochtman:
20 # Modified by Dirkjan Ochtman:
21 # - import md5 function from a local util module
21 # - import md5 function from a local util module
22 # Modified by Augie Fackler:
22 # Modified by Augie Fackler:
23 # - add safesend method and use it to prevent broken pipe errors
23 # - add safesend method and use it to prevent broken pipe errors
24 # on large POST requests
24 # on large POST requests
25
25
26 """An HTTP handler for urllib2 that supports HTTP 1.1 and keepalive.
26 """An HTTP handler for urllib2 that supports HTTP 1.1 and keepalive.
27
27
28 >>> import urllib2
28 >>> import urllib2
29 >>> from keepalive import HTTPHandler
29 >>> from keepalive import HTTPHandler
30 >>> keepalive_handler = HTTPHandler()
30 >>> keepalive_handler = HTTPHandler()
31 >>> opener = urlreq.buildopener(keepalive_handler)
31 >>> opener = urlreq.buildopener(keepalive_handler)
32 >>> urlreq.installopener(opener)
32 >>> urlreq.installopener(opener)
33 >>>
33 >>>
34 >>> fo = urlreq.urlopen('http://www.python.org')
34 >>> fo = urlreq.urlopen('http://www.python.org')
35
35
36 If a connection to a given host is requested, and all of the existing
36 If a connection to a given host is requested, and all of the existing
37 connections are still in use, another connection will be opened. If
37 connections are still in use, another connection will be opened. If
38 the handler tries to use an existing connection but it fails in some
38 the handler tries to use an existing connection but it fails in some
39 way, it will be closed and removed from the pool.
39 way, it will be closed and removed from the pool.
40
40
41 To remove the handler, simply re-run build_opener with no arguments, and
41 To remove the handler, simply re-run build_opener with no arguments, and
42 install that opener.
42 install that opener.
43
43
44 You can explicitly close connections by using the close_connection()
44 You can explicitly close connections by using the close_connection()
45 method of the returned file-like object (described below) or you can
45 method of the returned file-like object (described below) or you can
46 use the handler methods:
46 use the handler methods:
47
47
48 close_connection(host)
48 close_connection(host)
49 close_all()
49 close_all()
50 open_connections()
50 open_connections()
51
51
52 NOTE: using the close_connection and close_all methods of the handler
52 NOTE: using the close_connection and close_all methods of the handler
53 should be done with care when using multiple threads.
53 should be done with care when using multiple threads.
54 * there is nothing that prevents another thread from creating new
54 * there is nothing that prevents another thread from creating new
55 connections immediately after connections are closed
55 connections immediately after connections are closed
56 * no checks are done to prevent in-use connections from being closed
56 * no checks are done to prevent in-use connections from being closed
57
57
58 >>> keepalive_handler.close_all()
58 >>> keepalive_handler.close_all()
59
59
60 EXTRA ATTRIBUTES AND METHODS
60 EXTRA ATTRIBUTES AND METHODS
61
61
62 Upon a status of 200, the object returned has a few additional
62 Upon a status of 200, the object returned has a few additional
63 attributes and methods, which should not be used if you want to
63 attributes and methods, which should not be used if you want to
64 remain consistent with the normal urllib2-returned objects:
64 remain consistent with the normal urllib2-returned objects:
65
65
66 close_connection() - close the connection to the host
66 close_connection() - close the connection to the host
67 readlines() - you know, readlines()
67 readlines() - you know, readlines()
68 status - the return status (i.e. 404)
68 status - the return status (i.e. 404)
69 reason - english translation of status (i.e. 'File not found')
69 reason - english translation of status (i.e. 'File not found')
70
70
71 If you want the best of both worlds, use this inside an
71 If you want the best of both worlds, use this inside an
72 AttributeError-catching try:
72 AttributeError-catching try:
73
73
74 >>> try: status = fo.status
74 >>> try: status = fo.status
75 >>> except AttributeError: status = None
75 >>> except AttributeError: status = None
76
76
77 Unfortunately, these are ONLY there if status == 200, so it's not
77 Unfortunately, these are ONLY there if status == 200, so it's not
78 easy to distinguish between non-200 responses. The reason is that
78 easy to distinguish between non-200 responses. The reason is that
79 urllib2 tries to do clever things with error codes 301, 302, 401,
79 urllib2 tries to do clever things with error codes 301, 302, 401,
80 and 407, and it wraps the object upon return.
80 and 407, and it wraps the object upon return.
81
81
82 For python versions earlier than 2.4, you can avoid this fancy error
82 For python versions earlier than 2.4, you can avoid this fancy error
83 handling by setting the module-level global HANDLE_ERRORS to zero.
83 handling by setting the module-level global HANDLE_ERRORS to zero.
84 You see, prior to 2.4, it's the HTTP Handler's job to determine what
84 You see, prior to 2.4, it's the HTTP Handler's job to determine what
85 to handle specially, and what to just pass up. HANDLE_ERRORS == 0
85 to handle specially, and what to just pass up. HANDLE_ERRORS == 0
86 means "pass everything up". In python 2.4, however, this job no
86 means "pass everything up". In python 2.4, however, this job no
87 longer belongs to the HTTP Handler and is now done by a NEW handler,
87 longer belongs to the HTTP Handler and is now done by a NEW handler,
88 HTTPErrorProcessor. Here's the bottom line:
88 HTTPErrorProcessor. Here's the bottom line:
89
89
90 python version < 2.4
90 python version < 2.4
91 HANDLE_ERRORS == 1 (default) pass up 200, treat the rest as
91 HANDLE_ERRORS == 1 (default) pass up 200, treat the rest as
92 errors
92 errors
93 HANDLE_ERRORS == 0 pass everything up, error processing is
93 HANDLE_ERRORS == 0 pass everything up, error processing is
94 left to the calling code
94 left to the calling code
95 python version >= 2.4
95 python version >= 2.4
96 HANDLE_ERRORS == 1 pass up 200, treat the rest as errors
96 HANDLE_ERRORS == 1 pass up 200, treat the rest as errors
97 HANDLE_ERRORS == 0 (default) pass everything up, let the
97 HANDLE_ERRORS == 0 (default) pass everything up, let the
98 other handlers (specifically,
98 other handlers (specifically,
99 HTTPErrorProcessor) decide what to do
99 HTTPErrorProcessor) decide what to do
100
100
101 In practice, setting the variable either way makes little difference
101 In practice, setting the variable either way makes little difference
102 in python 2.4, so for the most consistent behavior across versions,
102 in python 2.4, so for the most consistent behavior across versions,
103 you probably just want to use the defaults, which will give you
103 you probably just want to use the defaults, which will give you
104 exceptions on errors.
104 exceptions on errors.
105
105
106 """
106 """
107
107
108 # $Id: keepalive.py,v 1.14 2006/04/04 21:00:32 mstenner Exp $
108 # $Id: keepalive.py,v 1.14 2006/04/04 21:00:32 mstenner Exp $
109
109
110 from __future__ import absolute_import, print_function
110 from __future__ import absolute_import, print_function
111
111
112 import errno
112 import errno
113 import hashlib
113 import hashlib
114 import socket
114 import socket
115 import sys
115 import sys
116 import thread
116 import threading
117
117
118 from . import (
118 from . import (
119 util,
119 util,
120 )
120 )
121
121
122 httplib = util.httplib
122 httplib = util.httplib
123 urlerr = util.urlerr
123 urlerr = util.urlerr
124 urlreq = util.urlreq
124 urlreq = util.urlreq
125
125
# Global debug hook: assign a logging.Logger-like object (with .info/.error)
# to enable diagnostic output from this module.
DEBUG = None

# Before Python 2.4 the HTTP handler itself had to turn non-200 responses
# into errors; from 2.4 on HTTPErrorProcessor does that job (see the module
# docstring), so the default flips accordingly.
HANDLE_ERRORS = 1 if sys.version_info < (2, 4) else 0
131
131
class ConnectionManager(object):
    """Track pooled keep-alive connections.

    Maintains three mutually consistent maps, all guarded by one lock:
      _hostmap  -- host -> list of connections to that host
      _connmap  -- connection -> host
      _readymap -- connection -> ready flag (truthy = free for reuse)
    """
    def __init__(self):
        self._lock = threading.Lock()
        self._hostmap = {}   # map hosts to a list of connections
        self._connmap = {}   # map connections to host
        self._readymap = {}  # map connection to ready state

    def add(self, host, connection, ready):
        """Register a new connection for host with the given ready state."""
        with self._lock:
            self._hostmap.setdefault(host, []).append(connection)
            self._connmap[connection] = host
            self._readymap[connection] = ready

    def remove(self, connection):
        """Forget a connection entirely; unknown connections are a no-op."""
        with self._lock:
            try:
                host = self._connmap[connection]
            except KeyError:
                pass
            else:
                del self._connmap[connection]
                del self._readymap[connection]
                self._hostmap[host].remove(connection)
                # drop the host entry once its last connection is gone
                if not self._hostmap[host]:
                    del self._hostmap[host]

    def set_ready(self, connection, ready):
        """Mark a connection's ready state.

        Note: plain dict assignment cannot raise KeyError, so the
        try/except KeyError guard the original carried here was dead code.
        """
        self._readymap[connection] = ready

    def get_ready_conn(self, host):
        """Return a free connection to host (marking it busy), else None."""
        with self._lock:
            for conn in self._hostmap.get(host, []):
                if self._readymap[conn]:
                    self._readymap[conn] = 0
                    return conn
        return None

    def get_all(self, host=None):
        """Return a copy of the connection list for host, or, with no
        host, a copy of the whole host->connections mapping."""
        if host:
            return list(self._hostmap.get(host, []))
        else:
            return dict(self._hostmap)
194
194
class KeepAliveHandler(object):
    """Core of the keep-alive urllib2 handlers.

    Owns a ConnectionManager and reuses pooled connections in do_open().
    Mixed with a urlreq handler class (see HTTPHandler below), which
    supplies self.parent.
    """
    def __init__(self):
        self._cm = ConnectionManager()

    #### Connection Management
    def open_connections(self):
        """return a list of connected hosts and the number of connections
        to each.  [('foo.com:80', 2), ('bar.org', 1)]"""
        return [(host, len(li)) for (host, li) in self._cm.get_all().items()]

    def close_connection(self, host):
        """close connection(s) to <host>
        host is the host:port spec, as in 'www.cnn.com:8080' as passed in.
        no error occurs if there is no connection to that host."""
        for h in self._cm.get_all(host):
            self._cm.remove(h)
            h.close()

    def close_all(self):
        """close all open connections"""
        # Use items() rather than the Python-2-only iteritems() for
        # consistency with open_connections() and Python 3 compatibility.
        # get_all() returns a copy, so removal during iteration is safe.
        for host, conns in self._cm.get_all().items():
            for h in conns:
                self._cm.remove(h)
                h.close()

    def _request_closed(self, request, host, connection):
        """tells us that this request is now closed and that the
        connection is ready for another request"""
        self._cm.set_ready(connection, 1)

    def _remove_connection(self, host, connection, close=0):
        """Drop a connection from the pool, optionally closing it first."""
        if close:
            connection.close()
        self._cm.remove(connection)

    #### Transaction Execution
    def http_open(self, req):
        """urllib2 entry point for plain HTTP requests."""
        return self.do_open(HTTPConnection, req)

    def do_open(self, http_class, req):
        """Perform req over a pooled connection, creating one if needed.

        Returns the (annotated) response object; raises urlerror on
        missing host or socket/HTTP-level failures.
        """
        host = req.get_host()
        if not host:
            raise urlerr.urlerror('no host given')

        try:
            h = self._cm.get_ready_conn(host)
            while h:
                r = self._reuse_connection(h, req, host)

                # if this response is non-None, then it worked and we're
                # done.  Break out, skipping the else block.
                if r:
                    break

                # connection is bad - possibly closed by server
                # discard it and ask for the next free connection
                h.close()
                self._cm.remove(h)
                h = self._cm.get_ready_conn(host)
            else:
                # while/else: we get here only when no (working) free
                # connection was found.  Create a new one.
                h = http_class(host)
                if DEBUG:
                    DEBUG.info("creating new connection to %s (%d)",
                               host, id(h))
                self._cm.add(host, h, 0)
                self._start_transaction(h, req)
                r = h.getresponse()
        except (socket.error, httplib.HTTPException) as err:
            raise urlerr.urlerror(err)

        # if not a persistent connection, don't try to reuse it
        if r.will_close:
            self._cm.remove(h)

        if DEBUG:
            DEBUG.info("STATUS: %s, %s", r.status, r.reason)
        # annotate the response so its close() can return the connection
        # to the pool (see HTTPResponse below)
        r._handler = self
        r._host = host
        r._url = req.get_full_url()
        r._connection = h
        r.code = r.status
        r.headers = r.msg
        r.msg = r.reason

        if r.status == 200 or not HANDLE_ERRORS:
            return r
        else:
            return self.parent.error('http', req, r,
                                     r.status, r.msg, r.headers)

    def _reuse_connection(self, h, req, host):
        """start the transaction with a re-used connection
        return a response object (r) upon success or None on failure.
        This DOES not close or remove bad connections in cases where
        it returns.  However, if an unexpected exception occurs, it
        will close and remove the connection before re-raising.
        """
        try:
            self._start_transaction(h, req)
            r = h.getresponse()
            # note: just because we got something back doesn't mean it
            # worked.  We'll check the version below, too.
        except (socket.error, httplib.HTTPException):
            r = None
        except: # re-raises
            # adding this block just in case we've missed
            # something we will still raise the exception, but
            # lets try and close the connection and remove it
            # first.  We previously got into a nasty loop
            # where an exception was uncaught, and so the
            # connection stayed open.  On the next try, the
            # same exception was raised, etc.  The trade-off is
            # that it's now possible this call will raise
            # a DIFFERENT exception
            if DEBUG:
                DEBUG.error("unexpected exception - closing "
                            "connection to %s (%d)", host, id(h))
            self._cm.remove(h)
            h.close()
            raise

        if r is None or r.version == 9:
            # httplib falls back to assuming HTTP 0.9 if it gets a
            # bad header back.  This is most likely to happen if
            # the socket has been closed by the server since we
            # last used the connection.
            if DEBUG:
                DEBUG.info("failed to re-use connection to %s (%d)",
                           host, id(h))
            r = None
        else:
            if DEBUG:
                DEBUG.info("re-using connection to %s (%d)", host, id(h))

        return r

    def _start_transaction(self, h, req):
        # What follows mostly reimplements HTTPConnection.request()
        # except it adds self.parent.addheaders in the mix.
        headers = req.headers.copy()
        if sys.version_info >= (2, 4):
            headers.update(req.unredirected_hdrs)
        headers.update(self.parent.addheaders)
        headers = dict((n.lower(), v) for n, v in headers.items())
        skipheaders = {}
        # suppress httplib's automatic Host/Accept-Encoding headers when
        # the request already carries them
        for n in ('host', 'accept-encoding'):
            if n in headers:
                skipheaders['skip_' + n.replace('-', '_')] = 1
        try:
            if req.has_data():
                data = req.get_data()
                h.putrequest('POST', req.get_selector(), **skipheaders)
                if 'content-type' not in headers:
                    h.putheader('Content-type',
                                'application/x-www-form-urlencoded')
                if 'content-length' not in headers:
                    h.putheader('Content-length', '%d' % len(data))
            else:
                h.putrequest('GET', req.get_selector(), **skipheaders)
        except socket.error as err:
            raise urlerr.urlerror(err)
        for k, v in headers.items():
            h.putheader(k, v)
        h.endheaders()
        if req.has_data():
            h.send(data)
362
362
class HTTPHandler(KeepAliveHandler, urlreq.httphandler):
    # Concrete keep-alive handler: KeepAliveHandler supplies the pooled
    # http_open()/do_open() machinery, urlreq.httphandler supplies the
    # urllib2 handler interface (including self.parent).
    pass
365
365
366 class HTTPResponse(httplib.HTTPResponse):
366 class HTTPResponse(httplib.HTTPResponse):
367 # we need to subclass HTTPResponse in order to
367 # we need to subclass HTTPResponse in order to
368 # 1) add readline() and readlines() methods
368 # 1) add readline() and readlines() methods
369 # 2) add close_connection() methods
369 # 2) add close_connection() methods
370 # 3) add info() and geturl() methods
370 # 3) add info() and geturl() methods
371
371
372 # in order to add readline(), read must be modified to deal with a
372 # in order to add readline(), read must be modified to deal with a
373 # buffer. example: readline must read a buffer and then spit back
373 # buffer. example: readline must read a buffer and then spit back
374 # one line at a time. The only real alternative is to read one
374 # one line at a time. The only real alternative is to read one
375 # BYTE at a time (ick). Once something has been read, it can't be
375 # BYTE at a time (ick). Once something has been read, it can't be
376 # put back (ok, maybe it can, but that's even uglier than this),
376 # put back (ok, maybe it can, but that's even uglier than this),
377 # so if you THEN do a normal read, you must first take stuff from
377 # so if you THEN do a normal read, you must first take stuff from
378 # the buffer.
378 # the buffer.
379
379
380 # the read method wraps the original to accommodate buffering,
380 # the read method wraps the original to accommodate buffering,
381 # although read() never adds to the buffer.
381 # although read() never adds to the buffer.
382 # Both readline and readlines have been stolen with almost no
382 # Both readline and readlines have been stolen with almost no
383 # modification from socket.py
383 # modification from socket.py
384
384
385
385
    def __init__(self, sock, debuglevel=0, strict=0, method=None):
        # Initialize the base response.  NOTE(review): the base call is
        # positional; in Python 2's httplib the third parameter of
        # HTTPResponse.__init__ was 'strict', so 'method' may land in
        # that slot there -- confirm against the httplib this targets.
        httplib.HTTPResponse.__init__(self, sock, debuglevel, method)
        self.fileno = sock.fileno    # expose the socket's fileno directly
        self.code = None             # filled in by the handler (see do_open)
        self._rbuf = ''              # read-ahead buffer used by readline()
        self._rbufsize = 8096        # raw read size when refilling _rbuf
        self._handler = None         # inserted by the handler later
        self._host = None            # (same)
        self._url = None             # (same)
        self._connection = None      # (same)

    # keep a handle on the base class's unbuffered read so the buffered
    # read()/readline() below can delegate to it
    _raw_read = httplib.HTTPResponse.read
398
398
399 def close(self):
399 def close(self):
400 if self.fp:
400 if self.fp:
401 self.fp.close()
401 self.fp.close()
402 self.fp = None
402 self.fp = None
403 if self._handler:
403 if self._handler:
404 self._handler._request_closed(self, self._host,
404 self._handler._request_closed(self, self._host,
405 self._connection)
405 self._connection)
406
406
    def close_connection(self):
        """Close the underlying connection for good (remove it from the
        pool rather than returning it), then close this response."""
        # order matters: drop the connection from the pool first so
        # close() below cannot hand it back for reuse
        self._handler._remove_connection(self._host, self._connection, close=1)
        self.close()
410
410
    def info(self):
        """Mimic urllib's addinfourl interface: return the headers."""
        return self.headers
413
413
    def geturl(self):
        """Mimic urllib's addinfourl interface: return the request URL."""
        return self._url
416
416
    def read(self, amt=None):
        """Read up to amt bytes (everything when amt is None), consuming
        any data left in the readline() buffer first."""
        # the _rbuf test is only in this first if for speed.  It's not
        # logically necessary
        if self._rbuf and not amt is None:
            L = len(self._rbuf)
            if amt > L:
                # buffer is not enough: keep it and shrink the amount
                # still needed from the raw stream below
                amt -= L
            else:
                # buffer alone satisfies the request
                s = self._rbuf[:amt]
                self._rbuf = self._rbuf[amt:]
                return s

        # drain the (possibly empty) buffer and top up from the raw read
        s = self._rbuf + self._raw_read(amt)
        self._rbuf = ''
        return s
432
432
    # stolen from Python SVN #68532 to fix issue1088
    def _read_chunked(self, amt):
        """Read from a chunked transfer-encoded body.

        Returns up to amt bytes (all remaining data when amt is None).
        self.chunk_left tracks how many bytes remain unread in the
        current chunk across calls.
        """
        chunk_left = self.chunk_left
        value = ''

        # XXX This accumulates chunks by repeated string concatenation,
        # which is not efficient as the number or size of chunks gets big.
        while True:
            if chunk_left is None:
                # at a chunk boundary: parse the "size[;extensions]" line
                line = self.fp.readline()
                i = line.find(';')
                if i >= 0:
                    line = line[:i] # strip chunk-extensions
                try:
                    chunk_left = int(line, 16)
                except ValueError:
                    # close the connection as protocol synchronization is
                    # probably lost
                    self.close()
                    raise httplib.IncompleteRead(value)
                if chunk_left == 0:
                    # a zero-length chunk terminates the body
                    break
            if amt is None:
                value += self._safe_read(chunk_left)
            elif amt < chunk_left:
                # caller wants less than this chunk holds; remember the
                # leftover for the next call and return early
                value += self._safe_read(amt)
                self.chunk_left = chunk_left - amt
                return value
            elif amt == chunk_left:
                value += self._safe_read(amt)
                self._safe_read(2)  # toss the CRLF at the end of the chunk
                self.chunk_left = None
                return value
            else:
                value += self._safe_read(chunk_left)
                amt -= chunk_left

            # we read the whole chunk, get another
            self._safe_read(2)      # toss the CRLF at the end of the chunk
            chunk_left = None

        # read and discard trailer up to the CRLF terminator
        ### note: we shouldn't have any trailers!
        while True:
            line = self.fp.readline()
            if not line:
                # a vanishingly small number of sites EOF without
                # sending the trailer
                break
            if line == '\r\n':
                break

        # we read everything; close the "file"
        self.close()

        return value
489
489
    def readline(self, limit=-1):
        """Read one line, up to limit bytes when 0 <= limit, refilling
        the internal read-ahead buffer self._rbuf as needed."""
        i = self._rbuf.find('\n')
        # pull raw data until a newline appears or the buffer already
        # holds at least 'limit' bytes
        while i < 0 and not (0 < limit <= len(self._rbuf)):
            new = self._raw_read(self._rbufsize)
            if not new:
                # EOF on the raw stream
                break
            i = new.find('\n')
            if i >= 0:
                # convert to a position within the combined buffer
                i = i + len(self._rbuf)
            self._rbuf = self._rbuf + new
        if i < 0:
            # no newline: return everything buffered
            i = len(self._rbuf)
        else:
            # include the newline itself
            i = i + 1
        if 0 <= limit < len(self._rbuf):
            # cap the returned slice at the requested limit
            i = limit
        data, self._rbuf = self._rbuf[:i], self._rbuf[i:]
        return data
508
508
def readlines(self, sizehint=0):
    """Read lines until EOF, or until `sizehint` bytes have been read.

    Returns the collected lines as a list.  A falsy `sizehint` means
    "read everything"; otherwise we stop as soon as the running total
    of bytes reaches the hint.
    """
    collected = []
    consumed = 0
    while True:
        line = self.readline()
        if not line:
            return collected
        collected.append(line)
        consumed += len(line)
        if sizehint and consumed >= sizehint:
            return collected
521
521
def safesend(self, str):
    """Send `str' to the server.

    Shamelessly ripped off from httplib to patch a bad behavior.
    """
    # _broken_pipe_resp is an attribute we set in this function
    # if the socket is closed while we're sending data but
    # the server sent us a response before hanging up.
    # In that case, we want to pretend to send the rest of the
    # outgoing data, and then let the user use getresponse()
    # (which we wrap) to get this last response before
    # opening a new socket.
    if getattr(self, '_broken_pipe_resp', None) is not None:
        return

    # Lazily (re)connect if allowed, mirroring httplib's behavior.
    if self.sock is None:
        if self.auto_open:
            self.connect()
        else:
            raise httplib.NotConnected

    # send the data to the server. if we get a broken pipe, then close
    # the socket. we want to reconnect when somebody tries to send again.
    #
    # NOTE: we DO propagate the error, though, because we cannot simply
    # ignore the error... the caller will know if they can retry.
    if self.debuglevel > 0:
        print("send:", repr(str))
    try:
        blocksize = 8192
        # `str` may be a file-like object; stream it out in blocks.
        read = getattr(str, 'read', None)
        if read is not None:
            if self.debuglevel > 0:
                print("sending a read()able")
            data = read(blocksize)
            while data:
                self.sock.sendall(data)
                data = read(blocksize)
        else:
            self.sock.sendall(str)
    except socket.error as v:
        # Python 2 socket.error supports indexing for the errno.
        reraise = True
        if v[0] == errno.EPIPE: # Broken pipe
            if self._HTTPConnection__state == httplib._CS_REQ_SENT:
                # The server hung up mid-send but may already have
                # queued a response; capture it so the wrapped
                # getresponse() can hand it back, and swallow EPIPE.
                self._broken_pipe_resp = None
                self._broken_pipe_resp = self.getresponse()
                reraise = False
            self.close()
        if reraise:
            raise
572
572
def wrapgetresponse(cls):
    """Wraps getresponse in cls with a broken-pipe sane version.
    """
    def safegetresponse(self):
        # safesend() may have stashed a response in _broken_pipe_resp
        # when the server hung up mid-send; in that case the socket is
        # already closed and we just hand that response back.  Otherwise
        # defer to the normal response path.
        cached = getattr(self, '_broken_pipe_resp', None)
        if cached is None:
            return cls.getresponse(self)
        return cached
    safegetresponse.__doc__ = cls.getresponse.__doc__
    return safegetresponse
587
587
class HTTPConnection(httplib.HTTPConnection):
    # use the modified response class
    response_class = HTTPResponse
    # send() that survives EPIPE when the server already responded
    send = safesend
    # getresponse() that returns the response captured by safesend()
    getresponse = wrapgetresponse(httplib.HTTPConnection)
593
593
594
594
595 #########################################################################
595 #########################################################################
596 ##### TEST FUNCTIONS
596 ##### TEST FUNCTIONS
597 #########################################################################
597 #########################################################################
598
598
def error_handler(url):
    """Fetch `url` with fancy error handling toggled off and on.

    Installs the keepalive handler globally, reports status/reason for
    each mode, then restores HANDLE_ERRORS and closes all connections.
    """
    global HANDLE_ERRORS
    saved = HANDLE_ERRORS
    handler = HTTPHandler()
    urlreq.installopener(urlreq.buildopener(handler))
    labels = {0: 'off', 1: 'on'}
    for mode in (0, 1):
        print(" fancy error handling %s (HANDLE_ERRORS = %i)" % (labels[mode], mode))
        HANDLE_ERRORS = mode
        try:
            fo = urlreq.urlopen(url)
            fo.read()
            fo.close()
            try:
                status, reason = fo.status, fo.reason
            except AttributeError:
                status, reason = None, None
        except IOError as e:
            print(" EXCEPTION: %s" % e)
            raise
        else:
            print(" status = %s, reason = %s" % (status, reason))
    HANDLE_ERRORS = saved
    print("open connections:", handler.open_connections())
    handler.close_all()
626
626
def continuity(url):
    """Verify the keepalive handler returns the same bytes as stock urllib.

    Prints an md5 digest for a plain fetch, a keepalive read(), and a
    keepalive readline() loop; all three should match.
    """
    md5 = hashlib.md5
    format = '%25s: %s'

    # first fetch the file with the normal http handler
    urlreq.installopener(urlreq.buildopener())
    fo = urlreq.urlopen(url)
    baseline = fo.read()
    fo.close()
    print(format % ('normal urllib', md5(baseline).hexdigest()))

    # now install the keepalive handler and try again
    urlreq.installopener(urlreq.buildopener(HTTPHandler()))

    fo = urlreq.urlopen(url)
    whole = fo.read()
    fo.close()
    print(format % ('keepalive read', md5(whole).hexdigest()))

    # same thing again, but line by line
    fo = urlreq.urlopen(url)
    pieces = []
    while True:
        line = fo.readline()
        if not line:
            break
        pieces.append(line)
    fo.close()
    print(format % ('keepalive readline', md5(''.join(pieces)).hexdigest()))
660
660
def comp(N, url):
    """Compare fetch times: stock urllib handlers vs the keepalive handler."""
    print(' making %i connections to:\n %s' % (N, url))

    sys.stdout.write(' first using the normal urllib handlers')
    # first use normal opener
    urlreq.installopener(urlreq.buildopener())
    plain_time = fetch(N, url)
    print(' TIME: %.3f s' % plain_time)

    sys.stdout.write(' now using the keepalive handler ')
    # now install the keepalive handler and try again
    urlreq.installopener(urlreq.buildopener(HTTPHandler()))
    keepalive_time = fetch(N, url)
    print(' TIME: %.3f s' % keepalive_time)
    print(' improvement factor: %.2f' % (plain_time / keepalive_time))
678
678
def fetch(N, url, delay=0):
    """Fetch `url` N times, sleeping `delay` seconds between fetches.

    Returns the elapsed wall-clock time; warns on stdout if any read
    returned a different length than the first one.
    """
    import time
    sizes = []
    started = time.time()
    for attempt in range(N):
        if delay and attempt > 0:
            time.sleep(delay)
        fo = urlreq.urlopen(url)
        body = fo.read()
        fo.close()
        sizes.append(len(body))
    elapsed = time.time() - started

    # All reads should have returned the same number of bytes.
    for idx, size in enumerate(sizes[1:], 1):
        if size != sizes[0]:
            print("WARNING: inconsistent length on read %i: %i" % (idx, size))

    return elapsed
699
699
def test_timeout(url):
    """Fetch `url`, wait for the server to drop the idle connection,
    fetch again, and verify the two payloads are identical.

    Temporarily replaces the module-level DEBUG logger with one that
    prints to stdout so the transparent reconnect is visible.
    """
    # BUG FIX: `time` was only bound at module scope under the
    # __main__ guard, so calling this function from an importing
    # module raised NameError on time.sleep(); import it locally.
    import time
    global DEBUG
    dbbackup = DEBUG
    class FakeLogger(object):
        # Minimal logger stand-in: every level routes to stdout.
        def debug(self, msg, *args):
            print(msg % args)
        info = warning = error = debug
    DEBUG = FakeLogger()
    print(" fetching the file to establish a connection")
    fo = urlreq.urlopen(url)
    data1 = fo.read()
    fo.close()

    i = 20
    print(" waiting %i seconds for the server to close the connection" % i)
    while i > 0:
        # Countdown on one line so the wait is visible but compact.
        sys.stdout.write('\r %2i' % i)
        sys.stdout.flush()
        time.sleep(1)
        i -= 1
    sys.stderr.write('\r')

    print(" fetching the file a second time")
    fo = urlreq.urlopen(url)
    data2 = fo.read()
    fo.close()

    if data1 == data2:
        print(' data are identical')
    else:
        print(' ERROR: DATA DIFFER')

    DEBUG = dbbackup
733
733
734
734
def test(url, N=10):
    """Run the full test battery (error handling, continuity, speed,
    dropped-connection) against `url` with N fetches for the timing."""
    print("checking error handler (do this on a non-200)")
    try:
        error_handler(url)
    except IOError:
        print("exiting - exception will prevent further tests")
        sys.exit()
    # Remaining phases all follow the same blank-line + banner pattern.
    phases = (
        ("performing continuity test (making sure stuff isn't corrupted)",
         lambda: continuity(url)),
        ("performing speed comparison",
         lambda: comp(N, url)),
        ("performing dropped-connection check",
         lambda: test_timeout(url)),
    )
    for banner, run in phases:
        print('')
        print(banner)
        run()
750
750
if __name__ == '__main__':
    # usage: keepalive.py <integer> <url>
    import time
    try:
        count, target = int(sys.argv[1]), sys.argv[2]
    except (IndexError, ValueError):
        # Missing arguments or a non-integer count: show usage.
        print("%s <integer> <url>" % sys.argv[0])
    else:
        test(target, count)
General Comments 0
You need to be logged in to leave comments. Login now