##// END OF EJS Templates
keepalive: reorder header precedence...
Gregory Szorc -
r30463:bc0def54 default
parent child Browse files
Show More
@@ -1,759 +1,758 b''
1 # This library is free software; you can redistribute it and/or
1 # This library is free software; you can redistribute it and/or
2 # modify it under the terms of the GNU Lesser General Public
2 # modify it under the terms of the GNU Lesser General Public
3 # License as published by the Free Software Foundation; either
3 # License as published by the Free Software Foundation; either
4 # version 2.1 of the License, or (at your option) any later version.
4 # version 2.1 of the License, or (at your option) any later version.
5 #
5 #
6 # This library is distributed in the hope that it will be useful,
6 # This library is distributed in the hope that it will be useful,
7 # but WITHOUT ANY WARRANTY; without even the implied warranty of
7 # but WITHOUT ANY WARRANTY; without even the implied warranty of
8 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
8 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
9 # Lesser General Public License for more details.
9 # Lesser General Public License for more details.
10 #
10 #
11 # You should have received a copy of the GNU Lesser General Public
11 # You should have received a copy of the GNU Lesser General Public
12 # License along with this library; if not, see
12 # License along with this library; if not, see
13 # <http://www.gnu.org/licenses/>.
13 # <http://www.gnu.org/licenses/>.
14
14
15 # This file is part of urlgrabber, a high-level cross-protocol url-grabber
15 # This file is part of urlgrabber, a high-level cross-protocol url-grabber
16 # Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
16 # Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
17
17
18 # Modified by Benoit Boissinot:
18 # Modified by Benoit Boissinot:
19 # - fix for digest auth (inspired from urllib2.py @ Python v2.4)
19 # - fix for digest auth (inspired from urllib2.py @ Python v2.4)
20 # Modified by Dirkjan Ochtman:
20 # Modified by Dirkjan Ochtman:
21 # - import md5 function from a local util module
21 # - import md5 function from a local util module
22 # Modified by Augie Fackler:
22 # Modified by Augie Fackler:
23 # - add safesend method and use it to prevent broken pipe errors
23 # - add safesend method and use it to prevent broken pipe errors
24 # on large POST requests
24 # on large POST requests
25
25
26 """An HTTP handler for urllib2 that supports HTTP 1.1 and keepalive.
26 """An HTTP handler for urllib2 that supports HTTP 1.1 and keepalive.
27
27
28 >>> import urllib2
28 >>> import urllib2
29 >>> from keepalive import HTTPHandler
29 >>> from keepalive import HTTPHandler
30 >>> keepalive_handler = HTTPHandler()
30 >>> keepalive_handler = HTTPHandler()
31 >>> opener = urlreq.buildopener(keepalive_handler)
31 >>> opener = urlreq.buildopener(keepalive_handler)
32 >>> urlreq.installopener(opener)
32 >>> urlreq.installopener(opener)
33 >>>
33 >>>
34 >>> fo = urlreq.urlopen('http://www.python.org')
34 >>> fo = urlreq.urlopen('http://www.python.org')
35
35
36 If a connection to a given host is requested, and all of the existing
36 If a connection to a given host is requested, and all of the existing
37 connections are still in use, another connection will be opened. If
37 connections are still in use, another connection will be opened. If
38 the handler tries to use an existing connection but it fails in some
38 the handler tries to use an existing connection but it fails in some
39 way, it will be closed and removed from the pool.
39 way, it will be closed and removed from the pool.
40
40
41 To remove the handler, simply re-run build_opener with no arguments, and
41 To remove the handler, simply re-run build_opener with no arguments, and
42 install that opener.
42 install that opener.
43
43
44 You can explicitly close connections by using the close_connection()
44 You can explicitly close connections by using the close_connection()
45 method of the returned file-like object (described below) or you can
45 method of the returned file-like object (described below) or you can
46 use the handler methods:
46 use the handler methods:
47
47
48 close_connection(host)
48 close_connection(host)
49 close_all()
49 close_all()
50 open_connections()
50 open_connections()
51
51
52 NOTE: using the close_connection and close_all methods of the handler
52 NOTE: using the close_connection and close_all methods of the handler
53 should be done with care when using multiple threads.
53 should be done with care when using multiple threads.
54 * there is nothing that prevents another thread from creating new
54 * there is nothing that prevents another thread from creating new
55 connections immediately after connections are closed
55 connections immediately after connections are closed
56 * no checks are done to prevent in-use connections from being closed
56 * no checks are done to prevent in-use connections from being closed
57
57
58 >>> keepalive_handler.close_all()
58 >>> keepalive_handler.close_all()
59
59
60 EXTRA ATTRIBUTES AND METHODS
60 EXTRA ATTRIBUTES AND METHODS
61
61
62 Upon a status of 200, the object returned has a few additional
62 Upon a status of 200, the object returned has a few additional
63 attributes and methods, which should not be used if you want to
63 attributes and methods, which should not be used if you want to
64 remain consistent with the normal urllib2-returned objects:
64 remain consistent with the normal urllib2-returned objects:
65
65
66 close_connection() - close the connection to the host
66 close_connection() - close the connection to the host
67 readlines() - you know, readlines()
67 readlines() - you know, readlines()
68 status - the return status (i.e. 404)
68 status - the return status (i.e. 404)
69 reason - english translation of status (i.e. 'File not found')
69 reason - english translation of status (i.e. 'File not found')
70
70
71 If you want the best of both worlds, use this inside an
71 If you want the best of both worlds, use this inside an
72 AttributeError-catching try:
72 AttributeError-catching try:
73
73
74 >>> try: status = fo.status
74 >>> try: status = fo.status
75 >>> except AttributeError: status = None
75 >>> except AttributeError: status = None
76
76
77 Unfortunately, these are ONLY there if status == 200, so it's not
77 Unfortunately, these are ONLY there if status == 200, so it's not
78 easy to distinguish between non-200 responses. The reason is that
78 easy to distinguish between non-200 responses. The reason is that
79 urllib2 tries to do clever things with error codes 301, 302, 401,
79 urllib2 tries to do clever things with error codes 301, 302, 401,
80 and 407, and it wraps the object upon return.
80 and 407, and it wraps the object upon return.
81
81
82 For python versions earlier than 2.4, you can avoid this fancy error
82 For python versions earlier than 2.4, you can avoid this fancy error
83 handling by setting the module-level global HANDLE_ERRORS to zero.
83 handling by setting the module-level global HANDLE_ERRORS to zero.
84 You see, prior to 2.4, it's the HTTP Handler's job to determine what
84 You see, prior to 2.4, it's the HTTP Handler's job to determine what
85 to handle specially, and what to just pass up. HANDLE_ERRORS == 0
85 to handle specially, and what to just pass up. HANDLE_ERRORS == 0
86 means "pass everything up". In python 2.4, however, this job no
86 means "pass everything up". In python 2.4, however, this job no
87 longer belongs to the HTTP Handler and is now done by a NEW handler,
87 longer belongs to the HTTP Handler and is now done by a NEW handler,
88 HTTPErrorProcessor. Here's the bottom line:
88 HTTPErrorProcessor. Here's the bottom line:
89
89
90 python version < 2.4
90 python version < 2.4
91 HANDLE_ERRORS == 1 (default) pass up 200, treat the rest as
91 HANDLE_ERRORS == 1 (default) pass up 200, treat the rest as
92 errors
92 errors
93 HANDLE_ERRORS == 0 pass everything up, error processing is
93 HANDLE_ERRORS == 0 pass everything up, error processing is
94 left to the calling code
94 left to the calling code
95 python version >= 2.4
95 python version >= 2.4
96 HANDLE_ERRORS == 1 pass up 200, treat the rest as errors
96 HANDLE_ERRORS == 1 pass up 200, treat the rest as errors
97 HANDLE_ERRORS == 0 (default) pass everything up, let the
97 HANDLE_ERRORS == 0 (default) pass everything up, let the
98 other handlers (specifically,
98 other handlers (specifically,
99 HTTPErrorProcessor) decide what to do
99 HTTPErrorProcessor) decide what to do
100
100
101 In practice, setting the variable either way makes little difference
101 In practice, setting the variable either way makes little difference
102 in python 2.4, so for the most consistent behavior across versions,
102 in python 2.4, so for the most consistent behavior across versions,
103 you probably just want to use the defaults, which will give you
103 you probably just want to use the defaults, which will give you
104 exceptions on errors.
104 exceptions on errors.
105
105
106 """
106 """
107
107
108 # $Id: keepalive.py,v 1.14 2006/04/04 21:00:32 mstenner Exp $
108 # $Id: keepalive.py,v 1.14 2006/04/04 21:00:32 mstenner Exp $
109
109
110 from __future__ import absolute_import, print_function
110 from __future__ import absolute_import, print_function
111
111
112 import errno
112 import errno
113 import hashlib
113 import hashlib
114 import socket
114 import socket
115 import sys
115 import sys
116 import threading
116 import threading
117
117
118 from . import (
118 from . import (
119 util,
119 util,
120 )
120 )
121
121
122 httplib = util.httplib
122 httplib = util.httplib
123 urlerr = util.urlerr
123 urlerr = util.urlerr
124 urlreq = util.urlreq
124 urlreq = util.urlreq
125
125
126 DEBUG = None
126 DEBUG = None
127
127
128 if sys.version_info < (2, 4):
128 if sys.version_info < (2, 4):
129 HANDLE_ERRORS = 1
129 HANDLE_ERRORS = 1
130 else: HANDLE_ERRORS = 0
130 else: HANDLE_ERRORS = 0
131
131
132 class ConnectionManager(object):
132 class ConnectionManager(object):
133 """
133 """
134 The connection manager must be able to:
134 The connection manager must be able to:
135 * keep track of all existing
135 * keep track of all existing
136 """
136 """
137 def __init__(self):
137 def __init__(self):
138 self._lock = threading.Lock()
138 self._lock = threading.Lock()
139 self._hostmap = {} # map hosts to a list of connections
139 self._hostmap = {} # map hosts to a list of connections
140 self._connmap = {} # map connections to host
140 self._connmap = {} # map connections to host
141 self._readymap = {} # map connection to ready state
141 self._readymap = {} # map connection to ready state
142
142
143 def add(self, host, connection, ready):
143 def add(self, host, connection, ready):
144 self._lock.acquire()
144 self._lock.acquire()
145 try:
145 try:
146 if host not in self._hostmap:
146 if host not in self._hostmap:
147 self._hostmap[host] = []
147 self._hostmap[host] = []
148 self._hostmap[host].append(connection)
148 self._hostmap[host].append(connection)
149 self._connmap[connection] = host
149 self._connmap[connection] = host
150 self._readymap[connection] = ready
150 self._readymap[connection] = ready
151 finally:
151 finally:
152 self._lock.release()
152 self._lock.release()
153
153
154 def remove(self, connection):
154 def remove(self, connection):
155 self._lock.acquire()
155 self._lock.acquire()
156 try:
156 try:
157 try:
157 try:
158 host = self._connmap[connection]
158 host = self._connmap[connection]
159 except KeyError:
159 except KeyError:
160 pass
160 pass
161 else:
161 else:
162 del self._connmap[connection]
162 del self._connmap[connection]
163 del self._readymap[connection]
163 del self._readymap[connection]
164 self._hostmap[host].remove(connection)
164 self._hostmap[host].remove(connection)
165 if not self._hostmap[host]: del self._hostmap[host]
165 if not self._hostmap[host]: del self._hostmap[host]
166 finally:
166 finally:
167 self._lock.release()
167 self._lock.release()
168
168
169 def set_ready(self, connection, ready):
169 def set_ready(self, connection, ready):
170 try:
170 try:
171 self._readymap[connection] = ready
171 self._readymap[connection] = ready
172 except KeyError:
172 except KeyError:
173 pass
173 pass
174
174
175 def get_ready_conn(self, host):
175 def get_ready_conn(self, host):
176 conn = None
176 conn = None
177 self._lock.acquire()
177 self._lock.acquire()
178 try:
178 try:
179 if host in self._hostmap:
179 if host in self._hostmap:
180 for c in self._hostmap[host]:
180 for c in self._hostmap[host]:
181 if self._readymap[c]:
181 if self._readymap[c]:
182 self._readymap[c] = 0
182 self._readymap[c] = 0
183 conn = c
183 conn = c
184 break
184 break
185 finally:
185 finally:
186 self._lock.release()
186 self._lock.release()
187 return conn
187 return conn
188
188
189 def get_all(self, host=None):
189 def get_all(self, host=None):
190 if host:
190 if host:
191 return list(self._hostmap.get(host, []))
191 return list(self._hostmap.get(host, []))
192 else:
192 else:
193 return dict(self._hostmap)
193 return dict(self._hostmap)
194
194
195 class KeepAliveHandler(object):
195 class KeepAliveHandler(object):
196 def __init__(self):
196 def __init__(self):
197 self._cm = ConnectionManager()
197 self._cm = ConnectionManager()
198
198
199 #### Connection Management
199 #### Connection Management
200 def open_connections(self):
200 def open_connections(self):
201 """return a list of connected hosts and the number of connections
201 """return a list of connected hosts and the number of connections
202 to each. [('foo.com:80', 2), ('bar.org', 1)]"""
202 to each. [('foo.com:80', 2), ('bar.org', 1)]"""
203 return [(host, len(li)) for (host, li) in self._cm.get_all().items()]
203 return [(host, len(li)) for (host, li) in self._cm.get_all().items()]
204
204
205 def close_connection(self, host):
205 def close_connection(self, host):
206 """close connection(s) to <host>
206 """close connection(s) to <host>
207 host is the host:port spec, as in 'www.cnn.com:8080' as passed in.
207 host is the host:port spec, as in 'www.cnn.com:8080' as passed in.
208 no error occurs if there is no connection to that host."""
208 no error occurs if there is no connection to that host."""
209 for h in self._cm.get_all(host):
209 for h in self._cm.get_all(host):
210 self._cm.remove(h)
210 self._cm.remove(h)
211 h.close()
211 h.close()
212
212
213 def close_all(self):
213 def close_all(self):
214 """close all open connections"""
214 """close all open connections"""
215 for host, conns in self._cm.get_all().iteritems():
215 for host, conns in self._cm.get_all().iteritems():
216 for h in conns:
216 for h in conns:
217 self._cm.remove(h)
217 self._cm.remove(h)
218 h.close()
218 h.close()
219
219
220 def _request_closed(self, request, host, connection):
220 def _request_closed(self, request, host, connection):
221 """tells us that this request is now closed and that the
221 """tells us that this request is now closed and that the
222 connection is ready for another request"""
222 connection is ready for another request"""
223 self._cm.set_ready(connection, 1)
223 self._cm.set_ready(connection, 1)
224
224
225 def _remove_connection(self, host, connection, close=0):
225 def _remove_connection(self, host, connection, close=0):
226 if close:
226 if close:
227 connection.close()
227 connection.close()
228 self._cm.remove(connection)
228 self._cm.remove(connection)
229
229
230 #### Transaction Execution
230 #### Transaction Execution
231 def http_open(self, req):
231 def http_open(self, req):
232 return self.do_open(HTTPConnection, req)
232 return self.do_open(HTTPConnection, req)
233
233
234 def do_open(self, http_class, req):
234 def do_open(self, http_class, req):
235 host = req.get_host()
235 host = req.get_host()
236 if not host:
236 if not host:
237 raise urlerr.urlerror('no host given')
237 raise urlerr.urlerror('no host given')
238
238
239 try:
239 try:
240 h = self._cm.get_ready_conn(host)
240 h = self._cm.get_ready_conn(host)
241 while h:
241 while h:
242 r = self._reuse_connection(h, req, host)
242 r = self._reuse_connection(h, req, host)
243
243
244 # if this response is non-None, then it worked and we're
244 # if this response is non-None, then it worked and we're
245 # done. Break out, skipping the else block.
245 # done. Break out, skipping the else block.
246 if r:
246 if r:
247 break
247 break
248
248
249 # connection is bad - possibly closed by server
249 # connection is bad - possibly closed by server
250 # discard it and ask for the next free connection
250 # discard it and ask for the next free connection
251 h.close()
251 h.close()
252 self._cm.remove(h)
252 self._cm.remove(h)
253 h = self._cm.get_ready_conn(host)
253 h = self._cm.get_ready_conn(host)
254 else:
254 else:
255 # no (working) free connections were found. Create a new one.
255 # no (working) free connections were found. Create a new one.
256 h = http_class(host)
256 h = http_class(host)
257 if DEBUG:
257 if DEBUG:
258 DEBUG.info("creating new connection to %s (%d)",
258 DEBUG.info("creating new connection to %s (%d)",
259 host, id(h))
259 host, id(h))
260 self._cm.add(host, h, 0)
260 self._cm.add(host, h, 0)
261 self._start_transaction(h, req)
261 self._start_transaction(h, req)
262 r = h.getresponse()
262 r = h.getresponse()
263 except (socket.error, httplib.HTTPException) as err:
263 except (socket.error, httplib.HTTPException) as err:
264 raise urlerr.urlerror(err)
264 raise urlerr.urlerror(err)
265
265
266 # if not a persistent connection, don't try to reuse it
266 # if not a persistent connection, don't try to reuse it
267 if r.will_close:
267 if r.will_close:
268 self._cm.remove(h)
268 self._cm.remove(h)
269
269
270 if DEBUG:
270 if DEBUG:
271 DEBUG.info("STATUS: %s, %s", r.status, r.reason)
271 DEBUG.info("STATUS: %s, %s", r.status, r.reason)
272 r._handler = self
272 r._handler = self
273 r._host = host
273 r._host = host
274 r._url = req.get_full_url()
274 r._url = req.get_full_url()
275 r._connection = h
275 r._connection = h
276 r.code = r.status
276 r.code = r.status
277 r.headers = r.msg
277 r.headers = r.msg
278 r.msg = r.reason
278 r.msg = r.reason
279
279
280 if r.status == 200 or not HANDLE_ERRORS:
280 if r.status == 200 or not HANDLE_ERRORS:
281 return r
281 return r
282 else:
282 else:
283 return self.parent.error('http', req, r,
283 return self.parent.error('http', req, r,
284 r.status, r.msg, r.headers)
284 r.status, r.msg, r.headers)
285
285
286 def _reuse_connection(self, h, req, host):
286 def _reuse_connection(self, h, req, host):
287 """start the transaction with a re-used connection
287 """start the transaction with a re-used connection
288 return a response object (r) upon success or None on failure.
288 return a response object (r) upon success or None on failure.
289 This DOES not close or remove bad connections in cases where
289 This DOES not close or remove bad connections in cases where
290 it returns. However, if an unexpected exception occurs, it
290 it returns. However, if an unexpected exception occurs, it
291 will close and remove the connection before re-raising.
291 will close and remove the connection before re-raising.
292 """
292 """
293 try:
293 try:
294 self._start_transaction(h, req)
294 self._start_transaction(h, req)
295 r = h.getresponse()
295 r = h.getresponse()
296 # note: just because we got something back doesn't mean it
296 # note: just because we got something back doesn't mean it
297 # worked. We'll check the version below, too.
297 # worked. We'll check the version below, too.
298 except (socket.error, httplib.HTTPException):
298 except (socket.error, httplib.HTTPException):
299 r = None
299 r = None
300 except: # re-raises
300 except: # re-raises
301 # adding this block just in case we've missed
301 # adding this block just in case we've missed
302 # something we will still raise the exception, but
302 # something we will still raise the exception, but
303 # lets try and close the connection and remove it
303 # lets try and close the connection and remove it
304 # first. We previously got into a nasty loop
304 # first. We previously got into a nasty loop
305 # where an exception was uncaught, and so the
305 # where an exception was uncaught, and so the
306 # connection stayed open. On the next try, the
306 # connection stayed open. On the next try, the
307 # same exception was raised, etc. The trade-off is
307 # same exception was raised, etc. The trade-off is
308 # that it's now possible this call will raise
308 # that it's now possible this call will raise
309 # a DIFFERENT exception
309 # a DIFFERENT exception
310 if DEBUG:
310 if DEBUG:
311 DEBUG.error("unexpected exception - closing "
311 DEBUG.error("unexpected exception - closing "
312 "connection to %s (%d)", host, id(h))
312 "connection to %s (%d)", host, id(h))
313 self._cm.remove(h)
313 self._cm.remove(h)
314 h.close()
314 h.close()
315 raise
315 raise
316
316
317 if r is None or r.version == 9:
317 if r is None or r.version == 9:
318 # httplib falls back to assuming HTTP 0.9 if it gets a
318 # httplib falls back to assuming HTTP 0.9 if it gets a
319 # bad header back. This is most likely to happen if
319 # bad header back. This is most likely to happen if
320 # the socket has been closed by the server since we
320 # the socket has been closed by the server since we
321 # last used the connection.
321 # last used the connection.
322 if DEBUG:
322 if DEBUG:
323 DEBUG.info("failed to re-use connection to %s (%d)",
323 DEBUG.info("failed to re-use connection to %s (%d)",
324 host, id(h))
324 host, id(h))
325 r = None
325 r = None
326 else:
326 else:
327 if DEBUG:
327 if DEBUG:
328 DEBUG.info("re-using connection to %s (%d)", host, id(h))
328 DEBUG.info("re-using connection to %s (%d)", host, id(h))
329
329
330 return r
330 return r
331
331
332 def _start_transaction(self, h, req):
332 def _start_transaction(self, h, req):
333 # What follows mostly reimplements HTTPConnection.request()
333 # What follows mostly reimplements HTTPConnection.request()
334 # except it adds self.parent.addheaders in the mix.
334 # except it adds self.parent.addheaders in the mix.
335 headers = req.headers.copy()
335 headers = dict(self.parent.addheaders)
336 if sys.version_info >= (2, 4):
336 headers.update(req.headers)
337 headers.update(req.unredirected_hdrs)
337 headers.update(req.unredirected_hdrs)
338 headers.update(self.parent.addheaders)
339 headers = dict((n.lower(), v) for n, v in headers.items())
338 headers = dict((n.lower(), v) for n, v in headers.items())
340 skipheaders = {}
339 skipheaders = {}
341 for n in ('host', 'accept-encoding'):
340 for n in ('host', 'accept-encoding'):
342 if n in headers:
341 if n in headers:
343 skipheaders['skip_' + n.replace('-', '_')] = 1
342 skipheaders['skip_' + n.replace('-', '_')] = 1
344 try:
343 try:
345 if req.has_data():
344 if req.has_data():
346 data = req.get_data()
345 data = req.get_data()
347 h.putrequest('POST', req.get_selector(), **skipheaders)
346 h.putrequest('POST', req.get_selector(), **skipheaders)
348 if 'content-type' not in headers:
347 if 'content-type' not in headers:
349 h.putheader('Content-type',
348 h.putheader('Content-type',
350 'application/x-www-form-urlencoded')
349 'application/x-www-form-urlencoded')
351 if 'content-length' not in headers:
350 if 'content-length' not in headers:
352 h.putheader('Content-length', '%d' % len(data))
351 h.putheader('Content-length', '%d' % len(data))
353 else:
352 else:
354 h.putrequest('GET', req.get_selector(), **skipheaders)
353 h.putrequest('GET', req.get_selector(), **skipheaders)
355 except socket.error as err:
354 except socket.error as err:
356 raise urlerr.urlerror(err)
355 raise urlerr.urlerror(err)
357 for k, v in headers.items():
356 for k, v in headers.items():
358 h.putheader(k, v)
357 h.putheader(k, v)
359 h.endheaders()
358 h.endheaders()
360 if req.has_data():
359 if req.has_data():
361 h.send(data)
360 h.send(data)
362
361
363 class HTTPHandler(KeepAliveHandler, urlreq.httphandler):
362 class HTTPHandler(KeepAliveHandler, urlreq.httphandler):
364 pass
363 pass
365
364
366 class HTTPResponse(httplib.HTTPResponse):
365 class HTTPResponse(httplib.HTTPResponse):
367 # we need to subclass HTTPResponse in order to
366 # we need to subclass HTTPResponse in order to
368 # 1) add readline() and readlines() methods
367 # 1) add readline() and readlines() methods
369 # 2) add close_connection() methods
368 # 2) add close_connection() methods
370 # 3) add info() and geturl() methods
369 # 3) add info() and geturl() methods
371
370
372 # in order to add readline(), read must be modified to deal with a
371 # in order to add readline(), read must be modified to deal with a
373 # buffer. example: readline must read a buffer and then spit back
372 # buffer. example: readline must read a buffer and then spit back
374 # one line at a time. The only real alternative is to read one
373 # one line at a time. The only real alternative is to read one
375 # BYTE at a time (ick). Once something has been read, it can't be
374 # BYTE at a time (ick). Once something has been read, it can't be
376 # put back (ok, maybe it can, but that's even uglier than this),
375 # put back (ok, maybe it can, but that's even uglier than this),
377 # so if you THEN do a normal read, you must first take stuff from
376 # so if you THEN do a normal read, you must first take stuff from
378 # the buffer.
377 # the buffer.
379
378
380 # the read method wraps the original to accommodate buffering,
379 # the read method wraps the original to accommodate buffering,
381 # although read() never adds to the buffer.
380 # although read() never adds to the buffer.
382 # Both readline and readlines have been stolen with almost no
381 # Both readline and readlines have been stolen with almost no
383 # modification from socket.py
382 # modification from socket.py
384
383
385
384
386 def __init__(self, sock, debuglevel=0, strict=0, method=None):
385 def __init__(self, sock, debuglevel=0, strict=0, method=None):
387 httplib.HTTPResponse.__init__(self, sock, debuglevel, method)
386 httplib.HTTPResponse.__init__(self, sock, debuglevel, method)
388 self.fileno = sock.fileno
387 self.fileno = sock.fileno
389 self.code = None
388 self.code = None
390 self._rbuf = ''
389 self._rbuf = ''
391 self._rbufsize = 8096
390 self._rbufsize = 8096
392 self._handler = None # inserted by the handler later
391 self._handler = None # inserted by the handler later
393 self._host = None # (same)
392 self._host = None # (same)
394 self._url = None # (same)
393 self._url = None # (same)
395 self._connection = None # (same)
394 self._connection = None # (same)
396
395
397 _raw_read = httplib.HTTPResponse.read
396 _raw_read = httplib.HTTPResponse.read
398
397
399 def close(self):
398 def close(self):
400 if self.fp:
399 if self.fp:
401 self.fp.close()
400 self.fp.close()
402 self.fp = None
401 self.fp = None
403 if self._handler:
402 if self._handler:
404 self._handler._request_closed(self, self._host,
403 self._handler._request_closed(self, self._host,
405 self._connection)
404 self._connection)
406
405
407 def close_connection(self):
406 def close_connection(self):
408 self._handler._remove_connection(self._host, self._connection, close=1)
407 self._handler._remove_connection(self._host, self._connection, close=1)
409 self.close()
408 self.close()
410
409
411 def info(self):
410 def info(self):
412 return self.headers
411 return self.headers
413
412
414 def geturl(self):
413 def geturl(self):
415 return self._url
414 return self._url
416
415
417 def read(self, amt=None):
416 def read(self, amt=None):
418 # the _rbuf test is only in this first if for speed. It's not
417 # the _rbuf test is only in this first if for speed. It's not
419 # logically necessary
418 # logically necessary
420 if self._rbuf and not amt is None:
419 if self._rbuf and not amt is None:
421 L = len(self._rbuf)
420 L = len(self._rbuf)
422 if amt > L:
421 if amt > L:
423 amt -= L
422 amt -= L
424 else:
423 else:
425 s = self._rbuf[:amt]
424 s = self._rbuf[:amt]
426 self._rbuf = self._rbuf[amt:]
425 self._rbuf = self._rbuf[amt:]
427 return s
426 return s
428
427
429 s = self._rbuf + self._raw_read(amt)
428 s = self._rbuf + self._raw_read(amt)
430 self._rbuf = ''
429 self._rbuf = ''
431 return s
430 return s
432
431
433 # stolen from Python SVN #68532 to fix issue1088
432 # stolen from Python SVN #68532 to fix issue1088
434 def _read_chunked(self, amt):
433 def _read_chunked(self, amt):
435 chunk_left = self.chunk_left
434 chunk_left = self.chunk_left
436 value = ''
435 value = ''
437
436
438 # XXX This accumulates chunks by repeated string concatenation,
437 # XXX This accumulates chunks by repeated string concatenation,
439 # which is not efficient as the number or size of chunks gets big.
438 # which is not efficient as the number or size of chunks gets big.
440 while True:
439 while True:
441 if chunk_left is None:
440 if chunk_left is None:
442 line = self.fp.readline()
441 line = self.fp.readline()
443 i = line.find(';')
442 i = line.find(';')
444 if i >= 0:
443 if i >= 0:
445 line = line[:i] # strip chunk-extensions
444 line = line[:i] # strip chunk-extensions
446 try:
445 try:
447 chunk_left = int(line, 16)
446 chunk_left = int(line, 16)
448 except ValueError:
447 except ValueError:
449 # close the connection as protocol synchronization is
448 # close the connection as protocol synchronization is
450 # probably lost
449 # probably lost
451 self.close()
450 self.close()
452 raise httplib.IncompleteRead(value)
451 raise httplib.IncompleteRead(value)
453 if chunk_left == 0:
452 if chunk_left == 0:
454 break
453 break
455 if amt is None:
454 if amt is None:
456 value += self._safe_read(chunk_left)
455 value += self._safe_read(chunk_left)
457 elif amt < chunk_left:
456 elif amt < chunk_left:
458 value += self._safe_read(amt)
457 value += self._safe_read(amt)
459 self.chunk_left = chunk_left - amt
458 self.chunk_left = chunk_left - amt
460 return value
459 return value
461 elif amt == chunk_left:
460 elif amt == chunk_left:
462 value += self._safe_read(amt)
461 value += self._safe_read(amt)
463 self._safe_read(2) # toss the CRLF at the end of the chunk
462 self._safe_read(2) # toss the CRLF at the end of the chunk
464 self.chunk_left = None
463 self.chunk_left = None
465 return value
464 return value
466 else:
465 else:
467 value += self._safe_read(chunk_left)
466 value += self._safe_read(chunk_left)
468 amt -= chunk_left
467 amt -= chunk_left
469
468
470 # we read the whole chunk, get another
469 # we read the whole chunk, get another
471 self._safe_read(2) # toss the CRLF at the end of the chunk
470 self._safe_read(2) # toss the CRLF at the end of the chunk
472 chunk_left = None
471 chunk_left = None
473
472
474 # read and discard trailer up to the CRLF terminator
473 # read and discard trailer up to the CRLF terminator
475 ### note: we shouldn't have any trailers!
474 ### note: we shouldn't have any trailers!
476 while True:
475 while True:
477 line = self.fp.readline()
476 line = self.fp.readline()
478 if not line:
477 if not line:
479 # a vanishingly small number of sites EOF without
478 # a vanishingly small number of sites EOF without
480 # sending the trailer
479 # sending the trailer
481 break
480 break
482 if line == '\r\n':
481 if line == '\r\n':
483 break
482 break
484
483
485 # we read everything; close the "file"
484 # we read everything; close the "file"
486 self.close()
485 self.close()
487
486
488 return value
487 return value
489
488
490 def readline(self, limit=-1):
489 def readline(self, limit=-1):
491 i = self._rbuf.find('\n')
490 i = self._rbuf.find('\n')
492 while i < 0 and not (0 < limit <= len(self._rbuf)):
491 while i < 0 and not (0 < limit <= len(self._rbuf)):
493 new = self._raw_read(self._rbufsize)
492 new = self._raw_read(self._rbufsize)
494 if not new:
493 if not new:
495 break
494 break
496 i = new.find('\n')
495 i = new.find('\n')
497 if i >= 0:
496 if i >= 0:
498 i = i + len(self._rbuf)
497 i = i + len(self._rbuf)
499 self._rbuf = self._rbuf + new
498 self._rbuf = self._rbuf + new
500 if i < 0:
499 if i < 0:
501 i = len(self._rbuf)
500 i = len(self._rbuf)
502 else:
501 else:
503 i = i + 1
502 i = i + 1
504 if 0 <= limit < len(self._rbuf):
503 if 0 <= limit < len(self._rbuf):
505 i = limit
504 i = limit
506 data, self._rbuf = self._rbuf[:i], self._rbuf[i:]
505 data, self._rbuf = self._rbuf[:i], self._rbuf[i:]
507 return data
506 return data
508
507
509 def readlines(self, sizehint=0):
508 def readlines(self, sizehint=0):
510 total = 0
509 total = 0
511 list = []
510 list = []
512 while True:
511 while True:
513 line = self.readline()
512 line = self.readline()
514 if not line:
513 if not line:
515 break
514 break
516 list.append(line)
515 list.append(line)
517 total += len(line)
516 total += len(line)
518 if sizehint and total >= sizehint:
517 if sizehint and total >= sizehint:
519 break
518 break
520 return list
519 return list
521
520
522 def safesend(self, str):
521 def safesend(self, str):
523 """Send `str' to the server.
522 """Send `str' to the server.
524
523
525 Shamelessly ripped off from httplib to patch a bad behavior.
524 Shamelessly ripped off from httplib to patch a bad behavior.
526 """
525 """
527 # _broken_pipe_resp is an attribute we set in this function
526 # _broken_pipe_resp is an attribute we set in this function
528 # if the socket is closed while we're sending data but
527 # if the socket is closed while we're sending data but
529 # the server sent us a response before hanging up.
528 # the server sent us a response before hanging up.
530 # In that case, we want to pretend to send the rest of the
529 # In that case, we want to pretend to send the rest of the
531 # outgoing data, and then let the user use getresponse()
530 # outgoing data, and then let the user use getresponse()
532 # (which we wrap) to get this last response before
531 # (which we wrap) to get this last response before
533 # opening a new socket.
532 # opening a new socket.
534 if getattr(self, '_broken_pipe_resp', None) is not None:
533 if getattr(self, '_broken_pipe_resp', None) is not None:
535 return
534 return
536
535
537 if self.sock is None:
536 if self.sock is None:
538 if self.auto_open:
537 if self.auto_open:
539 self.connect()
538 self.connect()
540 else:
539 else:
541 raise httplib.NotConnected
540 raise httplib.NotConnected
542
541
543 # send the data to the server. if we get a broken pipe, then close
542 # send the data to the server. if we get a broken pipe, then close
544 # the socket. we want to reconnect when somebody tries to send again.
543 # the socket. we want to reconnect when somebody tries to send again.
545 #
544 #
546 # NOTE: we DO propagate the error, though, because we cannot simply
545 # NOTE: we DO propagate the error, though, because we cannot simply
547 # ignore the error... the caller will know if they can retry.
546 # ignore the error... the caller will know if they can retry.
548 if self.debuglevel > 0:
547 if self.debuglevel > 0:
549 print("send:", repr(str))
548 print("send:", repr(str))
550 try:
549 try:
551 blocksize = 8192
550 blocksize = 8192
552 read = getattr(str, 'read', None)
551 read = getattr(str, 'read', None)
553 if read is not None:
552 if read is not None:
554 if self.debuglevel > 0:
553 if self.debuglevel > 0:
555 print("sending a read()able")
554 print("sending a read()able")
556 data = read(blocksize)
555 data = read(blocksize)
557 while data:
556 while data:
558 self.sock.sendall(data)
557 self.sock.sendall(data)
559 data = read(blocksize)
558 data = read(blocksize)
560 else:
559 else:
561 self.sock.sendall(str)
560 self.sock.sendall(str)
562 except socket.error as v:
561 except socket.error as v:
563 reraise = True
562 reraise = True
564 if v[0] == errno.EPIPE: # Broken pipe
563 if v[0] == errno.EPIPE: # Broken pipe
565 if self._HTTPConnection__state == httplib._CS_REQ_SENT:
564 if self._HTTPConnection__state == httplib._CS_REQ_SENT:
566 self._broken_pipe_resp = None
565 self._broken_pipe_resp = None
567 self._broken_pipe_resp = self.getresponse()
566 self._broken_pipe_resp = self.getresponse()
568 reraise = False
567 reraise = False
569 self.close()
568 self.close()
570 if reraise:
569 if reraise:
571 raise
570 raise
572
571
573 def wrapgetresponse(cls):
572 def wrapgetresponse(cls):
574 """Wraps getresponse in cls with a broken-pipe sane version.
573 """Wraps getresponse in cls with a broken-pipe sane version.
575 """
574 """
576 def safegetresponse(self):
575 def safegetresponse(self):
577 # In safesend() we might set the _broken_pipe_resp
576 # In safesend() we might set the _broken_pipe_resp
578 # attribute, in which case the socket has already
577 # attribute, in which case the socket has already
579 # been closed and we just need to give them the response
578 # been closed and we just need to give them the response
580 # back. Otherwise, we use the normal response path.
579 # back. Otherwise, we use the normal response path.
581 r = getattr(self, '_broken_pipe_resp', None)
580 r = getattr(self, '_broken_pipe_resp', None)
582 if r is not None:
581 if r is not None:
583 return r
582 return r
584 return cls.getresponse(self)
583 return cls.getresponse(self)
585 safegetresponse.__doc__ = cls.getresponse.__doc__
584 safegetresponse.__doc__ = cls.getresponse.__doc__
586 return safegetresponse
585 return safegetresponse
587
586
588 class HTTPConnection(httplib.HTTPConnection):
587 class HTTPConnection(httplib.HTTPConnection):
589 # use the modified response class
588 # use the modified response class
590 response_class = HTTPResponse
589 response_class = HTTPResponse
591 send = safesend
590 send = safesend
592 getresponse = wrapgetresponse(httplib.HTTPConnection)
591 getresponse = wrapgetresponse(httplib.HTTPConnection)
593
592
594
593
595 #########################################################################
594 #########################################################################
596 ##### TEST FUNCTIONS
595 ##### TEST FUNCTIONS
597 #########################################################################
596 #########################################################################
598
597
599 def error_handler(url):
598 def error_handler(url):
600 global HANDLE_ERRORS
599 global HANDLE_ERRORS
601 orig = HANDLE_ERRORS
600 orig = HANDLE_ERRORS
602 keepalive_handler = HTTPHandler()
601 keepalive_handler = HTTPHandler()
603 opener = urlreq.buildopener(keepalive_handler)
602 opener = urlreq.buildopener(keepalive_handler)
604 urlreq.installopener(opener)
603 urlreq.installopener(opener)
605 pos = {0: 'off', 1: 'on'}
604 pos = {0: 'off', 1: 'on'}
606 for i in (0, 1):
605 for i in (0, 1):
607 print(" fancy error handling %s (HANDLE_ERRORS = %i)" % (pos[i], i))
606 print(" fancy error handling %s (HANDLE_ERRORS = %i)" % (pos[i], i))
608 HANDLE_ERRORS = i
607 HANDLE_ERRORS = i
609 try:
608 try:
610 fo = urlreq.urlopen(url)
609 fo = urlreq.urlopen(url)
611 fo.read()
610 fo.read()
612 fo.close()
611 fo.close()
613 try:
612 try:
614 status, reason = fo.status, fo.reason
613 status, reason = fo.status, fo.reason
615 except AttributeError:
614 except AttributeError:
616 status, reason = None, None
615 status, reason = None, None
617 except IOError as e:
616 except IOError as e:
618 print(" EXCEPTION: %s" % e)
617 print(" EXCEPTION: %s" % e)
619 raise
618 raise
620 else:
619 else:
621 print(" status = %s, reason = %s" % (status, reason))
620 print(" status = %s, reason = %s" % (status, reason))
622 HANDLE_ERRORS = orig
621 HANDLE_ERRORS = orig
623 hosts = keepalive_handler.open_connections()
622 hosts = keepalive_handler.open_connections()
624 print("open connections:", hosts)
623 print("open connections:", hosts)
625 keepalive_handler.close_all()
624 keepalive_handler.close_all()
626
625
627 def continuity(url):
626 def continuity(url):
628 md5 = hashlib.md5
627 md5 = hashlib.md5
629 format = '%25s: %s'
628 format = '%25s: %s'
630
629
631 # first fetch the file with the normal http handler
630 # first fetch the file with the normal http handler
632 opener = urlreq.buildopener()
631 opener = urlreq.buildopener()
633 urlreq.installopener(opener)
632 urlreq.installopener(opener)
634 fo = urlreq.urlopen(url)
633 fo = urlreq.urlopen(url)
635 foo = fo.read()
634 foo = fo.read()
636 fo.close()
635 fo.close()
637 m = md5(foo)
636 m = md5(foo)
638 print(format % ('normal urllib', m.hexdigest()))
637 print(format % ('normal urllib', m.hexdigest()))
639
638
640 # now install the keepalive handler and try again
639 # now install the keepalive handler and try again
641 opener = urlreq.buildopener(HTTPHandler())
640 opener = urlreq.buildopener(HTTPHandler())
642 urlreq.installopener(opener)
641 urlreq.installopener(opener)
643
642
644 fo = urlreq.urlopen(url)
643 fo = urlreq.urlopen(url)
645 foo = fo.read()
644 foo = fo.read()
646 fo.close()
645 fo.close()
647 m = md5(foo)
646 m = md5(foo)
648 print(format % ('keepalive read', m.hexdigest()))
647 print(format % ('keepalive read', m.hexdigest()))
649
648
650 fo = urlreq.urlopen(url)
649 fo = urlreq.urlopen(url)
651 foo = ''
650 foo = ''
652 while True:
651 while True:
653 f = fo.readline()
652 f = fo.readline()
654 if f:
653 if f:
655 foo = foo + f
654 foo = foo + f
656 else: break
655 else: break
657 fo.close()
656 fo.close()
658 m = md5(foo)
657 m = md5(foo)
659 print(format % ('keepalive readline', m.hexdigest()))
658 print(format % ('keepalive readline', m.hexdigest()))
660
659
661 def comp(N, url):
660 def comp(N, url):
662 print(' making %i connections to:\n %s' % (N, url))
661 print(' making %i connections to:\n %s' % (N, url))
663
662
664 sys.stdout.write(' first using the normal urllib handlers')
663 sys.stdout.write(' first using the normal urllib handlers')
665 # first use normal opener
664 # first use normal opener
666 opener = urlreq.buildopener()
665 opener = urlreq.buildopener()
667 urlreq.installopener(opener)
666 urlreq.installopener(opener)
668 t1 = fetch(N, url)
667 t1 = fetch(N, url)
669 print(' TIME: %.3f s' % t1)
668 print(' TIME: %.3f s' % t1)
670
669
671 sys.stdout.write(' now using the keepalive handler ')
670 sys.stdout.write(' now using the keepalive handler ')
672 # now install the keepalive handler and try again
671 # now install the keepalive handler and try again
673 opener = urlreq.buildopener(HTTPHandler())
672 opener = urlreq.buildopener(HTTPHandler())
674 urlreq.installopener(opener)
673 urlreq.installopener(opener)
675 t2 = fetch(N, url)
674 t2 = fetch(N, url)
676 print(' TIME: %.3f s' % t2)
675 print(' TIME: %.3f s' % t2)
677 print(' improvement factor: %.2f' % (t1 / t2))
676 print(' improvement factor: %.2f' % (t1 / t2))
678
677
679 def fetch(N, url, delay=0):
678 def fetch(N, url, delay=0):
680 import time
679 import time
681 lens = []
680 lens = []
682 starttime = time.time()
681 starttime = time.time()
683 for i in range(N):
682 for i in range(N):
684 if delay and i > 0:
683 if delay and i > 0:
685 time.sleep(delay)
684 time.sleep(delay)
686 fo = urlreq.urlopen(url)
685 fo = urlreq.urlopen(url)
687 foo = fo.read()
686 foo = fo.read()
688 fo.close()
687 fo.close()
689 lens.append(len(foo))
688 lens.append(len(foo))
690 diff = time.time() - starttime
689 diff = time.time() - starttime
691
690
692 j = 0
691 j = 0
693 for i in lens[1:]:
692 for i in lens[1:]:
694 j = j + 1
693 j = j + 1
695 if not i == lens[0]:
694 if not i == lens[0]:
696 print("WARNING: inconsistent length on read %i: %i" % (j, i))
695 print("WARNING: inconsistent length on read %i: %i" % (j, i))
697
696
698 return diff
697 return diff
699
698
700 def test_timeout(url):
699 def test_timeout(url):
701 global DEBUG
700 global DEBUG
702 dbbackup = DEBUG
701 dbbackup = DEBUG
703 class FakeLogger(object):
702 class FakeLogger(object):
704 def debug(self, msg, *args):
703 def debug(self, msg, *args):
705 print(msg % args)
704 print(msg % args)
706 info = warning = error = debug
705 info = warning = error = debug
707 DEBUG = FakeLogger()
706 DEBUG = FakeLogger()
708 print(" fetching the file to establish a connection")
707 print(" fetching the file to establish a connection")
709 fo = urlreq.urlopen(url)
708 fo = urlreq.urlopen(url)
710 data1 = fo.read()
709 data1 = fo.read()
711 fo.close()
710 fo.close()
712
711
713 i = 20
712 i = 20
714 print(" waiting %i seconds for the server to close the connection" % i)
713 print(" waiting %i seconds for the server to close the connection" % i)
715 while i > 0:
714 while i > 0:
716 sys.stdout.write('\r %2i' % i)
715 sys.stdout.write('\r %2i' % i)
717 sys.stdout.flush()
716 sys.stdout.flush()
718 time.sleep(1)
717 time.sleep(1)
719 i -= 1
718 i -= 1
720 sys.stderr.write('\r')
719 sys.stderr.write('\r')
721
720
722 print(" fetching the file a second time")
721 print(" fetching the file a second time")
723 fo = urlreq.urlopen(url)
722 fo = urlreq.urlopen(url)
724 data2 = fo.read()
723 data2 = fo.read()
725 fo.close()
724 fo.close()
726
725
727 if data1 == data2:
726 if data1 == data2:
728 print(' data are identical')
727 print(' data are identical')
729 else:
728 else:
730 print(' ERROR: DATA DIFFER')
729 print(' ERROR: DATA DIFFER')
731
730
732 DEBUG = dbbackup
731 DEBUG = dbbackup
733
732
734
733
735 def test(url, N=10):
734 def test(url, N=10):
736 print("checking error handler (do this on a non-200)")
735 print("checking error handler (do this on a non-200)")
737 try: error_handler(url)
736 try: error_handler(url)
738 except IOError:
737 except IOError:
739 print("exiting - exception will prevent further tests")
738 print("exiting - exception will prevent further tests")
740 sys.exit()
739 sys.exit()
741 print('')
740 print('')
742 print("performing continuity test (making sure stuff isn't corrupted)")
741 print("performing continuity test (making sure stuff isn't corrupted)")
743 continuity(url)
742 continuity(url)
744 print('')
743 print('')
745 print("performing speed comparison")
744 print("performing speed comparison")
746 comp(N, url)
745 comp(N, url)
747 print('')
746 print('')
748 print("performing dropped-connection check")
747 print("performing dropped-connection check")
749 test_timeout(url)
748 test_timeout(url)
750
749
751 if __name__ == '__main__':
750 if __name__ == '__main__':
752 import time
751 import time
753 try:
752 try:
754 N = int(sys.argv[1])
753 N = int(sys.argv[1])
755 url = sys.argv[2]
754 url = sys.argv[2]
756 except (IndexError, ValueError):
755 except (IndexError, ValueError):
757 print("%s <integer> <url>" % sys.argv[0])
756 print("%s <integer> <url>" % sys.argv[0])
758 else:
757 else:
759 test(url, N)
758 test(url, N)
General Comments 0
You need to be logged in to leave comments. Login now