##// END OF EJS Templates
keepalive: send HTTP request headers in a deterministic order...
Gregory Szorc -
r31999:aa836f56 default
parent child Browse files
Show More
@@ -1,708 +1,709 b''
1 # This library is free software; you can redistribute it and/or
1 # This library is free software; you can redistribute it and/or
2 # modify it under the terms of the GNU Lesser General Public
2 # modify it under the terms of the GNU Lesser General Public
3 # License as published by the Free Software Foundation; either
3 # License as published by the Free Software Foundation; either
4 # version 2.1 of the License, or (at your option) any later version.
4 # version 2.1 of the License, or (at your option) any later version.
5 #
5 #
6 # This library is distributed in the hope that it will be useful,
6 # This library is distributed in the hope that it will be useful,
7 # but WITHOUT ANY WARRANTY; without even the implied warranty of
7 # but WITHOUT ANY WARRANTY; without even the implied warranty of
8 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
8 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
9 # Lesser General Public License for more details.
9 # Lesser General Public License for more details.
10 #
10 #
11 # You should have received a copy of the GNU Lesser General Public
11 # You should have received a copy of the GNU Lesser General Public
12 # License along with this library; if not, see
12 # License along with this library; if not, see
13 # <http://www.gnu.org/licenses/>.
13 # <http://www.gnu.org/licenses/>.
14
14
15 # This file is part of urlgrabber, a high-level cross-protocol url-grabber
15 # This file is part of urlgrabber, a high-level cross-protocol url-grabber
16 # Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
16 # Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
17
17
18 # Modified by Benoit Boissinot:
18 # Modified by Benoit Boissinot:
19 # - fix for digest auth (inspired from urllib2.py @ Python v2.4)
19 # - fix for digest auth (inspired from urllib2.py @ Python v2.4)
20 # Modified by Dirkjan Ochtman:
20 # Modified by Dirkjan Ochtman:
21 # - import md5 function from a local util module
21 # - import md5 function from a local util module
22 # Modified by Augie Fackler:
22 # Modified by Augie Fackler:
23 # - add safesend method and use it to prevent broken pipe errors
23 # - add safesend method and use it to prevent broken pipe errors
24 # on large POST requests
24 # on large POST requests
25
25
26 """An HTTP handler for urllib2 that supports HTTP 1.1 and keepalive.
26 """An HTTP handler for urllib2 that supports HTTP 1.1 and keepalive.
27
27
28 >>> import urllib2
28 >>> import urllib2
29 >>> from keepalive import HTTPHandler
29 >>> from keepalive import HTTPHandler
30 >>> keepalive_handler = HTTPHandler()
30 >>> keepalive_handler = HTTPHandler()
31 >>> opener = urlreq.buildopener(keepalive_handler)
31 >>> opener = urlreq.buildopener(keepalive_handler)
32 >>> urlreq.installopener(opener)
32 >>> urlreq.installopener(opener)
33 >>>
33 >>>
34 >>> fo = urlreq.urlopen('http://www.python.org')
34 >>> fo = urlreq.urlopen('http://www.python.org')
35
35
36 If a connection to a given host is requested, and all of the existing
36 If a connection to a given host is requested, and all of the existing
37 connections are still in use, another connection will be opened. If
37 connections are still in use, another connection will be opened. If
38 the handler tries to use an existing connection but it fails in some
38 the handler tries to use an existing connection but it fails in some
39 way, it will be closed and removed from the pool.
39 way, it will be closed and removed from the pool.
40
40
41 To remove the handler, simply re-run build_opener with no arguments, and
41 To remove the handler, simply re-run build_opener with no arguments, and
42 install that opener.
42 install that opener.
43
43
44 You can explicitly close connections by using the close_connection()
44 You can explicitly close connections by using the close_connection()
45 method of the returned file-like object (described below) or you can
45 method of the returned file-like object (described below) or you can
46 use the handler methods:
46 use the handler methods:
47
47
48 close_connection(host)
48 close_connection(host)
49 close_all()
49 close_all()
50 open_connections()
50 open_connections()
51
51
52 NOTE: using the close_connection and close_all methods of the handler
52 NOTE: using the close_connection and close_all methods of the handler
53 should be done with care when using multiple threads.
53 should be done with care when using multiple threads.
54 * there is nothing that prevents another thread from creating new
54 * there is nothing that prevents another thread from creating new
55 connections immediately after connections are closed
55 connections immediately after connections are closed
56 * no checks are done to prevent in-use connections from being closed
56 * no checks are done to prevent in-use connections from being closed
57
57
58 >>> keepalive_handler.close_all()
58 >>> keepalive_handler.close_all()
59
59
60 EXTRA ATTRIBUTES AND METHODS
60 EXTRA ATTRIBUTES AND METHODS
61
61
62 Upon a status of 200, the object returned has a few additional
62 Upon a status of 200, the object returned has a few additional
63 attributes and methods, which should not be used if you want to
63 attributes and methods, which should not be used if you want to
64 remain consistent with the normal urllib2-returned objects:
64 remain consistent with the normal urllib2-returned objects:
65
65
66 close_connection() - close the connection to the host
66 close_connection() - close the connection to the host
67 readlines() - you know, readlines()
67 readlines() - you know, readlines()
68 status - the return status (i.e. 404)
68 status - the return status (i.e. 404)
69 reason - english translation of status (i.e. 'File not found')
69 reason - english translation of status (i.e. 'File not found')
70
70
71 If you want the best of both worlds, use this inside an
71 If you want the best of both worlds, use this inside an
72 AttributeError-catching try:
72 AttributeError-catching try:
73
73
74 >>> try: status = fo.status
74 >>> try: status = fo.status
75 >>> except AttributeError: status = None
75 >>> except AttributeError: status = None
76
76
77 Unfortunately, these are ONLY there if status == 200, so it's not
77 Unfortunately, these are ONLY there if status == 200, so it's not
78 easy to distinguish between non-200 responses. The reason is that
78 easy to distinguish between non-200 responses. The reason is that
79 urllib2 tries to do clever things with error codes 301, 302, 401,
79 urllib2 tries to do clever things with error codes 301, 302, 401,
80 and 407, and it wraps the object upon return.
80 and 407, and it wraps the object upon return.
81 """
81 """
82
82
83 # $Id: keepalive.py,v 1.14 2006/04/04 21:00:32 mstenner Exp $
83 # $Id: keepalive.py,v 1.14 2006/04/04 21:00:32 mstenner Exp $
84
84
85 from __future__ import absolute_import, print_function
85 from __future__ import absolute_import, print_function
86
86
87 import errno
87 import errno
88 import hashlib
88 import hashlib
89 import socket
89 import socket
90 import sys
90 import sys
91 import threading
91 import threading
92
92
93 from . import (
93 from . import (
94 util,
94 util,
95 )
95 )
96
96
97 httplib = util.httplib
97 httplib = util.httplib
98 urlerr = util.urlerr
98 urlerr = util.urlerr
99 urlreq = util.urlreq
99 urlreq = util.urlreq
100
100
101 DEBUG = None
101 DEBUG = None
102
102
class ConnectionManager(object):
    """Thread-safe registry of keep-alive connections.

    The connection manager must be able to:
      * keep track of all existing connections, grouped by host
      * hand out an idle ("ready") connection for a host
      * mark connections ready/busy and forget them entirely
    """
    def __init__(self):
        self._lock = threading.Lock()
        self._hostmap = {} # map hosts to a list of connections
        self._connmap = {} # map connections to host
        self._readymap = {} # map connection to ready state

    def add(self, host, connection, ready):
        """Register a new connection for `host` with the given ready state."""
        # `with` is equivalent to acquire()/try/finally release(), but
        # cannot leak the lock on an unexpected exception path.
        with self._lock:
            if host not in self._hostmap:
                self._hostmap[host] = []
            self._hostmap[host].append(connection)
            self._connmap[connection] = host
            self._readymap[connection] = ready

    def remove(self, connection):
        """Forget `connection` entirely; a no-op if it is unknown."""
        with self._lock:
            try:
                host = self._connmap[connection]
            except KeyError:
                pass
            else:
                del self._connmap[connection]
                del self._readymap[connection]
                self._hostmap[host].remove(connection)
                # drop the host entry once its last connection is gone
                if not self._hostmap[host]:
                    del self._hostmap[host]

    def set_ready(self, connection, ready):
        """Flag `connection` as ready (idle) or busy.

        NOTE: deliberately lockless - a single dict store is atomic under
        CPython and this is called on the response-close hot path.
        """
        try:
            self._readymap[connection] = ready
        except KeyError:
            pass

    def get_ready_conn(self, host):
        """Return an idle connection for `host` (marking it busy), or None."""
        conn = None
        with self._lock:
            if host in self._hostmap:
                for c in self._hostmap[host]:
                    if self._readymap[c]:
                        self._readymap[c] = 0
                        conn = c
                        break
        return conn

    def get_all(self, host=None):
        """Return a copy: the list of connections for `host`, or the whole
        host -> connections mapping when no host is given."""
        if host:
            return list(self._hostmap.get(host, []))
        else:
            return dict(self._hostmap)
165
165
class KeepAliveHandler(object):
    """urllib2-style handler mixin implementing HTTP keep-alive.

    Maintains a ConnectionManager of open connections and re-uses an idle
    connection to a host when possible, falling back to opening a new one.
    """
    def __init__(self):
        self._cm = ConnectionManager()

    #### Connection Management
    def open_connections(self):
        """return a list of connected hosts and the number of connections
        to each.  [('foo.com:80', 2), ('bar.org', 1)]"""
        return [(host, len(li)) for (host, li) in self._cm.get_all().items()]

    def close_connection(self, host):
        """close connection(s) to <host>
        host is the host:port spec, as in 'www.cnn.com:8080' as passed in.
        no error occurs if there is no connection to that host."""
        for h in self._cm.get_all(host):
            self._cm.remove(h)
            h.close()

    def close_all(self):
        """close all open connections"""
        # .items() instead of the Python 2-only .iteritems(): this file
        # already targets 2/3 compatibility (__future__ imports above).
        for host, conns in self._cm.get_all().items():
            for h in conns:
                self._cm.remove(h)
                h.close()

    def _request_closed(self, request, host, connection):
        """tells us that this request is now closed and that the
        connection is ready for another request"""
        self._cm.set_ready(connection, 1)

    def _remove_connection(self, host, connection, close=0):
        # optionally close before forgetting the connection
        if close:
            connection.close()
        self._cm.remove(connection)

    #### Transaction Execution
    def http_open(self, req):
        return self.do_open(HTTPConnection, req)

    def do_open(self, http_class, req):
        """Issue `req`, re-using an idle connection to the host if one is
        available, and return the (annotated) response object.

        Raises urlerr.urlerror on missing host or socket/HTTP failures.
        """
        host = req.get_host()
        if not host:
            raise urlerr.urlerror('no host given')

        try:
            h = self._cm.get_ready_conn(host)
            while h:
                r = self._reuse_connection(h, req, host)

                # if this response is non-None, then it worked and we're
                # done.  Break out, skipping the else block.
                if r:
                    break

                # connection is bad - possibly closed by server
                # discard it and ask for the next free connection
                h.close()
                self._cm.remove(h)
                h = self._cm.get_ready_conn(host)
            else:
                # no (working) free connections were found.  Create a new one.
                h = http_class(host)
                if DEBUG:
                    DEBUG.info("creating new connection to %s (%d)",
                               host, id(h))
                self._cm.add(host, h, 0)
                self._start_transaction(h, req)
                r = h.getresponse()
        except (socket.error, httplib.HTTPException) as err:
            raise urlerr.urlerror(err)

        # if not a persistent connection, don't try to reuse it
        if r.will_close:
            self._cm.remove(h)

        if DEBUG:
            DEBUG.info("STATUS: %s, %s", r.status, r.reason)
        # annotate the response so close()/close_connection() can notify us
        r._handler = self
        r._host = host
        r._url = req.get_full_url()
        r._connection = h
        r.code = r.status
        r.headers = r.msg
        r.msg = r.reason

        return r

    def _reuse_connection(self, h, req, host):
        """start the transaction with a re-used connection
        return a response object (r) upon success or None on failure.
        This DOES not close or remove bad connections in cases where
        it returns.  However, if an unexpected exception occurs, it
        will close and remove the connection before re-raising.
        """
        try:
            self._start_transaction(h, req)
            r = h.getresponse()
            # note: just because we got something back doesn't mean it
            # worked. We'll check the version below, too.
        except (socket.error, httplib.HTTPException):
            r = None
        except: # re-raises
            # adding this block just in case we've missed
            # something we will still raise the exception, but
            # lets try and close the connection and remove it
            # first.  We previously got into a nasty loop
            # where an exception was uncaught, and so the
            # connection stayed open.  On the next try, the
            # same exception was raised, etc.  The trade-off is
            # that it's now possible this call will raise
            # a DIFFERENT exception
            if DEBUG:
                DEBUG.error("unexpected exception - closing "
                            "connection to %s (%d)", host, id(h))
            self._cm.remove(h)
            h.close()
            raise

        if r is None or r.version == 9:
            # httplib falls back to assuming HTTP 0.9 if it gets a
            # bad header back.  This is most likely to happen if
            # the socket has been closed by the server since we
            # last used the connection.
            if DEBUG:
                DEBUG.info("failed to re-use connection to %s (%d)",
                           host, id(h))
            r = None
        else:
            if DEBUG:
                DEBUG.info("re-using connection to %s (%d)", host, id(h))

        return r

    def _start_transaction(self, h, req):
        # What follows mostly reimplements HTTPConnection.request()
        # except it adds self.parent.addheaders in the mix and sends headers
        # in a deterministic order (to make testing easier).
        headers = util.sortdict(self.parent.addheaders)
        headers.update(sorted(req.headers.items()))
        headers.update(sorted(req.unredirected_hdrs.items()))
        headers = util.sortdict((n.lower(), v) for n, v in headers.items())
        skipheaders = {}
        for n in ('host', 'accept-encoding'):
            if n in headers:
                skipheaders['skip_' + n.replace('-', '_')] = 1
        try:
            if req.has_data():
                data = req.get_data()
                h.putrequest(
                    req.get_method(), req.get_selector(), **skipheaders)
                if 'content-type' not in headers:
                    h.putheader('Content-type',
                                'application/x-www-form-urlencoded')
                if 'content-length' not in headers:
                    h.putheader('Content-length', '%d' % len(data))
            else:
                h.putrequest(
                    req.get_method(), req.get_selector(), **skipheaders)
        except socket.error as err:
            raise urlerr.urlerror(err)
        for k, v in headers.items():
            h.putheader(k, v)
        h.endheaders()
        if req.has_data():
            h.send(data)
330
331
class HTTPHandler(KeepAliveHandler, urlreq.httphandler):
    """urllib2 HTTP handler that keeps connections alive (see module doc)."""
333
334
class HTTPResponse(httplib.HTTPResponse):
    # we need to subclass HTTPResponse in order to
    # 1) add readline() and readlines() methods
    # 2) add close_connection() methods
    # 3) add info() and geturl() methods

    # in order to add readline(), read must be modified to deal with a
    # buffer.  example: readline must read a buffer and then spit back
    # one line at a time.  The only real alternative is to read one
    # BYTE at a time (ick).  Once something has been read, it can't be
    # put back (ok, maybe it can, but that's even uglier than this),
    # so if you THEN do a normal read, you must first take stuff from
    # the buffer.

    # the read method wraps the original to accommodate buffering,
    # although read() never adds to the buffer.
    # Both readline and readlines have been stolen with almost no
    # modification from socket.py


    def __init__(self, sock, debuglevel=0, strict=0, method=None):
        httplib.HTTPResponse.__init__(self, sock, debuglevel, method)
        self.fileno = sock.fileno
        self.code = None
        self._rbuf = ''          # local read-ahead buffer used by readline()
        self._rbufsize = 8096    # chunk size for refilling the buffer
        self._handler = None # inserted by the handler later
        self._host = None # (same)
        self._url = None # (same)
        self._connection = None # (same)

    # keep a handle on the unbuffered base-class read
    _raw_read = httplib.HTTPResponse.read

    def close(self):
        """Close the response body and tell the handler the connection is
        ready for another request (the socket itself stays open)."""
        if self.fp:
            self.fp.close()
            self.fp = None
        if self._handler:
            self._handler._request_closed(self, self._host,
                                          self._connection)

    def close_connection(self):
        """Close the underlying connection as well as the response."""
        self._handler._remove_connection(self._host, self._connection, close=1)
        self.close()

    def info(self):
        # urllib2 compatibility: message headers of the response
        return self.headers

    def geturl(self):
        # urllib2 compatibility: the URL this response was fetched from
        return self._url

    def read(self, amt=None):
        """Read up to `amt` bytes (all remaining if None), draining the
        local buffer first."""
        # the _rbuf test is only in this first if for speed.  It's not
        # logically necessary
        if self._rbuf and amt is not None:
            L = len(self._rbuf)
            if amt > L:
                amt -= L
            else:
                # entire request satisfied from the buffer
                s = self._rbuf[:amt]
                self._rbuf = self._rbuf[amt:]
                return s

        s = self._rbuf + self._raw_read(amt)
        self._rbuf = ''
        return s

    # stolen from Python SVN #68532 to fix issue1088
    def _read_chunked(self, amt):
        chunk_left = self.chunk_left
        parts = []

        while True:
            if chunk_left is None:
                line = self.fp.readline()
                i = line.find(';')
                if i >= 0:
                    line = line[:i] # strip chunk-extensions
                try:
                    chunk_left = int(line, 16)
                except ValueError:
                    # close the connection as protocol synchronization is
                    # probably lost
                    self.close()
                    raise httplib.IncompleteRead(''.join(parts))
                if chunk_left == 0:
                    break
            if amt is None:
                parts.append(self._safe_read(chunk_left))
            elif amt < chunk_left:
                parts.append(self._safe_read(amt))
                self.chunk_left = chunk_left - amt
                return ''.join(parts)
            elif amt == chunk_left:
                parts.append(self._safe_read(amt))
                self._safe_read(2)  # toss the CRLF at the end of the chunk
                self.chunk_left = None
                return ''.join(parts)
            else:
                parts.append(self._safe_read(chunk_left))
                amt -= chunk_left

            # we read the whole chunk, get another
            self._safe_read(2)  # toss the CRLF at the end of the chunk
            chunk_left = None

        # read and discard trailer up to the CRLF terminator
        ### note: we shouldn't have any trailers!
        while True:
            line = self.fp.readline()
            if not line:
                # a vanishingly small number of sites EOF without
                # sending the trailer
                break
            if line == '\r\n':
                break

        # we read everything; close the "file"
        self.close()

        return ''.join(parts)

    def readline(self):
        # Fast path for a line is already available in read buffer.
        i = self._rbuf.find('\n')
        if i >= 0:
            i += 1
            line = self._rbuf[:i]
            self._rbuf = self._rbuf[i:]
            return line

        # No newline in local buffer. Read until we find one.
        chunks = [self._rbuf]
        i = -1
        readsize = self._rbufsize
        while True:
            new = self._raw_read(readsize)
            if not new:
                break

            chunks.append(new)
            i = new.find('\n')
            if i >= 0:
                break

        # We either have exhausted the stream or have a newline in chunks[-1].

        # EOF
        if i == -1:
            self._rbuf = ''
            return ''.join(chunks)

        i += 1
        self._rbuf = chunks[-1][i:]
        chunks[-1] = chunks[-1][:i]
        return ''.join(chunks)

    def readlines(self, sizehint=0):
        """Return a list of lines; stop early once `sizehint` bytes have
        been accumulated (0 means no limit)."""
        total = 0
        # renamed from `list` - don't shadow the builtin
        lines = []
        while True:
            line = self.readline()
            if not line:
                break
            lines.append(line)
            total += len(line)
            if sizehint and total >= sizehint:
                break
        return lines
503
504
def safesend(self, str):
    """Send `str' to the server.

    Shamelessly ripped off from httplib to patch a bad behavior.
    """
    # _broken_pipe_resp is set below when the peer closes the socket
    # mid-send but still delivered a response.  Once set, we pretend the
    # remaining outgoing data was sent; the wrapped getresponse() then
    # hands the stored response back before a new socket is opened.
    if getattr(self, '_broken_pipe_resp', None) is not None:
        return

    if self.sock is None:
        if not self.auto_open:
            raise httplib.NotConnected
        self.connect()

    # Push the data to the server.  On a broken pipe the socket is
    # closed so the next send reconnects, but the error still
    # propagates (unless the server already answered before hanging
    # up), because only the caller knows whether a retry is safe.
    if self.debuglevel > 0:
        print("send:", repr(str))
    try:
        reader = getattr(str, 'read', None)
        if reader is None:
            self.sock.sendall(str)
        else:
            if self.debuglevel > 0:
                print("sending a read()able")
            chunk = reader(8192)
            while chunk:
                self.sock.sendall(chunk)
                chunk = reader(8192)
    except socket.error as v:
        reraise = True
        if v[0] == errno.EPIPE:  # Broken pipe
            if self._HTTPConnection__state == httplib._CS_REQ_SENT:
                # The request went out fully before the pipe broke, so a
                # response may already be waiting; grab and stash it.
                self._broken_pipe_resp = None
                self._broken_pipe_resp = self.getresponse()
                reraise = False
            self.close()
        if reraise:
            raise
554
555
def wrapgetresponse(cls):
    """Wraps getresponse in cls with a broken-pipe sane version.
    """
    def safegetresponse(self):
        # safesend() may have stashed the response it received when the
        # server hung up mid-request; the socket is already closed in
        # that case, so just hand the stored response back.  Otherwise
        # fall through to the wrapped class's normal getresponse().
        cached = getattr(self, '_broken_pipe_resp', None)
        if cached is None:
            return cls.getresponse(self)
        return cached
    safegetresponse.__doc__ = cls.getresponse.__doc__
    return safegetresponse
569
570
class HTTPConnection(httplib.HTTPConnection):
    # use the modified response class
    response_class = HTTPResponse
    # send() that survives a broken pipe when the server responded
    # before hanging up, and a getresponse() that returns that stored
    # response instead of reading a closed socket
    send = safesend
    getresponse = wrapgetresponse(httplib.HTTPConnection)
575
576
576
577
577 #########################################################################
578 #########################################################################
578 ##### TEST FUNCTIONS
579 ##### TEST FUNCTIONS
579 #########################################################################
580 #########################################################################
580
581
581
582
def continuity(url):
    """Fetch ``url`` with and without the keepalive handler and print an
    MD5 digest of each result for manual comparison.

    Exercises both whole-body read() and line-by-line readline() access
    through the keepalive handler, as a sanity check that it does not
    corrupt data.

    Fix: the format-string local was named ``format``, shadowing the
    builtin; renamed to ``fmt``.
    """
    md5 = hashlib.md5
    fmt = '%25s: %s'

    # first fetch the file with the normal http handler
    opener = urlreq.buildopener()
    urlreq.installopener(opener)
    fo = urlreq.urlopen(url)
    foo = fo.read()
    fo.close()
    m = md5(foo)
    print(fmt % ('normal urllib', m.hexdigest()))

    # now install the keepalive handler and try again
    opener = urlreq.buildopener(HTTPHandler())
    urlreq.installopener(opener)

    fo = urlreq.urlopen(url)
    foo = fo.read()
    fo.close()
    m = md5(foo)
    print(fmt % ('keepalive read', m.hexdigest()))

    # same again, but accumulate the body line by line
    fo = urlreq.urlopen(url)
    foo = ''
    while True:
        f = fo.readline()
        if not f:
            break
        foo = foo + f
    fo.close()
    m = md5(foo)
    print(fmt % ('keepalive readline', m.hexdigest()))
615
616
def comp(N, url):
    """Time ``N`` fetches of ``url`` with the stock urllib handlers and
    then with the keepalive handler, printing both timings and the
    resulting speed-up factor.
    """
    print(' making %i connections to:\n %s' % (N, url))

    # baseline: the normal urllib opener
    util.stdout.write(' first using the normal urllib handlers')
    normal_opener = urlreq.buildopener()
    urlreq.installopener(normal_opener)
    t1 = fetch(N, url)
    print(' TIME: %.3f s' % t1)

    # same workload routed through the keepalive handler
    util.stdout.write(' now using the keepalive handler ')
    keepalive_opener = urlreq.buildopener(HTTPHandler())
    urlreq.installopener(keepalive_opener)
    t2 = fetch(N, url)
    print(' TIME: %.3f s' % t2)
    print(' improvement factor: %.2f' % (t1 / t2))
633
634
def fetch(N, url, delay=0):
    """Fetch ``url`` ``N`` times and return the total elapsed wall-clock
    time in seconds.

    If ``delay`` is non-zero, sleep that many seconds between fetches
    (after the first).  Prints a warning for any fetch whose body length
    differs from the first one, since repeated fetches of the same URL
    should yield identical payloads.
    """
    import time
    lens = []
    starttime = time.time()
    for i in range(N):
        if delay and i > 0:
            time.sleep(delay)
        fo = urlreq.urlopen(url)
        foo = fo.read()
        fo.close()
        lens.append(len(foo))
    diff = time.time() - starttime

    # all responses should be the same length; flag any that differ
    # (enumerate replaces the original hand-rolled counter)
    for j, length in enumerate(lens[1:], 1):
        if length != lens[0]:
            print("WARNING: inconsistent length on read %i: %i" % (j, length))

    return diff
654
655
def test_timeout(url):
    """Check that a server-dropped connection is handled transparently:
    fetch ``url``, wait 20 seconds for the server to close the idle
    keepalive connection, fetch again, and compare the two payloads.

    Temporarily replaces the module-level DEBUG logger with a printing
    one so the reconnect can be observed; restores it before returning.
    """
    # fix: 'time' is not imported at module level in the visible portion
    # of this file (only function-locally in fetch/__main__), so
    # time.sleep below would raise NameError — import it here.
    # TODO(review): confirm against the full file header.
    import time
    global DEBUG
    dbbackup = DEBUG
    class FakeLogger(object):
        def debug(self, msg, *args):
            print(msg % args)
        info = warning = error = debug
    DEBUG = FakeLogger()
    print(" fetching the file to establish a connection")
    fo = urlreq.urlopen(url)
    data1 = fo.read()
    fo.close()

    i = 20
    print(" waiting %i seconds for the server to close the connection" % i)
    while i > 0:
        util.stdout.write('\r %2i' % i)
        util.stdout.flush()
        time.sleep(1)
        i -= 1
    util.stderr.write('\r')

    print(" fetching the file a second time")
    fo = urlreq.urlopen(url)
    data2 = fo.read()
    fo.close()

    if data1 == data2:
        print(' data are identical')
    else:
        print(' ERROR: DATA DIFFER')

    DEBUG = dbbackup
688
689
689
690
def test(url, N=10):
    """Run the full manual test suite against ``url``: data continuity,
    a speed comparison over ``N`` fetches, and the dropped-connection
    check, each announced with a banner line.
    """
    print("performing continuity test (making sure stuff isn't corrupted)")
    continuity(url)
    print('')
    print("performing speed comparison")
    comp(N, url)
    print('')
    print("performing dropped-connection check")
    test_timeout(url)
699
700
if __name__ == '__main__':
    # manual test driver: keepalive.py <number-of-fetches> <url>
    import time
    try:
        count = int(sys.argv[1])
        target = sys.argv[2]
    except (IndexError, ValueError):
        # missing or non-integer arguments: show usage
        print("%s <integer> <url>" % sys.argv[0])
    else:
        test(target, count)
General Comments 0
You need to be logged in to leave comments. Login now