keepalive: handle broken pipes gracefully during large POSTs
Augie Fackler
r9726:430e59ff default
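
The first hunk (keepalive.py) teaches keepalive connections to survive a server that answers and hangs up partway through a large POST body, e.g. a 401 rejecting an unauthenticated push. A minimal sketch of the pattern it introduces, assuming Python 2 httplib semantics; `send_body`, `conn` and `body` are illustrative names, not part of the patch:

    import errno
    import httplib
    import socket

    def send_body(conn, body):
        # conn: an httplib.HTTPConnection whose request line and headers
        # have already gone out; body: the POST payload string.
        try:
            conn.sock.sendall(body)
        except socket.error, v:
            if v[0] == errno.EPIPE and \
               conn._HTTPConnection__state == httplib._CS_REQ_SENT:
                # The server hung up early but may already have written a
                # complete response; read and stash it so a wrapped
                # getresponse() can hand it back instead of raising.
                conn._broken_pipe_resp = conn.getresponse()
                conn.close()
            else:
                raise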
@@ -1,671 +1,741 @@
1 1 # This library is free software; you can redistribute it and/or
2 2 # modify it under the terms of the GNU Lesser General Public
3 3 # License as published by the Free Software Foundation; either
4 4 # version 2.1 of the License, or (at your option) any later version.
5 5 #
6 6 # This library is distributed in the hope that it will be useful,
7 7 # but WITHOUT ANY WARRANTY; without even the implied warranty of
8 8 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
9 9 # Lesser General Public License for more details.
10 10 #
11 11 # You should have received a copy of the GNU Lesser General Public
12 12 # License along with this library; if not, write to the
13 13 # Free Software Foundation, Inc.,
14 14 # 59 Temple Place, Suite 330,
15 15 # Boston, MA 02111-1307 USA
16 16
17 17 # This file is part of urlgrabber, a high-level cross-protocol url-grabber
18 18 # Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
19 19
20 20 # Modified by Benoit Boissinot:
21 21 # - fix for digest auth (inspired from urllib2.py @ Python v2.4)
22 22 # Modified by Dirkjan Ochtman:
23 23 # - import md5 function from a local util module
24 24 # Modified by Martin Geisler:
25 25 # - moved md5 function from local util module to this module
26 # Modified by Augie Fackler:
27 # - add safesend method and use it to prevent broken pipe errors
28 # on large POST requests
26 29
27 30 """An HTTP handler for urllib2 that supports HTTP 1.1 and keepalive.
28 31
29 32 >>> import urllib2
30 33 >>> from keepalive import HTTPHandler
31 34 >>> keepalive_handler = HTTPHandler()
32 35 >>> opener = urllib2.build_opener(keepalive_handler)
33 36 >>> urllib2.install_opener(opener)
34 37 >>>
35 38 >>> fo = urllib2.urlopen('http://www.python.org')
36 39
37 40 If a connection to a given host is requested, and all of the existing
38 41 connections are still in use, another connection will be opened. If
39 42 the handler tries to use an existing connection but it fails in some
40 43 way, it will be closed and removed from the pool.
41 44
42 45 To remove the handler, simply re-run build_opener with no arguments, and
43 46 install that opener.
44 47
45 48 You can explicitly close connections by using the close_connection()
46 49 method of the returned file-like object (described below) or you can
47 50 use the handler methods:
48 51
49 52 close_connection(host)
50 53 close_all()
51 54 open_connections()
52 55
53 56 NOTE: using the close_connection and close_all methods of the handler
54 57 should be done with care when using multiple threads.
55 58 * there is nothing that prevents another thread from creating new
56 59 connections immediately after connections are closed
57 60 * no checks are done to prevent in-use connections from being closed
58 61
59 62 >>> keepalive_handler.close_all()
60 63
61 64 EXTRA ATTRIBUTES AND METHODS
62 65
63 66 Upon a status of 200, the object returned has a few additional
64 67 attributes and methods, which should not be used if you want to
65 68 remain consistent with the normal urllib2-returned objects:
66 69
67 70 close_connection() - close the connection to the host
68 71 readlines() - you know, readlines()
69 72 status - the return status (ie 404)
70 73 reason - English translation of status (ie 'File not found')
71 74
72 75 If you want the best of both worlds, use this inside an
73 76 AttributeError-catching try:
74 77
75 78 >>> try: status = fo.status
76 79 >>> except AttributeError: status = None
77 80
78 81 Unfortunately, these are ONLY there if status == 200, so it's not
79 82 easy to distinguish between non-200 responses. The reason is that
80 83 urllib2 tries to do clever things with error codes 301, 302, 401,
81 84 and 407, and it wraps the object upon return.
82 85
83 86 For python versions earlier than 2.4, you can avoid this fancy error
84 87 handling by setting the module-level global HANDLE_ERRORS to zero.
85 88 You see, prior to 2.4, it's the HTTP Handler's job to determine what
86 89 to handle specially, and what to just pass up. HANDLE_ERRORS == 0
87 90 means "pass everything up". In python 2.4, however, this job no
88 91 longer belongs to the HTTP Handler and is now done by a NEW handler,
89 92 HTTPErrorProcessor. Here's the bottom line:
90 93
91 94 python version < 2.4
92 95 HANDLE_ERRORS == 1 (default) pass up 200, treat the rest as
93 96 errors
94 97 HANDLE_ERRORS == 0 pass everything up, error processing is
95 98 left to the calling code
96 99 python version >= 2.4
97 100 HANDLE_ERRORS == 1 pass up 200, treat the rest as errors
98 101 HANDLE_ERRORS == 0 (default) pass everything up, let the
99 102 other handlers (specifically,
100 103 HTTPErrorProcessor) decide what to do
101 104
102 105 In practice, setting the variable either way makes little difference
103 106 in python 2.4, so for the most consistent behavior across versions,
104 107 you probably just want to use the defaults, which will give you
105 108 exceptions on errors.
106 109
107 110 """
108 111
109 112 # $Id: keepalive.py,v 1.14 2006/04/04 21:00:32 mstenner Exp $
110 113
111 import urllib2
114 import errno
112 115 import httplib
113 116 import socket
114 117 import thread
118 import urllib2
115 119
116 120 DEBUG = None
117 121
118 122 import sys
119 123 if sys.version_info < (2, 4): HANDLE_ERRORS = 1
120 124 else: HANDLE_ERRORS = 0
121 125
122 126 class ConnectionManager:
123 127 """
124 128 The connection manager must be able to:
125 129 * keep track of all existing connections
126 130 """
127 131 def __init__(self):
128 132 self._lock = thread.allocate_lock()
129 133 self._hostmap = {} # map hosts to a list of connections
130 134 self._connmap = {} # map connections to host
131 135 self._readymap = {} # map connection to ready state
132 136
133 137 def add(self, host, connection, ready):
134 138 self._lock.acquire()
135 139 try:
136 140 if not host in self._hostmap: self._hostmap[host] = []
137 141 self._hostmap[host].append(connection)
138 142 self._connmap[connection] = host
139 143 self._readymap[connection] = ready
140 144 finally:
141 145 self._lock.release()
142 146
143 147 def remove(self, connection):
144 148 self._lock.acquire()
145 149 try:
146 150 try:
147 151 host = self._connmap[connection]
148 152 except KeyError:
149 153 pass
150 154 else:
151 155 del self._connmap[connection]
152 156 del self._readymap[connection]
153 157 self._hostmap[host].remove(connection)
154 158 if not self._hostmap[host]: del self._hostmap[host]
155 159 finally:
156 160 self._lock.release()
157 161
158 162 def set_ready(self, connection, ready):
159 163 try: self._readymap[connection] = ready
160 164 except KeyError: pass
161 165
162 166 def get_ready_conn(self, host):
163 167 conn = None
164 168 self._lock.acquire()
165 169 try:
166 170 if host in self._hostmap:
167 171 for c in self._hostmap[host]:
168 172 if self._readymap[c]:
169 173 self._readymap[c] = 0
170 174 conn = c
171 175 break
172 176 finally:
173 177 self._lock.release()
174 178 return conn
175 179
176 180 def get_all(self, host=None):
177 181 if host:
178 182 return list(self._hostmap.get(host, []))
179 183 else:
180 184 return dict(self._hostmap)
181 185
182 186 class KeepAliveHandler:
183 187 def __init__(self):
184 188 self._cm = ConnectionManager()
185 189
186 190 #### Connection Management
187 191 def open_connections(self):
188 192 """return a list of connected hosts and the number of connections
189 193 to each. [('foo.com:80', 2), ('bar.org', 1)]"""
190 194 return [(host, len(li)) for (host, li) in self._cm.get_all().items()]
191 195
192 196 def close_connection(self, host):
193 197 """close connection(s) to <host>
194 198 host is the host:port spec, as in 'www.cnn.com:8080' as passed in.
195 199 no error occurs if there is no connection to that host."""
196 200 for h in self._cm.get_all(host):
197 201 self._cm.remove(h)
198 202 h.close()
199 203
200 204 def close_all(self):
201 205 """close all open connections"""
202 206 for host, conns in self._cm.get_all().iteritems():
203 207 for h in conns:
204 208 self._cm.remove(h)
205 209 h.close()
206 210
207 211 def _request_closed(self, request, host, connection):
208 212 """tells us that this request is now closed and the
209 213 connection is ready for another request"""
210 214 self._cm.set_ready(connection, 1)
211 215
212 216 def _remove_connection(self, host, connection, close=0):
213 217 if close: connection.close()
214 218 self._cm.remove(connection)
215 219
216 220 #### Transaction Execution
217 221 def http_open(self, req):
218 222 return self.do_open(HTTPConnection, req)
219 223
220 224 def do_open(self, http_class, req):
221 225 host = req.get_host()
222 226 if not host:
223 227 raise urllib2.URLError('no host given')
224 228
225 229 try:
226 230 h = self._cm.get_ready_conn(host)
227 231 while h:
228 232 r = self._reuse_connection(h, req, host)
229 233
230 234 # if this response is non-None, then it worked and we're
231 235 # done. Break out, skipping the else block.
232 236 if r: break
233 237
234 238 # connection is bad - possibly closed by server
235 239 # discard it and ask for the next free connection
236 240 h.close()
237 241 self._cm.remove(h)
238 242 h = self._cm.get_ready_conn(host)
239 243 else:
240 244 # no (working) free connections were found. Create a new one.
241 245 h = http_class(host)
242 246 if DEBUG: DEBUG.info("creating new connection to %s (%d)",
243 247 host, id(h))
244 248 self._cm.add(host, h, 0)
245 249 self._start_transaction(h, req)
246 250 r = h.getresponse()
247 251 except (socket.error, httplib.HTTPException), err:
248 252 raise urllib2.URLError(err)
249 253
250 254 # if not a persistent connection, don't try to reuse it
251 255 if r.will_close: self._cm.remove(h)
252 256
253 257 if DEBUG: DEBUG.info("STATUS: %s, %s", r.status, r.reason)
254 258 r._handler = self
255 259 r._host = host
256 260 r._url = req.get_full_url()
257 261 r._connection = h
258 262 r.code = r.status
259 263 r.headers = r.msg
260 264 r.msg = r.reason
261 265
262 266 if r.status == 200 or not HANDLE_ERRORS:
263 267 return r
264 268 else:
265 269 return self.parent.error('http', req, r,
266 270 r.status, r.msg, r.headers)
267 271
268 272 def _reuse_connection(self, h, req, host):
269 273 """start the transaction with a re-used connection
270 274 return a response object (r) upon success or None on failure.
271 275 This DOES not close or remove bad connections in cases where
272 276 it returns. However, if an unexpected exception occurs, it
273 277 will close and remove the connection before re-raising.
274 278 """
275 279 try:
276 280 self._start_transaction(h, req)
277 281 r = h.getresponse()
278 282 # note: just because we got something back doesn't mean it
279 283 # worked. We'll check the version below, too.
280 284 except (socket.error, httplib.HTTPException):
281 285 r = None
282 286 except:
283 287 # adding this block just in case we've missed
284 288 # something. We will still raise the exception, but
285 289 # let's try to close the connection and remove it
286 290 # first. We previously got into a nasty loop
287 291 # where an exception was uncaught, and so the
288 292 # connection stayed open. On the next try, the
289 293 # same exception was raised, etc. The tradeoff is
290 294 # that it's now possible this call will raise
291 295 # a DIFFERENT exception
292 296 if DEBUG: DEBUG.error("unexpected exception - closing " + \
293 297 "connection to %s (%d)", host, id(h))
294 298 self._cm.remove(h)
295 299 h.close()
296 300 raise
297 301
298 302 if r is None or r.version == 9:
299 303 # httplib falls back to assuming HTTP 0.9 if it gets a
300 304 # bad header back. This is most likely to happen if
301 305 # the socket has been closed by the server since we
302 306 # last used the connection.
303 307 if DEBUG: DEBUG.info("failed to re-use connection to %s (%d)",
304 308 host, id(h))
305 309 r = None
306 310 else:
307 311 if DEBUG: DEBUG.info("re-using connection to %s (%d)", host, id(h))
308 312
309 313 return r
310 314
311 315 def _start_transaction(self, h, req):
312 316 # What follows mostly reimplements HTTPConnection.request()
313 317 # except it adds self.parent.addheaders in the mix.
314 318 headers = req.headers.copy()
315 319 if sys.version_info >= (2, 4):
316 320 headers.update(req.unredirected_hdrs)
317 321 headers.update(self.parent.addheaders)
318 322 headers = dict((n.lower(), v) for n,v in headers.items())
319 323 skipheaders = {}
320 324 for n in ('host', 'accept-encoding'):
321 325 if n in headers:
322 326 skipheaders['skip_' + n.replace('-', '_')] = 1
323 327 try:
324 328 if req.has_data():
325 329 data = req.get_data()
326 330 h.putrequest('POST', req.get_selector(), **skipheaders)
327 331 if 'content-type' not in headers:
328 332 h.putheader('Content-type',
329 333 'application/x-www-form-urlencoded')
330 334 if 'content-length' not in headers:
331 335 h.putheader('Content-length', '%d' % len(data))
332 336 else:
333 337 h.putrequest('GET', req.get_selector(), **skipheaders)
334 338 except (socket.error), err:
335 339 raise urllib2.URLError(err)
336 340 for k, v in headers.items():
337 341 h.putheader(k, v)
338 342 h.endheaders()
339 343 if req.has_data():
340 344 h.send(data)
341 345
342 346 class HTTPHandler(KeepAliveHandler, urllib2.HTTPHandler):
343 347 pass
344 348
345 349 class HTTPResponse(httplib.HTTPResponse):
346 350 # we need to subclass HTTPResponse in order to
347 351 # 1) add readline() and readlines() methods
348 352 # 2) add close_connection() methods
349 353 # 3) add info() and geturl() methods
350 354
351 355 # in order to add readline(), read must be modified to deal with a
352 356 # buffer. example: readline must read a buffer and then spit back
353 357 # one line at a time. The only real alternative is to read one
354 358 # BYTE at a time (ick). Once something has been read, it can't be
355 359 # put back (ok, maybe it can, but that's even uglier than this),
356 360 # so if you THEN do a normal read, you must first take stuff from
357 361 # the buffer.
358 362
359 363 # the read method wraps the original to accommodate buffering,
360 364 # although read() never adds to the buffer.
361 365 # Both readline and readlines have been stolen with almost no
362 366 # modification from socket.py
363 367
364 368
365 369 def __init__(self, sock, debuglevel=0, strict=0, method=None):
366 370 if method: # the httplib in python 2.3 uses the method arg
367 371 httplib.HTTPResponse.__init__(self, sock, debuglevel, method)
368 372 else: # 2.2 doesn't
369 373 httplib.HTTPResponse.__init__(self, sock, debuglevel)
370 374 self.fileno = sock.fileno
371 375 self.code = None
372 376 self._rbuf = ''
373 377 self._rbufsize = 8096
374 378 self._handler = None # inserted by the handler later
375 379 self._host = None # (same)
376 380 self._url = None # (same)
377 381 self._connection = None # (same)
378 382
379 383 _raw_read = httplib.HTTPResponse.read
380 384
381 385 def close(self):
382 386 if self.fp:
383 387 self.fp.close()
384 388 self.fp = None
385 389 if self._handler:
386 390 self._handler._request_closed(self, self._host,
387 391 self._connection)
388 392
389 393 def close_connection(self):
390 394 self._handler._remove_connection(self._host, self._connection, close=1)
391 395 self.close()
392 396
393 397 def info(self):
394 398 return self.headers
395 399
396 400 def geturl(self):
397 401 return self._url
398 402
399 403 def read(self, amt=None):
400 404 # the _rbuf test is only in this first if for speed. It's not
401 405 # logically necessary
402 406 if self._rbuf and not amt is None:
403 407 L = len(self._rbuf)
404 408 if amt > L:
405 409 amt -= L
406 410 else:
407 411 s = self._rbuf[:amt]
408 412 self._rbuf = self._rbuf[amt:]
409 413 return s
410 414
411 415 s = self._rbuf + self._raw_read(amt)
412 416 self._rbuf = ''
413 417 return s
414 418
415 419 # stolen from Python SVN #68532 to fix issue1088
416 420 def _read_chunked(self, amt):
417 421 chunk_left = self.chunk_left
418 422 value = ''
419 423
420 424 # XXX This accumulates chunks by repeated string concatenation,
421 425 # which is not efficient as the number or size of chunks gets big.
422 426 while True:
423 427 if chunk_left is None:
424 428 line = self.fp.readline()
425 429 i = line.find(';')
426 430 if i >= 0:
427 431 line = line[:i] # strip chunk-extensions
428 432 try:
429 433 chunk_left = int(line, 16)
430 434 except ValueError:
431 435 # close the connection as protocol synchronisation is
432 436 # probably lost
433 437 self.close()
434 438 raise httplib.IncompleteRead(value)
435 439 if chunk_left == 0:
436 440 break
437 441 if amt is None:
438 442 value += self._safe_read(chunk_left)
439 443 elif amt < chunk_left:
440 444 value += self._safe_read(amt)
441 445 self.chunk_left = chunk_left - amt
442 446 return value
443 447 elif amt == chunk_left:
444 448 value += self._safe_read(amt)
445 449 self._safe_read(2) # toss the CRLF at the end of the chunk
446 450 self.chunk_left = None
447 451 return value
448 452 else:
449 453 value += self._safe_read(chunk_left)
450 454 amt -= chunk_left
451 455
452 456 # we read the whole chunk, get another
453 457 self._safe_read(2) # toss the CRLF at the end of the chunk
454 458 chunk_left = None
455 459
456 460 # read and discard trailer up to the CRLF terminator
457 461 ### note: we shouldn't have any trailers!
458 462 while True:
459 463 line = self.fp.readline()
460 464 if not line:
461 465 # a vanishingly small number of sites EOF without
462 466 # sending the trailer
463 467 break
464 468 if line == '\r\n':
465 469 break
466 470
467 471 # we read everything; close the "file"
468 472 self.close()
469 473
470 474 return value
471 475
472 476 def readline(self, limit=-1):
473 477 i = self._rbuf.find('\n')
474 478 while i < 0 and not (0 < limit <= len(self._rbuf)):
475 479 new = self._raw_read(self._rbufsize)
476 480 if not new: break
477 481 i = new.find('\n')
478 482 if i >= 0: i = i + len(self._rbuf)
479 483 self._rbuf = self._rbuf + new
480 484 if i < 0: i = len(self._rbuf)
481 485 else: i = i+1
482 486 if 0 <= limit < len(self._rbuf): i = limit
483 487 data, self._rbuf = self._rbuf[:i], self._rbuf[i:]
484 488 return data
485 489
486 490 def readlines(self, sizehint = 0):
487 491 total = 0
488 492 list = []
489 493 while 1:
490 494 line = self.readline()
491 495 if not line: break
492 496 list.append(line)
493 497 total += len(line)
494 498 if sizehint and total >= sizehint:
495 499 break
496 500 return list
497 501
502 def safesend(self, str):
503 """Send `str' to the server.
504
505 Shamelessly ripped off from httplib to patch a bad behavior.
506 """
507 # _broken_pipe_resp is an attribute we set in this function
508 # if the socket is closed while we're sending data but
509 # the server sent us a response before hanging up.
510 # In that case, we want to pretend to send the rest of the
511 # outgoing data, and then let the user use getresponse()
512 # (which we wrap) to get this last response before
513 # opening a new socket.
514 if getattr(self, '_broken_pipe_resp', None) is not None:
515 return
516
517 if self.sock is None:
518 if self.auto_open:
519 self.connect()
520 else:
521 raise httplib.NotConnected()
522
523 # send the data to the server. if we get a broken pipe, then close
524 # the socket. we want to reconnect when somebody tries to send again.
525 #
526 # NOTE: we DO propagate the error, though, because we cannot simply
527 # ignore the error... the caller will know if they can retry.
528 if self.debuglevel > 0:
529 print "send:", repr(str)
530 try:
531 blocksize=8192
532 if hasattr(str,'read') :
533 if self.debuglevel > 0: print "sending a read()able"
534 data=str.read(blocksize)
535 while data:
536 self.sock.sendall(data)
537 data=str.read(blocksize)
538 else:
539 self.sock.sendall(str)
540 except socket.error, v:
541 reraise = True
542 if v[0] == errno.EPIPE: # Broken pipe
543 if self._HTTPConnection__state == httplib._CS_REQ_SENT:
544 self._broken_pipe_resp = None
545 self._broken_pipe_resp = self.getresponse()
546 reraise = False
547 self.close()
548 if reraise:
549 raise
550
551 def wrapgetresponse(cls):
552 """Wraps getresponse in cls with a broken-pipe sane version.
553 """
554 def safegetresponse(self):
555 # In safesend() we might set the _broken_pipe_resp
556 # attribute, in which case the socket has already
557 # been closed and we just need to give them the response
558 # back. Otherwise, we use the normal response path.
559 r = getattr(self, '_broken_pipe_resp', None)
560 if r is not None:
561 return r
562 return cls.getresponse(self)
563 safegetresponse.__doc__ = cls.getresponse.__doc__
564 return safegetresponse
498 565
499 566 class HTTPConnection(httplib.HTTPConnection):
500 567 # use the modified response class
501 568 response_class = HTTPResponse
569 send = safesend
570 getresponse = wrapgetresponse(httplib.HTTPConnection)
571
502 572
503 573 #########################################################################
504 574 ##### TEST FUNCTIONS
505 575 #########################################################################
506 576
507 577 def error_handler(url):
508 578 global HANDLE_ERRORS
509 579 orig = HANDLE_ERRORS
510 580 keepalive_handler = HTTPHandler()
511 581 opener = urllib2.build_opener(keepalive_handler)
512 582 urllib2.install_opener(opener)
513 583 pos = {0: 'off', 1: 'on'}
514 584 for i in (0, 1):
515 585 print " fancy error handling %s (HANDLE_ERRORS = %i)" % (pos[i], i)
516 586 HANDLE_ERRORS = i
517 587 try:
518 588 fo = urllib2.urlopen(url)
519 589 fo.read()
520 590 fo.close()
521 591 try: status, reason = fo.status, fo.reason
522 592 except AttributeError: status, reason = None, None
523 593 except IOError, e:
524 594 print " EXCEPTION: %s" % e
525 595 raise
526 596 else:
527 597 print " status = %s, reason = %s" % (status, reason)
528 598 HANDLE_ERRORS = orig
529 599 hosts = keepalive_handler.open_connections()
530 600 print "open connections:", hosts
531 601 keepalive_handler.close_all()
532 602
533 603 def md5(s):
534 604 try:
535 605 from hashlib import md5 as _md5
536 606 except ImportError:
537 607 from md5 import md5 as _md5
538 608 global md5
539 609 md5 = _md5
540 610 return _md5(s)
541 611
542 612 def continuity(url):
543 613 format = '%25s: %s'
544 614
545 615 # first fetch the file with the normal http handler
546 616 opener = urllib2.build_opener()
547 617 urllib2.install_opener(opener)
548 618 fo = urllib2.urlopen(url)
549 619 foo = fo.read()
550 620 fo.close()
551 621 m = md5.new(foo)
552 622 print format % ('normal urllib', m.hexdigest())
553 623
554 624 # now install the keepalive handler and try again
555 625 opener = urllib2.build_opener(HTTPHandler())
556 626 urllib2.install_opener(opener)
557 627
558 628 fo = urllib2.urlopen(url)
559 629 foo = fo.read()
560 630 fo.close()
561 631 m = md5.new(foo)
562 632 print format % ('keepalive read', m.hexdigest())
563 633
564 634 fo = urllib2.urlopen(url)
565 635 foo = ''
566 636 while 1:
567 637 f = fo.readline()
568 638 if f: foo = foo + f
569 639 else: break
570 640 fo.close()
571 641 m = md5.new(foo)
572 642 print format % ('keepalive readline', m.hexdigest())
573 643
574 644 def comp(N, url):
575 645 print ' making %i connections to:\n %s' % (N, url)
576 646
577 647 sys.stdout.write(' first using the normal urllib handlers')
578 648 # first use normal opener
579 649 opener = urllib2.build_opener()
580 650 urllib2.install_opener(opener)
581 651 t1 = fetch(N, url)
582 652 print ' TIME: %.3f s' % t1
583 653
584 654 sys.stdout.write(' now using the keepalive handler ')
585 655 # now install the keepalive handler and try again
586 656 opener = urllib2.build_opener(HTTPHandler())
587 657 urllib2.install_opener(opener)
588 658 t2 = fetch(N, url)
589 659 print ' TIME: %.3f s' % t2
590 660 print ' improvement factor: %.2f' % (t1/t2, )
591 661
592 662 def fetch(N, url, delay=0):
593 663 import time
594 664 lens = []
595 665 starttime = time.time()
596 666 for i in range(N):
597 667 if delay and i > 0: time.sleep(delay)
598 668 fo = urllib2.urlopen(url)
599 669 foo = fo.read()
600 670 fo.close()
601 671 lens.append(len(foo))
602 672 diff = time.time() - starttime
603 673
604 674 j = 0
605 675 for i in lens[1:]:
606 676 j = j + 1
607 677 if not i == lens[0]:
608 678 print "WARNING: inconsistent length on read %i: %i" % (j, i)
609 679
610 680 return diff
611 681
612 682 def test_timeout(url):
613 683 global DEBUG
614 684 dbbackup = DEBUG
615 685 class FakeLogger:
616 686 def debug(self, msg, *args): print msg % args
617 687 info = warning = error = debug
618 688 DEBUG = FakeLogger()
619 689 print " fetching the file to establish a connection"
620 690 fo = urllib2.urlopen(url)
621 691 data1 = fo.read()
622 692 fo.close()
623 693
624 694 i = 20
625 695 print " waiting %i seconds for the server to close the connection" % i
626 696 while i > 0:
627 697 sys.stdout.write('\r %2i' % i)
628 698 sys.stdout.flush()
629 699 time.sleep(1)
630 700 i -= 1
631 701 sys.stderr.write('\r')
632 702
633 703 print " fetching the file a second time"
634 704 fo = urllib2.urlopen(url)
635 705 data2 = fo.read()
636 706 fo.close()
637 707
638 708 if data1 == data2:
639 709 print ' data are identical'
640 710 else:
641 711 print ' ERROR: DATA DIFFER'
642 712
643 713 DEBUG = dbbackup
644 714
645 715
646 716 def test(url, N=10):
647 717 print "checking error handler (do this on a non-200)"
648 718 try: error_handler(url)
649 719 except IOError:
650 720 print "exiting - exception will prevent further tests"
651 721 sys.exit()
652 722 print
653 723 print "performing continuity test (making sure stuff isn't corrupted)"
654 724 continuity(url)
655 725 print
656 726 print "performing speed comparison"
657 727 comp(N, url)
658 728 print
659 729 print "performing dropped-connection check"
660 730 test_timeout(url)
661 731
662 732 if __name__ == '__main__':
663 733 import time
664 734 import sys
665 735 try:
666 736 N = int(sys.argv[1])
667 737 url = sys.argv[2]
668 738 except:
669 739 print "%s <integer> <url>" % sys.argv[0]
670 740 else:
671 741 test(url, N)
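
For urllib2 callers the net effect is that a large POST interrupted by EPIPE can still surface the server's early response instead of a bare socket error. A hedged usage sketch, following the module docstring's own example; the URL and payload are placeholders:

    import urllib2
    from keepalive import HTTPHandler

    opener = urllib2.build_opener(HTTPHandler())
    urllib2.install_opener(opener)
    # If the server rejects the upload mid-stream and closes its end,
    # safesend() stashes the early response and safegetresponse()
    # returns it, so the caller typically sees urllib2.HTTPError
    # rather than an uncaught socket.error (EPIPE).
    fo = urllib2.urlopen('http://example.com/repo', data='x' * (10 << 20))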
@@ -1,533 +1,537 @@
1 1 # url.py - HTTP handling for mercurial
2 2 #
3 3 # Copyright 2005, 2006, 2007, 2008 Matt Mackall <mpm@selenic.com>
4 4 # Copyright 2006, 2007 Alexis S. L. Carvalho <alexis@cecm.usp.br>
5 5 # Copyright 2006 Vadim Gelfer <vadim.gelfer@gmail.com>
6 6 #
7 7 # This software may be used and distributed according to the terms of the
8 8 # GNU General Public License version 2, incorporated herein by reference.
9 9
10 10 import urllib, urllib2, urlparse, httplib, os, re, socket, cStringIO
11 11 from i18n import _
12 12 import keepalive, util
13 13
14 14 def hidepassword(url):
15 15 '''hide user credentials in a url string'''
16 16 scheme, netloc, path, params, query, fragment = urlparse.urlparse(url)
17 17 netloc = re.sub('([^:]*):([^@]*)@(.*)', r'\1:***@\3', netloc)
18 18 return urlparse.urlunparse((scheme, netloc, path, params, query, fragment))
19 19
20 20 def removeauth(url):
21 21 '''remove all authentication information from a url string'''
22 22 scheme, netloc, path, params, query, fragment = urlparse.urlparse(url)
23 23 netloc = netloc[netloc.find('@')+1:]
24 24 return urlparse.urlunparse((scheme, netloc, path, params, query, fragment))
25 25
26 26 def netlocsplit(netloc):
27 27 '''split [user[:passwd]@]host[:port] into 4-tuple.'''
28 28
29 29 a = netloc.find('@')
30 30 if a == -1:
31 31 user, passwd = None, None
32 32 else:
33 33 userpass, netloc = netloc[:a], netloc[a+1:]
34 34 c = userpass.find(':')
35 35 if c == -1:
36 36 user, passwd = urllib.unquote(userpass), None
37 37 else:
38 38 user = urllib.unquote(userpass[:c])
39 39 passwd = urllib.unquote(userpass[c+1:])
40 40 c = netloc.find(':')
41 41 if c == -1:
42 42 host, port = netloc, None
43 43 else:
44 44 host, port = netloc[:c], netloc[c+1:]
45 45 return host, port, user, passwd
46 46
47 47 def netlocunsplit(host, port, user=None, passwd=None):
48 48 '''turn host, port, user, passwd into [user[:passwd]@]host[:port].'''
49 49 if port:
50 50 hostport = host + ':' + port
51 51 else:
52 52 hostport = host
53 53 if user:
54 54 if passwd:
55 55 userpass = urllib.quote(user) + ':' + urllib.quote(passwd)
56 56 else:
57 57 userpass = urllib.quote(user)
58 58 return userpass + '@' + hostport
59 59 return hostport
60 60
61 61 _safe = ('abcdefghijklmnopqrstuvwxyz'
62 62 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
63 63 '0123456789' '_.-/')
64 64 _safeset = None
65 65 _hex = None
66 66 def quotepath(path):
67 67 '''quote the path part of a URL
68 68
69 69 This is similar to urllib.quote, but it also tries to avoid
70 70 quoting things twice (inspired by wget):
71 71
72 72 >>> quotepath('abc def')
73 73 'abc%20def'
74 74 >>> quotepath('abc%20def')
75 75 'abc%20def'
76 76 >>> quotepath('abc%20 def')
77 77 'abc%20%20def'
78 78 >>> quotepath('abc def%20')
79 79 'abc%20def%20'
80 80 >>> quotepath('abc def%2')
81 81 'abc%20def%252'
82 82 >>> quotepath('abc def%')
83 83 'abc%20def%25'
84 84 '''
85 85 global _safeset, _hex
86 86 if _safeset is None:
87 87 _safeset = set(_safe)
88 88 _hex = set('abcdefABCDEF0123456789')
89 89 l = list(path)
90 90 for i in xrange(len(l)):
91 91 c = l[i]
92 92 if c == '%' and i + 2 < len(l) and (l[i+1] in _hex and l[i+2] in _hex):
93 93 pass
94 94 elif c not in _safeset:
95 95 l[i] = '%%%02X' % ord(c)
96 96 return ''.join(l)
97 97
98 98 class passwordmgr(urllib2.HTTPPasswordMgrWithDefaultRealm):
99 99 def __init__(self, ui):
100 100 urllib2.HTTPPasswordMgrWithDefaultRealm.__init__(self)
101 101 self.ui = ui
102 102
103 103 def find_user_password(self, realm, authuri):
104 104 authinfo = urllib2.HTTPPasswordMgrWithDefaultRealm.find_user_password(
105 105 self, realm, authuri)
106 106 user, passwd = authinfo
107 107 if user and passwd:
108 108 self._writedebug(user, passwd)
109 109 return (user, passwd)
110 110
111 111 if not user:
112 112 auth = self.readauthtoken(authuri)
113 113 if auth:
114 114 user, passwd = auth.get('username'), auth.get('password')
115 115 if not user or not passwd:
116 116 if not self.ui.interactive():
117 117 raise util.Abort(_('http authorization required'))
118 118
119 119 self.ui.write(_("http authorization required\n"))
120 120 self.ui.status(_("realm: %s\n") % realm)
121 121 if user:
122 122 self.ui.status(_("user: %s\n") % user)
123 123 else:
124 124 user = self.ui.prompt(_("user:"), default=None)
125 125
126 126 if not passwd:
127 127 passwd = self.ui.getpass()
128 128
129 129 self.add_password(realm, authuri, user, passwd)
130 130 self._writedebug(user, passwd)
131 131 return (user, passwd)
132 132
133 133 def _writedebug(self, user, passwd):
134 134 msg = _('http auth: user %s, password %s\n')
135 135 self.ui.debug(msg % (user, passwd and '*' * len(passwd) or 'not set'))
136 136
137 137 def readauthtoken(self, uri):
138 138 # Read configuration
139 139 config = dict()
140 140 for key, val in self.ui.configitems('auth'):
141 141 group, setting = key.split('.', 1)
142 142 gdict = config.setdefault(group, dict())
143 143 gdict[setting] = val
144 144
145 145 # Find the best match
146 146 scheme, hostpath = uri.split('://', 1)
147 147 bestlen = 0
148 148 bestauth = None
149 149 for auth in config.itervalues():
150 150 prefix = auth.get('prefix')
151 151 if not prefix: continue
152 152 p = prefix.split('://', 1)
153 153 if len(p) > 1:
154 154 schemes, prefix = [p[0]], p[1]
155 155 else:
156 156 schemes = (auth.get('schemes') or 'https').split()
157 157 if (prefix == '*' or hostpath.startswith(prefix)) and \
158 158 len(prefix) > bestlen and scheme in schemes:
159 159 bestlen = len(prefix)
160 160 bestauth = auth
161 161 return bestauth
162 162
163 163 class proxyhandler(urllib2.ProxyHandler):
164 164 def __init__(self, ui):
165 165 proxyurl = ui.config("http_proxy", "host") or os.getenv('http_proxy')
166 166 # XXX proxyauthinfo = None
167 167
168 168 if proxyurl:
169 169 # proxy can be proper url or host[:port]
170 170 if not (proxyurl.startswith('http:') or
171 171 proxyurl.startswith('https:')):
172 172 proxyurl = 'http://' + proxyurl + '/'
173 173 snpqf = urlparse.urlsplit(proxyurl)
174 174 proxyscheme, proxynetloc, proxypath, proxyquery, proxyfrag = snpqf
175 175 hpup = netlocsplit(proxynetloc)
176 176
177 177 proxyhost, proxyport, proxyuser, proxypasswd = hpup
178 178 if not proxyuser:
179 179 proxyuser = ui.config("http_proxy", "user")
180 180 proxypasswd = ui.config("http_proxy", "passwd")
181 181
182 182 # see if we should use a proxy for this url
183 183 no_list = [ "localhost", "127.0.0.1" ]
184 184 no_list.extend([p.lower() for
185 185 p in ui.configlist("http_proxy", "no")])
186 186 no_list.extend([p.strip().lower() for
187 187 p in os.getenv("no_proxy", '').split(',')
188 188 if p.strip()])
189 189 # "http_proxy.always" config is for running tests on localhost
190 190 if ui.configbool("http_proxy", "always"):
191 191 self.no_list = []
192 192 else:
193 193 self.no_list = no_list
194 194
195 195 proxyurl = urlparse.urlunsplit((
196 196 proxyscheme, netlocunsplit(proxyhost, proxyport,
197 197 proxyuser, proxypasswd or ''),
198 198 proxypath, proxyquery, proxyfrag))
199 199 proxies = {'http': proxyurl, 'https': proxyurl}
200 200 ui.debug('proxying through http://%s:%s\n' %
201 201 (proxyhost, proxyport))
202 202 else:
203 203 proxies = {}
204 204
205 205 # urllib2 takes proxy values from the environment and those
206 206 # will take precedence if found, so drop them
207 207 for env in ["HTTP_PROXY", "http_proxy", "no_proxy"]:
208 208 try:
209 209 if env in os.environ:
210 210 del os.environ[env]
211 211 except OSError:
212 212 pass
213 213
214 214 urllib2.ProxyHandler.__init__(self, proxies)
215 215 self.ui = ui
216 216
217 217 def proxy_open(self, req, proxy, type_):
218 218 host = req.get_host().split(':')[0]
219 219 if host in self.no_list:
220 220 return None
221 221
222 222 # work around a bug in Python < 2.4.2
223 223 # (it leaves a "\n" at the end of Proxy-authorization headers)
224 224 baseclass = req.__class__
225 225 class _request(baseclass):
226 226 def add_header(self, key, val):
227 227 if key.lower() == 'proxy-authorization':
228 228 val = val.strip()
229 229 return baseclass.add_header(self, key, val)
230 230 req.__class__ = _request
231 231
232 232 return urllib2.ProxyHandler.proxy_open(self, req, proxy, type_)
233 233
234 234 class httpsendfile(file):
235 235 def __len__(self):
236 236 return os.fstat(self.fileno()).st_size
237 237
238 238 def _gen_sendfile(connection):
239 239 def _sendfile(self, data):
240 240 # send a file
241 241 if isinstance(data, httpsendfile):
242 242 # if auth required, some data sent twice, so rewind here
243 243 data.seek(0)
244 244 for chunk in util.filechunkiter(data):
245 245 connection.send(self, chunk)
246 246 else:
247 247 connection.send(self, data)
248 248 return _sendfile
249 249
250 250 has_https = hasattr(urllib2, 'HTTPSHandler')
251 251 if has_https:
252 252 try:
253 253 # avoid using deprecated/broken FakeSocket in python 2.6
254 254 import ssl
255 255 _ssl_wrap_socket = ssl.wrap_socket
256 256 except ImportError:
257 257 def _ssl_wrap_socket(sock, key_file, cert_file):
258 258 ssl = socket.ssl(sock, key_file, cert_file)
259 259 return httplib.FakeSocket(sock, ssl)
260 260
261 261 class httpconnection(keepalive.HTTPConnection):
262 262 # must be able to send big bundle as stream.
263 263 send = _gen_sendfile(keepalive.HTTPConnection)
264 264
265 265 def _proxytunnel(self):
266 266 proxyheaders = dict(
267 267 [(x, self.headers[x]) for x in self.headers
268 268 if x.lower().startswith('proxy-')])
269 269 self._set_hostport(self.host, self.port)
270 270 self.send('CONNECT %s:%d HTTP/1.0\r\n' % (self.realhost, self.realport))
271 271 for header in proxyheaders.iteritems():
272 272 self.send('%s: %s\r\n' % header)
273 273 self.send('\r\n')
274 274
275 275 # majority of the following code is duplicated from
276 276 # httplib.HTTPConnection as there are no adequate places to
277 277 # override functions to provide the needed functionality
278 278 res = self.response_class(self.sock,
279 279 strict=self.strict,
280 280 method=self._method)
281 281
282 282 while True:
283 283 version, status, reason = res._read_status()
284 284 if status != httplib.CONTINUE:
285 285 break
286 286 while True:
287 287 skip = res.fp.readline().strip()
288 288 if not skip:
289 289 break
290 290 res.status = status
291 291 res.reason = reason.strip()
292 292
293 293 if res.status == 200:
294 294 while True:
295 295 line = res.fp.readline()
296 296 if line == '\r\n':
297 297 break
298 298 return True
299 299
300 300 if version == 'HTTP/1.0':
301 301 res.version = 10
302 302 elif version.startswith('HTTP/1.'):
303 303 res.version = 11
304 304 elif version == 'HTTP/0.9':
305 305 res.version = 9
306 306 else:
307 307 raise httplib.UnknownProtocol(version)
308 308
309 309 if res.version == 9:
310 310 res.length = None
311 311 res.chunked = 0
312 312 res.will_close = 1
313 313 res.msg = httplib.HTTPMessage(cStringIO.StringIO())
314 314 return False
315 315
316 316 res.msg = httplib.HTTPMessage(res.fp)
317 317 res.msg.fp = None
318 318
319 319 # are we using the chunked-style of transfer encoding?
320 320 trenc = res.msg.getheader('transfer-encoding')
321 321 if trenc and trenc.lower() == "chunked":
322 322 res.chunked = 1
323 323 res.chunk_left = None
324 324 else:
325 325 res.chunked = 0
326 326
327 327 # will the connection close at the end of the response?
328 328 res.will_close = res._check_close()
329 329
330 330 # do we have a Content-Length?
331 331 # NOTE: RFC 2616, S4.4, #3 says we ignore this if tr_enc is "chunked"
332 332 length = res.msg.getheader('content-length')
333 333 if length and not res.chunked:
334 334 try:
335 335 res.length = int(length)
336 336 except ValueError:
337 337 res.length = None
338 338 else:
339 339 if res.length < 0: # ignore nonsensical negative lengths
340 340 res.length = None
341 341 else:
342 342 res.length = None
343 343
344 344 # does the body have a fixed length? (of zero)
345 345 if (status == httplib.NO_CONTENT or status == httplib.NOT_MODIFIED or
346 346 100 <= status < 200 or # 1xx codes
347 347 res._method == 'HEAD'):
348 348 res.length = 0
349 349
350 350 # if the connection remains open, and we aren't using chunked, and
351 351 # a content-length was not provided, then assume that the connection
352 352 # WILL close.
353 353 if (not res.will_close and
354 354 not res.chunked and
355 355 res.length is None):
356 356 res.will_close = 1
357 357
358 358 self.proxyres = res
359 359
360 360 return False
361 361
362 362 def connect(self):
363 363 if has_https and self.realhost: # use CONNECT proxy
364 364 self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
365 365 self.sock.connect((self.host, self.port))
366 366 if self._proxytunnel():
367 367 # we do not support client x509 certificates
368 368 self.sock = _ssl_wrap_socket(self.sock, None, None)
369 369 else:
370 370 keepalive.HTTPConnection.connect(self)
371 371
372 372 def getresponse(self):
373 373 proxyres = getattr(self, 'proxyres', None)
374 374 if proxyres:
375 375 if proxyres.will_close:
376 376 self.close()
377 377 self.proxyres = None
378 378 return proxyres
379 379 return keepalive.HTTPConnection.getresponse(self)
380 380
381 381 class httphandler(keepalive.HTTPHandler):
382 382 def http_open(self, req):
383 383 return self.do_open(httpconnection, req)
384 384
385 385 def _start_transaction(self, h, req):
386 386 if req.get_selector() == req.get_full_url(): # has proxy
387 387 urlparts = urlparse.urlparse(req.get_selector())
388 388 if urlparts[0] == 'https': # only use CONNECT for HTTPS
389 389 if ':' in urlparts[1]:
390 390 realhost, realport = urlparts[1].split(':')
391 391 realport = int(realport)
392 392 else:
393 393 realhost = urlparts[1]
394 394 realport = 443
395 395
396 396 h.realhost = realhost
397 397 h.realport = realport
398 398 h.headers = req.headers.copy()
399 399 h.headers.update(self.parent.addheaders)
400 400 return keepalive.HTTPHandler._start_transaction(self, h, req)
401 401
402 402 h.realhost = None
403 403 h.realport = None
404 404 h.headers = None
405 405 return keepalive.HTTPHandler._start_transaction(self, h, req)
406 406
407 407 def __del__(self):
408 408 self.close_all()
409 409
410 410 if has_https:
411 class httpsconnection(httplib.HTTPSConnection):
411 class BetterHTTPS(httplib.HTTPSConnection):
412 send = keepalive.safesend
413
414 class httpsconnection(BetterHTTPS):
412 415 response_class = keepalive.HTTPResponse
413 416 # must be able to send big bundle as stream.
414 send = _gen_sendfile(httplib.HTTPSConnection)
417 send = _gen_sendfile(BetterHTTPS)
418 getresponse = keepalive.wrapgetresponse(httplib.HTTPSConnection)
415 419
416 420 class httpshandler(keepalive.KeepAliveHandler, urllib2.HTTPSHandler):
417 421 def __init__(self, ui):
418 422 keepalive.KeepAliveHandler.__init__(self)
419 423 urllib2.HTTPSHandler.__init__(self)
420 424 self.ui = ui
421 425 self.pwmgr = passwordmgr(self.ui)
422 426
423 427 def https_open(self, req):
424 428 self.auth = self.pwmgr.readauthtoken(req.get_full_url())
425 429 return self.do_open(self._makeconnection, req)
426 430
427 431 def _makeconnection(self, host, port=443, *args, **kwargs):
428 432 keyfile = None
429 433 certfile = None
430 434
431 435 if args: # key_file
432 436 keyfile = args.pop(0)
433 437 if args: # cert_file
434 438 certfile = args.pop(0)
435 439
436 440 # if the user has specified different key/cert files in
437 441 # hgrc, we prefer these
438 442 if self.auth and 'key' in self.auth and 'cert' in self.auth:
439 443 keyfile = self.auth['key']
440 444 certfile = self.auth['cert']
441 445
442 446 # let host port take precedence
443 447 if ':' in host and '[' not in host or ']:' in host:
444 448 host, port = host.rsplit(':', 1)
445 449 port = int(port)
446 450 if '[' in host:
447 451 host = host[1:-1]
448 452
449 453 return httpsconnection(host, port, keyfile, certfile, *args, **kwargs)
450 454
451 455 # In python < 2.5 AbstractDigestAuthHandler raises a ValueError if
452 456 # it doesn't know about the auth type requested. This can happen if
453 457 # somebody is using BasicAuth and types a bad password.
454 458 class httpdigestauthhandler(urllib2.HTTPDigestAuthHandler):
455 459 def http_error_auth_reqed(self, auth_header, host, req, headers):
456 460 try:
457 461 return urllib2.HTTPDigestAuthHandler.http_error_auth_reqed(
458 462 self, auth_header, host, req, headers)
459 463 except ValueError, inst:
460 464 arg = inst.args[0]
461 465 if arg.startswith("AbstractDigestAuthHandler doesn't know "):
462 466 return
463 467 raise
464 468
465 469 def getauthinfo(path):
466 470 scheme, netloc, urlpath, query, frag = urlparse.urlsplit(path)
467 471 if not urlpath:
468 472 urlpath = '/'
469 473 if scheme != 'file':
470 474 # XXX: why are we quoting the path again with some smart
471 475 # heuristic here? Anyway, it cannot be done with file://
472 476 # urls since path encoding is os/fs dependent (see
473 477 # urllib.pathname2url() for details).
474 478 urlpath = quotepath(urlpath)
475 479 host, port, user, passwd = netlocsplit(netloc)
476 480
477 481 # urllib cannot handle URLs with embedded user or passwd
478 482 url = urlparse.urlunsplit((scheme, netlocunsplit(host, port),
479 483 urlpath, query, frag))
480 484 if user:
481 485 netloc = host
482 486 if port:
483 487 netloc += ':' + port
484 488 # Python < 2.4.3 uses only the netloc to search for a password
485 489 authinfo = (None, (url, netloc), user, passwd or '')
486 490 else:
487 491 authinfo = None
488 492 return url, authinfo
489 493
490 494 handlerfuncs = []
491 495
492 496 def opener(ui, authinfo=None):
493 497 '''
494 498 construct an opener suitable for urllib2
495 499 authinfo will be added to the password manager
496 500 '''
497 501 handlers = [httphandler()]
498 502 if has_https:
499 503 handlers.append(httpshandler(ui))
500 504
501 505 handlers.append(proxyhandler(ui))
502 506
503 507 passmgr = passwordmgr(ui)
504 508 if authinfo is not None:
505 509 passmgr.add_password(*authinfo)
506 510 user, passwd = authinfo[2:4]
507 511 ui.debug('http auth: user %s, password %s\n' %
508 512 (user, passwd and '*' * len(passwd) or 'not set'))
509 513
510 514 handlers.extend((urllib2.HTTPBasicAuthHandler(passmgr),
511 515 httpdigestauthhandler(passmgr)))
512 516 handlers.extend([h(ui, passmgr) for h in handlerfuncs])
513 517 opener = urllib2.build_opener(*handlers)
514 518
515 519 # 1.0 here is the _protocol_ version
516 520 opener.addheaders = [('User-agent', 'mercurial/proto-1.0')]
517 521 opener.addheaders.append(('Accept', 'application/mercurial-0.1'))
518 522 return opener
519 523
520 524 scheme_re = re.compile(r'^([a-zA-Z0-9+-.]+)://')
521 525
522 526 def open(ui, url, data=None):
523 527 scheme = None
524 528 m = scheme_re.search(url)
525 529 if m:
526 530 scheme = m.group(1).lower()
527 531 if not scheme:
528 532 path = util.normpath(os.path.abspath(url))
529 533 url = 'file://' + urllib.pathname2url(path)
530 534 authinfo = None
531 535 else:
532 536 url, authinfo = getauthinfo(url)
533 537 return opener(ui, authinfo).open(url, data)
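
The url.py hunk threads the same machinery into HTTPS. The subtlety is the order of the `send` overrides: `httpsconnection.send` is the chunk-streaming wrapper, and each chunk must leave through the EPIPE-aware `safesend`, which is why `_gen_sendfile` is parameterized with `BetterHTTPS` instead of `httplib.HTTPSConnection`. A condensed sketch of the layering; the real `_gen_sendfile` streams `httpsendfile` objects with `util.filechunkiter` and rewinds them for auth retries:

    import httplib
    import keepalive

    class BetterHTTPS(httplib.HTTPSConnection):
        send = keepalive.safesend              # EPIPE-aware raw send

    def _gen_sendfile(connection):
        def _sendfile(self, data):
            # Stream file-like bodies in 8k chunks; plain strings pass
            # through whole. Either way the bytes leave via
            # connection.send, which here resolves to safesend.
            if hasattr(data, 'read'):
                for chunk in iter(lambda: data.read(8192), ''):
                    connection.send(self, chunk)
            else:
                connection.send(self, data)
        return _sendfile

    class httpsconnection(BetterHTTPS):
        response_class = keepalive.HTTPResponse
        send = _gen_sendfile(BetterHTTPS)      # overrides inherited send
        getresponse = keepalive.wrapgetresponse(httplib.HTTPSConnection)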