fetchparse.py

"""
An exceptionally lousy site spider
Ken Kinder <ken@kenkinder.com>
This module gives an example of how the TaskClient interface to the
IPython controller works. Before running this script start the IPython controller
and some engines using something like::
ipcluster -n 4
"""
from twisted.python.failure import Failure
from IPython.kernel import client
import time
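
# The fetch-and-parse code below is kept as a source string so that
# DistributedSpider.__init__ can send it to every engine with
# MultiEngineClient.execute(), leaving fetchAndParse defined on each engine.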
fetchParse = """
from twisted.web import microdom
import urllib2
import urlparse

def fetchAndParse(url, data=None):
    links = []
    try:
        page = urllib2.urlopen(url, data=data)
    except Exception:
        return links
    else:
        if page.headers.type == 'text/html':
            doc = microdom.parseString(page.read(), beExtremelyLenient=True)
            for node in doc.getElementsByTagName('a'):
                if node.getAttribute('href'):
                    links.append(urlparse.urljoin(url, node.getAttribute('href')))
        return links
"""
class DistributedSpider(object):

    # Time to wait between polling for task results.
    pollingDelay = 0.5

    def __init__(self, site):
        self.tc = client.TaskClient()
        self.rc = client.MultiEngineClient()
        self.rc.execute(fetchParse)

        self.allLinks = []
        self.linksWorking = {}
        self.linksDone = {}

        self.site = site
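
    # Submitting a StringTask pushes url into the task's namespace on an
    # engine, runs the code string there, and pulls links back when done.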
    def visitLink(self, url):
        if url not in self.allLinks:
            self.allLinks.append(url)
            if url.startswith(self.site):
                print '    ', url
                self.linksWorking[url] = self.tc.run(client.StringTask(
                    'links = fetchAndParse(url)', pull=['links'], push={'url': url}))

    def onVisitDone(self, result, url):
        print url, ':'
        self.linksDone[url] = None
        del self.linksWorking[url]
        if isinstance(result.failure, Failure):
            txt = result.failure.getTraceback()
            for line in txt.split('\n'):
                print '    ', line
        else:
            for link in result.ns.links:
                self.visitLink(link)

    def run(self):
        self.visitLink(self.site)
        while self.linksWorking:
            print len(self.linksWorking), 'pending...'
            self.synchronize()
            time.sleep(self.pollingDelay)
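
    # Poll every outstanding task once; finished tasks are handed to
    # onVisitDone, which may in turn queue new links.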
    def synchronize(self):
        for url, taskId in self.linksWorking.items():
            # Calling get_task_result with block=False will return None if the
            # task is not done yet. This provides a simple way of polling.
            result = self.tc.get_task_result(taskId, block=False)
            if result is not None:
                self.onVisitDone(result, url)

def main():
    distributedSpider = DistributedSpider(raw_input('Enter site to crawl: '))
    distributedSpider.run()

if __name__ == '__main__':
    main()
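
# A minimal sketch of driving the spider non-interactively, assuming a
# controller and engines are already running (the URL is a placeholder,
# not part of the original example):
#
#     from fetchparse import DistributedSpider
#     DistributedSpider('http://example.com/').run()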