Let the auto pylab backend detection work. The kernel knows how to find the user's backend choice, so if no backend is explicitly given, read it from the matplotlibrc file.

File last commit: r1396:ee81bbeb
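The fallback described in the commit message can be pictured with matplotlib's public helpers. The following is a minimal sketch under that assumption, not the kernel's actual code; resolve_backend and its parameter are illustrative names:

import matplotlib

def resolve_backend(requested=None):
    # An explicitly requested backend always wins.
    if requested is not None:
        return requested
    # Otherwise fall back to the user's matplotlibrc default:
    # matplotlib_fname() locates the active matplotlibrc file, and
    # rc_params_from_file() parses it into an RcParams mapping.
    rc = matplotlib.rc_params_from_file(matplotlib.matplotlib_fname())
    return rc['backend']
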
fetchparse.py
90 lines | 2.8 KiB | text/x-python
"""
An exceptionally lousy site spider
Ken Kinder <ken@kenkinder.com>
This module gives an example of how the TaskClient interface to the
IPython controller works. Before running this script start the IPython controller
and some engines using something like::
ipcluster -n 4
"""
from twisted.python.failure import Failure
from IPython.kernel import client
import time
fetchParse = """
from twisted.web import microdom
import urllib2
import urlparse
def fetchAndParse(url, data=None):
links = []
try:
page = urllib2.urlopen(url, data=data)
except Exception:
return links
else:
if page.headers.type == 'text/html':
doc = microdom.parseString(page.read(), beExtremelyLenient=True)
for node in doc.getElementsByTagName('a'):
if node.getAttribute('href'):
links.append(urlparse.urljoin(url, node.getAttribute('href')))
return links
"""
class DistributedSpider(object):
# Time to wait between polling for task results.
pollingDelay = 0.5
def __init__(self, site):
self.tc = client.TaskClient()
self.rc = client.MultiEngineClient()
self.rc.execute(fetchParse)
self.allLinks = []
self.linksWorking = {}
self.linksDone = {}
self.site = site
def visitLink(self, url):
if url not in self.allLinks:
self.allLinks.append(url)
if url.startswith(self.site):
print ' ', url
self.linksWorking[url] = self.tc.run(client.StringTask('links = fetchAndParse(url)', pull=['links'], push={'url': url}))
def onVisitDone(self, result, url):
print url, ':'
self.linksDone[url] = None
del self.linksWorking[url]
if isinstance(result.failure, Failure):
txt = result.failure.getTraceback()
for line in txt.split('\n'):
print ' ', line
else:
for link in result.ns.links:
self.visitLink(link)
def run(self):
self.visitLink(self.site)
while self.linksWorking:
print len(self.linksWorking), 'pending...'
self.synchronize()
time.sleep(self.pollingDelay)
def synchronize(self):
for url, taskId in self.linksWorking.items():
# Calling get_task_result with block=False will return None if the
# task is not done yet. This provides a simple way of polling.
result = self.tc.get_task_result(taskId, block=False)
if result is not None:
self.onVisitDone(result, url)
def main():
distributedSpider = DistributedSpider(raw_input('Enter site to crawl: '))
distributedSpider.run()
if __name__ == '__main__':
main()
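
The spider polls with block=False so it can submit newly discovered links and report progress while earlier tasks are still running. For contrast, a blocking variant of synchronize() might look like the sketch below; it assumes the same (since deprecated) IPython.kernel TaskClient API used above, and synchronize_blocking is an illustrative name, not part of the original example:

    def synchronize_blocking(self):
        # block=True makes get_task_result wait until the task has finished,
        # so each pending task is consumed in submission order, with no polling.
        for url, taskId in self.linksWorking.items():
            result = self.tc.get_task_result(taskId, block=True)
            self.onVisitDone(result, url)

The trade-off is simplicity versus responsiveness: the blocking loop is shorter, but it waits on tasks in submission order even when a later task finishes first, and run() gets no chance to print the 'pending...' count between results.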