##// END OF EJS Templates
Ported the IPython Sphinx directive to 0.11....
Ported the IPython Sphinx directive to 0.11. This was originally written by John Hunter for the 0.10 API, now it works with 0.11. We still need to automate its test suite, but at least now it runs and the script itself can be executed as a test that produces screen output and figures in a subdir.

File last commit:

r1396:ee81bbeb
r2439:858c6e09
Show More
fetchparse.py
90 lines | 2.8 KiB | text/x-python | PythonLexer
"""
An exceptionally lousy site spider
Ken Kinder <ken@kenkinder.com>
This module gives an example of how the TaskClient interface to the
IPython controller works. Before running this script start the IPython controller
and some engines using something like::
ipcluster -n 4
"""
from twisted.python.failure import Failure
from IPython.kernel import client
import time
fetchParse = """
from twisted.web import microdom
import urllib2
import urlparse
def fetchAndParse(url, data=None):
links = []
try:
page = urllib2.urlopen(url, data=data)
except Exception:
return links
else:
if page.headers.type == 'text/html':
doc = microdom.parseString(page.read(), beExtremelyLenient=True)
for node in doc.getElementsByTagName('a'):
if node.getAttribute('href'):
links.append(urlparse.urljoin(url, node.getAttribute('href')))
return links
"""
class DistributedSpider(object):
# Time to wait between polling for task results.
pollingDelay = 0.5
def __init__(self, site):
self.tc = client.TaskClient()
self.rc = client.MultiEngineClient()
self.rc.execute(fetchParse)
self.allLinks = []
self.linksWorking = {}
self.linksDone = {}
self.site = site
def visitLink(self, url):
if url not in self.allLinks:
self.allLinks.append(url)
if url.startswith(self.site):
print ' ', url
self.linksWorking[url] = self.tc.run(client.StringTask('links = fetchAndParse(url)', pull=['links'], push={'url': url}))
def onVisitDone(self, result, url):
print url, ':'
self.linksDone[url] = None
del self.linksWorking[url]
if isinstance(result.failure, Failure):
txt = result.failure.getTraceback()
for line in txt.split('\n'):
print ' ', line
else:
for link in result.ns.links:
self.visitLink(link)
def run(self):
self.visitLink(self.site)
while self.linksWorking:
print len(self.linksWorking), 'pending...'
self.synchronize()
time.sleep(self.pollingDelay)
def synchronize(self):
for url, taskId in self.linksWorking.items():
# Calling get_task_result with block=False will return None if the
# task is not done yet. This provides a simple way of polling.
result = self.tc.get_task_result(taskId, block=False)
if result is not None:
self.onVisitDone(result, url)
def main():
distributedSpider = DistributedSpider(raw_input('Enter site to crawl: '))
distributedSpider.run()
if __name__ == '__main__':
main()