##// END OF EJS Templates
Cleaned up release tools directory....
Cleaned up release tools directory. Converted almost all to python scripts and made toollib to collect common functions and avoid repetition. Properly commented and documented what each script does. The run_ipy_in_profiler one seems broken, I'm not sure what to do with it. We need to either fix it or remove it later, but it's not critical for 0.10.

File last commit:

r1396:ee81bbeb
r2118:ec9810f7
Show More
fetchparse.py
90 lines | 2.8 KiB | text/x-python | PythonLexer
Brian E Granger
Adding examples from ipython1-dev to docs/examples/kernel. These ...
r1337 """
An exceptionally lousy site spider
Ken Kinder <ken@kenkinder.com>
This module gives an example of how the TaskClient interface to the
IPython controller works. Before running this script, start the IPython
controller and some engines using something like::
ipcluster -n 4
"""
from twisted.python.failure import Failure
Brian E Granger
Fixed most of the examples. A few still don't work, but this is a start.
r1338 from IPython.kernel import client
Brian E Granger
Adding examples from ipython1-dev to docs/examples/kernel. These ...
r1337 import time
# Source code that DistributedSpider.__init__ hands to
# MultiEngineClient.execute() so that the StringTask 'links = fetchAndParse(url)'
# has a fetchAndParse to call.  It is kept as a string because it is executed
# remotely rather than in this process.
#
# fetchAndParse(url, data=None) returns a list of absolute URLs taken from the
# href attribute of every <a> element on the page.  It returns an empty list
# when the page cannot be opened or its content type is not text/html.
#
# The body must be valid, correctly indented Python: a flattened string fails
# with IndentationError when executed and fetchAndParse is never defined.
fetchParse = """
from twisted.web import microdom
import urllib2
import urlparse

def fetchAndParse(url, data=None):
    links = []
    try:
        page = urllib2.urlopen(url, data=data)
    except Exception:
        return links
    else:
        if page.headers.type == 'text/html':
            doc = microdom.parseString(page.read(), beExtremelyLenient=True)
            for node in doc.getElementsByTagName('a'):
                if node.getAttribute('href'):
                    links.append(urlparse.urljoin(url, node.getAttribute('href')))
        return links
"""
class DistributedSpider(object):
    """Crawl one site by submitting page fetches as IPython tasks.

    Every URL that starts with ``site`` is submitted through the TaskClient as
    a ``client.StringTask`` that runs ``links = fetchAndParse(url)`` and pulls
    ``links`` back.  The links found are fed back into :meth:`visitLink`, so
    the crawl continues until no task is outstanding.

    Attributes:
        tc: ``client.TaskClient`` used to submit tasks and poll their results.
        rc: ``client.MultiEngineClient`` used once to execute ``fetchParse``.
        allLinks: list of every URL seen so far, on-site or not.
        linksWorking: dict mapping URL -> task id for tasks still outstanding.
        linksDone: dict whose keys are the URLs whose task has finished.
        site: URL prefix that decides whether a link is fetched.
    """

    # Time to wait between polling for task results.
    pollingDelay = 0.5

    def __init__(self, site):
        """Connect the clients, ship ``fetchParse`` and reset crawl state."""
        self.tc = client.TaskClient()
        self.rc = client.MultiEngineClient()
        # Execute the fetchParse source through the multi-engine client so the
        # tasks submitted by visitLink can call fetchAndParse.
        self.rc.execute(fetchParse)
        self.allLinks = []
        self.linksWorking = {}
        self.linksDone = {}
        self.site = site

    def visitLink(self, url):
        """Record ``url`` and, if it is new and on-site, submit a fetch task.

        URLs already in ``allLinks`` are ignored.  URLs that do not start with
        ``site`` are recorded but never fetched.
        """
        if url in self.allLinks:
            return
        self.allLinks.append(url)
        if not url.startswith(self.site):
            return
        print('  %s' % url)
        task = client.StringTask(
            'links = fetchAndParse(url)',
            pull=['links'],
            push={'url': url}
        )
        # Remember the task id so synchronize() can poll for its result.
        self.linksWorking[url] = self.tc.run(task)

    def onVisitDone(self, result, url):
        """Handle the finished task for ``url``.

        Moves ``url`` from ``linksWorking`` to ``linksDone``, then either
        prints the traceback when ``result.failure`` is a ``Failure`` or
        visits every link in ``result.ns.links``.
        """
        print('%s :' % url)
        self.linksDone[url] = None
        del self.linksWorking[url]
        if isinstance(result.failure, Failure):
            txt = result.failure.getTraceback()
            for line in txt.split('\n'):
                print('  %s' % line)
        else:
            for link in result.ns.links:
                self.visitLink(link)

    def run(self):
        """Crawl from ``site``, polling until no task is outstanding."""
        self.visitLink(self.site)
        while self.linksWorking:
            print('%d pending...' % len(self.linksWorking))
            self.synchronize()
            time.sleep(self.pollingDelay)

    def synchronize(self):
        """Poll every outstanding task once and process those that are done."""
        # onVisitDone() deletes from linksWorking and, through visitLink(),
        # may add to it, so iterate over a snapshot rather than the live dict.
        for url, taskId in list(self.linksWorking.items()):
            # Calling get_task_result with block=False will return None if the
            # task is not done yet.  This provides a simple way of polling.
            result = self.tc.get_task_result(taskId, block=False)
            if result is not None:
                self.onVisitDone(result, url)
def main():
    """Prompt for the site to crawl and run a DistributedSpider over it."""
    site = raw_input('Enter site to crawl: ')
    spider = DistributedSpider(site)
    spider.run()


# Only start crawling when executed as a script, not when imported.
if __name__ == '__main__':
    main()