upstream/ipython Files · docs/examples/kernel/fetchparse.py

Added diagnostics printout at the end of the test suite....

Added diagnostics printout at the end of the test suite. This will make it easier for us to understand problem reports from users.

Brian E Granger - - Load All Authors

File last commit:

r1396:ee81bbeb


                r2496:f440a2cd

Download file

             fetchparse.py
        
                    90 lines
            
             | 2.8 KiB
            
                | text/x-python
            
             |
                PythonLexer
            
             / docs / examples / kernel / fetchparse.py
          
                    History
                
                 |
                  Annotation
                 | Raw
                 |Copy content
                 |Copy permalink

      """

      An exceptionally lousy site spider

      Ken Kinder <ken@kenkinder.com>

      This module gives an example of how the TaskClient interface to the 

      IPython controller works.  Before running this script start the IPython controller

      and some engines using something like::

          ipcluster -n 4

      """

      from twisted.python.failure import Failure

      from IPython.kernel import client

      import time

      # Source text executed through MultiEngineClient.execute() in
      # DistributedSpider.__init__, so that the tasks submitted later can call
      # fetchAndParse(url) by name.
      #
      # fetchAndParse(url, data=None) returns a list with the href of every <a>
      # element of the page at `url`, joined against `url`.  The list is empty
      # when the URL cannot be opened or the response is not 'text/html'.
      # The response object is always closed, including when parsing raises.
      fetchParse = """
      from twisted.web import microdom
      import urllib2
      import urlparse

      def fetchAndParse(url, data=None):
          links = []
          try:
              page = urllib2.urlopen(url, data=data)
          except Exception:
              return links
          try:
              if page.headers.type == 'text/html':
                  doc = microdom.parseString(page.read(), beExtremelyLenient=True)
                  for node in doc.getElementsByTagName('a'):
                      if node.getAttribute('href'):
                          links.append(urlparse.urljoin(url, node.getAttribute('href')))
          finally:
              page.close()
          return links
      """

      class DistributedSpider(object):
          """Crawl one site by handing page fetches to IPython engines as tasks.

          Every URL that starts with ``site`` is submitted through the
          ``TaskClient`` as a ``StringTask`` running ``fetchAndParse(url)``.  The
          links pulled back from each finished task are passed to ``visitLink``
          again, until no task is pending.

          Attributes:
              tc: ``client.TaskClient`` used to submit tasks and poll for results.
              rc: ``client.MultiEngineClient`` used once, to execute ``fetchParse``.
              allLinks: list of every URL seen so far, in discovery order.
              linksWorking: dict mapping URL -> the value ``tc.run`` returned,
                  for tasks whose result has not been collected yet.
              linksDone: dict mapping URL -> None for every collected task.
              site: URL prefix; only URLs that start with it are fetched.
          """

          # Time to wait between polling for task results.
          pollingDelay = 0.5

          def __init__(self, site):
              """Create the clients and execute ``fetchParse`` through ``rc``.

              Args:
                  site: root URL of the crawl; also the prefix a link must have
                      in order to be fetched.
              """
              self.tc = client.TaskClient()
              self.rc = client.MultiEngineClient()
              # Presumably runs on every engine, making fetchAndParse available
              # to the tasks submitted in visitLink.
              self.rc.execute(fetchParse)

              self.allLinks = []
              # Set mirror of allLinks: membership is tested once per discovered
              # link, and scanning the growing list made the crawl quadratic.
              # Only visitLink updates it; appending to allLinks directly
              # bypasses the duplicate check.
              self._seenLinks = set()
              self.linksWorking = {}
              self.linksDone = {}

              self.site = site

          def visitLink(self, url):
              """Record ``url`` and, when it belongs to the site, submit a fetch.

              A URL that has been seen before is ignored.  A new URL is always
              appended to ``allLinks``; a task is submitted (and the URL printed)
              only when it starts with ``self.site``.
              """
              if url in self._seenLinks:
                  return
              self._seenLinks.add(url)
              self.allLinks.append(url)
              if not url.startswith(self.site):
                  return
              # One pre-formatted string: same output as the Python 2 statement
              # "print '    ', url" and also valid on Python 3.
              print('     %s' % (url,))
              task = client.StringTask(
                  'links = fetchAndParse(url)', pull=['links'], push={'url': url})
              self.linksWorking[url] = self.tc.run(task)

          def onVisitDone(self, result, url):
              """Process the finished task for ``url``.

              Moves ``url`` from ``linksWorking`` to ``linksDone``.  If
              ``result.failure`` is a twisted ``Failure`` its traceback is
              printed; otherwise every entry of ``result.ns.links`` is passed to
              ``visitLink``.

              Args:
                  result: object returned by ``tc.get_task_result``.
                  url: the URL the task was submitted for.
              """
              print('%s :' % (url,))
              self.linksDone[url] = None
              del self.linksWorking[url]
              if isinstance(result.failure, Failure):
                  for line in result.failure.getTraceback().split('\n'):
                      print('     %s' % (line,))
              else:
                  for link in result.ns.links:
                      self.visitLink(link)

          def run(self):
              """Crawl starting at ``site``; return once no task is pending."""
              self.visitLink(self.site)
              while self.linksWorking:
                  print('%d pending...' % len(self.linksWorking))
                  self.synchronize()
                  time.sleep(self.pollingDelay)

          def synchronize(self):
              """Collect the result of every pending task that has finished."""
              # Iterate over a snapshot: onVisitDone deletes the finished URL
              # and, through visitLink, may add new entries while we loop.
              for url, taskId in list(self.linksWorking.items()):
                  # Calling get_task_result with block=False will return None if the
                  # task is not done yet.  This provides a simple way of polling.
                  result = self.tc.get_task_result(taskId, block=False)
                  if result is not None:
                      self.onVisitDone(result, url)

      def main():
          """Prompt for the site to crawl, then crawl it with a DistributedSpider."""
          site = raw_input('Enter site to crawl: ')
          spider = DistributedSpider(site)
          spider.run()

      # Run the crawl only when executed as a script, not when imported.
      if __name__ == '__main__':
          main()

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

				"""
				An exceptionally lousy site spider
				Ken Kinder <ken@kenkinder.com>

				This module gives an example of how the TaskClient interface to the
				IPython controller works. Before running this script start the IPython controller
				and some engines using something like::

				    ipcluster -n 4
				"""
				from twisted.python.failure import Failure
				from IPython.kernel import client
				import time

				# Source text executed through MultiEngineClient.execute() in
				# DistributedSpider.__init__, so that the tasks submitted later can call
				# fetchAndParse(url) by name.
				#
				# fetchAndParse(url, data=None) returns a list with the href of every <a>
				# element of the page at `url`, joined against `url`.  The list is empty
				# when the URL cannot be opened or the response is not 'text/html'.
				# The response object is always closed, including when parsing raises.
				fetchParse = """
				from twisted.web import microdom
				import urllib2
				import urlparse

				def fetchAndParse(url, data=None):
				    links = []
				    try:
				        page = urllib2.urlopen(url, data=data)
				    except Exception:
				        return links
				    try:
				        if page.headers.type == 'text/html':
				            doc = microdom.parseString(page.read(), beExtremelyLenient=True)
				            for node in doc.getElementsByTagName('a'):
				                if node.getAttribute('href'):
				                    links.append(urlparse.urljoin(url, node.getAttribute('href')))
				    finally:
				        page.close()
				    return links
				"""

				class DistributedSpider(object):
				    """Crawl one site by handing page fetches to IPython engines as tasks.

				    Every URL that starts with ``site`` is submitted through the
				    ``TaskClient`` as a ``StringTask`` running ``fetchAndParse(url)``.  The
				    links pulled back from each finished task are passed to ``visitLink``
				    again, until no task is pending.

				    Attributes:
				        tc: ``client.TaskClient`` used to submit tasks and poll for results.
				        rc: ``client.MultiEngineClient`` used once, to execute ``fetchParse``.
				        allLinks: list of every URL seen so far, in discovery order.
				        linksWorking: dict mapping URL -> the value ``tc.run`` returned,
				            for tasks whose result has not been collected yet.
				        linksDone: dict mapping URL -> None for every collected task.
				        site: URL prefix; only URLs that start with it are fetched.
				    """

				    # Time to wait between polling for task results.
				    pollingDelay = 0.5

				    def __init__(self, site):
				        """Create the clients and execute ``fetchParse`` through ``rc``.

				        Args:
				            site: root URL of the crawl; also the prefix a link must have
				                in order to be fetched.
				        """
				        self.tc = client.TaskClient()
				        self.rc = client.MultiEngineClient()
				        # Presumably runs on every engine, making fetchAndParse available
				        # to the tasks submitted in visitLink.
				        self.rc.execute(fetchParse)

				        self.allLinks = []
				        # Set mirror of allLinks: membership is tested once per discovered
				        # link, and scanning the growing list made the crawl quadratic.
				        # Only visitLink updates it; appending to allLinks directly
				        # bypasses the duplicate check.
				        self._seenLinks = set()
				        self.linksWorking = {}
				        self.linksDone = {}

				        self.site = site

				    def visitLink(self, url):
				        """Record ``url`` and, when it belongs to the site, submit a fetch.

				        A URL that has been seen before is ignored.  A new URL is always
				        appended to ``allLinks``; a task is submitted (and the URL printed)
				        only when it starts with ``self.site``.
				        """
				        if url in self._seenLinks:
				            return
				        self._seenLinks.add(url)
				        self.allLinks.append(url)
				        if not url.startswith(self.site):
				            return
				        # One pre-formatted string: same output as the Python 2 statement
				        # "print ' ', url" and also valid on Python 3.
				        print('  %s' % (url,))
				        task = client.StringTask(
				            'links = fetchAndParse(url)', pull=['links'], push={'url': url})
				        self.linksWorking[url] = self.tc.run(task)

				    def onVisitDone(self, result, url):
				        """Process the finished task for ``url``.

				        Moves ``url`` from ``linksWorking`` to ``linksDone``.  If
				        ``result.failure`` is a twisted ``Failure`` its traceback is
				        printed; otherwise every entry of ``result.ns.links`` is passed to
				        ``visitLink``.

				        Args:
				            result: object returned by ``tc.get_task_result``.
				            url: the URL the task was submitted for.
				        """
				        print('%s :' % (url,))
				        self.linksDone[url] = None
				        del self.linksWorking[url]
				        if isinstance(result.failure, Failure):
				            for line in result.failure.getTraceback().split('\n'):
				                print('  %s' % (line,))
				        else:
				            for link in result.ns.links:
				                self.visitLink(link)

				    def run(self):
				        """Crawl starting at ``site``; return once no task is pending."""
				        self.visitLink(self.site)
				        while self.linksWorking:
				            print('%d pending...' % len(self.linksWorking))
				            self.synchronize()
				            time.sleep(self.pollingDelay)

				    def synchronize(self):
				        """Collect the result of every pending task that has finished."""
				        # Iterate over a snapshot: onVisitDone deletes the finished URL
				        # and, through visitLink, may add new entries while we loop.
				        for url, taskId in list(self.linksWorking.items()):
				            # Calling get_task_result with block=False will return None if the
				            # task is not done yet. This provides a simple way of polling.
				            result = self.tc.get_task_result(taskId, block=False)
				            if result is not None:
				                self.onVisitDone(result, url)

				def main():
				    """Prompt for the site to crawl, then crawl it with a DistributedSpider."""
				    distributedSpider = DistributedSpider(raw_input('Enter site to crawl: '))
				    distributedSpider.run()

				# Run the crawl only when executed as a script, not when imported.
				if __name__ == '__main__':
				    main()