upstream/ipython Files · examples/Parallel Computing/fetchparse.py

Lots of doc work.

Brian E. Granger - - Load All Authors

File last commit:

r16120:24b93a1d


                r16120:24b93a1d

Download file

             fetchparse.py
        
                    99 lines
            
             | 2.9 KiB
            
                | text/x-python
            
             |
                PythonLexer
            
             / examples / Parallel Computing / fetchparse.py
          
                    History
                
                 |
                  Annotation
                 | Raw
                 |Copy content
                 |Copy permalink

      """

      An exceptionally lousy site spider

      Ken Kinder <ken@kenkinder.com>

      Updated for newparallel by Min Ragan-Kelley <benjaminrk@gmail.com>

      This module gives an example of how the task interface to the

      IPython controller works.  Before running this script start the IPython controller

      and some engines using something like::

          ipcluster start -n 4

      """

      from __future__ import print_function

      import sys

      from IPython.parallel import Client, error

      import time

      import BeautifulSoup # this isn't necessary, but it helps throw the dependency error earlier

      def fetchAndParse(url, data=None):
          """Fetch ``url`` and return the absolute URLs of the links it contains.

          The modules are imported inside the function body, presumably because
          the function is submitted to remote engines (see
          ``DistributedSpider.visitLink``) and must be self-contained there.

          :param url: address to fetch; also used as the base when resolving
              relative ``href`` values.
          :param data: optional request body, passed unchanged to
              ``urllib2.urlopen``.
          :returns: list of absolute link URLs. The list is empty when the
              request fails or the response type is not ``text/html``.
          """
          import urllib2
          import urlparse
          import BeautifulSoup

          links = []
          try:
              page = urllib2.urlopen(url, data=data)
          except Exception:
              # Best effort: a URL that cannot be opened simply yields no links.
              return links

          try:
              if page.headers.type == 'text/html':
                  doc = BeautifulSoup.BeautifulSoup(page.read())
                  for node in doc.findAll('a'):
                      href = node.get('href', None)
                      if href:
                          links.append(urlparse.urljoin(url, href))
          finally:
              # Release the connection right away instead of leaving it to the
              # garbage collector.
              page.close()
          return links

      class DistributedSpider(object):
          """Crawl one site by submitting page fetches as load-balanced tasks.

          Every URL that starts with ``site`` is submitted once as a
          ``fetchAndParse`` task; the links each task returns are fed back into
          the crawl until no task is outstanding.
          """

          # Time to wait between polling for task results.
          pollingDelay = 0.5

          def __init__(self, site):
              """Create the parallel client and views and reset the crawl state.

              :param site: URL prefix to crawl; only URLs starting with it are
                  fetched.
              """
              self.client = Client()
              self.view = self.client.load_balanced_view()
              # Not used by any method of this class.
              self.mux = self.client[:]

              # Every URL seen so far, in discovery order.
              self.allLinks = []
              # url -> pending result object of the fetch task for that url.
              self.linksWorking = {}
              # url -> None, filled in once the task for that url has ended.
              self.linksDone = {}

              self.site = site

          def visitLink(self, url):
              """Record ``url`` and submit a fetch task if it belongs to the site.

              URLs that have been seen before are ignored.
              """
              if url in self.allLinks:
                  return
              self.allLinks.append(url)
              if url.startswith(self.site):
                  print('    ', url)
                  self.linksWorking[url] = self.view.apply(fetchAndParse, url)

          def onVisitDone(self, links, url):
              """Mark ``url`` as finished and visit every link found on it."""
              print(url, ':')
              self.linksDone[url] = None
              del self.linksWorking[url]
              for link in links:
                  self.visitLink(link)

          def run(self):
              """Crawl from ``site`` until no fetch task is left outstanding."""
              self.visitLink(self.site)
              while self.linksWorking:
                  print(len(self.linksWorking), 'pending...')
                  self.synchronize()
                  time.sleep(self.pollingDelay)

          def synchronize(self):
              """Poll each outstanding task once and handle those that ended."""
              # Iterate over a snapshot: entries are deleted from, and new ones
              # added to, ``linksWorking`` while the loop is running.
              for url, ar in list(self.linksWorking.items()):
                  try:
                      # A timeout of 0 is used for polling: an unfinished task
                      # is reported through error.TimeoutError, handled below.
                      links = ar.get(0)
                  except error.TimeoutError:
                      continue
                  except Exception as e:
                      self.linksDone[url] = None
                      del self.linksWorking[url]
                      # Presumably remote failures carry a formatted
                      # ``traceback``; fall back to the exception itself so an
                      # unexpected error type cannot raise AttributeError here.
                      print(url, ':', getattr(e, 'traceback', e))
                  else:
                      self.onVisitDone(links, url)

      def main():
          """Crawl the site named by the first command-line argument.

          When no argument is given, the site is read interactively instead.
          """
          has_argument = len(sys.argv) > 1
          site = sys.argv[1] if has_argument else raw_input('Enter site to crawl: ')
          DistributedSpider(site).run()

      # Start the crawl only when this file is executed as a script, not when
      # it is imported as a module.
      if __name__ == '__main__':

          main()

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

				"""
				An exceptionally lousy site spider
				Ken Kinder <ken@kenkinder.com>

				Updated for newparallel by Min Ragan-Kelley <benjaminrk@gmail.com>

				This module gives an example of how the task interface to the
				IPython controller works. Before running this script start the IPython controller
				and some engines using something like::

				ipcluster start -n 4
				"""
				from __future__ import print_function

				import sys
				from IPython.parallel import Client, error
				import time
				import BeautifulSoup # this isn't necessary, but it helps throw the dependency error earlier

				def fetchAndParse(url, data=None):
				    """Fetch ``url`` and return the absolute URLs of the links it contains.

				    The modules are imported inside the function body, presumably because
				    the function is submitted to remote engines (see
				    ``DistributedSpider.visitLink``) and must be self-contained there.

				    :param url: address to fetch; also used as the base when resolving
				        relative ``href`` values.
				    :param data: optional request body, passed unchanged to
				        ``urllib2.urlopen``.
				    :returns: list of absolute link URLs. The list is empty when the
				        request fails or the response type is not ``text/html``.
				    """
				    import urllib2
				    import urlparse
				    import BeautifulSoup

				    links = []
				    try:
				        page = urllib2.urlopen(url, data=data)
				    except Exception:
				        # Best effort: a URL that cannot be opened simply yields no links.
				        return links

				    try:
				        if page.headers.type == 'text/html':
				            doc = BeautifulSoup.BeautifulSoup(page.read())
				            for node in doc.findAll('a'):
				                href = node.get('href', None)
				                if href:
				                    links.append(urlparse.urljoin(url, href))
				    finally:
				        # Release the connection right away instead of leaving it to the
				        # garbage collector.
				        page.close()
				    return links

				class DistributedSpider(object):
				    """Crawl one site by submitting page fetches as load-balanced tasks.

				    Every URL that starts with ``site`` is submitted once as a
				    ``fetchAndParse`` task; the links each task returns are fed back into
				    the crawl until no task is outstanding.
				    """

				    # Time to wait between polling for task results.
				    pollingDelay = 0.5

				    def __init__(self, site):
				        """Create the parallel client and views and reset the crawl state.

				        :param site: URL prefix to crawl; only URLs starting with it are
				            fetched.
				        """
				        self.client = Client()
				        self.view = self.client.load_balanced_view()
				        # Not used by any method of this class.
				        self.mux = self.client[:]

				        # Every URL seen so far, in discovery order.
				        self.allLinks = []
				        # url -> pending result object of the fetch task for that url.
				        self.linksWorking = {}
				        # url -> None, filled in once the task for that url has ended.
				        self.linksDone = {}

				        self.site = site

				    def visitLink(self, url):
				        """Record ``url`` and submit a fetch task if it belongs to the site.

				        URLs that have been seen before are ignored.
				        """
				        if url in self.allLinks:
				            return
				        self.allLinks.append(url)
				        if url.startswith(self.site):
				            print(' ', url)
				            self.linksWorking[url] = self.view.apply(fetchAndParse, url)

				    def onVisitDone(self, links, url):
				        """Mark ``url`` as finished and visit every link found on it."""
				        print(url, ':')
				        self.linksDone[url] = None
				        del self.linksWorking[url]
				        for link in links:
				            self.visitLink(link)

				    def run(self):
				        """Crawl from ``site`` until no fetch task is left outstanding."""
				        self.visitLink(self.site)
				        while self.linksWorking:
				            print(len(self.linksWorking), 'pending...')
				            self.synchronize()
				            time.sleep(self.pollingDelay)

				    def synchronize(self):
				        """Poll each outstanding task once and handle those that ended."""
				        # Iterate over a snapshot: entries are deleted from, and new ones
				        # added to, ``linksWorking`` while the loop is running.
				        for url, ar in list(self.linksWorking.items()):
				            try:
				                # A timeout of 0 is used for polling: an unfinished task
				                # is reported through error.TimeoutError, handled below.
				                links = ar.get(0)
				            except error.TimeoutError:
				                continue
				            except Exception as e:
				                self.linksDone[url] = None
				                del self.linksWorking[url]
				                # Presumably remote failures carry a formatted
				                # ``traceback``; fall back to the exception itself so an
				                # unexpected error type cannot raise AttributeError here.
				                print(url, ':', getattr(e, 'traceback', e))
				            else:
				                self.onVisitDone(links, url)

				def main():
				    """Crawl the site named by the first command-line argument.

				    When no argument is given, the site is read interactively instead.
				    """
				    if len(sys.argv) > 1:
				        site = sys.argv[1]
				    else:
				        site = raw_input('Enter site to crawl: ')
				    distributedSpider = DistributedSpider(site)
				    distributedSpider.run()

				# Start the crawl only when this file is executed as a script, not when
				# it is imported as a module.
				if __name__ == '__main__':
				    main()