upstream/ipython Files · docs/examples/newparallel/fetchparse.py

Merge branch 'glut-rebased' of git://github.com/fperez/ipython into glut...

Merge branch 'glut-rebased' of git://github.com/fperez/ipython into glut * 'glut-rebased' of git://github.com/fperez/ipython: Added the command line option Fix code in disable_glut which was not tested and quite buggy Tried to fix the CTRL-C problem (https://github.com/ipython/ipython/pull/742) and take other comments/typos into account Replaced deprecated raise call Fixed typos in comments Canceled window reshape to 1x1 since the idea is now for the user to use this window as the main one because of weird seg-faults problem after user creates its own window (any subsequent gl error would lead to a segfault, even a simple one line requiring a non existent function Event loop integration example Added code for the GLUT interactive session

MinRK - - Load All Authors

File last commit:

r3666:a6a0636a


                r4818:89161a5b

Download file

             fetchparse.py
        
                    97 lines
            
             | 2.9 KiB
            
                | text/x-python
            
             |
                PythonLexer
            
             / docs / examples / newparallel / fetchparse.py
          
                    History
                
                 |
                  Annotation
                 | Raw
                 |Copy content
                 |Copy permalink

      """

      An exceptionally lousy site spider

      Ken Kinder <ken@kenkinder.com>

      Updated for newparallel by Min Ragan-Kelley <benjaminrk@gmail.com>

      This module gives an example of how the task interface to the 

      IPython controller works.  Before running this script start the IPython controller

      and some engines using something like::

          ipclusterz start -n 4

      """

      import sys

      from IPython.parallel import Client, error

      import time

      import BeautifulSoup # this isn't necessary, but it helps throw the dependency error earlier

      def fetchAndParse(url, data=None):
          """Fetch ``url`` and return the absolute URLs of the links it contains.

          Parameters
          ----------
          url : str
              Address of the page to download.
          data : optional
              Passed straight through to ``urllib2.urlopen`` as ``data``.

          Returns
          -------
          list
              One entry per ``<a>`` tag that has a non-empty ``href``, joined
              against ``url`` with ``urlparse.urljoin``.  The list is empty
              when the URL cannot be opened or the response type is not
              ``text/html``.

          Errors raised while reading or parsing an opened page are not
          caught here and propagate to the caller.
          """
          # The imports are local to the function; presumably so that it stays
          # self-contained when handed to ``view.apply`` -- confirm before
          # hoisting them to module level.
          import urllib2
          import urlparse
          import BeautifulSoup

          try:
              page = urllib2.urlopen(url, data=data)
          except Exception:
              # Deliberate best effort: a URL that cannot be opened simply
              # contributes no links.
              return []

          try:
              if page.headers.type != 'text/html':
                  return []
              doc = BeautifulSoup.BeautifulSoup(page.read())
          finally:
              # Always release the connection, whether or not the page was
              # HTML and whether or not reading/parsing raised.
              page.close()

          hrefs = (node.get('href', None) for node in doc.findAll('a'))
          return [urlparse.urljoin(url, href) for href in hrefs if href]

      class DistributedSpider(object):
          """Crawl one site by submitting page fetches through a load-balanced view.

          Every URL that starts with ``site`` is submitted as a
          ``fetchAndParse`` task via ``view.apply``; ``run`` then polls the
          outstanding tasks until none are left.

          Attributes
          ----------
          client : the ``Client()`` instance created in ``__init__``.
          view : ``client.load_balanced_view()``, used to submit tasks.
          mux : ``client[:]``; stored but not used by this class.
          allLinks : list of every URL seen so far, in discovery order.
          linksWorking : dict mapping URL -> pending result of ``view.apply``.
          linksDone : dict mapping URL -> None for each finished or failed URL.
          site : URL prefix a link must start with to be fetched.
          """

          # Time to wait between polling for task results.
          pollingDelay = 0.5

          def __init__(self, site):
              """Create the client and views and start with empty bookkeeping."""
              self.client = Client()
              self.view = self.client.load_balanced_view()
              self.mux = self.client[:]

              self.allLinks = []
              self.linksWorking = {}
              self.linksDone = {}

              self.site = site

          def visitLink(self, url):
              """Record ``url`` and, if it belongs to the site, submit a fetch task.

              URLs that were already seen are ignored, so each URL is
              submitted at most once.
              """
              if url in self.allLinks:
                  return
              self.allLinks.append(url)
              if url.startswith(self.site):
                  print('%s %s' % ('    ', url))
                  self.linksWorking[url] = self.view.apply(fetchAndParse, url)

          def onVisitDone(self, links, url):
              """Mark ``url`` as finished and visit every link found on it."""
              print('%s %s' % (url, ':'))
              self.linksDone[url] = None
              del self.linksWorking[url]
              for link in links:
                  self.visitLink(link)

          def run(self):
              """Crawl from ``site`` until no fetch tasks remain pending."""
              self.visitLink(self.site)
              while self.linksWorking:
                  print('%s %s' % (len(self.linksWorking), 'pending...'))
                  self.synchronize()
                  time.sleep(self.pollingDelay)

          def synchronize(self):
              """Poll every pending task once and process those that finished."""
              # Iterate over a snapshot: finished tasks are deleted from
              # linksWorking, and newly discovered links are added to it,
              # while this loop is running.
              for url, ar in list(self.linksWorking.items()):
                  try:
                      # A zero timeout turns get() into a poll; the handler
                      # below treats error.TimeoutError as "not finished yet".
                      links = ar.get(0)
                  except error.TimeoutError:
                      continue
                  except Exception as e:
                      self.linksDone[url] = None
                      del self.linksWorking[url]
                      # Not every exception has a ``traceback`` attribute;
                      # fall back to the exception itself so that reporting a
                      # failure cannot raise AttributeError.
                      print('%s %s %s' % (url, ':', getattr(e, 'traceback', e)))
                  else:
                      self.onVisitDone(links, url)

      def main():
          """Crawl the site given on the command line, prompting if none is given."""
          cli_args = sys.argv[1:]
          site = cli_args[0] if cli_args else raw_input('Enter site to crawl: ')
          DistributedSpider(site).run()

      # Start the crawl only when this file is executed as a script, not when
      # it is imported as a module.
      if __name__ == '__main__':

          main()

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

				"""
				An exceptionally lousy site spider
				Ken Kinder <ken@kenkinder.com>

				Updated for newparallel by Min Ragan-Kelley <benjaminrk@gmail.com>

				This module gives an example of how the task interface to the
				IPython controller works. Before running this script start the IPython controller
				and some engines using something like::

				ipclusterz start -n 4
				"""
				import sys
				from IPython.parallel import Client, error
				import time
				import BeautifulSoup # this isn't necessary, but it helps throw the dependency error earlier

				def fetchAndParse(url, data=None):
				    """Fetch ``url`` and return the absolute URLs of the links it contains.

				    Parameters
				    ----------
				    url : str
				        Address of the page to download.
				    data : optional
				        Passed straight through to ``urllib2.urlopen`` as ``data``.

				    Returns
				    -------
				    list
				        One entry per ``<a>`` tag that has a non-empty ``href``, joined
				        against ``url`` with ``urlparse.urljoin``.  The list is empty
				        when the URL cannot be opened or the response type is not
				        ``text/html``.

				    Errors raised while reading or parsing an opened page are not
				    caught here and propagate to the caller.
				    """
				    # The imports are local to the function; presumably so that it stays
				    # self-contained when handed to ``view.apply`` -- confirm before
				    # hoisting them to module level.
				    import urllib2
				    import urlparse
				    import BeautifulSoup

				    try:
				        page = urllib2.urlopen(url, data=data)
				    except Exception:
				        # Deliberate best effort: a URL that cannot be opened simply
				        # contributes no links.
				        return []

				    try:
				        if page.headers.type != 'text/html':
				            return []
				        doc = BeautifulSoup.BeautifulSoup(page.read())
				    finally:
				        # Always release the connection, whether or not the page was
				        # HTML and whether or not reading/parsing raised.
				        page.close()

				    hrefs = (node.get('href', None) for node in doc.findAll('a'))
				    return [urlparse.urljoin(url, href) for href in hrefs if href]

				class DistributedSpider(object):
				    """Crawl one site by submitting page fetches through a load-balanced view.

				    Every URL that starts with ``site`` is submitted as a
				    ``fetchAndParse`` task via ``view.apply``; ``run`` then polls the
				    outstanding tasks until none are left.

				    Attributes
				    ----------
				    client : the ``Client()`` instance created in ``__init__``.
				    view : ``client.load_balanced_view()``, used to submit tasks.
				    mux : ``client[:]``; stored but not used by this class.
				    allLinks : list of every URL seen so far, in discovery order.
				    linksWorking : dict mapping URL -> pending result of ``view.apply``.
				    linksDone : dict mapping URL -> None for each finished or failed URL.
				    site : URL prefix a link must start with to be fetched.
				    """

				    # Time to wait between polling for task results.
				    pollingDelay = 0.5

				    def __init__(self, site):
				        """Create the client and views and start with empty bookkeeping."""
				        self.client = Client()
				        self.view = self.client.load_balanced_view()
				        self.mux = self.client[:]

				        self.allLinks = []
				        self.linksWorking = {}
				        self.linksDone = {}

				        self.site = site

				    def visitLink(self, url):
				        """Record ``url`` and, if it belongs to the site, submit a fetch task.

				        URLs that were already seen are ignored, so each URL is
				        submitted at most once.
				        """
				        if url in self.allLinks:
				            return
				        self.allLinks.append(url)
				        if url.startswith(self.site):
				            print('%s %s' % (' ', url))
				            self.linksWorking[url] = self.view.apply(fetchAndParse, url)

				    def onVisitDone(self, links, url):
				        """Mark ``url`` as finished and visit every link found on it."""
				        print('%s %s' % (url, ':'))
				        self.linksDone[url] = None
				        del self.linksWorking[url]
				        for link in links:
				            self.visitLink(link)

				    def run(self):
				        """Crawl from ``site`` until no fetch tasks remain pending."""
				        self.visitLink(self.site)
				        while self.linksWorking:
				            print('%s %s' % (len(self.linksWorking), 'pending...'))
				            self.synchronize()
				            time.sleep(self.pollingDelay)

				    def synchronize(self):
				        """Poll every pending task once and process those that finished."""
				        # Iterate over a snapshot: finished tasks are deleted from
				        # linksWorking, and newly discovered links are added to it,
				        # while this loop is running.
				        for url, ar in list(self.linksWorking.items()):
				            try:
				                # A zero timeout turns get() into a poll; the handler
				                # below treats error.TimeoutError as "not finished yet".
				                links = ar.get(0)
				            except error.TimeoutError:
				                continue
				            except Exception as e:
				                self.linksDone[url] = None
				                del self.linksWorking[url]
				                # Not every exception has a ``traceback`` attribute;
				                # fall back to the exception itself so that reporting a
				                # failure cannot raise AttributeError.
				                print('%s %s %s' % (url, ':', getattr(e, 'traceback', e)))
				            else:
				                self.onVisitDone(links, url)

				def main():
				    """Crawl the site given on the command line, prompting if none is given."""
				    cli_args = sys.argv[1:]
				    site = cli_args[0] if cli_args else raw_input('Enter site to crawl: ')
				    DistributedSpider(site).run()

				# Start the crawl only when this file is executed as a script, not when
				# it is imported as a module.
				if __name__ == '__main__':
				    main()