upstream/ipython Files · examples/Parallel Computing/fetchparse.py

Some fixes for the tree view

Brian E. Granger - - Load All Authors

File last commit:

r16120:24b93a1d


                r18783:9d1735ab

Download file

             fetchparse.py
        
                    99 lines
            
             | 2.9 KiB
            
                | text/x-python
            
             |
                PythonLexer
            
             / examples / Parallel Computing / fetchparse.py
          
                    History
                
                 |
                  Source
                 | Raw
                 |Copy content
                 |Copy permalink

        MinRK
    
update API after sagedays29...

              r3664
            
      """

      An exceptionally lousy site spider

      Ken Kinder <ken@kenkinder.com>

        MinRK
    
remove kernel examples already ported to newparallel

              r3675
            
      Updated for newparallel by Min Ragan-Kelley <benjaminrk@gmail.com>

        Bernardo B. Marques
    
remove all trailling spaces

              r4872
            
      This module gives an example of how the task interface to the

        MinRK
    
update API after sagedays29...

              r3664
            
      IPython controller works.  Before running this script start the IPython controller

      and some engines using something like::

        Thomas Kluyver
    
Update print syntax in parallel examples.

              r6455
            
          ipcluster start -n 4

        MinRK
    
update API after sagedays29...

              r3664
            
      """

        Thomas Kluyver
    
Update print syntax in parallel examples.

              r6455
            
      from __future__ import print_function

        MinRK
    
remove kernel examples already ported to newparallel

              r3675
            
      import sys

      from IPython.parallel import Client, error

        MinRK
    
update API after sagedays29...

              r3664
            
      import time

        MinRK
    
remove kernel examples already ported to newparallel

              r3675
            
      import BeautifulSoup # this isn't necessary, but it helps throw the dependency error earlier

        MinRK
    
update API after sagedays29...

              r3664
            
      def fetchAndParse(url, data=None):
          """Fetch ``url`` and return the absolute URLs of the links it contains.

          Parameters
          ----------
          url : str
              Address of the page to fetch.
          data : optional
              Passed straight through to ``urllib2.urlopen`` as its ``data``
              argument.

          Returns
          -------
          list
              One absolute URL for every ``<a>`` tag that has a non-empty
              ``href``.  The list is empty when the page cannot be opened or
              when its content type is not ``text/html``.
          """
          # The imports live inside the function; presumably so that they are
          # resolved wherever the function is executed when it is submitted
          # as a task (see DistributedSpider.visitLink).
          import urllib2
          import urlparse
          import BeautifulSoup

          links = []
          try:
              page = urllib2.urlopen(url, data=data)
          except Exception:
              # Best effort: an unreachable page simply contributes no links.
              return links

          try:
              if page.headers.type == 'text/html':
                  doc = BeautifulSoup.BeautifulSoup(page.read())
                  for node in doc.findAll('a'):
                      href = node.get('href', None)
                      if href:
                          # Resolve relative hrefs against the page's own URL.
                          links.append(urlparse.urljoin(url, href))
          finally:
              # Always release the connection, including for non-HTML
              # responses and when reading or parsing fails.
              page.close()
          return links

      class DistributedSpider(object):
          """Crawl a site by submitting page fetches as parallel tasks.

          Every URL that starts with ``site`` is submitted to a load-balanced
          view as a ``fetchAndParse`` task.  The pending results are polled
          from this process, and each link a finished task returns is visited
          in turn until nothing is left pending.
          """

          # Time to wait between polling for task results.
          pollingDelay = 0.5

          def __init__(self, site):
              """Create the client and its views, and start with empty bookkeeping.

              ``site`` is the URL prefix that a link must start with in order
              to be fetched.
              """
              self.client = Client()
              self.view = self.client.load_balanced_view()
              self.mux = self.client[:]

              self.allLinks = []      # every URL seen so far, in discovery order
              self.linksWorking = {}  # url -> pending result of its fetch task
              self.linksDone = {}     # url -> None once its task finished or failed

              self.site = site

          def visitLink(self, url):
              """Record ``url`` and, if it belongs to the site, submit a fetch for it.

              URLs that were already seen are ignored, so each one is fetched
              at most once.
              """
              if url not in self.allLinks:
                  self.allLinks.append(url)
                  if url.startswith(self.site):
                      print('    ', url)
                      self.linksWorking[url] = self.view.apply(fetchAndParse, url)

          def onVisitDone(self, links, url):
              """Mark ``url`` as finished and visit every link found on it."""
              print(url, ':')
              self.linksDone[url] = None
              del self.linksWorking[url]
              for link in links:
                  self.visitLink(link)

          def run(self):
              """Crawl from ``self.site`` until no fetch is pending any more."""
              self.visitLink(self.site)
              while self.linksWorking:
                  print(len(self.linksWorking), 'pending...')
                  self.synchronize()
                  time.sleep(self.pollingDelay)

          def synchronize(self):
              """Poll every pending fetch once and process the ones that are done."""
              # Iterate over a snapshot: entries are deleted below, and
              # onVisitDone() may add new ones through visitLink().  Resizing
              # a dict while iterating its live items view raises RuntimeError.
              for url, ar in list(self.linksWorking.items()):
                  # A zero timeout turns get() into a non-blocking poll; a
                  # result that is not ready yet is reported by raising
                  # error.TimeoutError, and the task is retried on the next
                  # pass.
                  try:
                      links = ar.get(0)
                  except error.TimeoutError:
                      continue
                  except Exception as e:
                      self.linksDone[url] = None
                      del self.linksWorking[url]
                      # Not every exception carries a ``traceback`` attribute;
                      # fall back to the exception itself so that reporting a
                      # failure can never raise AttributeError.
                      print(url, ':', getattr(e, 'traceback', e))
                  else:
                      self.onVisitDone(links, url)

        MinRK
    
update API after sagedays29...

              r3664
            
      def main():
          """Crawl the site named on the command line, or prompt for one.

          The first command-line argument is used as the site when it is
          present; otherwise the user is asked to type one in.
          """
          site = sys.argv[1] if len(sys.argv) > 1 else raw_input('Enter site to crawl: ')
          spider = DistributedSpider(site)
          spider.run()

      if __name__ == '__main__':

          main()

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

MinRK update API after sagedays29...	r3664	"""
		An exceptionally lousy site spider
		Ken Kinder <ken@kenkinder.com>

MinRK remove kernel examples already ported to newparallel	r3675	Updated for newparallel by Min Ragan-Kelley <benjaminrk@gmail.com>

Bernardo B. Marques remove all trailling spaces	r4872	This module gives an example of how the task interface to the
MinRK update API after sagedays29...	r3664	IPython controller works. Before running this script start the IPython controller
		and some engines using something like::

Thomas Kluyver Update print syntax in parallel examples.	r6455	ipcluster start -n 4
MinRK update API after sagedays29...	r3664	"""
Thomas Kluyver Update print syntax in parallel examples.	r6455	from __future__ import print_function

MinRK remove kernel examples already ported to newparallel	r3675	import sys
		from IPython.parallel import Client, error
MinRK update API after sagedays29...	r3664	import time
MinRK remove kernel examples already ported to newparallel	r3675	import BeautifulSoup # this isn't necessary, but it helps throw the dependency error earlier
MinRK update API after sagedays29...	r3664
		def fetchAndParse(url, data=None):
MinRK remove kernel examples already ported to newparallel	r3675	import urllib2
		import urlparse
		import BeautifulSoup
MinRK update API after sagedays29...	r3664	links = []
		try:
		page = urllib2.urlopen(url, data=data)
		except Exception:
		return links
		else:
		if page.headers.type == 'text/html':
MinRK remove kernel examples already ported to newparallel	r3675	doc = BeautifulSoup.BeautifulSoup(page.read())
		for node in doc.findAll('a'):
		href = node.get('href', None)
		if href:
		links.append(urlparse.urljoin(url, href))
MinRK update API after sagedays29...	r3664	return links

		class DistributedSpider(object):
Bernardo B. Marques remove all trailling spaces	r4872
MinRK update API after sagedays29...	r3664	# Time to wait between polling for task results.
		pollingDelay = 0.5
Bernardo B. Marques remove all trailling spaces	r4872
MinRK update API after sagedays29...	r3664	def __init__(self, site):
MinRK remove kernel examples already ported to newparallel	r3675	self.client = Client()
		self.view = self.client.load_balanced_view()
		self.mux = self.client[:]
Bernardo B. Marques remove all trailling spaces	r4872
MinRK update API after sagedays29...	r3664	self.allLinks = []
		self.linksWorking = {}
		self.linksDone = {}
Bernardo B. Marques remove all trailling spaces	r4872
MinRK update API after sagedays29...	r3664	self.site = site
Bernardo B. Marques remove all trailling spaces	r4872
MinRK update API after sagedays29...	r3664	def visitLink(self, url):
		if url not in self.allLinks:
		self.allLinks.append(url)
		if url.startswith(self.site):
Thomas Kluyver Update print syntax in parallel examples.	r6455	print(' ', url)
MinRK remove kernel examples already ported to newparallel	r3675	self.linksWorking[url] = self.view.apply(fetchAndParse, url)
Bernardo B. Marques remove all trailling spaces	r4872
MinRK remove kernel examples already ported to newparallel	r3675	def onVisitDone(self, links, url):
Thomas Kluyver Update print syntax in parallel examples.	r6455	print(url, ':')
MinRK update API after sagedays29...	r3664	self.linksDone[url] = None
		del self.linksWorking[url]
MinRK remove kernel examples already ported to newparallel	r3675	for link in links:
		self.visitLink(link)
Bernardo B. Marques remove all trailling spaces	r4872
MinRK update API after sagedays29...	r3664	def run(self):
		self.visitLink(self.site)
		while self.linksWorking:
Thomas Kluyver Update print syntax in parallel examples.	r6455	print(len(self.linksWorking), 'pending...')
MinRK update API after sagedays29...	r3664	self.synchronize()
		time.sleep(self.pollingDelay)
Bernardo B. Marques remove all trailling spaces	r4872
MinRK update API after sagedays29...	r3664	def synchronize(self):
MinRK remove kernel examples already ported to newparallel	r3675	for url, ar in self.linksWorking.items():
MinRK update API after sagedays29...	r3664	# Calling get_task_result with block=False will return None if the
		# task is not done yet. This provides a simple way of polling.
MinRK remove kernel examples already ported to newparallel	r3675	try:
		links = ar.get(0)
		except error.TimeoutError:
		continue
		except Exception as e:
		self.linksDone[url] = None
		del self.linksWorking[url]
Thomas Kluyver Update print syntax in parallel examples.	r6455	print(url, ':', e.traceback)
MinRK remove kernel examples already ported to newparallel	r3675	else:
		self.onVisitDone(links, url)
MinRK update API after sagedays29...	r3664
		def main():
MinRK remove kernel examples already ported to newparallel	r3675	if len(sys.argv) > 1:
		site = sys.argv[1]
		else:
		site = raw_input('Enter site to crawl: ')
		distributedSpider = DistributedSpider(site)
MinRK update API after sagedays29...	r3664	distributedSpider.run()

		if __name__ == '__main__':
		main()