pwordfreq.py
#!/usr/bin/env python
"""Parallel word frequency counter.

This only works for a local cluster, because the filenames are local paths.
"""

import os
import time
import urllib

from itertools import repeat

from wordfreq import print_wordfreq, wordfreq

from IPython.parallel import Client, Reference

davinci_url = "http://www.gutenberg.org/cache/epub/5000/pg5000.txt"

def pwordfreq(view, fnames):
    """Parallel word frequency counter.

    view - An IPython DirectView
    fnames - The filenames containing the split data.
    """
    assert len(fnames) == len(view.targets)

    # push one filename to each engine, then run wordfreq remotely on it
    view.scatter('fname', fnames, flatten=True)
    ar = view.apply(wordfreq, Reference('fname'))
    freqs_list = ar.get()

    # merge the per-engine dicts into a single word -> count dict
    word_set = set()
    for f in freqs_list:
        word_set.update(f.keys())
    freqs = dict(zip(word_set, repeat(0)))
    for f in freqs_list:
        for word, count in f.iteritems():
            freqs[word] += count

    return freqs

if __name__ == '__main__':
    # Create a Client and View
    rc = Client()
    view = rc[:]

    if not os.path.exists('davinci.txt'):
        # download from Project Gutenberg
        print "Downloading Da Vinci's notebooks from Project Gutenberg"
        urllib.urlretrieve(davinci_url, 'davinci.txt')

    # Run the serial version
    print "Serial word frequency count:"
    text = open('davinci.txt').read()
    tic = time.time()
    freqs = wordfreq(text)
    toc = time.time()
    print_wordfreq(freqs, 10)
    print "Took %.3f s to calculate"%(toc-tic)

    # The parallel version
    print "\nParallel word frequency count:"
    # split davinci.txt into one file per engine:
    lines = text.splitlines()
    nlines = len(lines)
    n = len(rc)
    block = nlines/n
    for i in range(n):
        chunk = lines[i*block:(i+1)*block]
        with open('davinci%i.txt'%i, 'w') as f:
            f.write('\n'.join(chunk))

    cwd = os.path.abspath(os.getcwdu())
    fnames = [os.path.join(cwd, 'davinci%i.txt'%i) for i in range(n)]
    tic = time.time()
    pfreqs = pwordfreq(view, fnames)
    toc = time.time()
    print_wordfreq(pfreqs, 10)
    print "Took %.3f s to calculate on %i engines"%(toc-tic, len(view.targets))
    # cleanup split files
    map(os.remove, fnames)
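
The script imports wordfreq and print_wordfreq from a companion wordfreq.py module that is not shown on this page, and it assumes a local cluster is already running (for example, one started with `ipcluster start -n 4`), since each engine reads a file written to the local filesystem. The following is only a minimal sketch of what that helper module plausibly contains, inferred from how the two functions are called above (wordfreq(text) returning a word-to-count dict, print_wordfreq(freqs, 10) printing the most frequent words); the real module shipped with the IPython examples may differ.

# wordfreq.py -- hypothetical sketch of the helper module imported above.

def wordfreq(text):
    """Return a dict mapping each whitespace-separated word to its count."""
    freqs = {}
    for word in text.split():
        freqs[word] = freqs.get(word, 0) + 1
    return freqs

def print_wordfreq(freqs, n=10):
    """Print the n most frequent words with their counts."""
    items = sorted(freqs.iteritems(), key=lambda item: item[1], reverse=True)
    for word, count in items[:n]:
        print word, count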