upstream/ipython Files · docs/examples/newparallel/davinci/pwordfreq.py

Fix rpmlint: non-executable-script...

Fix rpmlint: non-executable-script """ This text file contains a shebang or is located in a path dedicated for executables, but lacks the executable bits and cannot thus be executed. If the file is meant to be an executable script, add the executable bits, otherwise remove the shebang or move the file elsewhere. """ Mostly deleting the shebang, but some files contain a __main__ function, so make them executable. This is the last commit of this series and: Closes gh-647.

Jörgen Stenarson - - Load All Authors

File last commit:

r4208:b4b4ede0


                r4574:a8c54759

Download file

             pwordfreq.py
        
                    80 lines
            
             | 2.2 KiB
            
                | text/x-python
            
             |
                PythonLexer
            
             / docs / examples / newparallel / davinci / pwordfreq.py
          
                    History
                
                 |
                  Annotation
                 | Raw
                 |Copy content
                 |Copy permalink

      #!/usr/bin/env python

      """Parallel word frequency counter.

      This only works for a local cluster, because the filenames are local paths.

      """

      import os

      import time

      import urllib

      from itertools import repeat

      from wordfreq import print_wordfreq, wordfreq

      from IPython.parallel import Client, Reference

      davinci_url = "http://www.gutenberg.org/cache/epub/5000/pg5000.txt"

      def pwordfreq(view, fnames):
          """Count word frequencies on every engine of a view and merge them.

          view - An IPython DirectView
          fnames - The filenames containing the split data; there must be
                   exactly as many as the view has targets.

          Returns a dict mapping each word to its count summed over all of
          the per-engine results.
          """
          assert len(fnames) == len(view.targets)

          # Distribute the filenames over the engines under the name 'fname',
          # then run wordfreq remotely against each engine's own 'fname'.
          view.scatter('fname', fnames, flatten=True)
          per_engine = view.apply(wordfreq, Reference('fname')).get()

          # Collect every word reported by any engine and start it at zero...
          vocabulary = set()
          for partial in per_engine:
              vocabulary.update(partial.keys())
          totals = dict.fromkeys(vocabulary, 0)

          # ...then accumulate the per-engine counts into the totals.
          for partial in per_engine:
              for word, count in partial.iteritems():
                  totals[word] += count
          return totals

      if __name__ == '__main__':

          # Create a Client and View

          rc = Client()

          view = rc[:]

          if not os.path.exists('davinci.txt'):

              # download from project gutenberg

              print "Downloading Da Vinci's notebooks from Project Gutenberg"

              urllib.urlretrieve(davinci_url, 'davinci.txt')

          # Run the serial version

          print "Serial word frequency count:"

          text = open('davinci.txt').read()

          tic = time.time()

          freqs = wordfreq(text)

          toc = time.time()

          print_wordfreq(freqs, 10)

          print "Took %.3f s to calcluate"%(toc-tic)

          # The parallel version

          print "\nParallel word frequency count:"

          # split the davinci.txt into one file per engine:

          lines = text.splitlines()

          nlines = len(lines)

          n = len(rc)

          block = nlines/n

          for i in range(n):

              chunk = lines[i*block:i*(block+1)]

              with open('davinci%i.txt'%i, 'w') as f:

                  f.write('\n'.join(chunk))

          cwd = os.path.abspath(os.getcwdu())

          fnames = [ os.path.join(cwd, 'davinci%i.txt'%i) for i in range(n)]

          tic = time.time()

          pfreqs = pwordfreq(view,fnames)

          toc = time.time()

          print_wordfreq(freqs)

          print "Took %.3f s to calcluate on %i engines"%(toc-tic, len(view.targets))

          # cleanup split files

          map(os.remove, fnames)

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

				#!/usr/bin/env python
				"""Parallel word frequency counter.

				This only works for a local cluster, because the filenames are local paths.
				"""


				import os
				import time
				import urllib

				from itertools import repeat

				from wordfreq import print_wordfreq, wordfreq

				from IPython.parallel import Client, Reference

				davinci_url = "http://www.gutenberg.org/cache/epub/5000/pg5000.txt"

				def pwordfreq(view, fnames):
				    """Parallel word frequency counter.

				    view - An IPython DirectView
				    fnames - The filenames containing the split data; there must be
				             exactly as many as the view has targets.

				    Returns a dict mapping each word to its count summed over all of
				    the per-engine results.
				    """
				    assert len(fnames) == len(view.targets)
				    # Distribute the filenames over the engines under the name 'fname'.
				    view.scatter('fname', fnames, flatten=True)
				    # Run wordfreq remotely against each engine's own 'fname'.
				    ar = view.apply(wordfreq, Reference('fname'))
				    freqs_list = ar.get()
				    # Gather every word reported by any engine.
				    word_set = set()
				    for f in freqs_list:
				        word_set.update(f.keys())
				    # Start each word at zero, then sum the per-engine counts.
				    freqs = dict(zip(word_set, repeat(0)))
				    for f in freqs_list:
				        for word, count in f.iteritems():
				            freqs[word] += count
				    return freqs

				if __name__ == '__main__':
				# Create a Client and View
				rc = Client()

				view = rc[:]

				if not os.path.exists('davinci.txt'):
				# download from project gutenberg
				print "Downloading Da Vinci's notebooks from Project Gutenberg"
				urllib.urlretrieve(davinci_url, 'davinci.txt')

				# Run the serial version
				print "Serial word frequency count:"
				text = open('davinci.txt').read()
				tic = time.time()
				freqs = wordfreq(text)
				toc = time.time()
				print_wordfreq(freqs, 10)
				print "Took %.3f s to calcluate"%(toc-tic)


				# The parallel version
				print "\nParallel word frequency count:"
				# split the davinci.txt into one file per engine:
				lines = text.splitlines()
				nlines = len(lines)
				n = len(rc)
				block = nlines/n
				for i in range(n):
				chunk = lines[iblock:i(block+1)]
				with open('davinci%i.txt'%i, 'w') as f:
				f.write('\n'.join(chunk))

				cwd = os.path.abspath(os.getcwdu())
				fnames = [ os.path.join(cwd, 'davinci%i.txt'%i) for i in range(n)]
				tic = time.time()
				pfreqs = pwordfreq(view,fnames)
				toc = time.time()
				print_wordfreq(freqs)
				print "Took %.3f s to calcluate on %i engines"%(toc-tic, len(view.targets))
				# cleanup split files
				map(os.remove, fnames)