upstream/ipython Files · docs/examples/newparallel/davinci/pwordfreq.py

don't automatically add jobarray or queue lines to user template...

Don't automatically add jobarray or queue lines to user templates. In the parallel launchers, the queue and jobarray lines should not be added except in the default templates; user templates must be fully specified. This prevents conflicts between PBS versions, some of which may not support job arrays. These lines are now only added to the *default* templates.

MinRK - - Load All Authors

File last commit:

r3670:45e272d0


                r4183:965bc088

Download file

             pwordfreq.py
        
                    73 lines
            
             | 2.0 KiB
            
                | text/x-python
            
             |
                PythonLexer
            
             / docs / examples / newparallel / davinci / pwordfreq.py
          
                    History
                
                 |
                  Annotation
                 | Raw
                 |Copy content
                 |Copy permalink

      #!/usr/bin/env python

      """Parallel word frequency counter.

      This only works for a local cluster, because the filenames are local paths.

      """

      import os

      import urllib

      from itertools import repeat

      from wordfreq import print_wordfreq, wordfreq

      from IPython.parallel import Client, Reference

      davinci_url = "http://www.gutenberg.org/cache/epub/5000/pg5000.txt"

      def pwordfreq(view, fnames):
          """Count word frequencies over several files in parallel.

          view - An IPython DirectView
          fnames - The filenames containing the split data; there must be
                   exactly as many filenames as the view has targets.

          Returns a dict whose keys are the union of the keys of every
          per-engine result and whose values are the counts summed over
          all of those results.
          """
          assert len(fnames) == len(view.targets)

          # Scatter the filenames over the view's targets under the name
          # 'fname', then apply wordfreq remotely to that name and wait for
          # the list of results.
          view.scatter('fname', fnames, flatten=True)
          per_engine = view.apply(wordfreq, Reference('fname')).get()

          # Collect every key seen in any result and start its total at zero.
          vocabulary = set()
          for partial in per_engine:
              vocabulary.update(partial.keys())
          totals = dict.fromkeys(vocabulary, 0)

          # Fold each partial count into the combined totals.
          for partial in per_engine:
              for word, count in partial.iteritems():
                  totals[word] += count
          return totals

      if __name__ == '__main__':
          # Script entry point: run the word count serially over davinci.txt,
          # then split the text into one file per engine and run the count
          # again in parallel through pwordfreq.
          #
          # Single-argument print(...) is used throughout because it behaves
          # the same as a Python 2 print statement and as a Python 3 call.

          # Create a Client and View
          rc = Client()
          view = rc[:]

          if not os.path.exists('davinci.txt'):
              # download from project gutenberg
              print("Downloading Da Vinci's notebooks from Project Gutenberg")
              urllib.urlretrieve(davinci_url, 'davinci.txt')

          # Run the serial version
          print("Serial word frequency count:")
          # Read through a context manager so the file handle is closed.
          with open('davinci.txt') as source:
              text = source.read()
          freqs = wordfreq(text)
          print_wordfreq(freqs, 10)

          # The parallel version
          print("\nParallel word frequency count:")
          # split the davinci.txt into one file per engine:
          lines = text.splitlines()
          nlines = len(lines)
          n = len(rc)
          # Floor division: block is a whole number of lines per engine.
          block = nlines // n
          for i in range(n):
              start = i * block
              # Chunk i covers lines [i*block, (i+1)*block); the last chunk
              # runs to the end so the nlines % n leftover lines are kept.
              stop = nlines if i == n - 1 else start + block
              chunk = lines[start:stop]
              with open('davinci%i.txt' % i, 'w') as f:
                  f.write('\n'.join(chunk))

          cwd = os.path.abspath(os.getcwd())
          fnames = [os.path.join(cwd, 'davinci%i.txt' % i) for i in range(n)]
          pfreqs = pwordfreq(view, fnames)
          # Show the parallel result, not the serial freqs a second time.
          print_wordfreq(pfreqs)

          # cleanup split files (explicit loop: map() is for building values,
          # and is lazy on Python 3, so it must not be used for side effects)
          for fname in fnames:
              os.remove(fname)

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

				#!/usr/bin/env python
				"""Parallel word frequency counter.

				This only works for a local cluster, because the filenames are local paths.
				"""


				import os
				import urllib

				from itertools import repeat

				from wordfreq import print_wordfreq, wordfreq

				from IPython.parallel import Client, Reference

				davinci_url = "http://www.gutenberg.org/cache/epub/5000/pg5000.txt"

				def pwordfreq(view, fnames):
				    """Count word frequencies over several files in parallel.

				    view - An IPython DirectView
				    fnames - The filenames containing the split data; there must be
				             exactly as many filenames as the view has targets.

				    Returns a dict whose keys are the union of the keys of every
				    per-engine result and whose values are the counts summed over
				    all of those results.
				    """
				    assert len(fnames) == len(view.targets)

				    # Scatter the filenames over the view's targets under the name
				    # 'fname', then apply wordfreq remotely to that name and wait for
				    # the list of results.
				    view.scatter('fname', fnames, flatten=True)
				    per_engine = view.apply(wordfreq, Reference('fname')).get()

				    # Collect every key seen in any result and start its total at zero.
				    vocabulary = set()
				    for partial in per_engine:
				        vocabulary.update(partial.keys())
				    totals = dict.fromkeys(vocabulary, 0)

				    # Fold each partial count into the combined totals.
				    for partial in per_engine:
				        for word, count in partial.iteritems():
				            totals[word] += count
				    return totals

				if __name__ == '__main__':
				    # Script entry point: run the word count serially over davinci.txt,
				    # then split the text into one file per engine and run the count
				    # again in parallel through pwordfreq.
				    #
				    # Single-argument print(...) is used throughout because it behaves
				    # the same as a Python 2 print statement and as a Python 3 call.

				    # Create a Client and View
				    rc = Client()
				    view = rc[:]

				    if not os.path.exists('davinci.txt'):
				        # download from project gutenberg
				        print("Downloading Da Vinci's notebooks from Project Gutenberg")
				        urllib.urlretrieve(davinci_url, 'davinci.txt')

				    # Run the serial version
				    print("Serial word frequency count:")
				    # Read through a context manager so the file handle is closed.
				    with open('davinci.txt') as source:
				        text = source.read()
				    freqs = wordfreq(text)
				    print_wordfreq(freqs, 10)

				    # The parallel version
				    print("\nParallel word frequency count:")
				    # split the davinci.txt into one file per engine:
				    lines = text.splitlines()
				    nlines = len(lines)
				    n = len(rc)
				    # Floor division: block is a whole number of lines per engine.
				    block = nlines // n
				    for i in range(n):
				        start = i * block
				        # Chunk i covers lines [i*block, (i+1)*block); the last chunk
				        # runs to the end so the nlines % n leftover lines are kept.
				        stop = nlines if i == n - 1 else start + block
				        chunk = lines[start:stop]
				        with open('davinci%i.txt' % i, 'w') as f:
				            f.write('\n'.join(chunk))

				    cwd = os.path.abspath(os.getcwd())
				    fnames = [os.path.join(cwd, 'davinci%i.txt' % i) for i in range(n)]
				    pfreqs = pwordfreq(view, fnames)
				    # Show the parallel result, not the serial freqs a second time.
				    print_wordfreq(pfreqs)

				    # cleanup split files (explicit loop: map() is for building values,
				    # and is lazy on Python 3, so it must not be used for side effects)
				    for fname in fnames:
				        os.remove(fname)