upstream/ipython Files · docs/examples/newparallel/davinci/pwordfreq.py

Update config docs so they don't refer to Str traitlet type.

MinRK - - Load All Authors

File last commit:

r3670:45e272d0


                r4066:96d15c3b

Download file

             pwordfreq.py
        
                    73 lines
            
             | 2.0 KiB
            
                | text/x-python
            
             |
                PythonLexer
            
             / docs / examples / newparallel / davinci / pwordfreq.py
          
                    History
                
                 |
                  Source
                 | Raw
                 |Copy content
                 |Copy permalink

        MinRK
    
updates to docs and examples

              r3670
            
      #!/usr/bin/env python

      """Parallel word frequency counter.

      This only works for a local cluster, because the filenames are local paths.

      """

      import os

      import urllib

      from itertools import repeat

      from wordfreq import print_wordfreq, wordfreq

      from IPython.parallel import Client, Reference

      davinci_url = "http://www.gutenberg.org/cache/epub/5000/pg5000.txt"

      def pwordfreq(view, fnames):
          """Count word frequencies in parallel and merge the per-engine results.

          view - An IPython DirectView
          fnames - The filenames containing the split data; there must be
                   exactly one filename per target of the view.

          Returns a dict mapping every word seen by any engine to its total
          count summed over all engines.
          """
          assert len(view.targets) == len(fnames)

          # Distribute the filenames so each engine holds its own under the
          # remote name 'fname'.
          # presumably flatten=True hands each engine a bare string rather than
          # a one-element list -- confirm against DirectView.scatter.
          view.scatter('fname', fnames, flatten=True)

          # NOTE(review): the serial caller passes file *contents* to wordfreq,
          # while here each engine passes its filename -- confirm that wordfreq
          # accepts a path.
          per_engine = view.apply(wordfreq, Reference('fname')).get()

          # First pass: collect the union of all words.
          vocabulary = set()
          for partial in per_engine:
              vocabulary.update(partial.keys())

          # Second pass: start every word at zero, then add up partial counts.
          freqs = dict((word, 0) for word in vocabulary)
          for partial in per_engine:
              for word, count in partial.iteritems():
                  freqs[word] += count

          return freqs

      if __name__ == '__main__':

          # Create a Client and View

          rc = Client()

          view = rc[:]

          if not os.path.exists('davinci.txt'):

              # download from project gutenberg

              print "Downloading Da Vinci's notebooks from Project Gutenberg"

              urllib.urlretrieve(davinci_url, 'davinci.txt')

          # Run the serial version

          print "Serial word frequency count:"

          text = open('davinci.txt').read()

          freqs = wordfreq(text)

          print_wordfreq(freqs, 10)

          # The parallel version

          print "\nParallel word frequency count:"

          # split the davinci.txt into one file per engine:

          lines = text.splitlines()

          nlines = len(lines)

          n = len(rc)

          block = nlines/n

          for i in range(n):

              chunk = lines[i*block:i*(block+1)]

              with open('davinci%i.txt'%i, 'w') as f:

                  f.write('\n'.join(chunk))

          cwd = os.path.abspath(os.getcwd())

          fnames = [ os.path.join(cwd, 'davinci%i.txt'%i) for i in range(n)]

          pfreqs = pwordfreq(view,fnames)

          print_wordfreq(freqs)

          # cleanup split files

          map(os.remove, fnames)

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

MinRK updates to docs and examples	r3670	#!/usr/bin/env python
		"""Parallel word frequency counter.

		This only works for a local cluster, because the filenames are local paths.
		"""


		import os
		import urllib

		from itertools import repeat

		from wordfreq import print_wordfreq, wordfreq

		from IPython.parallel import Client, Reference

		davinci_url = "http://www.gutenberg.org/cache/epub/5000/pg5000.txt"

		def pwordfreq(view, fnames):
		    """Parallel word frequency counter.

		    view - An IPython DirectView
		    fnames - The filenames containing the split data; one per target
		             of the view.

		    Returns a dict mapping each word to its count summed over all
		    engines.
		    """
		    assert len(fnames) == len(view.targets)
		    # Give each engine its own filename under the remote name 'fname'.
		    view.scatter('fname', fnames, flatten=True)
		    # NOTE(review): each engine calls wordfreq on its filename, whereas
		    # the serial path passes file contents -- confirm wordfreq accepts
		    # a path.
		    ar = view.apply(wordfreq, Reference('fname'))
		    freqs_list = ar.get()
		    # Union of all words seen by any engine.
		    word_set = set()
		    for f in freqs_list:
		        word_set.update(f.keys())
		    # Start every word at zero, then accumulate the partial counts.
		    freqs = dict(zip(word_set, repeat(0)))
		    for f in freqs_list:
		        for word, count in f.iteritems():
		            freqs[word] += count
		    return freqs

		if __name__ == '__main__':
		# Create a Client and View
		rc = Client()

		view = rc[:]

		if not os.path.exists('davinci.txt'):
		# download from project gutenberg
		print "Downloading Da Vinci's notebooks from Project Gutenberg"
		urllib.urlretrieve(davinci_url, 'davinci.txt')

		# Run the serial version
		print "Serial word frequency count:"
		text = open('davinci.txt').read()
		freqs = wordfreq(text)
		print_wordfreq(freqs, 10)


		# The parallel version
		print "\nParallel word frequency count:"
		# split the davinci.txt into one file per engine:
		lines = text.splitlines()
		nlines = len(lines)
		n = len(rc)
		block = nlines/n
		for i in range(n):
		chunk = lines[iblock:i(block+1)]
		with open('davinci%i.txt'%i, 'w') as f:
		f.write('\n'.join(chunk))

		cwd = os.path.abspath(os.getcwd())
		fnames = [ os.path.join(cwd, 'davinci%i.txt'%i) for i in range(n)]
		pfreqs = pwordfreq(view,fnames)
		print_wordfreq(freqs)
		# cleanup split files
		map(os.remove, fnames)