upstream/ipython Files · examples/Parallel Computing/daVinci Word Count/pwordfreq.py

Initial messing around....

Initial messing around. LaTeX tab completion will have to be done outside the normal completer logic, as the completer's line-splitting logic uses \\ as a special character to split lines on. I probably want to put the LaTeX completions first and, if it finds any matches, not do any other completion logic. The only issue is that this might short-circuit dir/path matching on Windows. Hmmm.

MinRK - - Load All Authors

File last commit:

r16150:d2688784


                r17700:7b6d94ef

Download file

             pwordfreq.py
        
                    89 lines
            
             | 2.5 KiB
            
                | text/x-python
            
             |
                PythonLexer
            
             / examples / Parallel Computing / daVinci Word Count / pwordfreq.py
          
                    History
                
                 |
                  Annotation
                 | Raw
                 |Copy content
                 |Copy permalink

      #!/usr/bin/env python

      """Parallel word frequency counter.

      This only works for a local cluster, because the filenames are local paths.

      """

      from __future__ import division

      import os

      import time

      import urllib

      from itertools import repeat

      from wordfreq import print_wordfreq, wordfreq

      from IPython.parallel import Client, Reference

      try: #python2

          from urllib import urlretrieve

      except ImportError: #python3

          from urllib.request import urlretrieve

      # URL that the __main__ section downloads into 'davinci.txt' when that
      # file does not already exist in the working directory.
      davinci_url = "http://www.gutenberg.org/cache/epub/5000/pg5000.txt"

      def pwordfreq(view, fnames):
          """Count word frequencies across all engines of a view.

          view - An IPython DirectView
          fnames - The filenames containing the split data; there must be
                   exactly one filename per target of the view.

          Returns a dict mapping every word reported by any engine to the
          sum of its per-engine counts.
          """
          assert len(fnames) == len(view.targets)

          # Give each engine its own filename under the remote name 'fname'.
          view.scatter('fname', fnames, flatten=True)

          # NOTE(review): wordfreq receives the engine-local value of 'fname';
          # whether it opens that file or treats the argument as raw text
          # depends on wordfreq's definition, which is not visible here.
          per_engine = view.apply(wordfreq, Reference('fname')).get()

          # Start every word seen on any engine at zero, then add up the
          # partial counts.
          totals = dict.fromkeys(set().union(*per_engine), 0)
          for partial in per_engine:
              for token, hits in partial.items():
                  totals[token] += hits
          return totals

      if __name__ == '__main__':
          # Script entry point: time the serial word count, then split the
          # text into one file per engine and time the parallel word count.

          # Create a Client and View
          rc = Client()
          view = rc[:]

          if not os.path.exists('davinci.txt'):
              # download from project gutenberg
              print("Downloading Da Vinci's notebooks from Project Gutenberg")
              urlretrieve(davinci_url, 'davinci.txt')

          # Run the serial version
          print("Serial word frequency count:")
          # Context manager so the file handle is closed right after reading.
          with open('davinci.txt') as f:
              text = f.read()
          tic = time.time()
          freqs = wordfreq(text)
          toc = time.time()
          print_wordfreq(freqs, 10)
          print("Took %.3f s to calculate"%(toc-tic))

          # The parallel version
          print("\nParallel word frequency count:")
          # split the davinci.txt into one file per engine:
          lines = text.splitlines()
          nlines = len(lines)
          n = len(rc)
          block = nlines//n
          for i in range(n):
              start = i*block
              # Chunk i covers lines [i*block, (i+1)*block); the last chunk
              # also takes the nlines % n leftover lines so nothing is lost.
              stop = nlines if i == n-1 else start+block
              chunk = lines[start:stop]
              with open('davinci%i.txt'%i, 'w') as f:
                  f.write('\n'.join(chunk))

          try: #python2
              cwd = os.path.abspath(os.getcwdu())
          except AttributeError: #python3
              cwd = os.path.abspath(os.getcwd())
          # Absolute paths, since the filenames are handed to the engines.
          fnames = [ os.path.join(cwd, 'davinci%i.txt'%i) for i in range(n)]

          tic = time.time()
          pfreqs = pwordfreq(view,fnames)
          toc = time.time()
          # Report the parallel result, not the serial one computed above.
          print_wordfreq(pfreqs)
          print("Took %.3f s to calculate on %i engines"%(toc-tic, len(view.targets)))

          # cleanup split files
          # (explicit loop: on Python 3 a bare map() is lazy and removes nothing)
          for fname in fnames:
              os.remove(fname)

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

				#!/usr/bin/env python
				"""Parallel word frequency counter.

				This only works for a local cluster, because the filenames are local paths.
				"""
				from __future__ import division


				import os
				import time
				import urllib

				from itertools import repeat

				from wordfreq import print_wordfreq, wordfreq

				from IPython.parallel import Client, Reference

				try: #python2
				from urllib import urlretrieve
				except ImportError: #python3
				from urllib.request import urlretrieve

				# URL that the __main__ section downloads into 'davinci.txt' when that
				# file does not already exist in the working directory.
				davinci_url = "http://www.gutenberg.org/cache/epub/5000/pg5000.txt"

				def pwordfreq(view, fnames):
				    """Parallel word frequency counter.

				    view - An IPython DirectView
				    fnames - The filenames containing the split data; there must be
				             exactly one filename per target of the view.

				    Returns a dict mapping every word reported by any engine to the
				    sum of its per-engine counts.
				    """
				    assert len(fnames) == len(view.targets)
				    # Give each engine its own filename under the remote name 'fname'.
				    view.scatter('fname', fnames, flatten=True)
				    # NOTE(review): wordfreq receives the engine-local value of 'fname';
				    # whether it opens that file or treats the argument as raw text
				    # depends on wordfreq's definition, which is not visible here.
				    ar = view.apply(wordfreq, Reference('fname'))
				    freqs_list = ar.get()
				    # Collect every word seen on any engine.
				    word_set = set()
				    for f in freqs_list:
				        word_set.update(f.keys())
				    # Start all counts at zero, then add up the per-engine counts.
				    freqs = dict(zip(word_set, repeat(0)))
				    for f in freqs_list:
				        for word, count in f.items():
				            freqs[word] += count
				    return freqs

				if __name__ == '__main__':
				    # Script entry point: time the serial word count, then split the
				    # text into one file per engine and time the parallel word count.

				    # Create a Client and View
				    rc = Client()
				    view = rc[:]

				    if not os.path.exists('davinci.txt'):
				        # download from project gutenberg
				        print("Downloading Da Vinci's notebooks from Project Gutenberg")
				        urlretrieve(davinci_url, 'davinci.txt')

				    # Run the serial version
				    print("Serial word frequency count:")
				    # Context manager so the file handle is closed right after reading.
				    with open('davinci.txt') as f:
				        text = f.read()
				    tic = time.time()
				    freqs = wordfreq(text)
				    toc = time.time()
				    print_wordfreq(freqs, 10)
				    print("Took %.3f s to calculate"%(toc-tic))

				    # The parallel version
				    print("\nParallel word frequency count:")
				    # split the davinci.txt into one file per engine:
				    lines = text.splitlines()
				    nlines = len(lines)
				    n = len(rc)
				    block = nlines//n
				    for i in range(n):
				        start = i*block
				        # Chunk i covers lines [i*block, (i+1)*block); the last chunk
				        # also takes the nlines % n leftover lines so nothing is lost.
				        stop = nlines if i == n-1 else start+block
				        chunk = lines[start:stop]
				        with open('davinci%i.txt'%i, 'w') as f:
				            f.write('\n'.join(chunk))

				    try: #python2
				        cwd = os.path.abspath(os.getcwdu())
				    except AttributeError: #python3
				        cwd = os.path.abspath(os.getcwd())
				    # Absolute paths, since the filenames are handed to the engines.
				    fnames = [ os.path.join(cwd, 'davinci%i.txt'%i) for i in range(n)]

				    tic = time.time()
				    pfreqs = pwordfreq(view,fnames)
				    toc = time.time()
				    # Report the parallel result, not the serial one computed above.
				    print_wordfreq(pfreqs)
				    print("Took %.3f s to calculate on %i engines"%(toc-tic, len(view.targets)))

				    # cleanup split files
				    # (explicit loop: on Python 3 a bare map() is lazy and removes nothing)
				    for fname in fnames:
				        os.remove(fname)