upstream/ipython Files · examples/parallel/davinci/pwordfreq.py

Merge remote-tracking branch 'upstream/master'

stonebig <stonebig> - - Load All Authors

File last commit:

r14208:4f12e03d


                r15247:13bba28f

Download file

             pwordfreq.py
        
                    90 lines
            
             | 2.5 KiB
            
                | text/x-python
            
             |
                PythonLexer
            
             / examples / parallel / davinci / pwordfreq.py
          
                    History
                
                 |
                  Source
                 | Raw
                 |Copy content
                 |Copy permalink

        MinRK
    
updates to docs and examples

              r3670
            
      #!/usr/bin/env python

      """Parallel word frequency counter.

      This only works for a local cluster, because the filenames are local paths.

      """

      import os

        MinRK
    
update a few parallel examples...

              r4184
            
      import time

        MinRK
    
updates to docs and examples

              r3670
            
      import urllib

      from itertools import repeat

      from wordfreq import print_wordfreq, wordfreq

      from IPython.parallel import Client, Reference

        stonebig <stonebig>
    
make davinci parallel example python3 compatible

              r14201
            
      from __future__ import division 

        stonebig <stonebig>
    
proper python exception handling

              r14208
            
      try: #python2

        stonebig <stonebig>
    
simpler python2 compatibility for davinci code

              r14207
            
          from urllib import urlretrieve

        stonebig <stonebig>
    
proper python exception handling

              r14208
            
      except ImportError: #python3

        stonebig <stonebig>
    
simpler python2 compatibility for davinci code

              r14207
            
          from urllib.request import urlretrieve

        MinRK
    
updates to docs and examples

              r3670
            
      davinci_url = "http://www.gutenberg.org/cache/epub/5000/pg5000.txt"

      def pwordfreq(view, fnames):

          """Parallel word frequency counter.

          view - An IPython DirectView

          fnames - The filenames containing the split data.

          """

          assert len(fnames) == len(view.targets)

          view.scatter('fname', fnames, flatten=True)

          ar = view.apply(wordfreq, Reference('fname'))

          freqs_list = ar.get()

          word_set = set()

          for f in freqs_list:

              word_set.update(f.keys())

          freqs = dict(zip(word_set, repeat(0)))

          for f in freqs_list:

        stonebig <stonebig>
    
make davinci parallel example python3 compatible

              r14201
            
              for word, count in f.items():

        MinRK
    
updates to docs and examples

              r3670
            
                  freqs[word] += count

          return freqs

      if __name__ == '__main__':

          # Create a Client and View

          rc = Client()

          view = rc[:]

          if not os.path.exists('davinci.txt'):

              # download from project gutenberg

        Thomas Kluyver
    
Update print syntax in parallel examples.

              r6455
            
              print("Downloading Da Vinci's notebooks from Project Gutenberg")

        stonebig <stonebig>
    
simpler python2 compatibility for davinci code

              r14207
            
              urlretrieve(davinci_url, 'davinci.txt')

        MinRK
    
updates to docs and examples

              r3670
            
          # Run the serial version

        Thomas Kluyver
    
Update print syntax in parallel examples.

              r6455
            
          print("Serial word frequency count:")

        MinRK
    
updates to docs and examples

              r3670
            
          text = open('davinci.txt').read()

        MinRK
    
update a few parallel examples...

              r4184
            
          tic = time.time()

        MinRK
    
updates to docs and examples

              r3670
            
          freqs = wordfreq(text)

        MinRK
    
update a few parallel examples...

              r4184
            
          toc = time.time()

        MinRK
    
updates to docs and examples

              r3670
            
          print_wordfreq(freqs, 10)

        stonebig <stonebig>
    
simpler python2 compatibility for davinci code

              r14207
            
          print("Took %.3f s to calculate"%(toc-tic))

        MinRK
    
updates to docs and examples

              r3670
            
          # The parallel version

        Thomas Kluyver
    
Update print syntax in parallel examples.

              r6455
            
          print("\nParallel word frequency count:")

        MinRK
    
updates to docs and examples

              r3670
            
          # split the davinci.txt into one file per engine:

          lines = text.splitlines()

          nlines = len(lines)

          n = len(rc)

        stonebig <stonebig>
    
make davinci parallel example python3 compatible

              r14201
            
          block = nlines//n

        MinRK
    
updates to docs and examples

              r3670
            
          for i in range(n):

              chunk = lines[i*block:i*(block+1)]

              with open('davinci%i.txt'%i, 'w') as f:

                  f.write('\n'.join(chunk))

        stonebig <stonebig>
    
proper python exception handling

              r14208
            
          try: #python2

        stonebig <stonebig>
    
make davinci parallel example python3 compatible

              r14201
            
              cwd = os.path.abspath(os.getcwdu())

        stonebig <stonebig>
    
proper python exception handling

              r14208
            
          except AttributeError: #python3

        stonebig <stonebig>
    
make davinci parallel example python3 compatible

              r14201
            
              cwd = os.path.abspath(os.getcwd())

        MinRK
    
updates to docs and examples

              r3670
            
          fnames = [ os.path.join(cwd, 'davinci%i.txt'%i) for i in range(n)]

        MinRK
    
update a few parallel examples...

              r4184
            
          tic = time.time()

        MinRK
    
updates to docs and examples

              r3670
            
          pfreqs = pwordfreq(view,fnames)

        MinRK
    
update a few parallel examples...

              r4184
            
          toc = time.time()

        MinRK
    
updates to docs and examples

              r3670
            
          print_wordfreq(freqs)

        stonebig <stonebig>
    
simpler python2 compatibility for davinci code

              r14207
            
          print("Took %.3f s to calculate on %i engines"%(toc-tic, len(view.targets)))

        MinRK
    
updates to docs and examples

              r3670
            
          # cleanup split files

          map(os.remove, fnames)

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

MinRK updates to docs and examples	r3670	#!/usr/bin/env python
		"""Parallel word frequency counter.

		This only works for a local cluster, because the filenames are local paths.
		"""


		import os
MinRK update a few parallel examples...	r4184	import time
MinRK updates to docs and examples	r3670	import urllib

		from itertools import repeat

		from wordfreq import print_wordfreq, wordfreq

		from IPython.parallel import Client, Reference

stonebig <stonebig> make davinci parallel example python3 compatible	r14201	from __future__ import division

stonebig <stonebig> proper python exception handling	r14208	try: #python2
stonebig <stonebig> simpler python2 compatibility for davinci code	r14207	from urllib import urlretrieve
stonebig <stonebig> proper python exception handling	r14208	except ImportError: #python3
stonebig <stonebig> simpler python2 compatibility for davinci code	r14207	from urllib.request import urlretrieve

MinRK updates to docs and examples	r3670	davinci_url = "http://www.gutenberg.org/cache/epub/5000/pg5000.txt"

		def pwordfreq(view, fnames):
		"""Parallel word frequency counter.

		view - An IPython DirectView
		fnames - The filenames containing the split data.
		"""
		assert len(fnames) == len(view.targets)
		view.scatter('fname', fnames, flatten=True)
		ar = view.apply(wordfreq, Reference('fname'))
		freqs_list = ar.get()
		word_set = set()
		for f in freqs_list:
		word_set.update(f.keys())
		freqs = dict(zip(word_set, repeat(0)))
		for f in freqs_list:
stonebig <stonebig> make davinci parallel example python3 compatible	r14201	for word, count in f.items():
MinRK updates to docs and examples	r3670	freqs[word] += count
		return freqs

		if __name__ == '__main__':
		# Create a Client and View
		rc = Client()

		view = rc[:]

		if not os.path.exists('davinci.txt'):
		# download from project gutenberg
Thomas Kluyver Update print syntax in parallel examples.	r6455	print("Downloading Da Vinci's notebooks from Project Gutenberg")
stonebig <stonebig> simpler python2 compatibility for davinci code	r14207	urlretrieve(davinci_url, 'davinci.txt')
MinRK updates to docs and examples	r3670
		# Run the serial version
Thomas Kluyver Update print syntax in parallel examples.	r6455	print("Serial word frequency count:")
MinRK updates to docs and examples	r3670	text = open('davinci.txt').read()
MinRK update a few parallel examples...	r4184	tic = time.time()
MinRK updates to docs and examples	r3670	freqs = wordfreq(text)
MinRK update a few parallel examples...	r4184	toc = time.time()
MinRK updates to docs and examples	r3670	print_wordfreq(freqs, 10)
stonebig <stonebig> simpler python2 compatibility for davinci code	r14207	print("Took %.3f s to calculate"%(toc-tic))
MinRK updates to docs and examples	r3670

		# The parallel version
Thomas Kluyver Update print syntax in parallel examples.	r6455	print("\nParallel word frequency count:")
MinRK updates to docs and examples	r3670	# split the davinci.txt into one file per engine:
		lines = text.splitlines()
		nlines = len(lines)
		n = len(rc)
stonebig <stonebig> make davinci parallel example python3 compatible	r14201	block = nlines//n
MinRK updates to docs and examples	r3670	for i in range(n):
		chunk = lines[iblock:i(block+1)]
		with open('davinci%i.txt'%i, 'w') as f:
		f.write('\n'.join(chunk))

stonebig <stonebig> proper python exception handling	r14208	try: #python2
stonebig <stonebig> make davinci parallel example python3 compatible	r14201	cwd = os.path.abspath(os.getcwdu())
stonebig <stonebig> proper python exception handling	r14208	except AttributeError: #python3
stonebig <stonebig> make davinci parallel example python3 compatible	r14201	cwd = os.path.abspath(os.getcwd())
MinRK updates to docs and examples	r3670	fnames = [ os.path.join(cwd, 'davinci%i.txt'%i) for i in range(n)]
MinRK update a few parallel examples...	r4184	tic = time.time()
MinRK updates to docs and examples	r3670	pfreqs = pwordfreq(view,fnames)
MinRK update a few parallel examples...	r4184	toc = time.time()
MinRK updates to docs and examples	r3670	print_wordfreq(freqs)
stonebig <stonebig> simpler python2 compatibility for davinci code	r14207	print("Took %.3f s to calculate on %i engines"%(toc-tic, len(view.targets)))
MinRK updates to docs and examples	r3670	# cleanup split files
		map(os.remove, fnames)