@@ -1,73 +1,80 @@
 #!/usr/bin/env python
 """Parallel word frequency counter.

 This only works for a local cluster, because the filenames are local paths.
 """


 import os
+import time
 import urllib

 from itertools import repeat

 from wordfreq import print_wordfreq, wordfreq

 from IPython.parallel import Client, Reference

 davinci_url = "http://www.gutenberg.org/cache/epub/5000/pg5000.txt"

 def pwordfreq(view, fnames):
     """Parallel word frequency counter.

     view - An IPython DirectView
     fnames - The filenames containing the split data.
     """
     assert len(fnames) == len(view.targets)
     view.scatter('fname', fnames, flatten=True)
     ar = view.apply(wordfreq, Reference('fname'))
     freqs_list = ar.get()
     word_set = set()
     for f in freqs_list:
         word_set.update(f.keys())
     freqs = dict(zip(word_set, repeat(0)))
     for f in freqs_list:
         for word, count in f.iteritems():
             freqs[word] += count
     return freqs

 if __name__ == '__main__':
     # Create a Client and View
     rc = Client()

     view = rc[:]

     if not os.path.exists('davinci.txt'):
         # download from project gutenberg
         print "Downloading Da Vinci's notebooks from Project Gutenberg"
         urllib.urlretrieve(davinci_url, 'davinci.txt')

     # Run the serial version
     print "Serial word frequency count:"
     text = open('davinci.txt').read()
+    tic = time.time()
     freqs = wordfreq(text)
+    toc = time.time()
     print_wordfreq(freqs, 10)
+    print "Took %.3f s to calculate" % (toc-tic)


     # The parallel version
     print "\nParallel word frequency count:"
     # split davinci.txt into one file per engine:
     lines = text.splitlines()
     nlines = len(lines)
     n = len(rc)
     block = nlines/n
     for i in range(n):
         chunk = lines[i*block:(i+1)*block]
         with open('davinci%i.txt' % i, 'w') as f:
             f.write('\n'.join(chunk))

     cwd = os.path.abspath(os.getcwd())
     fnames = [os.path.join(cwd, 'davinci%i.txt' % i) for i in range(n)]
+    tic = time.time()
     pfreqs = pwordfreq(view, fnames)
+    toc = time.time()
     print_wordfreq(pfreqs, 10)
+    print "Took %.3f s to calculate on %i engines" % (toc-tic, len(view.targets))
     # cleanup split files
     map(os.remove, fnames)
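The pattern this example relies on is scattering one argument per engine and then applying a function to a Reference to the scattered name, which each engine resolves in its own namespace. A minimal sketch of just that pattern, assuming a running local cluster and four hypothetical per-engine files part0.txt .. part3.txt::

    from IPython.parallel import Client, Reference

    rc = Client()
    view = rc[:]

    def count_words(fname):
        # toy stand-in for wordfreq: count words in a single file
        counts = {}
        for word in open(fname).read().split():
            counts[word] = counts.get(word, 0) + 1
        return counts

    # flatten=True assigns each engine one filename (a string), not a one-element list
    view.scatter('fname', ['part%i.txt' % i for i in range(4)], flatten=True)
    ar = view.apply(count_words, Reference('fname'))  # 'fname' resolves per engine
    freqs_list = ar.get()  # one dict per engine, merged by pwordfreq above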
@@ -1,37 +1,36 @@
 from IPython.parallel import *

 client = Client()
-view = client[:]
+view = client.load_balanced_view()

 @view.remote(block=True)
 def square(a):
     """return square of a number"""
     return a*a

 squares = map(square, range(42))

 # but that blocked between each result; not exactly useful

 square.block = False

 arlist = map(square, range(42))
 # submitted very fast

 # wait for the results:
 squares2 = [r.get() for r in arlist]

 # now the more convenient @parallel decorator, which has a map method:
-
-@view.parallel(block=False)
+view2 = client[:]
+@view2.parallel(block=False)
 def psquare(a):
     """return square of a number"""
     return a*a

 # this chunks the data into n-engines jobs, not 42 jobs:
 ar = psquare.map(range(42))

 # wait for the results to be done:
 squares3 = ar.get()
-
 print squares == squares2, squares3 == squares
 # True
\ No newline at end of file
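The view swap above is deliberate: @remote submits one task per call, so it pairs naturally with a load-balanced view, while @parallel chunks a whole sequence across the engines of a direct view. A sketch of the two side by side, assuming a running cluster (double and pdouble are illustrative names)::

    from IPython.parallel import Client

    client = Client()
    lview = client.load_balanced_view()
    dview = client[:]

    @lview.remote(block=True)
    def double(x):
        return 2*x

    @dview.parallel(block=True)
    def pdouble(x):
        return 2*x

    print double(21)              # one task, routed to whichever engine is free
    print pdouble.map(range(10))  # split into one chunk of work per engine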
@@ -1,86 +1,64 @@
 import time
 import numpy as np
 from IPython import parallel

 nlist = map(int, np.logspace(2,9,16,base=2))
 nlist2 = map(int, np.logspace(2,8,15,base=2))
 tlist = map(int, np.logspace(7,22,16,base=2))
 nt = 16
 def wait(t=0):
     import time
     time.sleep(t)

 def echo(s=''):
     return s

 def time_throughput(nmessages, t=0, f=wait):
     client = parallel.Client()
-    view = client
+    view = client.load_balanced_view()
     # do one ping before starting timing
     if f is echo:
         t = np.random.random(t/8)
     view.apply_sync(echo, '')
     client.spin()
     tic = time.time()
     for i in xrange(nmessages):
         view.apply(f, t)
     lap = time.time()
-    client.
+    client.wait()
     toc = time.time()
     return lap-tic, toc-tic

-def time_twisted(nmessages, t=0, f=wait):
-    from IPython.kernel import client as kc
-    client = kc.TaskClient()
-    if f is wait:
-        s = "import time; time.sleep(%f)"%t
-        task = kc.StringTask(s)
-    elif f is echo:
-        t = np.random.random(t/8)
-        s = "s=t"
-        task = kc.StringTask(s, push=dict(t=t), pull=['s'])
-    else:
-        raise
-    # do one ping before starting timing
-    client.barrier(client.run(task))
-    tic = time.time()
-    tids = []
-    for i in xrange(nmessages):
-        tids.append(client.run(task))
-    lap = time.time()
-    client.barrier(tids)
-    toc = time.time()
-    return lap-tic, toc-tic

 def do_runs(nlist, t=0, f=wait, trials=2, runner=time_throughput):
     A = np.zeros((len(nlist),2))
     for i,n in enumerate(nlist):
         t1 = t2 = 0
         for _ in range(trials):
             time.sleep(.25)
             ts = runner(n,t,f)
             t1 += ts[0]
             t2 += ts[1]
         t1 /= trials
         t2 /= trials
         A[i] = (t1,t2)
         A[i] = n/A[i]
         print n,A[i]
     return A

 def do_echo(n, tlist=[0], f=echo, trials=2, runner=time_throughput):
     A = np.zeros((len(tlist),2))
     for i,t in enumerate(tlist):
         t1 = t2 = 0
         for _ in range(trials):
             time.sleep(.25)
             ts = runner(n,t,f)
             t1 += ts[0]
             t2 += ts[1]
         t1 /= trials
         t2 /= trials
         A[i] = (t1,t2)
         A[i] = n/A[i]
         print t,A[i]
     return A

\ No newline at end of file
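The timing idiom that survives the rewrite is: submit asynchronously in a loop, record the time when the last submit returns (submission throughput), then block until all tasks complete (round-trip throughput). A condensed sketch of that idiom, assuming a running cluster::

    import time
    from IPython import parallel

    client = parallel.Client()
    view = client.load_balanced_view()

    def noop():
        pass

    tic = time.time()
    for i in range(100):
        view.apply(noop)   # non-blocking submit
    lap = time.time()      # all 100 tasks submitted
    client.wait()          # block until every outstanding task is done
    toc = time.time()
    print "submit: %.3fs, round-trip: %.3fs" % (lap-tic, toc-tic)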
@@ -1,17 +1,17 @@
 from IPython.parallel import Client

 rc = Client()
 view = rc[:]
 result = view.map_sync(lambda x: 2*x, range(10))
 print "Simple, default map: ", result

 ar = view.map_async(lambda x: 2*x, range(10))
 print "Submitted map, got AsyncResult: ", ar
 result = ar.r
 print "Using map_async: ", result

 @view.parallel(block=True)
 def f(x): return 2*x

-result = f(range(10))
+result = f.map(range(10))
 print "Using a parallel function: ", result
\ No newline at end of file
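The change from f(range(10)) to f.map(range(10)) is more than cosmetic: calling a @parallel-decorated function directly hands each engine a chunk of the sequence, so the body sees a list and 2*x becomes list repetition rather than arithmetic, while .map applies the body element-wise. A sketch of the difference, under the same cluster assumptions::

    @view.parallel(block=True)
    def f(x): return 2*x

    f.map(range(10))  # element-wise: [0, 2, 4, ..., 18]
    f(range(10))      # chunk-wise: an engine holding [0, 1, 2] computes
                      # 2*[0, 1, 2] == [0, 1, 2, 0, 1, 2]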
@@ -1,40 +1,40 @@
 """Parallel histogram function"""
 import numpy
-from IPython.
+from IPython.parallel import Reference

 def phistogram(view, a, bins=10, rng=None, normed=False):
     """Compute the histogram of a remote array a.

     Parameters
     ----------
     view
         IPython DirectView instance
     a : str
         String name of the remote array
     bins : int
         Number of histogram bins
     rng : (float, float)
         Tuple of min, max of the range to histogram
     normed : boolean
         Should the histogram counts be normalized to 1
     """
     nengines = len(view.targets)

     # view.push(dict(bins=bins, rng=rng))
     with view.sync_imports():
         import numpy
     rets = view.apply_sync(lambda a, b, rng: numpy.histogram(a, b, rng), Reference(a), bins, rng)
     hists = [r[0] for r in rets]
     lower_edges = [r[1] for r in rets]
     # view.execute('hist, lower_edges = numpy.histogram(%s, bins, rng)' % a)
     lower_edges = lower_edges[0]  # every engine returns the same edges when rng is fixed
     hist_array = numpy.array(hists).reshape(nengines, -1)
     # hist_array.shape = (nengines,-1)
     total_hist = numpy.sum(hist_array, 0)
     if normed:
         total_hist = total_hist/numpy.sum(total_hist, dtype=float)
     return total_hist, lower_edges



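Usage sketch for phistogram: the per-engine counts can only be summed because every engine bins over the same explicit range, so the bin edges line up. Assuming a running cluster and that the local array is scattered under the (hypothetical) name 'data'::

    import numpy
    from IPython.parallel import Client

    rc = Client()
    view = rc[:]

    view.scatter('data', numpy.random.random(100000))  # each engine gets a slice

    # a fixed rng keeps the edges identical everywhere, which is what
    # makes summing the per-engine counts valid
    hist, edges = phistogram(view, 'data', bins=10, rng=(0.0, 1.0))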
@@ -1,70 +1,71 @@
 #!/usr/bin/env python
 """Test the performance of the task farming system.

 This script submits a set of tasks via a LoadBalancedView. The tasks
 are basically just a time.sleep(t), where t is a random number between
 two limits that can be configured at the command line. To run
 the script there must first be an IPython controller and engines running::

     ipclusterz start -n 16

 A good test to run with 16 engines is::

     python task_profiler.py -n 128 -t 0.01 -T 1.0

 This should show a speedup of 13-14x. The limitation here is that the
 overhead of a single task is about 0.001-0.01 seconds.
 """
 import random, sys
 from optparse import OptionParser

 from IPython.utils.timing import time
 from IPython.parallel import Client

 def main():
     parser = OptionParser()
     parser.set_defaults(n=100)
-    parser.set_defaults(tmin=1)
-    parser.set_defaults(tmax=
+    parser.set_defaults(tmin=1e-3)
+    parser.set_defaults(tmax=1)
     parser.set_defaults(profile='default')

     parser.add_option("-n", type='int', dest='n',
         help='the number of tasks to run')
     parser.add_option("-t", type='float', dest='tmin',
         help='the minimum task length in seconds')
     parser.add_option("-T", type='float', dest='tmax',
         help='the maximum task length in seconds')
     parser.add_option("-p", '--profile', type='str', dest='profile',
         help="the cluster profile [default: 'default']")

     (opts, args) = parser.parse_args()
     assert opts.tmax >= opts.tmin, "tmax must not be smaller than tmin"

     rc = Client()
     view = rc.load_balanced_view()
     print view
     rc.block = True
     nengines = len(rc.ids)
-    rc[:].execute('from IPython.utils.timing import time')
+    with rc[:].sync_imports():
+        from IPython.utils.timing import time

     # the jobs should take a random time within a range
     times = [random.random()*(opts.tmax-opts.tmin)+opts.tmin for i in range(opts.n)]
     stime = sum(times)

     print "executing %i tasks, totalling %.1f secs on %i engines" % (opts.n, stime, nengines)
     time.sleep(1)
     start = time.time()
     amr = view.map(time.sleep, times)
     amr.get()
     stop = time.time()

     ptime = stop-start
     scale = stime/ptime

     print "executed %.1f secs in %.1f secs" % (stime, ptime)
     print "%.3fx parallel performance on %i engines" % (scale, nengines)
     print "%.1f%% of theoretical max" % (100*scale/nengines)


 if __name__ == '__main__':
     main()
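As a sanity check on the docstring's 13-14x claim, the script's own formulas give the expected numbers; a back-of-the-envelope sketch (average task length is the midpoint of the uniform range)::

    n, tmin, tmax, nengines = 128, 0.01, 1.0, 16
    stime = n * (tmin + tmax) / 2.0   # ~64.6 s of total sleep on average
    ideal = stime / nengines          # ~4.0 s if scheduling were free
    # an observed 13-14x speedup is therefore 81-88% of the 16x theoretical
    # max; the gap is the 0.001-0.01 s per-task overhead noted above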