##// END OF EJS Templates
proper python exception handling
stonebig <stonebig> -
Show More
@@ -1,90 +1,90 b''
1 1 #!/usr/bin/env python
2 2 """Parallel word frequency counter.
3 3
4 4 This only works for a local cluster, because the filenames are local paths.
5 5 """
6 6
7 7
8 8 import os
9 9 import time
10 10 import urllib
11 11
12 12 from itertools import repeat
13 13
14 14 from wordfreq import print_wordfreq, wordfreq
15 15
16 16 from IPython.parallel import Client, Reference
17 17
18 18 from __future__ import division
19 19
20 20 try : #python2
21 21 from urllib import urlretrieve
22 except : #python3
22 except ImportError: #python3
23 23 from urllib.request import urlretrieve
24 24
25 25 davinci_url = "http://www.gutenberg.org/cache/epub/5000/pg5000.txt"
26 26
def pwordfreq(view, fnames):
    """Count word frequencies over several files, one file per engine.

    view - An IPython DirectView
    fnames - The filenames containing the split data.

    Returns a dict mapping every word seen to its summed count.
    """
    assert len(fnames) == len(view.targets)
    # Each engine receives exactly one filename under the name 'fname'.
    view.scatter('fname', fnames, flatten=True)
    async_result = view.apply(wordfreq, Reference('fname'))
    per_engine = async_result.get()

    # Gather the full vocabulary first so every word starts at zero.
    vocabulary = set()
    for partial in per_engine:
        vocabulary.update(partial.keys())
    totals = dict.fromkeys(vocabulary, 0)

    # Fold the per-engine counts into the combined table.
    for partial in per_engine:
        for term, occurrences in partial.items():
            totals[term] += occurrences
    return totals
45 45
if __name__ == '__main__':
    # Create a Client and View
    rc = Client()

    view = rc[:]

    if not os.path.exists('davinci.txt'):
        # download from project gutenberg
        print("Downloading Da Vinci's notebooks from Project Gutenberg")
        urlretrieve(davinci_url, 'davinci.txt')

    # Run the serial version
    print("Serial word frequency count:")
    # Read through a context manager so the file handle is always closed.
    with open('davinci.txt') as f:
        text = f.read()
    tic = time.time()
    freqs = wordfreq(text)
    toc = time.time()
    print_wordfreq(freqs, 10)
    print("Took %.3f s to calculate"%(toc-tic))


    # The parallel version
    print("\nParallel word frequency count:")
    # split the davinci.txt into one file per engine:
    lines = text.splitlines()
    nlines = len(lines)
    n = len(rc)
    block = nlines//n
    for i in range(n):
        start = i*block
        # Each engine gets `block` consecutive lines; the last engine also
        # takes the remainder so no line is dropped by the integer division.
        stop = nlines if i == n-1 else start+block
        chunk = lines[start:stop]
        with open('davinci%i.txt'%i, 'w') as f:
            f.write('\n'.join(chunk))

    try: #python2
        cwd = os.path.abspath(os.getcwdu())
    except AttributeError: #python3
        cwd = os.path.abspath(os.getcwd())
    fnames = [os.path.join(cwd, 'davinci%i.txt'%i) for i in range(n)]
    tic = time.time()
    pfreqs = pwordfreq(view, fnames)
    toc = time.time()
    # Report the parallel result (not the serial `freqs` computed above).
    print_wordfreq(pfreqs)
    print("Took %.3f s to calculate on %i engines"%(toc-tic, len(view.targets)))
    # cleanup split files; an explicit loop because map() is lazy on
    # Python 3 and would never actually call os.remove.
    for fname in fnames:
        os.remove(fname)
@@ -1,162 +1,162 b''
1 1 """Compute statistics on the digits of pi.
2 2
3 3 This uses precomputed digits of pi from the website
4 4 of Professor Yasumasa Kanada at the University of
5 5 Tokyo: http://www.super-computing.org/
6 6
7 7 Currently, there are only functions to read the
8 8 .txt (non-compressed, non-binary) files, but adding
9 9 support for compression and binary files would be
10 10 straightforward.
11 11
12 12 This focuses on computing the number of times that
13 13 all 1-, 2-, ..., n-digit sequences occur in the digits of pi.
14 14 If the digits of pi are truly random, these frequencies
15 15 should be equal.
16 16 """
17 17
18 18 # Import statements
19 19 from __future__ import division, with_statement
20 20
21 21 import numpy as np
22 22 from matplotlib import pyplot as plt
23 23
24 24 try : #python2
25 25 from urllib import urlretrieve
26 except : #python3
26 except ImportError : #python3
27 27 from urllib.request import urlretrieve
28 28
29 29 # Top-level functions
30 30
def fetch_pi_file(filename):
    """Make sure a segment of pi from super-computing.org is on disk.

    Nothing is downloaded when `filename` already exists locally.
    """
    import os, urllib
    if not os.path.exists(filename):
        # not present yet: download it
        remote_dir = "ftp://pi.super-computing.org/.2/pi200m/"
        urlretrieve(remote_dir + filename, filename)
43 43
def compute_one_digit_freqs(filename):
    """
    Compute the 1 digit frequencies of the digits of pi stored in a file.
    """
    return one_digit_freqs(txt_file_to_digits(filename))
51 51
def compute_two_digit_freqs(filename):
    """
    Compute the 2 digit frequencies of the digits of pi stored in a file.
    """
    return two_digit_freqs(txt_file_to_digits(filename))
59 59
def reduce_freqs(freqlist):
    """
    Sum a list of freq count arrays element-wise into one total array.

    The result has the shape and dtype of the first entry in `freqlist`.
    """
    total = np.zeros_like(freqlist[0])
    for counts in freqlist:
        # in-place accumulation into the running total
        np.add(total, counts, out=total)
    return total
68 68
def compute_n_digit_freqs(filename, n):
    """
    Compute the n digit frequencies of the digits of pi stored in a file.
    """
    return n_digit_freqs(txt_file_to_digits(filename), n)
76 76
77 77 # Read digits from a txt file
78 78
def txt_file_to_digits(filename, the_type=str):
    """
    Yield the digits of pi read from a .txt file.

    Newline and space characters are skipped; every other character is
    converted with `the_type` (str by default) and yielded one at a time.
    """
    with open(filename, 'r') as f:
        # Iterate the file object directly so lines are streamed; calling
        # readlines() would hold the whole file in memory before the first
        # digit is yielded.
        for line in f:
            for c in line:
                if c != '\n' and c != ' ':
                    yield the_type(c)
88 88
89 89 # Actual counting functions
90 90
def one_digit_freqs(digits, normalize=False):
    """
    Tally how often each single digit 0-9 occurs in `digits`.

    Returns a length-10 array of counts, or of fractions of the total
    when `normalize` is true.
    """
    counts = np.zeros(10, dtype='i4')
    for digit in digits:
        counts[int(digit)] += 1
    if not normalize:
        return counts
    return counts/counts.sum()
101 101
def two_digit_freqs(digits, normalize=False):
    """
    Consume digits of pi and compute 2 digits freq. counts.

    digits - any iterable of single digits (str or int).
    normalize - if true, return fractions of the total instead of counts.

    Returns a length-100 array where entry 10*a + b is the number of times
    digit a is immediately followed by digit b. Every adjacent pair is
    counted, including the final one. Input with fewer than two digits
    gives all zeros.
    """
    freqs = np.zeros(100, dtype='i4')
    # Accept lists and strings as well as generators.
    stream = iter(digits)
    first = next(stream, None)
    if first is not None:
        last = int(first)
        for d in stream:
            this = int(d)
            # Combine numerically so both str and int digits index correctly.
            freqs[last*10 + this] += 1
            last = this
    if normalize:
        total = freqs.sum()
        # Avoid 0/0 when no pair was seen.
        freqs = freqs/total if total else freqs.astype(float)
    return freqs
117 117
def n_digit_freqs(digits, n, normalize=False):
    """
    Consume digits of pi and compute n digits freq. counts.

    This should only be used for 1-6 digits.

    digits - any iterable of single digits (str or int).
    n - length of the digit sequences to count.
    normalize - if true, return fractions of the total instead of counts.

    Returns an array of length 10**n whose entry k counts the occurrences
    of the n-digit sequence with decimal value k. Every window of n
    consecutive digits is counted, including the final one. Input shorter
    than n digits gives all zeros.
    """
    size = pow(10, n)
    freqs = np.zeros(size, dtype='i4')
    window = 0  # decimal value of the most recent (up to) n digits
    seen = 0
    for d in digits:
        # Shift the new digit in and drop the oldest one via the modulo.
        window = (window*10 + int(d)) % size
        seen += 1
        if seen >= n:
            freqs[window] += 1
    if normalize:
        total = freqs.sum()
        # Avoid 0/0 when no complete window was seen.
        freqs = freqs/total if total else freqs.astype(float)
    return freqs
136 136
137 137 # Plotting functions
138 138
def plot_two_digit_freqs(f2):
    """
    Show two digit frequency counts as a labelled 10x10 matrix plot.
    """
    grid = f2.copy()
    grid.shape = (10, 10)
    image = plt.matshow(grid)
    plt.colorbar()
    # Write the two-digit label onto every cell, column by column.
    for col, row in np.ndindex(10, 10):
        plt.text(col-0.2, row+0.2, str(row)+str(col))
    plt.ylabel('First digit')
    plt.xlabel('Second digit')
    return image
153 153
def plot_one_digit_freqs(f1):
    """
    Draw the single digit frequency counts as a line plot with markers.
    """
    line = plt.plot(f1, 'bo-')
    plt.title('Single digit counts in pi')
    plt.ylabel('Count')
    plt.xlabel('Digit')
    return line
General Comments 0
You need to be logged in to leave comments. Login now