##// END OF EJS Templates
proper python exception handling
stonebig <stonebig> -
Show More
@@ -1,90 +1,90 b''
1 #!/usr/bin/env python
1 #!/usr/bin/env python
2 """Parallel word frequency counter.
2 """Parallel word frequency counter.
3
3
4 This only works for a local cluster, because the filenames are local paths.
4 This only works for a local cluster, because the filenames are local paths.
5 """
5 """
6
6
7
7
8 import os
8 import os
9 import time
9 import time
10 import urllib
10 import urllib
11
11
12 from itertools import repeat
12 from itertools import repeat
13
13
14 from wordfreq import print_wordfreq, wordfreq
14 from wordfreq import print_wordfreq, wordfreq
15
15
16 from IPython.parallel import Client, Reference
16 from IPython.parallel import Client, Reference
17
17
18 from __future__ import division
18 from __future__ import division
19
19
20 try : #python2
20 try: #python2
21 from urllib import urlretrieve
21 from urllib import urlretrieve
22 except : #python3
22 except ImportError: #python3
23 from urllib.request import urlretrieve
23 from urllib.request import urlretrieve
24
24
25 davinci_url = "http://www.gutenberg.org/cache/epub/5000/pg5000.txt"
25 davinci_url = "http://www.gutenberg.org/cache/epub/5000/pg5000.txt"
26
26
def pwordfreq(view, fnames):
    """Count word frequencies over several files using IPython engines.

    view - An IPython DirectView; it must have exactly one target per file.
    fnames - The filenames containing the split data.

    Returns a dict mapping each word to its count summed over all engines.
    """
    assert len(fnames) == len(view.targets)
    # Publish the filenames to the engines under the remote name 'fname'.
    view.scatter('fname', fnames, flatten=True)
    # NOTE(review): the engines call wordfreq on a filename, whereas the
    # serial path in this script passes the file contents -- confirm that
    # wordfreq handles both.
    per_engine = view.apply(wordfreq, Reference('fname')).get()

    # Merge the per-engine dictionaries by summing the counts word by word.
    totals = {}
    for partial in per_engine:
        for word, count in partial.items():
            totals[word] = totals.get(word, 0) + count
    return totals
45
45
if __name__ == '__main__':
    # Create a Client and a DirectView on all of its engines
    rc = Client()
    view = rc[:]

    if not os.path.exists('davinci.txt'):
        # download from project gutenberg
        print("Downloading Da Vinci's notebooks from Project Gutenberg")
        urlretrieve(davinci_url, 'davinci.txt')

    # Run the serial version
    print("Serial word frequency count:")
    # Use a context manager so the file handle is closed deterministically.
    with open('davinci.txt') as f:
        text = f.read()
    tic = time.time()
    freqs = wordfreq(text)
    toc = time.time()
    print_wordfreq(freqs, 10)
    print("Took %.3f s to calculate"%(toc-tic))

    # The parallel version
    print("\nParallel word frequency count:")
    # split the davinci.txt into one file per engine:
    lines = text.splitlines()
    nlines = len(lines)
    n = len(rc)
    block = nlines//n
    for i in range(n):
        start = i*block
        # Chunk i covers lines [i*block, (i+1)*block); the last chunk also
        # takes the remainder so that no trailing lines are dropped.
        stop = (i+1)*block if i < n-1 else nlines
        chunk = lines[start:stop]
        with open('davinci%i.txt'%i, 'w') as f:
            f.write('\n'.join(chunk))

    try: #python2
        cwd = os.path.abspath(os.getcwdu())
    except AttributeError: #python3
        cwd = os.path.abspath(os.getcwd())
    fnames = [ os.path.join(cwd, 'davinci%i.txt'%i) for i in range(n)]
    tic = time.time()
    pfreqs = pwordfreq(view,fnames)
    toc = time.time()
    # Report the parallel result (not the serial one computed above).
    print_wordfreq(pfreqs)
    print("Took %.3f s to calculate on %i engines"%(toc-tic, len(view.targets)))
    # cleanup split files; an explicit loop because map() is lazy on
    # Python 3 and would never call os.remove
    for fname in fnames:
        os.remove(fname)
@@ -1,162 +1,162 b''
1 """Compute statistics on the digits of pi.
1 """Compute statistics on the digits of pi.
2
2
3 This uses precomputed digits of pi from the website
3 This uses precomputed digits of pi from the website
4 of Professor Yasumasa Kanada at the University of
4 of Professor Yasumasa Kanada at the University of
5 Tokyo: http://www.super-computing.org/
5 Tokyo: http://www.super-computing.org/
6
6
7 Currently, there are only functions to read the
7 Currently, there are only functions to read the
8 .txt (non-compressed, non-binary) files, but adding
8 .txt (non-compressed, non-binary) files, but adding
9 support for compression and binary files would be
9 support for compression and binary files would be
10 straightforward.
10 straightforward.
11
11
12 This focuses on computing the number of times that
12 This focuses on computing the number of times that
13 all 1-, 2-, ..., n-digit sequences occur in the digits of pi.
13 all 1-, 2-, ..., n-digit sequences occur in the digits of pi.
14 If the digits of pi are truly random, these frequencies
14 If the digits of pi are truly random, these frequencies
15 should be equal.
15 should be equal.
16 """
16 """
17
17
18 # Import statements
18 # Import statements
19 from __future__ import division, with_statement
19 from __future__ import division, with_statement
20
20
21 import numpy as np
21 import numpy as np
22 from matplotlib import pyplot as plt
22 from matplotlib import pyplot as plt
23
23
24 try : #python2
24 try : #python2
25 from urllib import urlretrieve
25 from urllib import urlretrieve
26 except : #python3
26 except ImportError : #python3
27 from urllib.request import urlretrieve
27 from urllib.request import urlretrieve
28
28
29 # Top-level functions
29 # Top-level functions
30
30
def fetch_pi_file(filename):
    """Fetch one file of pi digits from super-computing.org.

    Nothing is downloaded when a file called ``filename`` already exists;
    otherwise the remote file of the same name is saved under ``filename``.
    """
    import os
    import urllib
    if not os.path.exists(filename):
        ftpdir = "ftp://pi.super-computing.org/.2/pi200m/"
        # not present locally yet: download it
        urlretrieve(ftpdir + filename, filename)
43
43
def compute_one_digit_freqs(filename):
    """
    Compute the 1 digit frequencies of the digits stored in a file.

    The file is read with txt_file_to_digits and counted with
    one_digit_freqs; the resulting counts are returned.
    """
    return one_digit_freqs(txt_file_to_digits(filename))
51
51
def compute_two_digit_freqs(filename):
    """
    Compute the 2 digit frequencies of the digits stored in a file.

    The file is read with txt_file_to_digits and counted with
    two_digit_freqs; the resulting counts are returned.
    """
    return two_digit_freqs(txt_file_to_digits(filename))
59
59
def reduce_freqs(freqlist):
    """
    Sum a list of freq counts element-wise to get the total counts.

    The accumulator takes its shape and dtype from the first entry, and
    every entry is added into it in place.
    """
    total = np.zeros_like(freqlist[0])
    for counts in freqlist:
        # in-place accumulation, same as ``total += counts``
        np.add(total, counts, out=total)
    return total
68
68
def compute_n_digit_freqs(filename, n):
    """
    Compute the n digit frequencies of the digits stored in a file.

    The file is read with txt_file_to_digits and counted with
    n_digit_freqs; the resulting counts are returned.
    """
    return n_digit_freqs(txt_file_to_digits(filename), n)
76
76
77 # Read digits from a txt file
77 # Read digits from a txt file
78
78
def txt_file_to_digits(filename, the_type=str):
    """
    Generate the digits stored in a .txt file, one character at a time.

    Newlines and spaces are skipped; every other character is passed
    through ``the_type`` (``str`` by default) before being yielded.
    """
    skipped = ('\n', ' ')
    with open(filename, 'r') as handle:
        for row in handle:
            for ch in row:
                if ch not in skipped:
                    yield the_type(ch)
88
88
89 # Actual counting functions
89 # Actual counting functions
90
90
def one_digit_freqs(digits, normalize=False):
    """
    Count how often each single digit occurs in ``digits``.

    Returns a length-10 array indexed by digit value: raw 'i4' counts, or
    the counts divided by their sum when ``normalize`` is true.
    """
    counts = np.zeros(10, dtype='i4')
    for digit in digits:
        counts[int(digit)] += 1
    if not normalize:
        return counts
    return counts / counts.sum()
101
101
def two_digit_freqs(digits, normalize=False):
    """
    Consume digits of pi and compute 2 digits freq. counts.

    ``digits`` may be any iterable (a generator, list or string) of digits
    given as one-character strings or as integers.  Every overlapping pair
    of consecutive digits is counted, including the final one.

    Returns a length-100 array in which entry ``10*a + b`` holds the count
    of digit ``a`` immediately followed by digit ``b``: raw 'i4' counts, or
    the counts divided by their sum when ``normalize`` is true.  Input with
    fewer than two digits yields all zeros.
    """
    freqs = np.zeros(100, dtype='i4')
    last = None
    for d in digits:
        this = int(d)
        if last is not None:
            # Arithmetic index, so integer digits work as well as strings.
            freqs[10 * last + this] += 1
        last = this
    if normalize:
        total = freqs.sum()
        # With no pairs there is nothing to normalize; avoid dividing by 0.
        freqs = freqs / total if total else freqs.astype(float)
    return freqs
117
117
def n_digit_freqs(digits, n, normalize=False):
    """
    Consume digits of pi and compute n digits freq. counts.

    This should only be used for 1-6 digits.

    ``digits`` may be any iterable (a generator, list or string) of digits
    given as one-character strings or as integers.  Every overlapping
    window of ``n`` consecutive digits is counted, including the final one.

    Returns an array of length ``10**n`` indexed by the window read as a
    decimal number: raw 'i4' counts, or the counts divided by their sum
    when ``normalize`` is true.  Input with fewer than ``n`` digits yields
    all zeros.  Raises ValueError if ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be at least 1")
    size = pow(10, n)
    freqs = np.zeros(size, dtype='i4')
    index = 0
    seen = 0
    for d in digits:
        # Rolling window: shift in the new digit and drop the oldest one.
        index = (index * 10 + int(d)) % size
        seen += 1
        if seen >= n:
            freqs[index] += 1
    if normalize:
        total = freqs.sum()
        # With no windows there is nothing to normalize; avoid dividing by 0.
        freqs = freqs / total if total else freqs.astype(float)
    return freqs
136
136
137 # Plotting functions
137 # Plotting functions
138
138
def plot_two_digit_freqs(f2):
    """
    Draw the two digits frequency counts as a 10x10 matrix with matplotlib.

    Works on a copy of ``f2`` viewed as a (10, 10) grid, labels every cell
    with its two digits, and returns whatever ``plt.matshow`` returned.
    """
    grid = f2.copy().reshape(10, 10)
    image = plt.matshow(grid)
    plt.colorbar()
    for col in range(10):
        for row in range(10):
            # x is the column (second digit), y is the row (first digit)
            plt.text(col - 0.2, row + 0.2, '%d%d' % (row, col))
    plt.ylabel('First digit')
    plt.xlabel('Second digit')
    return image
153
153
def plot_one_digit_freqs(f1):
    """
    Draw the one digit frequency counts as a line plot with matplotlib.

    Returns whatever ``plt.plot`` returned for the plotted data.
    """
    plotted = plt.plot(f1, 'bo-')
    plt.title('Single digit counts in pi')
    plt.xlabel('Digit')
    plt.ylabel('Count')
    return plotted
General Comments 0
You need to be logged in to leave comments. Login now