Show More
@@ -1,90 +1,90 b'' | |||||
1 | #!/usr/bin/env python |
|
1 | #!/usr/bin/env python | |
2 | """Parallel word frequency counter. |
|
2 | """Parallel word frequency counter. | |
3 |
|
3 | |||
4 | This only works for a local cluster, because the filenames are local paths. |
|
4 | This only works for a local cluster, because the filenames are local paths. | |
5 | """ |
|
5 | """ | |
6 |
|
6 | |||
7 |
|
7 | |||
8 | import os |
|
8 | import os | |
9 | import time |
|
9 | import time | |
10 | import urllib |
|
10 | import urllib | |
11 |
|
11 | |||
12 | from itertools import repeat |
|
12 | from itertools import repeat | |
13 |
|
13 | |||
14 | from wordfreq import print_wordfreq, wordfreq |
|
14 | from wordfreq import print_wordfreq, wordfreq | |
15 |
|
15 | |||
16 | from IPython.parallel import Client, Reference |
|
16 | from IPython.parallel import Client, Reference | |
17 |
|
17 | |||
18 | from __future__ import division |
|
18 | from __future__ import division | |
19 |
|
19 | |||
20 |
try |
|
20 | try: #python2 | |
21 | from urllib import urlretrieve |
|
21 | from urllib import urlretrieve | |
22 | except : #python3 |
|
22 | except ImportError: #python3 | |
23 | from urllib.request import urlretrieve |
|
23 | from urllib.request import urlretrieve | |
24 |
|
24 | |||
25 | davinci_url = "http://www.gutenberg.org/cache/epub/5000/pg5000.txt" |
|
25 | davinci_url = "http://www.gutenberg.org/cache/epub/5000/pg5000.txt" | |
26 |
|
26 | |||
27 | def pwordfreq(view, fnames): |
|
27 | def pwordfreq(view, fnames): | |
28 | """Parallel word frequency counter. |
|
28 | """Parallel word frequency counter. | |
29 |
|
29 | |||
30 | view - An IPython DirectView |
|
30 | view - An IPython DirectView | |
31 | fnames - The filenames containing the split data. |
|
31 | fnames - The filenames containing the split data. | |
32 | """ |
|
32 | """ | |
33 | assert len(fnames) == len(view.targets) |
|
33 | assert len(fnames) == len(view.targets) | |
34 | view.scatter('fname', fnames, flatten=True) |
|
34 | view.scatter('fname', fnames, flatten=True) | |
35 | ar = view.apply(wordfreq, Reference('fname')) |
|
35 | ar = view.apply(wordfreq, Reference('fname')) | |
36 | freqs_list = ar.get() |
|
36 | freqs_list = ar.get() | |
37 | word_set = set() |
|
37 | word_set = set() | |
38 | for f in freqs_list: |
|
38 | for f in freqs_list: | |
39 | word_set.update(f.keys()) |
|
39 | word_set.update(f.keys()) | |
40 | freqs = dict(zip(word_set, repeat(0))) |
|
40 | freqs = dict(zip(word_set, repeat(0))) | |
41 | for f in freqs_list: |
|
41 | for f in freqs_list: | |
42 | for word, count in f.items(): |
|
42 | for word, count in f.items(): | |
43 | freqs[word] += count |
|
43 | freqs[word] += count | |
44 | return freqs |
|
44 | return freqs | |
45 |
|
45 | |||
46 | if __name__ == '__main__': |
|
46 | if __name__ == '__main__': | |
47 | # Create a Client and View |
|
47 | # Create a Client and View | |
48 | rc = Client() |
|
48 | rc = Client() | |
49 |
|
49 | |||
50 | view = rc[:] |
|
50 | view = rc[:] | |
51 |
|
51 | |||
52 | if not os.path.exists('davinci.txt'): |
|
52 | if not os.path.exists('davinci.txt'): | |
53 | # download from project gutenberg |
|
53 | # download from project gutenberg | |
54 | print("Downloading Da Vinci's notebooks from Project Gutenberg") |
|
54 | print("Downloading Da Vinci's notebooks from Project Gutenberg") | |
55 | urlretrieve(davinci_url, 'davinci.txt') |
|
55 | urlretrieve(davinci_url, 'davinci.txt') | |
56 |
|
56 | |||
57 | # Run the serial version |
|
57 | # Run the serial version | |
58 | print("Serial word frequency count:") |
|
58 | print("Serial word frequency count:") | |
59 | text = open('davinci.txt').read() |
|
59 | text = open('davinci.txt').read() | |
60 | tic = time.time() |
|
60 | tic = time.time() | |
61 | freqs = wordfreq(text) |
|
61 | freqs = wordfreq(text) | |
62 | toc = time.time() |
|
62 | toc = time.time() | |
63 | print_wordfreq(freqs, 10) |
|
63 | print_wordfreq(freqs, 10) | |
64 | print("Took %.3f s to calculate"%(toc-tic)) |
|
64 | print("Took %.3f s to calculate"%(toc-tic)) | |
65 |
|
65 | |||
66 |
|
66 | |||
67 | # The parallel version |
|
67 | # The parallel version | |
68 | print("\nParallel word frequency count:") |
|
68 | print("\nParallel word frequency count:") | |
69 | # split the davinci.txt into one file per engine: |
|
69 | # split the davinci.txt into one file per engine: | |
70 | lines = text.splitlines() |
|
70 | lines = text.splitlines() | |
71 | nlines = len(lines) |
|
71 | nlines = len(lines) | |
72 | n = len(rc) |
|
72 | n = len(rc) | |
73 | block = nlines//n |
|
73 | block = nlines//n | |
74 | for i in range(n): |
|
74 | for i in range(n): | |
75 | chunk = lines[i*block:i*(block+1)] |
|
75 | chunk = lines[i*block:i*(block+1)] | |
76 | with open('davinci%i.txt'%i, 'w') as f: |
|
76 | with open('davinci%i.txt'%i, 'w') as f: | |
77 | f.write('\n'.join(chunk)) |
|
77 | f.write('\n'.join(chunk)) | |
78 |
|
78 | |||
79 |
try |
|
79 | try: #python2 | |
80 | cwd = os.path.abspath(os.getcwdu()) |
|
80 | cwd = os.path.abspath(os.getcwdu()) | |
81 | except : #python3 |
|
81 | except AttributeError: #python3 | |
82 | cwd = os.path.abspath(os.getcwd()) |
|
82 | cwd = os.path.abspath(os.getcwd()) | |
83 | fnames = [ os.path.join(cwd, 'davinci%i.txt'%i) for i in range(n)] |
|
83 | fnames = [ os.path.join(cwd, 'davinci%i.txt'%i) for i in range(n)] | |
84 | tic = time.time() |
|
84 | tic = time.time() | |
85 | pfreqs = pwordfreq(view,fnames) |
|
85 | pfreqs = pwordfreq(view,fnames) | |
86 | toc = time.time() |
|
86 | toc = time.time() | |
87 | print_wordfreq(freqs) |
|
87 | print_wordfreq(freqs) | |
88 | print("Took %.3f s to calculate on %i engines"%(toc-tic, len(view.targets))) |
|
88 | print("Took %.3f s to calculate on %i engines"%(toc-tic, len(view.targets))) | |
89 | # cleanup split files |
|
89 | # cleanup split files | |
90 | map(os.remove, fnames) |
|
90 | map(os.remove, fnames) |
@@ -1,162 +1,162 b'' | |||||
1 | """Compute statistics on the digits of pi. |
|
1 | """Compute statistics on the digits of pi. | |
2 |
|
2 | |||
3 | This uses precomputed digits of pi from the website |
|
3 | This uses precomputed digits of pi from the website | |
4 | of Professor Yasumasa Kanada at the University of |
|
4 | of Professor Yasumasa Kanada at the University of | |
5 | Tokyo: http://www.super-computing.org/ |

5 | Tokyo: http://www.super-computing.org/ | |
6 |
|
6 | |||
7 | Currently, there are only functions to read the |
|
7 | Currently, there are only functions to read the | |
8 | .txt (non-compressed, non-binary) files, but adding |
|
8 | .txt (non-compressed, non-binary) files, but adding | |
9 | support for compression and binary files would be |
|
9 | support for compression and binary files would be | |
10 | straightforward. |
|
10 | straightforward. | |
11 |
|
11 | |||
12 | This focuses on computing the number of times that |
|
12 | This focuses on computing the number of times that | |
13 | all 1, 2, ..., n digit sequences occur in the digits of pi. |

13 | all 1, 2, ..., n digit sequences occur in the digits of pi. | |
14 | If the digits of pi are truly random, these frequencies |
|
14 | If the digits of pi are truly random, these frequencies | |
15 | should be equal. |
|
15 | should be equal. | |
16 | """ |
|
16 | """ | |
17 |
|
17 | |||
18 | # Import statements |
|
18 | # Import statements | |
19 | from __future__ import division, with_statement |
|
19 | from __future__ import division, with_statement | |
20 |
|
20 | |||
21 | import numpy as np |
|
21 | import numpy as np | |
22 | from matplotlib import pyplot as plt |
|
22 | from matplotlib import pyplot as plt | |
23 |
|
23 | |||
24 | try : #python2 |
|
24 | try : #python2 | |
25 | from urllib import urlretrieve |
|
25 | from urllib import urlretrieve | |
26 | except : #python3 |
|
26 | except ImportError : #python3 | |
27 | from urllib.request import urlretrieve |
|
27 | from urllib.request import urlretrieve | |
28 |
|
28 | |||
29 | # Top-level functions |
|
29 | # Top-level functions | |
30 |
|
30 | |||
31 | def fetch_pi_file(filename): |
|
31 | def fetch_pi_file(filename): | |
32 | """This will download a segment of pi from super-computing.org |
|
32 | """This will download a segment of pi from super-computing.org | |
33 | if the file is not already present. |
|
33 | if the file is not already present. | |
34 | """ |
|
34 | """ | |
35 | import os, urllib |
|
35 | import os, urllib | |
36 | ftpdir="ftp://pi.super-computing.org/.2/pi200m/" |
|
36 | ftpdir="ftp://pi.super-computing.org/.2/pi200m/" | |
37 | if os.path.exists(filename): |
|
37 | if os.path.exists(filename): | |
38 | # we already have it |
|
38 | # we already have it | |
39 | return |
|
39 | return | |
40 | else: |
|
40 | else: | |
41 | # download it |
|
41 | # download it | |
42 | urlretrieve(ftpdir+filename,filename) |
|
42 | urlretrieve(ftpdir+filename,filename) | |
43 |
|
43 | |||
44 | def compute_one_digit_freqs(filename): |
|
44 | def compute_one_digit_freqs(filename): | |
45 | """ |
|
45 | """ | |
46 | Read digits of pi from a file and compute the 1 digit frequencies. |
|
46 | Read digits of pi from a file and compute the 1 digit frequencies. | |
47 | """ |
|
47 | """ | |
48 | d = txt_file_to_digits(filename) |
|
48 | d = txt_file_to_digits(filename) | |
49 | freqs = one_digit_freqs(d) |
|
49 | freqs = one_digit_freqs(d) | |
50 | return freqs |
|
50 | return freqs | |
51 |
|
51 | |||
52 | def compute_two_digit_freqs(filename): |
|
52 | def compute_two_digit_freqs(filename): | |
53 | """ |
|
53 | """ | |
54 | Read digits of pi from a file and compute the 2 digit frequencies. |
|
54 | Read digits of pi from a file and compute the 2 digit frequencies. | |
55 | """ |
|
55 | """ | |
56 | d = txt_file_to_digits(filename) |
|
56 | d = txt_file_to_digits(filename) | |
57 | freqs = two_digit_freqs(d) |
|
57 | freqs = two_digit_freqs(d) | |
58 | return freqs |
|
58 | return freqs | |
59 |
|
59 | |||
60 | def reduce_freqs(freqlist): |
|
60 | def reduce_freqs(freqlist): | |
61 | """ |
|
61 | """ | |
62 | Add up a list of freq counts to get the total counts. |
|
62 | Add up a list of freq counts to get the total counts. | |
63 | """ |
|
63 | """ | |
64 | allfreqs = np.zeros_like(freqlist[0]) |
|
64 | allfreqs = np.zeros_like(freqlist[0]) | |
65 | for f in freqlist: |
|
65 | for f in freqlist: | |
66 | allfreqs += f |
|
66 | allfreqs += f | |
67 | return allfreqs |
|
67 | return allfreqs | |
68 |
|
68 | |||
69 | def compute_n_digit_freqs(filename, n): |
|
69 | def compute_n_digit_freqs(filename, n): | |
70 | """ |
|
70 | """ | |
71 | Read digits of pi from a file and compute the n digit frequencies. |
|
71 | Read digits of pi from a file and compute the n digit frequencies. | |
72 | """ |
|
72 | """ | |
73 | d = txt_file_to_digits(filename) |
|
73 | d = txt_file_to_digits(filename) | |
74 | freqs = n_digit_freqs(d, n) |
|
74 | freqs = n_digit_freqs(d, n) | |
75 | return freqs |
|
75 | return freqs | |
76 |
|
76 | |||
77 | # Read digits from a txt file |
|
77 | # Read digits from a txt file | |
78 |
|
78 | |||
79 | def txt_file_to_digits(filename, the_type=str): |
|
79 | def txt_file_to_digits(filename, the_type=str): | |
80 | """ |
|
80 | """ | |
81 | Yield the digits of pi read from a .txt file. |
|
81 | Yield the digits of pi read from a .txt file. | |
82 | """ |
|
82 | """ | |
83 | with open(filename, 'r') as f: |
|
83 | with open(filename, 'r') as f: | |
84 | for line in f.readlines(): |
|
84 | for line in f.readlines(): | |
85 | for c in line: |
|
85 | for c in line: | |
86 | if c != '\n' and c!= ' ': |
|
86 | if c != '\n' and c!= ' ': | |
87 | yield the_type(c) |
|
87 | yield the_type(c) | |
88 |
|
88 | |||
89 | # Actual counting functions |
|
89 | # Actual counting functions | |
90 |
|
90 | |||
91 | def one_digit_freqs(digits, normalize=False): |
|
91 | def one_digit_freqs(digits, normalize=False): | |
92 | """ |
|
92 | """ | |
93 | Consume digits of pi and compute 1 digit freq. counts. |
|
93 | Consume digits of pi and compute 1 digit freq. counts. | |
94 | """ |
|
94 | """ | |
95 | freqs = np.zeros(10, dtype='i4') |
|
95 | freqs = np.zeros(10, dtype='i4') | |
96 | for d in digits: |
|
96 | for d in digits: | |
97 | freqs[int(d)] += 1 |
|
97 | freqs[int(d)] += 1 | |
98 | if normalize: |
|
98 | if normalize: | |
99 | freqs = freqs/freqs.sum() |
|
99 | freqs = freqs/freqs.sum() | |
100 | return freqs |
|
100 | return freqs | |
101 |
|
101 | |||
102 | def two_digit_freqs(digits, normalize=False): |
|
102 | def two_digit_freqs(digits, normalize=False): | |
103 | """ |
|
103 | """ | |
104 | Consume digits of pi and compute 2 digits freq. counts. |
|
104 | Consume digits of pi and compute 2 digits freq. counts. | |
105 | """ |
|
105 | """ | |
106 | freqs = np.zeros(100, dtype='i4') |
|
106 | freqs = np.zeros(100, dtype='i4') | |
107 | last = next(digits) |
|
107 | last = next(digits) | |
108 | this = next(digits) |
|
108 | this = next(digits) | |
109 | for d in digits: |
|
109 | for d in digits: | |
110 | index = int(last + this) |
|
110 | index = int(last + this) | |
111 | freqs[index] += 1 |
|
111 | freqs[index] += 1 | |
112 | last = this |
|
112 | last = this | |
113 | this = d |
|
113 | this = d | |
114 | if normalize: |
|
114 | if normalize: | |
115 | freqs = freqs/freqs.sum() |
|
115 | freqs = freqs/freqs.sum() | |
116 | return freqs |
|
116 | return freqs | |
117 |
|
117 | |||
118 | def n_digit_freqs(digits, n, normalize=False): |
|
118 | def n_digit_freqs(digits, n, normalize=False): | |
119 | """ |
|
119 | """ | |
120 | Consume digits of pi and compute n digits freq. counts. |
|
120 | Consume digits of pi and compute n digits freq. counts. | |
121 |
|
121 | |||
122 | This should only be used for 1-6 digits. |
|
122 | This should only be used for 1-6 digits. | |
123 | """ |
|
123 | """ | |
124 | freqs = np.zeros(pow(10,n), dtype='i4') |
|
124 | freqs = np.zeros(pow(10,n), dtype='i4') | |
125 | current = np.zeros(n, dtype=int) |
|
125 | current = np.zeros(n, dtype=int) | |
126 | for i in range(n): |
|
126 | for i in range(n): | |
127 | current[i] = next(digits) |
|
127 | current[i] = next(digits) | |
128 | for d in digits: |
|
128 | for d in digits: | |
129 | index = int(''.join(map(str, current))) |
|
129 | index = int(''.join(map(str, current))) | |
130 | freqs[index] += 1 |
|
130 | freqs[index] += 1 | |
131 | current[0:-1] = current[1:] |
|
131 | current[0:-1] = current[1:] | |
132 | current[-1] = d |
|
132 | current[-1] = d | |
133 | if normalize: |
|
133 | if normalize: | |
134 | freqs = freqs/freqs.sum() |
|
134 | freqs = freqs/freqs.sum() | |
135 | return freqs |
|
135 | return freqs | |
136 |
|
136 | |||
137 | # Plotting functions |
|
137 | # Plotting functions | |
138 |
|
138 | |||
139 | def plot_two_digit_freqs(f2): |
|
139 | def plot_two_digit_freqs(f2): | |
140 | """ |
|
140 | """ | |
141 | Plot two digits frequency counts using matplotlib. |
|
141 | Plot two digits frequency counts using matplotlib. | |
142 | """ |
|
142 | """ | |
143 | f2_copy = f2.copy() |
|
143 | f2_copy = f2.copy() | |
144 | f2_copy.shape = (10,10) |
|
144 | f2_copy.shape = (10,10) | |
145 | ax = plt.matshow(f2_copy) |
|
145 | ax = plt.matshow(f2_copy) | |
146 | plt.colorbar() |
|
146 | plt.colorbar() | |
147 | for i in range(10): |
|
147 | for i in range(10): | |
148 | for j in range(10): |
|
148 | for j in range(10): | |
149 | plt.text(i-0.2, j+0.2, str(j)+str(i)) |
|
149 | plt.text(i-0.2, j+0.2, str(j)+str(i)) | |
150 | plt.ylabel('First digit') |
|
150 | plt.ylabel('First digit') | |
151 | plt.xlabel('Second digit') |
|
151 | plt.xlabel('Second digit') | |
152 | return ax |
|
152 | return ax | |
153 |
|
153 | |||
154 | def plot_one_digit_freqs(f1): |
|
154 | def plot_one_digit_freqs(f1): | |
155 | """ |
|
155 | """ | |
156 | Plot one digit frequency counts using matplotlib. |
|
156 | Plot one digit frequency counts using matplotlib. | |
157 | """ |
|
157 | """ | |
158 | ax = plt.plot(f1,'bo-') |
|
158 | ax = plt.plot(f1,'bo-') | |
159 | plt.title('Single digit counts in pi') |
|
159 | plt.title('Single digit counts in pi') | |
160 | plt.xlabel('Digit') |
|
160 | plt.xlabel('Digit') | |
161 | plt.ylabel('Count') |
|
161 | plt.ylabel('Count') | |
162 | return ax |
|
162 | return ax |
General Comments 0
You need to be logged in to leave comments.
Login now