Minor changes to text_analysis notebook example.
Brian Granger
@@ -1,1 +1,1
1 {"cells":[{"cell_type":"text","text":"<h1>Basic Symbolic Quantum Mechanics</h1>"},{"code":"%load_ext sympy_printing","cell_type":"code","prompt_number":1},{"code":"from sympy import sqrt, symbols, Rational\nfrom sympy import expand, Eq, Symbol, simplify, exp, sin\nfrom sympy.physics.quantum import *\nfrom sympy.physics.quantum.qubit import *\nfrom sympy.physics.quantum.gate import *\nfrom sympy.physics.quantum.grover import *\nfrom sympy.physics.quantum.qft import QFT, IQFT, Fourier\nfrom sympy.physics.quantum.circuitplot import circuit_plot","cell_type":"code","prompt_number":2},{"code":"phi, psi = Ket('phi'), Ket('psi')\nalpha, beta = symbols('alpha beta', complex=True)","cell_type":"code","prompt_number":3},{"code":"state = alpha*psi + beta*phi; state\n","cell_type":"code","prompt_number":4},{"code":"ip = Dagger(state)*state; ip\n","cell_type":"code","prompt_number":5},{"code":"qapply(expand(ip))\n","cell_type":"code","prompt_number":6},{"code":"A = Operator('A')\nB = Operator('B')\nC = Operator('C')","cell_type":"code","prompt_number":7},{"code":"A*B == B*A\n","cell_type":"code","prompt_number":8},{"code":"expand((A+B)**2)","cell_type":"code","prompt_number":9},{"code":"comm = Commutator(A,B); comm\n","cell_type":"code","prompt_number":10},{"code":"comm.doit()","cell_type":"code","prompt_number":11},{"code":"comm = Commutator(A*B,B+C); comm","cell_type":"code","prompt_number":12},{"code":"comm.expand(commutator=True)","cell_type":"code","prompt_number":13},{"code":"_.doit().expand()\n","cell_type":"code","prompt_number":14},{"code":"Dagger(_)","cell_type":"code","prompt_number":15},{"code":"%notebook save basic_quantum.ipynb","cell_type":"code","prompt_number":16}]} No newline at end of file
1 {"cells":[{"cell_type":"text","text":"<h1>Basic Symbolic Quantum Mechanics</h1>"},{"code":"%load_ext sympy_printing","cell_type":"code","prompt_number":3},{"code":"from sympy import sqrt, symbols, Rational\nfrom sympy import expand, Eq, Symbol, simplify, exp, sin\nfrom sympy.physics.quantum import *\nfrom sympy.physics.quantum.qubit import *\nfrom sympy.physics.quantum.gate import *\nfrom sympy.physics.quantum.grover import *\nfrom sympy.physics.quantum.qft import QFT, IQFT, Fourier\nfrom sympy.physics.quantum.circuitplot import circuit_plot","cell_type":"code","prompt_number":4},{"code":"phi, psi = Ket('phi'), Ket('psi')\nalpha, beta = symbols('alpha beta', complex=True)","cell_type":"code","prompt_number":5},{"code":"state = alpha*psi + beta*phi; state\n","cell_type":"code","prompt_number":6},{"code":"ip = Dagger(state)*state; ip\n","cell_type":"code","prompt_number":7},{"code":"qapply(expand(ip))\n","cell_type":"code","prompt_number":8},{"code":"A = Operator('A')\nB = Operator('B')\nC = Operator('C')","cell_type":"code","prompt_number":9},{"code":"A*B == B*A\n","cell_type":"code","prompt_number":10},{"code":"expand((A+B)**2)","cell_type":"code","prompt_number":11},{"code":"comm = Commutator(A,B); comm\n","cell_type":"code","prompt_number":12},{"code":"comm.doit()","cell_type":"code","prompt_number":13},{"code":"comm = Commutator(A*B,B+C); comm","cell_type":"code","prompt_number":14},{"code":"comm.expand(commutator=True)","cell_type":"code","prompt_number":15},{"code":"_.doit().expand()\n","cell_type":"code","prompt_number":16},{"code":"Dagger(_)","cell_type":"code","prompt_number":17},{"code":"%notebook save basic_quantum.ipynb","cell_type":"code","prompt_number":16}]} No newline at end of file
@@ -1,1 +1,1
1 {"cells":[{"cell_type":"text","text":"<h1>Symbolic Quantum Computing</h1>"},{"code":"%load_ext sympy_printing","cell_type":"code","prompt_number":2},{"code":"from sympy import sqrt, symbols, Rational\nfrom sympy import expand, Eq, Symbol, simplify, exp, sin\nfrom sympy.physics.quantum import *\nfrom sympy.physics.quantum.qubit import *\nfrom sympy.physics.quantum.gate import *\nfrom sympy.physics.quantum.grover import *\nfrom sympy.physics.quantum.qft import QFT, IQFT, Fourier\nfrom sympy.physics.quantum.circuitplot import circuit_plot","cell_type":"code","prompt_number":3},{"code":"alpha, beta = symbols('alpha beta',real=True)","cell_type":"code","prompt_number":4},{"code":"psi = alpha*Qubit('00') + beta*Qubit('11'); psi\n","cell_type":"code","prompt_number":5},{"code":"Dagger(psi)\n","cell_type":"code","prompt_number":6},{"code":"qapply(Dagger(Qubit('00'))*psi)\n","cell_type":"code","prompt_number":7},{"code":"for state, prob in measure_all(psi):\n display(state)\n display(prob)\n","cell_type":"code","prompt_number":8},{"code":"represent(psi, nqubits=2)\n","cell_type":"code","prompt_number":9},{"code":"g = X(0); g\n","cell_type":"code","prompt_number":10},{"code":"represent(g, nqubits=2)\n","cell_type":"code","prompt_number":11},{"code":"c = H(0)*Qubit('00'); c\n","cell_type":"code","prompt_number":12},{"code":"qapply(c)\n","cell_type":"code","prompt_number":13},{"code":"for g1 in (Y,Z,H):\n for g2 in (Y,Z,H):\n e = Commutator(g1(0),g2(0))\n if g1 != g2:\n display(Eq(e,e.doit()))\n","cell_type":"code","prompt_number":14},{"code":"c = H(0)*X(1)*H(0)**2*CNOT(0,1)*X(1)**3*X(0)*Z(2)**2*S(3)**3; c\n","cell_type":"code","prompt_number":24},{"code":"circuit_plot(c, nqubits=4)","cell_type":"code","prompt_number":25},{"code":"gate_simp(c)\n","cell_type":"code","prompt_number":16},{"code":"circuit_plot(gate_simp(c),nqubits=5)","cell_type":"code","prompt_number":23},{"code":"%notebook save quantum_computing.ipynb","cell_type":"code","prompt_number":35}]} No newline at end of file
1 {"cells":[{"cell_type":"text","text":"<h1>Symbolic Quantum Computing</h1>"},{"code":"%load_ext sympy_printing","cell_type":"code","prompt_number":1},{"code":"from sympy import sqrt, symbols, Rational\nfrom sympy import expand, Eq, Symbol, simplify, exp, sin\nfrom sympy.physics.quantum import *\nfrom sympy.physics.quantum.qubit import *\nfrom sympy.physics.quantum.gate import *\nfrom sympy.physics.quantum.grover import *\nfrom sympy.physics.quantum.qft import QFT, IQFT, Fourier\nfrom sympy.physics.quantum.circuitplot import circuit_plot","cell_type":"code","prompt_number":2},{"code":"alpha, beta = symbols('alpha beta',real=True)","cell_type":"code","prompt_number":3},{"code":"psi = alpha*Qubit('00') + beta*Qubit('11'); psi\n","cell_type":"code","prompt_number":4},{"code":"Dagger(psi)\n","cell_type":"code","prompt_number":5},{"code":"qapply(Dagger(Qubit('00'))*psi)\n","cell_type":"code","prompt_number":6},{"code":"for state, prob in measure_all(psi):\n display(state)\n display(prob)\n","cell_type":"code","prompt_number":7},{"code":"represent(psi, nqubits=2)\n","cell_type":"code","prompt_number":8},{"code":"g = X(0); g\n","cell_type":"code","prompt_number":9},{"code":"represent(g, nqubits=2)\n","cell_type":"code","prompt_number":10},{"code":"c = H(0)*Qubit('00'); c\n","cell_type":"code","prompt_number":11},{"code":"qapply(c)\n","cell_type":"code","prompt_number":12},{"code":"for g1 in (Y,Z,H):\n for g2 in (Y,Z,H):\n e = Commutator(g1(0),g2(0))\n if g1 != g2:\n display(Eq(e,e.doit()))\n","cell_type":"code","prompt_number":13},{"code":"c = H(0)*X(1)*H(0)**2*CNOT(0,1)*X(1)**3*X(0)*Z(2)**2*S(3)**3; c\n","cell_type":"code","prompt_number":14},{"code":"circuit_plot(c, nqubits=4)","cell_type":"code","prompt_number":15},{"code":"gate_simp(c)\n","cell_type":"code","prompt_number":16},{"code":"circuit_plot(gate_simp(c),nqubits=5)","cell_type":"code","prompt_number":17},{"code":"%notebook save quantum_computing.ipynb","cell_type":"code","prompt_number":35}]} No newline at end of file
@@ -1,373 +1,376
1 1 #!/usr/bin/env python
2 2 """Simple text analysis: word frequencies and co-occurrence graph.
3 3
4 4 Usage:
5 5
6 6 text_analysis.py [text_file]
7 7
8 8 This script will analyze a plain text file, treating it as a list of
9 9 newline-separated sentences (e.g. a list of paper titles).
10 10
11 11 It computes word frequencies (after doing some naive normalization by
12 12 lowercasing and throwing away a few overly common words). It also computes,
13 13 from the most common words, a weighted graph of word co-occurrences and
14 14 displays it, as well as summarizing the graph structure by ranking its nodes in
15 15 descending order of eigenvector centrality.
16 16
17 17 This is meant as an illustration of text processing in Python, using matplotlib
18 18 for visualization and NetworkX for graph-theoretical manipulation. It should
19 19 not be considered production-strength code for serious text analysis.
20 20
21 21 Author: Fernando Perez <fernando.perez@berkeley.edu>
22 22 """
23 23
24 24 #-----------------------------------------------------------------------------
25 25 # Imports
26 26 #-----------------------------------------------------------------------------
27 27
28 28 # From the standard library
29 29 import os
30 30 import re
31 31 import sys
32 32 import urllib2
33 33
34 34 # Third-party libraries
35 35 import networkx as nx
36 36 import numpy as np
37 37
38 38 from matplotlib import pyplot as plt
39 39
40 40 #-----------------------------------------------------------------------------
41 41 # Function definitions
42 42 #-----------------------------------------------------------------------------
43 43
44 44 def rescale_arr(arr,amin,amax):
45 45 """Rescale an array to a new range.
46 46
47 47 Return a new array whose range of values is (amin,amax).
48 48
49 49 Parameters
50 50 ----------
51 51 arr : array-like
52 52
53 53 amin : float
54 54 new minimum value
55 55
56 56 amax : float
57 57 new maximum value
58 58
59 59 Examples
60 60 --------
61 61 >>> a = np.arange(5)
62 62
63 63 >>> rescale_arr(a,3,6)
64 64 array([ 3. , 3.75, 4.5 , 5.25, 6. ])
65 65 """
66 66
67 67 # old bounds
68 68 m = arr.min()
69 69 M = arr.max()
70 70 # scale/offset
71 71 s = float(amax-amin)/(M-m)
72 72 d = amin - s*m
73 73
74 74 # Apply clip before returning to cut off possible overflows outside the
75 75 # intended range due to roundoff error, so that we can absolutely guarantee
76 76 # that on output, there are no values > amax or < amin.
77 77 return np.clip(s*arr+d,amin,amax)
78 78
79 79
80 80 def all_pairs(items):
81 81 """Make all unique pairs (order doesn't matter)"""
82 82 pairs = []
83 83 nitems = len(items)
84 84 for i, wi in enumerate(items):
85 85 for j in range(i+1, nitems):
86 86 pairs.append((wi, items[j]))
87 87 return pairs
88 88
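# Illustrative example (hypothetical input, not part of the original
# script); all_pairs is equivalent to list(itertools.combinations(items, 2)):
# >>> all_pairs(['a', 'b', 'c'])
# [('a', 'b'), ('a', 'c'), ('b', 'c')]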
89 89
90 90 def text_cleanup(text, min_length=3,
91 91 remove = set(['for', 'the', 'and', 'with'])):
92 92 """Clean up a list of lowercase strings of text for simple analysis.
93 93
94 94 Splits on whitespace, removes all 'words' less than `min_length` characters
95 95 long, and those in the `remove` set.
96 96
97 97 Returns a list of strings.
98 98 """
99 99 return [w for w in text.lower().split()
100 100 if len(w)>=min_length and w not in remove]
101 101
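# Illustrative example (hypothetical input): 'The' and 'AND' lowercase
# into the remove set, and words shorter than min_length are dropped:
# >>> text_cleanup('The cat AND the hat')
# ['cat', 'hat']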
102 102
103 103 def print_vk(lst):
104 104 """Print a list of value/key pairs nicely formatted in key/value order."""
105 105
106 106 # Find the longest key: remember, the list has value/key paris, so the key
107 107 # is element [1], not [0]
108 108 longest_key = max([len(word) for word, count in lst])
109 109 # Make a format string out of it
110 110 fmt = '%'+str(longest_key)+'s -> %s'
111 111 # Do actual printing
112 112 for k,v in lst:
113 113 print fmt % (k,v)
114 114
115 115
116 116 def word_freq(text):
117 117 """Return a dictionary of word frequencies for the given text.
118 118
119 119 Input text should be given as an iterable of strings."""
120 120
121 121 freqs = {}
122 122 for word in text:
123 123 freqs[word] = freqs.get(word, 0) + 1
124 124 return freqs
125 125
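# Illustrative example (hypothetical input, not part of the original
# script); on Python 2.7+ collections.Counter(text) builds the same map:
# >>> wf = word_freq(['graphs', 'random', 'graphs'])
# >>> wf['graphs'], wf['random']
# (2, 1)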
126 126
127 127 def sort_freqs(freqs):
128 128 """Sort a word frequency histogram represented as a dictionary.
129 129
130 130 Parameters
131 131 ----------
132 132 freqs : dict
133 133 A dict with string keys and integer values.
134 134
135 135 Returns
136 136 -------
137 137 items : list
138 138 A list of (word, count) pairs, in ascending order of count.
139 139 """
140 140 items = freqs.items()
141 141 items.sort(key = lambda wc: wc[1])
142 142 return items
143 143 ## words,counts = freqs.keys(),freqs.values()
144 144 ## # Sort by count
145 145 ## items = zip(counts,words)
146 146 ## items.sort()
147 147 ## return items
148 148
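# Illustrative example (hypothetical input): the result is ascending in
# count, so items[-n:] later picks out the n most frequent words:
# >>> sort_freqs({'walks': 1, 'graphs': 3, 'random': 2})
# [('walks', 1), ('random', 2), ('graphs', 3)]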
149 149
150 150 def summarize_freq_hist(freqs, n=10):
151 151 """Print a simple summary of a word frequencies dictionary.
152 152
153 153 Parameters
154 154 ----------
155 155 freqs : dict or list
156 156 Word frequencies, represented either as a dict of word -> count, or
157 157 as a list of (word, count) pairs.
158 158
159 159 n : int
160 160 The number of least/most frequent words to print.
161 161 """
162 162
163 163 items = sort_freqs(freqs) if isinstance(freqs, dict) else freqs
164 164 print 'Number of unique words:',len(freqs)
165 165 print
166 166 print '%d least frequent words:' % n
167 167 print_vk(items[:n])
168 168 print
169 169 print '%d most frequent words:' % n
170 170 print_vk(items[-n:])
171 171
172 172
173 173 def get_text_from_url(url):
174 174 """Given a url (local file path or remote url), read its contents.
175 175
176 176 If it's a remote URL, it downloads the file and leaves it locally cached
177 177 for future runs. If the local matching file is found, no download is made.
178 178
179 179 Returns
180 180 -------
181 181 text : string
182 182 The contents of the file.
183 183 """
184 184 if url.startswith('http'):
185 185 # remote file, fetch only if needed
186 186 fname = os.path.split(url)[1]
187 187 if os.path.isfile(fname):
188 188 with open(fname, 'r') as f:
189 189 text = f.read()
190 190 else:
191 191 with open(fname, 'w') as f:
192 192 text = urllib2.urlopen(url).read()
193 193 f.write(text)
194 194 else:
195 195 with open(url, 'r') as f:
196 196 text = f.read()
197 197 return text
198 198
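# Usage sketch (hypothetical URLs, not part of the original script):
# get_text_from_url('http://example.com/titles.txt') downloads once and
# caches to ./titles.txt; later runs read the cached copy. A plain local
# path such as get_text_from_url('titles.txt') is read directly.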
199 199
200 200 def co_occurrences(lines, words):
201 201 """Return histogram of co-occurrences of words in a list of lines.
202 202
203 203 Parameters
204 204 ----------
205 205 lines : list
206 206 A list of strings considered as 'sentences' to search for co-occurrences.
207 207
208 208 words : list
209 209 A list of words from which all unordered pairs will be constructed and
210 210 searched for co-occurrences.
211 211 """
212 212 wpairs = all_pairs(words)
213 213
214 214 # Now build histogram of co-occurrences
215 215 co_occur = {}
216 216 for w1, w2 in wpairs:
217 217 rx = re.compile('%s .*%s|%s .*%s' % (w1, w2, w2, w1))
218 218 co_occur[w1, w2] = sum([1 for line in lines if rx.search(line)])
219 219
220 220 return co_occur
221 221
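# Illustrative example (hypothetical input): the regex counts lines in
# which both words appear, in either order, separated by whitespace:
# >>> lines = ['random walks on graphs', 'graphs and random matrices']
# >>> co_occurrences(lines, ['random', 'graphs'])
# {('random', 'graphs'): 2}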
222 222
223 223 def co_occurrences_graph(word_hist, co_occur, cutoff=0):
224 224 """Convert a word histogram with co-occurrences to a weighted graph.
225 225
226 226 Edges are only added if the count is above cutoff.
227 227 """
228 228 g = nx.Graph()
229 229 for word, count in word_hist:
230 230 g.add_node(word, count=count)
231 231 for (w1, w2), count in co_occur.iteritems():
232 232 if count<=cutoff:
233 233 continue
234 234 g.add_edge(w1, w2, weight=count)
235 235 return g
236 236
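# Illustrative example (hypothetical input, not part of the original
# script); the edge survives because its count 2 exceeds cutoff=1:
# >>> g = co_occurrences_graph([('random', 3), ('graphs', 2)],
# ...                          {('random', 'graphs'): 2}, cutoff=1)
# >>> g['random']['graphs']['weight']
# 2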
237 237
238 238 def plot_graph(wgraph, pos=None):
239 239 """Conveniently summarize graph visually"""
240 240 # Plot nodes with size according to count
241 241 sizes = []
242 242 degrees = []
243 243 for n, d in wgraph.nodes_iter(data=True):
244 244 sizes.append(d['count'])
245 245 degrees.append(wgraph.degree(n))
246 246 sizes = rescale_arr(np.array(sizes, dtype=float), 100, 1000)
247 247
248 248 # Compute layout and label edges according to weight
249 249 pos = nx.spring_layout(wgraph) if pos is None else pos
250 250 labels = {}
251 251 width = []
252 252 for n1, n2, d in wgraph.edges_iter(data=True):
253 253 w = d['weight']
254 254 labels[n1, n2] = w
255 255 width.append(w)
256 256
257 257 # remap width to 1-15 range
258 258 width = rescale_arr(np.array(width, dtype=float), 1, 15)
259 259
260 260 # Create figure
261 fig, ax = plt.subplots()
261 fig = plt.figure()
262 ax = fig.add_subplot(111)
262 263 fig.subplots_adjust(0,0,1)
263 264 nx.draw_networkx_nodes(wgraph, pos, node_size=sizes, node_color=degrees,
264 265 alpha=0.8)
265 266 nx.draw_networkx_labels(wgraph, pos, font_size=15, font_weight='bold')
266 267 nx.draw_networkx_edges(wgraph, pos, width=width, edge_color=width,
267 268 edge_cmap=plt.cm.Blues)
268 269 nx.draw_networkx_edge_labels(wgraph, pos, edge_labels=labels)
269 270 ax.set_title('Node color: degree, size: count, edge: co-occurrence count')
270 271
271 272
272 273 def plot_word_histogram(freqs, show=10, title=None):
273 274 """Plot a histogram of word frequencies, limited to the top `show` ones.
274 275 """
275 276 sorted_f = sort_freqs(freqs) if isinstance(freqs, dict) else freqs
276 277
277 278 # Don't show the tail
278 279 if isinstance(show, int):
279 280 # interpret as number of words to show in histogram
280 281 show_f = sorted_f[-show:]
281 282 else:
282 283 # interpret as a fraction
283 284 start = -int(round(show*len(freqs)))
284 285 show_f = sorted_f[start:]
285 286
286 287 # Now, extract words and counts, plot
287 288 n_words = len(show_f)
288 289 ind = np.arange(n_words)
289 290 words = [i[0] for i in show_f]
290 291 counts = [i[1] for i in show_f]
291 292
292 fig, ax = plt.subplots()
293 fig = plt.figure()
294 ax = fig.add_subplot(111)
295
293 296 if n_words<=20:
294 297 # Only show bars and x labels for small histograms; they don't
295 298 # make sense otherwise
296 299 ax.bar(ind, counts)
297 300 ax.set_xticks(ind)
298 301 ax.set_xticklabels(words, rotation=45)
299 302 fig.subplots_adjust(bottom=0.25)
300 303 else:
301 304 # For larger ones, do a step plot
302 305 ax.step(ind, counts)
303 306
304 307 # If it spans more than two decades, use a log scale
305 308 if float(max(counts))/min(counts) > 100:
306 309 ax.set_yscale('log')
307 310
308 311 if title:
309 312 ax.set_title(title)
310 313 return ax
311 314
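# Note (illustrative): an integer show, e.g. show=15, bar-plots the 15
# most frequent words; a float, e.g. show=1.0, is read as a fraction of
# the whole sorted list, and large selections fall back to a step plot.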
312 315
313 316 def summarize_centrality(centrality):
314 317 c = centrality.items()
315 318 c.sort(key=lambda x:x[1], reverse=True)
316 319 print '\nGraph centrality'
317 320 for node, cent in c:
318 321 print "%15s: %.3g" % (node, cent)
319 322
320 323 #-----------------------------------------------------------------------------
321 324 # Main script
322 325 #-----------------------------------------------------------------------------
323 326
324 327 # if __name__ == '__main__':
325 328
326 329 # # Configure user variables here
327 330 # # Specify the url (can be a local file path) of the text file to analyze.
328 331 # # If not given, it's read from the command line as the first argument
329 332 #
330 333 # # 11226 titles of recent articles in arxiv/math/prob
331 334 # default_url = "http://bibserver.berkeley.edu/tmp/titles.txt"
332 335 # # Number of words to display in detailed histogram
333 336 # n_words = 15
334 337 # # Number of words to use as nodes for co-occurrence graph.
335 338 # n_nodes = 15
336 339 #
337 340 # # End of user configuration
338 341 #
339 342 # # Actual code starts here
340 343 # try:
341 344 # url = sys.argv[1]
342 345 # except IndexError:
343 346 # url = default_url
344 347 #
345 348 # # Fetch text and do basic preprocessing
346 349 # text = get_text_from_url(url).lower()
347 350 # lines = text.splitlines()
348 351 # words = text_cleanup(text)
349 352 #
350 353 # # Compute frequency histogram
351 354 # wf = word_freq(words)
352 355 # sorted_wf = sort_freqs(wf)
353 356 #
354 357 # # Build a graph from the n_nodes most frequent words
355 358 # popular = sorted_wf[-n_nodes:]
356 359 # pop_words = [wc[0] for wc in popular]
357 360 # co_occur = co_occurrences(lines, pop_words)
358 361 # wgraph = co_occurrences_graph(popular, co_occur, cutoff=1)
359 362 # centrality = nx.eigenvector_centrality_numpy(wgraph)
360 363 #
361 364 # # Print summaries of single-word frequencies and graph structure
362 365 # summarize_freq_hist(sorted_wf)
363 366 # summarize_centrality(centrality)
364 367 #
365 368 # # Plot histogram and graph
366 369 # plt.close('all')
367 370 # plot_word_histogram(sorted_wf, n_words,
368 371 # "Frequencies for %s most frequent words" % n_words)
369 372 # plot_word_histogram(sorted_wf, 1.0, "Frequencies for entire word list")
370 373 # plot_graph(wgraph)
371 374 #
372 375 # # Display figures
373 376 # plt.show()