##// END OF EJS Templates
remove abspath in conversion process
Matthias BUSSONNIER -
Show More
@@ -1,323 +1,323 b''
1 1 from __future__ import print_function, absolute_import
2 2 from converters.utils import remove_fake_files_url
3 3
4 4 # Stdlib
5 5 import codecs
6 6 import io
7 7 import logging
8 8 import os
9 9 import pprint
10 10 from types import FunctionType
11 11
12 12 # From IPython
13 13 from IPython.nbformat import current as nbformat
14 14
15 15 # local
16 16
17 17 #-----------------------------------------------------------------------------
18 18 # Class declarations
19 19 #-----------------------------------------------------------------------------
20 20
class ConversionException(Exception):
    """Exception type declared for conversion errors.

    It adds nothing to ``Exception``; no code visible in this module
    raises it.
    """
23 23
class DocStringInheritor(type):
    """
    Metaclass that copies docstrings from base-class methods.

    For every plain function in the body of the class being created, the
    bases are searched in order for an attribute of the same name.  The first
    one found with a non-empty docstring supplies the docstring of the
    derived function (replacing whatever docstring it had).  Bases that have
    the attribute without a docstring are skipped, and non-function
    attributes are left untouched.

    Please use carefully, the metaclass was written by following
    Michael Foord's Metaclass tutorial
    (http://www.voidspace.org.uk/python/articles/metaclasses.shtml), and may
    have missed a step or two.

    source:
    http://groups.google.com/group/comp.lang.python/msg/26f7b4fcb4d66c95
    by Paul McGuire
    """
    def __new__(meta, classname, bases, classDict):
        # Work on a copy so the caller's namespace mapping is not reused.
        namespace = dict(classDict)
        for name, member in namespace.items():
            if type(member) != FunctionType:
                continue
            # Lazily walk the bases that define `name`, in declaration order.
            candidates = (getattr(base, name).__doc__
                          for base in bases if hasattr(base, name))
            inherited = next((doc for doc in candidates if doc), None)
            if inherited:
                member.__doc__ = inherited
        return type.__new__(meta, classname, bases, namespace)
52 52
class Converter(object):
    """Base class for notebook converters.

    ``convert`` assembles the output by dispatching every cell to a
    ``render_<cell_type>`` method and every display format to a
    ``render_display_format_<format>`` method.  Most of those methods raise
    NotImplementedError here and are meant to be provided by subclasses.
    """
    __metaclass__ = DocStringInheritor
    default_encoding = 'utf-8'
    extension = str()
    figures_counter = 0
    infile = str()
    infile_dir = str()
    infile_root = str()
    files_dir = str()
    with_preamble = True
    user_preamble = None
    output = unicode()
    raw_as_verbatim = False

    def __init__(self, infile):
        """Record the input path and create the ``<root>_files`` directory.

        Parameters
        ----------
        infile : string
            Path of the notebook file to convert.
        """
        self.infile = infile
        self.infile_dir, infile_root = os.path.split(infile)
        infile_root = os.path.splitext(infile_root)[0]
        files_dir = os.path.join(self.infile_dir, infile_root + '_files')
        if not os.path.isdir(files_dir):
            os.mkdir(files_dir)
        self.infile_root = infile_root
        # Kept exactly as derived from `infile` (no abspath), so figure paths
        # written into the output stay relative when `infile` is relative.
        self.files_dir = files_dir
        self.outbase = os.path.join(self.infile_dir, infile_root)

    def __del__(self):
        """Remove the files directory if it exists and is empty."""
        if os.path.isdir(self.files_dir) and not os.listdir(self.files_dir):
            os.rmdir(self.files_dir)

    def dispatch(self, cell_type):
        """return cell_type dependent render method, for example render_code

        Falls back to render_unknown when no matching method exists.
        """
        return getattr(self, 'render_' + cell_type, self.render_unknown)

    def dispatch_display_format(self, format):
        """return output_type dependent render method, for example render_output_text

        Falls back to render_unknown_display when no matching method exists.
        """
        return getattr(self, 'render_display_format_' + format,
                       self.render_unknown_display)

    def convert(self, cell_separator='\n'):
        """
        Generic method to convert a notebook to a string representation.

        This is accomplished by dispatching on the cell_type, so subclasses of
        the Converter class do not need to re-implement this method, but just
        need implementation for the methods that will be dispatched.

        Parameters
        ----------
        cell_separator : string
          Character or string to join cells with. Default is "\n"

        Returns
        -------
        out : string
        """
        lines = []
        lines.extend(self.optional_header())
        lines.extend(self.main_body(cell_separator))
        lines.extend(self.optional_footer())
        return u'\n'.join(lines)

    def main_body(self, cell_separator='\n'):
        """Render every cell of every worksheet of self.nb.

        Parameters
        ----------
        cell_separator : string
          String used to join the rendered cells. Default is "\n"

        Returns
        -------
        cell_lines : list
          The joined cells, split back into individual lines.
        """
        converted_cells = []
        for worksheet in self.nb.worksheets:
            for cell in worksheet.cells:
                conv_fn = self.dispatch(cell.cell_type)
                if cell.cell_type in ('markdown', 'raw'):
                    remove_fake_files_url(cell)
                converted_cells.append('\n'.join(conv_fn(cell)))
        cell_lines = cell_separator.join(converted_cells).split('\n')
        return cell_lines

    def render(self):
        "read, convert, and save self.infile"
        if not hasattr(self, 'nb'):
            self.read()
        self.output = self.convert()
        assert isinstance(self.output, unicode)
        return self.save()

    def read(self):
        "read and parse notebook into NotebookNode called self.nb"
        with open(self.infile) as f:
            self.nb = nbformat.read(f, 'json')

    def save(self, outfile=None, encoding=None):
        """write self.output to a file and return its absolute path

        Parameters
        ----------
        outfile : string, optional
          Destination path. Defaults to self.outbase + '.' + self.extension
        encoding : string, optional
          Encoding of the written file. Defaults to self.default_encoding
        """
        if outfile is None:
            outfile = self.outbase + '.' + self.extension
        if encoding is None:
            encoding = self.default_encoding
        with io.open(outfile, 'w', encoding=encoding) as f:
            f.write(self.output)
        return os.path.abspath(outfile)

    def optional_header(self):
        """
        Optional header to insert at the top of the converted notebook

        Returns a list
        """
        return []

    def optional_footer(self):
        """
        Optional footer to insert at the end of the converted notebook

        Returns a list
        """
        return []

    def _new_figure(self, data, fmt):
        """Create a new figure file in the given format.

        The file is named ``<infile_root>_fig_<NN>.<fmt>`` and written into
        self.files_dir; the figure counter is incremented on every call.

        Returns the path of the written file (self.files_dir joined with the
        figure name).
        """
        figname = '%s_fig_%02i.%s' % (self.infile_root,
                                      self.figures_counter, fmt)
        self.figures_counter += 1
        fullname = os.path.join(self.files_dir, figname)

        # Binary files are base64-encoded, SVG is already XML
        if fmt in ('png', 'jpg', 'pdf'):
            data = data.decode('base64')
            fopen = lambda fname: open(fname, 'wb')
        else:
            fopen = lambda fname: codecs.open(fname, 'wb',
                                              self.default_encoding)

        with fopen(fullname) as f:
            f.write(data)

        return fullname

    def render_heading(self, cell):
        """convert a heading cell

        Returns list."""
        raise NotImplementedError

    def render_code(self, cell):
        """Convert a code cell

        Returns list."""
        raise NotImplementedError

    def render_markdown(self, cell):
        """convert a markdown cell

        Returns list."""
        raise NotImplementedError

    def _img_lines(self, img_file):
        """Return list of lines to include an image file."""
        # Note: subclasses may choose to implement format-specific _FMT_lines
        # methods if they so choose (FMT in {png, svg, jpg, pdf}).
        raise NotImplementedError

    def render_display_data(self, output):
        """convert display data from the output of a code cell

        Image formats are written to disk with _new_figure and rendered
        through ``_<fmt>_lines`` (or _img_lines); every other key except
        'output_type' goes through dispatch_display_format.

        Returns list.
        """
        lines = []

        for fmt in output.keys():
            if fmt in ['png', 'svg', 'jpg', 'pdf']:
                img_file = self._new_figure(output[fmt], fmt)
                # Subclasses can have format-specific render functions (e.g.,
                # latex has to auto-convert all SVG to PDF first).
                lines_fun = getattr(self, '_%s_lines' % fmt, None)
                if not lines_fun:
                    lines_fun = self._img_lines
                lines.extend(lines_fun(img_file))
            elif fmt != 'output_type':
                conv_fn = self.dispatch_display_format(fmt)
                lines.extend(conv_fn(output))
        return lines

    def render_raw(self, cell):
        """convert a cell with raw text

        Returns list."""
        raise NotImplementedError

    def render_unknown(self, cell):
        """Render cells of unknown type

        Returns list."""
        data = pprint.pformat(cell)
        logging.warning('Unknown cell: %s' % cell.cell_type)
        return self._unknown_lines(data)

    def render_unknown_display(self, output, type=None):
        """Render outputs of unknown display format

        `type` is unused.  It is optional because this method is the fallback
        of dispatch_display_format, whose result is called with `output` only.

        Returns list."""
        data = pprint.pformat(output)
        logging.warning('Unknown output: %s' % output.output_type)
        return self._unknown_lines(data)

    def render_stream(self, output):
        """render the stream part of an output

        Returns list.

        Identical to render_display_format_text
        """
        return self.render_display_format_text(output)

    def render_pyout(self, output):
        """convert pyout part of a code cell

        Returns list."""
        raise NotImplementedError

    def render_pyerr(self, output):
        """convert pyerr part of a code cell

        Returns list."""
        raise NotImplementedError

    def _unknown_lines(self, data):
        """Return list of lines for an unknown cell.

        Parameters
        ----------
        data : str
          The content of the unknown data as a single string.
        """
        raise NotImplementedError

    # These are the possible format types in an output node

    def render_display_format_text(self, output):
        """render the text part of an output

        Returns list.
        """
        raise NotImplementedError

    def render_display_format_html(self, output):
        """render the html part of an output

        Returns list.
        """
        raise NotImplementedError

    def render_display_format_latex(self, output):
        """render the latex part of an output

        Returns list.
        """
        raise NotImplementedError

    def render_display_format_json(self, output):
        """render the json part of an output

        Returns list.
        """
        raise NotImplementedError

    def render_display_format_javascript(self, output):
        """render the javascript part of an output

        Returns list.
        """
        raise NotImplementedError
@@ -1,1294 +1,1294 b''
1 1 # An Introduction to the Scientific Python Ecosystem
2 2
3 3 While the Python language is an excellent tool for general-purpose programming, with a highly readable syntax, rich and powerful data types (strings, lists, sets, dictionaries, arbitrary length integers, etc) and a very comprehensive standard library, it was not designed specifically for mathematical and scientific computing. Neither the language nor its standard library have facilities for the efficient representation of multidimensional datasets, tools for linear algebra and general matrix manipulations (an essential building block of virtually all technical computing), nor any data visualization facilities.
4 4
5 5 In particular, Python lists are very flexible containers that can be nested arbitrarily deep and which can hold any Python object in them, but they are poorly suited to represent efficiently common mathematical constructs like vectors and matrices. In contrast, much of our modern heritage of scientific computing has been built on top of libraries written in the Fortran language, which has native support for vectors and matrices as well as a library of mathematical functions that can efficiently operate on entire arrays at once.
6 6
7 7 ## Scientific Python: a collaboration of projects built by scientists
8 8
9 9 The scientific community has developed a set of related Python libraries that provide powerful array facilities, linear algebra, numerical algorithms, data visualization and more. In this appendix, we will briefly outline the tools most frequently used for this purpose, that make "Scientific Python" something far more powerful than the Python language alone.
10 10
11 11 For reasons of space, we can only describe in some detail the central Numpy library, but below we provide links to the websites of each project where you can read their documentation in more detail.
12 12
13 13 First, let's look at an overview of the basic tools that most scientists use in daily research with Python. The core of this ecosystem is composed of:
14 14
15 15 * Numpy: the basic library that most others depend on, it provides a powerful array type that can represent multidimensional datasets of many different kinds and that supports arithmetic operations. Numpy also provides a library of common mathematical functions, basic linear algebra, random number generation and Fast Fourier Transforms. Numpy can be found at [numpy.scipy.org](http://numpy.scipy.org).
16 16
17 17 * Scipy: a large collection of numerical algorithms that operate on numpy arrays and provide facilities for many common tasks in scientific computing, including dense and sparse linear algebra support, optimization, special functions, statistics, n-dimensional image processing, signal processing and more. Scipy can be found at [scipy.org](http://scipy.org).
18 18
19 19 * Matplotlib: a data visualization library with a strong focus on producing high-quality output, it supports a variety of common scientific plot types in two and three dimensions, with precise control over the final output and format for publication-quality results. Matplotlib can also be controlled interactively allowing graphical manipulation of your data (zooming, panning, etc) and can be used with most modern user interface toolkits. It can be found at [matplotlib.sf.net](http://matplotlib.sf.net).
20 20
21 21 * IPython: while not strictly scientific in nature, IPython is the interactive environment in which many scientists spend their time. IPython provides a powerful Python shell that integrates tightly with Matplotlib and with easy access to the files and operating system, and which can execute in a terminal or in a graphical Qt console. IPython also has a web-based notebook interface that can combine code with text, mathematical expressions, figures and multimedia. It can be found at [ipython.org](http://ipython.org).
22 22
23 23 While each of these tools can be installed separately, in our opinion the most convenient way today of accessing them (especially on Windows and Mac computers) is to install the [Free Edition of the Enthought Python Distribution](http://www.enthought.com/products/epd_free.php), which contains all of the above. Other free alternatives on Windows (but not on Macs) are [Python(x,y)](http://code.google.com/p/pythonxy) and [Christoph Gohlke's packages page](http://www.lfd.uci.edu/~gohlke/pythonlibs).
24 24
25 25 These four 'core' libraries are in practice complemented by a number of other tools for more specialized work. We will briefly list here the ones that we think are the most commonly needed:
26 26
27 27 * Sympy: a symbolic manipulation tool that turns a Python session into a computer algebra system. It integrates with the IPython notebook, rendering results in properly typeset mathematical notation. [sympy.org](http://sympy.org).
28 28
29 29 * Mayavi: sophisticated 3d data visualization; [code.enthought.com/projects/mayavi](http://code.enthought.com/projects/mayavi).
30 30
31 31 * Cython: a bridge language between Python and C, useful both to optimize performance bottlenecks in Python and to access C libraries directly; [cython.org](http://cython.org).
32 32
33 33 * Pandas: high-performance data structures and data analysis tools, with powerful data alignment and structural manipulation capabilities; [pandas.pydata.org](http://pandas.pydata.org).
34 34
35 35 * Statsmodels: statistical data exploration and model estimation; [statsmodels.sourceforge.net](http://statsmodels.sourceforge.net).
36 36
37 37 * Scikit-learn: general purpose machine learning algorithms with a common interface; [scikit-learn.org](http://scikit-learn.org).
38 38
39 39 * Scikits-image: image processing toolbox; [scikits-image.org](http://scikits-image.org).
40 40
41 41 * NetworkX: analysis of complex networks (in the graph theoretical sense); [networkx.lanl.gov](http://networkx.lanl.gov).
42 42
43 43 * PyTables: management of hierarchical datasets using the industry-standard HDF5 format; [www.pytables.org](http://www.pytables.org).
44 44
45 45 Beyond these, for any specific problem you should look on the internet first, before starting to write code from scratch. There's a good chance that someone, somewhere, has written an open source library that you can use for part or all of your problem.
46 46
47 47 ## A note about the examples below
48 48
49 49 In all subsequent examples, you will see blocks of input code, followed by the results of the code if the code generated output. This output may include text, graphics and other result objects. These blocks of input can be pasted into your interactive IPython session or notebook for you to execute. In the print version of this document, a thin vertical bar on the left of the blocks of input and output shows which blocks go together.
50 50
51 51 If you are reading this text as an actual IPython notebook, you can press `Shift-Enter` or use the 'play' button on the toolbar (right-pointing triangle) to execute each block of code, known as a 'cell' in IPython:
52 52
53 53 <div class="highlight"><pre><span class="c"># This is a block of code, below you&#39;ll see its output</span>
54 54 <span class="k">print</span> <span class="s">&quot;Welcome to the world of scientific computing with Python!&quot;</span>
55 55 </pre></div>
56 56
57 57
58 58 Welcome to the world of scientific computing with Python!
59 59
60 60
61 61 # Motivation: the trapezoidal rule
62 62
63 63 In subsequent sections we'll provide a basic introduction to the nuts and bolts of the basic scientific python tools; but we'll first motivate it with a brief example that illustrates what you can do in a few lines with these tools. For this, we will use the simple problem of approximating a definite integral with the trapezoid rule:
64 64
65 65 $$
66 66 \int_{a}^{b} f(x)\, dx \approx \frac{1}{2} \sum_{k=1}^{N} \left( x_{k} - x_{k-1} \right) \left( f(x_{k}) + f(x_{k-1}) \right).
67 67 $$
68 68
69 69 Our task will be to compute this formula for a function such as:
70 70
71 71 $$
72 72 f(x) = (x-3)(x-5)(x-7)+85
73 73 $$
74 74
75 75 integrated between $a=1$ and $b=9$.
76 76
77 77 First, we define the function and sample it evenly between 0 and 10 at 200 points:
78 78
79 79 <div class="highlight"><pre><span class="k">def</span> <span class="nf">f</span><span class="p">(</span><span class="n">x</span><span class="p">):</span>
80 80 <span class="k">return</span> <span class="p">(</span><span class="n">x</span><span class="o">-</span><span class="mi">3</span><span class="p">)</span><span class="o">*</span><span class="p">(</span><span class="n">x</span><span class="o">-</span><span class="mi">5</span><span class="p">)</span><span class="o">*</span><span class="p">(</span><span class="n">x</span><span class="o">-</span><span class="mi">7</span><span class="p">)</span><span class="o">+</span><span class="mi">85</span>
81 81
82 82 <span class="kn">import</span> <span class="nn">numpy</span> <span class="kn">as</span> <span class="nn">np</span>
83 83 <span class="n">x</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">linspace</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">10</span><span class="p">,</span> <span class="mi">200</span><span class="p">)</span>
84 84 <span class="n">y</span> <span class="o">=</span> <span class="n">f</span><span class="p">(</span><span class="n">x</span><span class="p">)</span>
85 85 </pre></div>
86 86
87 87
88 88
89 89 We select $a$ and $b$, our integration limits, and we take only a few points in that region to illustrate the error behavior of the trapezoid approximation:
90 90
91 91 <div class="highlight"><pre><span class="n">a</span><span class="p">,</span> <span class="n">b</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">9</span>
92 92 <span class="n">xint</span> <span class="o">=</span> <span class="n">x</span><span class="p">[</span><span class="n">logical_and</span><span class="p">(</span><span class="n">x</span><span class="o">&gt;=</span><span class="n">a</span><span class="p">,</span> <span class="n">x</span><span class="o">&lt;=</span><span class="n">b</span><span class="p">)][::</span><span class="mi">30</span><span class="p">]</span>
93 93 <span class="n">yint</span> <span class="o">=</span> <span class="n">y</span><span class="p">[</span><span class="n">logical_and</span><span class="p">(</span><span class="n">x</span><span class="o">&gt;=</span><span class="n">a</span><span class="p">,</span> <span class="n">x</span><span class="o">&lt;=</span><span class="n">b</span><span class="p">)][::</span><span class="mi">30</span><span class="p">]</span>
94 94 </pre></div>
95 95
96 96
97 97
98 98 Let's plot both the function and the area below it in the trapezoid approximation:
99 99
100 100 <div class="highlight"><pre><span class="kn">import</span> <span class="nn">matplotlib.pyplot</span> <span class="kn">as</span> <span class="nn">plt</span>
101 101 <span class="n">plt</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="n">lw</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span>
102 102 <span class="n">plt</span><span class="o">.</span><span class="n">axis</span><span class="p">([</span><span class="mi">0</span><span class="p">,</span> <span class="mi">10</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">140</span><span class="p">])</span>
103 103 <span class="n">plt</span><span class="o">.</span><span class="n">fill_between</span><span class="p">(</span><span class="n">xint</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="n">yint</span><span class="p">,</span> <span class="n">facecolor</span><span class="o">=</span><span class="s">&#39;gray&#39;</span><span class="p">,</span> <span class="n">alpha</span><span class="o">=</span><span class="mf">0.4</span><span class="p">)</span>
104 104 <span class="n">plt</span><span class="o">.</span><span class="n">text</span><span class="p">(</span><span class="mf">0.5</span> <span class="o">*</span> <span class="p">(</span><span class="n">a</span> <span class="o">+</span> <span class="n">b</span><span class="p">),</span> <span class="mi">30</span><span class="p">,</span><span class="s">r&quot;$\int_a^b f(x)dx$&quot;</span><span class="p">,</span> <span class="n">horizontalalignment</span><span class="o">=</span><span class="s">&#39;center&#39;</span><span class="p">,</span> <span class="n">fontsize</span><span class="o">=</span><span class="mi">20</span><span class="p">);</span>
105 105 </pre></div>
106 106
107 107
108 108
109 ![](/Users/bussonniermatthias/nbconvert/tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_00.svg)
109 ![](tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_00.svg)
110 110
111 111
112 112 Compute the integral both at high accuracy and with the trapezoid approximation
113 113
114 114 <div class="highlight"><pre><span class="kn">from</span> <span class="nn">scipy.integrate</span> <span class="kn">import</span> <span class="n">quad</span><span class="p">,</span> <span class="n">trapz</span>
115 115 <span class="n">integral</span><span class="p">,</span> <span class="n">error</span> <span class="o">=</span> <span class="n">quad</span><span class="p">(</span><span class="n">f</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">9</span><span class="p">)</span>
116 116 <span class="n">trap_integral</span> <span class="o">=</span> <span class="n">trapz</span><span class="p">(</span><span class="n">yint</span><span class="p">,</span> <span class="n">xint</span><span class="p">)</span>
117 117 <span class="k">print</span> <span class="s">&quot;The integral is: </span><span class="si">%g</span><span class="s"> +/- </span><span class="si">%.1e</span><span class="s">&quot;</span> <span class="o">%</span> <span class="p">(</span><span class="n">integral</span><span class="p">,</span> <span class="n">error</span><span class="p">)</span>
118 118 <span class="k">print</span> <span class="s">&quot;The trapezoid approximation with&quot;</span><span class="p">,</span> <span class="nb">len</span><span class="p">(</span><span class="n">xint</span><span class="p">),</span> <span class="s">&quot;points is:&quot;</span><span class="p">,</span> <span class="n">trap_integral</span>
119 119 <span class="k">print</span> <span class="s">&quot;The absolute error is:&quot;</span><span class="p">,</span> <span class="nb">abs</span><span class="p">(</span><span class="n">integral</span> <span class="o">-</span> <span class="n">trap_integral</span><span class="p">)</span>
120 120 </pre></div>
121 121
122 122
123 123 The integral is: 680 +/- 7.5e-12
124 124 The trapezoid approximation with 6 points is: 621.286411141
125 125 The absolute error is: 58.7135888589
126 126
127 127
128 128 This simple example showed us how, by combining the numpy, scipy and matplotlib libraries, we can provide an illustration of a standard method in elementary calculus with just a few lines of code. We will now discuss the basic usage of these tools in more detail.
129 129
130 130 # NumPy arrays: the right data structure for scientific computing
131 131
132 132 ## Basics of Numpy arrays
133 133
134 134 We now turn our attention to the Numpy library, which forms the base layer for the entire 'scipy ecosystem'. Once you have installed numpy, you can import it as
135 135
136 136 <div class="highlight"><pre><span class="kn">import</span> <span class="nn">numpy</span>
137 137 </pre></div>
138 138
139 139
140 140
141 141 though in this book we will use the common shorthand
142 142
143 143 <div class="highlight"><pre><span class="kn">import</span> <span class="nn">numpy</span> <span class="kn">as</span> <span class="nn">np</span>
144 144 </pre></div>
145 145
146 146
147 147
148 148 As mentioned above, the main object provided by numpy is a powerful array. We'll start by exploring how the numpy array differs from Python lists. We start by creating a simple list and an array with the same contents of the list:
149 149
150 150 <div class="highlight"><pre><span class="n">lst</span> <span class="o">=</span> <span class="p">[</span><span class="mi">10</span><span class="p">,</span> <span class="mi">20</span><span class="p">,</span> <span class="mi">30</span><span class="p">,</span> <span class="mi">40</span><span class="p">]</span>
151 151 <span class="n">arr</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="mi">10</span><span class="p">,</span> <span class="mi">20</span><span class="p">,</span> <span class="mi">30</span><span class="p">,</span> <span class="mi">40</span><span class="p">])</span>
152 152 </pre></div>
153 153
154 154
155 155
156 156 Elements of a one-dimensional array are accessed with the same syntax as a list:
157 157
158 158 <div class="highlight"><pre><span class="n">lst</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
159 159 </pre></div>
160 160
161 161
162 162 <pre>
163 163 10
164 164 </pre>
165 165
166 166
167 167 <div class="highlight"><pre><span class="n">arr</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
168 168 </pre></div>
169 169
170 170
171 171 <pre>
172 172 10
173 173 </pre>
174 174
175 175
176 176 <div class="highlight"><pre><span class="n">arr</span><span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span>
177 177 </pre></div>
178 178
179 179
180 180 <pre>
181 181 40
182 182 </pre>
183 183
184 184
185 185 <div class="highlight"><pre><span class="n">arr</span><span class="p">[</span><span class="mi">2</span><span class="p">:]</span>
186 186 </pre></div>
187 187
188 188
189 189 <pre>
190 190 array([30, 40])
191 191 </pre>
192 192
193 193
194 194 The first difference to note between lists and arrays is that arrays are *homogeneous*; i.e. all elements of an array must be of the same type. In contrast, lists can contain elements of arbitrary type. For example, we can change the last element in our list above to be a string:
195 195
196 196 <div class="highlight"><pre><span class="n">lst</span><span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span> <span class="o">=</span> <span class="s">&#39;a string inside a list&#39;</span>
197 197 <span class="n">lst</span>
198 198 </pre></div>
199 199
200 200
201 201 <pre>
202 202 [10, 20, 30, 'a string inside a list']
203 203 </pre>
204 204
205 205
206 206 but the same can not be done with an array, as we get an error message:
207 207
208 208 <div class="highlight"><pre><span class="n">arr</span><span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span> <span class="o">=</span> <span class="s">&#39;a string inside an array&#39;</span>
209 209 </pre></div>
210 210
211 211
212 212 ---------------------------------------------------------------------------
213 213 ValueError Traceback (most recent call last)
214 214 /home/fperez/teach/book-math-labtool/<ipython-input-13-29c0bfa5fa8a> in <module>()
215 215 ----> 1 arr[-1] = 'a string inside an array'
216 216
217 217 ValueError: invalid literal for long() with base 10: 'a string inside an array'
218 218
219 219
220 220 The information about the type of an array is contained in its *dtype* attribute:
221 221
222 222 <div class="highlight"><pre><span class="n">arr</span><span class="o">.</span><span class="n">dtype</span>
223 223 </pre></div>
224 224
225 225
226 226 <pre>
227 227 dtype('int32')
228 228 </pre>
229 229
230 230
231 231 Once an array has been created, its dtype is fixed and it can only store elements of the same type. For this example where the dtype is integer, if we store a floating point number it will be automatically converted into an integer:
232 232
233 233 <div class="highlight"><pre><span class="n">arr</span><span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span> <span class="o">=</span> <span class="mf">1.234</span>
234 234 <span class="n">arr</span>
235 235 </pre></div>
236 236
237 237
238 238 <pre>
239 239 array([10, 20, 30, 1])
240 240 </pre>
241 241
242 242
243 243 Above we created an array from an existing list; let us now see other ways in which we can create arrays, which we'll illustrate next. A common need is to have an array initialized with a constant value, and very often this value is 0 or 1 (suitable as starting value for additive and multiplicative loops respectively); `zeros` creates arrays of all zeros, with any desired dtype:
244 244
245 245 <div class="highlight"><pre><span class="n">np</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span><span class="mi">5</span><span class="p">,</span> <span class="nb">float</span><span class="p">)</span>
246 246 </pre></div>
247 247
248 248
249 249 <pre>
250 250 array([ 0., 0., 0., 0., 0.])
251 251 </pre>
252 252
253 253
254 254 <div class="highlight"><pre><span class="n">np</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span>
255 255 </pre></div>
256 256
257 257
258 258 <pre>
259 259 array([0, 0, 0])
260 260 </pre>
261 261
262 262
263 263 <div class="highlight"><pre><span class="n">np</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="nb">complex</span><span class="p">)</span>
264 264 </pre></div>
265 265
266 266
267 267 <pre>
268 268 array([ 0.+0.j, 0.+0.j, 0.+0.j])
269 269 </pre>
270 270
271 271
272 272 and similarly for `ones`:
273 273
274 274 <div class="highlight"><pre><span class="k">print</span> <span class="s">&#39;5 ones:&#39;</span><span class="p">,</span> <span class="n">np</span><span class="o">.</span><span class="n">ones</span><span class="p">(</span><span class="mi">5</span><span class="p">)</span>
275 275 </pre></div>
276 276
277 277
278 278 5 ones: [ 1. 1. 1. 1. 1.]
279 279
280 280
281 281 If we want an array initialized with an arbitrary value, we can create an empty array and then use the fill method to put the value we want into the array:
282 282
283 283 <div class="highlight"><pre><span class="n">a</span> <span class="o">=</span> <span class="n">empty</span><span class="p">(</span><span class="mi">4</span><span class="p">)</span>
284 284 <span class="n">a</span><span class="o">.</span><span class="n">fill</span><span class="p">(</span><span class="mf">5.5</span><span class="p">)</span>
285 285 <span class="n">a</span>
286 286 </pre></div>
287 287
288 288
289 289 <pre>
290 290 array([ 5.5, 5.5, 5.5, 5.5])
291 291 </pre>
292 292
293 293
294 294 Numpy also offers the `arange` function, which works like the builtin `range` but returns an array instead of a list:
295 295
296 296 <div class="highlight"><pre><span class="n">np</span><span class="o">.</span><span class="n">arange</span><span class="p">(</span><span class="mi">5</span><span class="p">)</span>
297 297 </pre></div>
298 298
299 299
300 300 <pre>
301 301 array([0, 1, 2, 3, 4])
302 302 </pre>
303 303
304 304
305 305 and the `linspace` and `logspace` functions to create linearly and logarithmically-spaced grids respectively, with a fixed number of points and including both ends of the specified interval:
306 306
307 307 <div class="highlight"><pre><span class="k">print</span> <span class="s">&quot;A linear grid between 0 and 1:&quot;</span><span class="p">,</span> <span class="n">np</span><span class="o">.</span><span class="n">linspace</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">5</span><span class="p">)</span>
308 308 <span class="k">print</span> <span class="s">&quot;A logarithmic grid between 10**1 and 10**4: &quot;</span><span class="p">,</span> <span class="n">np</span><span class="o">.</span><span class="n">logspace</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">4</span><span class="p">,</span> <span class="mi">4</span><span class="p">)</span>
309 309 </pre></div>
310 310
311 311
312 312 A linear grid between 0 and 1: [ 0. 0.25 0.5 0.75 1. ]
313 313 A logarithmic grid between 10**1 and 10**4: [ 10. 100. 1000. 10000.]
314 314
315 315
316 316 Finally, it is often useful to create arrays with random numbers that follow a specific distribution. The `np.random` module contains a number of functions that can be used to this effect, for example this will produce an array of 5 random samples taken from a standard normal distribution (0 mean and variance 1):
317 317
318 318 <div class="highlight"><pre><span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="mi">5</span><span class="p">)</span>
319 319 </pre></div>
320 320
321 321
322 322 <pre>
323 323 array([-0.08633343, -0.67375434, 1.00589536, 0.87081651, 1.65597822])
324 324 </pre>
325 325
326 326
327 327 whereas this will also give 5 samples, but from a normal distribution with a mean of 10 and a standard deviation of 3:
328 328
329 329 <div class="highlight"><pre><span class="n">norm10</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">normal</span><span class="p">(</span><span class="mi">10</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">5</span><span class="p">)</span>
330 330 <span class="n">norm10</span>
331 331 </pre></div>
332 332
333 333
334 334 <pre>
335 335 array([ 8.94879575, 5.53038269, 8.24847281, 12.14944165, 11.56209294])
336 336 </pre>
337 337
338 338
339 339 ## Indexing with other arrays
340 340
341 341 Above we saw how to index arrays with single numbers and slices, just like Python lists. But arrays allow for a more sophisticated kind of indexing which is very powerful: you can index an array with another array, and in particular with an array of boolean values. This is particularly useful to extract information from an array that matches a certain condition.
342 342
343 343 Consider for example that in the array `norm10` we want to replace all values above 9 with the value 0. We can do so by first finding the *mask* that indicates where this condition is true or false:
344 344
345 345 <div class="highlight"><pre><span class="n">mask</span> <span class="o">=</span> <span class="n">norm10</span> <span class="o">&gt;</span> <span class="mi">9</span>
346 346 <span class="n">mask</span>
347 347 </pre></div>
348 348
349 349
350 350 <pre>
351 351 array([False, False, False, True, True], dtype=bool)
352 352 </pre>
353 353
354 354
355 355 Now that we have this mask, we can use it to either read those values or to reset them to 0:
356 356
357 357 <div class="highlight"><pre><span class="k">print</span> <span class="s">&#39;Values above 9:&#39;</span><span class="p">,</span> <span class="n">norm10</span><span class="p">[</span><span class="n">mask</span><span class="p">]</span>
358 358 </pre></div>
359 359
360 360
361 361 Values above 9: [ 12.14944165 11.56209294]
362 362
363 363
364 364 <div class="highlight"><pre><span class="k">print</span> <span class="s">&#39;Resetting all values above 9 to 0...&#39;</span>
365 365 <span class="n">norm10</span><span class="p">[</span><span class="n">mask</span><span class="p">]</span> <span class="o">=</span> <span class="mi">0</span>
366 366 <span class="k">print</span> <span class="n">norm10</span>
367 367 </pre></div>
368 368
369 369
370 370 Resetting all values above 9 to 0...
371 371 [ 8.94879575 5.53038269 8.24847281 0. 0. ]
372 372
373 373
374 374 ## Arrays with more than one dimension
375 375
376 376 Up until now all our examples have used one-dimensional arrays. But Numpy can create arrays of arbitrary dimensions, and all the methods illustrated in the previous section work with more than one dimension. For example, a list of lists can be used to initialize a two dimensional array:
377 377
378 378 <div class="highlight"><pre><span class="n">lst2</span> <span class="o">=</span> <span class="p">[[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">],</span> <span class="p">[</span><span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">]]</span>
379 379 <span class="n">arr2</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">([[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">],</span> <span class="p">[</span><span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">]])</span>
380 380 <span class="n">arr2</span>
381 381 </pre></div>
382 382
383 383
384 384 <pre>
385 385 array([[1, 2],
386 386 [3, 4]])
387 387 </pre>
388 388
389 389
390 390 With two-dimensional arrays we start seeing the power of numpy: while a nested list can be indexed using repeatedly the `[ ]` operator, multidimensional arrays support a much more natural indexing syntax with a single `[ ]` and a set of indices separated by commas:
391 391
392 392 <div class="highlight"><pre><span class="k">print</span> <span class="n">lst2</span><span class="p">[</span><span class="mi">0</span><span class="p">][</span><span class="mi">1</span><span class="p">]</span>
393 393 <span class="k">print</span> <span class="n">arr2</span><span class="p">[</span><span class="mi">0</span><span class="p">,</span><span class="mi">1</span><span class="p">]</span>
394 394 </pre></div>
395 395
396 396
397 397 2
398 398 2
399 399
400 400
401 401 Most of the array creation functions listed above can be used with more than one dimension, for example:
402 402
403 403 <div class="highlight"><pre><span class="n">np</span><span class="o">.</span><span class="n">zeros</span><span class="p">((</span><span class="mi">2</span><span class="p">,</span><span class="mi">3</span><span class="p">))</span>
404 404 </pre></div>
405 405
406 406
407 407 <pre>
408 408 array([[ 0., 0., 0.],
409 409 [ 0., 0., 0.]])
410 410 </pre>
411 411
412 412
413 413 <div class="highlight"><pre><span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">normal</span><span class="p">(</span><span class="mi">10</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">4</span><span class="p">))</span>
414 414 </pre></div>
415 415
416 416
417 417 <pre>
418 418 array([[ 11.26788826, 4.29619866, 11.09346496, 9.73861307],
419 419 [ 10.54025996, 9.5146268 , 10.80367214, 13.62204505]])
420 420 </pre>
421 421
422 422
423 423 In fact, the shape of an array can be changed at any time, as long as the total number of elements is unchanged. For example, if we want a 2x4 array with numbers increasing from 0, the easiest way to create it is:
424 424
425 425 <div class="highlight"><pre><span class="n">arr</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">arange</span><span class="p">(</span><span class="mi">8</span><span class="p">)</span><span class="o">.</span><span class="n">reshape</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span><span class="mi">4</span><span class="p">)</span>
426 426 <span class="k">print</span> <span class="n">arr</span>
427 427 </pre></div>
428 428
429 429
430 430 [[0 1 2 3]
431 431 [4 5 6 7]]
432 432
433 433
434 434 With multidimensional arrays, you can also use slices, and you can mix and match slices and single indices in the different dimensions (using the same array as above):
435 435
436 436 <div class="highlight"><pre><span class="k">print</span> <span class="s">&#39;Slicing in the second row:&#39;</span><span class="p">,</span> <span class="n">arr</span><span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">:</span><span class="mi">4</span><span class="p">]</span>
437 437 <span class="k">print</span> <span class="s">&#39;All rows, third column :&#39;</span><span class="p">,</span> <span class="n">arr</span><span class="p">[:,</span> <span class="mi">2</span><span class="p">]</span>
438 438 </pre></div>
439 439
440 440
441 441 Slicing in the second row: [6 7]
442 442 All rows, third column : [2 6]
443 443
444 444
445 445 If you only provide one index, then you will get an array with one less dimension containing that row:
446 446
447 447 <div class="highlight"><pre><span class="k">print</span> <span class="s">&#39;First row: &#39;</span><span class="p">,</span> <span class="n">arr</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
448 448 <span class="k">print</span> <span class="s">&#39;Second row: &#39;</span><span class="p">,</span> <span class="n">arr</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span>
449 449 </pre></div>
450 450
451 451
452 452 First row: [0 1 2 3]
453 453 Second row: [4 5 6 7]
454 454
455 455
456 456 Now that we have seen how to create arrays with more than one dimension, it's a good idea to look at some of the most useful properties and methods that arrays have. The following provide basic information about the size, shape and data in the array:
457 457
458 458 <div class="highlight"><pre><span class="k">print</span> <span class="s">&#39;Data type :&#39;</span><span class="p">,</span> <span class="n">arr</span><span class="o">.</span><span class="n">dtype</span>
459 459 <span class="k">print</span> <span class="s">&#39;Total number of elements :&#39;</span><span class="p">,</span> <span class="n">arr</span><span class="o">.</span><span class="n">size</span>
460 460 <span class="k">print</span> <span class="s">&#39;Number of dimensions :&#39;</span><span class="p">,</span> <span class="n">arr</span><span class="o">.</span><span class="n">ndim</span>
461 461 <span class="k">print</span> <span class="s">&#39;Shape (dimensionality) :&#39;</span><span class="p">,</span> <span class="n">arr</span><span class="o">.</span><span class="n">shape</span>
462 462 <span class="k">print</span> <span class="s">&#39;Memory used (in bytes) :&#39;</span><span class="p">,</span> <span class="n">arr</span><span class="o">.</span><span class="n">nbytes</span>
463 463 </pre></div>
464 464
465 465
466 466 Data type : int32
467 467 Total number of elements : 8
468 468 Number of dimensions : 2
469 469 Shape (dimensionality) : (2, 4)
470 470 Memory used (in bytes) : 32
471 471
472 472
473 473 Arrays also have many useful methods, some especially useful ones are:
474 474
475 475 <div class="highlight"><pre><span class="k">print</span> <span class="s">&#39;Minimum and maximum :&#39;</span><span class="p">,</span> <span class="n">arr</span><span class="o">.</span><span class="n">min</span><span class="p">(),</span> <span class="n">arr</span><span class="o">.</span><span class="n">max</span><span class="p">()</span>
476 476 <span class="k">print</span> <span class="s">&#39;Sum and product of all elements :&#39;</span><span class="p">,</span> <span class="n">arr</span><span class="o">.</span><span class="n">sum</span><span class="p">(),</span> <span class="n">arr</span><span class="o">.</span><span class="n">prod</span><span class="p">()</span>
477 477 <span class="k">print</span> <span class="s">&#39;Mean and standard deviation :&#39;</span><span class="p">,</span> <span class="n">arr</span><span class="o">.</span><span class="n">mean</span><span class="p">(),</span> <span class="n">arr</span><span class="o">.</span><span class="n">std</span><span class="p">()</span>
478 478 </pre></div>
479 479
480 480
481 481 Minimum and maximum : 0 7
482 482 Sum and product of all elements : 28 0
483 483 Mean and standard deviation : 3.5 2.29128784748
484 484
485 485
486 486 For these methods, the above operations are all computed on all the elements of the array. But for a multidimensional array, it's possible to do the computation along a single dimension, by passing the `axis` parameter; for example:
487 487
488 488 <div class="highlight"><pre><span class="k">print</span> <span class="s">&#39;For the following array:</span><span class="se">\n</span><span class="s">&#39;</span><span class="p">,</span> <span class="n">arr</span>
489 489 <span class="k">print</span> <span class="s">&#39;The sum of elements along the rows is :&#39;</span><span class="p">,</span> <span class="n">arr</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="n">axis</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
490 490 <span class="k">print</span> <span class="s">&#39;The sum of elements along the columns is :&#39;</span><span class="p">,</span> <span class="n">arr</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="n">axis</span><span class="o">=</span><span class="mi">0</span><span class="p">)</span>
491 491 </pre></div>
492 492
493 493
494 494 For the following array:
495 495 [[0 1 2 3]
496 496 [4 5 6 7]]
497 497 The sum of elements along the rows is : [ 6 22]
498 498 The sum of elements along the columns is : [ 4 6 8 10]
499 499
500 500
501 501 As you can see in this example, the value of the `axis` parameter is the dimension which will be *consumed* once the operation has been carried out. This is why, to sum down the rows and obtain one total per column, we use `axis=0`, while `axis=1` consumes the columns and yields one total per row.
502 502
503 503 This can be easily illustrated with an example that has more dimensions; we create an array with 4 dimensions and shape `(3,4,5,6)` and sum along the axis number 2 (i.e. the *third* axis, since in Python all counts are 0-based). That consumes the dimension whose length was 5, leaving us with a new array that has shape `(3,4,6)`:
504 504
505 505 <div class="highlight"><pre><span class="n">np</span><span class="o">.</span><span class="n">zeros</span><span class="p">((</span><span class="mi">3</span><span class="p">,</span><span class="mi">4</span><span class="p">,</span><span class="mi">5</span><span class="p">,</span><span class="mi">6</span><span class="p">))</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="mi">2</span><span class="p">)</span><span class="o">.</span><span class="n">shape</span>
506 506 </pre></div>
507 507
508 508
509 509 <pre>
510 510 (3, 4, 6)
511 511 </pre>
512 512
513 513
514 514 Another widely used property of arrays is the `.T` attribute, which allows you to access the transpose of the array:
515 515
516 516 <div class="highlight"><pre><span class="k">print</span> <span class="s">&#39;Array:</span><span class="se">\n</span><span class="s">&#39;</span><span class="p">,</span> <span class="n">arr</span>
517 517 <span class="k">print</span> <span class="s">&#39;Transpose:</span><span class="se">\n</span><span class="s">&#39;</span><span class="p">,</span> <span class="n">arr</span><span class="o">.</span><span class="n">T</span>
518 518 </pre></div>
519 519
520 520
521 521 Array:
522 522 [[0 1 2 3]
523 523 [4 5 6 7]]
524 524 Transpose:
525 525 [[0 4]
526 526 [1 5]
527 527 [2 6]
528 528 [3 7]]
529 529
530 530
531 531 We don't have time here to look at all the methods and properties of arrays, but here's a complete list. Simply try exploring some of these in IPython to learn more, or read their description in the full Numpy documentation:
532 532
533 533 arr.T arr.copy arr.getfield arr.put arr.squeeze
534 534 arr.all arr.ctypes arr.imag arr.ravel arr.std
535 535 arr.any arr.cumprod arr.item arr.real arr.strides
536 536 arr.argmax arr.cumsum arr.itemset arr.repeat arr.sum
537 537 arr.argmin arr.data arr.itemsize arr.reshape arr.swapaxes
538 538 arr.argsort arr.diagonal arr.max arr.resize arr.take
539 539 arr.astype arr.dot arr.mean arr.round arr.tofile
540 540 arr.base arr.dtype arr.min arr.searchsorted arr.tolist
541 541 arr.byteswap arr.dump arr.nbytes arr.setasflat arr.tostring
542 542 arr.choose arr.dumps arr.ndim arr.setfield arr.trace
543 543 arr.clip arr.fill arr.newbyteorder arr.setflags arr.transpose
544 544 arr.compress arr.flags arr.nonzero arr.shape arr.var
545 545 arr.conj arr.flat arr.prod arr.size arr.view
546 546 arr.conjugate arr.flatten arr.ptp arr.sort
547 547
548 548 ## Operating with arrays
549 549
550 550 Arrays support all regular arithmetic operators, and the numpy library also contains a complete collection of basic mathematical functions that operate on arrays. It is important to remember that in general, all operations with arrays are applied *element-wise*, i.e., are applied to all the elements of the array at the same time. Consider for example:
551 551
552 552 <div class="highlight"><pre><span class="n">arr1</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">arange</span><span class="p">(</span><span class="mi">4</span><span class="p">)</span>
553 553 <span class="n">arr2</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">arange</span><span class="p">(</span><span class="mi">10</span><span class="p">,</span> <span class="mi">14</span><span class="p">)</span>
554 554 <span class="k">print</span> <span class="n">arr1</span><span class="p">,</span> <span class="s">&#39;+&#39;</span><span class="p">,</span> <span class="n">arr2</span><span class="p">,</span> <span class="s">&#39;=&#39;</span><span class="p">,</span> <span class="n">arr1</span><span class="o">+</span><span class="n">arr2</span>
555 555 </pre></div>
556 556
557 557
558 558 [0 1 2 3] + [10 11 12 13] = [10 12 14 16]
559 559
560 560
561 561 Importantly, you must remember that even the multiplication operator is by default applied element-wise, it is *not* the matrix multiplication from linear algebra (as is the case in Matlab, for example):
562 562
563 563 <div class="highlight"><pre><span class="k">print</span> <span class="n">arr1</span><span class="p">,</span> <span class="s">&#39;*&#39;</span><span class="p">,</span> <span class="n">arr2</span><span class="p">,</span> <span class="s">&#39;=&#39;</span><span class="p">,</span> <span class="n">arr1</span><span class="o">*</span><span class="n">arr2</span>
564 564 </pre></div>
565 565
566 566
567 567 [0 1 2 3] * [10 11 12 13] = [ 0 11 24 39]
568 568
569 569
570 570 While this means that in principle arrays must always match in their dimensionality in order for an operation to be valid, numpy will *broadcast* dimensions when possible. For example, suppose that you want to add the number 1.5 to `arr1`; the following would be a valid way to do it:
571 571
572 572 <div class="highlight"><pre><span class="n">arr1</span> <span class="o">+</span> <span class="mf">1.5</span><span class="o">*</span><span class="n">np</span><span class="o">.</span><span class="n">ones</span><span class="p">(</span><span class="mi">4</span><span class="p">)</span>
573 573 </pre></div>
574 574
575 575
576 576 <pre>
577 577 array([ 1.5, 2.5, 3.5, 4.5])
578 578 </pre>
579 579
580 580
581 581 But thanks to numpy's broadcasting rules, the following is equally valid:
582 582
583 583 <div class="highlight"><pre><span class="n">arr1</span> <span class="o">+</span> <span class="mf">1.5</span>
584 584 </pre></div>
585 585
586 586
587 587 <pre>
588 588 array([ 1.5, 2.5, 3.5, 4.5])
589 589 </pre>
590 590
591 591
592 592 In this case, numpy looked at both operands and saw that the first (`arr1`) was a one-dimensional array of length 4 and the second was a scalar, considered a zero-dimensional object. The broadcasting rules allow numpy to:
593 593
594 594 * *create* new dimensions of length 1 (since this doesn't change the size of the array)
595 595 * 'stretch' a dimension of length 1 that needs to be matched to a dimension of a different size.
596 596
597 597 So in the above example, the scalar 1.5 is effectively:
598 598
599 599 * first 'promoted' to a 1-dimensional array of length 1
600 600 * then, this array is 'stretched' to length 4 to match the dimension of `arr1`.
601 601
602 602 After these two operations are complete, the addition can proceed as now both operands are one-dimensional arrays of length 4.
603 603
604 604 This broadcasting behavior is in practice enormously powerful, especially because when numpy broadcasts to create new dimensions or to 'stretch' existing ones, it doesn't actually replicate the data. In the example above the operation is carried out *as if* the 1.5 was a 1-d array with 1.5 in all of its entries, but no actual array was ever created. This can save lots of memory in cases when the arrays in question are large and can have significant performance implications.
605 605
606 606 The general rule is: when operating on two arrays, NumPy compares their shapes element-wise. It starts with the trailing dimensions, and works its way forward, creating dimensions of length 1 as needed. Two dimensions are considered compatible when
607 607
608 608 * they are equal to begin with, or
609 609 * one of them is 1; in this case numpy will do the 'stretching' to make them equal.
610 610
611 611 If these conditions are not met, a `ValueError` exception is thrown (with a message such as `frames are not aligned` or `operands could not be broadcast together`), indicating that the arrays have incompatible shapes. The size of the resulting array is the maximum size along each dimension of the input arrays.
612 612
613 613 This shows how the broadcasting rules work in several dimensions:
614 614
615 615 <div class="highlight"><pre><span class="n">b</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">,</span> <span class="mi">5</span><span class="p">])</span>
616 616 <span class="k">print</span> <span class="n">arr</span><span class="p">,</span> <span class="s">&#39;</span><span class="se">\n\n</span><span class="s">+&#39;</span><span class="p">,</span> <span class="n">b</span> <span class="p">,</span> <span class="s">&#39;</span><span class="se">\n</span><span class="s">----------------</span><span class="se">\n</span><span class="s">&#39;</span><span class="p">,</span> <span class="n">arr</span> <span class="o">+</span> <span class="n">b</span>
617 617 </pre></div>
618 618
619 619
620 620 [[0 1 2 3]
621 621 [4 5 6 7]]
622 622
623 623 + [2 3 4 5]
624 624 ----------------
625 625 [[ 2 4 6 8]
626 626 [ 6 8 10 12]]
627 627
628 628
629 629 Now, how could you use broadcasting to, say, add `[4, 6]` along the rows to `arr` above? Simply performing the direct addition will produce the error we previously mentioned:
630 630
631 631 <div class="highlight"><pre><span class="n">c</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="mi">4</span><span class="p">,</span> <span class="mi">6</span><span class="p">])</span>
632 632 <span class="n">arr</span> <span class="o">+</span> <span class="n">c</span>
633 633 </pre></div>
634 634
635 635
636 636 ---------------------------------------------------------------------------
637 637 ValueError Traceback (most recent call last)
638 638 /home/fperez/teach/book-math-labtool/<ipython-input-45-62aa20ac1980> in <module>()
639 639 1 c = np.array([4, 6])
640 640 ----> 2 arr + c
641 641
642 642 ValueError: operands could not be broadcast together with shapes (2,4) (2)
643 643
644 644
645 645 According to the rules above, the array `c` would need to have a *trailing* dimension of 1 for the broadcasting to work. It turns out that numpy allows you to 'inject' new dimensions anywhere into an array on the fly, by indexing it with the special object `np.newaxis`:
646 646
647 647 <div class="highlight"><pre><span class="p">(</span><span class="n">c</span><span class="p">[:,</span> <span class="n">np</span><span class="o">.</span><span class="n">newaxis</span><span class="p">])</span><span class="o">.</span><span class="n">shape</span>
648 648 </pre></div>
649 649
650 650
651 651 <pre>
652 652 (2, 1)
653 653 </pre>
654 654
655 655
656 656 This is exactly what we need, and indeed it works:
657 657
658 658 <div class="highlight"><pre><span class="n">arr</span> <span class="o">+</span> <span class="n">c</span><span class="p">[:,</span> <span class="n">np</span><span class="o">.</span><span class="n">newaxis</span><span class="p">]</span>
659 659 </pre></div>
660 660
661 661
662 662 <pre>
663 663 array([[ 4, 5, 6, 7],
664 664 [10, 11, 12, 13]])
665 665 </pre>
666 666
667 667
668 668 For the full broadcasting rules, please see the official Numpy docs, which describe them in detail and with more complex examples.
669 669
670 670 As we mentioned before, Numpy ships with a full complement of mathematical functions that work on entire arrays, including logarithms, exponentials, trigonometric and hyperbolic trigonometric functions, etc. Furthermore, scipy ships a rich special function library in the `scipy.special` module that includes Bessel, Airy, Fresnel, Laguerre and other classical special functions. For example, sampling the sine function at 100 points between $0$ and $2\pi$ is as simple as:
671 671
672 672 <div class="highlight"><pre><span class="n">x</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">linspace</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">2</span><span class="o">*</span><span class="n">np</span><span class="o">.</span><span class="n">pi</span><span class="p">,</span> <span class="mi">100</span><span class="p">)</span>
673 673 <span class="n">y</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">sin</span><span class="p">(</span><span class="n">x</span><span class="p">)</span>
674 674 </pre></div>
675 675
676 676
677 677
678 678 ## Linear algebra in numpy
679 679
680 680 Numpy ships with a basic linear algebra library, and all arrays have a `dot` method whose behavior is that of the scalar dot product when its arguments are vectors (one-dimensional arrays) and the traditional matrix multiplication when one or both of its arguments are two-dimensional arrays:
681 681
682 682 <div class="highlight"><pre><span class="n">v1</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">])</span>
683 683 <span class="n">v2</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">])</span>
684 684 <span class="k">print</span> <span class="n">v1</span><span class="p">,</span> <span class="s">&#39;.&#39;</span><span class="p">,</span> <span class="n">v2</span><span class="p">,</span> <span class="s">&#39;=&#39;</span><span class="p">,</span> <span class="n">v1</span><span class="o">.</span><span class="n">dot</span><span class="p">(</span><span class="n">v2</span><span class="p">)</span>
685 685 </pre></div>
686 686
687 687
688 688 [2 3 4] . [1 0 1] = 6
689 689
690 690
691 691 Here is a regular matrix-vector multiplication, note that the array `v1` should be viewed as a *column* vector in traditional linear algebra notation; numpy makes no distinction between row and column vectors and simply verifies that the dimensions match the required rules of matrix multiplication, in this case we have a $2 \times 3$ matrix multiplied by a 3-vector, which produces a 2-vector:
692 692
693 693 <div class="highlight"><pre><span class="n">A</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">arange</span><span class="p">(</span><span class="mi">6</span><span class="p">)</span><span class="o">.</span><span class="n">reshape</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">)</span>
694 694 <span class="k">print</span> <span class="n">A</span><span class="p">,</span> <span class="s">&#39;x&#39;</span><span class="p">,</span> <span class="n">v1</span><span class="p">,</span> <span class="s">&#39;=&#39;</span><span class="p">,</span> <span class="n">A</span><span class="o">.</span><span class="n">dot</span><span class="p">(</span><span class="n">v1</span><span class="p">)</span>
695 695 </pre></div>
696 696
697 697
698 698 [[0 1 2]
699 699 [3 4 5]] x [2 3 4] = [11 38]
700 700
701 701
702 702 For matrix-matrix multiplication, the same dimension-matching rules must be satisfied, e.g. consider the difference between $A \times A^T$:
703 703
704 704 <div class="highlight"><pre><span class="k">print</span> <span class="n">A</span><span class="o">.</span><span class="n">dot</span><span class="p">(</span><span class="n">A</span><span class="o">.</span><span class="n">T</span><span class="p">)</span>
705 705 </pre></div>
706 706
707 707
708 708 [[ 5 14]
709 709 [14 50]]
710 710
711 711
712 712 and $A^T \times A$:
713 713
714 714 <div class="highlight"><pre><span class="k">print</span> <span class="n">A</span><span class="o">.</span><span class="n">T</span><span class="o">.</span><span class="n">dot</span><span class="p">(</span><span class="n">A</span><span class="p">)</span>
715 715 </pre></div>
716 716
717 717
718 718 [[ 9 12 15]
719 719 [12 17 22]
720 720 [15 22 29]]
721 721
722 722
723 723 Furthermore, the `numpy.linalg` module includes additional functionality such as determinants, matrix norms, Cholesky, eigenvalue and singular value decompositions, etc. For even more linear algebra tools, `scipy.linalg` contains the majority of the tools in the classic LAPACK libraries as well as functions to operate on sparse matrices. We refer the reader to the Numpy and Scipy documentations for additional details on these.
724 724
725 725 ## Reading and writing arrays to disk
726 726
727 727 Numpy lets you read and write arrays into files in a number of ways. In order to use these tools well, it is critical to understand the difference between a *text* and a *binary* file containing numerical data. In a text file, the number $\pi$ could be written as "3.141592653589793", for example: a string of digits that a human can read, with in this case 15 decimal digits. In contrast, that same number written to a binary file would be encoded as 8 characters (bytes) that are not readable by a human but which contain the exact same data that the variable `pi` had in the computer's memory.
728 728
729 729 The tradeoffs between the two modes are thus:
730 730
731 731 * Text mode: occupies more space, precision can be lost (if not all digits are written to disk), but is readable and editable by hand with a text editor. Can *only* be used for one- and two-dimensional arrays.
732 732
733 733 * Binary mode: compact and exact representation of the data in memory, can't be read or edited by hand. Arrays of any size and dimensionality can be saved and read without loss of information.
734 734
735 735 First, let's see how to read and write arrays in text mode. The `np.savetxt` function saves an array to a text file, with options to control the precision, separators and even adding a header:
736 736
737 737 <div class="highlight"><pre><span class="n">arr</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">arange</span><span class="p">(</span><span class="mi">10</span><span class="p">)</span><span class="o">.</span><span class="n">reshape</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">5</span><span class="p">)</span>
738 738 <span class="n">np</span><span class="o">.</span><span class="n">savetxt</span><span class="p">(</span><span class="s">&#39;test.out&#39;</span><span class="p">,</span> <span class="n">arr</span><span class="p">,</span> <span class="n">fmt</span><span class="o">=</span><span class="s">&#39;</span><span class="si">%.2e</span><span class="s">&#39;</span><span class="p">,</span> <span class="n">header</span><span class="o">=</span><span class="s">&quot;My dataset&quot;</span><span class="p">)</span>
739 739 <span class="o">!</span>cat test.out
740 740 </pre></div>
741 741
742 742
743 743 # My dataset
744 744 0.00e+00 1.00e+00 2.00e+00 3.00e+00 4.00e+00
745 745 5.00e+00 6.00e+00 7.00e+00 8.00e+00 9.00e+00
746 746
747 747
748 748 And this same type of file can then be read with the matching `np.loadtxt` function:
749 749
750 750 <div class="highlight"><pre><span class="n">arr2</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">loadtxt</span><span class="p">(</span><span class="s">&#39;test.out&#39;</span><span class="p">)</span>
751 751 <span class="k">print</span> <span class="n">arr2</span>
752 752 </pre></div>
753 753
754 754
755 755 [[ 0. 1. 2. 3. 4.]
756 756 [ 5. 6. 7. 8. 9.]]
757 757
758 758
759 759 For binary data, Numpy provides the `np.save` and `np.savez` routines. The first saves a single array to a file with `.npy` extension, while the latter can be used to save a *group* of arrays into a single file with `.npz` extension. The files created with these routines can then be read with the `np.load` function.
760 760
761 761 Let us first see how to use the simpler `np.save` function to save a single array:
762 762
763 763 <div class="highlight"><pre><span class="n">np</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="s">&#39;test.npy&#39;</span><span class="p">,</span> <span class="n">arr2</span><span class="p">)</span>
764 764 <span class="c"># Now we read this back</span>
765 765 <span class="n">arr2n</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="s">&#39;test.npy&#39;</span><span class="p">)</span>
766 766 <span class="c"># Let&#39;s see if any element is non-zero in the difference.</span>
767 767 <span class="c"># A value of True would be a problem.</span>
768 768 <span class="k">print</span> <span class="s">&#39;Any differences?&#39;</span><span class="p">,</span> <span class="n">np</span><span class="o">.</span><span class="n">any</span><span class="p">(</span><span class="n">arr2</span><span class="o">-</span><span class="n">arr2n</span><span class="p">)</span>
769 769 </pre></div>
770 770
771 771
772 772 Any differences? False
773 773
774 774
775 775 Now let us see how the `np.savez` function works. You give it a filename and either a sequence of arrays or a set of keywords. In the first mode, the function will automatically name the saved arrays in the archive as `arr_0`, `arr_1`, etc:
776 776
777 777 <div class="highlight"><pre><span class="n">np</span><span class="o">.</span><span class="n">savez</span><span class="p">(</span><span class="s">&#39;test.npz&#39;</span><span class="p">,</span> <span class="n">arr</span><span class="p">,</span> <span class="n">arr2</span><span class="p">)</span>
778 778 <span class="n">arrays</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="s">&#39;test.npz&#39;</span><span class="p">)</span>
779 779 <span class="n">arrays</span><span class="o">.</span><span class="n">files</span>
780 780 </pre></div>
781 781
782 782
783 783 <pre>
784 784 ['arr_1', 'arr_0']
785 785 </pre>
786 786
787 787
788 788 Alternatively, we can explicitly choose how to name the arrays we save:
789 789
790 790 <div class="highlight"><pre><span class="n">np</span><span class="o">.</span><span class="n">savez</span><span class="p">(</span><span class="s">&#39;test.npz&#39;</span><span class="p">,</span> <span class="n">array1</span><span class="o">=</span><span class="n">arr</span><span class="p">,</span> <span class="n">array2</span><span class="o">=</span><span class="n">arr2</span><span class="p">)</span>
791 791 <span class="n">arrays</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="s">&#39;test.npz&#39;</span><span class="p">)</span>
792 792 <span class="n">arrays</span><span class="o">.</span><span class="n">files</span>
793 793 </pre></div>
794 794
795 795
796 796 <pre>
797 797 ['array2', 'array1']
798 798 </pre>
799 799
800 800
801 801 The object returned by `np.load` from an `.npz` file works like a dictionary, though you can also access its constituent files by attribute using its special `.f` field; this is best illustrated with an example with the `arrays` object from above:
802 802
803 803 <div class="highlight"><pre><span class="k">print</span> <span class="s">&#39;First row of first array:&#39;</span><span class="p">,</span> <span class="n">arrays</span><span class="p">[</span><span class="s">&#39;array1&#39;</span><span class="p">][</span><span class="mi">0</span><span class="p">]</span>
804 804 <span class="c"># This is an equivalent way to get the same field</span>
805 805 <span class="k">print</span> <span class="s">&#39;First row of first array:&#39;</span><span class="p">,</span> <span class="n">arrays</span><span class="o">.</span><span class="n">f</span><span class="o">.</span><span class="n">array1</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
806 806 </pre></div>
807 807
808 808
809 809 First row of first array: [0 1 2 3 4]
810 810 First row of first array: [0 1 2 3 4]
811 811
812 812
813 813 This `.npz` format is a very convenient way to package compactly and without loss of information, into a single file, a group of related arrays that pertain to a specific problem. At some point, however, the complexity of your dataset may be such that the optimal approach is to use one of the standard formats in scientific data processing that have been designed to handle complex datasets, such as NetCDF or HDF5.
814 814
815 815 Fortunately, there are tools for manipulating these formats in Python, and for storing data in other ways such as databases. A complete discussion of the possibilities is beyond the scope of this document, but of particular interest for scientific users we at least mention the following:
816 816
817 817 * The `scipy.io` module contains routines to read and write Matlab files in `.mat` format and files in the NetCDF format that is widely used in certain scientific disciplines.
818 818
819 819 * For manipulating files in the HDF5 format, there are two excellent options in Python: The PyTables project offers a high-level, object oriented approach to manipulating HDF5 datasets, while the h5py project offers a more direct mapping to the standard HDF5 library interface. Both are excellent tools; if you need to work with HDF5 datasets you should read some of their documentation and examples and decide which approach is a better match for your needs.
820 820
821 821 # High quality data visualization with Matplotlib
822 822
823 823 The [matplotlib](http://matplotlib.sf.net) library is a powerful tool capable of producing complex publication-quality figures with fine layout control in two and three dimensions; here we will only provide a minimal self-contained introduction to its usage that covers the functionality needed for the rest of the book. We encourage the reader to read the tutorials included with the matplotlib documentation as well as to browse its extensive gallery of examples that include source code.
824 824
825 825 Just as we typically use the shorthand `np` for Numpy, we will use `plt` for the `matplotlib.pyplot` module where the easy-to-use plotting functions reside (the library contains a rich object-oriented architecture that we don't have the space to discuss here):
826 826
827 827 <div class="highlight"><pre><span class="kn">import</span> <span class="nn">matplotlib.pyplot</span> <span class="kn">as</span> <span class="nn">plt</span>
828 828 </pre></div>
829 829
830 830
831 831
832 832 The most frequently used function is simply called `plot`, here is how you can make a simple plot of $\sin(x)$ for $x \in [0, 2\pi]$ with labels and a grid (we use the semicolon in the last line to suppress the display of some information that is unnecessary right now):
833 833
834 834 <div class="highlight"><pre><span class="n">x</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">linspace</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">2</span><span class="o">*</span><span class="n">np</span><span class="o">.</span><span class="n">pi</span><span class="p">)</span>
835 835 <span class="n">y</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">sin</span><span class="p">(</span><span class="n">x</span><span class="p">)</span>
836 836 <span class="n">plt</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="n">y</span><span class="p">,</span> <span class="n">label</span><span class="o">=</span><span class="s">&#39;sin(x)&#39;</span><span class="p">)</span>
837 837 <span class="n">plt</span><span class="o">.</span><span class="n">legend</span><span class="p">()</span>
838 838 <span class="n">plt</span><span class="o">.</span><span class="n">grid</span><span class="p">()</span>
839 839 <span class="n">plt</span><span class="o">.</span><span class="n">title</span><span class="p">(</span><span class="s">&#39;Harmonic&#39;</span><span class="p">)</span>
840 840 <span class="n">plt</span><span class="o">.</span><span class="n">xlabel</span><span class="p">(</span><span class="s">&#39;x&#39;</span><span class="p">)</span>
841 841 <span class="n">plt</span><span class="o">.</span><span class="n">ylabel</span><span class="p">(</span><span class="s">&#39;y&#39;</span><span class="p">);</span>
842 842 </pre></div>
843 843
844 844
845 845
846 ![](/Users/bussonniermatthias/nbconvert/tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_01.svg)
846 ![](tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_01.svg)
847 847
848 848
849 849 You can control the style, color and other properties of the markers, for example:
850 850
851 851 <div class="highlight"><pre><span class="n">plt</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="n">linewidth</span><span class="o">=</span><span class="mi">2</span><span class="p">);</span>
852 852 </pre></div>
853 853
854 854
855 855
856 ![](/Users/bussonniermatthias/nbconvert/tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_02.svg)
856 ![](tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_02.svg)
857 857
858 858
859 859 <div class="highlight"><pre><span class="n">plt</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="s">&#39;o&#39;</span><span class="p">,</span> <span class="n">markersize</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">color</span><span class="o">=</span><span class="s">&#39;r&#39;</span><span class="p">);</span>
860 860 </pre></div>
861 861
862 862
863 863
864 ![](/Users/bussonniermatthias/nbconvert/tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_03.svg)
864 ![](tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_03.svg)
865 865
866 866
867 867 We will now see how to create a few other common plot types, such as a simple error plot:
868 868
869 869 <div class="highlight"><pre><span class="c"># example data</span>
870 870 <span class="n">x</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">arange</span><span class="p">(</span><span class="mf">0.1</span><span class="p">,</span> <span class="mi">4</span><span class="p">,</span> <span class="mf">0.5</span><span class="p">)</span>
871 871 <span class="n">y</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">exp</span><span class="p">(</span><span class="o">-</span><span class="n">x</span><span class="p">)</span>
872 872
873 873 <span class="c"># example variable error bar values</span>
874 874 <span class="n">yerr</span> <span class="o">=</span> <span class="mf">0.1</span> <span class="o">+</span> <span class="mf">0.2</span><span class="o">*</span><span class="n">np</span><span class="o">.</span><span class="n">sqrt</span><span class="p">(</span><span class="n">x</span><span class="p">)</span>
875 875 <span class="n">xerr</span> <span class="o">=</span> <span class="mf">0.1</span> <span class="o">+</span> <span class="n">yerr</span>
876 876
877 877 <span class="c"># First illustrate basic pyplot interface, using defaults where possible.</span>
878 878 <span class="n">plt</span><span class="o">.</span><span class="n">figure</span><span class="p">()</span>
879 879 <span class="n">plt</span><span class="o">.</span><span class="n">errorbar</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="n">xerr</span><span class="o">=</span><span class="mf">0.2</span><span class="p">,</span> <span class="n">yerr</span><span class="o">=</span><span class="mf">0.4</span><span class="p">)</span>
880 880 <span class="n">plt</span><span class="o">.</span><span class="n">title</span><span class="p">(</span><span class="s">&quot;Simplest errorbars, 0.2 in x, 0.4 in y&quot;</span><span class="p">);</span>
881 881 </pre></div>
882 882
883 883
884 884
885 ![](/Users/bussonniermatthias/nbconvert/tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_04.svg)
885 ![](tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_04.svg)
886 886
887 887
888 888 A simple log plot
889 889
890 890 <div class="highlight"><pre><span class="n">x</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">linspace</span><span class="p">(</span><span class="o">-</span><span class="mi">5</span><span class="p">,</span> <span class="mi">5</span><span class="p">)</span>
891 891 <span class="n">y</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">exp</span><span class="p">(</span><span class="o">-</span><span class="n">x</span><span class="o">**</span><span class="mi">2</span><span class="p">)</span>
892 892 <span class="n">plt</span><span class="o">.</span><span class="n">semilogy</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">);</span>
893 893 </pre></div>
894 894
895 895
896 896
897 ![](/Users/bussonniermatthias/nbconvert/tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_05.svg)
897 ![](tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_05.svg)
898 898
899 899
900 900 A histogram annotated with text inside the plot, using the `text` function:
901 901
902 902 <div class="highlight"><pre><span class="n">mu</span><span class="p">,</span> <span class="n">sigma</span> <span class="o">=</span> <span class="mi">100</span><span class="p">,</span> <span class="mi">15</span>
903 903 <span class="n">x</span> <span class="o">=</span> <span class="n">mu</span> <span class="o">+</span> <span class="n">sigma</span> <span class="o">*</span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="mi">10000</span><span class="p">)</span>
904 904
905 905 <span class="c"># the histogram of the data</span>
906 906 <span class="n">n</span><span class="p">,</span> <span class="n">bins</span><span class="p">,</span> <span class="n">patches</span> <span class="o">=</span> <span class="n">plt</span><span class="o">.</span><span class="n">hist</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="mi">50</span><span class="p">,</span> <span class="n">normed</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span> <span class="n">facecolor</span><span class="o">=</span><span class="s">&#39;g&#39;</span><span class="p">,</span> <span class="n">alpha</span><span class="o">=</span><span class="mf">0.75</span><span class="p">)</span>
907 907
908 908 <span class="n">plt</span><span class="o">.</span><span class="n">xlabel</span><span class="p">(</span><span class="s">&#39;Smarts&#39;</span><span class="p">)</span>
909 909 <span class="n">plt</span><span class="o">.</span><span class="n">ylabel</span><span class="p">(</span><span class="s">&#39;Probability&#39;</span><span class="p">)</span>
910 910 <span class="n">plt</span><span class="o">.</span><span class="n">title</span><span class="p">(</span><span class="s">&#39;Histogram of IQ&#39;</span><span class="p">)</span>
911 911 <span class="c"># This will put a text fragment at the position given:</span>
912 912 <span class="n">plt</span><span class="o">.</span><span class="n">text</span><span class="p">(</span><span class="mi">55</span><span class="p">,</span> <span class="o">.</span><span class="mo">027</span><span class="p">,</span> <span class="s">r&#39;$\mu=100,\ \sigma=15$&#39;</span><span class="p">,</span> <span class="n">fontsize</span><span class="o">=</span><span class="mi">14</span><span class="p">)</span>
913 913 <span class="n">plt</span><span class="o">.</span><span class="n">axis</span><span class="p">([</span><span class="mi">40</span><span class="p">,</span> <span class="mi">160</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mf">0.03</span><span class="p">])</span>
914 914 <span class="n">plt</span><span class="o">.</span><span class="n">grid</span><span class="p">(</span><span class="bp">True</span><span class="p">)</span>
915 915 </pre></div>
916 916
917 917
918 918
919 ![](/Users/bussonniermatthias/nbconvert/tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_06.svg)
919 ![](tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_06.svg)
920 920
921 921
922 922 ## Image display
923 923
924 924 The `imshow` command can display single or multi-channel images. A simple array of random numbers, plotted in grayscale:
925 925
926 926 <div class="highlight"><pre><span class="kn">from</span> <span class="nn">matplotlib</span> <span class="kn">import</span> <span class="n">cm</span>
927 927 <span class="n">plt</span><span class="o">.</span><span class="n">imshow</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">rand</span><span class="p">(</span><span class="mi">5</span><span class="p">,</span> <span class="mi">10</span><span class="p">),</span> <span class="n">cmap</span><span class="o">=</span><span class="n">cm</span><span class="o">.</span><span class="n">gray</span><span class="p">,</span> <span class="n">interpolation</span><span class="o">=</span><span class="s">&#39;nearest&#39;</span><span class="p">);</span>
928 928 </pre></div>
929 929
930 930
931 931
932 ![](/Users/bussonniermatthias/nbconvert/tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_07.svg)
932 ![](tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_07.svg)
933 933
934 934
935 935 A real photograph is a multichannel image, `imshow` interprets it correctly:
936 936
937 937 <div class="highlight"><pre><span class="n">img</span> <span class="o">=</span> <span class="n">plt</span><span class="o">.</span><span class="n">imread</span><span class="p">(</span><span class="s">&#39;stinkbug.png&#39;</span><span class="p">)</span>
938 938 <span class="k">print</span> <span class="s">&#39;Dimensions of the array img:&#39;</span><span class="p">,</span> <span class="n">img</span><span class="o">.</span><span class="n">shape</span>
939 939 <span class="n">plt</span><span class="o">.</span><span class="n">imshow</span><span class="p">(</span><span class="n">img</span><span class="p">);</span>
940 940 </pre></div>
941 941
942 942
943 943 Dimensions of the array img: (375, 500, 3)
944 944
945 945
946 ![](/Users/bussonniermatthias/nbconvert/tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_08.svg)
946 ![](tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_08.svg)
947 947
948 948
949 949 ## Simple 3d plotting with matplotlib
950 950
951 951 Note that you must execute at least once in your session:
952 952
953 953 <div class="highlight"><pre><span class="kn">from</span> <span class="nn">mpl_toolkits.mplot3d</span> <span class="kn">import</span> <span class="n">Axes3D</span>
954 954 </pre></div>
955 955
956 956
957 957
958 958 Once this has been done, you can create 3d axes with the `projection='3d'` keyword to `add_subplot`:
959 959
960 960 fig = plt.figure()
961 961 fig.add_subplot(<other arguments here>, projection='3d')
962 962
963 963 A simple surface plot:
964 964
965 965 <div class="highlight"><pre><span class="kn">from</span> <span class="nn">mpl_toolkits.mplot3d.axes3d</span> <span class="kn">import</span> <span class="n">Axes3D</span>
966 966 <span class="kn">from</span> <span class="nn">matplotlib</span> <span class="kn">import</span> <span class="n">cm</span>
967 967
968 968 <span class="n">fig</span> <span class="o">=</span> <span class="n">plt</span><span class="o">.</span><span class="n">figure</span><span class="p">()</span>
969 969 <span class="n">ax</span> <span class="o">=</span> <span class="n">fig</span><span class="o">.</span><span class="n">add_subplot</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="n">projection</span><span class="o">=</span><span class="s">&#39;3d&#39;</span><span class="p">)</span>
970 970 <span class="n">X</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">arange</span><span class="p">(</span><span class="o">-</span><span class="mi">5</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mf">0.25</span><span class="p">)</span>
971 971 <span class="n">Y</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">arange</span><span class="p">(</span><span class="o">-</span><span class="mi">5</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mf">0.25</span><span class="p">)</span>
972 972 <span class="n">X</span><span class="p">,</span> <span class="n">Y</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">meshgrid</span><span class="p">(</span><span class="n">X</span><span class="p">,</span> <span class="n">Y</span><span class="p">)</span>
973 973 <span class="n">R</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">sqrt</span><span class="p">(</span><span class="n">X</span><span class="o">**</span><span class="mi">2</span> <span class="o">+</span> <span class="n">Y</span><span class="o">**</span><span class="mi">2</span><span class="p">)</span>
974 974 <span class="n">Z</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">sin</span><span class="p">(</span><span class="n">R</span><span class="p">)</span>
975 975 <span class="n">surf</span> <span class="o">=</span> <span class="n">ax</span><span class="o">.</span><span class="n">plot_surface</span><span class="p">(</span><span class="n">X</span><span class="p">,</span> <span class="n">Y</span><span class="p">,</span> <span class="n">Z</span><span class="p">,</span> <span class="n">rstride</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span> <span class="n">cstride</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span> <span class="n">cmap</span><span class="o">=</span><span class="n">cm</span><span class="o">.</span><span class="n">jet</span><span class="p">,</span>
976 976 <span class="n">linewidth</span><span class="o">=</span><span class="mi">0</span><span class="p">,</span> <span class="n">antialiased</span><span class="o">=</span><span class="bp">False</span><span class="p">)</span>
977 977 <span class="n">ax</span><span class="o">.</span><span class="n">set_zlim3d</span><span class="p">(</span><span class="o">-</span><span class="mf">1.01</span><span class="p">,</span> <span class="mf">1.01</span><span class="p">);</span>
978 978 </pre></div>
979 979
980 980
981 981
982 ![](/Users/bussonniermatthias/nbconvert/tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_09.svg)
982 ![](tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_09.svg)
983 983
984 984
985 985 # IPython: a powerful interactive environment
986 986
987 987 A key component of the everyday workflow of most scientific computing environments is a good interactive environment, that is, a system in which you can execute small amounts of code and view the results immediately, combining both printing out data and opening graphical visualizations. All modern systems for scientific computing, commercial and open source, include such functionality.
988 988
989 989 Out of the box, Python also offers a simple interactive shell with very limited capabilities. But just like the scientific community built Numpy to provide arrays suited for scientific work (since Python's lists aren't optimal for this task), it has also developed an interactive environment much more sophisticated than the built-in one. The [IPython project](http://ipython.org) offers a set of tools to make productive use of the Python language, all the while working interactively and with immediate feedback on your results. The basic tools that IPython provides are:
990 990
991 991 1. A powerful terminal shell, with many features designed to increase the fluidity and productivity of everyday scientific workflows, including:
992 992
993 993 * rich introspection of all objects and variables including easy access to the source code of any function
994 994 * powerful and extensible tab completion of variables and filenames,
995 995 * tight integration with matplotlib, supporting interactive figures that don't block the terminal,
996 996 * direct access to the filesystem and underlying operating system,
997 997 * an extensible system for shell-like commands called 'magics' that reduce the work needed to perform many common tasks,
998 998 * tools for easily running, timing, profiling and debugging your codes,
999 999 * syntax highlighted error messages with much more detail than the default Python ones,
1000 1000 * logging and access to all previous history of inputs, including across sessions
1001 1001
1002 1002 2. A Qt console that provides the look and feel of a terminal, but adds support for inline figures, graphical calltips, a persistent session that can survive crashes (even segfaults) of the kernel process, and more.
1003 1003
1004 1004 3. A web-based notebook that can execute code and also contain rich text and figures, mathematical equations and arbitrary HTML. This notebook presents a document-like view with cells where code is executed but that can be edited in-place, reordered, mixed with explanatory text and figures, etc.
1005 1005
1006 1006 4. A high-performance, low-latency system for parallel computing that supports the control of a cluster of IPython engines communicating over a network, with optimizations that minimize unnecessary copying of large objects (especially numpy arrays).
1007 1007
1008 1008 We will now discuss the highlights of the tools 1-3 above so that you can make them an effective part of your workflow. The topic of parallel computing is beyond the scope of this document, but we encourage you to read the extensive [documentation](http://ipython.org/ipython-doc/rel-0.12.1/parallel/index.html) and [tutorials](http://minrk.github.com/scipy-tutorial-2011/) on this available on the IPython website.
1009 1009
1010 1010 ## The IPython terminal
1011 1011
1012 1012 You can start IPython at the terminal simply by typing:
1013 1013
1014 1014 $ ipython
1015 1015
1016 1016 which will provide you some basic information about how to get started and will then open a prompt labeled `In [1]:` for you to start typing. Here we type $2^{64}$ and Python computes the result for us in exact arithmetic, returning it as `Out[1]`:
1017 1017
1018 1018 $ ipython
1019 1019 Python 2.7.2+ (default, Oct 4 2011, 20:03:08)
1020 1020 Type "copyright", "credits" or "license" for more information.
1021 1021
1022 1022 IPython 0.13.dev -- An enhanced Interactive Python.
1023 1023 ? -> Introduction and overview of IPython's features.
1024 1024 %quickref -> Quick reference.
1025 1025 help -> Python's own help system.
1026 1026 object? -> Details about 'object', use 'object??' for extra details.
1027 1027
1028 1028 In [1]: 2**64
1029 1029 Out[1]: 18446744073709551616L
1030 1030
1031 1031 The first thing you should know about IPython is that all your inputs and outputs are saved. There are two variables named `In` and `Out` which are filled as you work with your results. Furthermore, all outputs are also saved to auto-created variables of the form `_NN` where `NN` is the prompt number, and inputs to `_iNN`. This allows you to recover quickly the result of a prior computation by referring to its number even if you forgot to store it as a variable. For example, later on in the above session you can do:
1032 1032
1033 1033 In [6]: print _1
1034 1034 18446744073709551616
1035 1035
1036 1036 We strongly recommend that you take a few minutes to read at least the basic introduction provided by the `?` command, and keep in mind that the `%quickref` command at all times can be used as a quick reference "cheat sheet" of the most frequently used features of IPython.
1037 1037
1038 1038 At the IPython prompt, any valid Python code that you type will be executed similarly to the default Python shell (though often with more informative feedback). But IPython is a *superset* of the default Python shell, so let's have a brief look at some of its additional functionality.
1039 1039
1040 1040 **Object introspection**
1041 1041
1042 1042 A simple `?` command provides a general introduction to IPython, but as indicated in the banner above, you can use the `?` syntax to ask for details about any object. For example, if we type `_1?`, IPython will print the following details about this variable:
1043 1043
1044 1044 In [14]: _1?
1045 1045 Type: long
1046 1046 Base Class: <type 'long'>
1047 1047 String Form:18446744073709551616
1048 1048 Namespace: Interactive
1049 1049 Docstring:
1050 1050 long(x[, base]) -> integer
1051 1051
1052 1052 Convert a string or number to a long integer, if possible. A floating
1053 1053
1054 1054 [etc... snipped for brevity]
1055 1055
1056 1056 If you add a second `?` and for any object `x` type `x??`, IPython will try to provide an even more detailed analysis of the object, including its syntax-highlighted source code when it can be found. It's possible that `x??` returns the same information as `x?`, but in many cases `x??` will indeed provide additional details.
1057 1057
1058 1058 Finally, the `?` syntax is also useful to search *namespaces* with wildcards. Suppose you are wondering if there is any function in Numpy that may do text-related things; with `np.*txt*?`, IPython will print all the names in the `np` namespace (our Numpy shorthand) that have 'txt' anywhere in their name:
1059 1059
1060 1060 In [17]: np.*txt*?
1061 1061 np.genfromtxt
1062 1062 np.loadtxt
1063 1063 np.mafromtxt
1064 1064 np.ndfromtxt
1065 1065 np.recfromtxt
1066 1066 np.savetxt
1067 1067
1068 1068 **Tab completion**
1069 1069
1070 1070 IPython makes the tab key work extra hard for you as a way to rapidly inspect objects and libraries. Whenever you have typed something at the prompt, by hitting the `<tab>` key IPython will try to complete the rest of the line. For this, IPython will analyze the text you had so far and try to search for Python data or files that may match the context you have already provided.
1071 1071
1072 1072 For example, if you type `np.load` and hit the `<tab>` key, you'll see:
1073 1073
1074 1074 In [21]: np.load<TAB HERE>
1075 1075 np.load np.loads np.loadtxt
1076 1076
1077 1077 so you can quickly find all the load-related functionality in numpy. Tab completion works even for function arguments, for example consider this function definition:
1078 1078
1079 1079 In [20]: def f(x, frobinate=False):
1080 1080 ....: if frobinate:
1081 1081 ....: return x**2
1082 1082 ....:
1083 1083
1084 1084 If you now use the `<tab>` key after having typed 'fro' you'll get all valid Python completions, but those marked with `=` at the end are known to be keywords of your function:
1085 1085
1086 1086 In [21]: f(2, fro<TAB HERE>
1087 1087 frobinate= frombuffer fromfunction frompyfunc fromstring
1088 1088 from fromfile fromiter fromregex frozenset
1089 1089
1090 1090 at this point you can add the `b` letter and hit `<tab>` once more, and IPython will finish the line for you:
1091 1091
1092 1092 In [21]: f(2, frobinate=
1093 1093
1094 1094 As a beginner, simply get into the habit of using `<tab>` after most objects; it should quickly become second nature as you will see how it helps keep a fluid workflow and discover useful information. Later on you can also customize this behavior by writing your own completion code, if you so desire.
1095 1095
1096 1096 **Matplotlib integration**
1097 1097
1098 1098 One of the most useful features of IPython for scientists is its tight integration with matplotlib: at the terminal IPython lets you open matplotlib figures without blocking your typing (which is what happens if you try to do the same thing at the default Python shell), and in the Qt console and notebook you can even view your figures embedded in your workspace next to the code that created them.
1099 1099
1100 1100 The matplotlib support can be either activated when you start IPython by passing the `--pylab` flag, or at any point later in your session by using the `%pylab` command. If you start IPython with `--pylab`, you'll see something like this (note the extra message about pylab):
1101 1101
1102 1102 $ ipython --pylab
1103 1103 Python 2.7.2+ (default, Oct 4 2011, 20:03:08)
1104 1104 Type "copyright", "credits" or "license" for more information.
1105 1105
1106 1106 IPython 0.13.dev -- An enhanced Interactive Python.
1107 1107 ? -> Introduction and overview of IPython's features.
1108 1108 %quickref -> Quick reference.
1109 1109 help -> Python's own help system.
1110 1110 object? -> Details about 'object', use 'object??' for extra details.
1111 1111
1112 1112 Welcome to pylab, a matplotlib-based Python environment [backend: Qt4Agg].
1113 1113 For more information, type 'help(pylab)'.
1114 1114
1115 1115 In [1]:
1116 1116
1117 1117 Furthermore, IPython will import `numpy` with the `np` shorthand, `matplotlib.pyplot` as `plt`, and it will also load all of the numpy and pyplot top-level names so that you can directly type something like:
1118 1118
1119 1119 In [1]: x = linspace(0, 2*pi, 200)
1120 1120
1121 1121 In [2]: plot(x, sin(x))
1122 1122 Out[2]: [<matplotlib.lines.Line2D at 0x9e7c16c>]
1123 1123
1124 1124 instead of having to prefix each call with its full signature (as we have been doing in the examples thus far):
1125 1125
1126 1126 In [3]: x = np.linspace(0, 2*np.pi, 200)
1127 1127
1128 1128 In [4]: plt.plot(x, np.sin(x))
1129 1129 Out[4]: [<matplotlib.lines.Line2D at 0x9e900ac>]
1130 1130
1131 1131 This shorthand notation can be a huge time-saver when working interactively (it's a few characters but you are likely to type them hundreds of times in a session). But we should note that as you develop persistent scripts and notebooks meant for reuse, it's best to get in the habit of using the longer notation (known as *fully qualified names*), as it's clearer where things come from and it makes for more robust, readable and maintainable code in the long run.
1132 1132
1133 1133 **Access to the operating system and files**
1134 1134
1135 1135 In IPython, you can type `ls` to see your files or `cd` to change directories, just like you would at a regular system prompt:
1136 1136
1137 1137 In [2]: cd tests
1138 1138 /home/fperez/ipython/nbconvert/tests
1139 1139
1140 1140 In [3]: ls test.*
1141 1141 test.aux test.html test.ipynb test.log test.out test.pdf test.rst test.tex
1142 1142
1143 1143 Furthermore, if you use the `!` at the beginning of a line, any commands you pass afterwards go directly to the operating system:
1144 1144
1145 1145 In [4]: !echo "Hello IPython"
1146 1146 Hello IPython
1147 1147
1148 1148 IPython offers a useful twist in this feature: it will substitute in the command the value of any *Python* variable you may have if you prepend it with a `$` sign:
1149 1149
1150 1150 In [5]: message = 'IPython interpolates from Python to the shell'
1151 1151
1152 1152 In [6]: !echo $message
1153 1153 IPython interpolates from Python to the shell
1154 1154
1155 1155 This feature can be extremely useful, as it lets you combine the power and clarity of Python for complex logic with the immediacy and familiarity of many shell commands. Additionally, if you start the line with *two* `!` signs (`!!`), the output of the command will be automatically captured as a list of lines, e.g.:
1156 1156
1157 1157 In [10]: !!ls test.*
1158 1158 Out[10]:
1159 1159 ['test.aux',
1160 1160 'test.html',
1161 1161 'test.ipynb',
1162 1162 'test.log',
1163 1163 'test.out',
1164 1164 'test.pdf',
1165 1165 'test.rst',
1166 1166 'test.tex']
1167 1167
1168 1168 As explained above, you can now use this as the variable `_10`. If you directly want to capture the output of a system command to a Python variable, you can use the syntax `=!`:
1169 1169
1170 1170 In [11]: testfiles =! ls test.*
1171 1171
1172 1172 In [12]: print testfiles
1173 1173 ['test.aux', 'test.html', 'test.ipynb', 'test.log', 'test.out', 'test.pdf', 'test.rst', 'test.tex']
1174 1174
1175 1175 Finally, the special `%alias` command lets you define names that are shorthands for system commands, so that you can type them without having to prefix them via `!` explicitly (for example, `ls` is an alias that has been predefined for you at startup).
1176 1176
1177 1177 **Magic commands**
1178 1178
1179 1179 IPython has a system for special commands, called 'magics', that let you control IPython itself and perform many common tasks with a more shell-like syntax: it uses spaces for delimiting arguments, flags can be set with dashes and all arguments are treated as strings, so no additional quoting is required. This kind of syntax is invalid in the Python language but very convenient for interactive typing (fewer parentheses, commas and quoting everywhere); IPython distinguishes the two by detecting lines that start with the `%` character.
1180 1180
1181 1181 You can learn more about the magic system by simply typing `%magic` at the prompt, which will give you a short description plus the documentation on *all* available magics. If you want to see only a listing of existing magics, you can use `%lsmagic`:
1182 1182
1183 1183 In [4]: lsmagic
1184 1184 Available magic functions:
1185 1185 %alias %autocall %autoindent %automagic %bookmark %c %cd %colors %config %cpaste
1186 1186 %debug %dhist %dirs %doctest_mode %ds %ed %edit %env %gui %hist %history
1187 1187 %install_default_config %install_ext %install_profiles %load_ext %loadpy %logoff %logon
1188 1188 %logstart %logstate %logstop %lsmagic %macro %magic %notebook %page %paste %pastebin
1189 1189 %pd %pdb %pdef %pdoc %pfile %pinfo %pinfo2 %pop %popd %pprint %precision %profile
1190 1190 %prun %psearch %psource %pushd %pwd %pycat %pylab %quickref %recall %rehashx
1191 1191 %reload_ext %rep %rerun %reset %reset_selective %run %save %sc %stop %store %sx %tb
1192 1192 %time %timeit %unalias %unload_ext %who %who_ls %whos %xdel %xmode
1193 1193
1194 1194 Automagic is ON, % prefix NOT needed for magic functions.
1195 1195
1196 1196 Note how the example above omitted the explicit `%` marker and simply uses `lsmagic`. As long as the 'automagic' feature is on (which it is by default), you can omit the `%` marker as long as there is no ambiguity with a Python variable of the same name.
1197 1197
1198 1198 **Running your code**
1199 1199
1200 1200 While it's easy to type a few lines of code in IPython, for any long-lived work you should keep your codes in Python scripts (or in IPython notebooks, see below). Consider that you have a script, in this case trivially simple for the sake of brevity, named `simple.py`:
1201 1201
1202 1202 In [12]: !cat simple.py
1203 1203 import numpy as np
1204 1204
1205 1205 x = np.random.normal(size=100)
1206 1206
1207 1207 print 'First elment of x:', x[0]
1208 1208
1209 1209 The typical workflow with IPython is to use the `%run` magic to execute your script (you can omit the .py extension if you want). When you run it, the script will execute just as if it had been run at the system prompt with `python simple.py` (though since modules don't get re-executed on new imports by Python, all system initialization is essentially free, which can have a significant run time impact in some cases):
1210 1210
1211 1211 In [13]: run simple
1212 1212 First elment of x: -1.55872256289
1213 1213
1214 1214 Once it completes, all variables defined in it become available for you to use interactively:
1215 1215
1216 1216 In [14]: x.shape
1217 1217 Out[14]: (100,)
1218 1218
1219 1219 This allows you to plot data, try out ideas, etc, in a `%run`/interact/edit cycle that can be very productive. As you start understanding your problem better you can refine your script further, incrementally improving it based on the work you do at the IPython prompt. At any point you can use the `%hist` magic to print out your history without prompts, so that you can copy useful fragments back into the script.
1220 1220
1221 1221 By default, `%run` executes scripts in a completely empty namespace, to better mimic how they would execute at the system prompt with plain Python. But if you use the `-i` flag, the script will also see your interactively defined variables. This lets you edit in a script larger amounts of code that still behave as if you had typed them at the IPython prompt.
1222 1222
1223 1223 You can also get a summary of the time taken by your script with the `-t` flag; consider a different script `randsvd.py` that takes a bit longer to run:
1224 1224
1225 1225 In [21]: run -t randsvd.py
1226 1226
1227 1227 IPython CPU timings (estimated):
1228 1228 User : 0.38 s.
1229 1229 System : 0.04 s.
1230 1230 Wall time: 0.34 s.
1231 1231
1232 1232 `User` is the time spent by the computer executing your code, while `System` is the time the operating system had to work on your behalf, doing things like memory allocation that are needed by your code but that you didn't explicitly program and that happen inside the kernel. The `Wall time` is the time on a 'clock on the wall' between the start and end of your program.
1233 1233
1234 1234 If `Wall > User+System`, your code is most likely waiting idle for certain periods. That could be waiting for data to arrive from a remote source or perhaps because the operating system has to swap large amounts of virtual memory. If you know that your code doesn't explicitly wait for remote data to arrive, you should investigate further to identify possible ways of improving the performance profile.
1235 1235
1236 1236 If you only want to time how long a single statement takes, you don't need to put it into a script as you can use the `%timeit` magic, which uses Python's `timeit` module to very carefully measure timing data; `timeit` can measure even short statements that execute extremely fast:
1237 1237
1238 1238 In [27]: %timeit a=1
1239 1239 10000000 loops, best of 3: 23 ns per loop
1240 1240
1241 1241 and for code that runs longer, it automatically adjusts so the overall measurement doesn't take too long:
1242 1242
1243 1243 In [28]: %timeit np.linalg.svd(x)
1244 1244 1 loops, best of 3: 310 ms per loop
1245 1245
1246 1246 The `%run` magic still has more options for debugging and profiling data; you should read its documentation for many useful details (as always, just type `%run?`).
1247 1247
1248 1248 ## The graphical Qt console
1249 1249
1250 1250 If you type at the system prompt (see the IPython website for installation details, as this requires some additional libraries):
1251 1251
1252 1252 $ ipython qtconsole
1253 1253
1254 1254 instead of opening in a terminal as before, IPython will start a graphical console that at first sight appears just like a terminal, but which is in fact much more capable than a text-only terminal. This is a specialized terminal designed for interactive scientific work, and it supports full multi-line editing with color highlighting and graphical calltips for functions, it can keep multiple IPython sessions open simultaneously in tabs, and when scripts run it can display the figures inline directly in the work area.
1255 1255
1256 1256 <center><img src="ipython_qtconsole2.png" width=400px></center>
1257 1257
1258 1258 % This cell is for the pdflatex output only
1259 1259 \begin{figure}[htbp]
1260 1260 \centering
1261 1261 \includegraphics[width=3in]{ipython_qtconsole2.png}
1262 1262 \caption{The IPython Qt console: a lightweight terminal for scientific exploration, with code, results and graphics in a single environment.}
1263 1263 \end{figure}
1264 1264
1265 1265 The Qt console accepts the same `--pylab` startup flags as the terminal, but you can additionally supply the value `--pylab inline`, which enables the support for inline graphics shown in the figure. This is ideal for keeping all the code and figures in the same session, given that the console can save the output of your entire session to HTML or PDF.
1266 1266
1267 1267 Since the Qt console makes it far more convenient than the terminal to edit blocks of code with multiple lines, in this environment it's worth knowing about the `%loadpy` magic function. `%loadpy` takes a path to a local file or remote URL, fetches its contents, and puts it in the work area for you to further edit and execute. It can be an extremely fast and convenient way of loading code from local disk or remote examples from sites such as the [Matplotlib gallery](http://matplotlib.sourceforge.net/gallery.html).
1268 1268
1269 1269 Other than its enhanced capabilities for code and graphics, all of the features of IPython we've explained before remain functional in this graphical console.
1270 1270
1271 1271 ## The IPython Notebook
1272 1272
1273 1273 The third way to interact with IPython, in addition to the terminal and graphical Qt console, is a powerful web interface called the "IPython Notebook". If you run at the system console (you can omit the `pylab` flags if you don't need plotting support):
1274 1274
1275 1275 $ ipython notebook --pylab inline
1276 1276
1277 1277 IPython will start a process that runs a web server in your local machine and to which a web browser can connect. The Notebook is a workspace that lets you execute code in blocks called 'cells' and displays any results and figures, but which can also contain arbitrary text (including LaTeX-formatted mathematical expressions) and any rich media that a modern web browser is capable of displaying.
1278 1278
1279 1279 <center><img src="ipython-notebook-specgram-2.png" width=400px></center>
1280 1280
1281 1281 % This cell is for the pdflatex output only
1282 1282 \begin{figure}[htbp]
1283 1283 \centering
1284 1284 \includegraphics[width=3in]{ipython-notebook-specgram-2.png}
1285 1285 \caption{The IPython Notebook: text, equations, code, results, graphics and other multimedia in an open format for scientific exploration and collaboration}
1286 1286 \end{figure}
1287 1287
1288 1288 In fact, this document was written as a Notebook, and only exported to LaTeX for printing. Inside of each cell, all the features of IPython that we have discussed before remain functional, since ultimately this web client is communicating with the same IPython code that runs in the terminal. But this interface is a much more rich and powerful environment for maintaining long-term "live and executable" scientific documents.
1289 1289
1290 1290 Notebook environments have existed in commercial systems like Mathematica(TM) and Maple(TM) for a long time; in the open source world the [Sage](http://sagemath.org) project blazed this particular trail starting in 2006, and now we bring all the features that have made IPython such a widely used tool to a Notebook model.
1291 1291
1292 1292 Since the Notebook runs as a web application, it is possible to configure it for remote access, letting you run your computations on a persistent server close to your data, which you can then access remotely from any browser-equipped computer. We encourage you to read the extensive documentation provided by the IPython project for details on how to do this and many more features of the notebook.
1293 1293
1294 1294 Finally, as we said earlier, IPython also has a high-level and easy to use set of libraries for parallel computing, that let you control (interactively if desired) not just one IPython but an entire cluster of 'IPython engines'. Unfortunately a detailed discussion of these tools is beyond the scope of this text, but should you need to parallelize your analysis codes, a quick read of the tutorials and examples provided at the IPython site may prove fruitful.
@@ -1,1181 +1,1181 b''
1 1 ## An Introduction to the Scientific Python Ecosystem
2 2
3 3 # While the Python language is an excellent tool for general-purpose programming, with a highly readable syntax, rich and powerful data types (strings, lists, sets, dictionaries, arbitrary length integers, etc) and a very comprehensive standard library, it was not designed specifically for mathematical and scientific computing. Neither the language nor its standard library have facilities for the efficient representation of multidimensional datasets, tools for linear algebra and general matrix manipulations (an essential building block of virtually all technical computing), nor any data visualization facilities.
4 4 #
5 5 # In particular, Python lists are very flexible containers that can be nested arbitrarily deep and which can hold any Python object in them, but they are poorly suited to represent efficiently common mathematical constructs like vectors and matrices. In contrast, much of our modern heritage of scientific computing has been built on top of libraries written in the Fortran language, which has native support for vectors and matrices as well as a library of mathematical functions that can efficiently operate on entire arrays at once.
6 6
7 7 ### Scientific Python: a collaboration of projects built by scientists
8 8
9 9 # The scientific community has developed a set of related Python libraries that provide powerful array facilities, linear algebra, numerical algorithms, data visualization and more. In this appendix, we will briefly outline the tools most frequently used for this purpose, that make "Scientific Python" something far more powerful than the Python language alone.
10 10 #
11 11 # For reasons of space, we can only describe in some detail the central Numpy library, but below we provide links to the websites of each project where you can read their documentation in more detail.
12 12 #
13 13 # First, let's look at an overview of the basic tools that most scientists use in daily research with Python. The core of this ecosystem is composed of:
14 14 #
15 15 # * Numpy: the basic library that most others depend on, it provides a powerful array type that can represent multidimensional datasets of many different kinds and that supports arithmetic operations. Numpy also provides a library of common mathematical functions, basic linear algebra, random number generation and Fast Fourier Transforms. Numpy can be found at [numpy.scipy.org](http://numpy.scipy.org)
16 16 #
17 17 # * Scipy: a large collection of numerical algorithms that operate on numpy arrays and provide facilities for many common tasks in scientific computing, including dense and sparse linear algebra support, optimization, special functions, statistics, n-dimensional image processing, signal processing and more. Scipy can be found at [scipy.org](http://scipy.org).
18 18 #
19 19 # * Matplotlib: a data visualization library with a strong focus on producing high-quality output, it supports a variety of common scientific plot types in two and three dimensions, with precise control over the final output and format for publication-quality results. Matplotlib can also be controlled interactively allowing graphical manipulation of your data (zooming, panning, etc) and can be used with most modern user interface toolkits. It can be found at [matplotlib.sf.net](http://matplotlib.sf.net).
20 20 #
21 21 # * IPython: while not strictly scientific in nature, IPython is the interactive environment in which many scientists spend their time. IPython provides a powerful Python shell that integrates tightly with Matplotlib and with easy access to the files and operating system, and which can execute in a terminal or in a graphical Qt console. IPython also has a web-based notebook interface that can combine code with text, mathematical expressions, figures and multimedia. It can be found at [ipython.org](http://ipython.org).
22 22 #
23 23 # While each of these tools can be installed separately, in our opinion the most convenient way today of accessing them (especially on Windows and Mac computers) is to install the [Free Edition of the Enthought Python Distribution](http://www.enthought.com/products/epd_free.php) which contains all the above. Other free alternatives on Windows (but not on Macs) are [Python(x,y)](http://code.google.com/p/pythonxy) and [Christoph Gohlke's packages page](http://www.lfd.uci.edu/~gohlke/pythonlibs).
24 24 #
25 25 # These four 'core' libraries are in practice complemented by a number of other tools for more specialized work. We will briefly list here the ones that we think are the most commonly needed:
26 26 #
27 27 # * Sympy: a symbolic manipulation tool that turns a Python session into a computer algebra system. It integrates with the IPython notebook, rendering results in properly typeset mathematical notation. [sympy.org](http://sympy.org).
28 28 #
29 29 # * Mayavi: sophisticated 3d data visualization; [code.enthought.com/projects/mayavi](http://code.enthought.com/projects/mayavi).
30 30 #
31 31 # * Cython: a bridge language between Python and C, useful both to optimize performance bottlenecks in Python and to access C libraries directly; [cython.org](http://cython.org).
32 32 #
33 33 # * Pandas: high-performance data structures and data analysis tools, with powerful data alignment and structural manipulation capabilities; [pandas.pydata.org](http://pandas.pydata.org).
34 34 #
35 35 # * Statsmodels: statistical data exploration and model estimation; [statsmodels.sourceforge.net](http://statsmodels.sourceforge.net).
36 36 #
37 37 # * Scikit-learn: general purpose machine learning algorithms with a common interface; [scikit-learn.org](http://scikit-learn.org).
38 38 #
39 39 # * Scikits-image: image processing toolbox; [scikits-image.org](http://scikits-image.org).
40 40 #
41 41 # * NetworkX: analysis of complex networks (in the graph theoretical sense); [networkx.lanl.gov](http://networkx.lanl.gov).
42 42 #
43 43 # * PyTables: management of hierarchical datasets using the industry-standard HDF5 format; [www.pytables.org](http://www.pytables.org).
44 44 #
45 45 # Beyond these, for any specific problem you should look on the internet first, before starting to write code from scratch. There's a good chance that someone, somewhere, has written an open source library that you can use for part or all of your problem.
46 46
47 47 ### A note about the examples below
48 48
49 49 # In all subsequent examples, you will see blocks of input code, followed by the results of the code if the code generated output. This output may include text, graphics and other result objects. These blocks of input can be pasted into your interactive IPython session or notebook for you to execute. In the print version of this document, a thin vertical bar on the left of the blocks of input and output shows which blocks go together.
50 50 #
51 51 # If you are reading this text as an actual IPython notebook, you can press `Shift-Enter` or use the 'play' button on the toolbar (right-pointing triangle) to execute each block of code, known as a 'cell' in IPython:
52 52
53 53 # In[71]:
54 54 # This is a block of code, below you'll see its output
55 55 print "Welcome to the world of scientific computing with Python!"
56 56
57 57 # Out[71]:
58 58 # Welcome to the world of scientific computing with Python!
59 59 #
60 60 ## Motivation: the trapezoidal rule
61 61
62 62 # In subsequent sections we'll provide a basic introduction to the nuts and bolts of the basic scientific python tools; but we'll first motivate it with a brief example that illustrates what you can do in a few lines with these tools. For this, we will use the simple problem of approximating a definite integral with the trapezoid rule:
63 63 #
64 64 # $$
65 65 # \int_{a}^{b} f(x)\, dx \approx \frac{1}{2} \sum_{k=1}^{N} \left( x_{k} - x_{k-1} \right) \left( f(x_{k}) + f(x_{k-1}) \right).
66 66 # $$
67 67 #
68 68 # Our task will be to compute this formula for a function such as:
69 69 #
70 70 # $$
71 71 # f(x) = (x-3)(x-5)(x-7)+85
72 72 # $$
73 73 #
74 74 # integrated between $a=1$ and $b=9$.
75 75 #
76 76 # First, we define the function and sample it evenly between 0 and 10 at 200 points:
77 77
78 78 # In[1]:
79 79 def f(x):
80 80 return (x-3)*(x-5)*(x-7)+85
81 81
82 82 import numpy as np
83 83 x = np.linspace(0, 10, 200)
84 84 y = f(x)
85 85
86 86 # We select $a$ and $b$, our integration limits, and we take only a few points in that region to illustrate the error behavior of the trapezoid approximation:
87 87
88 88 # In[2]:
89 89 a, b = 1, 9
90 90 xint = x[logical_and(x>=a, x<=b)][::30]
91 91 yint = y[logical_and(x>=a, x<=b)][::30]
92 92
93 93 # Let's plot both the function and the area below it in the trapezoid approximation:
94 94
95 95 # In[3]:
96 96 import matplotlib.pyplot as plt
97 97 plt.plot(x, y, lw=2)
98 98 plt.axis([0, 10, 0, 140])
99 99 plt.fill_between(xint, 0, yint, facecolor='gray', alpha=0.4)
100 100 plt.text(0.5 * (a + b), 30,r"$\int_a^b f(x)dx$", horizontalalignment='center', fontsize=20);
101 101
102 102 # Out[3]:
103 # image file: /Users/bussonniermatthias/nbconvert/tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_00.svg
103 # image file: tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_00.svg
104 104
105 105 # Compute the integral both at high accuracy and with the trapezoid approximation
106 106
107 107 # In[4]:
108 108 from scipy.integrate import quad, trapz
109 109 integral, error = quad(f, 1, 9)
110 110 trap_integral = trapz(yint, xint)
111 111 print "The integral is: %g +/- %.1e" % (integral, error)
112 112 print "The trapezoid approximation with", len(xint), "points is:", trap_integral
113 113 print "The absolute error is:", abs(integral - trap_integral)
114 114
115 115 # Out[4]:
116 116 # The integral is: 680 +/- 7.5e-12
117 117 # The trapezoid approximation with 6 points is: 621.286411141
118 118 # The absolute error is: 58.7135888589
119 119 #
120 120 # This simple example showed us how, by combining the numpy, scipy and matplotlib libraries, we can provide an illustration of a standard method in elementary calculus with just a few lines of code. We will now discuss in more detail the basic usage of these tools.
121 121
122 122 ## NumPy arrays: the right data structure for scientific computing
123 123
124 124 ### Basics of Numpy arrays
125 125
126 126 # We now turn our attention to the Numpy library, which forms the base layer for the entire 'scipy ecosystem'. Once you have installed numpy, you can import it as
127 127
128 128 # In[5]:
129 129 import numpy
130 130
131 131 # though in this book we will use the common shorthand
132 132
133 133 # In[6]:
134 134 import numpy as np
135 135
136 136 # As mentioned above, the main object provided by numpy is a powerful array. We'll start by exploring how the numpy array differs from Python lists. We start by creating a simple list and an array with the same contents of the list:
137 137
138 138 # In[7]:
139 139 lst = [10, 20, 30, 40]
140 140 arr = np.array([10, 20, 30, 40])
141 141
142 142 # Elements of a one-dimensional array are accessed with the same syntax as a list:
143 143
144 144 # In[8]:
145 145 lst[0]
146 146
147 147 # Out[8]:
148 148 # 10
149 149
150 150
151 151 # In[9]:
152 152 arr[0]
153 153
154 154 # Out[9]:
155 155 # 10
156 156
157 157
158 158 # In[10]:
159 159 arr[-1]
160 160
161 161 # Out[10]:
162 162 # 40
163 163
164 164
165 165 # In[11]:
166 166 arr[2:]
167 167
168 168 # Out[11]:
169 169 # array([30, 40])
170 170
171 171
172 172 # The first difference to note between lists and arrays is that arrays are *homogeneous*; i.e. all elements of an array must be of the same type. In contrast, lists can contain elements of arbitrary type. For example, we can change the last element in our list above to be a string:
173 173
174 174 # In[12]:
175 175 lst[-1] = 'a string inside a list'
176 176 lst
177 177
178 178 # Out[12]:
179 179 # [10, 20, 30, 'a string inside a list']
180 180
181 181
182 182 # but the same can not be done with an array, as we get an error message:
183 183
184 184 # In[13]:
185 185 arr[-1] = 'a string inside an array'
186 186
187 187 # Out[13]:
188 188 ---------------------------------------------------------------------------
189 189 ValueError Traceback (most recent call last)
190 190 /home/fperez/teach/book-math-labtool/<ipython-input-13-29c0bfa5fa8a> in <module>()
191 191 ----> 1 arr[-1] = 'a string inside an array'
192 192
193 193 ValueError: invalid literal for long() with base 10: 'a string inside an array'
194 194
195 195 # The information about the type of an array is contained in its *dtype* attribute:
196 196
197 197 # In[14]:
198 198 arr.dtype
199 199
200 200 # Out[14]:
201 201 # dtype('int32')
202 202
203 203
204 204 # Once an array has been created, its dtype is fixed and it can only store elements of the same type. For this example where the dtype is integer, if we store a floating point number it will be automatically converted into an integer:
205 205
206 206 # In[15]:
207 207 arr[-1] = 1.234
208 208 arr
209 209
210 210 # Out[15]:
211 211 # array([10, 20, 30, 1])
212 212
213 213
214 214 # Above we created an array from an existing list; now let us see other ways in which we can create arrays, which we'll illustrate next. A common need is to have an array initialized with a constant value, and very often this value is 0 or 1 (suitable as starting value for additive and multiplicative loops respectively); `zeros` creates arrays of all zeros, with any desired dtype:
215 215
216 216 # In[16]:
217 217 np.zeros(5, float)
218 218
219 219 # Out[16]:
220 220 # array([ 0., 0., 0., 0., 0.])
221 221
222 222
223 223 # In[17]:
224 224 np.zeros(3, int)
225 225
226 226 # Out[17]:
227 227 # array([0, 0, 0])
228 228
229 229
230 230 # In[18]:
231 231 np.zeros(3, complex)
232 232
233 233 # Out[18]:
234 234 # array([ 0.+0.j, 0.+0.j, 0.+0.j])
235 235
236 236
237 237 # and similarly for `ones`:
238 238
239 239 # In[19]:
240 240 print '5 ones:', np.ones(5)
241 241
242 242 # Out[19]:
243 243 # 5 ones: [ 1. 1. 1. 1. 1.]
244 244 #
245 245 # If we want an array initialized with an arbitrary value, we can create an empty array and then use the fill method to put the value we want into the array:
246 246
247 247 # In[20]:
248 248 a = empty(4)
249 249 a.fill(5.5)
250 250 a
251 251
252 252 # Out[20]:
253 253 # array([ 5.5, 5.5, 5.5, 5.5])
254 254
255 255
256 256 # Numpy also offers the `arange` function, which works like the builtin `range` but returns an array instead of a list:
257 257
258 258 # In[21]:
259 259 np.arange(5)
260 260
261 261 # Out[21]:
262 262 # array([0, 1, 2, 3, 4])
263 263
264 264
265 265 # and the `linspace` and `logspace` functions to create linearly and logarithmically-spaced grids respectively, with a fixed number of points and including both ends of the specified interval:
266 266
267 267 # In[22]:
268 268 print "A linear grid between 0 and 1:", np.linspace(0, 1, 5)
269 269 print "A logarithmic grid between 10**1 and 10**4: ", np.logspace(1, 4, 4)
270 270
271 271 # Out[22]:
272 272 # A linear grid between 0 and 1: [ 0. 0.25 0.5 0.75 1. ]
273 273 # A logarithmic grid between 10**1 and 10**4: [ 10. 100. 1000. 10000.]
274 274 #
275 275 # Finally, it is often useful to create arrays with random numbers that follow a specific distribution. The `np.random` module contains a number of functions that can be used to this effect, for example this will produce an array of 5 random samples taken from a standard normal distribution (0 mean and variance 1):
276 276
277 277 # In[23]:
278 278 np.random.randn(5)
279 279
280 280 # Out[23]:
281 281 # array([-0.08633343, -0.67375434, 1.00589536, 0.87081651, 1.65597822])
282 282
283 283
284 284 # whereas this will also give 5 samples, but from a normal distribution with a mean of 10 and a variance of 3:
285 285
286 286 # In[24]:
287 287 norm10 = np.random.normal(10, 3, 5)
288 288 norm10
289 289
290 290 # Out[24]:
291 291 # array([ 8.94879575, 5.53038269, 8.24847281, 12.14944165, 11.56209294])
292 292
293 293
294 294 ### Indexing with other arrays
295 295
296 296 # Above we saw how to index arrays with single numbers and slices, just like Python lists. But arrays allow for a more sophisticated kind of indexing which is very powerful: you can index an array with another array, and in particular with an array of boolean values. This is particularly useful to extract information from an array that matches a certain condition.
297 297 #
298 298 # Consider for example that in the array `norm10` we want to replace all values above 9 with the value 0. We can do so by first finding the *mask* that indicates where this condition is true or false:
299 299
300 300 # In[25]:
301 301 mask = norm10 > 9
302 302 mask
303 303
304 304 # Out[25]:
305 305 # array([False, False, False, True, True], dtype=bool)
306 306
307 307
308 308 # Now that we have this mask, we can use it to either read those values or to reset them to 0:
309 309
310 310 # In[26]:
311 311 print 'Values above 9:', norm10[mask]
312 312
313 313 # Out[26]:
314 314 # Values above 9: [ 12.14944165 11.56209294]
315 315 #
316 316 # In[27]:
317 317 print 'Resetting all values above 9 to 0...'
318 318 norm10[mask] = 0
319 319 print norm10
320 320
321 321 # Out[27]:
322 322 # Resetting all values above 9 to 0...
323 323 # [ 8.94879575 5.53038269 8.24847281 0. 0. ]
324 324 #
325 325 ### Arrays with more than one dimension
326 326
327 327 # Up until now all our examples have used one-dimensional arrays. But Numpy can create arrays of arbitrary dimensions, and all the methods illustrated in the previous section work with more than one dimension. For example, a list of lists can be used to initialize a two-dimensional array:
328 328
329 329 # In[28]:
330 330 lst2 = [[1, 2], [3, 4]]
331 331 arr2 = np.array([[1, 2], [3, 4]])
332 332 arr2
333 333
334 334 # Out[28]:
335 335 # array([[1, 2],
336 336 # [3, 4]])
337 337
338 338
339 339 # With two-dimensional arrays we start seeing the power of numpy: while a nested list can be indexed using repeatedly the `[ ]` operator, multidimensional arrays support a much more natural indexing syntax with a single `[ ]` and a set of indices separated by commas:
340 340
341 341 # In[29]:
342 342 print lst2[0][1]
343 343 print arr2[0,1]
344 344
345 345 # Out[29]:
346 346 # 2
347 347 # 2
348 348 #
349 349 # Most of the array creation functions listed above can be used with more than one dimension, for example:
350 350
351 351 # In[30]:
352 352 np.zeros((2,3))
353 353
354 354 # Out[30]:
355 355 # array([[ 0., 0., 0.],
356 356 # [ 0., 0., 0.]])
357 357
358 358
359 359 # In[31]:
360 360 np.random.normal(10, 3, (2, 4))
361 361
362 362 # Out[31]:
363 363 # array([[ 11.26788826, 4.29619866, 11.09346496, 9.73861307],
364 364 # [ 10.54025996, 9.5146268 , 10.80367214, 13.62204505]])
365 365
366 366
367 367 # In fact, the shape of an array can be changed at any time, as long as the total number of elements is unchanged. For example, if we want a 2x4 array with numbers increasing from 0, the easiest way to create it is:
368 368
369 369 # In[32]:
370 370 arr = np.arange(8).reshape(2,4)
371 371 print arr
372 372
373 373 # Out[32]:
374 374 # [[0 1 2 3]
375 375 # [4 5 6 7]]
376 376 #
377 377 # With multidimensional arrays, you can also use slices, and you can mix and match slices and single indices in the different dimensions (using the same array as above):
378 378
379 379 # In[33]:
380 380 print 'Slicing in the second row:', arr[1, 2:4]
381 381 print 'All rows, third column :', arr[:, 2]
382 382
383 383 # Out[33]:
384 384 # Slicing in the second row: [6 7]
385 385 # All rows, third column : [2 6]
386 386 #
387 387 # If you only provide one index, then you will get an array with one less dimension containing that row:
388 388
389 389 # In[34]:
390 390 print 'First row: ', arr[0]
391 391 print 'Second row: ', arr[1]
392 392
393 393 # Out[34]:
394 394 # First row: [0 1 2 3]
395 395 # Second row: [4 5 6 7]
396 396 #
397 397 # Now that we have seen how to create arrays with more than one dimension, it's a good idea to look at some of the most useful properties and methods that arrays have. The following provide basic information about the size, shape and data in the array:
398 398
399 399 # In[35]:
400 400 print 'Data type :', arr.dtype
401 401 print 'Total number of elements :', arr.size
402 402 print 'Number of dimensions :', arr.ndim
403 403 print 'Shape (dimensionality) :', arr.shape
404 404 print 'Memory used (in bytes) :', arr.nbytes
405 405
406 406 # Out[35]:
407 407 # Data type : int32
408 408 # Total number of elements : 8
409 409 # Number of dimensions : 2
410 410 # Shape (dimensionality) : (2, 4)
411 411 # Memory used (in bytes) : 32
412 412 #
413 413 # Arrays also have many useful methods, some especially useful ones are:
414 414
415 415 # In[36]:
416 416 print 'Minimum and maximum :', arr.min(), arr.max()
417 417 print 'Sum and product of all elements :', arr.sum(), arr.prod()
418 418 print 'Mean and standard deviation :', arr.mean(), arr.std()
419 419
420 420 # Out[36]:
421 421 # Minimum and maximum : 0 7
422 422 # Sum and product of all elements : 28 0
423 423 # Mean and standard deviation : 3.5 2.29128784748
424 424 #
425 425 # For these methods, the above operations are all computed on all the elements of the array. But for a multidimensional array, it's possible to do the computation along a single dimension, by passing the `axis` parameter; for example:
426 426
427 427 # In[37]:
428 428 print 'For the following array:\n', arr
429 429 print 'The sum of elements along the rows is :', arr.sum(axis=1)
430 430 print 'The sum of elements along the columns is :', arr.sum(axis=0)
431 431
432 432 # Out[37]:
433 433 # For the following array:
434 434 # [[0 1 2 3]
435 435 # [4 5 6 7]]
436 436 # The sum of elements along the rows is : [ 6 22]
437 437 # The sum of elements along the columns is : [ 4 6 8 10]
438 438 #
439 439 # As you can see in this example, the value of the `axis` parameter is the dimension which will be *consumed* once the operation has been carried out. This is why, to sum the elements of each row, we use `axis=1`: the second dimension (of length 4) is consumed, leaving one value per row.
440 440 #
441 441 # This can be easily illustrated with an example that has more dimensions; we create an array with 4 dimensions and shape `(3,4,5,6)` and sum along the axis number 2 (i.e. the *third* axis, since in Python all counts are 0-based). That consumes the dimension whose length was 5, leaving us with a new array that has shape `(3,4,6)`:
442 442
443 443 # In[38]:
444 444 np.zeros((3,4,5,6)).sum(2).shape
445 445
446 446 # Out[38]:
447 447 # (3, 4, 6)
448 448
449 449
450 450 # Another widely used property of arrays is the `.T` attribute, which allows you to access the transpose of the array:
451 451
452 452 # In[39]:
453 453 print 'Array:\n', arr
454 454 print 'Transpose:\n', arr.T
455 455
456 456 # Out[39]:
457 457 # Array:
458 458 # [[0 1 2 3]
459 459 # [4 5 6 7]]
460 460 # Transpose:
461 461 # [[0 4]
462 462 # [1 5]
463 463 # [2 6]
464 464 # [3 7]]
465 465 #
466 466 # We don't have time here to look at all the methods and properties of arrays, but here's a complete list. Simply try exploring some of these in IPython to learn more, or read their description in the full Numpy documentation:
467 467 #
468 468 # arr.T arr.copy arr.getfield arr.put arr.squeeze
469 469 # arr.all arr.ctypes arr.imag arr.ravel arr.std
470 470 # arr.any arr.cumprod arr.item arr.real arr.strides
471 471 # arr.argmax arr.cumsum arr.itemset arr.repeat arr.sum
472 472 # arr.argmin arr.data arr.itemsize arr.reshape arr.swapaxes
473 473 # arr.argsort arr.diagonal arr.max arr.resize arr.take
474 474 # arr.astype arr.dot arr.mean arr.round arr.tofile
475 475 # arr.base arr.dtype arr.min arr.searchsorted arr.tolist
476 476 # arr.byteswap arr.dump arr.nbytes arr.setasflat arr.tostring
477 477 # arr.choose arr.dumps arr.ndim arr.setfield arr.trace
478 478 # arr.clip arr.fill arr.newbyteorder arr.setflags arr.transpose
479 479 # arr.compress arr.flags arr.nonzero arr.shape arr.var
480 480 # arr.conj arr.flat arr.prod arr.size arr.view
481 481 # arr.conjugate arr.flatten arr.ptp arr.sort
482 482
483 483 ### Operating with arrays
484 484
485 485 # Arrays support all regular arithmetic operators, and the numpy library also contains a complete collection of basic mathematical functions that operate on arrays. It is important to remember that in general, all operations with arrays are applied *element-wise*, i.e., are applied to all the elements of the array at the same time. Consider for example:
486 486
487 487 # In[40]:
488 488 arr1 = np.arange(4)
489 489 arr2 = np.arange(10, 14)
490 490 print arr1, '+', arr2, '=', arr1+arr2
491 491
492 492 # Out[40]:
493 493 # [0 1 2 3] + [10 11 12 13] = [10 12 14 16]
494 494 #
495 495 # Importantly, you must remember that even the multiplication operator is by default applied element-wise, it is *not* the matrix multiplication from linear algebra (as is the case in Matlab, for example):
496 496
497 497 # In[41]:
498 498 print arr1, '*', arr2, '=', arr1*arr2
499 499
500 500 # Out[41]:
501 501 # [0 1 2 3] * [10 11 12 13] = [ 0 11 24 39]
502 502 #
503 503 # While this means that in principle arrays must always match in their dimensionality in order for an operation to be valid, numpy will *broadcast* dimensions when possible. For example, suppose that you want to add the number 1.5 to `arr1`; the following would be a valid way to do it:
504 504
505 505 # In[42]:
506 506 arr1 + 1.5*np.ones(4)
507 507
508 508 # Out[42]:
509 509 # array([ 1.5, 2.5, 3.5, 4.5])
510 510
511 511
512 512 # But thanks to numpy's broadcasting rules, the following is equally valid:
513 513
514 514 # In[43]:
515 515 arr1 + 1.5
516 516
517 517 # Out[43]:
518 518 # array([ 1.5, 2.5, 3.5, 4.5])
519 519
520 520
521 521 # In this case, numpy looked at both operands and saw that the first (`arr1`) was a one-dimensional array of length 4 and the second was a scalar, considered a zero-dimensional object. The broadcasting rules allow numpy to:
522 522 #
523 523 # * *create* new dimensions of length 1 (since this doesn't change the size of the array)
524 524 # * 'stretch' a dimension of length 1 that needs to be matched to a dimension of a different size.
525 525 #
526 526 # So in the above example, the scalar 1.5 is effectively:
527 527 #
528 528 # * first 'promoted' to a 1-dimensional array of length 1
529 529 # * then, this array is 'stretched' to length 4 to match the dimension of `arr1`.
530 530 #
531 531 # After these two operations are complete, the addition can proceed as now both operands are one-dimensional arrays of length 4.
532 532 #
533 533 # This broadcasting behavior is in practice enormously powerful, especially because when numpy broadcasts to create new dimensions or to 'stretch' existing ones, it doesn't actually replicate the data. In the example above the operation is carried *as if* the 1.5 was a 1-d array with 1.5 in all of its entries, but no actual array was ever created. This can save lots of memory in cases when the arrays in question are large and can have significant performance implications.
534 534 #
535 535 # The general rule is: when operating on two arrays, NumPy compares their shapes element-wise. It starts with the trailing dimensions, and works its way forward, creating dimensions of length 1 as needed. Two dimensions are considered compatible when
536 536 #
537 537 # * they are equal to begin with, or
538 538 # * one of them is 1; in this case numpy will do the 'stretching' to make them equal.
539 539 #
540 540 # If these conditions are not met, a `ValueError: operands could not be broadcast together` exception is thrown, indicating that the arrays have incompatible shapes. The size of the resulting array is the maximum size along each dimension of the input arrays.
541 541
542 542 # This shows how the broadcasting rules work in several dimensions:
543 543
544 544 # In[44]:
545 545 b = np.array([2, 3, 4, 5])
546 546 print arr, '\n\n+', b , '\n----------------\n', arr + b
547 547
548 548 # Out[44]:
549 549 # [[0 1 2 3]
550 550 # [4 5 6 7]]
551 551 #
552 552 # + [2 3 4 5]
553 553 # ----------------
554 554 # [[ 2 4 6 8]
555 555 # [ 6 8 10 12]]
556 556 #
557 557 # Now, how could you use broadcasting to say add `[4, 6]` along the rows to `arr` above? Simply performing the direct addition will produce the error we previously mentioned:
558 558
559 559 # In[45]:
560 560 c = np.array([4, 6])
561 561 arr + c
562 562
563 563 # Out[45]:
564 564 ---------------------------------------------------------------------------
565 565 ValueError Traceback (most recent call last)
566 566 /home/fperez/teach/book-math-labtool/<ipython-input-45-62aa20ac1980> in <module>()
567 567 1 c = np.array([4, 6])
568 568 ----> 2 arr + c
569 569
570 570 ValueError: operands could not be broadcast together with shapes (2,4) (2)
571 571
572 572 # According to the rules above, the array `c` would need to have a *trailing* dimension of 1 for the broadcasting to work. It turns out that numpy allows you to 'inject' new dimensions anywhere into an array on the fly, by indexing it with the special object `np.newaxis`:
573 573
574 574 # In[46]:
575 575 (c[:, np.newaxis]).shape
576 576
577 577 # Out[46]:
578 578 # (2, 1)
579 579
580 580
581 581 # This is exactly what we need, and indeed it works:
582 582
583 583 # In[47]:
584 584 arr + c[:, np.newaxis]
585 585
586 586 # Out[47]:
587 587 # array([[ 4, 5, 6, 7],
588 588 # [10, 11, 12, 13]])
589 589
590 590
591 591 # For the full broadcasting rules, please see the official Numpy docs, which describe them in detail and with more complex examples.
592 592
593 593 # As we mentioned before, Numpy ships with a full complement of mathematical functions that work on entire arrays, including logarithms, exponentials, trigonometric and hyperbolic trigonometric functions, etc. Furthermore, scipy ships a rich special function library in the `scipy.special` module that includes Bessel, Airy, Fresnel, Laguerre and other classical special functions. For example, sampling the sine function at 100 points between $0$ and $2\pi$ is as simple as:
594 594
595 595 # In[48]:
596 596 x = np.linspace(0, 2*np.pi, 100)
597 597 y = np.sin(x)
598 598
599 599 ### Linear algebra in numpy
600 600
601 601 # Numpy ships with a basic linear algebra library, and all arrays have a `dot` method whose behavior is that of the scalar dot product when its arguments are vectors (one-dimensional arrays) and the traditional matrix multiplication when one or both of its arguments are two-dimensional arrays:
602 602
603 603 # In[49]:
604 604 v1 = np.array([2, 3, 4])
605 605 v2 = np.array([1, 0, 1])
606 606 print v1, '.', v2, '=', v1.dot(v2)
607 607
608 608 # Out[49]:
609 609 # [2 3 4] . [1 0 1] = 6
610 610 #
611 611 # Here is a regular matrix-vector multiplication, note that the array `v1` should be viewed as a *column* vector in traditional linear algebra notation; numpy makes no distinction between row and column vectors and simply verifies that the dimensions match the required rules of matrix multiplication, in this case we have a $2 \times 3$ matrix multiplied by a 3-vector, which produces a 2-vector:
612 612
613 613 # In[50]:
614 614 A = np.arange(6).reshape(2, 3)
615 615 print A, 'x', v1, '=', A.dot(v1)
616 616
617 617 # Out[50]:
618 618 # [[0 1 2]
619 619 # [3 4 5]] x [2 3 4] = [11 38]
620 620 #
621 621 # For matrix-matrix multiplication, the same dimension-matching rules must be satisfied, e.g. consider the difference between $A \times A^T$:
622 622
623 623 # In[51]:
624 624 print A.dot(A.T)
625 625
626 626 # Out[51]:
627 627 # [[ 5 14]
628 628 # [14 50]]
629 629 #
630 630 # and $A^T \times A$:
631 631
632 632 # In[52]:
633 633 print A.T.dot(A)
634 634
635 635 # Out[52]:
636 636 # [[ 9 12 15]
637 637 # [12 17 22]
638 638 # [15 22 29]]
639 639 #
640 640 # Furthermore, the `numpy.linalg` module includes additional functionality such as determinants, matrix norms, Cholesky, eigenvalue and singular value decompositions, etc. For even more linear algebra tools, `scipy.linalg` contains the majority of the tools in the classic LAPACK libraries as well as functions to operate on sparse matrices. We refer the reader to the Numpy and Scipy documentations for additional details on these.
641 641
642 642 ### Reading and writing arrays to disk
643 643
644 644 # Numpy lets you read and write arrays into files in a number of ways. In order to use these tools well, it is critical to understand the difference between a *text* and a *binary* file containing numerical data. In a text file, the number $\pi$ could be written as "3.141592653589793", for example: a string of digits that a human can read, with in this case 15 decimal digits. In contrast, that same number written to a binary file would be encoded as 8 characters (bytes) that are not readable by a human but which contain the exact same data that the variable `pi` had in the computer's memory.
645 645 #
646 646 # The tradeoffs between the two modes are thus:
647 647 #
648 648 # * Text mode: occupies more space, precision can be lost (if not all digits are written to disk), but is readable and editable by hand with a text editor. Can *only* be used for one- and two-dimensional arrays.
649 649 #
650 650 # * Binary mode: compact and exact representation of the data in memory, can't be read or edited by hand. Arrays of any size and dimensionality can be saved and read without loss of information.
651 651 #
652 652 # First, let's see how to read and write arrays in text mode. The `np.savetxt` function saves an array to a text file, with options to control the precision, separators and even adding a header:
653 653
654 654 # In[53]:
655 655 arr = np.arange(10).reshape(2, 5)
656 656 np.savetxt('test.out', arr, fmt='%.2e', header="My dataset")
657 657 !cat test.out
658 658
659 659 # Out[53]:
660 660 # # My dataset
661 661 # 0.00e+00 1.00e+00 2.00e+00 3.00e+00 4.00e+00
662 662 # 5.00e+00 6.00e+00 7.00e+00 8.00e+00 9.00e+00
663 663 #
664 664 # And this same type of file can then be read with the matching `np.loadtxt` function:
665 665
666 666 # In[54]:
667 667 arr2 = np.loadtxt('test.out')
668 668 print arr2
669 669
670 670 # Out[54]:
671 671 # [[ 0. 1. 2. 3. 4.]
672 672 # [ 5. 6. 7. 8. 9.]]
673 673 #
674 674 # For binary data, Numpy provides the `np.save` and `np.savez` routines. The first saves a single array to a file with `.npy` extension, while the latter can be used to save a *group* of arrays into a single file with `.npz` extension. The files created with these routines can then be read with the `np.load` function.
675 675 #
676 676 # Let us first see how to use the simpler `np.save` function to save a single array:
677 677
678 678 # In[55]:
679 679 np.save('test.npy', arr2)
680 680 # Now we read this back
681 681 arr2n = np.load('test.npy')
682 682 # Let's see if any element is non-zero in the difference.
683 683 # A value of True would be a problem.
684 684 print 'Any differences?', np.any(arr2-arr2n)
685 685
686 686 # Out[55]:
687 687 # Any differences? False
688 688 #
689 689 # Now let us see how the `np.savez` function works. You give it a filename and either a sequence of arrays or a set of keywords. In the first mode, the function will automatically name the saved arrays in the archive as `arr_0`, `arr_1`, etc:
690 690
691 691 # In[56]:
692 692 np.savez('test.npz', arr, arr2)
693 693 arrays = np.load('test.npz')
694 694 arrays.files
695 695
696 696 # Out[56]:
697 697 # ['arr_1', 'arr_0']
698 698
699 699
700 700 # Alternatively, we can explicitly choose how to name the arrays we save:
701 701
702 702 # In[57]:
703 703 np.savez('test.npz', array1=arr, array2=arr2)
704 704 arrays = np.load('test.npz')
705 705 arrays.files
706 706
707 707 # Out[57]:
708 708 # ['array2', 'array1']
709 709
710 710
711 711 # The object returned by `np.load` from an `.npz` file works like a dictionary, though you can also access its constituent files by attribute using its special `.f` field; this is best illustrated with an example with the `arrays` object from above:
712 712
713 713 # In[58]:
714 714 print 'First row of first array:', arrays['array1'][0]
715 715 # This is an equivalent way to get the same field
716 716 print 'First row of first array:', arrays.f.array1[0]
717 717
718 718 # Out[58]:
719 719 # First row of first array: [0 1 2 3 4]
720 720 # First row of first array: [0 1 2 3 4]
721 721 #
722 722 # This `.npz` format is a very convenient way to package compactly and without loss of information, into a single file, a group of related arrays that pertain to a specific problem. At some point, however, the complexity of your dataset may be such that the optimal approach is to use one of the standard formats in scientific data processing that have been designed to handle complex datasets, such as NetCDF or HDF5.
723 723 #
724 724 # Fortunately, there are tools for manipulating these formats in Python, and for storing data in other ways such as databases. A complete discussion of the possibilities is beyond the scope of this discussion, but of particular interest for scientific users we at least mention the following:
725 725 #
726 726 # * The `scipy.io` module contains routines to read and write Matlab files in `.mat` format and files in the NetCDF format that is widely used in certain scientific disciplines.
727 727 #
728 728 # * For manipulating files in the HDF5 format, there are two excellent options in Python: The PyTables project offers a high-level, object oriented approach to manipulating HDF5 datasets, while the h5py project offers a more direct mapping to the standard HDF5 library interface. Both are excellent tools; if you need to work with HDF5 datasets you should read some of their documentation and examples and decide which approach is a better match for your needs.
729 729
730 730 ## High quality data visualization with Matplotlib
731 731
732 732 # The [matplotlib](http://matplotlib.sf.net) library is a powerful tool capable of producing complex publication-quality figures with fine layout control in two and three dimensions; here we will only provide a minimal self-contained introduction to its usage that covers the functionality needed for the rest of the book. We encourage the reader to read the tutorials included with the matplotlib documentation as well as to browse its extensive gallery of examples that include source code.
733 733 #
734 734 # Just as we typically use the shorthand `np` for Numpy, we will use `plt` for the `matplotlib.pyplot` module where the easy-to-use plotting functions reside (the library contains a rich object-oriented architecture that we don't have the space to discuss here):
735 735
736 736 # In[59]:
737 737 import matplotlib.pyplot as plt
738 738
739 739 # The most frequently used function is simply called `plot`, here is how you can make a simple plot of $\sin(x)$ for $x \in [0, 2\pi]$ with labels and a grid (we use the semicolon in the last line to suppress the display of some information that is unnecessary right now):
740 740
741 741 # In[60]:
742 742 x = np.linspace(0, 2*np.pi)
743 743 y = np.sin(x)
744 744 plt.plot(x,y, label='sin(x)')
745 745 plt.legend()
746 746 plt.grid()
747 747 plt.title('Harmonic')
748 748 plt.xlabel('x')
749 749 plt.ylabel('y');
750 750
751 751 # Out[60]:
752 # image file: /Users/bussonniermatthias/nbconvert/tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_01.svg
752 # image file: tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_01.svg
753 753
754 754 # You can control the style, color and other properties of the markers, for example:
755 755
756 756 # In[61]:
757 757 plt.plot(x, y, linewidth=2);
758 758
759 759 # Out[61]:
760 # image file: /Users/bussonniermatthias/nbconvert/tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_02.svg
760 # image file: tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_02.svg
761 761
762 762 # In[62]:
763 763 plt.plot(x, y, 'o', markersize=5, color='r');
764 764
765 765 # Out[62]:
766 # image file: /Users/bussonniermatthias/nbconvert/tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_03.svg
766 # image file: tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_03.svg
767 767
768 768 # We will now see how to create a few other common plot types, such as a simple error plot:
769 769
770 770 # In[63]:
771 771 # example data
772 772 x = np.arange(0.1, 4, 0.5)
773 773 y = np.exp(-x)
774 774
775 775 # example variable error bar values
776 776 yerr = 0.1 + 0.2*np.sqrt(x)
777 777 xerr = 0.1 + yerr
778 778
779 779 # First illustrate basic pyplot interface, using defaults where possible.
780 780 plt.figure()
781 781 plt.errorbar(x, y, xerr=0.2, yerr=0.4)
782 782 plt.title("Simplest errorbars, 0.2 in x, 0.4 in y");
783 783
784 784 # Out[63]:
785 # image file: /Users/bussonniermatthias/nbconvert/tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_04.svg
785 # image file: tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_04.svg
786 786
787 787 # A simple log plot
788 788
789 789 # In[64]:
790 790 x = np.linspace(-5, 5)
791 791 y = np.exp(-x**2)
792 792 plt.semilogy(x, y);
793 793
794 794 # Out[64]:
795 # image file: /Users/bussonniermatthias/nbconvert/tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_05.svg
795 # image file: tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_05.svg
796 796
797 797 # A histogram annotated with text inside the plot, using the `text` function:
798 798
799 799 # In[65]:
800 800 mu, sigma = 100, 15
801 801 x = mu + sigma * np.random.randn(10000)
802 802
803 803 # the histogram of the data
804 804 n, bins, patches = plt.hist(x, 50, normed=1, facecolor='g', alpha=0.75)
805 805
806 806 plt.xlabel('Smarts')
807 807 plt.ylabel('Probability')
808 808 plt.title('Histogram of IQ')
809 809 # This will put a text fragment at the position given:
810 810 plt.text(55, .027, r'$\mu=100,\ \sigma=15$', fontsize=14)
811 811 plt.axis([40, 160, 0, 0.03])
812 812 plt.grid(True)
813 813
814 814 # Out[65]:
815 # image file: /Users/bussonniermatthias/nbconvert/tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_06.svg
815 # image file: tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_06.svg
816 816
817 817 ### Image display
818 818
819 819 # The `imshow` command can display single or multi-channel images. A simple array of random numbers, plotted in grayscale:
820 820
821 821 # In[66]:
822 822 from matplotlib import cm
823 823 plt.imshow(np.random.rand(5, 10), cmap=cm.gray, interpolation='nearest');
824 824
825 825 # Out[66]:
826 # image file: /Users/bussonniermatthias/nbconvert/tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_07.svg
826 # image file: tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_07.svg
827 827
828 828 # A real photograph is a multichannel image, `imshow` interprets it correctly:
829 829
830 830 # In[67]:
831 831 img = plt.imread('stinkbug.png')
832 832 print 'Dimensions of the array img:', img.shape
833 833 plt.imshow(img);
834 834
835 835 # Out[67]:
836 836 # Dimensions of the array img: (375, 500, 3)
837 837 #
838 # image file: /Users/bussonniermatthias/nbconvert/tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_08.svg
838 # image file: tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_08.svg
839 839
840 840 ### Simple 3d plotting with matplotlib
841 841
842 842 # Note that you must execute at least once in your session:
843 843
844 844 # In[68]:
845 845 from mpl_toolkits.mplot3d import Axes3D
846 846
847 847 # Once this has been done, you can create 3d axes with the `projection='3d'` keyword to `add_subplot`:
848 848 #
849 849 # fig = plt.figure()
850 850 # fig.add_subplot(<other arguments here>, projection='3d')
851 851
852 852 # A simple surface plot:
853 853
854 854 # In[72]:
855 855 from mpl_toolkits.mplot3d.axes3d import Axes3D
856 856 from matplotlib import cm
857 857
858 858 fig = plt.figure()
859 859 ax = fig.add_subplot(1, 1, 1, projection='3d')
860 860 X = np.arange(-5, 5, 0.25)
861 861 Y = np.arange(-5, 5, 0.25)
862 862 X, Y = np.meshgrid(X, Y)
863 863 R = np.sqrt(X**2 + Y**2)
864 864 Z = np.sin(R)
865 865 surf = ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap=cm.jet,
866 866 linewidth=0, antialiased=False)
867 867 ax.set_zlim3d(-1.01, 1.01);
868 868
869 869 # Out[72]:
870 # image file: /Users/bussonniermatthias/nbconvert/tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_09.svg
870 # image file: tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_09.svg
871 871
872 872 ## IPython: a powerful interactive environment
873 873
874 874 # A key component of the everyday workflow of most scientific computing environments is a good interactive environment, that is, a system in which you can execute small amounts of code and view the results immediately, combining both printing out data and opening graphical visualizations. All modern systems for scientific computing, commercial and open source, include such functionality.
875 875 #
876 876 # Out of the box, Python also offers a simple interactive shell with very limited capabilities. But just like the scientific community built Numpy to provide arrays suited for scientific work (since Python's lists aren't optimal for this task), it has also developed an interactive environment much more sophisticated than the built-in one. The [IPython project](http://ipython.org) offers a set of tools to make productive use of the Python language, all the while working interactively and with immediate feedback on your results. The basic tools that IPython provides are:
877 877 #
878 878 # 1. A powerful terminal shell, with many features designed to increase the fluidity and productivity of everyday scientific workflows, including:
879 879 #
880 880 # * rich introspection of all objects and variables including easy access to the source code of any function
881 881 # * powerful and extensible tab completion of variables and filenames,
882 882 # * tight integration with matplotlib, supporting interactive figures that don't block the terminal,
883 883 # * direct access to the filesystem and underlying operating system,
884 884 # * an extensible system for shell-like commands called 'magics' that reduce the work needed to perform many common tasks,
885 885 # * tools for easily running, timing, profiling and debugging your codes,
886 886 # * syntax highlighted error messages with much more detail than the default Python ones,
887 887 # * logging and access to all previous history of inputs, including across sessions
888 888 #
889 889 # 2. A Qt console that provides the look and feel of a terminal, but adds support for inline figures, graphical calltips, a persistent session that can survive crashes (even segfaults) of the kernel process, and more.
890 890 #
891 891 # 3. A web-based notebook that can execute code and also contain rich text and figures, mathematical equations and arbitrary HTML. This notebook presents a document-like view with cells where code is executed but that can be edited in-place, reordered, mixed with explanatory text and figures, etc.
892 892 #
893 893 # 4. A high-performance, low-latency system for parallel computing that supports the control of a cluster of IPython engines communicating over a network, with optimizations that minimize unnecessary copying of large objects (especially numpy arrays).
894 894 #
895 895 # We will now discuss the highlights of the tools 1-3 above so that you can make them an effective part of your workflow. The topic of parallel computing is beyond the scope of this document, but we encourage you to read the extensive [documentation](http://ipython.org/ipython-doc/rel-0.12.1/parallel/index.html) and [tutorials](http://minrk.github.com/scipy-tutorial-2011/) on this available on the IPython website.
896 896
897 897 ### The IPython terminal
898 898
899 899 # You can start IPython at the terminal simply by typing:
900 900 #
901 901 # $ ipython
902 902 #
903 903 # which will provide you some basic information about how to get started and will then open a prompt labeled `In [1]:` for you to start typing. Here we type $2^{64}$ and Python computes the result for us in exact arithmetic, returning it as `Out[1]`:
904 904 #
905 905 # $ ipython
906 906 # Python 2.7.2+ (default, Oct 4 2011, 20:03:08)
907 907 # Type "copyright", "credits" or "license" for more information.
908 908 #
909 909 # IPython 0.13.dev -- An enhanced Interactive Python.
910 910 # ? -> Introduction and overview of IPython's features.
911 911 # %quickref -> Quick reference.
912 912 # help -> Python's own help system.
913 913 # object? -> Details about 'object', use 'object??' for extra details.
914 914 #
915 915 # In [1]: 2**64
916 916 # Out[1]: 18446744073709551616L
917 917 #
918 918 # The first thing you should know about IPython is that all your inputs and outputs are saved. There are two variables named `In` and `Out` which are filled as you work with your results. Furthermore, all outputs are also saved to auto-created variables of the form `_NN` where `NN` is the prompt number, and inputs to `_iNN`. This allows you to recover quickly the result of a prior computation by referring to its number even if you forgot to store it as a variable. For example, later on in the above session you can do:
919 919 #
920 920 # In [6]: print _1
921 921 # 18446744073709551616
922 922
923 923 # We strongly recommend that you take a few minutes to read at least the basic introduction provided by the `?` command, and keep in mind that the `%quickref` command at all times can be used as a quick reference "cheat sheet" of the most frequently used features of IPython.
924 924 #
925 925 # At the IPython prompt, any valid Python code that you type will be executed similarly to the default Python shell (though often with more informative feedback). But since IPython is a *superset* of the default Python shell, let's have a brief look at some of its additional functionality.
926 926
927 927 # **Object introspection**
928 928 #
929 929 # A simple `?` command provides a general introduction to IPython, but as indicated in the banner above, you can use the `?` syntax to ask for details about any object. For example, if we type `_1?`, IPython will print the following details about this variable:
930 930 #
931 931 # In [14]: _1?
932 932 # Type: long
933 933 # Base Class: <type 'long'>
934 934 # String Form:18446744073709551616
935 935 # Namespace: Interactive
936 936 # Docstring:
937 937 # long(x[, base]) -> integer
938 938 #
939 939 # Convert a string or number to a long integer, if possible. A floating
940 940 #
941 941 # [etc... snipped for brevity]
942 942 #
943 943 # If you add a second `?` and for any object `x` type `x??`, IPython will try to provide an even more detailed analysis of the object, including its syntax-highlighted source code when it can be found. It's possible that `x??` returns the same information as `x?`, but in many cases `x??` will indeed provide additional details.
944 944 #
945 945 # Finally, the `?` syntax is also useful to search *namespaces* with wildcards. Suppose you are wondering if there is any function in Numpy that may do text-related things; with `np.*txt*?`, IPython will print all the names in the `np` namespace (our Numpy shorthand) that have 'txt' anywhere in their name:
946 946 #
947 947 # In [17]: np.*txt*?
948 948 # np.genfromtxt
949 949 # np.loadtxt
950 950 # np.mafromtxt
951 951 # np.ndfromtxt
952 952 # np.recfromtxt
953 953 # np.savetxt
954 954
955 955 # **Tab completion**
956 956 #
957 957 # IPython makes the tab key work extra hard for you as a way to rapidly inspect objects and libraries. Whenever you have typed something at the prompt, by hitting the `<tab>` key IPython will try to complete the rest of the line. For this, IPython will analyze the text you had so far and try to search for Python data or files that may match the context you have already provided.
958 958 #
959 959 # For example, if you type `np.load` and hit the <tab> key, you'll see:
960 960 #
961 961 # In [21]: np.load<TAB HERE>
962 962 # np.load np.loads np.loadtxt
963 963 #
964 964 # so you can quickly find all the load-related functionality in numpy. Tab completion works even for function arguments, for example consider this function definition:
965 965 #
966 966 # In [20]: def f(x, frobinate=False):
967 967 # ....: if frobinate:
968 968 # ....: return x**2
969 969 # ....:
970 970 #
971 971 # If you now use the `<tab>` key after having typed 'fro' you'll get all valid Python completions, but those marked with `=` at the end are known to be keywords of your function:
972 972 #
973 973 # In [21]: f(2, fro<TAB HERE>
974 974 # frobinate= frombuffer fromfunction frompyfunc fromstring
975 975 # from fromfile fromiter fromregex frozenset
976 976 #
977 977 # at this point you can add the `b` letter and hit `<tab>` once more, and IPython will finish the line for you:
978 978 #
979 979 # In [21]: f(2, frobinate=
980 980 #
981 981 # As a beginner, simply get into the habit of using `<tab>` after most objects; it should quickly become second nature as you will see how it helps you keep a fluid workflow and discover useful information. Later on you can also customize this behavior by writing your own completion code, if you so desire.
982 982
983 983 # **Matplotlib integration**
984 984 #
985 985 # One of the most useful features of IPython for scientists is its tight integration with matplotlib: at the terminal IPython lets you open matplotlib figures without blocking your typing (which is what happens if you try to do the same thing at the default Python shell), and in the Qt console and notebook you can even view your figures embedded in your workspace next to the code that created them.
986 986 #
987 987 # The matplotlib support can be either activated when you start IPython by passing the `--pylab` flag, or at any point later in your session by using the `%pylab` command. If you start IPython with `--pylab`, you'll see something like this (note the extra message about pylab):
988 988 #
989 989 # $ ipython --pylab
990 990 # Python 2.7.2+ (default, Oct 4 2011, 20:03:08)
991 991 # Type "copyright", "credits" or "license" for more information.
992 992 #
993 993 # IPython 0.13.dev -- An enhanced Interactive Python.
994 994 # ? -> Introduction and overview of IPython's features.
995 995 # %quickref -> Quick reference.
996 996 # help -> Python's own help system.
997 997 # object? -> Details about 'object', use 'object??' for extra details.
998 998 #
999 999 # Welcome to pylab, a matplotlib-based Python environment [backend: Qt4Agg].
1000 1000 # For more information, type 'help(pylab)'.
1001 1001 #
1002 1002 # In [1]:
1003 1003 #
1004 1004 # Furthermore, IPython will import `numpy` with the `np` shorthand, `matplotlib.pyplot` as `plt`, and it will also load all of the numpy and pyplot top-level names so that you can directly type something like:
1005 1005 #
1006 1006 # In [1]: x = linspace(0, 2*pi, 200)
1007 1007 #
1008 1008 # In [2]: plot(x, sin(x))
1009 1009 # Out[2]: [<matplotlib.lines.Line2D at 0x9e7c16c>]
1010 1010 #
1011 1011 # instead of having to prefix each call with its full signature (as we have been doing in the examples thus far):
1012 1012 #
1013 1013 # In [3]: x = np.linspace(0, 2*np.pi, 200)
1014 1014 #
1015 1015 # In [4]: plt.plot(x, np.sin(x))
1016 1016 # Out[4]: [<matplotlib.lines.Line2D at 0x9e900ac>]
1017 1017 #
1018 1018 # This shorthand notation can be a huge time-saver when working interactively (it's a few characters but you are likely to type them hundreds of times in a session). But we should note that as you develop persistent scripts and notebooks meant for reuse, it's best to get in the habit of using the longer notation (known as *fully qualified names*), as it's clearer where things come from and it makes for more robust, readable and maintainable code in the long run.
1019 1019
1020 1020 # **Access to the operating system and files**
1021 1021 #
1022 1022 # In IPython, you can type `ls` to see your files or `cd` to change directories, just like you would at a regular system prompt:
1023 1023 #
1024 1024 # In [2]: cd tests
1025 1025 # /home/fperez/ipython/nbconvert/tests
1026 1026 #
1027 1027 # In [3]: ls test.*
1028 1028 # test.aux test.html test.ipynb test.log test.out test.pdf test.rst test.tex
1029 1029 #
1030 1030 # Furthermore, if you use the `!` at the beginning of a line, any commands you pass afterwards go directly to the operating system:
1031 1031 #
1032 1032 # In [4]: !echo "Hello IPython"
1033 1033 # Hello IPython
1034 1034 #
1035 1035 # IPython offers a useful twist in this feature: it will substitute in the command the value of any *Python* variable you may have if you prepend it with a `$` sign:
1036 1036 #
1037 1037 # In [5]: message = 'IPython interpolates from Python to the shell'
1038 1038 #
1039 1039 # In [6]: !echo $message
1040 1040 # IPython interpolates from Python to the shell
1041 1041 #
1042 1042 # This feature can be extremely useful, as it lets you combine the power and clarity of Python for complex logic with the immediacy and familiarity of many shell commands. Additionally, if you start the line with *two* `!!` signs, the output of the command will be automatically captured as a list of lines, e.g.:
1043 1043 #
1044 1044 # In [10]: !!ls test.*
1045 1045 # Out[10]:
1046 1046 # ['test.aux',
1047 1047 # 'test.html',
1048 1048 # 'test.ipynb',
1049 1049 # 'test.log',
1050 1050 # 'test.out',
1051 1051 # 'test.pdf',
1052 1052 # 'test.rst',
1053 1053 # 'test.tex']
1054 1054 #
1055 1055 # As explained above, you can now use this as the variable `_10`. If you directly want to capture the output of a system command to a Python variable, you can use the syntax `=!`:
1056 1056 #
1057 1057 # In [11]: testfiles =! ls test.*
1058 1058 #
1059 1059 # In [12]: print testfiles
1060 1060 # ['test.aux', 'test.html', 'test.ipynb', 'test.log', 'test.out', 'test.pdf', 'test.rst', 'test.tex']
1061 1061 #
1062 1062 # Finally, the special `%alias` command lets you define names that are shorthands for system commands, so that you can type them without having to prefix them via `!` explicitly (for example, `ls` is an alias that has been predefined for you at startup).
1063 1063
1064 1064 # **Magic commands**
1065 1065 #
1066 1066 # IPython has a system for special commands, called 'magics', that let you control IPython itself and perform many common tasks with a more shell-like syntax: it uses spaces for delimiting arguments, flags can be set with dashes and all arguments are treated as strings, so no additional quoting is required. This kind of syntax is invalid in the Python language but very convenient for interactive typing (fewer parentheses, commas and quoting everywhere); IPython distinguishes the two by detecting lines that start with the `%` character.
1067 1067 #
1068 1068 # You can learn more about the magic system by simply typing `%magic` at the prompt, which will give you a short description plus the documentation on *all* available magics. If you want to see only a listing of existing magics, you can use `%lsmagic`:
1069 1069 #
1070 1070 # In [4]: lsmagic
1071 1071 # Available magic functions:
1072 1072 # %alias %autocall %autoindent %automagic %bookmark %c %cd %colors %config %cpaste
1073 1073 # %debug %dhist %dirs %doctest_mode %ds %ed %edit %env %gui %hist %history
1074 1074 # %install_default_config %install_ext %install_profiles %load_ext %loadpy %logoff %logon
1075 1075 # %logstart %logstate %logstop %lsmagic %macro %magic %notebook %page %paste %pastebin
1076 1076 # %pd %pdb %pdef %pdoc %pfile %pinfo %pinfo2 %pop %popd %pprint %precision %profile
1077 1077 # %prun %psearch %psource %pushd %pwd %pycat %pylab %quickref %recall %rehashx
1078 1078 # %reload_ext %rep %rerun %reset %reset_selective %run %save %sc %stop %store %sx %tb
1079 1079 # %time %timeit %unalias %unload_ext %who %who_ls %whos %xdel %xmode
1080 1080 #
1081 1081 # Automagic is ON, % prefix NOT needed for magic functions.
1082 1082 #
1083 1083 # Note how the example above omits the explicit `%` marker and simply uses `lsmagic`. As long as the 'automagic' feature is on (which it is by default), you can omit the `%` marker provided there is no ambiguity with a Python variable of the same name.
1084 1084
1085 1085 # **Running your code**
1086 1086 #
1087 1087 # While it's easy to type a few lines of code in IPython, for any long-lived work you should keep your codes in Python scripts (or in IPython notebooks, see below). Consider that you have a script, in this case trivially simple for the sake of brevity, named `simple.py`:
1088 1088 #
1089 1089 # In [12]: !cat simple.py
1090 1090 # import numpy as np
1091 1091 #
1092 1092 # x = np.random.normal(size=100)
1093 1093 #
1094 1094 # print 'First elment of x:', x[0]
1095 1095 #
1096 1096 # The typical workflow with IPython is to use the `%run` magic to execute your script (you can omit the .py extension if you want). When you run it, the script will execute just as if it had been run at the system prompt with `python simple.py` (though since modules don't get re-executed on new imports by Python, all system initialization is essentially free, which can have a significant run time impact in some cases):
1097 1097 #
1098 1098 # In [13]: run simple
1099 1099 # First elment of x: -1.55872256289
1100 1100 #
1101 1101 # Once it completes, all variables defined in it become available for you to use interactively:
1102 1102 #
1103 1103 # In [14]: x.shape
1104 1104 # Out[14]: (100,)
1105 1105 #
1106 1106 # This allows you to plot data, try out ideas, etc, in a `%run`/interact/edit cycle that can be very productive. As you start understanding your problem better you can refine your script further, incrementally improving it based on the work you do at the IPython prompt. At any point you can use the `%hist` magic to print out your history without prompts, so that you can copy useful fragments back into the script.
1107 1107 #
1108 1108 # By default, `%run` executes scripts in a completely empty namespace, to better mimic how they would execute at the system prompt with plain Python. But if you use the `-i` flag, the script will also see your interactively defined variables. This lets you edit in a script larger amounts of code that still behave as if you had typed them at the IPython prompt.
1109 1109 #
1110 1110 # You can also get a summary of the time taken by your script with the `-t` flag; consider a different script `randsvd.py` that takes a bit longer to run:
1111 1111 #
1112 1112 # In [21]: run -t randsvd.py
1113 1113 #
1114 1114 # IPython CPU timings (estimated):
1115 1115 # User : 0.38 s.
1116 1116 # System : 0.04 s.
1117 1117 # Wall time: 0.34 s.
1118 1118 #
1119 1119 # `User` is the time spent by the computer executing your code, while `System` is the time the operating system had to work on your behalf, doing things like memory allocation that are needed by your code but that you didn't explicitly program and that happen inside the kernel. The `Wall time` is the time on a 'clock on the wall' between the start and end of your program.
1120 1120 #
1121 1121 # If `Wall > User+System`, your code is most likely waiting idle for certain periods. That could be waiting for data to arrive from a remote source or perhaps because the operating system has to swap large amounts of virtual memory. If you know that your code doesn't explicitly wait for remote data to arrive, you should investigate further to identify possible ways of improving the performance profile.
1122 1122 #
1123 1123 # If you only want to time how long a single statement takes, you don't need to put it into a script as you can use the `%timeit` magic, which uses Python's `timeit` module to very carefully measure timing data; `timeit` can measure even short statements that execute extremely fast:
1124 1124 #
1125 1125 # In [27]: %timeit a=1
1126 1126 # 10000000 loops, best of 3: 23 ns per loop
1127 1127 #
1128 1128 # and for code that runs longer, it automatically adjusts so the overall measurement doesn't take too long:
1129 1129 #
1130 1130 # In [28]: %timeit np.linalg.svd(x)
1131 1131 # 1 loops, best of 3: 310 ms per loop
1132 1132 #
1133 1133 # The `%run` magic still has more options for debugging and profiling data; you should read its documentation for many useful details (as always, just type `%run?`).
1134 1134
1135 1135 ### The graphical Qt console
1136 1136
1137 1137 # If you type at the system prompt (see the IPython website for installation details, as this requires some additional libraries):
1138 1138 #
1139 1139 # $ ipython qtconsole
1140 1140 #
1141 1141 # instead of opening in a terminal as before, IPython will start a graphical console that at first sight appears just like a terminal, but which is in fact much more capable than a text-only terminal. This is a specialized terminal designed for interactive scientific work, and it supports full multi-line editing with color highlighting and graphical calltips for functions, it can keep multiple IPython sessions open simultaneously in tabs, and when scripts run it can display the figures inline directly in the work area.
1142 1142 #
1143 1143 # <center><img src="ipython_qtconsole2.png" width=400px></center>
1144 1144
1145 1145 # % This cell is for the pdflatex output only
1146 1146 # \begin{figure}[htbp]
1147 1147 # \centering
1148 1148 # \includegraphics[width=3in]{ipython_qtconsole2.png}
1149 1149 # \caption{The IPython Qt console: a lightweight terminal for scientific exploration, with code, results and graphics in a single environment.}
1150 1150 # \end{figure}
1151 1151
1152 1152 # The Qt console accepts the same `--pylab` startup flags as the terminal, but you can additionally supply the value `--pylab inline`, which enables the support for inline graphics shown in the figure. This is ideal for keeping all the code and figures in the same session, given that the console can save the output of your entire session to HTML or PDF.
1153 1153 #
1154 1154 # Since the Qt console makes it far more convenient than the terminal to edit blocks of code with multiple lines, in this environment it's worth knowing about the `%loadpy` magic function. `%loadpy` takes a path to a local file or remote URL, fetches its contents, and puts it in the work area for you to further edit and execute. It can be an extremely fast and convenient way of loading code from local disk or remote examples from sites such as the [Matplotlib gallery](http://matplotlib.sourceforge.net/gallery.html).
1155 1155 #
1156 1156 # Other than its enhanced capabilities for code and graphics, all of the features of IPython we've explained before remain functional in this graphical console.
1157 1157
1158 1158 ### The IPython Notebook
1159 1159
1160 1160 # The third way to interact with IPython, in addition to the terminal and graphical Qt console, is a powerful web interface called the "IPython Notebook". If you run at the system console (you can omit the `pylab` flags if you don't need plotting support):
1161 1161 #
1162 1162 # $ ipython notebook --pylab inline
1163 1163 #
1164 1164 # IPython will start a process that runs a web server in your local machine and to which a web browser can connect. The Notebook is a workspace that lets you execute code in blocks called 'cells' and displays any results and figures, but which can also contain arbitrary text (including LaTeX-formatted mathematical expressions) and any rich media that a modern web browser is capable of displaying.
1165 1165 #
1166 1166 # <center><img src="ipython-notebook-specgram-2.png" width=400px></center>
1167 1167
1168 1168 # % This cell is for the pdflatex output only
1169 1169 # \begin{figure}[htbp]
1170 1170 # \centering
1171 1171 # \includegraphics[width=3in]{ipython-notebook-specgram-2.png}
1172 1172 # \caption{The IPython Notebook: text, equations, code, results, graphics and other multimedia in an open format for scientific exploration and collaboration}
1173 1173 # \end{figure}
1174 1174
1175 1175 # In fact, this document was written as a Notebook, and only exported to LaTeX for printing. Inside of each cell, all the features of IPython that we have discussed before remain functional, since ultimately this web client is communicating with the same IPython code that runs in the terminal. But this interface is a much more rich and powerful environment for maintaining long-term "live and executable" scientific documents.
1176 1176 #
1177 1177 # Notebook environments have existed in commercial systems like Mathematica(TM) and Maple(TM) for a long time; in the open source world the [Sage](http://sagemath.org) project blazed this particular trail starting in 2006, and now we bring all the features that have made IPython such a widely used tool to a Notebook model.
1178 1178 #
1179 1179 # Since the Notebook runs as a web application, it is possible to configure it for remote access, letting you run your computations on a persistent server close to your data, which you can then access remotely from any browser-equipped computer. We encourage you to read the extensive documentation provided by the IPython project for details on how to do this and many more features of the notebook.
1180 1180 #
1181 1181 # Finally, as we said earlier, IPython also has a high-level and easy to use set of libraries for parallel computing, that let you control (interactively if desired) not just one IPython but an entire cluster of 'IPython engines'. Unfortunately a detailed discussion of these tools is beyond the scope of this text, but should you need to parallelize your analysis codes, a quick read of the tutorials and examples provided at the IPython site may prove fruitful.
@@ -1,2077 +1,2077 b''
1 1 An Introduction to the Scientific Python Ecosystem
2 2 ==================================================
3 3
4 4 While the Python language is an excellent tool for general-purpose
5 5 programming, with a highly readable syntax, rich and powerful data types
6 6 (strings, lists, sets, dictionaries, arbitrary length integers, etc) and
7 7 a very comprehensive standard library, it was not designed specifically
8 8 for mathematical and scientific computing. Neither the language nor its
9 9 standard library have facilities for the efficient representation of
10 10 multidimensional datasets, tools for linear algebra and general matrix
11 11 manipulations (an essential building block of virtually all technical
12 12 computing), nor any data visualization facilities.
13 13
14 14 In particular, Python lists are very flexible containers that can be
15 15 nested arbitrarily deep and which can hold any Python object in them,
16 16 but they are poorly suited to represent efficiently common mathematical
17 17 constructs like vectors and matrices. In contrast, much of our modern
18 18 heritage of scientific computing has been built on top of libraries
19 19 written in the Fortran language, which has native support for vectors
20 20 and matrices as well as a library of mathematical functions that can
21 21 efficiently operate on entire arrays at once.
22 22
23 23 Scientific Python: a collaboration of projects built by scientists
24 24 ------------------------------------------------------------------
25 25
26 26 The scientific community has developed a set of related Python libraries
27 27 that provide powerful array facilities, linear algebra, numerical
28 28 algorithms, data visualization and more. In this appendix, we will
29 29 briefly outline the tools most frequently used for this purpose, that
30 30 make "Scientific Python" something far more powerful than the Python
31 31 language alone.
32 32
33 33 For reasons of space, we can only describe in some detail the central
34 34 Numpy library, but below we provide links to the websites of each
35 35 project where you can read their documentation in more detail.
36 36
37 37 First, let's look at an overview of the basic tools that most scientists
38 38 use in daily research with Python. The core of this ecosystem is
39 39 composed of:
40 40
41 41 - Numpy: the basic library that most others depend on, it provides a
42 42 powerful array type that can represent multidimensional datasets of
43 43 many different kinds and that supports arithmetic operations. Numpy
44 44 also provides a library of common mathematical functions, basic
45 45 linear algebra, random number generation and Fast Fourier Transforms.
46 46 Numpy can be found at `numpy.scipy.org <http://numpy.scipy.org>`_
47 47
48 48 - Scipy: a large collection of numerical algorithms that operate on
49 49 numpy arrays and provide facilities for many common tasks in
50 50 scientific computing, including dense and sparse linear algebra
51 51 support, optimization, special functions, statistics, n-dimensional
52 52 image processing, signal processing and more. Scipy can be found at
53 53 `scipy.org <http://scipy.org>`_.
54 54
55 55 - Matplotlib: a data visualization library with a strong focus on
56 56 producing high-quality output, it supports a variety of common
57 57 scientific plot types in two and three dimensions, with precise
58 58 control over the final output and format for publication-quality
59 59 results. Matplotlib can also be controlled interactively allowing
60 60 graphical manipulation of your data (zooming, panning, etc) and can
61 61 be used with most modern user interface toolkits. It can be found at
62 62 `matplotlib.sf.net <http://matplotlib.sf.net>`_.
63 63
64 64 - IPython: while not strictly scientific in nature, IPython is the
65 65 interactive environment in which many scientists spend their time.
66 66 IPython provides a powerful Python shell that integrates tightly with
67 67 Matplotlib and with easy access to the files and operating system,
68 68 and which can execute in a terminal or in a graphical Qt console.
69 69 IPython also has a web-based notebook interface that can combine code
70 70 with text, mathematical expressions, figures and multimedia. It can
71 71 be found at `ipython.org <http://ipython.org>`_.
72 72
73 73 While each of these tools can be installed separately, in our opinion
74 74 the most convenient way today of accessing them (especially on Windows
75 75 and Mac computers) is to install the `Free Edition of the Enthought
76 76 Python Distribution <http://www.enthought.com/products/epd_free.php>`_
77 77 which contains all the above. Other free alternatives on Windows (but not
78 78 on Macs) are `Python(x,y) <http://code.google.com/p/pythonxy>`_ and
79 79 `Christoph Gohlke's packages
80 80 page <http://www.lfd.uci.edu/~gohlke/pythonlibs>`_.
81 81
82 82 These four 'core' libraries are in practice complemented by a number of
83 83 other tools for more specialized work. We will briefly list here the
84 84 ones that we think are the most commonly needed:
85 85
86 86 - Sympy: a symbolic manipulation tool that turns a Python session into
87 87 a computer algebra system. It integrates with the IPython notebook,
88 88 rendering results in properly typeset mathematical notation.
89 89 `sympy.org <http://sympy.org>`_.
90 90
91 91 - Mayavi: sophisticated 3d data visualization;
92 92 `code.enthought.com/projects/mayavi <http://code.enthought.com/projects/mayavi>`_.
93 93
94 94 - Cython: a bridge language between Python and C, useful both to
95 95 optimize performance bottlenecks in Python and to access C libraries
96 96 directly; `cython.org <http://cython.org>`_.
97 97
98 98 - Pandas: high-performance data structures and data analysis tools,
99 99 with powerful data alignment and structural manipulation
100 100 capabilities; `pandas.pydata.org <http://pandas.pydata.org>`_.
101 101
102 102 - Statsmodels: statistical data exploration and model estimation;
103 103 `statsmodels.sourceforge.net <http://statsmodels.sourceforge.net>`_.
104 104
105 105 - Scikit-learn: general purpose machine learning algorithms with a
106 106 common interface; `scikit-learn.org <http://scikit-learn.org>`_.
107 107
108 108 - Scikits-image: image processing toolbox;
109 109 `scikits-image.org <http://scikits-image.org>`_.
110 110
111 111 - NetworkX: analysis of complex networks (in the graph theoretical
112 112 sense); `networkx.lanl.gov <http://networkx.lanl.gov>`_.
113 113
114 114 - PyTables: management of hierarchical datasets using the
115 115 industry-standard HDF5 format;
116 116 `www.pytables.org <http://www.pytables.org>`_.
117 117
118 118 Beyond these, for any specific problem you should look on the internet
119 119 first, before starting to write code from scratch. There's a good chance
120 120 that someone, somewhere, has written an open source library that you can
121 121 use for part or all of your problem.
122 122
123 123 A note about the examples below
124 124 -------------------------------
125 125
126 126 In all subsequent examples, you will see blocks of input code, followed
127 127 by the results of the code if the code generated output. This output may
128 128 include text, graphics and other result objects. These blocks of input
129 129 can be pasted into your interactive IPython session or notebook for you
130 130 to execute. In the print version of this document, a thin vertical bar
131 131 on the left of the blocks of input and output shows which blocks go
132 132 together.
133 133
134 134 If you are reading this text as an actual IPython notebook, you can
135 135 press ``Shift-Enter`` or use the 'play' button on the toolbar
136 136 (right-pointing triangle) to execute each block of code, known as a
137 137 'cell' in IPython:
138 138
139 139 In[71]:
140 140
141 141 .. code:: python
142 142
143 143 # This is a block of code, below you'll see its output
144 144 print "Welcome to the world of scientific computing with Python!"
145 145
146 146 .. parsed-literal::
147 147
148 148 Welcome to the world of scientific computing with Python!
149 149
150 150
151 151 Motivation: the trapezoidal rule
152 152 ================================
153 153
154 154 In subsequent sections we'll provide a basic introduction to the nuts
155 155 and bolts of the basic scientific python tools; but we'll first motivate
156 156 it with a brief example that illustrates what you can do in a few lines
157 157 with these tools. For this, we will use the simple problem of
158 158 approximating a definite integral with the trapezoid rule:
159 159
160 160 .. math::
161 161
162 162
163 163 \int_{a}^{b} f(x)\, dx \approx \frac{1}{2} \sum_{k=1}^{N} \left( x_{k} - x_{k-1} \right) \left( f(x_{k}) + f(x_{k-1}) \right).
164 164
165 165 Our task will be to compute this formula for a function such as:
166 166
167 167 .. math::
168 168
169 169
170 170 f(x) = (x-3)(x-5)(x-7)+85
171 171
172 172 integrated between :math:`a=1` and :math:`b=9`.
173 173
174 174 First, we define the function and sample it evenly between 0 and 10 at
175 175 200 points:
176 176
177 177 In[1]:
178 178
179 179 .. code:: python
180 180
181 181 def f(x):
182 182 return (x-3)*(x-5)*(x-7)+85
183 183
184 184 import numpy as np
185 185 x = np.linspace(0, 10, 200)
186 186 y = f(x)
187 187
188 188 We select :math:`a` and :math:`b`, our integration limits, and we take
189 189 only a few points in that region to illustrate the error behavior of the
190 190 trapezoid approximation:
191 191
192 192 In[2]:
193 193
194 194 .. code:: python
195 195
196 196 a, b = 1, 9
197 197 xint = x[logical_and(x>=a, x<=b)][::30]
198 198 yint = y[logical_and(x>=a, x<=b)][::30]
199 199
200 200 Let's plot both the function and the area below it in the trapezoid
201 201 approximation:
202 202
203 203 In[3]:
204 204
205 205 .. code:: python
206 206
207 207 import matplotlib.pyplot as plt
208 208 plt.plot(x, y, lw=2)
209 209 plt.axis([0, 10, 0, 140])
210 210 plt.fill_between(xint, 0, yint, facecolor='gray', alpha=0.4)
211 211 plt.text(0.5 * (a + b), 30,r"$\int_a^b f(x)dx$", horizontalalignment='center', fontsize=20);
212 212
213 .. image:: /Users/bussonniermatthias/nbconvert/tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_00.svg
213 .. image:: tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_00.svg
214 214
215 215 Compute the integral both at high accuracy and with the trapezoid
216 216 approximation
217 217
218 218 In[4]:
219 219
220 220 .. code:: python
221 221
222 222 from scipy.integrate import quad, trapz
223 223 integral, error = quad(f, 1, 9)
224 224 trap_integral = trapz(yint, xint)
225 225 print "The integral is: %g +/- %.1e" % (integral, error)
226 226 print "The trapezoid approximation with", len(xint), "points is:", trap_integral
227 227 print "The absolute error is:", abs(integral - trap_integral)
228 228
229 229 .. parsed-literal::
230 230
231 231 The integral is: 680 +/- 7.5e-12
232 232 The trapezoid approximation with 6 points is: 621.286411141
233 233 The absolute error is: 58.7135888589
234 234
235 235
236 236 This simple example showed us how, combining the numpy, scipy and
237 237 matplotlib libraries we can provide an illustration of a standard method
238 238 in elementary calculus with just a few lines of code. We will now
239 239 discuss in more detail the basic usage of these tools.
240 240
241 241 NumPy arrays: the right data structure for scientific computing
242 242 ===============================================================
243 243
244 244 Basics of Numpy arrays
245 245 ----------------------
246 246
247 247 We now turn our attention to the Numpy library, which forms the base
248 248 layer for the entire 'scipy ecosystem'. Once you have installed numpy,
249 249 you can import it as
250 250
251 251 In[5]:
252 252
253 253 .. code:: python
254 254
255 255 import numpy
256 256
257 257 though in this book we will use the common shorthand
258 258
259 259 In[6]:
260 260
261 261 .. code:: python
262 262
263 263 import numpy as np
264 264
265 265 As mentioned above, the main object provided by numpy is a powerful
266 266 array. We'll start by exploring how the numpy array differs from Python
267 267 lists. We start by creating a simple list and an array with the same
268 268 contents as the list:
269 269
270 270 In[7]:
271 271
272 272 .. code:: python
273 273
274 274 lst = [10, 20, 30, 40]
275 275 arr = np.array([10, 20, 30, 40])
276 276
277 277 Elements of a one-dimensional array are accessed with the same syntax as
278 278 a list:
279 279
280 280 In[8]:
281 281
282 282 .. code:: python
283 283
284 284 lst[0]
285 285
286 286 Out[8]:
287 287
288 288 .. parsed-literal::
289 289
290 290 10
291 291
292 292 In[9]:
293 293
294 294 .. code:: python
295 295
296 296 arr[0]
297 297
298 298 Out[9]:
299 299
300 300 .. parsed-literal::
301 301
302 302 10
303 303
304 304 In[10]:
305 305
306 306 .. code:: python
307 307
308 308 arr[-1]
309 309
310 310 Out[10]:
311 311
312 312 .. parsed-literal::
313 313
314 314 40
315 315
316 316 In[11]:
317 317
318 318 .. code:: python
319 319
320 320 arr[2:]
321 321
322 322 Out[11]:
323 323
324 324 .. parsed-literal::
325 325
326 326 array([30, 40])
327 327
328 328 The first difference to note between lists and arrays is that arrays are
329 329 *homogeneous*; i.e. all elements of an array must be of the same type.
330 330 In contrast, lists can contain elements of arbitrary type. For example,
331 331 we can change the last element in our list above to be a string:
332 332
333 333 In[12]:
334 334
335 335 .. code:: python
336 336
337 337 lst[-1] = 'a string inside a list'
338 338 lst
339 339
340 340 Out[12]:
341 341
342 342 .. parsed-literal::
343 343
344 344 [10, 20, 30, 'a string inside a list']
345 345
346 346 but the same can not be done with an array, as we get an error message:
347 347
348 348 In[13]:
349 349
350 350 .. code:: python
351 351
352 352 arr[-1] = 'a string inside an array'
353 353
354 354 ::
355 355
356 356 ---------------------------------------------------------------------------
357 357 ValueError Traceback (most recent call last)
358 358 /home/fperez/teach/book-math-labtool/<ipython-input-13-29c0bfa5fa8a> in <module>()
359 359 ----> 1 arr[-1] = 'a string inside an array'
360 360
361 361 ValueError: invalid literal for long() with base 10: 'a string inside an array'
362 362
363 363 The information about the type of an array is contained in its *dtype*
364 364 attribute:
365 365
366 366 In[14]:
367 367
368 368 .. code:: python
369 369
370 370 arr.dtype
371 371
372 372 Out[14]:
373 373
374 374 .. parsed-literal::
375 375
376 376 dtype('int32')
377 377
378 378 Once an array has been created, its dtype is fixed and it can only store
379 379 elements of the same type. For this example where the dtype is integer,
380 380 if we store a floating point number it will be automatically converted
381 381 into an integer:
382 382
383 383 In[15]:
384 384
385 385 .. code:: python
386 386
387 387 arr[-1] = 1.234
388 388 arr
389 389
390 390 Out[15]:
391 391
392 392 .. parsed-literal::
393 393
394 394 array([10, 20, 30, 1])
395 395
396 396 Above we created an array from an existing list; let us now see
397 397 other ways in which we can create arrays, which we'll illustrate next. A
398 398 common need is to have an array initialized with a constant value, and
399 399 very often this value is 0 or 1 (suitable as starting value for additive
400 400 and multiplicative loops respectively); ``zeros`` creates arrays of all
401 401 zeros, with any desired dtype:
402 402
403 403 In[16]:
404 404
405 405 .. code:: python
406 406
407 407 np.zeros(5, float)
408 408
409 409 Out[16]:
410 410
411 411 .. parsed-literal::
412 412
413 413 array([ 0., 0., 0., 0., 0.])
414 414
415 415 In[17]:
416 416
417 417 .. code:: python
418 418
419 419 np.zeros(3, int)
420 420
421 421 Out[17]:
422 422
423 423 .. parsed-literal::
424 424
425 425 array([0, 0, 0])
426 426
427 427 In[18]:
428 428
429 429 .. code:: python
430 430
431 431 np.zeros(3, complex)
432 432
433 433 Out[18]:
434 434
435 435 .. parsed-literal::
436 436
437 437 array([ 0.+0.j, 0.+0.j, 0.+0.j])
438 438
439 439 and similarly for ``ones``:
440 440
441 441 In[19]:
442 442
443 443 .. code:: python
444 444
445 445 print '5 ones:', np.ones(5)
446 446
447 447 .. parsed-literal::
448 448
449 449 5 ones: [ 1. 1. 1. 1. 1.]
450 450
451 451
452 452 If we want an array initialized with an arbitrary value, we can create
453 453 an empty array and then use the fill method to put the value we want
454 454 into the array:
455 455
456 456 In[20]:
457 457
458 458 .. code:: python
459 459
460 460 a = empty(4)
461 461 a.fill(5.5)
462 462 a
463 463
464 464 Out[20]:
465 465
466 466 .. parsed-literal::
467 467
468 468 array([ 5.5, 5.5, 5.5, 5.5])
469 469
470 470 Numpy also offers the ``arange`` function, which works like the builtin
471 471 ``range`` but returns an array instead of a list:
472 472
473 473 In[21]:
474 474
475 475 .. code:: python
476 476
477 477 np.arange(5)
478 478
479 479 Out[21]:
480 480
481 481 .. parsed-literal::
482 482
483 483 array([0, 1, 2, 3, 4])
484 484
485 485 and the ``linspace`` and ``logspace`` functions to create linearly and
486 486 logarithmically-spaced grids respectively, with a fixed number of points
487 487 and including both ends of the specified interval:
488 488
489 489 In[22]:
490 490
491 491 .. code:: python
492 492
493 493 print "A linear grid between 0 and 1:", np.linspace(0, 1, 5)
494 494 print "A logarithmic grid between 10**1 and 10**4: ", np.logspace(1, 4, 4)
495 495
496 496 .. parsed-literal::
497 497
498 498 A linear grid between 0 and 1: [ 0. 0.25 0.5 0.75 1. ]
499 499 A logarithmic grid between 10**1 and 10**4: [ 10. 100. 1000. 10000.]
500 500
501 501
502 502 Finally, it is often useful to create arrays with random numbers that
503 503 follow a specific distribution. The ``np.random`` module contains a
504 504 number of functions that can be used to this effect, for example this
505 505 will produce an array of 5 random samples taken from a standard normal
506 506 distribution (0 mean and variance 1):
507 507
508 508 In[23]:
509 509
510 510 .. code:: python
511 511
512 512 np.random.randn(5)
513 513
514 514 Out[23]:
515 515
516 516 .. parsed-literal::
517 517
518 518 array([-0.08633343, -0.67375434, 1.00589536, 0.87081651, 1.65597822])
519 519
520 520 whereas this will also give 5 samples, but from a normal distribution
521 521 with a mean of 10 and a standard deviation of 3:
522 522
523 523 In[24]:
524 524
525 525 .. code:: python
526 526
527 527 norm10 = np.random.normal(10, 3, 5)
528 528 norm10
529 529
530 530 Out[24]:
531 531
532 532 .. parsed-literal::
533 533
534 534 array([ 8.94879575, 5.53038269, 8.24847281, 12.14944165, 11.56209294])
535 535
536 536 Indexing with other arrays
537 537 --------------------------
538 538
539 539 Above we saw how to index arrays with single numbers and slices, just
540 540 like Python lists. But arrays allow for a more sophisticated kind of
541 541 indexing which is very powerful: you can index an array with another
542 542 array, and in particular with an array of boolean values. This is
543 543 particularly useful to extract information from an array that matches a
544 544 certain condition.
545 545
546 546 Consider for example that in the array ``norm10`` we want to replace all
547 547 values above 9 with the value 0. We can do so by first finding the
548 548 *mask* that indicates where this condition is true or false:
549 549
550 550 In[25]:
551 551
552 552 .. code:: python
553 553
554 554 mask = norm10 > 9
555 555 mask
556 556
557 557 Out[25]:
558 558
559 559 .. parsed-literal::
560 560
561 561 array([False, False, False, True, True], dtype=bool)
562 562
563 563 Now that we have this mask, we can use it to either read those values or
564 564 to reset them to 0:
565 565
566 566 In[26]:
567 567
568 568 .. code:: python
569 569
570 570 print 'Values above 9:', norm10[mask]
571 571
572 572 .. parsed-literal::
573 573
574 574 Values above 9: [ 12.14944165 11.56209294]
575 575
576 576
577 577 In[27]:
578 578
579 579 .. code:: python
580 580
581 581 print 'Resetting all values above 9 to 0...'
582 582 norm10[mask] = 0
583 583 print norm10
584 584
585 585 .. parsed-literal::
586 586
587 587 Resetting all values above 9 to 0...
588 588 [ 8.94879575 5.53038269 8.24847281 0. 0. ]
589 589
590 590
591 591 Arrays with more than one dimension
592 592 -----------------------------------
593 593
594 594 Up until now all our examples have used one-dimensional arrays. But
595 595 Numpy can create arrays of arbitrary dimensions, and all the methods
596 596 illustrated in the previous section work with more than one dimension.
597 597 For example, a list of lists can be used to initialize a two dimensional
598 598 array:
599 599
600 600 In[28]:
601 601
602 602 .. code:: python
603 603
604 604 lst2 = [[1, 2], [3, 4]]
605 605 arr2 = np.array([[1, 2], [3, 4]])
606 606 arr2
607 607
608 608 Out[28]:
609 609
610 610 .. parsed-literal::
611 611
612 612 array([[1, 2],
613 613 [3, 4]])
614 614
615 615 With two-dimensional arrays we start seeing the power of numpy: while a
616 616 nested list can be indexed by repeatedly using the ``[ ]`` operator,
617 617 multidimensional arrays support a much more natural indexing syntax with
618 618 a single ``[ ]`` and a set of indices separated by commas:
619 619
620 620 In[29]:
621 621
622 622 .. code:: python
623 623
624 624 print lst2[0][1]
625 625 print arr2[0,1]
626 626
627 627 .. parsed-literal::
628 628
629 629 2
630 630 2
631 631
632 632
633 633 Most of the array creation functions listed above can be used with more
634 634 than one dimension, for example:
635 635
636 636 In[30]:
637 637
638 638 .. code:: python
639 639
640 640 np.zeros((2,3))
641 641
642 642 Out[30]:
643 643
644 644 .. parsed-literal::
645 645
646 646 array([[ 0., 0., 0.],
647 647 [ 0., 0., 0.]])
648 648
649 649 In[31]:
650 650
651 651 .. code:: python
652 652
653 653 np.random.normal(10, 3, (2, 4))
654 654
655 655 Out[31]:
656 656
657 657 .. parsed-literal::
658 658
659 659 array([[ 11.26788826, 4.29619866, 11.09346496, 9.73861307],
660 660 [ 10.54025996, 9.5146268 , 10.80367214, 13.62204505]])
661 661
662 662 In fact, the shape of an array can be changed at any time, as long as
663 663 the total number of elements is unchanged. For example, if we want a 2x4
664 664 array with numbers increasing from 0, the easiest way to create it is:
665 665
666 666 In[32]:
667 667
668 668 .. code:: python
669 669
670 670 arr = np.arange(8).reshape(2,4)
671 671 print arr
672 672
673 673 .. parsed-literal::
674 674
675 675 [[0 1 2 3]
676 676 [4 5 6 7]]
677 677
678 678
679 679 With multidimensional arrays, you can also use slices, and you can mix
680 680 and match slices and single indices in the different dimensions (using
681 681 the same array as above):
682 682
683 683 In[33]:
684 684
685 685 .. code:: python
686 686
687 687 print 'Slicing in the second row:', arr[1, 2:4]
688 688 print 'All rows, third column :', arr[:, 2]
689 689
690 690 .. parsed-literal::
691 691
692 692 Slicing in the second row: [6 7]
693 693 All rows, third column : [2 6]
694 694
695 695
696 696 If you only provide one index, then you will get an array with one less
697 697 dimension containing that row:
698 698
699 699 In[34]:
700 700
701 701 .. code:: python
702 702
703 703 print 'First row: ', arr[0]
704 704 print 'Second row: ', arr[1]
705 705
706 706 .. parsed-literal::
707 707
708 708 First row: [0 1 2 3]
709 709 Second row: [4 5 6 7]
710 710
711 711
712 712 Now that we have seen how to create arrays with more than one dimension,
713 713 it's a good idea to look at some of the most useful properties and
714 714 methods that arrays have. The following provide basic information about
715 715 the size, shape and data in the array:
716 716
717 717 In[35]:
718 718
719 719 .. code:: python
720 720
721 721 print 'Data type :', arr.dtype
722 722 print 'Total number of elements :', arr.size
723 723 print 'Number of dimensions :', arr.ndim
724 724 print 'Shape (dimensionality) :', arr.shape
725 725 print 'Memory used (in bytes) :', arr.nbytes
726 726
727 727 .. parsed-literal::
728 728
729 729 Data type : int32
730 730 Total number of elements : 8
731 731 Number of dimensions : 2
732 732 Shape (dimensionality) : (2, 4)
733 733 Memory used (in bytes) : 32
734 734
735 735
736 736 Arrays also have many useful methods, some especially useful ones are:
737 737
738 738 In[36]:
739 739
740 740 .. code:: python
741 741
742 742 print 'Minimum and maximum :', arr.min(), arr.max()
743 743 print 'Sum and product of all elements :', arr.sum(), arr.prod()
744 744 print 'Mean and standard deviation :', arr.mean(), arr.std()
745 745
746 746 .. parsed-literal::
747 747
748 748 Minimum and maximum : 0 7
749 749 Sum and product of all elements : 28 0
750 750 Mean and standard deviation : 3.5 2.29128784748
751 751
752 752
753 753 For these methods, the above operations are all computed on all the
754 754 elements of the array. But for a multidimensional array, it's possible
755 755 to do the computation along a single dimension, by passing the ``axis``
756 756 parameter; for example:
757 757
758 758 In[37]:
759 759
760 760 .. code:: python
761 761
762 762 print 'For the following array:\n', arr
763 763 print 'The sum of elements along the rows is :', arr.sum(axis=1)
764 764 print 'The sum of elements along the columns is :', arr.sum(axis=0)
765 765
766 766 .. parsed-literal::
767 767
768 768 For the following array:
769 769 [[0 1 2 3]
770 770 [4 5 6 7]]
771 771 The sum of elements along the rows is : [ 6 22]
772 772 The sum of elements along the columns is : [ 4 6 8 10]
773 773
774 774
775 775 As you can see in this example, the value of the ``axis`` parameter is
776 776 the dimension which will be *consumed* once the operation has been
777 777 carried out. This is why to sum along the columns we use ``axis=0``.
778 778
779 779 This can be easily illustrated with an example that has more dimensions;
780 780 we create an array with 4 dimensions and shape ``(3,4,5,6)`` and sum
781 781 along the axis number 2 (i.e. the *third* axis, since in Python all
782 782 counts are 0-based). That consumes the dimension whose length was 5,
783 783 leaving us with a new array that has shape ``(3,4,6)``:
784 784
785 785 In[38]:
786 786
787 787 .. code:: python
788 788
789 789 np.zeros((3,4,5,6)).sum(2).shape
790 790
791 791 Out[38]:
792 792
793 793 .. parsed-literal::
794 794
795 795 (3, 4, 6)
796 796
797 797 Another widely used property of arrays is the ``.T`` attribute, which
798 798 allows you to access the transpose of the array:
799 799
800 800 In[39]:
801 801
802 802 .. code:: python
803 803
804 804 print 'Array:\n', arr
805 805 print 'Transpose:\n', arr.T
806 806
807 807 .. parsed-literal::
808 808
809 809 Array:
810 810 [[0 1 2 3]
811 811 [4 5 6 7]]
812 812 Transpose:
813 813 [[0 4]
814 814 [1 5]
815 815 [2 6]
816 816 [3 7]]
817 817
818 818
819 819 We don't have time here to look at all the methods and properties of
820 820 arrays, here's a complete list. Simply try exploring some of these
821 821 in IPython to learn more, or read their description in the full Numpy
822 822 documentation:
823 823
824 824 ::
825 825
826 826 arr.T arr.copy arr.getfield arr.put arr.squeeze
827 827 arr.all arr.ctypes arr.imag arr.ravel arr.std
828 828 arr.any arr.cumprod arr.item arr.real arr.strides
829 829 arr.argmax arr.cumsum arr.itemset arr.repeat arr.sum
830 830 arr.argmin arr.data arr.itemsize arr.reshape arr.swapaxes
831 831 arr.argsort arr.diagonal arr.max arr.resize arr.take
832 832 arr.astype arr.dot arr.mean arr.round arr.tofile
833 833 arr.base arr.dtype arr.min arr.searchsorted arr.tolist
834 834 arr.byteswap arr.dump arr.nbytes arr.setasflat arr.tostring
835 835 arr.choose arr.dumps arr.ndim arr.setfield arr.trace
836 836 arr.clip arr.fill arr.newbyteorder arr.setflags arr.transpose
837 837 arr.compress arr.flags arr.nonzero arr.shape arr.var
838 838 arr.conj arr.flat arr.prod arr.size arr.view
839 839 arr.conjugate arr.flatten arr.ptp arr.sort
840 840
841 841
842 842 Operating with arrays
843 843 ---------------------
844 844
845 845 Arrays support all regular arithmetic operators, and the numpy library
846 846 also contains a complete collection of basic mathematical functions that
847 847 operate on arrays. It is important to remember that in general, all
848 848 operations with arrays are applied *element-wise*, i.e., are applied to
849 849 all the elements of the array at the same time. Consider for example:
850 850
851 851 In[40]:
852 852
853 853 .. code:: python
854 854
855 855 arr1 = np.arange(4)
856 856 arr2 = np.arange(10, 14)
857 857 print arr1, '+', arr2, '=', arr1+arr2
858 858
859 859 .. parsed-literal::
860 860
861 861 [0 1 2 3] + [10 11 12 13] = [10 12 14 16]
862 862
863 863
864 864 Importantly, you must remember that even the multiplication operator is
865 865 by default applied element-wise, it is *not* the matrix multiplication
866 866 from linear algebra (as is the case in Matlab, for example):
867 867
868 868 In[41]:
869 869
870 870 .. code:: python
871 871
872 872 print arr1, '*', arr2, '=', arr1*arr2
873 873
874 874 .. parsed-literal::
875 875
876 876 [0 1 2 3] * [10 11 12 13] = [ 0 11 24 39]
877 877
878 878
879 879 While this means that in principle arrays must always match in their
880 880 dimensionality in order for an operation to be valid, numpy will
881 881 *broadcast* dimensions when possible. For example, suppose that you want
882 882 to add the number 1.5 to ``arr1``; the following would be a valid way to
883 883 do it:
884 884
885 885 In[42]:
886 886
887 887 .. code:: python
888 888
889 889 arr1 + 1.5*np.ones(4)
890 890
891 891 Out[42]:
892 892
893 893 .. parsed-literal::
894 894
895 895 array([ 1.5, 2.5, 3.5, 4.5])
896 896
897 897 But thanks to numpy's broadcasting rules, the following is equally
898 898 valid:
899 899
900 900 In[43]:
901 901
902 902 .. code:: python
903 903
904 904 arr1 + 1.5
905 905
906 906 Out[43]:
907 907
908 908 .. parsed-literal::
909 909
910 910 array([ 1.5, 2.5, 3.5, 4.5])
911 911
912 912 In this case, numpy looked at both operands and saw that the first
913 913 (``arr1``) was a one-dimensional array of length 4 and the second was a
914 914 scalar, considered a zero-dimensional object. The broadcasting rules
915 915 allow numpy to:
916 916
917 917 - *create* new dimensions of length 1 (since this doesn't change the
918 918 size of the array)
919 919 - 'stretch' a dimension of length 1 that needs to be matched to a
920 920 dimension of a different size.
921 921
922 922 So in the above example, the scalar 1.5 is effectively:
923 923
924 924 - first 'promoted' to a 1-dimensional array of length 1
925 925 - then, this array is 'stretched' to length 4 to match the dimension of
926 926 ``arr1``.
927 927
928 928 After these two operations are complete, the addition can proceed as now
929 929 both operands are one-dimensional arrays of length 4.
930 930
931 931 This broadcasting behavior is in practice enormously powerful,
932 932 especially because when numpy broadcasts to create new dimensions or to
933 933 'stretch' existing ones, it doesn't actually replicate the data. In the
934 934 example above the operation is carried out *as if* the 1.5 was a 1-d array
935 935 with 1.5 in all of its entries, but no actual array was ever created.
936 936 This can save lots of memory in cases when the arrays in question are
937 937 large and can have significant performance implications.
938 938
939 939 The general rule is: when operating on two arrays, NumPy compares their
940 940 shapes element-wise. It starts with the trailing dimensions, and works
941 941 its way forward, creating dimensions of length 1 as needed. Two
942 942 dimensions are considered compatible when
943 943
944 944 - they are equal to begin with, or
945 945 - one of them is 1; in this case numpy will do the 'stretching' to make
946 946 them equal.
947 947
948 948 If these conditions are not met, a
949 949 ``ValueError: operands could not be broadcast together`` exception is thrown, indicating
950 950 that the arrays have incompatible shapes. The size of the resulting
951 951 array is the maximum size along each dimension of the input arrays.
952 952
953 953 This shows how the broadcasting rules work in several dimensions:
954 954
955 955 In[44]:
956 956
957 957 .. code:: python
958 958
959 959 b = np.array([2, 3, 4, 5])
960 960 print arr, '\n\n+', b , '\n----------------\n', arr + b
961 961
962 962 .. parsed-literal::
963 963
964 964 [[0 1 2 3]
965 965 [4 5 6 7]]
966 966
967 967 + [2 3 4 5]
968 968 ----------------
969 969 [[ 2 4 6 8]
970 970 [ 6 8 10 12]]
971 971
972 972
973 973 Now, how could you use broadcasting to say add ``[4, 6]`` along the rows
974 974 to ``arr`` above? Simply performing the direct addition will produce the
975 975 error we previously mentioned:
976 976
977 977 In[45]:
978 978
979 979 .. code:: python
980 980
981 981 c = np.array([4, 6])
982 982 arr + c
983 983
984 984 ::
985 985
986 986 ---------------------------------------------------------------------------
987 987 ValueError Traceback (most recent call last)
988 988 /home/fperez/teach/book-math-labtool/<ipython-input-45-62aa20ac1980> in <module>()
989 989 1 c = np.array([4, 6])
990 990 ----> 2 arr + c
991 991
992 992 ValueError: operands could not be broadcast together with shapes (2,4) (2)
993 993
994 994 According to the rules above, the array ``c`` would need to have a
995 995 *trailing* dimension of 1 for the broadcasting to work. It turns out
996 996 that numpy allows you to 'inject' new dimensions anywhere into an array
997 997 on the fly, by indexing it with the special object ``np.newaxis``:
998 998
999 999 In[46]:
1000 1000
1001 1001 .. code:: python
1002 1002
1003 1003 (c[:, np.newaxis]).shape
1004 1004
1005 1005 Out[46]:
1006 1006
1007 1007 .. parsed-literal::
1008 1008
1009 1009 (2, 1)
1010 1010
1011 1011 This is exactly what we need, and indeed it works:
1012 1012
1013 1013 In[47]:
1014 1014
1015 1015 .. code:: python
1016 1016
1017 1017 arr + c[:, np.newaxis]
1018 1018
1019 1019 Out[47]:
1020 1020
1021 1021 .. parsed-literal::
1022 1022
1023 1023 array([[ 4, 5, 6, 7],
1024 1024 [10, 11, 12, 13]])
1025 1025
1026 1026 For the full broadcasting rules, please see the official Numpy docs,
1027 1027 which describe them in detail and with more complex examples.
1028 1028
1029 1029 As we mentioned before, Numpy ships with a full complement of
1030 1030 mathematical functions that work on entire arrays, including logarithms,
1031 1031 exponentials, trigonometric and hyperbolic trigonometric functions, etc.
1032 1032 Furthermore, scipy ships a rich special function library in the
1033 1033 ``scipy.special`` module that includes Bessel, Airy, Fresnel, Laguerre
1034 1034 and other classical special functions. For example, sampling the sine
1035 1035 function at 100 points between :math:`0` and :math:`2\pi` is as simple
1036 1036 as:
1037 1037
1038 1038 In[48]:
1039 1039
1040 1040 .. code:: python
1041 1041
1042 1042 x = np.linspace(0, 2*np.pi, 100)
1043 1043 y = np.sin(x)
1044 1044
1045 1045 Linear algebra in numpy
1046 1046 -----------------------
1047 1047
1048 1048 Numpy ships with a basic linear algebra library, and all arrays have a
1049 1049 ``dot`` method whose behavior is that of the scalar dot product when its
1050 1050 arguments are vectors (one-dimensional arrays) and the traditional
1051 1051 matrix multiplication when one or both of its arguments are
1052 1052 two-dimensional arrays:
1053 1053
1054 1054 In[49]:
1055 1055
1056 1056 .. code:: python
1057 1057
1058 1058 v1 = np.array([2, 3, 4])
1059 1059 v2 = np.array([1, 0, 1])
1060 1060 print v1, '.', v2, '=', v1.dot(v2)
1061 1061
1062 1062 .. parsed-literal::
1063 1063
1064 1064 [2 3 4] . [1 0 1] = 6
1065 1065
1066 1066
1067 1067 Here is a regular matrix-vector multiplication, note that the array
1068 1068 ``v1`` should be viewed as a *column* vector in traditional linear
1069 1069 algebra notation; numpy makes no distinction between row and column
1070 1070 vectors and simply verifies that the dimensions match the required rules
1071 1071 of matrix multiplication, in this case we have a :math:`2 \times 3`
1072 1072 matrix multiplied by a 3-vector, which produces a 2-vector:
1073 1073
1074 1074 In[50]:
1075 1075
1076 1076 .. code:: python
1077 1077
1078 1078 A = np.arange(6).reshape(2, 3)
1079 1079 print A, 'x', v1, '=', A.dot(v1)
1080 1080
1081 1081 .. parsed-literal::
1082 1082
1083 1083 [[0 1 2]
1084 1084 [3 4 5]] x [2 3 4] = [11 38]
1085 1085
1086 1086
1087 1087 For matrix-matrix multiplication, the same dimension-matching rules must
1088 1088 be satisfied, e.g. consider the difference between :math:`A \times A^T`:
1089 1089
1090 1090 In[51]:
1091 1091
1092 1092 .. code:: python
1093 1093
1094 1094 print A.dot(A.T)
1095 1095
1096 1096 .. parsed-literal::
1097 1097
1098 1098 [[ 5 14]
1099 1099 [14 50]]
1100 1100
1101 1101
1102 1102 and :math:`A^T \times A`:
1103 1103
1104 1104 In[52]:
1105 1105
1106 1106 .. code:: python
1107 1107
1108 1108 print A.T.dot(A)
1109 1109
1110 1110 .. parsed-literal::
1111 1111
1112 1112 [[ 9 12 15]
1113 1113 [12 17 22]
1114 1114 [15 22 29]]
1115 1115
1116 1116
1117 1117 Furthermore, the ``numpy.linalg`` module includes additional
1118 1118 functionality such as determinants, matrix norms, Cholesky, eigenvalue
1119 1119 and singular value decompositions, etc. For even more linear algebra
1120 1120 tools, ``scipy.linalg`` contains the majority of the tools in the
1121 1121 classic LAPACK libraries as well as functions to operate on sparse
1122 1122 matrices. We refer the reader to the Numpy and Scipy documentations for
1123 1123 additional details on these.
1124 1124
1125 1125 Reading and writing arrays to disk
1126 1126 ----------------------------------
1127 1127
1128 1128 Numpy lets you read and write arrays into files in a number of ways. In
1129 1129 order to use these tools well, it is critical to understand the
1130 1130 difference between a *text* and a *binary* file containing numerical
1131 1131 data. In a text file, the number :math:`\pi` could be written as
1132 1132 "3.141592653589793", for example: a string of digits that a human can
1133 1133 read, with in this case 15 decimal digits. In contrast, that same number
1134 1134 written to a binary file would be encoded as 8 characters (bytes) that
1135 1135 are not readable by a human but which contain the exact same data that
1136 1136 the variable ``pi`` had in the computer's memory.
1137 1137
1138 1138 The tradeoffs between the two modes are thus:
1139 1139
1140 1140 - Text mode: occupies more space, precision can be lost (if not all
1141 1141 digits are written to disk), but is readable and editable by hand
1142 1142 with a text editor. Can *only* be used for one- and two-dimensional
1143 1143 arrays.
1144 1144
1145 1145 - Binary mode: compact and exact representation of the data in memory,
1146 1146 can't be read or edited by hand. Arrays of any size and
1147 1147 dimensionality can be saved and read without loss of information.
1148 1148
1149 1149 First, let's see how to read and write arrays in text mode. The
1150 1150 ``np.savetxt`` function saves an array to a text file, with options to
1151 1151 control the precision, separators and even adding a header:
1152 1152
1153 1153 In[53]:
1154 1154
1155 1155 .. code:: python
1156 1156
1157 1157 arr = np.arange(10).reshape(2, 5)
1158 1158 np.savetxt('test.out', arr, fmt='%.2e', header="My dataset")
1159 1159 !cat test.out
1160 1160
1161 1161 .. parsed-literal::
1162 1162
1163 1163 # My dataset
1164 1164 0.00e+00 1.00e+00 2.00e+00 3.00e+00 4.00e+00
1165 1165 5.00e+00 6.00e+00 7.00e+00 8.00e+00 9.00e+00
1166 1166
1167 1167
1168 1168 And this same type of file can then be read with the matching
1169 1169 ``np.loadtxt`` function:
1170 1170
1171 1171 In[54]:
1172 1172
1173 1173 .. code:: python
1174 1174
1175 1175 arr2 = np.loadtxt('test.out')
1176 1176 print arr2
1177 1177
1178 1178 .. parsed-literal::
1179 1179
1180 1180 [[ 0. 1. 2. 3. 4.]
1181 1181 [ 5. 6. 7. 8. 9.]]
1182 1182
1183 1183
1184 1184 For binary data, Numpy provides the ``np.save`` and ``np.savez``
1185 1185 routines. The first saves a single array to a file with ``.npy``
1186 1186 extension, while the latter can be used to save a *group* of arrays into
1187 1187 a single file with ``.npz`` extension. The files created with these
1188 1188 routines can then be read with the ``np.load`` function.
1189 1189
1190 1190 Let us first see how to use the simpler ``np.save`` function to save a
1191 1191 single array:
1192 1192
1193 1193 In[55]:
1194 1194
1195 1195 .. code:: python
1196 1196
1197 1197 np.save('test.npy', arr2)
1198 1198 # Now we read this back
1199 1199 arr2n = np.load('test.npy')
1200 1200 # Let's see if any element is non-zero in the difference.
1201 1201 # A value of True would be a problem.
1202 1202 print 'Any differences?', np.any(arr2-arr2n)
1203 1203
1204 1204 .. parsed-literal::
1205 1205
1206 1206 Any differences? False
1207 1207
1208 1208
1209 1209 Now let us see how the ``np.savez`` function works. You give it a
1210 1210 filename and either a sequence of arrays or a set of keywords. In the
1211 1211 first mode, the function will automatically name the saved arrays in the
1212 1212 archive as ``arr_0``, ``arr_1``, etc:
1213 1213
1214 1214 In[56]:
1215 1215
1216 1216 .. code:: python
1217 1217
1218 1218 np.savez('test.npz', arr, arr2)
1219 1219 arrays = np.load('test.npz')
1220 1220 arrays.files
1221 1221
1222 1222 Out[56]:
1223 1223
1224 1224 .. parsed-literal::
1225 1225
1226 1226 ['arr_1', 'arr_0']
1227 1227
1228 1228 Alternatively, we can explicitly choose how to name the arrays we save:
1229 1229
1230 1230 In[57]:
1231 1231
1232 1232 .. code:: python
1233 1233
1234 1234 np.savez('test.npz', array1=arr, array2=arr2)
1235 1235 arrays = np.load('test.npz')
1236 1236 arrays.files
1237 1237
1238 1238 Out[57]:
1239 1239
1240 1240 .. parsed-literal::
1241 1241
1242 1242 ['array2', 'array1']
1243 1243
1244 1244 The object returned by ``np.load`` from an ``.npz`` file works like a
1245 1245 dictionary, though you can also access its constituent files by
1246 1246 attribute using its special ``.f`` field; this is best illustrated with
1247 1247 an example with the ``arrays`` object from above:
1248 1248
1249 1249 In[58]:
1250 1250
1251 1251 .. code:: python
1252 1252
1253 1253 print 'First row of first array:', arrays['array1'][0]
1254 1254 # This is an equivalent way to get the same field
1255 1255 print 'First row of first array:', arrays.f.array1[0]
1256 1256
1257 1257 .. parsed-literal::
1258 1258
1259 1259 First row of first array: [0 1 2 3 4]
1260 1260 First row of first array: [0 1 2 3 4]
1261 1261
1262 1262
1263 1263 This ``.npz`` format is a very convenient way to package compactly and
1264 1264 without loss of information, into a single file, a group of related
1265 1265 arrays that pertain to a specific problem. At some point, however, the
1266 1266 complexity of your dataset may be such that the optimal approach is to
1267 1267 use one of the standard formats in scientific data processing that have
1268 1268 been designed to handle complex datasets, such as NetCDF or HDF5.
1269 1269
1270 1270 Fortunately, there are tools for manipulating these formats in Python,
1271 1271 and for storing data in other ways such as databases. A complete
1272 1272 discussion of the possibilities is beyond the scope of this discussion,
1273 1273 but of particular interest for scientific users we at least mention the
1274 1274 following:
1275 1275
1276 1276 - The ``scipy.io`` module contains routines to read and write Matlab
1277 1277 files in ``.mat`` format and files in the NetCDF format that is
1278 1278 widely used in certain scientific disciplines.
1279 1279
1280 1280 - For manipulating files in the HDF5 format, there are two excellent
1281 1281 options in Python: The PyTables project offers a high-level, object
1282 1282 oriented approach to manipulating HDF5 datasets, while the h5py
1283 1283 project offers a more direct mapping to the standard HDF5 library
1284 1284 interface. Both are excellent tools; if you need to work with HDF5
1285 1285 datasets you should read some of their documentation and examples and
1286 1286 decide which approach is a better match for your needs.
1287 1287
1288 1288
1289 1289
1290 1290 High quality data visualization with Matplotlib
1291 1291 ===============================================
1292 1292
1293 1293 The `matplotlib <http://matplotlib.sf.net>`_ library is a powerful tool
1294 1294 capable of producing complex publication-quality figures with fine
1295 1295 layout control in two and three dimensions; here we will only provide a
1296 1296 minimal self-contained introduction to its usage that covers the
1297 1297 functionality needed for the rest of the book. We encourage the reader
1298 1298 to read the tutorials included with the matplotlib documentation as well
1299 1299 as to browse its extensive gallery of examples that include source code.
1300 1300
1301 1301 Just as we typically use the shorthand ``np`` for Numpy, we will use
1302 1302 ``plt`` for the ``matplotlib.pyplot`` module where the easy-to-use
1303 1303 plotting functions reside (the library contains a rich object-oriented
1304 1304 architecture that we don't have the space to discuss here):
1305 1305
1306 1306 In[59]:
1307 1307
1308 1308 .. code:: python
1309 1309
1310 1310 import matplotlib.pyplot as plt
1311 1311
1312 1312 The most frequently used function is simply called ``plot``, here is how
1313 1313 you can make a simple plot of :math:`\sin(x)` for
1314 1314 :math:`x \in [0, 2\pi]` with labels and a grid (we use the semicolon in
1315 1315 the last line to suppress the display of some information that is
1316 1316 unnecessary right now):
1317 1317
1318 1318 In[60]:
1319 1319
1320 1320 .. code:: python
1321 1321
1322 1322 x = np.linspace(0, 2*np.pi)
1323 1323 y = np.sin(x)
1324 1324 plt.plot(x,y, label='sin(x)')
1325 1325 plt.legend()
1326 1326 plt.grid()
1327 1327 plt.title('Harmonic')
1328 1328 plt.xlabel('x')
1329 1329 plt.ylabel('y');
1330 1330
1331 .. image:: /Users/bussonniermatthias/nbconvert/tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_01.svg
1331 .. image:: tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_01.svg
1332 1332
1333 1333 You can control the style, color and other properties of the markers,
1334 1334 for example:
1335 1335
1336 1336 In[61]:
1337 1337
1338 1338 .. code:: python
1339 1339
1340 1340 plt.plot(x, y, linewidth=2);
1341 1341
1342 .. image:: /Users/bussonniermatthias/nbconvert/tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_02.svg
1342 .. image:: tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_02.svg
1343 1343
1344 1344 In[62]:
1345 1345
1346 1346 .. code:: python
1347 1347
1348 1348 plt.plot(x, y, 'o', markersize=5, color='r');
1349 1349
1350 .. image:: /Users/bussonniermatthias/nbconvert/tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_03.svg
1350 .. image:: tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_03.svg
1351 1351
1352 1352 We will now see how to create a few other common plot types, such as a
1353 1353 simple error plot:
1354 1354
1355 1355 In[63]:
1356 1356
1357 1357 .. code:: python
1358 1358
1359 1359 # example data
1360 1360 x = np.arange(0.1, 4, 0.5)
1361 1361 y = np.exp(-x)
1362 1362
1363 1363 # example variable error bar values
1364 1364 yerr = 0.1 + 0.2*np.sqrt(x)
1365 1365 xerr = 0.1 + yerr
1366 1366
1367 1367 # First illustrate basic pyplot interface, using defaults where possible.
1368 1368 plt.figure()
1369 1369 plt.errorbar(x, y, xerr=0.2, yerr=0.4)
1370 1370 plt.title("Simplest errorbars, 0.2 in x, 0.4 in y");
1371 1371
1372 .. image:: /Users/bussonniermatthias/nbconvert/tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_04.svg
1372 .. image:: tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_04.svg
1373 1373
1374 1374 A simple log plot
1375 1375
1376 1376 In[64]:
1377 1377
1378 1378 .. code:: python
1379 1379
1380 1380 x = np.linspace(-5, 5)
1381 1381 y = np.exp(-x**2)
1382 1382 plt.semilogy(x, y);
1383 1383
1384 .. image:: /Users/bussonniermatthias/nbconvert/tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_05.svg
1384 .. image:: tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_05.svg
1385 1385
1386 1386 A histogram annotated with text inside the plot, using the ``text``
1387 1387 function:
1388 1388
1389 1389 In[65]:
1390 1390
1391 1391 .. code:: python
1392 1392
1393 1393 mu, sigma = 100, 15
1394 1394 x = mu + sigma * np.random.randn(10000)
1395 1395
1396 1396 # the histogram of the data
1397 1397 n, bins, patches = plt.hist(x, 50, normed=1, facecolor='g', alpha=0.75)
1398 1398
1399 1399 plt.xlabel('Smarts')
1400 1400 plt.ylabel('Probability')
1401 1401 plt.title('Histogram of IQ')
1402 1402 # This will put a text fragment at the position given:
1403 1403 plt.text(55, .027, r'$\mu=100,\ \sigma=15$', fontsize=14)
1404 1404 plt.axis([40, 160, 0, 0.03])
1405 1405 plt.grid(True)
1406 1406
1407 .. image:: /Users/bussonniermatthias/nbconvert/tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_06.svg
1407 .. image:: tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_06.svg
1408 1408
1409 1409 Image display
1410 1410 -------------
1411 1411
1412 1412 The ``imshow`` command can display single or multi-channel images. A
1413 1413 simple array of random numbers, plotted in grayscale:
1414 1414
1415 1415 In[66]:
1416 1416
1417 1417 .. code:: python
1418 1418
1419 1419 from matplotlib import cm
1420 1420 plt.imshow(np.random.rand(5, 10), cmap=cm.gray, interpolation='nearest');
1421 1421
1422 .. image:: /Users/bussonniermatthias/nbconvert/tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_07.svg
1422 .. image:: tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_07.svg
1423 1423
1424 1424 A real photograph is a multichannel image, ``imshow`` interprets it
1425 1425 correctly:
1426 1426
1427 1427 In[67]:
1428 1428
1429 1429 .. code:: python
1430 1430
1431 1431 img = plt.imread('stinkbug.png')
1432 1432 print 'Dimensions of the array img:', img.shape
1433 1433 plt.imshow(img);
1434 1434
1435 1435 .. parsed-literal::
1436 1436
1437 1437 Dimensions of the array img: (375, 500, 3)
1438 1438
1439 1439
1440 .. image:: /Users/bussonniermatthias/nbconvert/tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_08.svg
1440 .. image:: tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_08.svg
1441 1441
1442 1442 Simple 3d plotting with matplotlib
1443 1443 ----------------------------------
1444 1444
1445 1445 Note that you must execute at least once in your session:
1446 1446
1447 1447 In[68]:
1448 1448
1449 1449 .. code:: python
1450 1450
1451 1451 from mpl_toolkits.mplot3d import Axes3D
1452 1452
1453 1453 Once this has been done, you can create 3d axes with the
1454 1454 ``projection='3d'`` keyword to ``add_subplot``:
1455 1455
1456 1456 ::
1457 1457
1458 1458 fig = plt.figure()
1459 1459 fig.add_subplot(<other arguments here>, projection='3d')
1460 1460
1461 1461
1462 1462 A simple surface plot:
1463 1463
1464 1464 In[72]:
1465 1465
1466 1466 .. code:: python
1467 1467
1468 1468 from mpl_toolkits.mplot3d.axes3d import Axes3D
1469 1469 from matplotlib import cm
1470 1470
1471 1471 fig = plt.figure()
1472 1472 ax = fig.add_subplot(1, 1, 1, projection='3d')
1473 1473 X = np.arange(-5, 5, 0.25)
1474 1474 Y = np.arange(-5, 5, 0.25)
1475 1475 X, Y = np.meshgrid(X, Y)
1476 1476 R = np.sqrt(X**2 + Y**2)
1477 1477 Z = np.sin(R)
1478 1478 surf = ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap=cm.jet,
1479 1479 linewidth=0, antialiased=False)
1480 1480 ax.set_zlim3d(-1.01, 1.01);
1481 1481
1482 .. image:: /Users/bussonniermatthias/nbconvert/tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_09.svg
1482 .. image:: tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_09.svg
1483 1483
1484 1484 IPython: a powerful interactive environment
1485 1485 ===========================================
1486 1486
1487 1487 A key component of the everyday workflow of most scientific computing
1488 1488 environments is a good interactive environment, that is, a system in
1489 1489 which you can execute small amounts of code and view the results
1490 1490 immediately, combining both printing out data and opening graphical
1491 1491 visualizations. All modern systems for scientific computing, commercial
1492 1492 and open source, include such functionality.
1493 1493
1494 1494 Out of the box, Python also offers a simple interactive shell with very
1495 1495 limited capabilities. But just like the scientific community built Numpy
1496 1496 to provide arrays suited for scientific work (since Python's lists
1497 1497 aren't optimal for this task), it has also developed an interactive
1498 1498 environment much more sophisticated than the built-in one. The `IPython
1499 1499 project <http://ipython.org>`_ offers a set of tools to make productive
1500 1500 use of the Python language, all the while working interactively and with
1501 1501 immediate feedback on your results. The basic tools that IPython provides
1502 1502 are:
1503 1503
1504 1504 1. A powerful terminal shell, with many features designed to increase
1505 1505 the fluidity and productivity of everyday scientific workflows,
1506 1506 including:
1507 1507
1508 1508 - rich introspection of all objects and variables including easy
1509 1509 access to the source code of any function
1510 1510 - powerful and extensible tab completion of variables and filenames,
1511 1511 - tight integration with matplotlib, supporting interactive figures
1512 1512 that don't block the terminal,
1513 1513 - direct access to the filesystem and underlying operating system,
1514 1514 - an extensible system for shell-like commands called 'magics' that
1515 1515 reduce the work needed to perform many common tasks,
1516 1516 - tools for easily running, timing, profiling and debugging your
1517 1517 codes,
1518 1518 - syntax highlighted error messages with much more detail than the
1519 1519 default Python ones,
1520 1520 - logging and access to all previous history of inputs, including
1521 1521 across sessions
1522 1522
1523 1523 2. A Qt console that provides the look and feel of a terminal, but adds
1524 1524 support for inline figures, graphical calltips, a persistent session
1525 1525 that can survive crashes (even segfaults) of the kernel process, and
1526 1526 more.
1527 1527
1528 1528 3. A web-based notebook that can execute code and also contain rich text
1529 1529 and figures, mathematical equations and arbitrary HTML. This notebook
1530 1530 presents a document-like view with cells where code is executed but
1531 1531 that can be edited in-place, reordered, mixed with explanatory text
1532 1532 and figures, etc.
1533 1533
1534 1534 4. A high-performance, low-latency system for parallel computing that
1535 1535 supports the control of a cluster of IPython engines communicating
1536 1536 over a network, with optimizations that minimize unnecessary copying
1537 1537 of large objects (especially numpy arrays).
1538 1538
1539 1539 We will now discuss the highlights of the tools 1-3 above so that you
1540 1540 can make them an effective part of your workflow. The topic of parallel
1541 1541 computing is beyond the scope of this document, but we encourage you to
1542 1542 read the extensive
1543 1543 `documentation <http://ipython.org/ipython-doc/rel-0.12.1/parallel/index.html>`_
1544 1544 and `tutorials <http://minrk.github.com/scipy-tutorial-2011/>`_ on this
1545 1545 available on the IPython website.
1546 1546
1547 1547 The IPython terminal
1548 1548 --------------------
1549 1549
1550 1550 You can start IPython at the terminal simply by typing:
1551 1551
1552 1552 ::
1553 1553
1554 1554 $ ipython
1555 1555
1556 1556 which will provide you some basic information about how to get started
1557 1557 and will then open a prompt labeled ``In [1]:`` for you to start typing.
1558 1558 Here we type :math:`2^{64}` and Python computes the result for us in
1559 1559 exact arithmetic, returning it as ``Out[1]``:
1560 1560
1561 1561 ::
1562 1562
1563 1563 $ ipython
1564 1564 Python 2.7.2+ (default, Oct 4 2011, 20:03:08)
1565 1565 Type "copyright", "credits" or "license" for more information.
1566 1566
1567 1567 IPython 0.13.dev -- An enhanced Interactive Python.
1568 1568 ? -> Introduction and overview of IPython's features.
1569 1569 %quickref -> Quick reference.
1570 1570 help -> Python's own help system.
1571 1571 object? -> Details about 'object', use 'object??' for extra details.
1572 1572
1573 1573 In [1]: 2**64
1574 1574 Out[1]: 18446744073709551616L
1575 1575
1576 1576 The first thing you should know about IPython is that all your inputs
1577 1577 and outputs are saved. There are two variables named ``In`` and ``Out``
1578 1578 which are filled as you work with your results. Furthermore, all outputs
1579 1579 are also saved to auto-created variables of the form ``_NN`` where
1580 1580 ``NN`` is the prompt number, and inputs to ``_iNN``. This allows you to
1581 1581 recover quickly the result of a prior computation by referring to its
1582 1582 number even if you forgot to store it as a variable. For example, later
1583 1583 on in the above session you can do:
1584 1584
1585 1585 ::
1586 1586
1587 1587 In [6]: print _1
1588 1588 18446744073709551616
1589 1589
1590 1590
1591 1591 We strongly recommend that you take a few minutes to read at least the
1592 1592 basic introduction provided by the ``?`` command, and keep in mind that
1593 1593 the ``%quickref`` command at all times can be used as a quick reference
1594 1594 "cheat sheet" of the most frequently used features of IPython.
1595 1595
1596 1596 At the IPython prompt, any valid Python code that you type will be
1597 1597 executed similarly to the default Python shell (though often with more
1598 1598 informative feedback). But IPython is a *superset* of the default
1599 1599 Python shell; let's have a brief look at some of its additional
1600 1600 functionality.
1601 1601
1602 1602 **Object introspection**
1603 1603
1604 1604 A simple ``?`` command provides a general introduction to IPython, but
1605 1605 as indicated in the banner above, you can use the ``?`` syntax to ask
1606 1606 for details about any object. For example, if we type ``_1?``, IPython
1607 1607 will print the following details about this variable:
1608 1608
1609 1609 ::
1610 1610
1611 1611 In [14]: _1?
1612 1612 Type: long
1613 1613 Base Class: <type 'long'>
1614 1614 String Form:18446744073709551616
1615 1615 Namespace: Interactive
1616 1616 Docstring:
1617 1617 long(x[, base]) -> integer
1618 1618
1619 1619 Convert a string or number to a long integer, if possible. A floating
1620 1620
1621 1621 [etc... snipped for brevity]
1622 1622
1623 1623 If you add a second ``?`` and for any object ``x`` type ``x??``,
1624 1624 IPython will try to provide an even more detailed analysis of the
1625 1625 object, including its syntax-highlighted source code when it can be
1626 1626 found. It's possible that ``x??`` returns the same information as
1627 1627 ``x?``, but in many cases ``x??`` will indeed provide additional
1628 1628 details.
1629 1629
1630 1630 Finally, the ``?`` syntax is also useful to search *namespaces* with
1631 1631 wildcards. Suppose you are wondering if there is any function in Numpy
1632 1632 that may do text-related things; with ``np.*txt*?``, IPython will print
1633 1633 all the names in the ``np`` namespace (our Numpy shorthand) that have
1634 1634 'txt' anywhere in their name:
1635 1635
1636 1636 ::
1637 1637
1638 1638 In [17]: np.*txt*?
1639 1639 np.genfromtxt
1640 1640 np.loadtxt
1641 1641 np.mafromtxt
1642 1642 np.ndfromtxt
1643 1643 np.recfromtxt
1644 1644 np.savetxt
1645 1645
1646 1646
1647 1647 **Tab completion**
1648 1648
1649 1649 IPython makes the tab key work extra hard for you as a way to rapidly
1650 1650 inspect objects and libraries. Whenever you have typed something at the
1651 1651 prompt, by hitting the ``<tab>`` key IPython will try to complete the
1652 1652 rest of the line. For this, IPython will analyze the text you had so far
1653 1653 and try to search for Python data or files that may match the context
1654 1654 you have already provided.
1655 1655
1656 1656 For example, if you type ``np.load`` and hit the key, you'll see:
1657 1657
1658 1658 ::
1659 1659
1660 1660 In [21]: np.load<TAB HERE>
1661 1661 np.load np.loads np.loadtxt
1662 1662
1663 1663 so you can quickly find all the load-related functionality in numpy. Tab
1664 1664 completion works even for function arguments, for example consider this
1665 1665 function definition:
1666 1666
1667 1667 ::
1668 1668
1669 1669 In [20]: def f(x, frobinate=False):
1670 1670 ....: if frobinate:
1671 1671 ....: return x**2
1672 1672 ....:
1673 1673
1674 1674 If you now use the ``<tab>`` key after having typed 'fro' you'll get all
1675 1675 valid Python completions, but those marked with ``=`` at the end are
1676 1676 known to be keywords of your function:
1677 1677
1678 1678 ::
1679 1679
1680 1680 In [21]: f(2, fro<TAB HERE>
1681 1681 frobinate= frombuffer fromfunction frompyfunc fromstring
1682 1682 from fromfile fromiter fromregex frozenset
1683 1683
1684 1684 at this point you can add the ``b`` letter and hit ``<tab>`` once more,
1685 1685 and IPython will finish the line for you:
1686 1686
1687 1687 ::
1688 1688
1689 1689 In [21]: f(2, frobinate=
1690 1690
1691 1691 As a beginner, simply get into the habit of using ``<tab>`` after most
1692 1692 objects; it should quickly become second nature as you will see how
1693 1693 it helps keep a fluid workflow and discover useful information. Later on
1694 1694 you can also customize this behavior by writing your own completion
1695 1695 code, if you so desire.
1696 1696
1697 1697 **Matplotlib integration**
1698 1698
1699 1699 One of the most useful features of IPython for scientists is its tight
1700 1700 integration with matplotlib: at the terminal IPython lets you open
1701 1701 matplotlib figures without blocking your typing (which is what happens
1702 1702 if you try to do the same thing at the default Python shell), and in the
1703 1703 Qt console and notebook you can even view your figures embedded in your
1704 1704 workspace next to the code that created them.
1705 1705
1706 1706 The matplotlib support can be either activated when you start IPython by
1707 1707 passing the ``--pylab`` flag, or at any point later in your session by
1708 1708 using the ``%pylab`` command. If you start IPython with ``--pylab``,
1709 1709 you'll see something like this (note the extra message about pylab):
1710 1710
1711 1711 ::
1712 1712
1713 1713 $ ipython --pylab
1714 1714 Python 2.7.2+ (default, Oct 4 2011, 20:03:08)
1715 1715 Type "copyright", "credits" or "license" for more information.
1716 1716
1717 1717 IPython 0.13.dev -- An enhanced Interactive Python.
1718 1718 ? -> Introduction and overview of IPython's features.
1719 1719 %quickref -> Quick reference.
1720 1720 help -> Python's own help system.
1721 1721 object? -> Details about 'object', use 'object??' for extra details.
1722 1722
1723 1723 Welcome to pylab, a matplotlib-based Python environment [backend: Qt4Agg].
1724 1724 For more information, type 'help(pylab)'.
1725 1725
1726 1726 In [1]:
1727 1727
1728 1728 Furthermore, IPython will import ``numpy`` with the ``np`` shorthand,
1729 1729 ``matplotlib.pyplot`` as ``plt``, and it will also load all of the numpy
1730 1730 and pyplot top-level names so that you can directly type something like:
1731 1731
1732 1732 ::
1733 1733
1734 1734 In [1]: x = linspace(0, 2*pi, 200)
1735 1735
1736 1736 In [2]: plot(x, sin(x))
1737 1737 Out[2]: [<matplotlib.lines.Line2D at 0x9e7c16c>]
1738 1738
1739 1739 instead of having to prefix each call with its full signature (as we
1740 1740 have been doing in the examples thus far):
1741 1741
1742 1742 ::
1743 1743
1744 1744 In [3]: x = np.linspace(0, 2*np.pi, 200)
1745 1745
1746 1746 In [4]: plt.plot(x, np.sin(x))
1747 1747 Out[4]: [<matplotlib.lines.Line2D at 0x9e900ac>]
1748 1748
1749 1749 This shorthand notation can be a huge time-saver when working
1750 1750 interactively (it's a few characters but you are likely to type them
1751 1751 hundreds of times in a session). But we should note that as you develop
1752 1752 persistent scripts and notebooks meant for reuse, it's best to get in
1753 1753 the habit of using the longer notation (known as *fully qualified names*),
1754 1754 as it's clearer where things come from and it makes for more robust,
1755 1755 readable and maintainable code in the long run.
1756 1756
1757 1757 **Access to the operating system and files**
1758 1758
1759 1759 In IPython, you can type ``ls`` to see your files or ``cd`` to change
1760 1760 directories, just like you would at a regular system prompt:
1761 1761
1762 1762 ::
1763 1763
1764 1764 In [2]: cd tests
1765 1765 /home/fperez/ipython/nbconvert/tests
1766 1766
1767 1767 In [3]: ls test.*
1768 1768 test.aux test.html test.ipynb test.log test.out test.pdf test.rst test.tex
1769 1769
1770 1770 Furthermore, if you use the ``!`` at the beginning of a line, any
1771 1771 commands you pass afterwards go directly to the operating system:
1772 1772
1773 1773 ::
1774 1774
1775 1775 In [4]: !echo "Hello IPython"
1776 1776 Hello IPython
1777 1777
1778 1778 IPython offers a useful twist in this feature: it will substitute in the
1779 1779 command the value of any *Python* variable you may have if you prepend
1780 1780 it with a ``$`` sign:
1781 1781
1782 1782 ::
1783 1783
1784 1784 In [5]: message = 'IPython interpolates from Python to the shell'
1785 1785
1786 1786 In [6]: !echo $message
1787 1787 IPython interpolates from Python to the shell
1788 1788
1789 1789 This feature can be extremely useful, as it lets you combine the power
1790 1790 and clarity of Python for complex logic with the immediacy and
1791 1791 familiarity of many shell commands. Additionally, if you start the line
1792 1792 with *two* ``!!`` signs, the output of the command will be automatically
1793 1793 captured as a list of lines, e.g.:
1794 1794
1795 1795 ::
1796 1796
1797 1797 In [10]: !!ls test.*
1798 1798 Out[10]:
1799 1799 ['test.aux',
1800 1800 'test.html',
1801 1801 'test.ipynb',
1802 1802 'test.log',
1803 1803 'test.out',
1804 1804 'test.pdf',
1805 1805 'test.rst',
1806 1806 'test.tex']
1807 1807
1808 1808 As explained above, you can now use this as the variable ``_10``. If you
1809 1809 directly want to capture the output of a system command to a Python
1810 1810 variable, you can use the syntax ``=!``:
1811 1811
1812 1812 ::
1813 1813
1814 1814 In [11]: testfiles =! ls test.*
1815 1815
1816 1816 In [12]: print testfiles
1817 1817 ['test.aux', 'test.html', 'test.ipynb', 'test.log', 'test.out', 'test.pdf', 'test.rst', 'test.tex']
1818 1818
1819 1819 Finally, the special ``%alias`` command lets you define names that are
1820 1820 shorthands for system commands, so that you can type them without having
1821 1821 to prefix them via ``!`` explicitly (for example, ``ls`` is an alias
1822 1822 that has been predefined for you at startup).
1823 1823
1824 1824 **Magic commands**
1825 1825
1826 1826 IPython has a system for special commands, called 'magics', that let you
1827 1827 control IPython itself and perform many common tasks with a more
1828 1828 shell-like syntax: it uses spaces for delimiting arguments, flags can be
1829 1829 set with dashes and all arguments are treated as strings, so no
1830 1830 additional quoting is required. This kind of syntax is invalid in the
1831 1831 Python language but very convenient for interactive typing (less
1832 1832 parentheses, commas and quoting everywhere); IPython distinguishes the
1833 1833 two by detecting lines that start with the ``%`` character.
1834 1834
1835 1835 You can learn more about the magic system by simply typing ``%magic`` at
1836 1836 the prompt, which will give you a short description plus the
1837 1837 documentation on *all* available magics. If you want to see only a
1838 1838 listing of existing magics, you can use ``%lsmagic``:
1839 1839
1840 1840 ::
1841 1841
1842 1842 In [4]: lsmagic
1843 1843 Available magic functions:
1844 1844 %alias %autocall %autoindent %automagic %bookmark %c %cd %colors %config %cpaste
1845 1845 %debug %dhist %dirs %doctest_mode %ds %ed %edit %env %gui %hist %history
1846 1846 %install_default_config %install_ext %install_profiles %load_ext %loadpy %logoff %logon
1847 1847 %logstart %logstate %logstop %lsmagic %macro %magic %notebook %page %paste %pastebin
1848 1848 %pd %pdb %pdef %pdoc %pfile %pinfo %pinfo2 %pop %popd %pprint %precision %profile
1849 1849 %prun %psearch %psource %pushd %pwd %pycat %pylab %quickref %recall %rehashx
1850 1850 %reload_ext %rep %rerun %reset %reset_selective %run %save %sc %stop %store %sx %tb
1851 1851 %time %timeit %unalias %unload_ext %who %who_ls %whos %xdel %xmode
1852 1852
1853 1853 Automagic is ON, % prefix NOT needed for magic functions.
1854 1854
1855 1855 Note how the example above omitted the explicit ``%`` marker and simply
1856 1856 uses ``lsmagic``. As long as the 'automagic' feature is on (which it is
1857 1857 by default), you can omit the ``%`` marker as long as there is no
1858 1858 ambiguity with a Python variable of the same name.
1859 1859
1860 1860 **Running your code**
1861 1861
1862 1862 While it's easy to type a few lines of code in IPython, for any
1863 1863 long-lived work you should keep your codes in Python scripts (or in
1864 1864 IPython notebooks, see below). Consider that you have a script, in this
1865 1865 case trivially simple for the sake of brevity, named ``simple.py``:
1866 1866
1867 1867 ::
1868 1868
1869 1869 In [12]: !cat simple.py
1870 1870 import numpy as np
1871 1871
1872 1872 x = np.random.normal(size=100)
1873 1873
1874 1874 print 'First elment of x:', x[0]
1875 1875
1876 1876 The typical workflow with IPython is to use the ``%run`` magic to
1877 1877 execute your script (you can omit the .py extension if you want). When
1878 1878 you run it, the script will execute just as if it had been run at the
1879 1879 system prompt with ``python simple.py`` (though since modules don't get
1880 1880 re-executed on new imports by Python, all system initialization is
1881 1881 essentially free, which can have a significant run time impact in some
1882 1882 cases):
1883 1883
1884 1884 ::
1885 1885
1886 1886 In [13]: run simple
1887 1887 First elment of x: -1.55872256289
1888 1888
1889 1889 Once it completes, all variables defined in it become available for you
1890 1890 to use interactively:
1891 1891
1892 1892 ::
1893 1893
1894 1894 In [14]: x.shape
1895 1895 Out[14]: (100,)
1896 1896
1897 1897 This allows you to plot data, try out ideas, etc, in a
1898 1898 ``%run``/interact/edit cycle that can be very productive. As you start
1899 1899 understanding your problem better you can refine your script further,
1900 1900 incrementally improving it based on the work you do at the IPython
1901 1901 prompt. At any point you can use the ``%hist`` magic to print out your
1902 1902 history without prompts, so that you can copy useful fragments back into
1903 1903 the script.
1904 1904
1905 1905 By default, ``%run`` executes scripts in a completely empty namespace,
1906 1906 to better mimic how they would execute at the system prompt with plain
1907 1907 Python. But if you use the ``-i`` flag, the script will also see your
1908 1908 interactively defined variables. This lets you edit in a script larger
1909 1909 amounts of code that still behave as if you had typed them at the
1910 1910 IPython prompt.
1911 1911
1912 1912 You can also get a summary of the time taken by your script with the
1913 1913 ``-t`` flag; consider a different script ``randsvd.py`` that takes a bit
1914 1914 longer to run:
1915 1915
1916 1916 ::
1917 1917
1918 1918 In [21]: run -t randsvd.py
1919 1919
1920 1920 IPython CPU timings (estimated):
1921 1921 User : 0.38 s.
1922 1922 System : 0.04 s.
1923 1923 Wall time: 0.34 s.
1924 1924
1925 1925 ``User`` is the time spent by the computer executing your code, while
1926 1926 ``System`` is the time the operating system had to work on your behalf,
1927 1927 doing things like memory allocation that are needed by your code but
1928 1928 that you didn't explicitly program and that happen inside the kernel.
1929 1929 The ``Wall time`` is the time on a 'clock on the wall' between the start
1930 1930 and end of your program.
1931 1931
1932 1932 If ``Wall > User+System``, your code is most likely waiting idle for
1933 1933 certain periods. That could be waiting for data to arrive from a remote
1934 1934 source or perhaps because the operating system has to swap large amounts
1935 1935 of virtual memory. If you know that your code doesn't explicitly wait
1936 1936 for remote data to arrive, you should investigate further to identify
1937 1937 possible ways of improving the performance profile.
1938 1938
1939 1939 If you only want to time how long a single statement takes, you don't
1940 1940 need to put it into a script as you can use the ``%timeit`` magic, which
1941 1941 uses Python's ``timeit`` module to very carefully measure timing data;
1942 1942 ``timeit`` can measure even short statements that execute extremely
1943 1943 fast:
1944 1944
1945 1945 ::
1946 1946
1947 1947 In [27]: %timeit a=1
1948 1948 10000000 loops, best of 3: 23 ns per loop
1949 1949
1950 1950 and for code that runs longer, it automatically adjusts so the overall
1951 1951 measurement doesn't take too long:
1952 1952
1953 1953 ::
1954 1954
1955 1955 In [28]: %timeit np.linalg.svd(x)
1956 1956 1 loops, best of 3: 310 ms per loop
1957 1957
1958 1958 The ``%run`` magic still has more options for debugging and profiling
1959 1959 data; you should read its documentation for many useful details (as
1960 1960 always, just type ``%run?``).
1961 1961
1962 1962 The graphical Qt console
1963 1963 ------------------------
1964 1964
1965 1965 If you type at the system prompt (see the IPython website for
1966 1966 installation details, as this requires some additional libraries):
1967 1967
1968 1968 ::
1969 1969
1970 1970 $ ipython qtconsole
1971 1971
1972 1972 instead of opening in a terminal as before, IPython will start a
1973 1973 graphical console that at first sight appears just like a terminal, but
1974 1974 which is in fact much more capable than a text-only terminal. This is a
1975 1975 specialized terminal designed for interactive scientific work, and it
1976 1976 supports full multi-line editing with color highlighting and graphical
1977 1977 calltips for functions, it can keep multiple IPython sessions open
1978 1978 simultaneously in tabs, and when scripts run it can display the figures
1979 1979 inline directly in the work area.
1980 1980
1981 1981 .. raw:: html
1982 1982
1983 1983 <center>
1984 1984
1985 1985 .. raw:: html
1986 1986
1987 1987 </center>
1988 1988
1989 1989
1990 1990 % This cell is for the pdflatex output only
1991 1991 \begin{figure}[htbp]
1992 1992 \centering
1993 1993 \includegraphics[width=3in]{ipython_qtconsole2.png}
1994 1994 \caption{The IPython Qt console: a lightweight terminal for scientific exploration, with code, results and graphics in a single environment.}
1995 1995 \end{figure}
1996 1996 The Qt console accepts the same ``--pylab`` startup flags as the
1997 1997 terminal, but you can additionally supply the value ``--pylab inline``,
1998 1998 which enables the support for inline graphics shown in the figure. This
1999 1999 is ideal for keeping all the code and figures in the same session, given
2000 2000 that the console can save the output of your entire session to HTML or
2001 2001 PDF.
2002 2002
2003 2003 Since the Qt console makes it far more convenient than the terminal to
2004 2004 edit blocks of code with multiple lines, in this environment it's worth
2005 2005 knowing about the ``%loadpy`` magic function. ``%loadpy`` takes a path
2006 2006 to a local file or remote URL, fetches its contents, and puts it in the
2007 2007 work area for you to further edit and execute. It can be an extremely
2008 2008 fast and convenient way of loading code from local disk or remote
2009 2009 examples from sites such as the `Matplotlib
2010 2010 gallery <http://matplotlib.sourceforge.net/gallery.html>`_.
2011 2011
2012 2012 Other than its enhanced capabilities for code and graphics, all of the
2013 2013 features of IPython we've explained before remain functional in this
2014 2014 graphical console.
2015 2015
2016 2016 The IPython Notebook
2017 2017 --------------------
2018 2018
2019 2019 The third way to interact with IPython, in addition to the terminal and
2020 2020 graphical Qt console, is a powerful web interface called the "IPython
2021 2021 Notebook". If you run at the system console (you can omit the ``pylab``
2022 2022 flags if you don't need plotting support):
2023 2023
2024 2024 ::
2025 2025
2026 2026 $ ipython notebook --pylab inline
2027 2027
2028 2028 IPython will start a process that runs a web server in your local
2029 2029 machine and to which a web browser can connect. The Notebook is a
2030 2030 workspace that lets you execute code in blocks called 'cells' and
2031 2031 displays any results and figures, but which can also contain arbitrary
2032 2032 text (including LaTeX-formatted mathematical expressions) and any rich
2033 2033 media that a modern web browser is capable of displaying.
2034 2034
2035 2035 .. raw:: html
2036 2036
2037 2037 <center>
2038 2038
2039 2039 .. raw:: html
2040 2040
2041 2041 </center>
2042 2042
2043 2043
2044 2044 % This cell is for the pdflatex output only
2045 2045 \begin{figure}[htbp]
2046 2046 \centering
2047 2047 \includegraphics[width=3in]{ipython-notebook-specgram-2.png}
2048 2048 \caption{The IPython Notebook: text, equations, code, results, graphics and other multimedia in an open format for scientific exploration and collaboration}
2049 2049 \end{figure}
2050 2050 In fact, this document was written as a Notebook, and only exported to
2051 2051 LaTeX for printing. Inside of each cell, all the features of IPython
2052 2052 that we have discussed before remain functional, since ultimately this
2053 2053 web client is communicating with the same IPython code that runs in the
2054 2054 terminal. But this interface is a much more rich and powerful
2055 2055 environment for maintaining long-term "live and executable" scientific
2056 2056 documents.
2057 2057
2058 2058 Notebook environments have existed in commercial systems like
2059 2059 Mathematica(TM) and Maple(TM) for a long time; in the open source world
2060 2060 the `Sage <http://sagemath.org>`_ project blazed this particular trail
2061 2061 starting in 2006, and now we bring all the features that have made
2062 2062 IPython such a widely used tool to a Notebook model.
2063 2063
2064 2064 Since the Notebook runs as a web application, it is possible to
2065 2065 configure it for remote access, letting you run your computations on a
2066 2066 persistent server close to your data, which you can then access remotely
2067 2067 from any browser-equipped computer. We encourage you to read the
2068 2068 extensive documentation provided by the IPython project for details on
2069 2069 how to do this and many more features of the notebook.
2070 2070
2071 2071 Finally, as we said earlier, IPython also has a high-level and easy to
2072 2072 use set of libraries for parallel computing, that let you control
2073 2073 (interactively if desired) not just one IPython but an entire cluster of
2074 2074 'IPython engines'. Unfortunately a detailed discussion of these tools is
2075 2075 beyond the scope of this text, but should you need to parallelize your
2076 2076 analysis codes, a quick read of the tutorials and examples provided at
2077 2077 the IPython site may prove fruitful.
@@ -1,2255 +1,2255 b''
1 1 %% This file was auto-generated by IPython, do NOT edit
2 2 %% Conversion from the original notebook file:
3 3 %% tests/ipynbref/IntroNumPy.orig.ipynb
4 4 %%
5 5 \documentclass[11pt,english]{article}
6 6
7 7 %% This is the automatic preamble used by IPython. Note that it does *not*
8 8 %% include a documentclass declaration, that is added at runtime to the overall
9 9 %% document.
10 10
11 11 \usepackage{amsmath}
12 12 \usepackage{amssymb}
13 13 \usepackage{graphicx}
14 14 \usepackage{ucs}
15 15 \usepackage[utf8x]{inputenc}
16 16
17 17 % needed for markdown enumerations to work
18 18 \usepackage{enumerate}
19 19
20 20 % Slightly bigger margins than the latex defaults
21 21 \usepackage{geometry}
22 22 \geometry{verbose,tmargin=3cm,bmargin=3cm,lmargin=2.5cm,rmargin=2.5cm}
23 23
24 24 % Define a few colors for use in code, links and cell shading
25 25 \usepackage{color}
26 26 \definecolor{orange}{cmyk}{0,0.4,0.8,0.2}
27 27 \definecolor{darkorange}{rgb}{.71,0.21,0.01}
28 28 \definecolor{darkgreen}{rgb}{.12,.54,.11}
29 29 \definecolor{myteal}{rgb}{.26, .44, .56}
30 30 \definecolor{gray}{gray}{0.45}
31 31 \definecolor{lightgray}{gray}{.95}
32 32 \definecolor{mediumgray}{gray}{.8}
33 33 \definecolor{inputbackground}{rgb}{.95, .95, .85}
34 34 \definecolor{outputbackground}{rgb}{.95, .95, .95}
35 35 \definecolor{traceback}{rgb}{1, .95, .95}
36 36
37 37 % Framed environments for code cells (inputs, outputs, errors, ...). The
38 38 % various uses of \unskip (or not) at the end were fine-tuned by hand, so don't
39 39 % randomly change them unless you're sure of the effect it will have.
40 40 \usepackage{framed}
41 41
42 42 % remove extraneous vertical space in boxes
43 43 \setlength\fboxsep{0pt}
44 44
45 45 % codecell is the whole input+output set of blocks that a Code cell can
46 46 % generate.
47 47
48 48 % TODO: unfortunately, it seems that using a framed codecell environment breaks
49 49 % the ability of the frames inside of it to be broken across pages. This
50 50 % causes at least the problem of having lots of empty space at the bottom of
51 51 % pages as new frames are moved to the next page, and if a single frame is too
52 52 % long to fit on a page, will completely stop latex from compiling the
53 53 % document. So unless we figure out a solution to this, we'll have to instead
54 54 % leave the codecell env. as empty. I'm keeping the original codecell
55 55 % definition here (a thin vertical bar) for reference, in case we find a
56 56 % solution to the page break issue.
57 57
58 58 %% \newenvironment{codecell}{%
59 59 %% \def\FrameCommand{\color{mediumgray} \vrule width 1pt \hspace{5pt}}%
60 60 %% \MakeFramed{\vspace{-0.5em}}}
61 61 %% {\unskip\endMakeFramed}
62 62
63 63 % For now, make this a no-op...
64 64 \newenvironment{codecell}{}
65 65
66 66 \newenvironment{codeinput}{%
67 67 \def\FrameCommand{\colorbox{inputbackground}}%
68 68 \MakeFramed{\advance\hsize-\width \FrameRestore}}
69 69 {\unskip\endMakeFramed}
70 70
71 71 \newenvironment{codeoutput}{%
72 72 \def\FrameCommand{\colorbox{outputbackground}}%
73 73 \vspace{-1.4em}
74 74 \MakeFramed{\advance\hsize-\width \FrameRestore}}
75 75 {\unskip\medskip\endMakeFramed}
76 76
77 77 \newenvironment{traceback}{%
78 78 \def\FrameCommand{\colorbox{traceback}}%
79 79 \MakeFramed{\advance\hsize-\width \FrameRestore}}
80 80 {\endMakeFramed}
81 81
82 82 % Use and configure listings package for nicely formatted code
83 83 \usepackage{listingsutf8}
84 84 \lstset{
85 85 language=python,
86 86 inputencoding=utf8x,
87 87 extendedchars=\true,
88 88 aboveskip=\smallskipamount,
89 89 belowskip=\smallskipamount,
90 90 xleftmargin=2mm,
91 91 breaklines=true,
92 92 basicstyle=\small \ttfamily,
93 93 showstringspaces=false,
94 94 keywordstyle=\color{blue}\bfseries,
95 95 commentstyle=\color{myteal},
96 96 stringstyle=\color{darkgreen},
97 97 identifierstyle=\color{darkorange},
98 98 columns=fullflexible, % tighter character kerning, like verb
99 99 }
100 100
101 101 % The hyperref package gives us a pdf with properly built
102 102 % internal navigation ('pdf bookmarks' for the table of contents,
103 103 % internal cross-reference links, web links for URLs, etc.)
104 104 \usepackage{hyperref}
105 105 \hypersetup{
106 106 breaklinks=true, % so long urls are correctly broken across lines
107 107 colorlinks=true,
108 108 urlcolor=blue,
109 109 linkcolor=darkorange,
110 110 citecolor=darkgreen,
111 111 }
112 112
113 113 % hardcode size of all verbatim environments to be a bit smaller
114 114 \makeatletter
115 115 \g@addto@macro\@verbatim\small\topsep=0.5em\partopsep=0pt
116 116 \makeatother
117 117
118 118 % Prevent overflowing lines due to urls and other hard-to-break entities.
119 119 \sloppy
120 120
121 121 \begin{document}
122 122
123 123 \section{An Introduction to the Scientific Python Ecosystem}
124 124 While the Python language is an excellent tool for general-purpose
125 125 programming, with a highly readable syntax, rich and powerful data types
126 126 (strings, lists, sets, dictionaries, arbitrary length integers, etc) and
127 127 a very comprehensive standard library, it was not designed specifically
128 128 for mathematical and scientific computing. Neither the language nor its
129 129 standard library have facilities for the efficient representation of
130 130 multidimensional datasets, tools for linear algebra and general matrix
131 131 manipulations (an essential building block of virtually all technical
132 132 computing), nor any data visualization facilities.
133 133
134 134 In particular, Python lists are very flexible containers that can be
135 135 nested arbitrarily deep and which can hold any Python object in them,
136 136 but they are poorly suited to represent efficiently common mathematical
137 137 constructs like vectors and matrices. In contrast, much of our modern
138 138 heritage of scientific computing has been built on top of libraries
139 139 written in the Fortran language, which has native support for vectors
140 140 and matrices as well as a library of mathematical functions that can
141 141 efficiently operate on entire arrays at once.
142 142
143 143 \subsection{Scientific Python: a collaboration of projects built by scientists}
144 144 The scientific community has developed a set of related Python libraries
145 145 that provide powerful array facilities, linear algebra, numerical
146 146 algorithms, data visualization and more. In this appendix, we will
147 147 briefly outline the tools most frequently used for this purpose, that
148 148 make ``Scientific Python'' something far more powerful than the Python
149 149 language alone.
150 150
151 151 For reasons of space, we can only describe in some detail the central
152 152 Numpy library, but below we provide links to the websites of each
153 153 project where you can read their documentation in more detail.
154 154
155 155 First, let's look at an overview of the basic tools that most scientists
156 156 use in daily research with Python. The core of this ecosystem is
157 157 composed of:
158 158
159 159 \begin{itemize}
160 160 \item
161 161 Numpy: the basic library that most others depend on, it provides a
162 162 powerful array type that can represent multidimensional datasets of
163 163 many different kinds and that supports arithmetic operations. Numpy
164 164 also provides a library of common mathematical functions, basic linear
165 165 algebra, random number generation and Fast Fourier Transforms. Numpy
166 166 can be found at \href{http://numpy.scipy.org}{numpy.scipy.org}
167 167 \item
168 168 Scipy: a large collection of numerical algorithms that operate on
169 169 numpy arrays and provide facilities for many common tasks in
170 170 scientific computing, including dense and sparse linear algebra
171 171 support, optimization, special functions, statistics, n-dimensional
172 172 image processing, signal processing and more. Scipy can be found at
173 173 \href{http://scipy.org}{scipy.org}.
174 174 \item
175 175 Matplotlib: a data visualization library with a strong focus on
176 176 producing high-quality output, it supports a variety of common
177 177 scientific plot types in two and three dimensions, with precise
178 178 control over the final output and format for publication-quality
179 179 results. Matplotlib can also be controlled interactively allowing
180 180 graphical manipulation of your data (zooming, panning, etc) and can be
181 181 used with most modern user interface toolkits. It can be found at
182 182 \href{http://matplotlib.sf.net}{matplotlib.sf.net}.
183 183 \item
184 184 IPython: while not strictly scientific in nature, IPython is the
185 185 interactive environment in which many scientists spend their time.
186 186 IPython provides a powerful Python shell that integrates tightly with
187 187 Matplotlib and with easy access to the files and operating system, and
188 188 which can execute in a terminal or in a graphical Qt console. IPython
189 189 also has a web-based notebook interface that can combine code with
190 190 text, mathematical expressions, figures and multimedia. It can be
191 191 found at \href{http://ipython.org}{ipython.org}.
192 192 \end{itemize}
193 193 While each of these tools can be installed separately, in our opinion
194 194 the most convenient way today of accessing them (especially on Windows
195 195 and Mac computers) is to install the
196 196 \href{http://www.enthought.com/products/epd\_free.php}{Free Edition of
197 197 the Enthought Python Distribution} which contains all the above. Other
198 198 free alternatives on Windows (but not on Macs) are
199 199 \href{http://code.google.com/p/pythonxy}{Python(x,y)} and
200 200 \href{http://www.lfd.uci.edu/~gohlke/pythonlibs}{Christoph Gohlke's
201 201 packages page}.
202 202
203 203 These four `core' libraries are in practice complemented by a number of
204 204 other tools for more specialized work. We will briefly list here the
205 205 ones that we think are the most commonly needed:
206 206
207 207 \begin{itemize}
208 208 \item
209 209 Sympy: a symbolic manipulation tool that turns a Python session into a
210 210 computer algebra system. It integrates with the IPython notebook,
211 211 rendering results in properly typeset mathematical notation.
212 212 \href{http://sympy.org}{sympy.org}.
213 213 \item
214 214 Mayavi: sophisticated 3d data visualization;
215 215 \href{http://code.enthought.com/projects/mayavi}{code.enthought.com/projects/mayavi}.
216 216 \item
217 217 Cython: a bridge language between Python and C, useful both to
218 218 optimize performance bottlenecks in Python and to access C libraries
219 219 directly; \href{http://cython.org}{cython.org}.
220 220 \item
221 221 Pandas: high-performance data structures and data analysis tools, with
222 222 powerful data alignment and structural manipulation capabilities;
223 223 \href{http://pandas.pydata.org}{pandas.pydata.org}.
224 224 \item
225 225 Statsmodels: statistical data exploration and model estimation;
226 226 \href{http://statsmodels.sourceforge.net}{statsmodels.sourceforge.net}.
227 227 \item
228 228 Scikit-learn: general purpose machine learning algorithms with a
229 229 common interface; \href{http://scikit-learn.org}{scikit-learn.org}.
230 230 \item
231 231 Scikits-image: image processing toolbox;
232 232 \href{http://scikits-image.org}{scikits-image.org}.
233 233 \item
234 234 NetworkX: analysis of complex networks (in the graph theoretical
235 235 sense); \href{http://networkx.lanl.gov}{networkx.lanl.gov}.
236 236 \item
237 237 PyTables: management of hierarchical datasets using the
238 238 industry-standard HDF5 format;
239 239 \href{http://www.pytables.org}{www.pytables.org}.
240 240 \end{itemize}
241 241 Beyond these, for any specific problem you should look on the internet
242 242 first, before starting to write code from scratch. There's a good chance
243 243 that someone, somewhere, has written an open source library that you can
244 244 use for part or all of your problem.
245 245
246 246 \subsection{A note about the examples below}
247 247 In all subsequent examples, you will see blocks of input code, followed
248 248 by the results of the code if the code generated output. This output may
249 249 include text, graphics and other result objects. These blocks of input
250 250 can be pasted into your interactive IPython session or notebook for you
251 251 to execute. In the print version of this document, a thin vertical bar
252 252 on the left of the blocks of input and output shows which blocks go
253 253 together.
254 254
255 255 If you are reading this text as an actual IPython notebook, you can
256 256 press \texttt{Shift-Enter} or use the `play' button on the toolbar
257 257 (right-pointing triangle) to execute each block of code, known as a
258 258 `cell' in IPython:
259 259
260 260 \begin{codecell}
261 261 \begin{codeinput}
262 262 \begin{lstlisting}
263 263 # This is a block of code, below you'll see its output
264 264 print "Welcome to the world of scientific computing with Python!"
265 265 \end{lstlisting}
266 266 \end{codeinput}
267 267 \begin{codeoutput}
268 268 \begin{verbatim}
269 269 Welcome to the world of scientific computing with Python!
270 270 \end{verbatim}
271 271 \end{codeoutput}
272 272 \end{codecell}
273 273 \section{Motivation: the trapezoidal rule}
274 274 In subsequent sections we'll provide a basic introduction to the nuts
275 275 and bolts of the basic scientific python tools; but we'll first motivate
276 276 it with a brief example that illustrates what you can do in a few lines
277 277 with these tools. For this, we will use the simple problem of
278 278 approximating a definite integral with the trapezoid rule:
279 279
280 280 \[
281 281 \int_{a}^{b} f(x)\, dx \approx \frac{1}{2} \sum_{k=1}^{N} \left( x_{k} - x_{k-1} \right) \left( f(x_{k}) + f(x_{k-1}) \right).
282 282 \]
283 283
284 284 Our task will be to compute this formula for a function such as:
285 285
286 286 \[
287 287 f(x) = (x-3)(x-5)(x-7)+85
288 288 \]
289 289
290 290 integrated between $a=1$ and $b=9$.
291 291
292 292 First, we define the function and sample it evenly between 0 and 10 at
293 293 200 points:
294 294
295 295 \begin{codecell}
296 296 \begin{codeinput}
297 297 \begin{lstlisting}
298 298 def f(x):
299 299 return (x-3)*(x-5)*(x-7)+85
300 300
301 301 import numpy as np
302 302 x = np.linspace(0, 10, 200)
303 303 y = f(x)
304 304 \end{lstlisting}
305 305 \end{codeinput}
306 306 \end{codecell}
307 307 We select $a$ and $b$, our integration limits, and we take only a few
308 308 points in that region to illustrate the error behavior of the trapezoid
309 309 approximation:
310 310
311 311 \begin{codecell}
312 312 \begin{codeinput}
313 313 \begin{lstlisting}
314 314 a, b = 1, 9
315 315 xint = x[logical_and(x>=a, x<=b)][::30]
316 316 yint = y[logical_and(x>=a, x<=b)][::30]
317 317 \end{lstlisting}
318 318 \end{codeinput}
319 319 \end{codecell}
320 320 Let's plot both the function and the area below it in the trapezoid
321 321 approximation:
322 322
323 323 \begin{codecell}
324 324 \begin{codeinput}
325 325 \begin{lstlisting}
326 326 import matplotlib.pyplot as plt
327 327 plt.plot(x, y, lw=2)
328 328 plt.axis([0, 10, 0, 140])
329 329 plt.fill_between(xint, 0, yint, facecolor='gray', alpha=0.4)
330 330 plt.text(0.5 * (a + b), 30,r"$\int_a^b f(x)dx$", horizontalalignment='center', fontsize=20);
331 331 \end{lstlisting}
332 332 \end{codeinput}
333 333 \begin{codeoutput}
334 334 \begin{center}
335 \includegraphics[width=6in]{/Users/bussonniermatthias/nbconvert/tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_00.pdf}
335 \includegraphics[width=6in]{tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_00.pdf}
336 336 \par
337 337 \end{center}
338 338 \end{codeoutput}
339 339 \end{codecell}
340 340 Compute the integral both at high accuracy and with the trapezoid
341 341 approximation
342 342
343 343 \begin{codecell}
344 344 \begin{codeinput}
345 345 \begin{lstlisting}
346 346 from scipy.integrate import quad, trapz
347 347 integral, error = quad(f, 1, 9)
348 348 trap_integral = trapz(yint, xint)
349 349 print "The integral is: %g +/- %.1e" % (integral, error)
350 350 print "The trapezoid approximation with", len(xint), "points is:", trap_integral
351 351 print "The absolute error is:", abs(integral - trap_integral)
352 352 \end{lstlisting}
353 353 \end{codeinput}
354 354 \begin{codeoutput}
355 355 \begin{verbatim}
356 356 The integral is: 680 +/- 7.5e-12
357 357 The trapezoid approximation with 6 points is: 621.286411141
358 358 The absolute error is: 58.7135888589
359 359 \end{verbatim}
360 360 \end{codeoutput}
361 361 \end{codecell}
362 362 This simple example showed us how, combining the numpy, scipy and
363 363 matplotlib libraries we can provide an illustration of a standard method
364 364 in elementary calculus with just a few lines of code. We will now
365 365 discuss with more detail the basic usage of these tools.
366 366
367 367 \section{NumPy arrays: the right data structure for scientific computing}
368 368 \subsection{Basics of Numpy arrays}
369 369 We now turn our attention to the Numpy library, which forms the base
370 370 layer for the entire `scipy ecosystem'. Once you have installed numpy,
371 371 you can import it as
372 372
373 373 \begin{codecell}
374 374 \begin{codeinput}
375 375 \begin{lstlisting}
376 376 import numpy
377 377 \end{lstlisting}
378 378 \end{codeinput}
379 379 \end{codecell}
380 380 though in this book we will use the common shorthand
381 381
382 382 \begin{codecell}
383 383 \begin{codeinput}
384 384 \begin{lstlisting}
385 385 import numpy as np
386 386 \end{lstlisting}
387 387 \end{codeinput}
388 388 \end{codecell}
389 389 As mentioned above, the main object provided by numpy is a powerful
390 390 array. We'll start by exploring how the numpy array differs from Python
391 391 lists. We start by creating a simple list and an array with the same
392 392 contents of the list:
393 393
394 394 \begin{codecell}
395 395 \begin{codeinput}
396 396 \begin{lstlisting}
397 397 lst = [10, 20, 30, 40]
398 398 arr = np.array([10, 20, 30, 40])
399 399 \end{lstlisting}
400 400 \end{codeinput}
401 401 \end{codecell}
402 402 Elements of a one-dimensional array are accessed with the same syntax as
403 403 a list:
404 404
405 405 \begin{codecell}
406 406 \begin{codeinput}
407 407 \begin{lstlisting}
408 408 lst[0]
409 409 \end{lstlisting}
410 410 \end{codeinput}
411 411 \begin{codeoutput}
412 412 \begin{verbatim}
413 413 10
414 414 \end{verbatim}
415 415 \end{codeoutput}
416 416 \end{codecell}
417 417 \begin{codecell}
418 418 \begin{codeinput}
419 419 \begin{lstlisting}
420 420 arr[0]
421 421 \end{lstlisting}
422 422 \end{codeinput}
423 423 \begin{codeoutput}
424 424 \begin{verbatim}
425 425 10
426 426 \end{verbatim}
427 427 \end{codeoutput}
428 428 \end{codecell}
429 429 \begin{codecell}
430 430 \begin{codeinput}
431 431 \begin{lstlisting}
432 432 arr[-1]
433 433 \end{lstlisting}
434 434 \end{codeinput}
435 435 \begin{codeoutput}
436 436 \begin{verbatim}
437 437 40
438 438 \end{verbatim}
439 439 \end{codeoutput}
440 440 \end{codecell}
441 441 \begin{codecell}
442 442 \begin{codeinput}
443 443 \begin{lstlisting}
444 444 arr[2:]
445 445 \end{lstlisting}
446 446 \end{codeinput}
447 447 \begin{codeoutput}
448 448 \begin{verbatim}
449 449 array([30, 40])
450 450 \end{verbatim}
451 451 \end{codeoutput}
452 452 \end{codecell}
453 453 The first difference to note between lists and arrays is that arrays are
454 454 \emph{homogeneous}; i.e.~all elements of an array must be of the same
455 455 type. In contrast, lists can contain elements of arbitrary type. For
456 456 example, we can change the last element in our list above to be a
457 457 string:
458 458
459 459 \begin{codecell}
460 460 \begin{codeinput}
461 461 \begin{lstlisting}
462 462 lst[-1] = 'a string inside a list'
463 463 lst
464 464 \end{lstlisting}
465 465 \end{codeinput}
466 466 \begin{codeoutput}
467 467 \begin{verbatim}
468 468 [10, 20, 30, 'a string inside a list']
469 469 \end{verbatim}
470 470 \end{codeoutput}
471 471 \end{codecell}
472 472 but the same can not be done with an array, as we get an error message:
473 473
474 474 \begin{codecell}
475 475 \begin{codeinput}
476 476 \begin{lstlisting}
477 477 arr[-1] = 'a string inside an array'
478 478 \end{lstlisting}
479 479 \end{codeinput}
480 480 \begin{codeoutput}
481 481 \begin{traceback}
482 482 \begin{verbatim}
483 483 ---------------------------------------------------------------------------
484 484 ValueError Traceback (most recent call last)
485 485 /home/fperez/teach/book-math-labtool/<ipython-input-13-29c0bfa5fa8a> in <module>()
486 486 ----> 1 arr[-1] = 'a string inside an array'
487 487
488 488 ValueError: invalid literal for long() with base 10: 'a string inside an array'
489 489 \end{verbatim}
490 490 \end{traceback}
491 491 \end{codeoutput}
492 492 \end{codecell}
493 493 The information about the type of an array is contained in its
494 494 \emph{dtype} attribute:
495 495
496 496 \begin{codecell}
497 497 \begin{codeinput}
498 498 \begin{lstlisting}
499 499 arr.dtype
500 500 \end{lstlisting}
501 501 \end{codeinput}
502 502 \begin{codeoutput}
503 503 \begin{verbatim}
504 504 dtype('int32')
505 505 \end{verbatim}
506 506 \end{codeoutput}
507 507 \end{codecell}
508 508 Once an array has been created, its dtype is fixed and it can only store
509 509 elements of the same type. For this example where the dtype is integer,
510 510 if we store a floating point number it will be automatically converted
511 511 into an integer:
512 512
513 513 \begin{codecell}
514 514 \begin{codeinput}
515 515 \begin{lstlisting}
516 516 arr[-1] = 1.234
517 517 arr
518 518 \end{lstlisting}
519 519 \end{codeinput}
520 520 \begin{codeoutput}
521 521 \begin{verbatim}
522 522 array([10, 20, 30, 1])
523 523 \end{verbatim}
524 524 \end{codeoutput}
525 525 \end{codecell}
526 526 Above we created an array from an existing list; let us now see
527 527 other ways in which we can create arrays, which we'll illustrate next. A
528 528 common need is to have an array initialized with a constant value, and
529 529 very often this value is 0 or 1 (suitable as starting value for additive
530 530 and multiplicative loops respectively); \texttt{zeros} creates arrays of
531 531 all zeros, with any desired dtype:
532 532
533 533 \begin{codecell}
534 534 \begin{codeinput}
535 535 \begin{lstlisting}
536 536 np.zeros(5, float)
537 537 \end{lstlisting}
538 538 \end{codeinput}
539 539 \begin{codeoutput}
540 540 \begin{verbatim}
541 541 array([ 0., 0., 0., 0., 0.])
542 542 \end{verbatim}
543 543 \end{codeoutput}
544 544 \end{codecell}
545 545 \begin{codecell}
546 546 \begin{codeinput}
547 547 \begin{lstlisting}
548 548 np.zeros(3, int)
549 549 \end{lstlisting}
550 550 \end{codeinput}
551 551 \begin{codeoutput}
552 552 \begin{verbatim}
553 553 array([0, 0, 0])
554 554 \end{verbatim}
555 555 \end{codeoutput}
556 556 \end{codecell}
557 557 \begin{codecell}
558 558 \begin{codeinput}
559 559 \begin{lstlisting}
560 560 np.zeros(3, complex)
561 561 \end{lstlisting}
562 562 \end{codeinput}
563 563 \begin{codeoutput}
564 564 \begin{verbatim}
565 565 array([ 0.+0.j, 0.+0.j, 0.+0.j])
566 566 \end{verbatim}
567 567 \end{codeoutput}
568 568 \end{codecell}
569 569 and similarly for \texttt{ones}:
570 570
571 571 \begin{codecell}
572 572 \begin{codeinput}
573 573 \begin{lstlisting}
574 574 print '5 ones:', np.ones(5)
575 575 \end{lstlisting}
576 576 \end{codeinput}
577 577 \begin{codeoutput}
578 578 \begin{verbatim}
579 579 5 ones: [ 1. 1. 1. 1. 1.]
580 580 \end{verbatim}
581 581 \end{codeoutput}
582 582 \end{codecell}
583 583 If we want an array initialized with an arbitrary value, we can create
584 584 an empty array and then use the fill method to put the value we want
585 585 into the array:
586 586
587 587 \begin{codecell}
588 588 \begin{codeinput}
589 589 \begin{lstlisting}
590 590 a = empty(4)
591 591 a.fill(5.5)
592 592 a
593 593 \end{lstlisting}
594 594 \end{codeinput}
595 595 \begin{codeoutput}
596 596 \begin{verbatim}
597 597 array([ 5.5, 5.5, 5.5, 5.5])
598 598 \end{verbatim}
599 599 \end{codeoutput}
600 600 \end{codecell}
601 601 Numpy also offers the \texttt{arange} function, which works like the
602 602 builtin \texttt{range} but returns an array instead of a list:
603 603
604 604 \begin{codecell}
605 605 \begin{codeinput}
606 606 \begin{lstlisting}
607 607 np.arange(5)
608 608 \end{lstlisting}
609 609 \end{codeinput}
610 610 \begin{codeoutput}
611 611 \begin{verbatim}
612 612 array([0, 1, 2, 3, 4])
613 613 \end{verbatim}
614 614 \end{codeoutput}
615 615 \end{codecell}
616 616 and the \texttt{linspace} and \texttt{logspace} functions to create
617 617 linearly and logarithmically-spaced grids respectively, with a fixed
618 618 number of points and including both ends of the specified interval:
619 619
620 620 \begin{codecell}
621 621 \begin{codeinput}
622 622 \begin{lstlisting}
623 623 print "A linear grid between 0 and 1:", np.linspace(0, 1, 5)
624 624 print "A logarithmic grid between 10**1 and 10**4: ", np.logspace(1, 4, 4)
625 625 \end{lstlisting}
626 626 \end{codeinput}
627 627 \begin{codeoutput}
628 628 \begin{verbatim}
629 629 A linear grid between 0 and 1: [ 0. 0.25 0.5 0.75 1. ]
630 630 A logarithmic grid between 10**1 and 10**4: [ 10. 100. 1000. 10000.]
631 631 \end{verbatim}
632 632 \end{codeoutput}
633 633 \end{codecell}
634 634 Finally, it is often useful to create arrays with random numbers that
635 635 follow a specific distribution. The \texttt{np.random} module contains a
636 636 number of functions that can be used to this effect, for example this
637 637 will produce an array of 5 random samples taken from a standard normal
638 638 distribution (0 mean and variance 1):
639 639
640 640 \begin{codecell}
641 641 \begin{codeinput}
642 642 \begin{lstlisting}
643 643 np.random.randn(5)
644 644 \end{lstlisting}
645 645 \end{codeinput}
646 646 \begin{codeoutput}
647 647 \begin{verbatim}
648 648 array([-0.08633343, -0.67375434, 1.00589536, 0.87081651, 1.65597822])
649 649 \end{verbatim}
650 650 \end{codeoutput}
651 651 \end{codecell}
652 652 whereas this will also give 5 samples, but from a normal distribution
653 653 with a mean of 10 and a standard deviation of 3:
654 654
655 655 \begin{codecell}
656 656 \begin{codeinput}
657 657 \begin{lstlisting}
658 658 norm10 = np.random.normal(10, 3, 5)
659 659 norm10
660 660 \end{lstlisting}
661 661 \end{codeinput}
662 662 \begin{codeoutput}
663 663 \begin{verbatim}
664 664 array([ 8.94879575, 5.53038269, 8.24847281, 12.14944165, 11.56209294])
665 665 \end{verbatim}
666 666 \end{codeoutput}
667 667 \end{codecell}
668 668 \subsection{Indexing with other arrays}
669 669 Above we saw how to index arrays with single numbers and slices, just
670 670 like Python lists. But arrays allow for a more sophisticated kind of
671 671 indexing which is very powerful: you can index an array with another
672 672 array, and in particular with an array of boolean values. This is
673 673 particularly useful to extract information from an array that matches a
674 674 certain condition.
675 675
676 676 Consider for example that in the array \texttt{norm10} we want to
677 677 replace all values above 9 with the value 0. We can do so by first
678 678 finding the \emph{mask} that indicates where this condition is true or
679 679 false:
680 680
681 681 \begin{codecell}
682 682 \begin{codeinput}
683 683 \begin{lstlisting}
684 684 mask = norm10 > 9
685 685 mask
686 686 \end{lstlisting}
687 687 \end{codeinput}
688 688 \begin{codeoutput}
689 689 \begin{verbatim}
690 690 array([False, False, False, True, True], dtype=bool)
691 691 \end{verbatim}
692 692 \end{codeoutput}
693 693 \end{codecell}
694 694 Now that we have this mask, we can use it to either read those values or
695 695 to reset them to 0:
696 696
697 697 \begin{codecell}
698 698 \begin{codeinput}
699 699 \begin{lstlisting}
700 700 print 'Values above 9:', norm10[mask]
701 701 \end{lstlisting}
702 702 \end{codeinput}
703 703 \begin{codeoutput}
704 704 \begin{verbatim}
705 705 Values above 9: [ 12.14944165 11.56209294]
706 706 \end{verbatim}
707 707 \end{codeoutput}
708 708 \end{codecell}
709 709 \begin{codecell}
710 710 \begin{codeinput}
711 711 \begin{lstlisting}
712 712 print 'Resetting all values above 9 to 0...'
713 713 norm10[mask] = 0
714 714 print norm10
715 715 \end{lstlisting}
716 716 \end{codeinput}
717 717 \begin{codeoutput}
718 718 \begin{verbatim}
719 719 Resetting all values above 9 to 0...
720 720 [ 8.94879575 5.53038269 8.24847281 0. 0. ]
721 721 \end{verbatim}
722 722 \end{codeoutput}
723 723 \end{codecell}
724 724 \subsection{Arrays with more than one dimension}
725 725 Up until now all our examples have used one-dimensional arrays. But
726 726 Numpy can create arrays of arbitrary dimensions, and all the methods
727 727 illustrated in the previous section work with more than one dimension.
728 728 For example, a list of lists can be used to initialize a two dimensional
729 729 array:
730 730
731 731 \begin{codecell}
732 732 \begin{codeinput}
733 733 \begin{lstlisting}
734 734 lst2 = [[1, 2], [3, 4]]
735 735 arr2 = np.array([[1, 2], [3, 4]])
736 736 arr2
737 737 \end{lstlisting}
738 738 \end{codeinput}
739 739 \begin{codeoutput}
740 740 \begin{verbatim}
741 741 array([[1, 2],
742 742 [3, 4]])
743 743 \end{verbatim}
744 744 \end{codeoutput}
745 745 \end{codecell}
746 746 With two-dimensional arrays we start seeing the power of numpy: while a
747 747 nested list can be indexed using repeatedly the \texttt{{[} {]}}
748 748 operator, multidimensional arrays support a much more natural indexing
749 749 syntax with a single \texttt{{[} {]}} and a set of indices separated by
750 750 commas:
751 751
752 752 \begin{codecell}
753 753 \begin{codeinput}
754 754 \begin{lstlisting}
755 755 print lst2[0][1]
756 756 print arr2[0,1]
757 757 \end{lstlisting}
758 758 \end{codeinput}
759 759 \begin{codeoutput}
760 760 \begin{verbatim}
761 761 2
762 762 2
763 763 \end{verbatim}
764 764 \end{codeoutput}
765 765 \end{codecell}
766 766 Most of the array creation functions listed above can be used with more
767 767 than one dimension, for example:
768 768
769 769 \begin{codecell}
770 770 \begin{codeinput}
771 771 \begin{lstlisting}
772 772 np.zeros((2,3))
773 773 \end{lstlisting}
774 774 \end{codeinput}
775 775 \begin{codeoutput}
776 776 \begin{verbatim}
777 777 array([[ 0., 0., 0.],
778 778 [ 0., 0., 0.]])
779 779 \end{verbatim}
780 780 \end{codeoutput}
781 781 \end{codecell}
782 782 \begin{codecell}
783 783 \begin{codeinput}
784 784 \begin{lstlisting}
785 785 np.random.normal(10, 3, (2, 4))
786 786 \end{lstlisting}
787 787 \end{codeinput}
788 788 \begin{codeoutput}
789 789 \begin{verbatim}
790 790 array([[ 11.26788826, 4.29619866, 11.09346496, 9.73861307],
791 791 [ 10.54025996, 9.5146268 , 10.80367214, 13.62204505]])
792 792 \end{verbatim}
793 793 \end{codeoutput}
794 794 \end{codecell}
795 795 In fact, the shape of an array can be changed at any time, as long as
796 796 the total number of elements is unchanged. For example, if we want a 2x4
797 797 array with numbers increasing from 0, the easiest way to create it is:
798 798
799 799 \begin{codecell}
800 800 \begin{codeinput}
801 801 \begin{lstlisting}
802 802 arr = np.arange(8).reshape(2,4)
803 803 print arr
804 804 \end{lstlisting}
805 805 \end{codeinput}
806 806 \begin{codeoutput}
807 807 \begin{verbatim}
808 808 [[0 1 2 3]
809 809 [4 5 6 7]]
810 810 \end{verbatim}
811 811 \end{codeoutput}
812 812 \end{codecell}
813 813 With multidimensional arrays, you can also use slices, and you can mix
814 814 and match slices and single indices in the different dimensions (using
815 815 the same array as above):
816 816
817 817 \begin{codecell}
818 818 \begin{codeinput}
819 819 \begin{lstlisting}
820 820 print 'Slicing in the second row:', arr[1, 2:4]
821 821 print 'All rows, third column :', arr[:, 2]
822 822 \end{lstlisting}
823 823 \end{codeinput}
824 824 \begin{codeoutput}
825 825 \begin{verbatim}
826 826 Slicing in the second row: [6 7]
827 827 All rows, third column : [2 6]
828 828 \end{verbatim}
829 829 \end{codeoutput}
830 830 \end{codecell}
831 831 If you only provide one index, then you will get an array with one less
832 832 dimension containing that row:
833 833
834 834 \begin{codecell}
835 835 \begin{codeinput}
836 836 \begin{lstlisting}
837 837 print 'First row: ', arr[0]
838 838 print 'Second row: ', arr[1]
839 839 \end{lstlisting}
840 840 \end{codeinput}
841 841 \begin{codeoutput}
842 842 \begin{verbatim}
843 843 First row: [0 1 2 3]
844 844 Second row: [4 5 6 7]
845 845 \end{verbatim}
846 846 \end{codeoutput}
847 847 \end{codecell}
848 848 Now that we have seen how to create arrays with more than one dimension,
849 849 it's a good idea to look at some of the most useful properties and
850 850 methods that arrays have. The following provide basic information about
851 851 the size, shape and data in the array:
852 852
853 853 \begin{codecell}
854 854 \begin{codeinput}
855 855 \begin{lstlisting}
856 856 print 'Data type :', arr.dtype
857 857 print 'Total number of elements :', arr.size
858 858 print 'Number of dimensions :', arr.ndim
859 859 print 'Shape (dimensionality) :', arr.shape
860 860 print 'Memory used (in bytes) :', arr.nbytes
861 861 \end{lstlisting}
862 862 \end{codeinput}
863 863 \begin{codeoutput}
864 864 \begin{verbatim}
865 865 Data type : int32
866 866 Total number of elements : 8
867 867 Number of dimensions : 2
868 868 Shape (dimensionality) : (2, 4)
869 869 Memory used (in bytes) : 32
870 870 \end{verbatim}
871 871 \end{codeoutput}
872 872 \end{codecell}
873 873 Arrays also have many useful methods, some especially useful ones are:
874 874
875 875 \begin{codecell}
876 876 \begin{codeinput}
877 877 \begin{lstlisting}
878 878 print 'Minimum and maximum :', arr.min(), arr.max()
879 879 print 'Sum and product of all elements :', arr.sum(), arr.prod()
880 880 print 'Mean and standard deviation :', arr.mean(), arr.std()
881 881 \end{lstlisting}
882 882 \end{codeinput}
883 883 \begin{codeoutput}
884 884 \begin{verbatim}
885 885 Minimum and maximum : 0 7
886 886 Sum and product of all elements : 28 0
887 887 Mean and standard deviation : 3.5 2.29128784748
888 888 \end{verbatim}
889 889 \end{codeoutput}
890 890 \end{codecell}
891 891 For these methods, the above operations are all computed on all the
892 892 elements of the array. But for a multidimensional array, it's possible
893 893 to do the computation along a single dimension, by passing the
894 894 \texttt{axis} parameter; for example:
895 895
896 896 \begin{codecell}
897 897 \begin{codeinput}
898 898 \begin{lstlisting}
899 899 print 'For the following array:\n', arr
900 900 print 'The sum of elements along the rows is :', arr.sum(axis=1)
901 901 print 'The sum of elements along the columns is :', arr.sum(axis=0)
902 902 \end{lstlisting}
903 903 \end{codeinput}
904 904 \begin{codeoutput}
905 905 \begin{verbatim}
906 906 For the following array:
907 907 [[0 1 2 3]
908 908 [4 5 6 7]]
909 909 The sum of elements along the rows is : [ 6 22]
910 910 The sum of elements along the columns is : [ 4 6 8 10]
911 911 \end{verbatim}
912 912 \end{codeoutput}
913 913 \end{codecell}
914 914 As you can see in this example, the value of the \texttt{axis} parameter
915 915 is the dimension which will be \emph{consumed} once the operation has
916 916 been carried out. This is why to sum along the rows we use
917 917 \texttt{axis=1}.
918 918
919 919 This can be easily illustrated with an example that has more dimensions;
920 920 we create an array with 4 dimensions and shape \texttt{(3,4,5,6)} and
921 921 sum along the axis number 2 (i.e.~the \emph{third} axis, since in Python
922 922 all counts are 0-based). That consumes the dimension whose length was 5,
923 923 leaving us with a new array that has shape \texttt{(3,4,6)}:
924 924
925 925 \begin{codecell}
926 926 \begin{codeinput}
927 927 \begin{lstlisting}
928 928 np.zeros((3,4,5,6)).sum(2).shape
929 929 \end{lstlisting}
930 930 \end{codeinput}
931 931 \begin{codeoutput}
932 932 \begin{verbatim}
933 933 (3, 4, 6)
934 934 \end{verbatim}
935 935 \end{codeoutput}
936 936 \end{codecell}
937 937 Another widely used property of arrays is the \texttt{.T} attribute,
938 938 which allows you to access the transpose of the array:
939 939
940 940 \begin{codecell}
941 941 \begin{codeinput}
942 942 \begin{lstlisting}
943 943 print 'Array:\n', arr
944 944 print 'Transpose:\n', arr.T
945 945 \end{lstlisting}
946 946 \end{codeinput}
947 947 \begin{codeoutput}
948 948 \begin{verbatim}
949 949 Array:
950 950 [[0 1 2 3]
951 951 [4 5 6 7]]
952 952 Transpose:
953 953 [[0 4]
954 954 [1 5]
955 955 [2 6]
956 956 [3 7]]
957 957 \end{verbatim}
958 958 \end{codeoutput}
959 959 \end{codecell}
960 960 We don't have time here to look at all the methods and properties of
961 961 arrays; here's a complete list. Simply try exploring some of these
962 962 in IPython to learn more, or read their description in the full Numpy
963 963 documentation:
964 964
965 965 \begin{verbatim}
966 966 arr.T arr.copy arr.getfield arr.put arr.squeeze
967 967 arr.all arr.ctypes arr.imag arr.ravel arr.std
968 968 arr.any arr.cumprod arr.item arr.real arr.strides
969 969 arr.argmax arr.cumsum arr.itemset arr.repeat arr.sum
970 970 arr.argmin arr.data arr.itemsize arr.reshape arr.swapaxes
971 971 arr.argsort arr.diagonal arr.max arr.resize arr.take
972 972 arr.astype arr.dot arr.mean arr.round arr.tofile
973 973 arr.base arr.dtype arr.min arr.searchsorted arr.tolist
974 974 arr.byteswap arr.dump arr.nbytes arr.setasflat arr.tostring
975 975 arr.choose arr.dumps arr.ndim arr.setfield arr.trace
976 976 arr.clip arr.fill arr.newbyteorder arr.setflags arr.transpose
977 977 arr.compress arr.flags arr.nonzero arr.shape arr.var
978 978 arr.conj arr.flat arr.prod arr.size arr.view
979 979 arr.conjugate arr.flatten arr.ptp arr.sort
980 980 \end{verbatim}
981 981
982 982
983 983 \subsection{Operating with arrays}
984 984 Arrays support all regular arithmetic operators, and the numpy library
985 985 also contains a complete collection of basic mathematical functions that
986 986 operate on arrays. It is important to remember that in general, all
987 987 operations with arrays are applied \emph{element-wise}, i.e., are
988 988 applied to all the elements of the array at the same time. Consider for
989 989 example:
990 990
991 991 \begin{codecell}
992 992 \begin{codeinput}
993 993 \begin{lstlisting}
994 994 arr1 = np.arange(4)
995 995 arr2 = np.arange(10, 14)
996 996 print arr1, '+', arr2, '=', arr1+arr2
997 997 \end{lstlisting}
998 998 \end{codeinput}
999 999 \begin{codeoutput}
1000 1000 \begin{verbatim}
1001 1001 [0 1 2 3] + [10 11 12 13] = [10 12 14 16]
1002 1002 \end{verbatim}
1003 1003 \end{codeoutput}
1004 1004 \end{codecell}
1005 1005 Importantly, you must remember that even the multiplication operator is
1006 1006 by default applied element-wise, it is \emph{not} the matrix
1007 1007 multiplication from linear algebra (as is the case in Matlab, for
1008 1008 example):
1009 1009
1010 1010 \begin{codecell}
1011 1011 \begin{codeinput}
1012 1012 \begin{lstlisting}
1013 1013 print arr1, '*', arr2, '=', arr1*arr2
1014 1014 \end{lstlisting}
1015 1015 \end{codeinput}
1016 1016 \begin{codeoutput}
1017 1017 \begin{verbatim}
1018 1018 [0 1 2 3] * [10 11 12 13] = [ 0 11 24 39]
1019 1019 \end{verbatim}
1020 1020 \end{codeoutput}
1021 1021 \end{codecell}
1022 1022 While this means that in principle arrays must always match in their
1023 1023 dimensionality in order for an operation to be valid, numpy will
1024 1024 \emph{broadcast} dimensions when possible. For example, suppose that you
1025 1025 want to add the number 1.5 to \texttt{arr1}; the following would be a
1026 1026 valid way to do it:
1027 1027
1028 1028 \begin{codecell}
1029 1029 \begin{codeinput}
1030 1030 \begin{lstlisting}
1031 1031 arr1 + 1.5*np.ones(4)
1032 1032 \end{lstlisting}
1033 1033 \end{codeinput}
1034 1034 \begin{codeoutput}
1035 1035 \begin{verbatim}
1036 1036 array([ 1.5, 2.5, 3.5, 4.5])
1037 1037 \end{verbatim}
1038 1038 \end{codeoutput}
1039 1039 \end{codecell}
1040 1040 But thanks to numpy's broadcasting rules, the following is equally
1041 1041 valid:
1042 1042
1043 1043 \begin{codecell}
1044 1044 \begin{codeinput}
1045 1045 \begin{lstlisting}
1046 1046 arr1 + 1.5
1047 1047 \end{lstlisting}
1048 1048 \end{codeinput}
1049 1049 \begin{codeoutput}
1050 1050 \begin{verbatim}
1051 1051 array([ 1.5, 2.5, 3.5, 4.5])
1052 1052 \end{verbatim}
1053 1053 \end{codeoutput}
1054 1054 \end{codecell}
1055 1055 In this case, numpy looked at both operands and saw that the first
1056 1056 (\texttt{arr1}) was a one-dimensional array of length 4 and the second
1057 1057 was a scalar, considered a zero-dimensional object. The broadcasting
1058 1058 rules allow numpy to:
1059 1059
1060 1060 \begin{itemize}
1061 1061 \item
1062 1062 \emph{create} new dimensions of length 1 (since this doesn't change
1063 1063 the size of the array)
1064 1064 \item
1065 1065 `stretch' a dimension of length 1 that needs to be matched to a
1066 1066 dimension of a different size.
1067 1067 \end{itemize}
1068 1068 So in the above example, the scalar 1.5 is effectively:
1069 1069
1070 1070 \begin{itemize}
1071 1071 \item
1072 1072 first `promoted' to a 1-dimensional array of length 1
1073 1073 \item
1074 1074 then, this array is `stretched' to length 4 to match the dimension of
1075 1075 \texttt{arr1}.
1076 1076 \end{itemize}
1077 1077 After these two operations are complete, the addition can proceed as now
1078 1078 both operands are one-dimensional arrays of length 4.
1079 1079
1080 1080 This broadcasting behavior is in practice enormously powerful,
1081 1081 especially because when numpy broadcasts to create new dimensions or to
1082 1082 `stretch' existing ones, it doesn't actually replicate the data. In the
1083 1083 example above the operation is carried out \emph{as if} the 1.5 was a 1-d
1084 1084 array with 1.5 in all of its entries, but no actual array was ever
1085 1085 created. This can save lots of memory in cases when the arrays in
1086 1086 question are large and can have significant performance implications.
1087 1087
1088 1088 The general rule is: when operating on two arrays, NumPy compares their
1089 1089 shapes element-wise. It starts with the trailing dimensions, and works
1090 1090 its way forward, creating dimensions of length 1 as needed. Two
1091 1091 dimensions are considered compatible when
1092 1092
1093 1093 \begin{itemize}
1094 1094 \item
1095 1095 they are equal to begin with, or
1096 1096 \item
1097 1097 one of them is 1; in this case numpy will do the `stretching' to make
1098 1098 them equal.
1099 1099 \end{itemize}
1100 1100 If these conditions are not met, a
1101 1101 \texttt{ValueError: frames are not aligned} exception is thrown,
1102 1102 indicating that the arrays have incompatible shapes. The size of the
1103 1103 resulting array is the maximum size along each dimension of the input
1104 1104 arrays.
1105 1105
1106 1106 This shows how the broadcasting rules work in several dimensions:
1107 1107
1108 1108 \begin{codecell}
1109 1109 \begin{codeinput}
1110 1110 \begin{lstlisting}
1111 1111 b = np.array([2, 3, 4, 5])
1112 1112 print arr, '\n\n+', b , '\n----------------\n', arr + b
1113 1113 \end{lstlisting}
1114 1114 \end{codeinput}
1115 1115 \begin{codeoutput}
1116 1116 \begin{verbatim}
1117 1117 [[0 1 2 3]
1118 1118 [4 5 6 7]]
1119 1119
1120 1120 + [2 3 4 5]
1121 1121 ----------------
1122 1122 [[ 2 4 6 8]
1123 1123 [ 6 8 10 12]]
1124 1124 \end{verbatim}
1125 1125 \end{codeoutput}
1126 1126 \end{codecell}
1127 1127 Now, how could you use broadcasting to say add \texttt{{[}4, 6{]}} along
1128 1128 the rows to \texttt{arr} above? Simply performing the direct addition
1129 1129 will produce the error we previously mentioned:
1130 1130
1131 1131 \begin{codecell}
1132 1132 \begin{codeinput}
1133 1133 \begin{lstlisting}
1134 1134 c = np.array([4, 6])
1135 1135 arr + c
1136 1136 \end{lstlisting}
1137 1137 \end{codeinput}
1138 1138 \begin{codeoutput}
1139 1139 \begin{traceback}
1140 1140 \begin{verbatim}
1141 1141 ---------------------------------------------------------------------------
1142 1142 ValueError Traceback (most recent call last)
1143 1143 /home/fperez/teach/book-math-labtool/<ipython-input-45-62aa20ac1980> in <module>()
1144 1144 1 c = np.array([4, 6])
1145 1145 ----> 2 arr + c
1146 1146
1147 1147 ValueError: operands could not be broadcast together with shapes (2,4) (2)
1148 1148 \end{verbatim}
1149 1149 \end{traceback}
1150 1150 \end{codeoutput}
1151 1151 \end{codecell}
1152 1152 According to the rules above, the array \texttt{c} would need to have a
1153 1153 \emph{trailing} dimension of 1 for the broadcasting to work. It turns
1154 1154 out that numpy allows you to `inject' new dimensions anywhere into an
1155 1155 array on the fly, by indexing it with the special object
1156 1156 \texttt{np.newaxis}:
1157 1157
1158 1158 \begin{codecell}
1159 1159 \begin{codeinput}
1160 1160 \begin{lstlisting}
1161 1161 (c[:, np.newaxis]).shape
1162 1162 \end{lstlisting}
1163 1163 \end{codeinput}
1164 1164 \begin{codeoutput}
1165 1165 \begin{verbatim}
1166 1166 (2, 1)
1167 1167 \end{verbatim}
1168 1168 \end{codeoutput}
1169 1169 \end{codecell}
1170 1170 This is exactly what we need, and indeed it works:
1171 1171
1172 1172 \begin{codecell}
1173 1173 \begin{codeinput}
1174 1174 \begin{lstlisting}
1175 1175 arr + c[:, np.newaxis]
1176 1176 \end{lstlisting}
1177 1177 \end{codeinput}
1178 1178 \begin{codeoutput}
1179 1179 \begin{verbatim}
1180 1180 array([[ 4, 5, 6, 7],
1181 1181 [10, 11, 12, 13]])
1182 1182 \end{verbatim}
1183 1183 \end{codeoutput}
1184 1184 \end{codecell}
1185 1185 For the full broadcasting rules, please see the official Numpy docs,
1186 1186 which describe them in detail and with more complex examples.
1187 1187
1188 1188 As we mentioned before, Numpy ships with a full complement of
1189 1189 mathematical functions that work on entire arrays, including logarithms,
1190 1190 exponentials, trigonometric and hyperbolic trigonometric functions, etc.
1191 1191 Furthermore, scipy ships a rich special function library in the
1192 1192 \texttt{scipy.special} module that includes Bessel, Airy, Fresnel,
1193 1193 Laguerre and other classical special functions. For example, sampling
1194 1194 the sine function at 100 points between $0$ and $2\pi$ is as simple as:
1195 1195
1196 1196 \begin{codecell}
1197 1197 \begin{codeinput}
1198 1198 \begin{lstlisting}
1199 1199 x = np.linspace(0, 2*np.pi, 100)
1200 1200 y = np.sin(x)
1201 1201 \end{lstlisting}
1202 1202 \end{codeinput}
1203 1203 \end{codecell}
1204 1204 \subsection{Linear algebra in numpy}
1205 1205 Numpy ships with a basic linear algebra library, and all arrays have a
1206 1206 \texttt{dot} method whose behavior is that of the scalar dot product
1207 1207 when its arguments are vectors (one-dimensional arrays) and the
1208 1208 traditional matrix multiplication when one or both of its arguments are
1209 1209 two-dimensional arrays:
1210 1210
1211 1211 \begin{codecell}
1212 1212 \begin{codeinput}
1213 1213 \begin{lstlisting}
1214 1214 v1 = np.array([2, 3, 4])
1215 1215 v2 = np.array([1, 0, 1])
1216 1216 print v1, '.', v2, '=', v1.dot(v2)
1217 1217 \end{lstlisting}
1218 1218 \end{codeinput}
1219 1219 \begin{codeoutput}
1220 1220 \begin{verbatim}
1221 1221 [2 3 4] . [1 0 1] = 6
1222 1222 \end{verbatim}
1223 1223 \end{codeoutput}
1224 1224 \end{codecell}
1225 1225 Here is a regular matrix-vector multiplication, note that the array
1226 1226 \texttt{v1} should be viewed as a \emph{column} vector in traditional
1227 1227 linear algebra notation; numpy makes no distinction between row and
1228 1228 column vectors and simply verifies that the dimensions match the
1229 1229 required rules of matrix multiplication, in this case we have a
1230 1230 $2 \times 3$ matrix multiplied by a 3-vector, which produces a 2-vector:
1231 1231
1232 1232 \begin{codecell}
1233 1233 \begin{codeinput}
1234 1234 \begin{lstlisting}
1235 1235 A = np.arange(6).reshape(2, 3)
1236 1236 print A, 'x', v1, '=', A.dot(v1)
1237 1237 \end{lstlisting}
1238 1238 \end{codeinput}
1239 1239 \begin{codeoutput}
1240 1240 \begin{verbatim}
1241 1241 [[0 1 2]
1242 1242 [3 4 5]] x [2 3 4] = [11 38]
1243 1243 \end{verbatim}
1244 1244 \end{codeoutput}
1245 1245 \end{codecell}
1246 1246 For matrix-matrix multiplication, the same dimension-matching rules must
1247 1247 be satisfied, e.g.~consider the difference between $A \times A^T$:
1248 1248
1249 1249 \begin{codecell}
1250 1250 \begin{codeinput}
1251 1251 \begin{lstlisting}
1252 1252 print A.dot(A.T)
1253 1253 \end{lstlisting}
1254 1254 \end{codeinput}
1255 1255 \begin{codeoutput}
1256 1256 \begin{verbatim}
1257 1257 [[ 5 14]
1258 1258 [14 50]]
1259 1259 \end{verbatim}
1260 1260 \end{codeoutput}
1261 1261 \end{codecell}
1262 1262 and $A^T \times A$:
1263 1263
1264 1264 \begin{codecell}
1265 1265 \begin{codeinput}
1266 1266 \begin{lstlisting}
1267 1267 print A.T.dot(A)
1268 1268 \end{lstlisting}
1269 1269 \end{codeinput}
1270 1270 \begin{codeoutput}
1271 1271 \begin{verbatim}
1272 1272 [[ 9 12 15]
1273 1273 [12 17 22]
1274 1274 [15 22 29]]
1275 1275 \end{verbatim}
1276 1276 \end{codeoutput}
1277 1277 \end{codecell}
1278 1278 Furthermore, the \texttt{numpy.linalg} module includes additional
1279 1279 functionality such as determinants, matrix norms, Cholesky, eigenvalue
1280 1280 and singular value decompositions, etc. For even more linear algebra
1281 1281 tools, \texttt{scipy.linalg} contains the majority of the tools in the
1282 1282 classic LAPACK libraries as well as functions to operate on sparse
1283 1283 matrices. We refer the reader to the Numpy and Scipy documentation for
1284 1284 additional details on these.
1285 1285
1286 1286 \subsection{Reading and writing arrays to disk}
1287 1287 Numpy lets you read and write arrays into files in a number of ways. In
1288 1288 order to use these tools well, it is critical to understand the
1289 1289 difference between a \emph{text} and a \emph{binary} file containing
1290 1290 numerical data. In a text file, the number $\pi$ could be written as
1291 1291 ``3.141592653589793'', for example: a string of digits that a human can
1292 1292 read, with in this case 15 decimal digits. In contrast, that same number
1293 1293 written to a binary file would be encoded as 8 characters (bytes) that
1294 1294 are not readable by a human but which contain the exact same data that
1295 1295 the variable \texttt{pi} had in the computer's memory.
1296 1296
1297 1297 The tradeoffs between the two modes are thus:
1298 1298
1299 1299 \begin{itemize}
1300 1300 \item
1301 1301 Text mode: occupies more space, precision can be lost (if not all
1302 1302 digits are written to disk), but is readable and editable by hand with
1303 1303 a text editor. Can \emph{only} be used for one- and two-dimensional
1304 1304 arrays.
1305 1305 \item
1306 1306 Binary mode: compact and exact representation of the data in memory,
1307 1307 can't be read or edited by hand. Arrays of any size and dimensionality
1308 1308 can be saved and read without loss of information.
1309 1309 \end{itemize}
1310 1310 First, let's see how to read and write arrays in text mode. The
1311 1311 \texttt{np.savetxt} function saves an array to a text file, with options
1312 1312 to control the precision, separators and even adding a header:
1313 1313
1314 1314 \begin{codecell}
1315 1315 \begin{codeinput}
1316 1316 \begin{lstlisting}
1317 1317 arr = np.arange(10).reshape(2, 5)
1318 1318 np.savetxt('test.out', arr, fmt='%.2e', header="My dataset")
1319 1319 !cat test.out
1320 1320 \end{lstlisting}
1321 1321 \end{codeinput}
1322 1322 \begin{codeoutput}
1323 1323 \begin{verbatim}
1324 1324 # My dataset
1325 1325 0.00e+00 1.00e+00 2.00e+00 3.00e+00 4.00e+00
1326 1326 5.00e+00 6.00e+00 7.00e+00 8.00e+00 9.00e+00
1327 1327 \end{verbatim}
1328 1328 \end{codeoutput}
1329 1329 \end{codecell}
1330 1330 And this same type of file can then be read with the matching
1331 1331 \texttt{np.loadtxt} function:
1332 1332
1333 1333 \begin{codecell}
1334 1334 \begin{codeinput}
1335 1335 \begin{lstlisting}
1336 1336 arr2 = np.loadtxt('test.out')
1337 1337 print arr2
1338 1338 \end{lstlisting}
1339 1339 \end{codeinput}
1340 1340 \begin{codeoutput}
1341 1341 \begin{verbatim}
1342 1342 [[ 0. 1. 2. 3. 4.]
1343 1343 [ 5. 6. 7. 8. 9.]]
1344 1344 \end{verbatim}
1345 1345 \end{codeoutput}
1346 1346 \end{codecell}
1347 1347 For binary data, Numpy provides the \texttt{np.save} and
1348 1348 \texttt{np.savez} routines. The first saves a single array to a file
1349 1349 with \texttt{.npy} extension, while the latter can be used to save a
1350 1350 \emph{group} of arrays into a single file with \texttt{.npz} extension.
1351 1351 The files created with these routines can then be read with the
1352 1352 \texttt{np.load} function.
1353 1353
1354 1354 Let us first see how to use the simpler \texttt{np.save} function to
1355 1355 save a single array:
1356 1356
1357 1357 \begin{codecell}
1358 1358 \begin{codeinput}
1359 1359 \begin{lstlisting}
1360 1360 np.save('test.npy', arr2)
1361 1361 # Now we read this back
1362 1362 arr2n = np.load('test.npy')
1363 1363 # Let's see if any element is non-zero in the difference.
1364 1364 # A value of True would be a problem.
1365 1365 print 'Any differences?', np.any(arr2-arr2n)
1366 1366 \end{lstlisting}
1367 1367 \end{codeinput}
1368 1368 \begin{codeoutput}
1369 1369 \begin{verbatim}
1370 1370 Any differences? False
1371 1371 \end{verbatim}
1372 1372 \end{codeoutput}
1373 1373 \end{codecell}
1374 1374 Now let us see how the \texttt{np.savez} function works. You give it a
1375 1375 filename and either a sequence of arrays or a set of keywords. In the
1376 1376 first mode, the function will automatically name the saved arrays in the
1377 1377 archive as \texttt{arr\_0}, \texttt{arr\_1}, etc:
1378 1378
1379 1379 \begin{codecell}
1380 1380 \begin{codeinput}
1381 1381 \begin{lstlisting}
1382 1382 np.savez('test.npz', arr, arr2)
1383 1383 arrays = np.load('test.npz')
1384 1384 arrays.files
1385 1385 \end{lstlisting}
1386 1386 \end{codeinput}
1387 1387 \begin{codeoutput}
1388 1388 \begin{verbatim}
1389 1389 ['arr_1', 'arr_0']
1390 1390 \end{verbatim}
1391 1391 \end{codeoutput}
1392 1392 \end{codecell}
1393 1393 Alternatively, we can explicitly choose how to name the arrays we save:
1394 1394
1395 1395 \begin{codecell}
1396 1396 \begin{codeinput}
1397 1397 \begin{lstlisting}
1398 1398 np.savez('test.npz', array1=arr, array2=arr2)
1399 1399 arrays = np.load('test.npz')
1400 1400 arrays.files
1401 1401 \end{lstlisting}
1402 1402 \end{codeinput}
1403 1403 \begin{codeoutput}
1404 1404 \begin{verbatim}
1405 1405 ['array2', 'array1']
1406 1406 \end{verbatim}
1407 1407 \end{codeoutput}
1408 1408 \end{codecell}
1409 1409 The object returned by \texttt{np.load} from an \texttt{.npz} file works
1410 1410 like a dictionary, though you can also access its constituent files by
1411 1411 attribute using its special \texttt{.f} field; this is best illustrated
1412 1412 with an example with the \texttt{arrays} object from above:
1413 1413
1414 1414 \begin{codecell}
1415 1415 \begin{codeinput}
1416 1416 \begin{lstlisting}
1417 1417 print 'First row of first array:', arrays['array1'][0]
1418 1418 # This is an equivalent way to get the same field
1419 1419 print 'First row of first array:', arrays.f.array1[0]
1420 1420 \end{lstlisting}
1421 1421 \end{codeinput}
1422 1422 \begin{codeoutput}
1423 1423 \begin{verbatim}
1424 1424 First row of first array: [0 1 2 3 4]
1425 1425 First row of first array: [0 1 2 3 4]
1426 1426 \end{verbatim}
1427 1427 \end{codeoutput}
1428 1428 \end{codecell}
1429 1429 This \texttt{.npz} format is a very convenient way to package compactly
1430 1430 and without loss of information, into a single file, a group of related
1431 1431 arrays that pertain to a specific problem. At some point, however, the
1432 1432 complexity of your dataset may be such that the optimal approach is to
1433 1433 use one of the standard formats in scientific data processing that have
1434 1434 been designed to handle complex datasets, such as NetCDF or HDF5.
1435 1435
1436 1436 Fortunately, there are tools for manipulating these formats in Python,
1437 1437 and for storing data in other ways such as databases. A complete
1438 1438 discussion of the possibilities is beyond the scope of this discussion,
1439 1439 but of particular interest for scientific users we at least mention the
1440 1440 following:
1441 1441
1442 1442 \begin{itemize}
1443 1443 \item
1444 1444 The \texttt{scipy.io} module contains routines to read and write
1445 1445 Matlab files in \texttt{.mat} format and files in the NetCDF format
1446 1446 that is widely used in certain scientific disciplines.
1447 1447 \item
1448 1448 For manipulating files in the HDF5 format, there are two excellent
1449 1449 options in Python: The PyTables project offers a high-level, object
1450 1450 oriented approach to manipulating HDF5 datasets, while the h5py
1451 1451 project offers a more direct mapping to the standard HDF5 library
1452 1452 interface. Both are excellent tools; if you need to work with HDF5
1453 1453 datasets you should read some of their documentation and examples and
1454 1454 decide which approach is a better match for your needs.
1455 1455 \end{itemize}
1456 1456
1457 1457 \section{High quality data visualization with Matplotlib}
1458 1458 The \href{http://matplotlib.sf.net}{matplotlib} library is a powerful
1459 1459 tool capable of producing complex publication-quality figures with fine
1460 1460 layout control in two and three dimensions; here we will only provide a
1461 1461 minimal self-contained introduction to its usage that covers the
1462 1462 functionality needed for the rest of the book. We encourage the reader
1463 1463 to read the tutorials included with the matplotlib documentation as well
1464 1464 as to browse its extensive gallery of examples that include source code.
1465 1465
1466 1466 Just as we typically use the shorthand \texttt{np} for Numpy, we will
1467 1467 use \texttt{plt} for the \texttt{matplotlib.pyplot} module where the
1468 1468 easy-to-use plotting functions reside (the library contains a rich
1469 1469 object-oriented architecture that we don't have the space to discuss
1470 1470 here):
1471 1471
1472 1472 \begin{codecell}
1473 1473 \begin{codeinput}
1474 1474 \begin{lstlisting}
1475 1475 import matplotlib.pyplot as plt
1476 1476 \end{lstlisting}
1477 1477 \end{codeinput}
1478 1478 \end{codecell}
1479 1479 The most frequently used function is simply called \texttt{plot}, here
1480 1480 is how you can make a simple plot of $\sin(x)$ for $x \in [0, 2\pi]$
1481 1481 with labels and a grid (we use the semicolon in the last line to
1482 1482 suppress the display of some information that is unnecessary right now):
1483 1483
1484 1484 \begin{codecell}
1485 1485 \begin{codeinput}
1486 1486 \begin{lstlisting}
1487 1487 x = np.linspace(0, 2*np.pi)
1488 1488 y = np.sin(x)
1489 1489 plt.plot(x,y, label='sin(x)')
1490 1490 plt.legend()
1491 1491 plt.grid()
1492 1492 plt.title('Harmonic')
1493 1493 plt.xlabel('x')
1494 1494 plt.ylabel('y');
1495 1495 \end{lstlisting}
1496 1496 \end{codeinput}
1497 1497 \begin{codeoutput}
1498 1498 \begin{center}
1499 \includegraphics[width=6in]{/Users/bussonniermatthias/nbconvert/tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_01.pdf}
1499 \includegraphics[width=6in]{tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_01.pdf}
1500 1500 \par
1501 1501 \end{center}
1502 1502 \end{codeoutput}
1503 1503 \end{codecell}
1504 1504 You can control the style, color and other properties of the markers,
1505 1505 for example:
1506 1506
1507 1507 \begin{codecell}
1508 1508 \begin{codeinput}
1509 1509 \begin{lstlisting}
1510 1510 plt.plot(x, y, linewidth=2);
1511 1511 \end{lstlisting}
1512 1512 \end{codeinput}
1513 1513 \begin{codeoutput}
1514 1514 \begin{center}
1515 \includegraphics[width=6in]{/Users/bussonniermatthias/nbconvert/tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_02.pdf}
1515 \includegraphics[width=6in]{tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_02.pdf}
1516 1516 \par
1517 1517 \end{center}
1518 1518 \end{codeoutput}
1519 1519 \end{codecell}
1520 1520 \begin{codecell}
1521 1521 \begin{codeinput}
1522 1522 \begin{lstlisting}
1523 1523 plt.plot(x, y, 'o', markersize=5, color='r');
1524 1524 \end{lstlisting}
1525 1525 \end{codeinput}
1526 1526 \begin{codeoutput}
1527 1527 \begin{center}
1528 \includegraphics[width=6in]{/Users/bussonniermatthias/nbconvert/tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_03.pdf}
1528 \includegraphics[width=6in]{tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_03.pdf}
1529 1529 \par
1530 1530 \end{center}
1531 1531 \end{codeoutput}
1532 1532 \end{codecell}
1533 1533 We will now see how to create a few other common plot types, such as a
1534 1534 simple error plot:
1535 1535
1536 1536 \begin{codecell}
1537 1537 \begin{codeinput}
1538 1538 \begin{lstlisting}
1539 1539 # example data
1540 1540 x = np.arange(0.1, 4, 0.5)
1541 1541 y = np.exp(-x)
1542 1542
1543 1543 # example variable error bar values
1544 1544 yerr = 0.1 + 0.2*np.sqrt(x)
1545 1545 xerr = 0.1 + yerr
1546 1546
1547 1547 # First illustrate basic pyplot interface, using defaults where possible.
1548 1548 plt.figure()
1549 1549 plt.errorbar(x, y, xerr=0.2, yerr=0.4)
1550 1550 plt.title("Simplest errorbars, 0.2 in x, 0.4 in y");
1551 1551 \end{lstlisting}
1552 1552 \end{codeinput}
1553 1553 \begin{codeoutput}
1554 1554 \begin{center}
1555 \includegraphics[width=6in]{/Users/bussonniermatthias/nbconvert/tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_04.pdf}
1555 \includegraphics[width=6in]{tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_04.pdf}
1556 1556 \par
1557 1557 \end{center}
1558 1558 \end{codeoutput}
1559 1559 \end{codecell}
1560 1560 A simple log plot
1561 1561
1562 1562 \begin{codecell}
1563 1563 \begin{codeinput}
1564 1564 \begin{lstlisting}
1565 1565 x = np.linspace(-5, 5)
1566 1566 y = np.exp(-x**2)
1567 1567 plt.semilogy(x, y);
1568 1568 \end{lstlisting}
1569 1569 \end{codeinput}
1570 1570 \begin{codeoutput}
1571 1571 \begin{center}
1572 \includegraphics[width=6in]{/Users/bussonniermatthias/nbconvert/tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_05.pdf}
1572 \includegraphics[width=6in]{tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_05.pdf}
1573 1573 \par
1574 1574 \end{center}
1575 1575 \end{codeoutput}
1576 1576 \end{codecell}
1577 1577 A histogram annotated with text inside the plot, using the \texttt{text}
1578 1578 function:
1579 1579
1580 1580 \begin{codecell}
1581 1581 \begin{codeinput}
1582 1582 \begin{lstlisting}
1583 1583 mu, sigma = 100, 15
1584 1584 x = mu + sigma * np.random.randn(10000)
1585 1585
1586 1586 # the histogram of the data
1587 1587 n, bins, patches = plt.hist(x, 50, normed=1, facecolor='g', alpha=0.75)
1588 1588
1589 1589 plt.xlabel('Smarts')
1590 1590 plt.ylabel('Probability')
1591 1591 plt.title('Histogram of IQ')
1592 1592 # This will put a text fragment at the position given:
1593 1593 plt.text(55, .027, r'$\mu=100,\ \sigma=15$', fontsize=14)
1594 1594 plt.axis([40, 160, 0, 0.03])
1595 1595 plt.grid(True)
1596 1596 \end{lstlisting}
1597 1597 \end{codeinput}
1598 1598 \begin{codeoutput}
1599 1599 \begin{center}
1600 \includegraphics[width=6in]{/Users/bussonniermatthias/nbconvert/tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_06.pdf}
1600 \includegraphics[width=6in]{tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_06.pdf}
1601 1601 \par
1602 1602 \end{center}
1603 1603 \end{codeoutput}
1604 1604 \end{codecell}
1605 1605 \subsection{Image display}
1606 1606 The \texttt{imshow} command can display single or multi-channel images.
1607 1607 A simple array of random numbers, plotted in grayscale:
1608 1608
1609 1609 \begin{codecell}
1610 1610 \begin{codeinput}
1611 1611 \begin{lstlisting}
1612 1612 from matplotlib import cm
1613 1613 plt.imshow(np.random.rand(5, 10), cmap=cm.gray, interpolation='nearest');
1614 1614 \end{lstlisting}
1615 1615 \end{codeinput}
1616 1616 \begin{codeoutput}
1617 1617 \begin{center}
1618 \includegraphics[width=6in]{/Users/bussonniermatthias/nbconvert/tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_07.pdf}
1618 \includegraphics[width=6in]{tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_07.pdf}
1619 1619 \par
1620 1620 \end{center}
1621 1621 \end{codeoutput}
1622 1622 \end{codecell}
1623 1623 A real photograph is a multichannel image, \texttt{imshow} interprets it
1624 1624 correctly:
1625 1625
1626 1626 \begin{codecell}
1627 1627 \begin{codeinput}
1628 1628 \begin{lstlisting}
1629 1629 img = plt.imread('stinkbug.png')
1630 1630 print 'Dimensions of the array img:', img.shape
1631 1631 plt.imshow(img);
1632 1632 \end{lstlisting}
1633 1633 \end{codeinput}
1634 1634 \begin{codeoutput}
1635 1635 \begin{verbatim}
1636 1636 Dimensions of the array img: (375, 500, 3)
1637 1637 \end{verbatim}
1638 1638 \begin{center}
1639 \includegraphics[width=6in]{/Users/bussonniermatthias/nbconvert/tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_08.pdf}
1639 \includegraphics[width=6in]{tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_08.pdf}
1640 1640 \par
1641 1641 \end{center}
1642 1642 \end{codeoutput}
1643 1643 \end{codecell}
1644 1644 \subsection{Simple 3d plotting with matplotlib}
1645 1645 Note that you must execute at least once in your session:
1646 1646
1647 1647 \begin{codecell}
1648 1648 \begin{codeinput}
1649 1649 \begin{lstlisting}
1650 1650 from mpl_toolkits.mplot3d import Axes3D
1651 1651 \end{lstlisting}
1652 1652 \end{codeinput}
1653 1653 \end{codecell}
1654 1654 Once this has been done, you can create 3d axes with the
1655 1655 \texttt{projection='3d'} keyword to \texttt{add\_subplot}:
1656 1656
1657 1657 \begin{verbatim}
1658 1658 fig = plt.figure()
1659 1659 fig.add_subplot(<other arguments here>, projection='3d')
1660 1660 \end{verbatim}
1661 1661
1662 1662
1663 1663 A simple surface plot:
1664 1664
1665 1665 \begin{codecell}
1666 1666 \begin{codeinput}
1667 1667 \begin{lstlisting}
1668 1668 from mpl_toolkits.mplot3d.axes3d import Axes3D
1669 1669 from matplotlib import cm
1670 1670
1671 1671 fig = plt.figure()
1672 1672 ax = fig.add_subplot(1, 1, 1, projection='3d')
1673 1673 X = np.arange(-5, 5, 0.25)
1674 1674 Y = np.arange(-5, 5, 0.25)
1675 1675 X, Y = np.meshgrid(X, Y)
1676 1676 R = np.sqrt(X**2 + Y**2)
1677 1677 Z = np.sin(R)
1678 1678 surf = ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap=cm.jet,
1679 1679 linewidth=0, antialiased=False)
1680 1680 ax.set_zlim3d(-1.01, 1.01);
1681 1681 \end{lstlisting}
1682 1682 \end{codeinput}
1683 1683 \begin{codeoutput}
1684 1684 \begin{center}
1685 \includegraphics[width=6in]{/Users/bussonniermatthias/nbconvert/tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_09.pdf}
1685 \includegraphics[width=6in]{tests/ipynbref/IntroNumPy.orig_files/IntroNumPy.orig_fig_09.pdf}
1686 1686 \par
1687 1687 \end{center}
1688 1688 \end{codeoutput}
1689 1689 \end{codecell}
1690 1690 \section{IPython: a powerful interactive environment}
1691 1691 A key component of the everyday workflow of most scientific computing
1692 1692 environments is a good interactive environment, that is, a system in
1693 1693 which you can execute small amounts of code and view the results
1694 1694 immediately, combining both printing out data and opening graphical
1695 1695 visualizations. All modern systems for scientific computing, commercial
1696 1696 and open source, include such functionality.
1697 1697
1698 1698 Out of the box, Python also offers a simple interactive shell with very
1699 1699 limited capabilities. But just like the scientific community built Numpy
1700 1700 to provide arrays suited for scientific work (since Python's lists
1701 1701 aren't optimal for this task), it has also developed an interactive
1702 1702 environment much more sophisticated than the built-in one. The
1703 1703 \href{http://ipython.org}{IPython project} offers a set of tools to make
1704 1704 productive use of the Python language, all the while working
1705 1705 interactively and with immediate feedback on your results. The basic
1706 1706 tools that IPython provides are:
1707 1707
1708 1708 \begin{enumerate}[1.]
1709 1709 \item
1710 1710 A powerful terminal shell, with many features designed to increase the
1711 1711 fluidity and productivity of everyday scientific workflows, including:
1712 1712
1713 1713 \begin{itemize}
1714 1714 \item
1715 1715 rich introspection of all objects and variables including easy
1716 1716 access to the source code of any function
1717 1717 \item
1718 1718 powerful and extensible tab completion of variables and filenames,
1719 1719 \item
1720 1720 tight integration with matplotlib, supporting interactive figures
1721 1721 that don't block the terminal,
1722 1722 \item
1723 1723 direct access to the filesystem and underlying operating system,
1724 1724 \item
1725 1725 an extensible system for shell-like commands called `magics' that
1726 1726 reduce the work needed to perform many common tasks,
1727 1727 \item
1728 1728 tools for easily running, timing, profiling and debugging your
1729 1729 codes,
1730 1730 \item
1731 1731 syntax highlighted error messages with much more detail than the
1732 1732 default Python ones,
1733 1733 \item
1734 1734 logging and access to all previous history of inputs, including
1735 1735 across sessions
1736 1736 \end{itemize}
1737 1737 \item
1738 1738 A Qt console that provides the look and feel of a terminal, but adds
1739 1739 support for inline figures, graphical calltips, a persistent session
1740 1740 that can survive crashes (even segfaults) of the kernel process, and
1741 1741 more.
1742 1742 \item
1743 1743 A web-based notebook that can execute code and also contain rich text
1744 1744 and figures, mathematical equations and arbitrary HTML. This notebook
1745 1745 presents a document-like view with cells where code is executed but
1746 1746 that can be edited in-place, reordered, mixed with explanatory text
1747 1747 and figures, etc.
1748 1748 \item
1749 1749 A high-performance, low-latency system for parallel computing that
1750 1750 supports the control of a cluster of IPython engines communicating
1751 1751 over a network, with optimizations that minimize unnecessary copying
1752 1752 of large objects (especially numpy arrays).
1753 1753 \end{enumerate}
1754 1754 We will now discuss the highlights of the tools 1-3 above so that you
1755 1755 can make them an effective part of your workflow. The topic of parallel
1756 1756 computing is beyond the scope of this document, but we encourage you to
1757 1757 read the extensive
1758 1758 \href{http://ipython.org/ipython-doc/rel-0.12.1/parallel/index.html}{documentation}
1759 1759 and \href{http://minrk.github.com/scipy-tutorial-2011/}{tutorials} on
1760 1760 this available on the IPython website.
1761 1761
1762 1762 \subsection{The IPython terminal}
1763 1763 You can start IPython at the terminal simply by typing:
1764 1764
1765 1765 \begin{verbatim}
1766 1766 $ ipython
1767 1767 \end{verbatim}
1768 1768 which will provide you some basic information about how to get started
1769 1769 and will then open a prompt labeled \texttt{In {[}1{]}:} for you to
1770 1770 start typing. Here we type $2^{64}$ and Python computes the result for
1771 1771 us in exact arithmetic, returning it as \texttt{Out{[}1{]}}:
1772 1772
1773 1773 \begin{verbatim}
1774 1774 $ ipython
1775 1775 Python 2.7.2+ (default, Oct 4 2011, 20:03:08)
1776 1776 Type "copyright", "credits" or "license" for more information.
1777 1777
1778 1778 IPython 0.13.dev -- An enhanced Interactive Python.
1779 1779 ? -> Introduction and overview of IPython's features.
1780 1780 %quickref -> Quick reference.
1781 1781 help -> Python's own help system.
1782 1782 object? -> Details about 'object', use 'object??' for extra details.
1783 1783
1784 1784 In [1]: 2**64
1785 1785 Out[1]: 18446744073709551616L
1786 1786 \end{verbatim}
1787 1787 The first thing you should know about IPython is that all your inputs
1788 1788 and outputs are saved. There are two variables named \texttt{In} and
1789 1789 \texttt{Out} which are filled as you work with your results.
1790 1790 Furthermore, all outputs are also saved to auto-created variables of the
1791 1791 form \texttt{\_NN} where \texttt{NN} is the prompt number, and inputs to
1792 1792 \texttt{\_iNN}. This allows you to recover quickly the result of a prior
1793 1793 computation by referring to its number even if you forgot to store it as
1794 1794 a variable. For example, later on in the above session you can do:
1795 1795
1796 1796 \begin{verbatim}
1797 1797 In [6]: print _1
1798 1798 18446744073709551616
1799 1799 \end{verbatim}
1800 1800
1801 1801
1802 1802 We strongly recommend that you take a few minutes to read at least the
1803 1803 basic introduction provided by the \texttt{?} command, and keep in mind
1804 1804 that the \texttt{\%quickref} command at all times can be used as a quick
1805 1805 reference ``cheat sheet'' of the most frequently used features of
1806 1806 IPython.
1807 1807
1808 1808 At the IPython prompt, any valid Python code that you type will be
1809 1809 executed similarly to the default Python shell (though often with more
1810 1810 informative feedback). But IPython is a \emph{superset} of the
1811 1811 default Python shell; let's have a brief look at some of its additional
1812 1812 functionality.
1813 1813
1814 1814 \textbf{Object introspection}
1815 1815
1816 1816 A simple \texttt{?} command provides a general introduction to IPython,
1817 1817 but as indicated in the banner above, you can use the \texttt{?} syntax
1818 1818 to ask for details about any object. For example, if we type
1819 1819 \texttt{\_1?}, IPython will print the following details about this
1820 1820 variable:
1821 1821
1822 1822 \begin{verbatim}
1823 1823 In [14]: _1?
1824 1824 Type: long
1825 1825 Base Class: <type 'long'>
1826 1826 String Form:18446744073709551616
1827 1827 Namespace: Interactive
1828 1828 Docstring:
1829 1829 long(x[, base]) -> integer
1830 1830
1831 1831 Convert a string or number to a long integer, if possible. A floating
1832 1832
1833 1833 [etc... snipped for brevity]
1834 1834 \end{verbatim}
1835 1835 If you add a second \texttt{?} and for any object \texttt{x} type
1836 1836 \texttt{x??}, IPython will try to provide an even more detailed analysis
1837 1837 of the object, including its syntax-highlighted source code when it can
1838 1838 be found. It's possible that \texttt{x??} returns the same information
1839 1839 as \texttt{x?}, but in many cases \texttt{x??} will indeed provide
1840 1840 additional details.
1841 1841
1842 1842 Finally, the \texttt{?} syntax is also useful to search
1843 1843 \emph{namespaces} with wildcards. Suppose you are wondering if there is
1844 1844 any function in Numpy that may do text-related things; with
1845 1845 \texttt{np.*txt*?}, IPython will print all the names in the \texttt{np}
1846 1846 namespace (our Numpy shorthand) that have `txt' anywhere in their name:
1847 1847
1848 1848 \begin{verbatim}
1849 1849 In [17]: np.*txt*?
1850 1850 np.genfromtxt
1851 1851 np.loadtxt
1852 1852 np.mafromtxt
1853 1853 np.ndfromtxt
1854 1854 np.recfromtxt
1855 1855 np.savetxt
1856 1856 \end{verbatim}
1857 1857
1858 1858
1859 1859 \textbf{Tab completion}
1860 1860
1861 1861 IPython makes the tab key work extra hard for you as a way to rapidly
1862 1862 inspect objects and libraries. Whenever you have typed something at the
1863 1863 prompt, by hitting the \texttt{\textless{}tab\textgreater{}} key IPython
1864 1864 will try to complete the rest of the line. For this, IPython will
1865 1865 analyze the text you had so far and try to search for Python data or
1866 1866 files that may match the context you have already provided.
1867 1867
1868 1868 For example, if you type \texttt{np.load} and hit the \texttt{\textless{}tab\textgreater{}} key, you'll see:
1869 1869
1870 1870 \begin{verbatim}
1871 1871 In [21]: np.load<TAB HERE>
1872 1872 np.load np.loads np.loadtxt
1873 1873 \end{verbatim}
1874 1874 so you can quickly find all the load-related functionality in numpy. Tab
1875 1875 completion works even for function arguments, for example consider this
1876 1876 function definition:
1877 1877
1878 1878 \begin{verbatim}
1879 1879 In [20]: def f(x, frobinate=False):
1880 1880 ....: if frobinate:
1881 1881 ....: return x**2
1882 1882 ....:
1883 1883 \end{verbatim}
1884 1884 If you now use the \texttt{\textless{}tab\textgreater{}} key after
1885 1885 having typed `fro' you'll get all valid Python completions, but those
1886 1886 marked with \texttt{=} at the end are known to be keywords of your
1887 1887 function:
1888 1888
1889 1889 \begin{verbatim}
1890 1890 In [21]: f(2, fro<TAB HERE>
1891 1891 frobinate= frombuffer fromfunction frompyfunc fromstring
1892 1892 from fromfile fromiter fromregex frozenset
1893 1893 \end{verbatim}
1894 1894 at this point you can add the \texttt{b} letter and hit
1895 1895 \texttt{\textless{}tab\textgreater{}} once more, and IPython will finish
1896 1896 the line for you:
1897 1897
1898 1898 \begin{verbatim}
1899 1899 In [21]: f(2, frobinate=
1900 1900 \end{verbatim}
1901 1901 As a beginner, simply get into the habit of using
1902 1902 \texttt{\textless{}tab\textgreater{}} after most objects; it should
1903 1903 quickly become second nature as you will see how it helps keep a fluid
1904 1904 workflow and discover useful information. Later on you can also
1905 1905 customize this behavior by writing your own completion code, if you so
1906 1906 desire.
1907 1907
1908 1908 \textbf{Matplotlib integration}
1909 1909
1910 1910 One of the most useful features of IPython for scientists is its tight
1911 1911 integration with matplotlib: at the terminal IPython lets you open
1912 1912 matplotlib figures without blocking your typing (which is what happens
1913 1913 if you try to do the same thing at the default Python shell), and in the
1914 1914 Qt console and notebook you can even view your figures embedded in your
1915 1915 workspace next to the code that created them.
1916 1916
1917 1917 The matplotlib support can be either activated when you start IPython by
1918 1918 passing the \texttt{-{}-pylab} flag, or at any point later in your
1919 1919 session by using the \texttt{\%pylab} command. If you start IPython with
1920 1920 \texttt{-{}-pylab}, you'll see something like this (note the extra
1921 1921 message about pylab):
1922 1922
1923 1923 \begin{verbatim}
1924 1924 $ ipython --pylab
1925 1925 Python 2.7.2+ (default, Oct 4 2011, 20:03:08)
1926 1926 Type "copyright", "credits" or "license" for more information.
1927 1927
1928 1928 IPython 0.13.dev -- An enhanced Interactive Python.
1929 1929 ? -> Introduction and overview of IPython's features.
1930 1930 %quickref -> Quick reference.
1931 1931 help -> Python's own help system.
1932 1932 object? -> Details about 'object', use 'object??' for extra details.
1933 1933
1934 1934 Welcome to pylab, a matplotlib-based Python environment [backend: Qt4Agg].
1935 1935 For more information, type 'help(pylab)'.
1936 1936
1937 1937 In [1]:
1938 1938 \end{verbatim}
1939 1939 Furthermore, IPython will import \texttt{numpy} with the \texttt{np}
1940 1940 shorthand, \texttt{matplotlib.pyplot} as \texttt{plt}, and it will also
1941 1941 load all of the numpy and pyplot top-level names so that you can
1942 1942 directly type something like:
1943 1943
1944 1944 \begin{verbatim}
1945 1945 In [1]: x = linspace(0, 2*pi, 200)
1946 1946
1947 1947 In [2]: plot(x, sin(x))
1948 1948 Out[2]: [<matplotlib.lines.Line2D at 0x9e7c16c>]
1949 1949 \end{verbatim}
1950 1950 instead of having to prefix each call with its full signature (as we
1951 1951 have been doing in the examples thus far):
1952 1952
1953 1953 \begin{verbatim}
1954 1954 In [3]: x = np.linspace(0, 2*np.pi, 200)
1955 1955
1956 1956 In [4]: plt.plot(x, np.sin(x))
1957 1957 Out[4]: [<matplotlib.lines.Line2D at 0x9e900ac>]
1958 1958 \end{verbatim}
1959 1959 This shorthand notation can be a huge time-saver when working
1960 1960 interactively (it's a few characters but you are likely to type them
1961 1961 hundreds of times in a session). But we should note that as you develop
1962 1962 persistent scripts and notebooks meant for reuse, it's best to get in
1963 1963 the habit of using the longer notation (known as \emph{fully qualified
1964 1964 names}), as it's clearer where things come from and it makes for more
1965 1965 robust, readable and maintainable code in the long run.
1966 1966
1967 1967 \textbf{Access to the operating system and files}
1968 1968
1969 1969 In IPython, you can type \texttt{ls} to see your files or \texttt{cd} to
1970 1970 change directories, just like you would at a regular system prompt:
1971 1971
1972 1972 \begin{verbatim}
1973 1973 In [2]: cd tests
1974 1974 /home/fperez/ipython/nbconvert/tests
1975 1975
1976 1976 In [3]: ls test.*
1977 1977 test.aux test.html test.ipynb test.log test.out test.pdf test.rst test.tex
1978 1978 \end{verbatim}
1979 1979 Furthermore, if you use the \texttt{!} at the beginning of a line, any
1980 1980 commands you pass afterwards go directly to the operating system:
1981 1981
1982 1982 \begin{verbatim}
1983 1983 In [4]: !echo "Hello IPython"
1984 1984 Hello IPython
1985 1985 \end{verbatim}
1986 1986 IPython offers a useful twist in this feature: it will substitute in the
1987 1987 command the value of any \emph{Python} variable you may have if you
1988 1988 prepend it with a \texttt{\$} sign:
1989 1989
1990 1990 \begin{verbatim}
1991 1991 In [5]: message = 'IPython interpolates from Python to the shell'
1992 1992
1993 1993 In [6]: !echo $message
1994 1994 IPython interpolates from Python to the shell
1995 1995 \end{verbatim}
1996 1996 This feature can be extremely useful, as it lets you combine the power
1997 1997 and clarity of Python for complex logic with the immediacy and
1998 1998 familiarity of many shell commands. Additionally, if you start the line
1999 1999 with \emph{two} \texttt{!!} signs, the output of the command will be
2000 2000 automatically captured as a list of lines, e.g.:
2001 2001
2002 2002 \begin{verbatim}
2003 2003 In [10]: !!ls test.*
2004 2004 Out[10]:
2005 2005 ['test.aux',
2006 2006 'test.html',
2007 2007 'test.ipynb',
2008 2008 'test.log',
2009 2009 'test.out',
2010 2010 'test.pdf',
2011 2011 'test.rst',
2012 2012 'test.tex']
2013 2013 \end{verbatim}
2014 2014 As explained above, you can now use this as the variable \texttt{\_10}.
2015 2015 If you directly want to capture the output of a system command to a
2016 2016 Python variable, you can use the syntax \texttt{=!}:
2017 2017
2018 2018 \begin{verbatim}
2019 2019 In [11]: testfiles =! ls test.*
2020 2020
2021 2021 In [12]: print testfiles
2022 2022 ['test.aux', 'test.html', 'test.ipynb', 'test.log', 'test.out', 'test.pdf', 'test.rst', 'test.tex']
2023 2023 \end{verbatim}
2024 2024 Finally, the special \texttt{\%alias} command lets you define names that
2025 2025 are shorthands for system commands, so that you can type them without
2026 2026 having to prefix them via \texttt{!} explicitly (for example,
2027 2027 \texttt{ls} is an alias that has been predefined for you at startup).
2028 2028
2029 2029 \textbf{Magic commands}
2030 2030
2031 2031 IPython has a system for special commands, called `magics', that let you
2032 2032 control IPython itself and perform many common tasks with a more
2033 2033 shell-like syntax: it uses spaces for delimiting arguments, flags can be
2034 2034 set with dashes and all arguments are treated as strings, so no
2035 2035 additional quoting is required. This kind of syntax is invalid in the
2036 2036 Python language but very convenient for interactive typing (fewer
2037 2037 parentheses, commas and quoting everywhere); IPython distinguishes the
2038 2038 two by detecting lines that start with the \texttt{\%} character.
2039 2039
2040 2040 You can learn more about the magic system by simply typing
2041 2041 \texttt{\%magic} at the prompt, which will give you a short description
2042 2042 plus the documentation on \emph{all} available magics. If you want to
2043 2043 see only a listing of existing magics, you can use \texttt{\%lsmagic}:
2044 2044
2045 2045 \begin{verbatim}
2046 2046 In [4]: lsmagic
2047 2047 Available magic functions:
2048 2048 %alias %autocall %autoindent %automagic %bookmark %c %cd %colors %config %cpaste
2049 2049 %debug %dhist %dirs %doctest_mode %ds %ed %edit %env %gui %hist %history
2050 2050 %install_default_config %install_ext %install_profiles %load_ext %loadpy %logoff %logon
2051 2051 %logstart %logstate %logstop %lsmagic %macro %magic %notebook %page %paste %pastebin
2052 2052 %pd %pdb %pdef %pdoc %pfile %pinfo %pinfo2 %pop %popd %pprint %precision %profile
2053 2053 %prun %psearch %psource %pushd %pwd %pycat %pylab %quickref %recall %rehashx
2054 2054 %reload_ext %rep %rerun %reset %reset_selective %run %save %sc %stop %store %sx %tb
2055 2055 %time %timeit %unalias %unload_ext %who %who_ls %whos %xdel %xmode
2056 2056
2057 2057 Automagic is ON, % prefix NOT needed for magic functions.
2058 2058 \end{verbatim}
2059 2059 Note how the example above omitted the explicit \texttt{\%} marker and
2060 2060 simply uses \texttt{lsmagic}. As long as the `automagic' feature is on
2061 2061 (which it is by default), you can omit the \texttt{\%} marker as long as
2062 2062 there is no ambiguity with a Python variable of the same name.
2063 2063
2064 2064 \textbf{Running your code}
2065 2065
2066 2066 While it's easy to type a few lines of code in IPython, for any
2067 2067 long-lived work you should keep your codes in Python scripts (or in
2068 2068 IPython notebooks, see below). Consider that you have a script, in this
2069 2069 case trivially simple for the sake of brevity, named \texttt{simple.py}:
2070 2070
2071 2071 \begin{verbatim}
2072 2072 In [12]: !cat simple.py
2073 2073 import numpy as np
2074 2074
2075 2075 x = np.random.normal(size=100)
2076 2076
2077 2077 print 'First elment of x:', x[0]
2078 2078 \end{verbatim}
2079 2079 The typical workflow with IPython is to use the \texttt{\%run} magic to
2080 2080 execute your script (you can omit the .py extension if you want). When
2081 2081 you run it, the script will execute just as if it had been run at the
2082 2082 system prompt with \texttt{python simple.py} (though since modules don't
2083 2083 get re-executed on new imports by Python, all system initialization is
2084 2084 essentially free, which can have a significant run time impact in some
2085 2085 cases):
2086 2086
2087 2087 \begin{verbatim}
2088 2088 In [13]: run simple
2089 2089 First elment of x: -1.55872256289
2090 2090 \end{verbatim}
2091 2091 Once it completes, all variables defined in it become available for you
2092 2092 to use interactively:
2093 2093
2094 2094 \begin{verbatim}
2095 2095 In [14]: x.shape
2096 2096 Out[14]: (100,)
2097 2097 \end{verbatim}
2098 2098 This allows you to plot data, try out ideas, etc, in a
2099 2099 \texttt{\%run}/interact/edit cycle that can be very productive. As you
2100 2100 start understanding your problem better you can refine your script
2101 2101 further, incrementally improving it based on the work you do at the
2102 2102 IPython prompt. At any point you can use the \texttt{\%hist} magic to
2103 2103 print out your history without prompts, so that you can copy useful
2104 2104 fragments back into the script.
2105 2105
2106 2106 By default, \texttt{\%run} executes scripts in a completely empty
2107 2107 namespace, to better mimic how they would execute at the system prompt
2108 2108 with plain Python. But if you use the \texttt{-i} flag, the script will
2109 2109 also see your interactively defined variables. This lets you edit in a
2110 2110 script larger amounts of code that still behave as if you had typed them
2111 2111 at the IPython prompt.
2112 2112
2113 2113 You can also get a summary of the time taken by your script with the
2114 2114 \texttt{-t} flag; consider a different script \texttt{randsvd.py} that
2115 2115 takes a bit longer to run:
2116 2116
2117 2117 \begin{verbatim}
2118 2118 In [21]: run -t randsvd.py
2119 2119
2120 2120 IPython CPU timings (estimated):
2121 2121 User : 0.38 s.
2122 2122 System : 0.04 s.
2123 2123 Wall time: 0.34 s.
2124 2124 \end{verbatim}
2125 2125 \texttt{User} is the time spent by the computer executing your code,
2126 2126 while \texttt{System} is the time the operating system had to work on
2127 2127 your behalf, doing things like memory allocation that are needed by your
2128 2128 code but that you didn't explicitly program and that happen inside the
2129 2129 kernel. The \texttt{Wall time} is the time on a `clock on the wall'
2130 2130 between the start and end of your program.
2131 2131
2132 2132 If \texttt{Wall \textgreater{} User+System}, your code is most likely
2133 2133 waiting idle for certain periods. That could be waiting for data to
2134 2134 arrive from a remote source or perhaps because the operating system has
2135 2135 to swap large amounts of virtual memory. If you know that your code
2136 2136 doesn't explicitly wait for remote data to arrive, you should
2137 2137 investigate further to identify possible ways of improving the
2138 2138 performance profile.
2139 2139
2140 2140 If you only want to time how long a single statement takes, you don't
2141 2141 need to put it into a script as you can use the \texttt{\%timeit} magic,
2142 2142 which uses Python's \texttt{timeit} module to very carefully measure
2143 2143 timing data; \texttt{timeit} can measure even short statements that
2144 2144 execute extremely fast:
2145 2145
2146 2146 \begin{verbatim}
2147 2147 In [27]: %timeit a=1
2148 2148 10000000 loops, best of 3: 23 ns per loop
2149 2149 \end{verbatim}
2150 2150 and for code that runs longer, it automatically adjusts so the overall
2151 2151 measurement doesn't take too long:
2152 2152
2153 2153 \begin{verbatim}
2154 2154 In [28]: %timeit np.linalg.svd(x)
2155 2155 1 loops, best of 3: 310 ms per loop
2156 2156 \end{verbatim}
2157 2157 The \texttt{\%run} magic still has more options for debugging and
2158 2158 profiling data; you should read its documentation for many useful
2159 2159 details (as always, just type \texttt{\%run?}).
2160 2160
2161 2161 \subsection{The graphical Qt console}
2162 2162 If you type at the system prompt (see the IPython website for
2163 2163 installation details, as this requires some additional libraries):
2164 2164
2165 2165 \begin{verbatim}
2166 2166 $ ipython qtconsole
2167 2167 \end{verbatim}
2168 2168 instead of opening in a terminal as before, IPython will start a
2169 2169 graphical console that at first sight appears just like a terminal, but
2170 2170 which is in fact much more capable than a text-only terminal. This is a
2171 2171 specialized terminal designed for interactive scientific work, and it
2172 2172 supports full multi-line editing with color highlighting and graphical
2173 2173 calltips for functions, it can keep multiple IPython sessions open
2174 2174 simultaneously in tabs, and when scripts run it can display the figures
2175 2175 inline directly in the work area.
2176 2176
2177 2177 % This cell is for the pdflatex output only
2178 2178 \begin{figure}[htbp]
2179 2179 \centering
2180 2180 \includegraphics[width=3in]{ipython_qtconsole2.png}
2181 2181 \caption{The IPython Qt console: a lightweight terminal for scientific exploration, with code, results and graphics in a single environment.}
2182 2182 \end{figure}
2183 2183 The Qt console accepts the same \texttt{-{}-pylab} startup flags as the
2184 2184 terminal, but you can additionally supply the value
2185 2185 \texttt{-{}-pylab inline}, which enables the support for inline graphics
2186 2186 shown in the figure. This is ideal for keeping all the code and figures
2187 2187 in the same session, given that the console can save the output of your
2188 2188 entire session to HTML or PDF.
2189 2189
2190 2190 Since the Qt console makes it far more convenient than the terminal to
2191 2191 edit blocks of code with multiple lines, in this environment it's worth
2192 2192 knowing about the \texttt{\%loadpy} magic function. \texttt{\%loadpy}
2193 2193 takes a path to a local file or remote URL, fetches its contents, and
2194 2194 puts it in the work area for you to further edit and execute. It can be
2195 2195 an extremely fast and convenient way of loading code from local disk or
2196 2196 remote examples from sites such as the
2197 2197 \href{http://matplotlib.sourceforge.net/gallery.html}{Matplotlib
2198 2198 gallery}.
2199 2199
2200 2200 Other than its enhanced capabilities for code and graphics, all of the
2201 2201 features of IPython we've explained before remain functional in this
2202 2202 graphical console.
2203 2203
2204 2204 \subsection{The IPython Notebook}
2205 2205 The third way to interact with IPython, in addition to the terminal and
2206 2206 graphical Qt console, is a powerful web interface called the ``IPython
2207 2207 Notebook''. If you run at the system console (you can omit the
2208 2208 \texttt{pylab} flags if you don't need plotting support):
2209 2209
2210 2210 \begin{verbatim}
2211 2211 $ ipython notebook --pylab inline
2212 2212 \end{verbatim}
2213 2213 IPython will start a process that runs a web server in your local
2214 2214 machine and to which a web browser can connect. The Notebook is a
2215 2215 workspace that lets you execute code in blocks called `cells' and
2216 2216 displays any results and figures, but which can also contain arbitrary
2217 2217 text (including LaTeX-formatted mathematical expressions) and any rich
2218 2218 media that a modern web browser is capable of displaying.
2219 2219
2220 2220 % This cell is for the pdflatex output only
2221 2221 \begin{figure}[htbp]
2222 2222 \centering
2223 2223 \includegraphics[width=3in]{ipython-notebook-specgram-2.png}
2224 2224 \caption{The IPython Notebook: text, equations, code, results, graphics and other multimedia in an open format for scientific exploration and collaboration}
2225 2225 \end{figure}
2226 2226 In fact, this document was written as a Notebook, and only exported to
2227 2227 LaTeX for printing. Inside of each cell, all the features of IPython
2228 2228 that we have discussed before remain functional, since ultimately this
2229 2229 web client is communicating with the same IPython code that runs in the
2230 2230 terminal. But this interface is a much richer and more powerful
2231 2231 environment for maintaining long-term ``live and executable'' scientific
2232 2232 documents.
2233 2233
2234 2234 Notebook environments have existed in commercial systems like
2235 2235 Mathematica(TM) and Maple(TM) for a long time; in the open source world
2236 2236 the \href{http://sagemath.org}{Sage} project blazed this particular
2237 2237 trail starting in 2006, and now we bring all the features that have made
2238 2238 IPython such a widely used tool to a Notebook model.
2239 2239
2240 2240 Since the Notebook runs as a web application, it is possible to
2241 2241 configure it for remote access, letting you run your computations on a
2242 2242 persistent server close to your data, which you can then access remotely
2243 2243 from any browser-equipped computer. We encourage you to read the
2244 2244 extensive documentation provided by the IPython project for details on
2245 2245 how to do this and many more features of the notebook.
2246 2246
2247 2247 Finally, as we said earlier, IPython also has a high-level and easy to
2248 2248 use set of libraries for parallel computing, that let you control
2249 2249 (interactively if desired) not just one IPython but an entire cluster of
2250 2250 `IPython engines'. Unfortunately a detailed discussion of these tools is
2251 2251 beyond the scope of this text, but should you need to parallelize your
2252 2252 analysis codes, a quick read of the tutorials and examples provided at
2253 2253 the IPython site may prove fruitful.
2254 2254
2255 2255 \end{document}
General Comments 0
You need to be logged in to leave comments. Login now