From 27e2064c7bc5acede06711d5a9639ab3cf016f4b 2012-03-21 14:11:17 From: slojo404 Date: 2012-03-21 14:11:17 Subject: [PATCH] adding a basic test and moving tutorial.rst test file to tests directory --- diff --git a/tests/test_rst2ipynb.py b/tests/test_rst2ipynb.py new file mode 100644 index 0000000..d30cb26 --- /dev/null +++ b/tests/test_rst2ipynb.py @@ -0,0 +1,25 @@ +import os +import errno +import subprocess +import nose.tools as nt + +test_rst_fname = 'tests/tutorial.rst.ref' +ref_ipynb_fname = 'tests/tutorial.ipynb.ref' +test_generate_ipynb_fname = 'tests/tutorial.ipynb' + + +def clean_dir(): + """Remove the ipynb file generated during conversion; a missing file is not an error.""" + try: + os.unlink(test_generate_ipynb_fname) + except OSError as e: + if e.errno != errno.ENOENT: + raise + + +@nt.with_setup(clean_dir, clean_dir) +def test_command_line(): + with open(ref_ipynb_fname, 'rb') as f: + ref_output = f.read() + output = subprocess.check_output(['./rst2ipynb.py', test_rst_fname]) + nt.assert_equal(ref_output, output) diff --git a/tests/tutorial.ipynb.ref b/tests/tutorial.ipynb.ref new file mode 100644 index 0000000..22548bb --- /dev/null +++ b/tests/tutorial.ipynb.ref @@ -0,0 +1,396 @@ +{ + "metadata": {}, + "nbformat": 3, + "worksheets": [ + { + "cells": [ + { + "cell_type": "heading", + "level": 1, + "source": [ + "An Introduction to machine learning with scikit-learn" + ] + }, + { + "cell_type": "heading", + "level": 1, + "source": [ + "Section contents" + ] + }, + { + "cell_type": "markdown", + "source": [ + "In this section, we introduce the machine learning", + "vocabulary that we use through-out scikit-learn and give a", + "simple learning example." + ] + }, + { + "cell_type": "heading", + "level": 2, + "source": [ + "Machine learning: the problem setting" + ] + }, + { + "cell_type": "markdown", + "source": [ + "In general, a learning problem considers a set of n", + "samples of", + "data and try to predict properties of unknown data. 
If each sample is", + "more than a single number, and for instance a multi-dimensional entry", + "(aka multivariate", + "data), is it said to have several attributes,", + "or features." + ] + }, + { + "cell_type": "markdown", + "source": [ + "We can separate learning problems in a few large categories:" + ] + }, + { + "cell_type": "markdown", + "source": [ + "supervised learning,", + "in which the data comes with additional attributes that we want to predict", + "(:ref:`Click here `", + "to go to the Scikit-Learn supervised learning page).This problem", + "can be either:" + ] + }, + { + "cell_type": "markdown", + "source": [ + "classification:", + "samples belong to two or more classes and we", + "want to learn from already labeled data how to predict the class", + "of unlabeled data. An example of classification problem would", + "be the digit recognition example, in which the aim is to assign", + "each input vector to one of a finite number of discrete", + "categories." + ] + }, + { + "cell_type": "markdown", + "source": [ + "regression:", + "if the desired output consists of one or more", + "continuous variables, then the task is called regression. An", + "example of a regression problem would be the prediction of the", + "length of a salmon as a function of its age and weight." + ] + }, + { + "cell_type": "markdown", + "source": [ + "unsupervised learning,", + "in which the training data consists of a set of input vectors x", + "without any corresponding target values. The goal in such problems", + "may be to discover groups of similar examples within the data, where", + "it is called clustering,", + "or to determine the distribution of data within the input space, known as", + "density estimation, or", + "to project the data from a high-dimensional space down to two or thee", + "dimensions for the purpose of visualization", + "(:ref:`Click here `", + "to go to the Scikit-Learn unsupervised learning page)." 
+ ] + }, + { + "cell_type": "heading", + "level": 2, + "source": [ + "Training set and testing set" + ] + }, + { + "cell_type": "markdown", + "source": [ + "Machine learning is about learning some properties of a data set", + "and applying them to new data. This is why a common practice in", + "machine learning to evaluate an algorithm is to split the data", + "at hand in two sets, one that we call a training set on which", + "we learn data properties, and one that we call a testing set,", + "on which we test these properties." + ] + }, + { + "cell_type": "heading", + "level": 2, + "source": [ + "Loading an example dataset" + ] + }, + { + "cell_type": "markdown", + "source": [ + "scikit-learn comes with a few standard datasets, for instance the", + "iris and digits", + "datasets for classification and the boston house prices dataset for regression.:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "from sklearn import datasets", + "iris = datasets.load_iris()", + "digits = datasets.load_digits()" + ], + "language": "python", + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "A dataset is a dictionary-like object that holds all the data and some", + "metadata about the data. This data is stored in the .data member,", + "which is a n_samples, n_features array. In the case of supervised", + "problem, explanatory variables are stored in the .target member. More", + "details on the different datasets can be found in the :ref:`dedicated", + "section `." 
+ ] + }, + { + "cell_type": "markdown", + "source": [ + "For instance, in the case of the digits dataset, digits.data gives", + "access to the features that can be used to classify the digits samples:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "print digits.data # doctest: +NORMALIZE_WHITESPACE" + ], + "language": "python", + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "and digits.target gives the ground truth for the digit dataset, that", + "is the number corresponding to each digit image that we are trying to", + "learn:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "digits.target" + ], + "language": "python", + "outputs": [] + }, + { + "cell_type": "heading", + "level": 2, + "source": [ + "Shape of the data arrays" + ] + }, + { + "cell_type": "markdown", + "source": [ + "The data is always a 2D array, n_samples, n_features, although", + "the original data may have had a different shape. In the case of the", + "digits, each original sample is an image of shape 8, 8 and can be", + "accessed using:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "digits.images[0]" + ], + "language": "python", + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "The :ref:`simple example on this dataset", + "` illustrates how starting", + "from the original problem one can shape the data for consumption in", + "the scikit-learn." + ] + }, + { + "cell_type": "heading", + "level": 2, + "source": [ + "Learning and Predicting" + ] + }, + { + "cell_type": "markdown", + "source": [ + "In the case of the digits dataset, the task is to predict the value of a", + "hand-written digit from an image. We are given samples of each of the 10", + "possible classes on which we fit an", + "estimator to be able to predict", + "the labels corresponding to new data." 
+ ] + }, + { + "cell_type": "markdown", + "source": [ + "In scikit-learn, an estimator is just a plain Python class that", + "implements the methods fit(X, Y) and predict(T)." + ] + }, + { + "cell_type": "markdown", + "source": [ + "An example of estimator is the class sklearn.svm.SVC that", + "implements Support Vector Classification. The", + "constructor of an estimator takes as arguments the parameters of the", + "model, but for the time being, we will consider the estimator as a black", + "box:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "from sklearn import svm", + "clf = svm.SVC(gamma=0.001, C=100.)" + ], + "language": "python", + "outputs": [] + }, + { + "cell_type": "heading", + "level": 2, + "source": [ + "Choosing the parameters of the model" + ] + }, + { + "cell_type": "markdown", + "source": [ + "In this example we set the value of gamma manually. It is possible", + "to automatically find good values for the parameters by using tools", + "such as :ref:`grid search ` and :ref:`cross validation", + "`." + ] + }, + { + "cell_type": "markdown", + "source": [ + "We call our estimator instance clf as it is a classifier. It now must", + "be fitted to the model, that is, it must learn from the model. This is", + "done by passing our training set to the fit method. 
As a training", + "set, let us use all the images of our dataset apart from the last", + "one:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "clf.fit(digits.data[:-1], digits.target[:-1])" + ], + "language": "python", + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Now you can predict new values, in particular, we can ask to the", + "classifier what is the digit of our last image in the digits dataset,", + "which we have not used to train the classifier:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "clf.predict(digits.data[-1])" + ], + "language": "python", + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "The corresponding image is the following:" + ] + }, + { + "cell_type": "markdown", + "source": [ + "As you can see, it is a challenging task: the images are of poor", + "resolution. Do you agree with the classifier?" + ] + }, + { + "cell_type": "markdown", + "source": [ + "A complete example of this classification problem is available as an", + "example that you can run and study:", + ":ref:`example_plot_digits_classification.py`." 
+ ] + }, + { + "cell_type": "heading", + "level": 2, + "source": [ + "Model persistence" + ] + }, + { + "cell_type": "markdown", + "source": [ + "It is possible to save a model in the scikit by using Python's built-in", + "persistence model, namely pickle:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "from sklearn import svm", + "from sklearn import datasets", + "clf = svm.SVC()", + "iris = datasets.load_iris()", + "X, y = iris.data, iris.target", + "clf.fit(X, y)", + "import pickle", + "s = pickle.dumps(clf)", + "clf2 = pickle.loads(s)", + "clf2.predict(X[0])", + "y[0]" + ], + "language": "python", + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "In the specific case of the scikit, it may be more interesting to use", + "joblib's replacement of pickle (joblib.dump & joblib.load),", + "which is more efficient on big data, but can only pickle to the disk", + "and not to a string:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "from sklearn.externals import joblib", + "joblib.dump(clf, 'filename.pkl') # doctest: +SKIP" + ], + "language": "python", + "outputs": [] + } + ] + } + ] +} \ No newline at end of file diff --git a/tutorial.rst b/tests/tutorial.rst.ref similarity index 100% rename from tutorial.rst rename to tests/tutorial.rst.ref