From be0fd9937bdff7f54625bfe25cbb2858d3e19f89 Mon Sep 17 00:00:00 2001 From: sentient07 Date: Tue, 8 Mar 2016 02:10:59 +0530 Subject: [PATCH 1/4] added dropout --- code/dropout.py | 626 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 626 insertions(+) create mode 100644 code/dropout.py diff --git a/code/dropout.py b/code/dropout.py new file mode 100644 index 00000000..18c1d920 --- /dev/null +++ b/code/dropout.py @@ -0,0 +1,626 @@ +from __future__ import print_function + +import numpy as np +import os +import sys +import timeit +import six.moves.cPickle as pickle +import theano +import theano.tensor as T +import theano.tensor.shared_randomstreams +import gzip +from collections import OrderedDict + + +class LogisticRegression(object): + """Multi-class Logistic Regression Class + + The logistic regression is fully described by a weight matrix :math:`W` + and bias vector :math:`b`. Classification is done by projecting data + points onto a set of hyperplanes, the distance to which is used to + determine a class membership probability. + """ + + def __init__(self, input, n_in, n_out, W=None, b=None): + """ Initialize the parameters of the logistic regression + + :type input: theano.tensor.TensorType + :param input: symbolic variable that describes the input of the + architecture (one minibatch) + + :type n_in: int + :param n_in: number of input units, the dimension of the space in + which the datapoints lie + + :type n_out: int + :param n_out: number of output units, the dimension of the space in + which the labels lie + + """ + # start-snippet-1 + # initialize with 0 the weights W as a matrix of shape (n_in, n_out) + if W is None: + self.W = theano.shared( + value=np.zeros((n_in, n_out), dtype=theano.config.floatX), + name='W') + else: + self.W = W + + # initialize the baises b as a vector of n_out 0s + if b is None: + self.b = theano.shared( + value=np.zeros((n_out,), dtype=theano.config.floatX), + name='b') + else: + self.b = b + + # symbolic expression for computing the matrix of class-membership + # probabilities + # Where: + # W is a matrix where column-k represent the separation hyperplane for + # class-k + # x is a matrix where row-j represents input training sample-j + # b is a vector where element-k represent the free parameter of + # hyperplane-k + self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b) + + # symbolic description of how to compute prediction as class whose + # probability is maximal + self.y_pred = T.argmax(self.p_y_given_x, axis=1) + # end-snippet-1 + + # parameters of the model + self.params = [self.W, self.b] + + # keep track of model input + self.input = input + + def negative_log_likelihood(self, y): + """Return the mean of the negative log-likelihood of the prediction + of this model under a given target distribution. + + .. math:: + + \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) = + \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|} + \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\ + \ell (\theta=\{W,b\}, \mathcal{D}) + + :type y: theano.tensor.TensorType + :param y: corresponds to a vector that gives for each example the + correct label + + Note: we use the mean instead of the sum so that + the learning rate is less dependent on the batch size + """ + # start-snippet-2 + # y.shape[0] is (symbolically) the number of rows in y, i.e., + # number of examples (call it n) in the minibatch + # T.arange(y.shape[0]) is a symbolic vector which will contain + # [0,1,2,... 
n-1] T.log(self.p_y_given_x) is a matrix of + # Log-Probabilities (call it LP) with one row per example and + # one column per class LP[T.arange(y.shape[0]),y] is a vector + # v containing [LP[0,y[0]], LP[1,y[1]], LP[2,y[2]], ..., + # LP[n-1,y[n-1]]] and T.mean(LP[T.arange(y.shape[0]),y]) is + # the mean (across minibatch examples) of the elements in v, + # i.e., the mean log-likelihood across the minibatch. + return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y]) + # end-snippet-2 + + def errors(self, y): + """Return a float representing the number of errors in the minibatch + over the total number of examples of the minibatch ; zero one + loss over the size of the minibatch + + :type y: theano.tensor.TensorType + :param y: corresponds to a vector that gives for each example the + correct label + """ + + # check if y has same dimension of y_pred + if y.ndim != self.y_pred.ndim: + raise TypeError( + 'y should have the same shape as self.y_pred', + ('y', y.type, 'y_pred', self.y_pred.type) + ) + # check if y is of the correct datatype + if y.dtype.startswith('int'): + # the T.neq operator returns a vector of 0s and 1s, where 1 + # represents a mistake in prediction + return T.mean(T.neq(self.y_pred, y)) + else: + raise NotImplementedError() + + +def load_data(dataset): + ''' Loads the dataset + + :type dataset: string + :param dataset: the path to the dataset (here MNIST) + ''' + + ############# + # LOAD DATA # + ############# + + # Download the MNIST dataset if it is not present + data_dir, data_file = os.path.split(dataset) + if data_dir == "" and not os.path.isfile(dataset): + # Check if dataset is in the data directory. + new_path = os.path.join( + os.path.split(__file__)[0], + "..", + "data", + dataset + ) + if os.path.isfile(new_path) or data_file == 'mnist.pkl.gz': + dataset = new_path + + if (not os.path.isfile(dataset)) and data_file == 'mnist.pkl.gz': + from six.moves import urllib + origin = ( + 'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz' + ) + print('Downloading data from %s' % origin) + urllib.request.urlretrieve(origin, dataset) + + print('... loading data') + + # Load the dataset + with gzip.open(dataset, 'rb') as f: + try: + train_set, valid_set, test_set = pickle.load(f, encoding='latin1') + except: + train_set, valid_set, test_set = pickle.load(f) + # train_set, valid_set, test_set format: tuple(input, target) + # input is a numpy.ndarray of 2 dimensions (a matrix) + # where each row corresponds to an example. target is a + # numpy.ndarray of 1 dimension (vector) that has the same length as + # the number of rows in the input. It should give the target + # to the example with the same index in the input. + + def shared_dataset(data_xy, borrow=True): + """ Function that loads the dataset into shared variables + + The reason we store our dataset in shared variables is to allow + Theano to copy it into the GPU memory (when code is run on GPU). + Since copying data into the GPU is slow, copying a minibatch everytime + is needed (the default behaviour if the data is not in a shared + variable) would lead to a large decrease in performance. + """ + data_x, data_y = data_xy + shared_x = theano.shared(np.asarray(data_x, + dtype=theano.config.floatX), + borrow=borrow) + shared_y = theano.shared(np.asarray(data_y, + dtype=theano.config.floatX), + borrow=borrow) + # When storing data on the GPU it has to be stored as floats + # therefore we will store the labels as ``floatX`` as well + # (``shared_y`` does exactly that). 
But during our computations + # we need them as ints (we use labels as index, and if they are + # floats it doesn't make sense) therefore instead of returning + # ``shared_y`` we will have to cast it to int. This little hack + # lets ous get around this issue + return shared_x, T.cast(shared_y, 'int32') + + test_set_x, test_set_y = shared_dataset(test_set) + valid_set_x, valid_set_y = shared_dataset(valid_set) + train_set_x, train_set_y = shared_dataset(train_set) + + rval = [(train_set_x, train_set_y), (valid_set_x, valid_set_y), + (test_set_x, test_set_y)] + return rval + + +class HiddenLayer(object): + def __init__(self, rng, input, n_in, n_out, W=None, b=None, + activation=T.tanh): + """ + Typical hidden layer of a MLP: units are fully-connected and have + sigmoidal activation function. Weight matrix W is of shape (n_in,n_out) + and the bias vector b is of shape (n_out,). + + NOTE : The nonlinearity used here is tanh + + Hidden unit activation is given by: tanh(dot(input,W) + b) + + :type rng: numpy.random.RandomState + :param rng: a random number generator used to initialize weights + + :type input: theano.tensor.dmatrix + :param input: a symbolic tensor of shape (n_examples, n_in) + + :type n_in: int + :param n_in: dimensionality of input + + :type n_out: int + :param n_out: number of hidden units + + :type activation: theano.Op or function + :param activation: Non linearity to be applied in the hidden + layer + """ + self.input = input + # end-snippet-1 + + # `W` is initialized with `W_values` which is uniformely sampled + # from sqrt(-6./(n_in+n_hidden)) and sqrt(6./(n_in+n_hidden)) + # for tanh activation function + # the output of uniform if converted using asarray to dtype + # theano.config.floatX so that the code is runable on GPU + # Note : optimal initialization of weights is dependent on the + # activation function used (among other things). + # For example, results presented in [Xavier10] suggest that you + # should use 4 times larger initial weights for sigmoid + # compared to tanh + # We have no info for other function, so we use the same as + # tanh. + if W is None: + W_values = np.asarray( + rng.uniform( + low=-np.sqrt(6. / (n_in + n_out)), + high= np.sqrt(6. / (n_in + n_out)), + size=(n_in, n_out) + ), + dtype=theano.config.floatX + ) + if activation == theano.tensor.nnet.sigmoid: + W_values *= 4 + + W = theano.shared(value=W_values, name='W', borrow=True) + + if b is None: + b_values = np.zeros((n_out,), dtype=theano.config.floatX) + b = theano.shared(value=b_values, name='b', borrow=True) + + self.W = W + self.b = b + + lin_output = T.dot(input, self.W) + self.b + self.output = ( + lin_output if activation is None + else activation(lin_output) + ) + # parameters of the model + self.params = [self.W, self.b] + + +def _dropsout(rng, layer, p): + srng = theano.tensor.shared_randomstreams.RandomStreams(rng.randint(1000)) + mask = srng.binomial(n=1, p=1-p, size=layer.shape) + output = layer*T.cast(mask, theano.config.floatX) + return output + +class HiddenDropoutLayer(HiddenLayer): + + def __init__(self, rng, input, n_in, n_out, + activation, dropout_rate, W=None, b=None): + super(HiddenDropoutLayer, self).__init__( + rng=rng, input=input, n_in=n_in, n_out=n_out, W=W, b=b, + activation=activation) + + self.output = _dropsout(rng, self.output, p=dropout_rate) + + +class MLP(object): + """Multi-Layer Perceptron Class + + A multilayer perceptron is a feedforward artificial neural network model + that has one layer or more of hidden units and nonlinear activations. 
+ Intermediate layers usually have as activation function tanh or the + sigmoid function (defined here by a ``HiddenLayer`` class) while the + top layer is a softmax layer (defined here by a ``LogisticRegression`` + class). + """ + + def __init__(self, rng, input, n_in, n_hidden, dropout_rates, n_out): + """Initialize the parameters for the multilayer perceptron + + :type rng: numpy.random.RandomState + :param rng: a random number generator used to initialize weights + + :type input: theano.tensor.TensorType + :param input: symbolic variable that describes the input of the + architecture (one minibatch) + + :type n_in: int + :param n_in: number of input units, the dimension of the space in + which the datapoints lie + + :type n_hidden: int + :param n_hidden: number of hidden units + + :type n_out: int + :param n_out: number of output units, the dimension of the space in + which the labels lie + + """ + + #Dropping out the input layer + inp_dropout_layer = _dropsout(rng, input, p=dropout_rates[0]) + + self.drop_layer = HiddenDropoutLayer(rng=rng, + input=inp_dropout_layer, + activation=T.tanh, + n_in=n_in, n_out=n_hidden, + dropout_rate=dropout_rates[1]) + + + # Since we are dealing with a one hidden layer MLP, this will translate + # into a HiddenLayer with a tanh activation function connected to the + # LogisticRegression layer; the activation function can be replaced by + # sigmoid or any other nonlinear function + self.hiddenLayer = HiddenLayer( + rng=rng, + input=input, + n_in=n_in, + n_out=n_hidden, + W=self.drop_layer.W * (1 - dropout_rates[1]), + b=self.drop_layer.b, + activation=T.tanh + ) + + + self.drop_output_layer = LogisticRegression( + input=self.drop_layer.output, + n_in=n_hidden, + n_out=n_out) + + + # The logistic regression layer gets as input the hidden units + # of the hidden layer + self.logRegressionLayer = LogisticRegression( + input=self.hiddenLayer.output, + n_in=n_hidden, + n_out=n_out, + W=self.drop_output_layer.W * (1 - dropout_rates[-1]), + b=self.drop_output_layer.b, + ) + + + self.drop_negative_log_likelihood = self.drop_output_layer.negative_log_likelihood + self.dropout_errors = self.drop_output_layer.errors + + # negative log likelihood of the MLP is given by the negative + # log likelihood of the output of the model, computed in the + # logistic regression layer + self.negative_log_likelihood = ( + self.logRegressionLayer.negative_log_likelihood + ) + # same holds for the function computing the number of errors + self.errors = self.logRegressionLayer.errors + + # the parameters of the model are the parameters of the two layer it is + # made out of + self.params = self.drop_layer.params + self.drop_output_layer.params + # end-snippet-3 + + # keep track of model input + self.input = input + + + +# In[36]: + + +def test_mlp(learning_rate=0.01, n_epochs=1000, dropout_rates = [0.2, 0.5], + dataset='mnist.pkl.gz', batch_size=20, n_hidden=500): + """ + Demonstrate stochastic gradient descent optimization for a multilayer + perceptron + + This is demonstrated on MNIST. 
+ + :type learning_rate: float + :param learning_rate: learning rate used (factor for the stochastic + gradient + + :type L1_reg: float + :param L1_reg: L1-norm's weight when added to the cost (see + regularization) + + :type L2_reg: float + :param L2_reg: L2-norm's weight when added to the cost (see + regularization) + + :type n_epochs: int + :param n_epochs: maximal number of epochs to run the optimizer + + :type dataset: string + :param dataset: the path of the MNIST dataset file from + http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz + + + """ + datasets = load_data(dataset) + + train_set_x, train_set_y = datasets[0] + valid_set_x, valid_set_y = datasets[1] + test_set_x, test_set_y = datasets[2] + + # compute number of minibatches for training, validation and testing + n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size + n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size + n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size + + ###################### + # BUILD ACTUAL MODEL # + ###################### + print('... building the model') + + # allocate symbolic variables for the data + index = T.lscalar() # index to a [mini]batch + x = T.matrix('x') # the data is presented as rasterized images + y = T.ivector('y') # the labels are presented as 1D vector of + # [int] labels + + rng = np.random.RandomState(1234) + + + # construct the MLP class + classifier = MLP( + rng=rng, + input=x, + n_in=28 * 28, + n_hidden=n_hidden, + dropout_rates=dropout_rates, + n_out=10 + ) + + # start-snippet-4 + # the cost we minimize during training is the negative log likelihood of + # the model plus the regularization terms (L1 and L2); cost is expressed + # here symbolically + cost = ( + classifier.negative_log_likelihood(y) + ) + dropout_cost = classifier.drop_negative_log_likelihood(y) + + # end-snippet-4 + + # compiling a Theano function that computes the mistakes that are made + # by the model on a minibatch + test_model = theano.function( + inputs=[index], + outputs=classifier.errors(y), + givens={ + x: test_set_x[index * batch_size:(index + 1) * batch_size], + y: test_set_y[index * batch_size:(index + 1) * batch_size] + } + ) + + validate_model = theano.function( + inputs=[index], + outputs=classifier.errors(y), + givens={ + x: valid_set_x[index * batch_size:(index + 1) * batch_size], + y: valid_set_y[index * batch_size:(index + 1) * batch_size] + } + ) + + # start-snippet-5 + # compute the gradient of cost with respect to theta (sotred in params) + # the resulting gradients will be stored in a list gparams + gparams = [] + for param in classifier.params: + #Changing cost for with dropout layer and without + gparam = T.grad(dropout_cost, param) + gparams.append(gparam) + + # specify how to update the parameters of the model as a list of + # (variable, update expression) pairs + + # given two lists of the same length, A = [a1, a2, a3, a4] and + # B = [b1, b2, b3, b4], zip generates a list C of same size, where each + # element is a pair formed from the two lists : + # C = [(a1, b1), (a2, b2), (a3, b3), (a4, b4)] + #Stochastic Gradient Descent (SGD) updates + + output = dropout_cost + updates = OrderedDict() + for param, gparam in zip(classifier.params, gparams) : + updates[param] = param - learning_rate * gparam + + + # compiling a Theano function `train_model` that returns the cost, but + # in the same time updates the parameter of the model based on the rules + # defined in `updates` + train_model = theano.function( + 
inputs=[index], + outputs=cost, + updates=updates, + givens={ + x: train_set_x[index * batch_size: (index + 1) * batch_size], + y: train_set_y[index * batch_size: (index + 1) * batch_size] + } + ) + # end-snippet-5 + + ############### + # TRAIN MODEL # + ############### + print('... training') + + # early-stopping parameters + patience = 10000 # look as this many examples regardless + patience_increase = 2 # wait this much longer when a new best is + # found + improvement_threshold = 0.995 # a relative improvement of this much is + # considered significant + validation_frequency = min(n_train_batches, patience // 2) + # go through this many + # minibatche before checking the network + # on the validation set; in this case we + # check every epoch + + best_validation_loss = np.inf + best_iter = 0 + test_score = 0. + start_time = timeit.default_timer() + + epoch = 0 + done_looping = False + + while (epoch < n_epochs) and (not done_looping): + epoch = epoch + 1 + for minibatch_index in range(n_train_batches): + + minibatch_avg_cost = train_model(minibatch_index) + # iteration number + iter = (epoch - 1) * n_train_batches + minibatch_index + + if (iter + 1) % validation_frequency == 0: + # compute zero-one loss on validation set + validation_losses = [validate_model(i) for i + in range(n_valid_batches)] + this_validation_loss = np.mean(validation_losses) + + print( + 'epoch %i, minibatch %i/%i, validation error %f %%' % + ( + epoch, + minibatch_index + 1, + n_train_batches, + this_validation_loss * 100. + ) + ) + + # if we got the best validation score until now + if this_validation_loss < best_validation_loss: + #improve patience if loss improvement is good enough + if ( + this_validation_loss < best_validation_loss * + improvement_threshold + ): + patience = max(patience, iter * patience_increase) + + best_validation_loss = this_validation_loss + best_iter = iter + + # test it on the test set + test_losses = [test_model(i) for i + in range(n_test_batches)] + test_score = np.mean(test_losses) + + print((' epoch %i, minibatch %i/%i, test error of ' + 'best model %f %%') % + (epoch, minibatch_index + 1, n_train_batches, + test_score * 100.)) + + if patience <= iter: + done_looping = True + break + + end_time = timeit.default_timer() + print(('Optimization complete. Best validation score of %f %% ' + 'obtained at iteration %i, with test performance %f %%') % + (best_validation_loss * 100., best_iter + 1, test_score * 100.)) + print(('The code for file ' + + os.path.split(__file__)[1] + + ' ran for %.2fm' % ((end_time - start_time) / 60.)), file=sys.stderr) + + +if __name__ == '__main__': + test_mlp() + From ff3f08589fac489e8feb8175c6daedc1913edb2e Mon Sep 17 00:00:00 2001 From: sentient07 Date: Tue, 8 Mar 2016 19:16:01 +0530 Subject: [PATCH 2/4] Added docstrings --- code/dropout.py | 241 ++++++------------------------------------- code/logistic_sgd.py | 39 +++---- 2 files changed, 51 insertions(+), 229 deletions(-) diff --git a/code/dropout.py b/code/dropout.py index 18c1d920..bddcb84a 100644 --- a/code/dropout.py +++ b/code/dropout.py @@ -10,208 +10,9 @@ import theano.tensor.shared_randomstreams import gzip from collections import OrderedDict +from logistic_sgd import LogisticRegression, load_data -class LogisticRegression(object): - """Multi-class Logistic Regression Class - - The logistic regression is fully described by a weight matrix :math:`W` - and bias vector :math:`b`. 
Classification is done by projecting data - points onto a set of hyperplanes, the distance to which is used to - determine a class membership probability. - """ - - def __init__(self, input, n_in, n_out, W=None, b=None): - """ Initialize the parameters of the logistic regression - - :type input: theano.tensor.TensorType - :param input: symbolic variable that describes the input of the - architecture (one minibatch) - - :type n_in: int - :param n_in: number of input units, the dimension of the space in - which the datapoints lie - - :type n_out: int - :param n_out: number of output units, the dimension of the space in - which the labels lie - - """ - # start-snippet-1 - # initialize with 0 the weights W as a matrix of shape (n_in, n_out) - if W is None: - self.W = theano.shared( - value=np.zeros((n_in, n_out), dtype=theano.config.floatX), - name='W') - else: - self.W = W - - # initialize the baises b as a vector of n_out 0s - if b is None: - self.b = theano.shared( - value=np.zeros((n_out,), dtype=theano.config.floatX), - name='b') - else: - self.b = b - - # symbolic expression for computing the matrix of class-membership - # probabilities - # Where: - # W is a matrix where column-k represent the separation hyperplane for - # class-k - # x is a matrix where row-j represents input training sample-j - # b is a vector where element-k represent the free parameter of - # hyperplane-k - self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b) - - # symbolic description of how to compute prediction as class whose - # probability is maximal - self.y_pred = T.argmax(self.p_y_given_x, axis=1) - # end-snippet-1 - - # parameters of the model - self.params = [self.W, self.b] - - # keep track of model input - self.input = input - - def negative_log_likelihood(self, y): - """Return the mean of the negative log-likelihood of the prediction - of this model under a given target distribution. - - .. math:: - - \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) = - \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|} - \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\ - \ell (\theta=\{W,b\}, \mathcal{D}) - - :type y: theano.tensor.TensorType - :param y: corresponds to a vector that gives for each example the - correct label - - Note: we use the mean instead of the sum so that - the learning rate is less dependent on the batch size - """ - # start-snippet-2 - # y.shape[0] is (symbolically) the number of rows in y, i.e., - # number of examples (call it n) in the minibatch - # T.arange(y.shape[0]) is a symbolic vector which will contain - # [0,1,2,... n-1] T.log(self.p_y_given_x) is a matrix of - # Log-Probabilities (call it LP) with one row per example and - # one column per class LP[T.arange(y.shape[0]),y] is a vector - # v containing [LP[0,y[0]], LP[1,y[1]], LP[2,y[2]], ..., - # LP[n-1,y[n-1]]] and T.mean(LP[T.arange(y.shape[0]),y]) is - # the mean (across minibatch examples) of the elements in v, - # i.e., the mean log-likelihood across the minibatch. 
- return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y]) - # end-snippet-2 - - def errors(self, y): - """Return a float representing the number of errors in the minibatch - over the total number of examples of the minibatch ; zero one - loss over the size of the minibatch - - :type y: theano.tensor.TensorType - :param y: corresponds to a vector that gives for each example the - correct label - """ - - # check if y has same dimension of y_pred - if y.ndim != self.y_pred.ndim: - raise TypeError( - 'y should have the same shape as self.y_pred', - ('y', y.type, 'y_pred', self.y_pred.type) - ) - # check if y is of the correct datatype - if y.dtype.startswith('int'): - # the T.neq operator returns a vector of 0s and 1s, where 1 - # represents a mistake in prediction - return T.mean(T.neq(self.y_pred, y)) - else: - raise NotImplementedError() - - -def load_data(dataset): - ''' Loads the dataset - - :type dataset: string - :param dataset: the path to the dataset (here MNIST) - ''' - - ############# - # LOAD DATA # - ############# - - # Download the MNIST dataset if it is not present - data_dir, data_file = os.path.split(dataset) - if data_dir == "" and not os.path.isfile(dataset): - # Check if dataset is in the data directory. - new_path = os.path.join( - os.path.split(__file__)[0], - "..", - "data", - dataset - ) - if os.path.isfile(new_path) or data_file == 'mnist.pkl.gz': - dataset = new_path - - if (not os.path.isfile(dataset)) and data_file == 'mnist.pkl.gz': - from six.moves import urllib - origin = ( - 'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz' - ) - print('Downloading data from %s' % origin) - urllib.request.urlretrieve(origin, dataset) - - print('... loading data') - - # Load the dataset - with gzip.open(dataset, 'rb') as f: - try: - train_set, valid_set, test_set = pickle.load(f, encoding='latin1') - except: - train_set, valid_set, test_set = pickle.load(f) - # train_set, valid_set, test_set format: tuple(input, target) - # input is a numpy.ndarray of 2 dimensions (a matrix) - # where each row corresponds to an example. target is a - # numpy.ndarray of 1 dimension (vector) that has the same length as - # the number of rows in the input. It should give the target - # to the example with the same index in the input. - - def shared_dataset(data_xy, borrow=True): - """ Function that loads the dataset into shared variables - - The reason we store our dataset in shared variables is to allow - Theano to copy it into the GPU memory (when code is run on GPU). - Since copying data into the GPU is slow, copying a minibatch everytime - is needed (the default behaviour if the data is not in a shared - variable) would lead to a large decrease in performance. - """ - data_x, data_y = data_xy - shared_x = theano.shared(np.asarray(data_x, - dtype=theano.config.floatX), - borrow=borrow) - shared_y = theano.shared(np.asarray(data_y, - dtype=theano.config.floatX), - borrow=borrow) - # When storing data on the GPU it has to be stored as floats - # therefore we will store the labels as ``floatX`` as well - # (``shared_y`` does exactly that). But during our computations - # we need them as ints (we use labels as index, and if they are - # floats it doesn't make sense) therefore instead of returning - # ``shared_y`` we will have to cast it to int. 
This little hack - # lets ous get around this issue - return shared_x, T.cast(shared_y, 'int32') - - test_set_x, test_set_y = shared_dataset(test_set) - valid_set_x, valid_set_y = shared_dataset(valid_set) - train_set_x, train_set_y = shared_dataset(train_set) - - rval = [(train_set_x, train_set_y), (valid_set_x, valid_set_y), - (test_set_x, test_set_y)] - return rval - class HiddenLayer(object): def __init__(self, rng, input, n_in, n_out, W=None, b=None, @@ -292,10 +93,33 @@ def _dropsout(rng, layer, p): output = layer*T.cast(mask, theano.config.floatX) return output -class HiddenDropoutLayer(HiddenLayer): +class HiddenDropoutLayer(HiddenLayer): def __init__(self, rng, input, n_in, n_out, activation, dropout_rate, W=None, b=None): + + """ + Dropout layer of a MLP. Weight matrix W is of shape (n_in,n_out) + and the bias vector b is of shape (n_out,). + + :type rng: numpy.random.RandomState + :param rng: a random number generator used to initialize weights + + :type input: theano.tensor.dmatrix + :param input: a symbolic tensor of shape (n_examples, n_in) + + :type n_in: int + :param n_in: dimensionality of input + + :type n_out: int + :param n_out: number of hidden units + + :type activation: theano.Op or function + :param activation: Non linearity to be applied in the hidden + layer + :type dropout_rate: list + :param dropout_rate: array containing probabilities of retaining a unit + """ super(HiddenDropoutLayer, self).__init__( rng=rng, input=input, n_in=n_in, n_out=n_out, W=W, b=b, activation=activation) @@ -331,6 +155,9 @@ def __init__(self, rng, input, n_in, n_hidden, dropout_rates, n_out): :type n_hidden: int :param n_hidden: number of hidden units + :type dropout_rate: list + :param dropout_rate: array containing probabilities of retaining a unit + :type n_out: int :param n_out: number of output units, the dimension of the space in which the labels lie @@ -416,14 +243,6 @@ def test_mlp(learning_rate=0.01, n_epochs=1000, dropout_rates = [0.2, 0.5], :param learning_rate: learning rate used (factor for the stochastic gradient - :type L1_reg: float - :param L1_reg: L1-norm's weight when added to the cost (see - regularization) - - :type L2_reg: float - :param L2_reg: L2-norm's weight when added to the cost (see - regularization) - :type n_epochs: int :param n_epochs: maximal number of epochs to run the optimizer @@ -431,6 +250,8 @@ def test_mlp(learning_rate=0.01, n_epochs=1000, dropout_rates = [0.2, 0.5], :param dataset: the path of the MNIST dataset file from http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz + :type dropout_rate: list + :param dropout_rate: array containing probabilities of retaining a unit """ datasets = load_data(dataset) @@ -516,7 +337,7 @@ def test_mlp(learning_rate=0.01, n_epochs=1000, dropout_rates = [0.2, 0.5], # element is a pair formed from the two lists : # C = [(a1, b1), (a2, b2), (a3, b3), (a4, b4)] #Stochastic Gradient Descent (SGD) updates - + output = dropout_cost updates = OrderedDict() for param, gparam in zip(classifier.params, gparams) : diff --git a/code/logistic_sgd.py b/code/logistic_sgd.py index 9f4427e7..7aa649ff 100644 --- a/code/logistic_sgd.py +++ b/code/logistic_sgd.py @@ -58,8 +58,10 @@ class LogisticRegression(object): determine a class membership probability. """ - def __init__(self, input, n_in, n_out): + def __init__(self, input, n_in, n_out, W=None, b=None): """ Initialize the parameters of the logistic regression + Weight matrix W is of shape (n_in,n_out) + and the bias vector b is of shape (n_out,). 
:type input: theano.tensor.TensorType :param input: symbolic variable that describes the input of the @@ -75,24 +77,23 @@ def __init__(self, input, n_in, n_out): """ # start-snippet-1 - # initialize with 0 the weights W as a matrix of shape (n_in, n_out) - self.W = theano.shared( - value=numpy.zeros( - (n_in, n_out), - dtype=theano.config.floatX - ), - name='W', - borrow=True - ) - # initialize the biases b as a vector of n_out 0s - self.b = theano.shared( - value=numpy.zeros( - (n_out,), - dtype=theano.config.floatX - ), - name='b', - borrow=True - ) + # initialize with 0 the weights W as a matrix of shape (n_in, n_out) if + #the parameter W is None + if W is None: + self.W = theano.shared( + value=numpy.zeros((n_in, n_out), dtype=theano.config.floatX), + name='W') + else: + self.W = W + + # initialize the baises b as a vector of n_out 0s if the parameter b is + #not none + if b is None: + self.b = theano.shared( + value=numpy.zeros((n_out,), dtype=theano.config.floatX), + name='b') + else: + self.b = b # symbolic expression for computing the matrix of class-membership # probabilities From 1022a25d69efc6d8c075347e79ad356508ec37b1 Mon Sep 17 00:00:00 2001 From: sentient07 Date: Mon, 14 Mar 2016 23:47:25 +0530 Subject: [PATCH 3/4] Changed the upscaling style --- code/dropout.py | 134 ++++--------------------------------------- code/logistic_sgd.py | 4 +- 2 files changed, 14 insertions(+), 124 deletions(-) diff --git a/code/dropout.py b/code/dropout.py index bddcb84a..a4ff9c58 100644 --- a/code/dropout.py +++ b/code/dropout.py @@ -11,131 +11,21 @@ import gzip from collections import OrderedDict from logistic_sgd import LogisticRegression, load_data - - - -class HiddenLayer(object): - def __init__(self, rng, input, n_in, n_out, W=None, b=None, - activation=T.tanh): - """ - Typical hidden layer of a MLP: units are fully-connected and have - sigmoidal activation function. Weight matrix W is of shape (n_in,n_out) - and the bias vector b is of shape (n_out,). - - NOTE : The nonlinearity used here is tanh - - Hidden unit activation is given by: tanh(dot(input,W) + b) - - :type rng: numpy.random.RandomState - :param rng: a random number generator used to initialize weights - - :type input: theano.tensor.dmatrix - :param input: a symbolic tensor of shape (n_examples, n_in) - - :type n_in: int - :param n_in: dimensionality of input - - :type n_out: int - :param n_out: number of hidden units - - :type activation: theano.Op or function - :param activation: Non linearity to be applied in the hidden - layer - """ - self.input = input - # end-snippet-1 - - # `W` is initialized with `W_values` which is uniformely sampled - # from sqrt(-6./(n_in+n_hidden)) and sqrt(6./(n_in+n_hidden)) - # for tanh activation function - # the output of uniform if converted using asarray to dtype - # theano.config.floatX so that the code is runable on GPU - # Note : optimal initialization of weights is dependent on the - # activation function used (among other things). - # For example, results presented in [Xavier10] suggest that you - # should use 4 times larger initial weights for sigmoid - # compared to tanh - # We have no info for other function, so we use the same as - # tanh. - if W is None: - W_values = np.asarray( - rng.uniform( - low=-np.sqrt(6. / (n_in + n_out)), - high= np.sqrt(6. 
/ (n_in + n_out)), - size=(n_in, n_out) - ), - dtype=theano.config.floatX - ) - if activation == theano.tensor.nnet.sigmoid: - W_values *= 4 - - W = theano.shared(value=W_values, name='W', borrow=True) - - if b is None: - b_values = np.zeros((n_out,), dtype=theano.config.floatX) - b = theano.shared(value=b_values, name='b', borrow=True) - - self.W = W - self.b = b - - lin_output = T.dot(input, self.W) + self.b - self.output = ( - lin_output if activation is None - else activation(lin_output) - ) - # parameters of the model - self.params = [self.W, self.b] +from mlp import HiddenLayer def _dropsout(rng, layer, p): srng = theano.tensor.shared_randomstreams.RandomStreams(rng.randint(1000)) mask = srng.binomial(n=1, p=1-p, size=layer.shape) output = layer*T.cast(mask, theano.config.floatX) - return output - - -class HiddenDropoutLayer(HiddenLayer): - def __init__(self, rng, input, n_in, n_out, - activation, dropout_rate, W=None, b=None): - - """ - Dropout layer of a MLP. Weight matrix W is of shape (n_in,n_out) - and the bias vector b is of shape (n_out,). - - :type rng: numpy.random.RandomState - :param rng: a random number generator used to initialize weights - - :type input: theano.tensor.dmatrix - :param input: a symbolic tensor of shape (n_examples, n_in) - - :type n_in: int - :param n_in: dimensionality of input - - :type n_out: int - :param n_out: number of hidden units - - :type activation: theano.Op or function - :param activation: Non linearity to be applied in the hidden - layer - :type dropout_rate: list - :param dropout_rate: array containing probabilities of retaining a unit - """ - super(HiddenDropoutLayer, self).__init__( - rng=rng, input=input, n_in=n_in, n_out=n_out, W=W, b=b, - activation=activation) - - self.output = _dropsout(rng, self.output, p=dropout_rate) + return output / (1 - p) -class MLP(object): - """Multi-Layer Perceptron Class +class DropoutMLP(object): + """Multi-Layer Perceptron Class with partial hidden units - A multilayer perceptron is a feedforward artificial neural network model - that has one layer or more of hidden units and nonlinear activations. - Intermediate layers usually have as activation function tanh or the - sigmoid function (defined here by a ``HiddenLayer`` class) while the - top layer is a softmax layer (defined here by a ``LogisticRegression`` - class). + An implementation of Multilayer Perceptron with dropping of hidden units at a probability + given by ```1-dropout_rate```. 
""" def __init__(self, rng, input, n_in, n_hidden, dropout_rates, n_out): @@ -167,11 +57,11 @@ def __init__(self, rng, input, n_in, n_hidden, dropout_rates, n_out): #Dropping out the input layer inp_dropout_layer = _dropsout(rng, input, p=dropout_rates[0]) - self.drop_layer = HiddenDropoutLayer(rng=rng, + self.drop_layer = HiddenLayer(rng=rng, input=inp_dropout_layer, - activation=T.tanh, n_in=n_in, n_out=n_hidden, - dropout_rate=dropout_rates[1]) + activation=T.tanh) + self.drop_layer.output = _dropsout(rng, self.drop_layer.output, p=dropout_rates[1]) # Since we are dealing with a one hidden layer MLP, this will translate @@ -183,7 +73,7 @@ def __init__(self, rng, input, n_in, n_hidden, dropout_rates, n_out): input=input, n_in=n_in, n_out=n_hidden, - W=self.drop_layer.W * (1 - dropout_rates[1]), + W=self.drop_layer.W, b=self.drop_layer.b, activation=T.tanh ) @@ -201,7 +91,7 @@ def __init__(self, rng, input, n_in, n_hidden, dropout_rates, n_out): input=self.hiddenLayer.output, n_in=n_hidden, n_out=n_out, - W=self.drop_output_layer.W * (1 - dropout_rates[-1]), + W=self.drop_output_layer.W, b=self.drop_output_layer.b, ) @@ -280,7 +170,7 @@ def test_mlp(learning_rate=0.01, n_epochs=1000, dropout_rates = [0.2, 0.5], # construct the MLP class - classifier = MLP( + classifier = HiddenMLP( rng=rng, input=x, n_in=28 * 28, diff --git a/code/logistic_sgd.py b/code/logistic_sgd.py index 7aa649ff..0159798b 100644 --- a/code/logistic_sgd.py +++ b/code/logistic_sgd.py @@ -86,8 +86,8 @@ def __init__(self, input, n_in, n_out, W=None, b=None): else: self.W = W - # initialize the baises b as a vector of n_out 0s if the parameter b is - #not none + # initialize the biases b as a vector of n_out 0s if the parameter b is + #none if b is None: self.b = theano.shared( value=numpy.zeros((n_out,), dtype=theano.config.floatX), From 5f814ebeae1c0fb239f930061d2bcec48870b56b Mon Sep 17 00:00:00 2001 From: sentient07 Date: Fri, 1 Apr 2016 04:06:58 +0530 Subject: [PATCH 4/4] Added doc file --- doc/dropout.txt | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 doc/dropout.txt diff --git a/doc/dropout.txt b/doc/dropout.txt new file mode 100644 index 00000000..f8defb7f --- /dev/null +++ b/doc/dropout.txt @@ -0,0 +1,41 @@ +.. index:: Dropout + +.. _dropout: + +Dropout +===================== + +.. note:: + This section assumes the reader has already read through :doc:`mlp`. + Overfitting can be reduced by using dropout to prevent complex co-adaptations + on the training data. The explanation of the model in this section + is based on the MLP model. + Additionally, it uses the following new Theano functions and concepts: + `T.cast`_. + If you intend to run the code on GPU also read `GPU`_. + + +.. note:: + The code for this section is available for download `here`_. + +.. _here: http://deeplearning.net/tutorial/code/dropout.py + +.. _T.cast: http://deeplearning.net/software/theano/library/tensor/basic.html#casting + +.. _GPU: http://deeplearning.net/software/theano/tutorial/using_gpu.html + + +The next architecture we are going to present using Theano is the +single-hidden-layer Multi-Layer Perceptron (MLP) with 20% dropout +to the input data and 50% dropout to the hidden layers. + +The Model ++++++++++ +This model is same as the MLP with the units in the hidden layers dropped +randomly with a probability of 50% and input layer dropped with a +probability of 20%. Before adding the first hidden layer, 20% dropout is applied to +the input data. 
Then, 50% dropout is applied to the output of the hidden layer, which is the
+input to the Logistic Regression layer. Both steps use the ``_dropsout`` function,
+which takes a random number generator, a layer (a symbolic tensor) and the
+probability ``p`` of dropping a unit. The function multiplies the layer by a binary
+mask sampled with retention probability ``1 - p`` and scales the result by
+``1/(1 - p)`` (inverted dropout), so the weights need no rescaling at test time.
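+
+A minimal plain-NumPy sketch of the same computation (illustrative only; the
+``dropout_forward`` helper below is a made-up name and is not part of the
+tutorial code)::
+
+    import numpy as np
+
+    def dropout_forward(x, p, rng):
+        # hypothetical helper, for illustration only
+        # keep each unit with probability 1 - p, drop it otherwise
+        mask = rng.binomial(n=1, p=1 - p, size=x.shape)
+        # zero out the dropped units and rescale the survivors by 1 / (1 - p)
+        # so that the expected activation matches the no-dropout case
+        return x * mask / (1. - p)
+
+    rng = np.random.RandomState(1234)
+    h = rng.uniform(size=(20, 500))            # a fake minibatch of activations
+    h_drop = dropout_forward(h, p=0.5, rng=rng)
+    print(h.mean(), h_drop.mean())             # the two means are close
+
+Because the rescaling is done at training time, the trained weights can be used
+unchanged at test time, which is why this version drops the ``W * (1 - rate)``
+upscaling used in the first revision of the patch.
\ No newline at end of file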