From be0fd9937bdff7f54625bfe25cbb2858d3e19f89 Mon Sep 17 00:00:00 2001 From: sentient07 Date: Tue, 8 Mar 2016 02:10:59 +0530 Subject: [PATCH 1/4] added dropout --- code/dropout.py | 626 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 626 insertions(+) create mode 100644 code/dropout.py diff --git a/code/dropout.py b/code/dropout.py new file mode 100644 index 00000000..18c1d920 --- /dev/null +++ b/code/dropout.py @@ -0,0 +1,626 @@ +from __future__ import print_function + +import numpy as np +import os +import sys +import timeit +import six.moves.cPickle as pickle +import theano +import theano.tensor as T +import theano.tensor.shared_randomstreams +import gzip +from collections import OrderedDict + + +class LogisticRegression(object): + """Multi-class Logistic Regression Class + + The logistic regression is fully described by a weight matrix :math:`W` + and bias vector :math:`b`. Classification is done by projecting data + points onto a set of hyperplanes, the distance to which is used to + determine a class membership probability. + """ + + def __init__(self, input, n_in, n_out, W=None, b=None): + """ Initialize the parameters of the logistic regression + + :type input: theano.tensor.TensorType + :param input: symbolic variable that describes the input of the + architecture (one minibatch) + + :type n_in: int + :param n_in: number of input units, the dimension of the space in + which the datapoints lie + + :type n_out: int + :param n_out: number of output units, the dimension of the space in + which the labels lie + + """ + # start-snippet-1 + # initialize with 0 the weights W as a matrix of shape (n_in, n_out) + if W is None: + self.W = theano.shared( + value=np.zeros((n_in, n_out), dtype=theano.config.floatX), + name='W') + else: + self.W = W + + # initialize the baises b as a vector of n_out 0s + if b is None: + self.b = theano.shared( + value=np.zeros((n_out,), dtype=theano.config.floatX), + name='b') + else: + self.b = b + + # symbolic expression for computing the matrix of class-membership + # probabilities + # Where: + # W is a matrix where column-k represent the separation hyperplane for + # class-k + # x is a matrix where row-j represents input training sample-j + # b is a vector where element-k represent the free parameter of + # hyperplane-k + self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b) + + # symbolic description of how to compute prediction as class whose + # probability is maximal + self.y_pred = T.argmax(self.p_y_given_x, axis=1) + # end-snippet-1 + + # parameters of the model + self.params = [self.W, self.b] + + # keep track of model input + self.input = input + + def negative_log_likelihood(self, y): + """Return the mean of the negative log-likelihood of the prediction + of this model under a given target distribution. + + .. math:: + + \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) = + \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|} + \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\ + \ell (\theta=\{W,b\}, \mathcal{D}) + + :type y: theano.tensor.TensorType + :param y: corresponds to a vector that gives for each example the + correct label + + Note: we use the mean instead of the sum so that + the learning rate is less dependent on the batch size + """ + # start-snippet-2 + # y.shape[0] is (symbolically) the number of rows in y, i.e., + # number of examples (call it n) in the minibatch + # T.arange(y.shape[0]) is a symbolic vector which will contain + # [0,1,2,... 
n-1] T.log(self.p_y_given_x) is a matrix of + # Log-Probabilities (call it LP) with one row per example and + # one column per class LP[T.arange(y.shape[0]),y] is a vector + # v containing [LP[0,y[0]], LP[1,y[1]], LP[2,y[2]], ..., + # LP[n-1,y[n-1]]] and T.mean(LP[T.arange(y.shape[0]),y]) is + # the mean (across minibatch examples) of the elements in v, + # i.e., the mean log-likelihood across the minibatch. + return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y]) + # end-snippet-2 + + def errors(self, y): + """Return a float representing the number of errors in the minibatch + over the total number of examples of the minibatch ; zero one + loss over the size of the minibatch + + :type y: theano.tensor.TensorType + :param y: corresponds to a vector that gives for each example the + correct label + """ + + # check if y has same dimension of y_pred + if y.ndim != self.y_pred.ndim: + raise TypeError( + 'y should have the same shape as self.y_pred', + ('y', y.type, 'y_pred', self.y_pred.type) + ) + # check if y is of the correct datatype + if y.dtype.startswith('int'): + # the T.neq operator returns a vector of 0s and 1s, where 1 + # represents a mistake in prediction + return T.mean(T.neq(self.y_pred, y)) + else: + raise NotImplementedError() + + +def load_data(dataset): + ''' Loads the dataset + + :type dataset: string + :param dataset: the path to the dataset (here MNIST) + ''' + + ############# + # LOAD DATA # + ############# + + # Download the MNIST dataset if it is not present + data_dir, data_file = os.path.split(dataset) + if data_dir == "" and not os.path.isfile(dataset): + # Check if dataset is in the data directory. + new_path = os.path.join( + os.path.split(__file__)[0], + "..", + "data", + dataset + ) + if os.path.isfile(new_path) or data_file == 'mnist.pkl.gz': + dataset = new_path + + if (not os.path.isfile(dataset)) and data_file == 'mnist.pkl.gz': + from six.moves import urllib + origin = ( + 'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz' + ) + print('Downloading data from %s' % origin) + urllib.request.urlretrieve(origin, dataset) + + print('... loading data') + + # Load the dataset + with gzip.open(dataset, 'rb') as f: + try: + train_set, valid_set, test_set = pickle.load(f, encoding='latin1') + except: + train_set, valid_set, test_set = pickle.load(f) + # train_set, valid_set, test_set format: tuple(input, target) + # input is a numpy.ndarray of 2 dimensions (a matrix) + # where each row corresponds to an example. target is a + # numpy.ndarray of 1 dimension (vector) that has the same length as + # the number of rows in the input. It should give the target + # to the example with the same index in the input. + + def shared_dataset(data_xy, borrow=True): + """ Function that loads the dataset into shared variables + + The reason we store our dataset in shared variables is to allow + Theano to copy it into the GPU memory (when code is run on GPU). + Since copying data into the GPU is slow, copying a minibatch everytime + is needed (the default behaviour if the data is not in a shared + variable) would lead to a large decrease in performance. + """ + data_x, data_y = data_xy + shared_x = theano.shared(np.asarray(data_x, + dtype=theano.config.floatX), + borrow=borrow) + shared_y = theano.shared(np.asarray(data_y, + dtype=theano.config.floatX), + borrow=borrow) + # When storing data on the GPU it has to be stored as floats + # therefore we will store the labels as ``floatX`` as well + # (``shared_y`` does exactly that). 
But during our computations + # we need them as ints (we use labels as index, and if they are + # floats it doesn't make sense) therefore instead of returning + # ``shared_y`` we will have to cast it to int. This little hack + # lets ous get around this issue + return shared_x, T.cast(shared_y, 'int32') + + test_set_x, test_set_y = shared_dataset(test_set) + valid_set_x, valid_set_y = shared_dataset(valid_set) + train_set_x, train_set_y = shared_dataset(train_set) + + rval = [(train_set_x, train_set_y), (valid_set_x, valid_set_y), + (test_set_x, test_set_y)] + return rval + + +class HiddenLayer(object): + def __init__(self, rng, input, n_in, n_out, W=None, b=None, + activation=T.tanh): + """ + Typical hidden layer of a MLP: units are fully-connected and have + sigmoidal activation function. Weight matrix W is of shape (n_in,n_out) + and the bias vector b is of shape (n_out,). + + NOTE : The nonlinearity used here is tanh + + Hidden unit activation is given by: tanh(dot(input,W) + b) + + :type rng: numpy.random.RandomState + :param rng: a random number generator used to initialize weights + + :type input: theano.tensor.dmatrix + :param input: a symbolic tensor of shape (n_examples, n_in) + + :type n_in: int + :param n_in: dimensionality of input + + :type n_out: int + :param n_out: number of hidden units + + :type activation: theano.Op or function + :param activation: Non linearity to be applied in the hidden + layer + """ + self.input = input + # end-snippet-1 + + # `W` is initialized with `W_values` which is uniformely sampled + # from sqrt(-6./(n_in+n_hidden)) and sqrt(6./(n_in+n_hidden)) + # for tanh activation function + # the output of uniform if converted using asarray to dtype + # theano.config.floatX so that the code is runable on GPU + # Note : optimal initialization of weights is dependent on the + # activation function used (among other things). + # For example, results presented in [Xavier10] suggest that you + # should use 4 times larger initial weights for sigmoid + # compared to tanh + # We have no info for other function, so we use the same as + # tanh. + if W is None: + W_values = np.asarray( + rng.uniform( + low=-np.sqrt(6. / (n_in + n_out)), + high= np.sqrt(6. / (n_in + n_out)), + size=(n_in, n_out) + ), + dtype=theano.config.floatX + ) + if activation == theano.tensor.nnet.sigmoid: + W_values *= 4 + + W = theano.shared(value=W_values, name='W', borrow=True) + + if b is None: + b_values = np.zeros((n_out,), dtype=theano.config.floatX) + b = theano.shared(value=b_values, name='b', borrow=True) + + self.W = W + self.b = b + + lin_output = T.dot(input, self.W) + self.b + self.output = ( + lin_output if activation is None + else activation(lin_output) + ) + # parameters of the model + self.params = [self.W, self.b] + + +def _dropsout(rng, layer, p): + srng = theano.tensor.shared_randomstreams.RandomStreams(rng.randint(1000)) + mask = srng.binomial(n=1, p=1-p, size=layer.shape) + output = layer*T.cast(mask, theano.config.floatX) + return output + +class HiddenDropoutLayer(HiddenLayer): + + def __init__(self, rng, input, n_in, n_out, + activation, dropout_rate, W=None, b=None): + super(HiddenDropoutLayer, self).__init__( + rng=rng, input=input, n_in=n_in, n_out=n_out, W=W, b=b, + activation=activation) + + self.output = _dropsout(rng, self.output, p=dropout_rate) + + +class MLP(object): + """Multi-Layer Perceptron Class + + A multilayer perceptron is a feedforward artificial neural network model + that has one layer or more of hidden units and nonlinear activations. 
+ Intermediate layers usually have as activation function tanh or the + sigmoid function (defined here by a ``HiddenLayer`` class) while the + top layer is a softmax layer (defined here by a ``LogisticRegression`` + class). + """ + + def __init__(self, rng, input, n_in, n_hidden, dropout_rates, n_out): + """Initialize the parameters for the multilayer perceptron + + :type rng: numpy.random.RandomState + :param rng: a random number generator used to initialize weights + + :type input: theano.tensor.TensorType + :param input: symbolic variable that describes the input of the + architecture (one minibatch) + + :type n_in: int + :param n_in: number of input units, the dimension of the space in + which the datapoints lie + + :type n_hidden: int + :param n_hidden: number of hidden units + + :type n_out: int + :param n_out: number of output units, the dimension of the space in + which the labels lie + + """ + + #Dropping out the input layer + inp_dropout_layer = _dropsout(rng, input, p=dropout_rates[0]) + + self.drop_layer = HiddenDropoutLayer(rng=rng, + input=inp_dropout_layer, + activation=T.tanh, + n_in=n_in, n_out=n_hidden, + dropout_rate=dropout_rates[1]) + + + # Since we are dealing with a one hidden layer MLP, this will translate + # into a HiddenLayer with a tanh activation function connected to the + # LogisticRegression layer; the activation function can be replaced by + # sigmoid or any other nonlinear function + self.hiddenLayer = HiddenLayer( + rng=rng, + input=input, + n_in=n_in, + n_out=n_hidden, + W=self.drop_layer.W * (1 - dropout_rates[1]), + b=self.drop_layer.b, + activation=T.tanh + ) + + + self.drop_output_layer = LogisticRegression( + input=self.drop_layer.output, + n_in=n_hidden, + n_out=n_out) + + + # The logistic regression layer gets as input the hidden units + # of the hidden layer + self.logRegressionLayer = LogisticRegression( + input=self.hiddenLayer.output, + n_in=n_hidden, + n_out=n_out, + W=self.drop_output_layer.W * (1 - dropout_rates[-1]), + b=self.drop_output_layer.b, + ) + + + self.drop_negative_log_likelihood = self.drop_output_layer.negative_log_likelihood + self.dropout_errors = self.drop_output_layer.errors + + # negative log likelihood of the MLP is given by the negative + # log likelihood of the output of the model, computed in the + # logistic regression layer + self.negative_log_likelihood = ( + self.logRegressionLayer.negative_log_likelihood + ) + # same holds for the function computing the number of errors + self.errors = self.logRegressionLayer.errors + + # the parameters of the model are the parameters of the two layer it is + # made out of + self.params = self.drop_layer.params + self.drop_output_layer.params + # end-snippet-3 + + # keep track of model input + self.input = input + + + +# In[36]: + + +def test_mlp(learning_rate=0.01, n_epochs=1000, dropout_rates = [0.2, 0.5], + dataset='mnist.pkl.gz', batch_size=20, n_hidden=500): + """ + Demonstrate stochastic gradient descent optimization for a multilayer + perceptron + + This is demonstrated on MNIST. 
+ + :type learning_rate: float + :param learning_rate: learning rate used (factor for the stochastic + gradient + + :type L1_reg: float + :param L1_reg: L1-norm's weight when added to the cost (see + regularization) + + :type L2_reg: float + :param L2_reg: L2-norm's weight when added to the cost (see + regularization) + + :type n_epochs: int + :param n_epochs: maximal number of epochs to run the optimizer + + :type dataset: string + :param dataset: the path of the MNIST dataset file from + http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz + + + """ + datasets = load_data(dataset) + + train_set_x, train_set_y = datasets[0] + valid_set_x, valid_set_y = datasets[1] + test_set_x, test_set_y = datasets[2] + + # compute number of minibatches for training, validation and testing + n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size + n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size + n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size + + ###################### + # BUILD ACTUAL MODEL # + ###################### + print('... building the model') + + # allocate symbolic variables for the data + index = T.lscalar() # index to a [mini]batch + x = T.matrix('x') # the data is presented as rasterized images + y = T.ivector('y') # the labels are presented as 1D vector of + # [int] labels + + rng = np.random.RandomState(1234) + + + # construct the MLP class + classifier = MLP( + rng=rng, + input=x, + n_in=28 * 28, + n_hidden=n_hidden, + dropout_rates=dropout_rates, + n_out=10 + ) + + # start-snippet-4 + # the cost we minimize during training is the negative log likelihood of + # the model plus the regularization terms (L1 and L2); cost is expressed + # here symbolically + cost = ( + classifier.negative_log_likelihood(y) + ) + dropout_cost = classifier.drop_negative_log_likelihood(y) + + # end-snippet-4 + + # compiling a Theano function that computes the mistakes that are made + # by the model on a minibatch + test_model = theano.function( + inputs=[index], + outputs=classifier.errors(y), + givens={ + x: test_set_x[index * batch_size:(index + 1) * batch_size], + y: test_set_y[index * batch_size:(index + 1) * batch_size] + } + ) + + validate_model = theano.function( + inputs=[index], + outputs=classifier.errors(y), + givens={ + x: valid_set_x[index * batch_size:(index + 1) * batch_size], + y: valid_set_y[index * batch_size:(index + 1) * batch_size] + } + ) + + # start-snippet-5 + # compute the gradient of cost with respect to theta (sotred in params) + # the resulting gradients will be stored in a list gparams + gparams = [] + for param in classifier.params: + #Changing cost for with dropout layer and without + gparam = T.grad(dropout_cost, param) + gparams.append(gparam) + + # specify how to update the parameters of the model as a list of + # (variable, update expression) pairs + + # given two lists of the same length, A = [a1, a2, a3, a4] and + # B = [b1, b2, b3, b4], zip generates a list C of same size, where each + # element is a pair formed from the two lists : + # C = [(a1, b1), (a2, b2), (a3, b3), (a4, b4)] + #Stochastic Gradient Descent (SGD) updates + + output = dropout_cost + updates = OrderedDict() + for param, gparam in zip(classifier.params, gparams) : + updates[param] = param - learning_rate * gparam + + + # compiling a Theano function `train_model` that returns the cost, but + # in the same time updates the parameter of the model based on the rules + # defined in `updates` + train_model = theano.function( + 
inputs=[index], + outputs=cost, + updates=updates, + givens={ + x: train_set_x[index * batch_size: (index + 1) * batch_size], + y: train_set_y[index * batch_size: (index + 1) * batch_size] + } + ) + # end-snippet-5 + + ############### + # TRAIN MODEL # + ############### + print('... training') + + # early-stopping parameters + patience = 10000 # look as this many examples regardless + patience_increase = 2 # wait this much longer when a new best is + # found + improvement_threshold = 0.995 # a relative improvement of this much is + # considered significant + validation_frequency = min(n_train_batches, patience // 2) + # go through this many + # minibatche before checking the network + # on the validation set; in this case we + # check every epoch + + best_validation_loss = np.inf + best_iter = 0 + test_score = 0. + start_time = timeit.default_timer() + + epoch = 0 + done_looping = False + + while (epoch < n_epochs) and (not done_looping): + epoch = epoch + 1 + for minibatch_index in range(n_train_batches): + + minibatch_avg_cost = train_model(minibatch_index) + # iteration number + iter = (epoch - 1) * n_train_batches + minibatch_index + + if (iter + 1) % validation_frequency == 0: + # compute zero-one loss on validation set + validation_losses = [validate_model(i) for i + in range(n_valid_batches)] + this_validation_loss = np.mean(validation_losses) + + print( + 'epoch %i, minibatch %i/%i, validation error %f %%' % + ( + epoch, + minibatch_index + 1, + n_train_batches, + this_validation_loss * 100. + ) + ) + + # if we got the best validation score until now + if this_validation_loss < best_validation_loss: + #improve patience if loss improvement is good enough + if ( + this_validation_loss < best_validation_loss * + improvement_threshold + ): + patience = max(patience, iter * patience_increase) + + best_validation_loss = this_validation_loss + best_iter = iter + + # test it on the test set + test_losses = [test_model(i) for i + in range(n_test_batches)] + test_score = np.mean(test_losses) + + print((' epoch %i, minibatch %i/%i, test error of ' + 'best model %f %%') % + (epoch, minibatch_index + 1, n_train_batches, + test_score * 100.)) + + if patience <= iter: + done_looping = True + break + + end_time = timeit.default_timer() + print(('Optimization complete. Best validation score of %f %% ' + 'obtained at iteration %i, with test performance %f %%') % + (best_validation_loss * 100., best_iter + 1, test_score * 100.)) + print(('The code for file ' + + os.path.split(__file__)[1] + + ' ran for %.2fm' % ((end_time - start_time) / 60.)), file=sys.stderr) + + +if __name__ == '__main__': + test_mlp() + From ff3f08589fac489e8feb8175c6daedc1913edb2e Mon Sep 17 00:00:00 2001 From: sentient07 Date: Tue, 8 Mar 2016 19:16:01 +0530 Subject: [PATCH 2/4] Added docstrings --- code/dropout.py | 241 ++++++------------------------------------- code/logistic_sgd.py | 39 +++---- 2 files changed, 51 insertions(+), 229 deletions(-) diff --git a/code/dropout.py b/code/dropout.py index 18c1d920..bddcb84a 100644 --- a/code/dropout.py +++ b/code/dropout.py @@ -10,208 +10,9 @@ import theano.tensor.shared_randomstreams import gzip from collections import OrderedDict +from logistic_sgd import LogisticRegression, load_data -class LogisticRegression(object): - """Multi-class Logistic Regression Class - - The logistic regression is fully described by a weight matrix :math:`W` - and bias vector :math:`b`. 
Classification is done by projecting data - points onto a set of hyperplanes, the distance to which is used to - determine a class membership probability. - """ - - def __init__(self, input, n_in, n_out, W=None, b=None): - """ Initialize the parameters of the logistic regression - - :type input: theano.tensor.TensorType - :param input: symbolic variable that describes the input of the - architecture (one minibatch) - - :type n_in: int - :param n_in: number of input units, the dimension of the space in - which the datapoints lie - - :type n_out: int - :param n_out: number of output units, the dimension of the space in - which the labels lie - - """ - # start-snippet-1 - # initialize with 0 the weights W as a matrix of shape (n_in, n_out) - if W is None: - self.W = theano.shared( - value=np.zeros((n_in, n_out), dtype=theano.config.floatX), - name='W') - else: - self.W = W - - # initialize the baises b as a vector of n_out 0s - if b is None: - self.b = theano.shared( - value=np.zeros((n_out,), dtype=theano.config.floatX), - name='b') - else: - self.b = b - - # symbolic expression for computing the matrix of class-membership - # probabilities - # Where: - # W is a matrix where column-k represent the separation hyperplane for - # class-k - # x is a matrix where row-j represents input training sample-j - # b is a vector where element-k represent the free parameter of - # hyperplane-k - self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b) - - # symbolic description of how to compute prediction as class whose - # probability is maximal - self.y_pred = T.argmax(self.p_y_given_x, axis=1) - # end-snippet-1 - - # parameters of the model - self.params = [self.W, self.b] - - # keep track of model input - self.input = input - - def negative_log_likelihood(self, y): - """Return the mean of the negative log-likelihood of the prediction - of this model under a given target distribution. - - .. math:: - - \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) = - \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|} - \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\ - \ell (\theta=\{W,b\}, \mathcal{D}) - - :type y: theano.tensor.TensorType - :param y: corresponds to a vector that gives for each example the - correct label - - Note: we use the mean instead of the sum so that - the learning rate is less dependent on the batch size - """ - # start-snippet-2 - # y.shape[0] is (symbolically) the number of rows in y, i.e., - # number of examples (call it n) in the minibatch - # T.arange(y.shape[0]) is a symbolic vector which will contain - # [0,1,2,... n-1] T.log(self.p_y_given_x) is a matrix of - # Log-Probabilities (call it LP) with one row per example and - # one column per class LP[T.arange(y.shape[0]),y] is a vector - # v containing [LP[0,y[0]], LP[1,y[1]], LP[2,y[2]], ..., - # LP[n-1,y[n-1]]] and T.mean(LP[T.arange(y.shape[0]),y]) is - # the mean (across minibatch examples) of the elements in v, - # i.e., the mean log-likelihood across the minibatch. 
- return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y]) - # end-snippet-2 - - def errors(self, y): - """Return a float representing the number of errors in the minibatch - over the total number of examples of the minibatch ; zero one - loss over the size of the minibatch - - :type y: theano.tensor.TensorType - :param y: corresponds to a vector that gives for each example the - correct label - """ - - # check if y has same dimension of y_pred - if y.ndim != self.y_pred.ndim: - raise TypeError( - 'y should have the same shape as self.y_pred', - ('y', y.type, 'y_pred', self.y_pred.type) - ) - # check if y is of the correct datatype - if y.dtype.startswith('int'): - # the T.neq operator returns a vector of 0s and 1s, where 1 - # represents a mistake in prediction - return T.mean(T.neq(self.y_pred, y)) - else: - raise NotImplementedError() - - -def load_data(dataset): - ''' Loads the dataset - - :type dataset: string - :param dataset: the path to the dataset (here MNIST) - ''' - - ############# - # LOAD DATA # - ############# - - # Download the MNIST dataset if it is not present - data_dir, data_file = os.path.split(dataset) - if data_dir == "" and not os.path.isfile(dataset): - # Check if dataset is in the data directory. - new_path = os.path.join( - os.path.split(__file__)[0], - "..", - "data", - dataset - ) - if os.path.isfile(new_path) or data_file == 'mnist.pkl.gz': - dataset = new_path - - if (not os.path.isfile(dataset)) and data_file == 'mnist.pkl.gz': - from six.moves import urllib - origin = ( - 'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz' - ) - print('Downloading data from %s' % origin) - urllib.request.urlretrieve(origin, dataset) - - print('... loading data') - - # Load the dataset - with gzip.open(dataset, 'rb') as f: - try: - train_set, valid_set, test_set = pickle.load(f, encoding='latin1') - except: - train_set, valid_set, test_set = pickle.load(f) - # train_set, valid_set, test_set format: tuple(input, target) - # input is a numpy.ndarray of 2 dimensions (a matrix) - # where each row corresponds to an example. target is a - # numpy.ndarray of 1 dimension (vector) that has the same length as - # the number of rows in the input. It should give the target - # to the example with the same index in the input. - - def shared_dataset(data_xy, borrow=True): - """ Function that loads the dataset into shared variables - - The reason we store our dataset in shared variables is to allow - Theano to copy it into the GPU memory (when code is run on GPU). - Since copying data into the GPU is slow, copying a minibatch everytime - is needed (the default behaviour if the data is not in a shared - variable) would lead to a large decrease in performance. - """ - data_x, data_y = data_xy - shared_x = theano.shared(np.asarray(data_x, - dtype=theano.config.floatX), - borrow=borrow) - shared_y = theano.shared(np.asarray(data_y, - dtype=theano.config.floatX), - borrow=borrow) - # When storing data on the GPU it has to be stored as floats - # therefore we will store the labels as ``floatX`` as well - # (``shared_y`` does exactly that). But during our computations - # we need them as ints (we use labels as index, and if they are - # floats it doesn't make sense) therefore instead of returning - # ``shared_y`` we will have to cast it to int. 
This little hack - # lets ous get around this issue - return shared_x, T.cast(shared_y, 'int32') - - test_set_x, test_set_y = shared_dataset(test_set) - valid_set_x, valid_set_y = shared_dataset(valid_set) - train_set_x, train_set_y = shared_dataset(train_set) - - rval = [(train_set_x, train_set_y), (valid_set_x, valid_set_y), - (test_set_x, test_set_y)] - return rval - class HiddenLayer(object): def __init__(self, rng, input, n_in, n_out, W=None, b=None, @@ -292,10 +93,33 @@ def _dropsout(rng, layer, p): output = layer*T.cast(mask, theano.config.floatX) return output -class HiddenDropoutLayer(HiddenLayer): +class HiddenDropoutLayer(HiddenLayer): def __init__(self, rng, input, n_in, n_out, activation, dropout_rate, W=None, b=None): + + """ + Dropout layer of a MLP. Weight matrix W is of shape (n_in,n_out) + and the bias vector b is of shape (n_out,). + + :type rng: numpy.random.RandomState + :param rng: a random number generator used to initialize weights + + :type input: theano.tensor.dmatrix + :param input: a symbolic tensor of shape (n_examples, n_in) + + :type n_in: int + :param n_in: dimensionality of input + + :type n_out: int + :param n_out: number of hidden units + + :type activation: theano.Op or function + :param activation: Non linearity to be applied in the hidden + layer + :type dropout_rate: list + :param dropout_rate: array containing probabilities of retaining a unit + """ super(HiddenDropoutLayer, self).__init__( rng=rng, input=input, n_in=n_in, n_out=n_out, W=W, b=b, activation=activation) @@ -331,6 +155,9 @@ def __init__(self, rng, input, n_in, n_hidden, dropout_rates, n_out): :type n_hidden: int :param n_hidden: number of hidden units + :type dropout_rate: list + :param dropout_rate: array containing probabilities of retaining a unit + :type n_out: int :param n_out: number of output units, the dimension of the space in which the labels lie @@ -416,14 +243,6 @@ def test_mlp(learning_rate=0.01, n_epochs=1000, dropout_rates = [0.2, 0.5], :param learning_rate: learning rate used (factor for the stochastic gradient - :type L1_reg: float - :param L1_reg: L1-norm's weight when added to the cost (see - regularization) - - :type L2_reg: float - :param L2_reg: L2-norm's weight when added to the cost (see - regularization) - :type n_epochs: int :param n_epochs: maximal number of epochs to run the optimizer @@ -431,6 +250,8 @@ def test_mlp(learning_rate=0.01, n_epochs=1000, dropout_rates = [0.2, 0.5], :param dataset: the path of the MNIST dataset file from http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz + :type dropout_rate: list + :param dropout_rate: array containing probabilities of retaining a unit """ datasets = load_data(dataset) @@ -516,7 +337,7 @@ def test_mlp(learning_rate=0.01, n_epochs=1000, dropout_rates = [0.2, 0.5], # element is a pair formed from the two lists : # C = [(a1, b1), (a2, b2), (a3, b3), (a4, b4)] #Stochastic Gradient Descent (SGD) updates - + output = dropout_cost updates = OrderedDict() for param, gparam in zip(classifier.params, gparams) : diff --git a/code/logistic_sgd.py b/code/logistic_sgd.py index 9f4427e7..7aa649ff 100644 --- a/code/logistic_sgd.py +++ b/code/logistic_sgd.py @@ -58,8 +58,10 @@ class LogisticRegression(object): determine a class membership probability. """ - def __init__(self, input, n_in, n_out): + def __init__(self, input, n_in, n_out, W=None, b=None): """ Initialize the parameters of the logistic regression + Weight matrix W is of shape (n_in,n_out) + and the bias vector b is of shape (n_out,). 
:type input: theano.tensor.TensorType :param input: symbolic variable that describes the input of the @@ -75,24 +77,23 @@ def __init__(self, input, n_in, n_out): """ # start-snippet-1 - # initialize with 0 the weights W as a matrix of shape (n_in, n_out) - self.W = theano.shared( - value=numpy.zeros( - (n_in, n_out), - dtype=theano.config.floatX - ), - name='W', - borrow=True - ) - # initialize the biases b as a vector of n_out 0s - self.b = theano.shared( - value=numpy.zeros( - (n_out,), - dtype=theano.config.floatX - ), - name='b', - borrow=True - ) + # initialize with 0 the weights W as a matrix of shape (n_in, n_out) if + #the parameter W is None + if W is None: + self.W = theano.shared( + value=numpy.zeros((n_in, n_out), dtype=theano.config.floatX), + name='W') + else: + self.W = W + + # initialize the baises b as a vector of n_out 0s if the parameter b is + #not none + if b is None: + self.b = theano.shared( + value=numpy.zeros((n_out,), dtype=theano.config.floatX), + name='b') + else: + self.b = b # symbolic expression for computing the matrix of class-membership # probabilities From 1022a25d69efc6d8c075347e79ad356508ec37b1 Mon Sep 17 00:00:00 2001 From: sentient07 Date: Mon, 14 Mar 2016 23:47:25 +0530 Subject: [PATCH 3/4] Changed the upscaling style --- code/dropout.py | 134 ++++--------------------------------------- code/logistic_sgd.py | 4 +- 2 files changed, 14 insertions(+), 124 deletions(-) diff --git a/code/dropout.py b/code/dropout.py index bddcb84a..a4ff9c58 100644 --- a/code/dropout.py +++ b/code/dropout.py @@ -11,131 +11,21 @@ import gzip from collections import OrderedDict from logistic_sgd import LogisticRegression, load_data - - - -class HiddenLayer(object): - def __init__(self, rng, input, n_in, n_out, W=None, b=None, - activation=T.tanh): - """ - Typical hidden layer of a MLP: units are fully-connected and have - sigmoidal activation function. Weight matrix W is of shape (n_in,n_out) - and the bias vector b is of shape (n_out,). - - NOTE : The nonlinearity used here is tanh - - Hidden unit activation is given by: tanh(dot(input,W) + b) - - :type rng: numpy.random.RandomState - :param rng: a random number generator used to initialize weights - - :type input: theano.tensor.dmatrix - :param input: a symbolic tensor of shape (n_examples, n_in) - - :type n_in: int - :param n_in: dimensionality of input - - :type n_out: int - :param n_out: number of hidden units - - :type activation: theano.Op or function - :param activation: Non linearity to be applied in the hidden - layer - """ - self.input = input - # end-snippet-1 - - # `W` is initialized with `W_values` which is uniformely sampled - # from sqrt(-6./(n_in+n_hidden)) and sqrt(6./(n_in+n_hidden)) - # for tanh activation function - # the output of uniform if converted using asarray to dtype - # theano.config.floatX so that the code is runable on GPU - # Note : optimal initialization of weights is dependent on the - # activation function used (among other things). - # For example, results presented in [Xavier10] suggest that you - # should use 4 times larger initial weights for sigmoid - # compared to tanh - # We have no info for other function, so we use the same as - # tanh. - if W is None: - W_values = np.asarray( - rng.uniform( - low=-np.sqrt(6. / (n_in + n_out)), - high= np.sqrt(6. 
/ (n_in + n_out)), - size=(n_in, n_out) - ), - dtype=theano.config.floatX - ) - if activation == theano.tensor.nnet.sigmoid: - W_values *= 4 - - W = theano.shared(value=W_values, name='W', borrow=True) - - if b is None: - b_values = np.zeros((n_out,), dtype=theano.config.floatX) - b = theano.shared(value=b_values, name='b', borrow=True) - - self.W = W - self.b = b - - lin_output = T.dot(input, self.W) + self.b - self.output = ( - lin_output if activation is None - else activation(lin_output) - ) - # parameters of the model - self.params = [self.W, self.b] +from mlp import HiddenLayer def _dropsout(rng, layer, p): srng = theano.tensor.shared_randomstreams.RandomStreams(rng.randint(1000)) mask = srng.binomial(n=1, p=1-p, size=layer.shape) output = layer*T.cast(mask, theano.config.floatX) - return output - - -class HiddenDropoutLayer(HiddenLayer): - def __init__(self, rng, input, n_in, n_out, - activation, dropout_rate, W=None, b=None): - - """ - Dropout layer of a MLP. Weight matrix W is of shape (n_in,n_out) - and the bias vector b is of shape (n_out,). - - :type rng: numpy.random.RandomState - :param rng: a random number generator used to initialize weights - - :type input: theano.tensor.dmatrix - :param input: a symbolic tensor of shape (n_examples, n_in) - - :type n_in: int - :param n_in: dimensionality of input - - :type n_out: int - :param n_out: number of hidden units - - :type activation: theano.Op or function - :param activation: Non linearity to be applied in the hidden - layer - :type dropout_rate: list - :param dropout_rate: array containing probabilities of retaining a unit - """ - super(HiddenDropoutLayer, self).__init__( - rng=rng, input=input, n_in=n_in, n_out=n_out, W=W, b=b, - activation=activation) - - self.output = _dropsout(rng, self.output, p=dropout_rate) + return output / (1 - p) -class MLP(object): - """Multi-Layer Perceptron Class +class DropoutMLP(object): + """Multi-Layer Perceptron Class with partial hidden units - A multilayer perceptron is a feedforward artificial neural network model - that has one layer or more of hidden units and nonlinear activations. - Intermediate layers usually have as activation function tanh or the - sigmoid function (defined here by a ``HiddenLayer`` class) while the - top layer is a softmax layer (defined here by a ``LogisticRegression`` - class). + An implementation of Multilayer Perceptron with dropping of hidden units at a probability + given by ```1-dropout_rate```. 
""" def __init__(self, rng, input, n_in, n_hidden, dropout_rates, n_out): @@ -167,11 +57,11 @@ def __init__(self, rng, input, n_in, n_hidden, dropout_rates, n_out): #Dropping out the input layer inp_dropout_layer = _dropsout(rng, input, p=dropout_rates[0]) - self.drop_layer = HiddenDropoutLayer(rng=rng, + self.drop_layer = HiddenLayer(rng=rng, input=inp_dropout_layer, - activation=T.tanh, n_in=n_in, n_out=n_hidden, - dropout_rate=dropout_rates[1]) + activation=T.tanh) + self.drop_layer.output = _dropsout(rng, self.drop_layer.output, p=dropout_rates[1]) # Since we are dealing with a one hidden layer MLP, this will translate @@ -183,7 +73,7 @@ def __init__(self, rng, input, n_in, n_hidden, dropout_rates, n_out): input=input, n_in=n_in, n_out=n_hidden, - W=self.drop_layer.W * (1 - dropout_rates[1]), + W=self.drop_layer.W, b=self.drop_layer.b, activation=T.tanh ) @@ -201,7 +91,7 @@ def __init__(self, rng, input, n_in, n_hidden, dropout_rates, n_out): input=self.hiddenLayer.output, n_in=n_hidden, n_out=n_out, - W=self.drop_output_layer.W * (1 - dropout_rates[-1]), + W=self.drop_output_layer.W, b=self.drop_output_layer.b, ) @@ -280,7 +170,7 @@ def test_mlp(learning_rate=0.01, n_epochs=1000, dropout_rates = [0.2, 0.5], # construct the MLP class - classifier = MLP( + classifier = HiddenMLP( rng=rng, input=x, n_in=28 * 28, diff --git a/code/logistic_sgd.py b/code/logistic_sgd.py index 7aa649ff..0159798b 100644 --- a/code/logistic_sgd.py +++ b/code/logistic_sgd.py @@ -86,8 +86,8 @@ def __init__(self, input, n_in, n_out, W=None, b=None): else: self.W = W - # initialize the baises b as a vector of n_out 0s if the parameter b is - #not none + # initialize the biases b as a vector of n_out 0s if the parameter b is + #none if b is None: self.b = theano.shared( value=numpy.zeros((n_out,), dtype=theano.config.floatX), From 5f814ebeae1c0fb239f930061d2bcec48870b56b Mon Sep 17 00:00:00 2001 From: sentient07 Date: Fri, 1 Apr 2016 04:06:58 +0530 Subject: [PATCH 4/4] Added doc file --- doc/dropout.txt | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 doc/dropout.txt diff --git a/doc/dropout.txt b/doc/dropout.txt new file mode 100644 index 00000000..f8defb7f --- /dev/null +++ b/doc/dropout.txt @@ -0,0 +1,41 @@ +.. index:: Dropout + +.. _dropout: + +Dropout +===================== + +.. note:: + This section assumes the reader has already read through :doc:`mlp`. + Overfitting can be reduced by using dropout to prevent complex co-adaptations + on the training data. The explanation of the model in this section + is based on the MLP model. + Additionally, it uses the following new Theano functions and concepts: + `T.cast`_. + If you intend to run the code on GPU also read `GPU`_. + + +.. note:: + The code for this section is available for download `here`_. + +.. _here: http://deeplearning.net/tutorial/code/dropout.py + +.. _T.cast: http://deeplearning.net/software/theano/library/tensor/basic.html#casting + +.. _GPU: http://deeplearning.net/software/theano/tutorial/using_gpu.html + + +The next architecture we are going to present using Theano is the +single-hidden-layer Multi-Layer Perceptron (MLP) with 20% dropout +to the input data and 50% dropout to the hidden layers. + +The Model ++++++++++ +This model is same as the MLP with the units in the hidden layers dropped +randomly with a probability of 50% and input layer dropped with a +probability of 20%. Before adding the first hidden layer, 20% dropout is applied to +the input data. 
Then, 50% dropout is applied to the output of the hidden layer, which is the
+input to the Logistic Regression layer. Both steps use the ``_dropsout`` function,
+which takes a random number generator, a layer (a symbolic tensor) and the
+probability ``p`` of dropping a unit. The function multiplies the layer by a binary
+mask sampled with retention probability ``1 - p`` and scales the result by
+``1/(1 - p)`` (inverted dropout), so the weights need no rescaling at test time.
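+
+A minimal plain-NumPy sketch of the same computation (illustrative only; the
+``dropout_forward`` helper below is a made-up name and is not part of the
+tutorial code)::
+
+    import numpy as np
+
+    def dropout_forward(x, p, rng):
+        # hypothetical helper, for illustration only
+        # keep each unit with probability 1 - p, drop it otherwise
+        mask = rng.binomial(n=1, p=1 - p, size=x.shape)
+        # zero out the dropped units and rescale the survivors by 1 / (1 - p)
+        # so that the expected activation matches the no-dropout case
+        return x * mask / (1. - p)
+
+    rng = np.random.RandomState(1234)
+    h = rng.uniform(size=(20, 500))            # a fake minibatch of activations
+    h_drop = dropout_forward(h, p=0.5, rng=rng)
+    print(h.mean(), h_drop.mean())             # the two means are close
+
+Because the rescaling is done at training time, the trained weights can be used
+unchanged at test time, which is why this version drops the ``W * (1 - rate)``
+upscaling used in the first revision of the patch.
\ No newline at end of file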