diff --git a/mlp/costs.py b/mlp/costs.py
new file mode 100644
index 0000000..942920a
--- /dev/null
+++ b/mlp/costs.py
@@ -0,0 +1,64 @@
+# Machine Learning Practical (INFR11119),
+# Pawel Swietojanski, University of Edinburgh
+
+
+import numpy
+
+
+class Cost(object):
+    """
+    Defines an interface for the cost object
+    """
+    def cost(self, y, t, **kwargs):
+        """
+        Implements a cost for monitoring purposes
+        :param y: matrix -- an output of the model
+        :param t: matrix -- the expected output the model should produce
+        :param kwargs: -- some optional parameters required by the cost
+        :return: the scalar value representing the cost given y and t
+        """
+        raise NotImplementedError()
+
+    def grad(self, y, t, **kwargs):
+        """
+        Implements a gradient of the cost w.r.t y
+        :param y: matrix -- an output of the model
+        :param t: matrix -- the expected output the model should produce
+        :param kwargs: -- some optional parameters required by the cost
+        :return: matrix -- the gradient of the cost w.r.t y
+        """
+        raise NotImplementedError()
+
+    def get_name(self):
+        return 'cost'
+
+
+class MSECost(Cost):
+    def cost(self, y, t, **kwargs):
+        se = 0.5*numpy.sum((y - t)**2, axis=1)
+        return numpy.mean(se)
+
+    def grad(self, y, t, **kwargs):
+        return y - t
+
+    def get_name(self):
+        return 'mse'
+
+
+class CECost(Cost):
+    """
+    Cross Entropy (Negative log-likelihood) cost for multiple classes
+    """
+    def cost(self, y, t, **kwargs):
+        # assumes t is 1-of-K coded and y is a softmax
+        # transformed estimate at the output layer
+        nll = t * numpy.log(y)
+        return -numpy.mean(numpy.sum(nll, axis=1))
+
+    def grad(self, y, t, **kwargs):
+        # assumes t is 1-of-K coded and y is a softmax transformed estimate
+        # at the output layer, in which case the combined softmax + cross-entropy
+        # gradient w.r.t the pre-softmax activations simplifies to y - t
+        return y - t
+
+    def get_name(self):
+        return 'ce'
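
# A toy usage sketch of the cost interface above (illustrative only, not part
# of the diff). The 3x2 arrays are made up; `y` is assumed to already be
# softmax-normalised and `t` to be 1-of-K coded.
import numpy
from mlp.costs import CECost

cost = CECost()
y = numpy.array([[0.7, 0.3],
                 [0.2, 0.8],
                 [0.9, 0.1]])    # model outputs, each row sums to 1
t = numpy.array([[1.0, 0.0],
                 [0.0, 1.0],
                 [1.0, 0.0]])    # 1-of-K targets

nll = cost.cost(y, t)            # mean negative log-likelihood over the batch
cost_grad = cost.grad(y, t)      # y - t, the error signal handed to MLP.bprop
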
diff --git a/mlp/layers.py b/mlp/layers.py
new file mode 100644
index 0000000..c511792
--- /dev/null
+++ b/mlp/layers.py
@@ -0,0 +1,281 @@
+
+# Machine Learning Practical (INFR11119),
+# Pawel Swietojanski, University of Edinburgh
+
+import numpy
+import logging
+from mlp.costs import Cost
+
+
+logger = logging.getLogger(__name__)
+
+
+class MLP(object):
+    """
+    This is a container for an arbitrary sequence of other transforms.
+    On top of this, the class also keeps the state of the model, i.e.
+    the result of forward (activations) and backward (deltas) passes
+    through the model (for a mini-batch), which is required to compute
+    the gradients for the parameters.
+    """
+    def __init__(self, cost):
+
+        assert isinstance(cost, Cost), (
+            "Cost needs to be of type mlp.costs.Cost, got %s" % type(cost)
+        )
+
+        self.layers = []       # the actual list of network layers
+        self.activations = []  # keeps forward-pass activations (h from the equations)
+                               # for a given mini-batch (the input features sit at index 0)
+        self.deltas = []       # keeps back-propagated error signals (deltas from the equations)
+                               # for a given mini-batch and each layer
+        self.cost = cost
+
+    def fprop(self, x):
+        """
+        :param x: mini-batch of data-points x
+        :return: y (top layer activation) which is an estimate of y given x
+        """
+
+        if len(self.activations) != len(self.layers) + 1:
+            self.activations = [None]*(len(self.layers) + 1)
+
+        self.activations[0] = x
+        for i in xrange(0, len(self.layers)):
+            self.activations[i+1] = self.layers[i].fprop(self.activations[i])
+        return self.activations[-1]
+
+    def bprop(self, cost_grad):
+        """
+        :param cost_grad: matrix -- grad of the cost w.r.t y
+        :return: None, the deltas are kept in the model
+        """
+
+        # allocate the list of deltas for each layer
+        # note, we do not use all of those fields but
+        # want to keep it aligned 1:1 with activations,
+        # which will simplify indexing later on when
+        # computing grads w.r.t parameters
+        if len(self.deltas) != len(self.activations):
+            self.deltas = [None]*len(self.activations)
+
+        # treat the top layer in a special way, as it deals with the
+        # cost, which may lead to some simplifications
+        top_layer_idx = len(self.layers)
+        self.deltas[top_layer_idx], ograds = self.layers[top_layer_idx - 1].\
+            bprop_cost(self.activations[top_layer_idx], cost_grad, self.cost)
+
+        # then back-prop through the remaining layers
+        for i in xrange(top_layer_idx - 1, 0, -1):
+            self.deltas[i], ograds = self.layers[i - 1].\
+                bprop(self.activations[i], ograds)
+
+    def add_layer(self, layer):
+        self.layers.append(layer)
+
+    def set_layers(self, layers):
+        self.layers = layers
+
+    def get_name(self):
+        return 'mlp'
+
+
+class Layer(object):
+    """
+    Abstract class defining an interface for
+    other transforms.
+    """
+    def __init__(self, rng=None):
+
+        if rng is None:
+            seed = [2015, 10, 1]
+            self.rng = numpy.random.RandomState(seed)
+        else:
+            self.rng = rng
+
+    def fprop(self, inputs):
+        """
+        Implements a forward propagation through the i-th layer, that is
+        some form of:
+            a^i = xW^i + b^i
+            h^i = f^i(a^i)
+        with f^i, W^i, b^i denoting a non-linearity, weight matrix and
+        biases at the i-th layer, respectively, and x denoting inputs.
+
+        :param inputs: matrix of features (x) or the output of the previous layer h^{i-1}
+        :return: h^i, matrix of features transformed by the layer
+        """
+        raise NotImplementedError()
+
+    def bprop(self, h, igrads):
+        """
+        Implements a backward propagation through the layer, that is, given
+        h^i denotes the output of the layer and x^i its input, we compute
+        dh^i/dx^i, which by the chain rule is dh^i/da^i da^i/dx^i.
+        x^i could be either features (x) or the output of the lower layer h^{i-1}
+        :param h: the activation produced in the forward pass
+        :param igrads: error signal (or gradient) flowing into the layer; note that
+            in the general case this does not correspond to the 'deltas' used to update
+            the layer's parameters -- to get the deltas one needs to multiply it by
+            the dh^i/da^i derivative
+        :return: a tuple (deltas, ograds) where:
+            deltas = igrads * dh^i/da^i
+            ograds = deltas \times da^i/dx^i
+        """
+        raise NotImplementedError()
+
+    def bprop_cost(self, h, igrads, cost=None):
+        """
+        Implements a backward propagation in the case where the layer directly
+        deals with the optimised cost (i.e. the top layer).
+        By default, the method should implement back-prop for the default cost, that is
+        the one that is natural to the layer's output, i.e.:
+        linear -> mse, softmax -> cross-entropy, sigmoid -> binary cross-entropy
+        :param h: the activation produced in the forward pass
+        :param igrads: error signal (or gradient) flowing into the layer; note that
+            in the general case this does not correspond to the 'deltas' used to update
+            the layer's parameters -- to get the deltas one needs to multiply it by
+            the dh^i/da^i derivative
+        :return: a tuple (deltas, ograds) where:
+            deltas = igrads * dh^i/da^i
+            ograds = deltas \times da^i/dx^i
+        """
+
+        raise NotImplementedError()
+
+    def pgrads(self, inputs, deltas):
+        """
+        Return gradients w.r.t parameters
+        """
+        raise NotImplementedError()
+
+    def get_params(self):
+        raise NotImplementedError()
+
+    def set_params(self, params):
+        raise NotImplementedError()
+
+    def get_name(self):
+        return 'abstract_layer'
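
# A hypothetical example (not part of the diff) of how a concrete non-linear
# transform could implement the Layer interface above. It reuses the affine
# part of the Linear class defined further down in this file; this is only an
# illustrative sketch, not necessarily how the coursework implements Sigmoid.
import numpy
from mlp.layers import Linear


class Sigmoid(Linear):
    """Affine transform followed by an element-wise logistic sigmoid."""

    def fprop(self, inputs):
        # a^i = x W^i + b^i (affine part from Linear), then h^i = 1/(1 + exp(-a^i))
        a = super(Sigmoid, self).fprop(inputs)
        return 1.0 / (1.0 + numpy.exp(-a))

    def bprop(self, h, igrads):
        # dh^i/da^i = h^i * (1 - h^i) for the logistic sigmoid, hence
        # deltas = igrads * dh^i/da^i and ograds = deltas W^T
        deltas = igrads * h * (1.0 - h)
        ograds = numpy.dot(deltas, self.W.T)
        return deltas, ograds

    def get_name(self):
        return 'sigmoid'
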
+
+
+class Linear(Layer):
+
+    def __init__(self, idim, odim,
+                 rng=None,
+                 irange=0.1):
+
+        super(Linear, self).__init__(rng=rng)
+
+        self.idim = idim
+        self.odim = odim
+
+        self.W = self.rng.uniform(
+            -irange, irange,
+            (self.idim, self.odim))
+
+        self.b = numpy.zeros((self.odim,), dtype=numpy.float32)
+
+    def fprop(self, inputs):
+        """
+        Implements a forward propagation through the i-th layer, that is
+        some form of:
+            a^i = xW^i + b^i
+            h^i = f^i(a^i)
+        with f^i, W^i, b^i denoting a non-linearity, weight matrix and
+        biases of this (i-th) layer, respectively, and x denoting inputs.
+
+        :param inputs: matrix of features (x) or the output of the previous layer h^{i-1}
+        :return: h^i, matrix of features transformed by the layer
+        """
+        a = numpy.dot(inputs, self.W) + self.b
+        # here f() is an identity function, so just return a linear transformation
+        return a
+
+    def bprop(self, h, igrads):
+        """
+        Implements a backward propagation through the layer, that is, given
+        h^i denotes the output of the layer and x^i its input, we compute
+        dh^i/dx^i, which by the chain rule is dh^i/da^i da^i/dx^i.
+        x^i could be either features (x) or the output of the lower layer h^{i-1}
+        :param h: the activation produced in the forward pass
+        :param igrads: error signal (or gradient) flowing into the layer; note that
+            in the general case this does not correspond to the 'deltas' used to update
+            the layer's parameters -- to get the deltas one needs to multiply it by
+            the dh^i/da^i derivative
+        :return: a tuple (deltas, ograds) where:
+            deltas = igrads * dh^i/da^i
+            ograds = deltas \times da^i/dx^i
+        """
+
+        # since df^i/da^i = 1 (f is assumed to be the identity function),
+        # the deltas are in fact the same as the igrads
+        ograds = numpy.dot(igrads, self.W.T)
+        return igrads, ograds
+
+    def bprop_cost(self, h, igrads, cost):
+        """
+        Implements a backward propagation in the case where the layer directly
+        deals with the optimised cost (i.e. the top layer).
+        By default, the method should implement bprop for the default cost, that is
+        the one that is natural to the layer's output; here we implement the
+        linear -> mse scenario.
+        :param h: the activation produced in the forward pass
+        :param igrads: error signal (or gradient) flowing into the layer; note that
+            in the general case this does not correspond to the 'deltas' used to update
+            the layer's parameters -- to get the deltas one needs to multiply it by
+            the dh^i/da^i derivative
+        :param cost: mlp.costs.Cost instance defining the cost being optimised
+        :return: a tuple (deltas, ograds) where:
+            deltas = igrads * dh^i/da^i
+            ograds = deltas \times da^i/dx^i
+        """
+
+        if cost is None or cost.get_name() == 'mse':
+            # for a linear layer and the mean squared error cost,
+            # cost back-prop is the same as standard back-prop
+            return self.bprop(h, igrads)
+        else:
+            raise NotImplementedError('Linear.bprop_cost method not implemented '
+                                      'for the %s cost' % cost.get_name())
+
+    def pgrads(self, inputs, deltas):
+        """
+        Return gradients w.r.t parameters
+
+        :param inputs: input to the i-th layer
+        :param deltas: deltas computed in the bprop stage up to the i-th layer
+        :return: list of grads w.r.t parameters dE/dW and dE/db in *exactly*
+            the same order as the params are returned by get_params()
+
+        Note: deltas here contain the whole chain rule leading
+        from the cost down to the i-th layer, i.e.
+            dE/dy^L dy^L/da^L da^L/dh^{L-1} dh^{L-1}/da^{L-1} ... dh^i/da^i
+        and here we are just asking about
+            1) da^i/dW^i and 2) da^i/db^i
+        since W and b are the layer's only parameters
+        """
+
+        grad_W = numpy.dot(inputs.T, deltas)
+        grad_b = numpy.sum(deltas, axis=0)
+
+        return [grad_W, grad_b]
+
+    def get_params(self):
+        return [self.W, self.b]
+
+    def set_params(self, params):
+        # we do not make checks here, but the order in the list
+        # is assumed to be exactly the same as get_params() returns
+        self.W = params[0]
+        self.b = params[1]
+
+    def get_name(self):
+        return 'linear'
+
+
+
+
+
+
\ No newline at end of file
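
# A minimal end-to-end sketch of how the classes above fit together
# (illustrative only, not part of the diff): a single Linear layer with an MSE
# cost, run through fprop/bprop and the parameter-gradient call. The array
# shapes are made up for the example.
import numpy
from mlp.costs import MSECost
from mlp.layers import MLP, Linear

model = MLP(cost=MSECost())
model.add_layer(Linear(idim=4, odim=3, irange=0.1))

x = numpy.random.uniform(-1, 1, (5, 4))   # mini-batch of 5 examples, 4 features
t = numpy.random.uniform(-1, 1, (5, 3))   # corresponding regression targets

y = model.fprop(x)                        # activations[0] = x, activations[1] = y
cost_value = model.cost.cost(y, t)        # scalar used for monitoring
model.bprop(model.cost.grad(y, t))        # fills model.deltas[1] with y - t

# gradients w.r.t W and b, indexed to match activations/deltas exactly as in
# SGDOptimiser.train_epoch below
grad_W, grad_b = model.layers[0].pgrads(model.activations[0], model.deltas[1])
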
diff --git a/mlp/optimisers.py b/mlp/optimisers.py
new file mode 100644
index 0000000..9d4b947
--- /dev/null
+++ b/mlp/optimisers.py
@@ -0,0 +1,170 @@
+# Machine Learning Practical (INFR11119),
+# Pawel Swietojanski, University of Edinburgh
+
+import numpy
+import time
+import logging
+
+from mlp.layers import MLP
+from mlp.dataset import DataProvider
+from mlp.schedulers import LearningRateScheduler
+
+
+logger = logging.getLogger(__name__)
+
+
+class Optimiser(object):
+    def train_epoch(self, model, train_iter):
+        raise NotImplementedError()
+
+    def train(self, model, train_iter, valid_iter=None):
+        raise NotImplementedError()
+
+    def validate(self, model, valid_iterator):
+        assert isinstance(model, MLP), (
+            "Expected model to be a subclass of 'mlp.layers.MLP'"
+            " class but got %s " % type(model)
+        )
+
+        assert isinstance(valid_iterator, DataProvider), (
+            "Expected iterator to be a subclass of 'mlp.dataset.DataProvider'"
+            " class but got %s " % type(valid_iterator)
+        )
+
+        acc_list, nll_list = [], []
+        for x, t in valid_iterator:
+            y = model.fprop(x)
+            nll_list.append(model.cost.cost(y, t))
+            acc_list.append(numpy.mean(self.classification_accuracy(y, t)))
+
+        acc = numpy.mean(acc_list)
+        nll = numpy.mean(nll_list)
+
+        return nll, acc
+
+    @staticmethod
+    def classification_accuracy(y, t):
+        """
+        Returns classification accuracy given the estimate y and targets t
+        :param y: matrix -- estimate produced by the model in fprop
+        :param t: matrix -- target 1-of-K coded
+        :return: vector of y.shape[0] size with binary values set to 0
+            if the example was misclassified or 1 otherwise
+        """
+        y_idx = numpy.argmax(y, axis=1)
+        t_idx = numpy.argmax(t, axis=1)
+        rval = numpy.equal(y_idx, t_idx)
+        return rval
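
# A toy illustration (not part of the diff) of how classification_accuracy is
# used inside validate(): per-example hits are averaged into a batch accuracy.
# The 3x3 arrays below are made up.
import numpy
from mlp.optimisers import Optimiser

y = numpy.array([[0.8, 0.1, 0.1],    # predicted class 0
                 [0.2, 0.5, 0.3],    # predicted class 1
                 [0.3, 0.3, 0.4]])   # predicted class 2
t = numpy.array([[1.0, 0.0, 0.0],    # true class 0 -> hit
                 [0.0, 0.0, 1.0],    # true class 2 -> miss
                 [0.0, 0.0, 1.0]])   # true class 2 -> hit

hits = Optimiser.classification_accuracy(y, t)   # array([ True, False,  True])
batch_accuracy = numpy.mean(hits)                # 2/3
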
+
+
+class SGDOptimiser(Optimiser):
+    def __init__(self, lr_scheduler):
+        super(SGDOptimiser, self).__init__()
+
+        assert isinstance(lr_scheduler, LearningRateScheduler), (
+            "Expected lr_scheduler to be a subclass of 'mlp.schedulers.LearningRateScheduler'"
+            " class but got %s " % type(lr_scheduler)
+        )
+
+        self.lr_scheduler = lr_scheduler
+
+    def train_epoch(self, model, train_iterator, learning_rate):
+
+        assert isinstance(model, MLP), (
+            "Expected model to be a subclass of 'mlp.layers.MLP'"
+            " class but got %s " % type(model)
+        )
+        assert isinstance(train_iterator, DataProvider), (
+            "Expected iterator to be a subclass of 'mlp.dataset.DataProvider'"
+            " class but got %s " % type(train_iterator)
+        )
+
+        acc_list, nll_list = [], []
+        for x, t in train_iterator:
+            # get the prediction
+            y = model.fprop(x)
+
+            # compute the cost and grad of the cost w.r.t y
+            cost = model.cost.cost(y, t)
+            cost_grad = model.cost.grad(y, t)
+
+            # do backward pass through the model
+            model.bprop(cost_grad)
+
+            # update the model, here we iterate over layers
+            # and then over each parameter in the layer
+            effective_learning_rate = learning_rate / x.shape[0]
+
+            for i in xrange(0, len(model.layers)):
+                params = model.layers[i].get_params()
+                grads = model.layers[i].pgrads(model.activations[i], model.deltas[i + 1])
+                uparams = []
+                for param, grad in zip(params, grads):
+                    param = param - effective_learning_rate * grad
+                    uparams.append(param)
+                model.layers[i].set_params(uparams)
+
+            nll_list.append(cost)
+            acc_list.append(numpy.mean(self.classification_accuracy(y, t)))
+
+        return numpy.mean(nll_list), numpy.mean(acc_list)
+
+    def train(self, model, train_iterator, valid_iterator=None):
+
+        converged = False
+        cost_name = model.cost.get_name()
+        tr_stats, valid_stats = [], []
+
+        # do the initial validation
+        tr_nll, tr_acc = self.validate(model, train_iterator)
+        logger.info('Epoch %i: Training cost (%s) for random model is %.3f. Accuracy is %.2f%%'
+                    % (self.lr_scheduler.epoch, cost_name, tr_nll, tr_acc * 100.))
+        tr_stats.append((tr_nll, tr_acc))
+
+        if valid_iterator is not None:
+            valid_iterator.reset()
+            valid_nll, valid_acc = self.validate(model, valid_iterator)
+            logger.info('Epoch %i: Validation cost (%s) for random model is %.3f. Accuracy is %.2f%%'
+                        % (self.lr_scheduler.epoch, cost_name, valid_nll, valid_acc * 100.))
+            valid_stats.append((valid_nll, valid_acc))
+
+        while not converged:
+            train_iterator.reset()
+
+            tstart = time.clock()
+            tr_nll, tr_acc = self.train_epoch(model=model,
+                                              train_iterator=train_iterator,
+                                              learning_rate=self.lr_scheduler.get_rate())
+            tstop = time.clock()
+            tr_stats.append((tr_nll, tr_acc))
+
+            logger.info('Epoch %i: Training cost (%s) is %.3f. Accuracy is %.2f%%'
+                        % (self.lr_scheduler.epoch + 1, cost_name, tr_nll, tr_acc * 100.))
+
+            vstart = time.clock()
+            if valid_iterator is not None:
+                valid_iterator.reset()
+                valid_nll, valid_acc = self.validate(model, valid_iterator)
+                logger.info('Epoch %i: Validation cost (%s) is %.3f. Accuracy is %.2f%%'
+                            % (self.lr_scheduler.epoch + 1, cost_name, valid_nll, valid_acc * 100.))
+                self.lr_scheduler.get_next_rate(valid_acc)
+                valid_stats.append((valid_nll, valid_acc))
+            else:
+                self.lr_scheduler.get_next_rate(None)
+            vstop = time.clock()
+
+            # note: the speed summary below assumes a validation iterator was provided
+            train_speed = train_iterator.num_examples() / (tstop - tstart)
+            valid_speed = valid_iterator.num_examples() / (vstop - vstart)
+            tot_time = vstop - tstart
+            # pps = presentations per second
+            logger.info("Epoch %i: Took %.0f seconds. Training speed %.0f pps. "
+                        "Validation speed %.0f pps."
+                        % (self.lr_scheduler.epoch, tot_time, train_speed, valid_speed))
+
+            # we stop training when the learning rate, as returned by the lr scheduler, is 0;
+            # this is implementation dependent and, depending on the lr schedule, could happen,
+            # for example, when max_epochs has been reached or if the progress between
+            # two consecutive epochs is too small, etc.
+            converged = (self.lr_scheduler.get_rate() == 0)
+
+        return tr_stats, valid_stats
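
# A sketch of how the optimiser is intended to be driven (illustrative only,
# not part of the diff). MSECost is used because, in this diff, Linear.bprop_cost
# only handles the 'mse' case at the top layer. The data providers are assumed
# to be instances of some concrete DataProvider subclass from mlp.dataset,
# which is not part of this diff.
from mlp.layers import MLP, Linear
from mlp.costs import MSECost
from mlp.optimisers import SGDOptimiser
from mlp.schedulers import LearningRateFixed

model = MLP(cost=MSECost())
model.add_layer(Linear(idim=784, odim=10, irange=0.1))

optimiser = SGDOptimiser(lr_scheduler=LearningRateFixed(learning_rate=0.5,
                                                        max_epochs=20))

# train_dp, valid_dp = ...  (DataProvider instances, built elsewhere)
# tr_stats, valid_stats = optimiser.train(model, train_dp, valid_dp)
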
diff --git a/mlp/schedulers.py b/mlp/schedulers.py
new file mode 100644
index 0000000..f5499e6
--- /dev/null
+++ b/mlp/schedulers.py
@@ -0,0 +1,155 @@
+# Machine Learning Practical (INFR11119),
+# Pawel Swietojanski, University of Edinburgh
+
+import logging
+
+
+class LearningRateScheduler(object):
+    """
+    Defines an interface for determining learning rates
+    """
+    def __init__(self, max_epochs=100):
+        self.epoch = 0
+        self.max_epochs = max_epochs
+
+    def get_rate(self):
+        raise NotImplementedError()
+
+    def get_next_rate(self, current_error=None):
+        self.epoch += 1
+
+
+class LearningRateList(LearningRateScheduler):
+    def __init__(self, learning_rates_list, max_epochs):
+
+        super(LearningRateList, self).__init__(max_epochs)
+
+        assert isinstance(learning_rates_list, list), (
+            "The learning_rates_list argument expected"
+            " to be of type list, got %s" % type(learning_rates_list)
+        )
+        self.lr_list = learning_rates_list
+
+    def get_rate(self):
+        if self.epoch < len(self.lr_list):
+            return self.lr_list[self.epoch]
+        return 0.0
+
+    def get_next_rate(self, current_error=None):
+        super(LearningRateList, self).get_next_rate(current_error=None)
+        return self.get_rate()
+
+
+class LearningRateFixed(LearningRateList):
+
+    def __init__(self, learning_rate, max_epochs):
+        assert learning_rate > 0, (
+            "learning rate expected to be > 0, got %f" % learning_rate
+        )
+        super(LearningRateFixed, self).__init__([learning_rate], max_epochs)
+
+    def get_rate(self):
+        if self.epoch < self.max_epochs:
+            return self.lr_list[0]
+        return 0.0
+
+    def get_next_rate(self, current_error=None):
+        super(LearningRateFixed, self).get_next_rate(current_error=None)
+        return self.get_rate()
+
+
+class LearningRateNewBob(LearningRateScheduler):
+    """
+    NewBob-style exponential learning rate schedule
+    """
+
+    def __init__(self, start_rate, scale_by=.5, max_epochs=99,
+                 min_derror_ramp_start=.5, min_derror_stop=.5, init_error=100.0,
+                 patience=0, zero_rate=None, ramping=False):
+        """
+        :type start_rate: float
+        :param start_rate: the initial learning rate
+
+        :type scale_by: float
+        :param scale_by: multiplicative factor applied to the rate when ramping
+
+        :type max_epochs: int
+        :param max_epochs: maximum number of epochs to train for
+
+        :type min_derror_ramp_start: float
+        :param min_derror_ramp_start: error improvement below which ramping starts
+
+        :type min_derror_stop: float
+        :param min_derror_stop: error improvement below which the rate is set to 0
+
+        :type init_error: float
+        :param init_error: initial (best-so-far) error, in percent
+        """
+        self.start_rate = start_rate
+        self.init_error = init_error
+        self.init_patience = patience
+
+        self.rate = start_rate
+        self.scale_by = scale_by
+        self.max_epochs = max_epochs
+        self.min_derror_ramp_start = min_derror_ramp_start
+        self.min_derror_stop = min_derror_stop
+        self.lowest_error = init_error
+
+        self.epoch = 1
+        self.ramping = ramping
+        self.patience = patience
+        self.zero_rate = zero_rate
+
+    def reset(self):
+        self.rate = self.start_rate
+        self.lowest_error = self.init_error
+        self.epoch = 1
+        self.ramping = False
+        self.patience = self.init_patience
+
+    def get_rate(self):
+        if (self.epoch == 1 and self.zero_rate is not None):
+            return self.zero_rate
+        return self.rate
+
+    def get_next_rate(self, current_error):
+        """
+        :type current_error: float
+        :param current_error: percentage error
+        """
+
+        diff_error = 0.0
+
+        if ((self.max_epochs > 10000) or (self.epoch >= self.max_epochs)):
+            #logging.debug('Setting rate to 0.0. max_epochs or epoch >= max_epochs')
+            self.rate = 0.0
+        else:
+            diff_error = self.lowest_error - current_error
+
+            if (current_error < self.lowest_error):
+                self.lowest_error = current_error
+
+            if (self.ramping):
+                if (diff_error < self.min_derror_stop):
+                    if (self.patience > 0):
+                        #logging.debug('Patience decreased to %f' % self.patience)
+                        self.patience -= 1
+                        self.rate *= self.scale_by
+                    else:
+                        #logging.debug('diff_error (%f) < min_derror_stop (%f)' % (diff_error, self.min_derror_stop))
+                        self.rate = 0.0
+                else:
+                    self.rate *= self.scale_by
+            else:
+                if (diff_error < self.min_derror_ramp_start):
+                    #logging.debug('Start ramping.')
+                    self.ramping = True
+                    self.rate *= self.scale_by
+
+        self.epoch += 1
+
+        return self.rate
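
# A small illustration (not part of the diff) of how a scheduler drives the
# stopping condition in SGDOptimiser.train: once the list of rates is
# exhausted, get_rate() returns 0.0 and training is considered converged.
from mlp.schedulers import LearningRateList

lr_scheduler = LearningRateList([0.5, 0.2, 0.1], max_epochs=3)

rates = []
while lr_scheduler.get_rate() > 0:
    rates.append(lr_scheduler.get_rate())   # rate used for the current epoch
    lr_scheduler.get_next_rate(None)        # advance to the next epoch

# rates == [0.5, 0.2, 0.1]; the next get_rate() call returns 0.0
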
diff --git a/res/code_scheme.svg b/res/code_scheme.svg
new file mode 100644
index 0000000..0b0eec8
--- /dev/null
+++ b/res/code_scheme.svg
@@ -0,0 +1,2030 @@
+[SVG markup omitted -- a schematic of the code structure: the model, the forward
+ computation, the backward computation, the cost E(y,t) (here assumed cross-entropy)
+ and the parameter updates.]