From 2b1700180eec99d5393e5b920ec7fe8a56fd0c12 Mon Sep 17 00:00:00 2001
From: pswietojanski
Date: Mon, 12 Oct 2015 01:50:05 +0100
Subject: [PATCH] added code-base
---
mlp/costs.py | 64 ++
mlp/layers.py | 281 ++++++
mlp/optimisers.py | 170 ++++
mlp/schedulers.py | 155 ++++
res/code_scheme.svg | 2030 +++++++++++++++++++++++++++++++++++++++++++
5 files changed, 2700 insertions(+)
create mode 100644 mlp/costs.py
create mode 100644 mlp/layers.py
create mode 100644 mlp/optimisers.py
create mode 100644 mlp/schedulers.py
create mode 100644 res/code_scheme.svg
diff --git a/mlp/costs.py b/mlp/costs.py
new file mode 100644
index 0000000..942920a
--- /dev/null
+++ b/mlp/costs.py
@@ -0,0 +1,64 @@
+# Machine Learning Practical (INFR11119),
+# Pawel Swietojanski, University of Edinburgh
+
+
+import numpy
+
+
+class Cost(object):
+ """
+ Defines an interface for the cost object
+ """
+ def cost(self, y, t, **kwargs):
+ """
+ Implements a cost for monitoring purposes
+        :param y: matrix -- the output of the model
+        :param t: matrix -- the expected output the model should produce
+        :param kwargs: optional parameters required by the cost
+ :return: the scalar value representing the cost given y and t
+ """
+ raise NotImplementedError()
+
+ def grad(self, y, t, **kwargs):
+ """
+        Implements the gradient of the cost w.r.t y
+        :param y: matrix -- the output of the model
+        :param t: matrix -- the expected output the model should produce
+        :param kwargs: optional parameters required by the cost
+        :return: matrix -- the gradient of the cost w.r.t y
+ """
+ raise NotImplementedError()
+
+ def get_name(self):
+ return 'cost'
+
+
+class MSECost(Cost):
+ def cost(self, y, t, **kwargs):
+ se = 0.5*numpy.sum((y - t)**2, axis=1)
+ return numpy.mean(se)
+
+ def grad(self, y, t, **kwargs):
+ return y - t
+
+ def get_name(self):
+ return 'mse'
+
+
+class CECost(Cost):
+ """
+ Cross Entropy (Negative log-likelihood) cost for multiple classes
+ """
+ def cost(self, y, t, **kwargs):
+ #assumes t is 1-of-K coded and y is a softmax
+ #transformed estimate at the output layer
+ nll = t * numpy.log(y)
+ return -numpy.mean(numpy.sum(nll, axis=1))
+
+ def grad(self, y, t, **kwargs):
+ #assumes t is 1-of-K coded and y is a softmax
+ #transformed estimate at the output layer
+ return y - t
+
+ def get_name(self):
+ return 'ce'
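
The two costs above operate on mini-batches, one example per row. A minimal usage
sketch on toy data (illustrative only, not part of the code-base):

    import numpy
    from mlp.costs import MSECost, CECost

    # two examples, three outputs each; y mimics a softmax output, t is 1-of-K coded
    y = numpy.array([[0.7, 0.2, 0.1],
                     [0.1, 0.8, 0.1]])
    t = numpy.array([[1.0, 0.0, 0.0],
                     [0.0, 1.0, 0.0]])

    mse, ce = MSECost(), CECost()
    print(mse.cost(y, t))        # scalar mean squared error over the mini-batch
    print(ce.cost(y, t))         # scalar mean negative log-likelihood
    print(ce.grad(y, t).shape)   # (2, 3) -- gradient of the cost w.r.t y
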
diff --git a/mlp/layers.py b/mlp/layers.py
new file mode 100644
index 0000000..c511792
--- /dev/null
+++ b/mlp/layers.py
@@ -0,0 +1,281 @@
+
+# Machine Learning Practical (INFR11119),
+# Pawel Swietojanski, University of Edinburgh
+
+import numpy
+import logging
+from mlp.costs import Cost
+
+
+logger = logging.getLogger(__name__)
+
+
+class MLP(object):
+ """
+    This is a container for an arbitrary sequence of other transforms.
+ On top of this, the class also keeps the state of the model, i.e.
+ the result of forward (activations) and backward (deltas) passes
+ through the model (for a mini-batch), which is required to compute
+ the gradients for the parameters
+ """
+ def __init__(self, cost):
+
+ assert isinstance(cost, Cost), (
+ "Cost needs to be of type mlp.costs.Cost, got %s" % type(cost)
+ )
+
+ self.layers = [] #the actual list of network layers
+ self.activations = [] #keeps forward-pass activations (h from equations)
+ # for a given minibatch (or features at 0th index)
+ self.deltas = [] #keeps back-propagated error signals (deltas from equations)
+ # for a given minibatch and each layer
+ self.cost = cost
+
+ def fprop(self, x):
+ """
+
+        :param x: mini-batch of data-points (matrix of features)
+ :return: y (top layer activation) which is an estimate of y given x
+ """
+
+ if len(self.activations) != len(self.layers) + 1:
+ self.activations = [None]*(len(self.layers) + 1)
+
+ self.activations[0] = x
+ for i in xrange(0, len(self.layers)):
+ self.activations[i+1] = self.layers[i].fprop(self.activations[i])
+ return self.activations[-1]
+
+ def bprop(self, cost_grad):
+ """
+ :param cost_grad: matrix -- grad of the cost w.r.t y
+ :return: None, the deltas are kept in the model
+ """
+
+ # allocate the list of deltas for each layer
+ # note, we do not use all of those fields but
+ # want to keep it aligned 1:1 with activations,
+ # which will simplify indexing later on when
+ # computing grads w.r.t parameters
+ if len(self.deltas) != len(self.activations):
+ self.deltas = [None]*len(self.activations)
+
+        # treat the top layer in a special way, as it deals with the
+        # cost, which may lead to some simplifications
+ top_layer_idx = len(self.layers)
+ self.deltas[top_layer_idx], ograds = self.layers[top_layer_idx - 1].\
+ bprop_cost(self.activations[top_layer_idx], cost_grad, self.cost)
+
+ # then back-prop through remaining layers
+ for i in xrange(top_layer_idx - 1, 0, -1):
+ self.deltas[i], ograds = self.layers[i - 1].\
+ bprop(self.activations[i], ograds)
+
+ def add_layer(self, layer):
+ self.layers.append(layer)
+
+ def set_layers(self, layers):
+ self.layers = layers
+
+ def get_name(self):
+ return 'mlp'
+
+
+class Layer(object):
+ """
+ Abstract class defining an interface for
+ other transforms.
+ """
+ def __init__(self, rng=None):
+
+ if rng is None:
+ seed=[2015, 10, 1]
+ self.rng = numpy.random.RandomState(seed)
+ else:
+ self.rng = rng
+
+ def fprop(self, inputs):
+ """
+ Implements a forward propagation through the i-th layer, that is
+ some form of:
+ a^i = xW^i + b^i
+ h^i = f^i(a^i)
+ with f^i, W^i, b^i denoting a non-linearity, weight matrix and
+ biases at the i-th layer, respectively and x denoting inputs.
+
+ :param inputs: matrix of features (x) or the output of the previous layer h^{i-1}
+        :return: h^i, matrix of features transformed by the layer
+ """
+ raise NotImplementedError()
+
+ def bprop(self, h, igrads):
+ """
+ Implements a backward propagation through the layer, that is, given
+ h^i denotes the output of the layer and x^i the input, we compute:
+ dh^i/dx^i which by chain rule is dh^i/da^i da^i/dx^i
+ x^i could be either features (x) or the output of the lower layer h^{i-1}
+        :param h: the activation produced in the forward pass
+        :param igrads: error signal (or gradient) flowing into the layer; note,
+        in the general case this does not correspond to the 'deltas' used to
+        update the layer's parameters; to get deltas one needs to multiply it
+        by the dh^i/da^i derivative
+ :return: a tuple (deltas, ograds) where:
+ deltas = igrads * dh^i/da^i
+ ograds = deltas \times da^i/dx^i
+ """
+ raise NotImplementedError()
+
+ def bprop_cost(self, h, igrads, cost=None):
+ """
+ Implements a backward propagation in case the layer directly
+ deals with the optimised cost (i.e. the top layer)
+        By default, the method should implement back-prop for the default cost, that is
+        the one that is natural to the layer's output, i.e.:
+        linear -> mse, softmax -> cross-entropy, sigmoid -> binary cross-entropy
+        :param h: the activation produced in the forward pass
+        :param igrads: error signal (or gradient) flowing into the layer; note,
+        in the general case this does not correspond to the 'deltas' used to
+        update the layer's parameters; to get deltas one needs to multiply it
+        by the dh^i/da^i derivative
+ :return: a tuple (deltas, ograds) where:
+ deltas = igrads * dh^i/da^i
+ ograds = deltas \times da^i/dx^i
+ """
+
+ raise NotImplementedError()
+
+ def pgrads(self, inputs, deltas):
+ """
+ Return gradients w.r.t parameters
+ """
+ raise NotImplementedError()
+
+ def get_params(self):
+ raise NotImplementedError()
+
+ def set_params(self):
+ raise NotImplementedError()
+
+ def get_name(self):
+ return 'abstract_layer'
+
+
+class Linear(Layer):
+
+ def __init__(self, idim, odim,
+ rng=None,
+ irange=0.1):
+
+ super(Linear, self).__init__(rng=rng)
+
+ self.idim = idim
+ self.odim = odim
+
+ self.W = self.rng.uniform(
+ -irange, irange,
+ (self.idim, self.odim))
+
+ self.b = numpy.zeros((self.odim,), dtype=numpy.float32)
+
+ def fprop(self, inputs):
+ """
+ Implements a forward propagation through the i-th layer, that is
+ some form of:
+ a^i = xW^i + b^i
+ h^i = f^i(a^i)
+ with f^i, W^i, b^i denoting a non-linearity, weight matrix and
+ biases of this (i-th) layer, respectively and x denoting inputs.
+
+ :param inputs: matrix of features (x) or the output of the previous layer h^{i-1}
+        :return: h^i, matrix of features transformed by the layer
+ """
+ a = numpy.dot(inputs, self.W) + self.b
+ # here f() is an identity function, so just return a linear transformation
+ return a
+
+ def bprop(self, h, igrads):
+ """
+ Implements a backward propagation through the layer, that is, given
+ h^i denotes the output of the layer and x^i the input, we compute:
+ dh^i/dx^i which by chain rule is dh^i/da^i da^i/dx^i
+ x^i could be either features (x) or the output of the lower layer h^{i-1}
+        :param h: the activation produced in the forward pass
+        :param igrads: error signal (or gradient) flowing into the layer; note,
+        in the general case this does not correspond to the 'deltas' used to
+        update the layer's parameters; to get deltas one needs to multiply it
+        by the dh^i/da^i derivative
+ :return: a tuple (deltas, ograds) where:
+ deltas = igrads * dh^i/da^i
+ ograds = deltas \times da^i/dx^i
+ """
+
+        # since dh^i/da^i = 1 (f^i is the identity function here),
+        # deltas are in fact the same as igrads
+ ograds = numpy.dot(igrads, self.W.T)
+ return igrads, ograds
+
+ def bprop_cost(self, h, igrads, cost):
+ """
+ Implements a backward propagation in case the layer directly
+ deals with the optimised cost (i.e. the top layer)
+        By default, the method should implement bprop for the default cost, that is
+        the one that is natural to the layer's output, i.e.:
+        here we implement the linear -> mse scenario
+        :param h: the activation produced in the forward pass
+        :param igrads: error signal (or gradient) flowing into the layer; note,
+        in the general case this does not correspond to the 'deltas' used to
+        update the layer's parameters; to get deltas one needs to multiply it
+        by the dh^i/da^i derivative
+        :param cost: mlp.costs.Cost instance defining the cost in use
+ :return: a tuple (deltas, ograds) where:
+ deltas = igrads * dh^i/da^i
+ ograds = deltas \times da^i/dx^i
+ """
+
+ if cost is None or cost.get_name() == 'mse':
+ # for linear layer and mean square error cost,
+ # cost back-prop is the same as standard back-prop
+ return self.bprop(h, igrads)
+ else:
+ raise NotImplementedError('Linear.bprop_cost method not implemented '
+ 'for the %s cost' % cost.get_name())
+
+ def pgrads(self, inputs, deltas):
+ """
+ Return gradients w.r.t parameters
+
+        :param inputs: input to the i-th layer
+        :param deltas: deltas computed in the bprop stage up to the i-th layer
+        :return: list of grads w.r.t parameters dE/dW and dE/db in *exactly*
+                the same order as the params are returned by get_params()
+
+        Note: deltas here contain the whole chain rule leading
+              from the cost down to the i-th layer, i.e.
+              dE/dy^L dy^L/da^L da^L/dh^{L-1} dh^{L-1}/da^{L-1} ... dh^{i}/da^{i}
+              and here we only ask for
+                1) da^i/dW^i and 2) da^i/db^i
+              since W and b are the layer's only parameters
+ """
+
+ grad_W = numpy.dot(inputs.T, deltas)
+ grad_b = numpy.sum(deltas, axis=0)
+
+ return [grad_W, grad_b]
+
+ def get_params(self):
+ return [self.W, self.b]
+
+ def set_params(self, params):
+ #we do not make checks here, but the order on the list
+ #is assumed to be exactly the same as get_params() returns
+ self.W = params[0]
+ self.b = params[1]
+
+ def get_name(self):
+ return 'linear'
+
+
+
+
+
+
\ No newline at end of file
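
A minimal sketch of how MLP and Linear fit together: forward pass, cost, backward
pass, then parameter gradients on random data. Illustrative only; it relies on the
linear -> mse pairing handled by Linear.bprop_cost above, and on Python 2 (the
code-base uses xrange):

    import numpy
    from mlp.costs import MSECost
    from mlp.layers import MLP, Linear

    rng = numpy.random.RandomState([2015, 10, 1])
    x = rng.uniform(-1, 1, (5, 10))    # mini-batch of 5 examples, 10 features each
    t = rng.uniform(-1, 1, (5, 3))     # 3 regression targets per example

    model = MLP(cost=MSECost())
    model.add_layer(Linear(idim=10, odim=3, rng=rng))

    y = model.fprop(x)                  # activations are cached in model.activations
    print(model.cost.cost(y, t))        # scalar cost for the mini-batch
    model.bprop(model.cost.grad(y, t))  # deltas are cached in model.deltas

    # grads w.r.t the layer's parameters, same order as get_params()
    grad_W, grad_b = model.layers[0].pgrads(model.activations[0], model.deltas[1])
    print(grad_W.shape, grad_b.shape)   # (10, 3) and (3,)
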
diff --git a/mlp/optimisers.py b/mlp/optimisers.py
new file mode 100644
index 0000000..9d4b947
--- /dev/null
+++ b/mlp/optimisers.py
@@ -0,0 +1,170 @@
+# Machine Learning Practical (INFR11119),
+# Pawel Swietojanski, University of Edinburgh
+
+import numpy
+import time
+import logging
+
+from mlp.layers import MLP
+from mlp.dataset import DataProvider
+from mlp.schedulers import LearningRateScheduler
+
+
+logger = logging.getLogger(__name__)
+
+
+class Optimiser(object):
+ def train_epoch(self, model, train_iter):
+ raise NotImplementedError()
+
+ def train(self, model, train_iter, valid_iter=None):
+ raise NotImplementedError()
+
+ def validate(self, model, valid_iterator):
+ assert isinstance(model, MLP), (
+ "Expected model to be a subclass of 'mlp.layers.MLP'"
+ " class but got %s " % type(model)
+ )
+
+ assert isinstance(valid_iterator, DataProvider), (
+ "Expected iterator to be a subclass of 'mlp.dataset.DataProvider'"
+ " class but got %s " % type(valid_iterator)
+ )
+
+ acc_list, nll_list = [], []
+ for x, t in valid_iterator:
+ y = model.fprop(x)
+ nll_list.append(model.cost.cost(y, t))
+ acc_list.append(numpy.mean(self.classification_accuracy(y, t)))
+
+ acc = numpy.mean(acc_list)
+ nll = numpy.mean(nll_list)
+
+ return nll, acc
+
+ @staticmethod
+ def classification_accuracy(y, t):
+ """
+ Returns classification accuracy given the estimate y and targets t
+ :param y: matrix -- estimate produced by the model in fprop
+ :param t: matrix -- target 1-of-K coded
+        :return: vector of size y.shape[0] with binary values set to 0
+                 if the example was misclassified or 1 otherwise
+ """
+ y_idx = numpy.argmax(y, axis=1)
+ t_idx = numpy.argmax(t, axis=1)
+ rval = numpy.equal(y_idx, t_idx)
+ return rval
+
+
+class SGDOptimiser(Optimiser):
+ def __init__(self, lr_scheduler):
+ super(SGDOptimiser, self).__init__()
+
+ assert isinstance(lr_scheduler, LearningRateScheduler), (
+ "Expected lr_scheduler to be a subclass of 'mlp.schedulers.LearningRateScheduler'"
+ " class but got %s " % type(lr_scheduler)
+ )
+
+ self.lr_scheduler = lr_scheduler
+
+ def train_epoch(self, model, train_iterator, learning_rate):
+
+ assert isinstance(model, MLP), (
+ "Expected model to be a subclass of 'mlp.layers.MLP'"
+ " class but got %s " % type(model)
+ )
+ assert isinstance(train_iterator, DataProvider), (
+ "Expected iterator to be a subclass of 'mlp.dataset.DataProvider'"
+ " class but got %s " % type(train_iterator)
+ )
+
+ acc_list, nll_list = [], []
+ for x, t in train_iterator:
+ # get the prediction
+ y = model.fprop(x)
+
+ # compute the cost and grad of the cost w.r.t y
+ cost = model.cost.cost(y, t)
+ cost_grad = model.cost.grad(y, t)
+
+ # do backward pass through the model
+ model.bprop(cost_grad)
+
+ #update the model, here we iterate over layers
+ #and then over each parameter in the layer
+ effective_learning_rate = learning_rate / x.shape[0]
+
+ for i in xrange(0, len(model.layers)):
+ params = model.layers[i].get_params()
+ grads = model.layers[i].pgrads(model.activations[i], model.deltas[i + 1])
+ uparams = []
+ for param, grad in zip(params, grads):
+ param = param - effective_learning_rate * grad
+ uparams.append(param)
+ model.layers[i].set_params(uparams)
+
+ nll_list.append(cost)
+ acc_list.append(numpy.mean(self.classification_accuracy(y, t)))
+
+ return numpy.mean(nll_list), numpy.mean(acc_list)
+
+ def train(self, model, train_iterator, valid_iterator=None):
+
+ converged = False
+ cost_name = model.cost.get_name()
+ tr_stats, valid_stats = [], []
+
+ # do the initial validation
+ tr_nll, tr_acc = self.validate(model, train_iterator)
+ logger.info('Epoch %i: Training cost (%s) for random model is %.3f. Accuracy is %.2f%%'
+ % (self.lr_scheduler.epoch, cost_name, tr_nll, tr_acc * 100.))
+ tr_stats.append((tr_nll, tr_acc))
+
+ if valid_iterator is not None:
+ valid_iterator.reset()
+ valid_nll, valid_acc = self.validate(model, valid_iterator)
+ logger.info('Epoch %i: Validation cost (%s) for random model is %.3f. Accuracy is %.2f%%'
+ % (self.lr_scheduler.epoch, cost_name, valid_nll, valid_acc * 100.))
+ valid_stats.append((valid_nll, valid_acc))
+
+ while not converged:
+ train_iterator.reset()
+
+ tstart = time.clock()
+ tr_nll, tr_acc = self.train_epoch(model=model,
+ train_iterator=train_iterator,
+ learning_rate=self.lr_scheduler.get_rate())
+ tstop = time.clock()
+ tr_stats.append((tr_nll, tr_acc))
+
+ logger.info('Epoch %i: Training cost (%s) is %.3f. Accuracy is %.2f%%'
+ % (self.lr_scheduler.epoch + 1, cost_name, tr_nll, tr_acc * 100.))
+
+ vstart = time.clock()
+ if valid_iterator is not None:
+ valid_iterator.reset()
+ valid_nll, valid_acc = self.validate(model, valid_iterator)
+ logger.info('Epoch %i: Validation cost (%s) is %.3f. Accuracy is %.2f%%'
+ % (self.lr_scheduler.epoch + 1, cost_name, valid_nll, valid_acc * 100.))
+ self.lr_scheduler.get_next_rate(valid_acc)
+ valid_stats.append((valid_nll, valid_acc))
+ else:
+ self.lr_scheduler.get_next_rate(None)
+ vstop = time.clock()
+
+            train_speed = train_iterator.num_examples() / (tstop - tstart)
+            valid_speed = (valid_iterator.num_examples() / (vstop - vstart)
+                           if valid_iterator is not None else 0.0)
+ tot_time = vstop - tstart
+ #pps = presentations per second
+ logger.info("Epoch %i: Took %.0f seconds. Training speed %.0f pps. "
+ "Validation speed %.0f pps."
+ % (self.lr_scheduler.epoch, tot_time, train_speed, valid_speed))
+
+            # we stop training when the learning rate, as returned by the lr scheduler, is 0;
+            # this is implementation dependent and, depending on the lr schedule, could happen,
+            # for example, when max_epochs has been reached or when the progress between
+            # two consecutive epochs is too small, etc.
+ converged = (self.lr_scheduler.get_rate() == 0)
+
+ return tr_stats, valid_stats
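
The optimiser expects DataProvider iterators from mlp.dataset, which is not part of
this patch. The sketch below is illustrative only: the MNISTDataProvider name and its
constructor arguments are assumptions, standing in for any provider that yields (x, t)
mini-batches and supports reset() and num_examples(). It trains a single linear layer
with the MSE cost, the only top-layer/cost pairing implemented so far:

    import logging
    from mlp.costs import MSECost
    from mlp.layers import MLP, Linear
    from mlp.optimisers import SGDOptimiser
    from mlp.schedulers import LearningRateFixed
    from mlp.dataset import MNISTDataProvider   # hypothetical provider, not in this patch

    logging.basicConfig(level=logging.INFO)     # the optimiser reports progress via logging

    train_dp = MNISTDataProvider(dset='train', batch_size=100)   # assumed constructor
    valid_dp = MNISTDataProvider(dset='valid', batch_size=100)

    model = MLP(cost=MSECost())
    model.add_layer(Linear(idim=784, odim=10))  # 784 pixels in, 10 outputs

    # fixed learning rate of 0.5 for at most 30 epochs
    lr_scheduler = LearningRateFixed(learning_rate=0.5, max_epochs=30)
    optimiser = SGDOptimiser(lr_scheduler)

    tr_stats, valid_stats = optimiser.train(model, train_dp, valid_dp)
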
diff --git a/mlp/schedulers.py b/mlp/schedulers.py
new file mode 100644
index 0000000..f5499e6
--- /dev/null
+++ b/mlp/schedulers.py
@@ -0,0 +1,155 @@
+# Machine Learning Practical (INFR11119),
+# Pawel Swietojanski, University of Edinburgh
+
+import logging
+
+
+class LearningRateScheduler(object):
+ """
+ Define an interface for determining learning rates
+ """
+ def __init__(self, max_epochs=100):
+ self.epoch = 0
+ self.max_epochs = max_epochs
+
+ def get_rate(self):
+ raise NotImplementedError()
+
+ def get_next_rate(self, current_error=None):
+ self.epoch += 1
+
+
+class LearningRateList(LearningRateScheduler):
+ def __init__(self, learning_rates_list, max_epochs):
+
+ super(LearningRateList, self).__init__(max_epochs)
+
+ assert isinstance(learning_rates_list, list), (
+ "The learning_rates_list argument expected"
+ " to be of type list, got %s" % type(learning_rates_list)
+ )
+ self.lr_list = learning_rates_list
+
+ def get_rate(self):
+ if self.epoch < len(self.lr_list):
+ return self.lr_list[self.epoch]
+ return 0.0
+
+ def get_next_rate(self, current_error=None):
+ super(LearningRateList, self).get_next_rate(current_error=None)
+ return self.get_rate()
+
+
+class LearningRateFixed(LearningRateList):
+
+ def __init__(self, learning_rate, max_epochs):
+ assert learning_rate > 0, (
+ "learning rate expected to be > 0, got %f" % learning_rate
+ )
+ super(LearningRateFixed, self).__init__([learning_rate], max_epochs)
+
+ def get_rate(self):
+ if self.epoch < self.max_epochs:
+ return self.lr_list[0]
+ return 0.0
+
+ def get_next_rate(self, current_error=None):
+ super(LearningRateFixed, self).get_next_rate(current_error=None)
+ return self.get_rate()
+
+
+class LearningRateNewBob(LearningRateScheduler):
+ """
+    Exponential learning rate schedule (new-bob)
+ """
+
+ def __init__(self, start_rate, scale_by=.5, max_epochs=99, \
+ min_derror_ramp_start=.5, min_derror_stop=.5, init_error=100.0, \
+ patience=0, zero_rate=None, ramping=False):
+ """
+ :type start_rate: float
+ :param start_rate:
+
+ :type scale_by: float
+ :param scale_by:
+
+ :type max_epochs: int
+ :param max_epochs:
+
+        :type min_derror_ramp_start: float
+        :param min_derror_ramp_start:
+
+        :type min_derror_stop: float
+        :param min_derror_stop:
+
+        :type init_error: float
+        :param init_error:
+ """
+ self.start_rate = start_rate
+ self.init_error = init_error
+ self.init_patience = patience
+
+ self.rate = start_rate
+ self.scale_by = scale_by
+ self.max_epochs = max_epochs
+ self.min_derror_ramp_start = min_derror_ramp_start
+ self.min_derror_stop = min_derror_stop
+ self.lowest_error = init_error
+
+ self.epoch = 1
+ self.ramping = ramping
+ self.patience = patience
+ self.zero_rate = zero_rate
+
+ def reset(self):
+ self.rate = self.start_rate
+ self.lowest_error = self.init_error
+ self.epoch = 1
+ self.ramping = False
+ self.patience = self.init_patience
+
+ def get_rate(self):
+        if self.epoch == 1 and self.zero_rate is not None:
+ return self.zero_rate
+ return self.rate
+
+ def get_next_rate(self, current_error):
+ """
+ :type current_error: float
+ :param current_error: percentage error
+
+ """
+
+ diff_error = 0.0
+
+ if ( (self.max_epochs > 10000) or (self.epoch >= self.max_epochs) ):
+ #logging.debug('Setting rate to 0.0. max_epochs or epoch>=max_epochs')
+ self.rate = 0.0
+ else:
+ diff_error = self.lowest_error - current_error
+
+ if (current_error < self.lowest_error):
+ self.lowest_error = current_error
+
+ if (self.ramping):
+ if (diff_error < self.min_derror_stop):
+ if (self.patience > 0):
+ #logging.debug('Patience decreased to %f' % self.patience)
+ self.patience -= 1
+ self.rate *= self.scale_by
+ else:
+ #logging.debug('diff_error (%f) < min_derror_stop (%f)' % (diff_error, self.min_derror_stop))
+ self.rate = 0.0
+ else:
+ self.rate *= self.scale_by
+ else:
+ if (diff_error < self.min_derror_ramp_start):
+ #logging.debug('Start ramping.')
+ self.ramping = True
+ self.rate *= self.scale_by
+
+ self.epoch += 1
+
+ return self.rate
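
The optimiser drives these schedulers through get_rate() / get_next_rate() and stops
once the returned rate reaches 0. A small, self-contained illustration of the two
behaviours (the validation errors passed to the new-bob schedule are made up):

    from mlp.schedulers import LearningRateList, LearningRateNewBob

    # one rate per epoch; once the list is exhausted the rate drops to 0.0,
    # which the optimiser treats as convergence
    lr = LearningRateList([0.5, 0.1, 0.01], max_epochs=3)
    print(lr.get_rate())        # 0.5
    print(lr.get_next_rate())   # 0.1
    print(lr.get_next_rate())   # 0.01
    print(lr.get_next_rate())   # 0.0

    # new-bob: keep the rate while progress is large, halve it once the improvement
    # falls below min_derror_ramp_start, and stop when it stalls again
    newbob = LearningRateNewBob(start_rate=0.5, scale_by=0.5, max_epochs=30,
                                min_derror_ramp_start=0.5, min_derror_stop=0.5)
    for error in [20.0, 15.0, 14.8, 14.7]:
        print(newbob.get_next_rate(error))   # 0.5, 0.5, 0.25, 0.0
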
diff --git a/res/code_scheme.svg b/res/code_scheme.svg
new file mode 100644
index 0000000..0b0eec8
--- /dev/null
+++ b/res/code_scheme.svg
@@ -0,0 +1,2030 @@
+
+
+
+