diff --git a/mlp/costs.py b/mlp/costs.py
index 942920a..9532af1 100644
--- a/mlp/costs.py
+++ b/mlp/costs.py
@@ -1,64 +1,105 @@
-# Machine Learning Practical (INFR11119),
-# Pawel Swietojanski, University of Edinburgh
+"""Model costs."""
 
-import numpy
+import numpy as np
 
 
-class Cost(object):
+class MeanSquaredErrorCost(object):
     """
-    Defines an interface for the cost object
     """
-    def cost(self, y, t, **kwargs):
-        """
-        Implements a cost for monitoring purposes
-        :param y: matrix -- an output of the model
-        :param t: matrix -- an expected output the model should produce
-        :param kwargs: -- some optional parameters required by the cost
-        :return: the scalar value representing the cost given y and t
-        """
-        raise NotImplementedError()
 
-    def grad(self, y, t, **kwargs):
-        """
-        Implements a gradient of the cost w.r.t y
-        :param y: matrix -- an output of the model
-        :param t: matrix -- an expected output the model should produce
-        :param kwargs: -- some optional parameters required by the cost
-        :return: matrix - the gradient of the cost w.r.t y
-        """
-        raise NotImplementedError()
+    def __call__(self, outputs, targets):
+        return 0.5 * np.mean(np.sum((outputs - targets)**2, axis=1))
 
-    def get_name(self):
-        return 'cost'
+    def grad(self, outputs, targets):
+        return outputs - targets
+
+    def __repr__(self):
+        return 'MeanSquaredErrorCost'
 
 
-class MSECost(Cost):
-    def cost(self, y, t, **kwargs):
-        se = 0.5*numpy.sum((y - t)**2, axis=1)
-        return numpy.mean(se)
-
-    def grad(self, y, t, **kwargs):
-        return y - t
-
-    def get_name(self):
-        return 'mse'
-
-
-class CECost(Cost):
+class BinaryCrossEntropyCost(object):
     """
-    Cross Entropy (Negative log-likelihood) cost for multiple classes
     """
-    def cost(self, y, t, **kwargs):
-        #assumes t is 1-of-K coded and y is a softmax
-        #transformed estimate at the output layer
-        nll = t * numpy.log(y)
-        return -numpy.mean(numpy.sum(nll, axis=1))
 
-    def grad(self, y, t, **kwargs):
-        #assumes t is 1-of-K coded and y is a softmax
-        #transformed estimate at the output layer
-        return y - t
+    def __call__(self, outputs, targets):
+        return -np.mean(
+            targets * np.log(outputs) + (1. - targets) * np.log(1. - outputs))
 
-    def get_name(self):
-        return 'ce'
+    def grad(self, outputs, targets):
+        return (1. - targets) / (1. - outputs) - (targets / outputs)
+
+    def __repr__(self):
+        return 'BinaryCrossEntropyCost'
+
+
+class BinaryCrossEntropySigmoidCost(object):
+    """
+    """
+
+    def __call__(self, outputs, targets):
+        probs = 1. / (1. + np.exp(-outputs))
+        return -np.mean(
+            targets * np.log(probs) + (1. - targets) * np.log(1. - probs))
+
+    def grad(self, outputs, targets):
+        probs = 1. / (1. + np.exp(-outputs))
+        return probs - targets
+
+    def __repr__(self):
+        return 'BinaryCrossEntropySigmoidCost'
+
+
+class BinaryAccuracySigmoidCost(object):
+    """
+    """
+
+    def __call__(self, outputs, targets):
+        return ((outputs > 0) == targets).mean()
+
+    def __repr__(self):
+        return 'BinaryAccuracySigmoidCost'
+
+
+class CrossEntropyCost(object):
+    """
+    """
+
+    def __call__(self, outputs, targets):
+        return -np.mean(np.sum(targets * np.log(outputs), axis=1))
+
+    def grad(self, outputs, targets):
+        return -targets / outputs
+
+    def __repr__(self):
+        return 'CrossEntropyCost'
+
+
+class CrossEntropySoftmaxCost(object):
+    """
+    """
+
+    def __call__(self, outputs, targets):
+        probs = np.exp(outputs)
+        probs /= probs.sum(-1)[:, None]
+        return -np.mean(np.sum(targets * np.log(probs), axis=1))
+
+    def grad(self, outputs, targets):
+        probs = np.exp(outputs)
+        probs /= probs.sum(-1)[:, None]
+        return probs - targets
+
+    def __repr__(self):
+        return 'CrossEntropySoftmaxCost'
+
+
+class MulticlassAccuracySoftmaxCost(object):
+    """
+    """
+
+    def __call__(self, outputs, targets):
+        probs = np.exp(outputs)
+        return np.mean(np.argmax(probs, -1) == np.argmax(targets, -1))
+
+    def __repr__(self):
+        return 'MulticlassAccuracySoftmaxCost'
diff --git a/mlp/initialisers.py b/mlp/initialisers.py
new file mode 100644
index 0000000..b9b1614
--- /dev/null
+++ b/mlp/initialisers.py
@@ -0,0 +1,66 @@
+# -*- coding: utf-8 -*-
+"""Parameter initialisers.
+
+This module defines classes to initialise the parameters in a layer.
+"""
+
+import numpy as np
+
+DEFAULT_SEED = 123456  # Default random number generator seed if none provided.
+
+
+class ConstantInit(object):
+    """Constant parameter initialiser."""
+
+    def __init__(self, value):
+        """Construct a constant parameter initialiser.
+
+        Args:
+            value: Value to initialise parameter to.
+        """
+        self.value = value
+
+    def __call__(self, shape):
+        return np.ones(shape=shape) * self.value
+
+
+class UniformInit(object):
+    """Random uniform parameter initialiser."""
+
+    def __init__(self, low, high, rng=None):
+        """Construct a random uniform parameter initialiser.
+
+        Args:
+            low: Lower bound of interval to sample from.
+            high: Upper bound of interval to sample from.
+            rng (RandomState): Seeded random number generator.
+        """
+        self.low = low
+        self.high = high
+        if rng is None:
+            rng = np.random.RandomState(DEFAULT_SEED)
+        self.rng = rng
+
+    def __call__(self, shape):
+        return self.rng.uniform(low=self.low, high=self.high, size=shape)
+
+
+class NormalInit(object):
+    """Random normal parameter initialiser."""
+
+    def __init__(self, mean, std, rng=None):
+        """Construct a random normal parameter initialiser.
+
+        Args:
+            mean: Mean of distribution to sample from.
+            std: Standard deviation of distribution to sample from.
+            rng (RandomState): Seeded random number generator.
+        """
+        self.mean = mean
+        self.std = std
+        if rng is None:
+            rng = np.random.RandomState(DEFAULT_SEED)
+        self.rng = rng
+
+    def __call__(self, shape):
+        return self.rng.normal(loc=self.mean, scale=self.std, size=shape)
diff --git a/mlp/layers.py b/mlp/layers.py
index 1e46058..760a01c 100644
--- a/mlp/layers.py
+++ b/mlp/layers.py
@@ -1,561 +1,325 @@
+# -*- coding: utf-8 -*-
+"""Layer definitions.
+
-# Machine Learning Practical (INFR11119),
-# Pawel Swietojanski, University of Edinburgh
+This module defines classes which encapsulate a single layer.
-import numpy -import logging -from mlp.costs import Cost +These layers map input activations to output activation with the `fprop` +method and map gradients with repsect to outputs to gradients with respect to +their inputs with the `bprop` method. +Some layers will have learnable parameters and so will additionally define +methods for getting and setting parameter and calculating gradients with +respect to the layer parameters. +""" -logger = logging.getLogger(__name__) - - -def max_and_argmax(x, axes=None, keepdims_max=False, keepdims_argmax=False): - """ - Return both max and argmax for the given multi-dimensional array, possibly - preserve the original shapes - :param x: input tensor - :param axes: tuple of ints denoting axes across which - one should perform reduction - :param keepdims_max: boolean, if true, shape of x is preserved in result - :param keepdims_argmax:, boolean, if true, shape of x is preserved in result - :return: max (number) and argmax (indices) of max element along certain axes - in multi-dimensional tensor - """ - if axes is None: - rval_argmax = numpy.argmax(x) - if keepdims_argmax: - rval_argmax = numpy.unravel_index(rval_argmax, x.shape) - else: - if isinstance(axes, int): - axes = (axes,) - axes = tuple(axes) - keep_axes = numpy.array([i for i in range(x.ndim) if i not in axes]) - transposed_x = numpy.transpose(x, numpy.concatenate((keep_axes, axes))) - reshaped_x = transposed_x.reshape(transposed_x.shape[:len(keep_axes)] + (-1,)) - rval_argmax = numpy.asarray(numpy.argmax(reshaped_x, axis=-1), dtype=numpy.int64) - - # rval_max_arg keeps the arg index referencing to the axis along which reduction was performed (axis=-1) - # when keepdims_argmax is True we need to map it back to the original shape of tensor x - # print 'rval maxaarg', rval_argmax.ndim, rval_argmax.shape, rval_argmax - if keepdims_argmax: - dim = tuple([x.shape[a] for a in axes]) - rval_argmax = numpy.array([idx + numpy.unravel_index(val, dim) - for idx, val in numpy.ndenumerate(rval_argmax)]) - # convert to numpy indexing convention (row indices first, then columns) - rval_argmax = zip(*rval_argmax) - - if keepdims_max is False and keepdims_argmax is True: - # this could potentially save O(N) steps by not traversing array once more - # to get max value, haven't benchmark it though - rval_max = x[rval_argmax] - else: - rval_max = numpy.asarray(numpy.amax(x, axis=axes, keepdims=keepdims_max)) - - return rval_max, rval_argmax - - -class MLP(object): - """ - This is a container for an arbitrary sequence of other transforms - On top of this, the class also keeps the state of the model, i.e. 
- the result of forward (activations) and backward (deltas) passes - through the model (for a mini-batch), which is required to compute - the gradients for the parameters - """ - def __init__(self, cost, rng=None): - - assert isinstance(cost, Cost), ( - "Cost needs to be of type mlp.costs.Cost, got %s" % type(cost) - ) - - self.layers = [] #the actual list of network layers - self.activations = [] #keeps forward-pass activations (h from equations) - # for a given minibatch (or features at 0th index) - self.deltas = [] #keeps back-propagated error signals (deltas from equations) - # for a given minibatch and each layer - self.cost = cost - - if rng is None: - self.rng = numpy.random.RandomState([2015,11,11]) - else: - self.rng = rng - - def fprop(self, x): - """ - - :param inputs: mini-batch of data-points x - :return: y (top layer activation) which is an estimate of y given x - """ - - if len(self.activations) != len(self.layers) + 1: - self.activations = [None]*(len(self.layers) + 1) - - self.activations[0] = x - for i in xrange(0, len(self.layers)): - self.activations[i+1] = self.layers[i].fprop(self.activations[i]) - return self.activations[-1] - - def fprop_dropout(self, x, dp_scheduler): - """ - :param inputs: mini-batch of data-points x - :param dp_scheduler: dropout scheduler - :return: y (top layer activation) which is an estimate of y given x - """ - - if len(self.activations) != len(self.layers) + 1: - self.activations = [None]*(len(self.layers) + 1) - - p_inp, p_hid = dp_scheduler.get_rate() - - d_inp = 1 - p_inp_scaler, p_hid_scaler = 1.0/p_inp, 1.0/p_hid - if p_inp < 1: - d_inp = self.rng.binomial(1, p_inp, size=x.shape) - - self.activations[0] = p_inp_scaler*d_inp*x #it's OK to scale the inputs by p_inp_scaler here - self.activations[1] = self.layers[0].fprop(self.activations[0]) - for i in xrange(1, len(self.layers)): - d_hid = 1 - if p_hid < 1: - d_hid = self.rng.binomial(1, p_hid, size=self.activations[i].shape) - self.activations[i] *= d_hid #but not the hidden activations, since the non-linearity grad *may* explicitly depend on them - self.activations[i+1] = self.layers[i].fprop(p_hid_scaler*self.activations[i]) - - return self.activations[-1] - - def bprop(self, cost_grad, dp_scheduler=None): - """ - :param cost_grad: matrix -- grad of the cost w.r.t y - :return: None, the deltas are kept in the model - """ - - # allocate the list of deltas for each layer - # note, we do not use all of those fields but - # want to keep it aligned 1:1 with activations, - # which will simplify indexing later on when - # computing grads w.r.t parameters - if len(self.deltas) != len(self.activations): - self.deltas = [None]*len(self.activations) - - # treat the top layer in special way, as it deals with the - # cost, which may lead to some simplifications - top_layer_idx = len(self.layers) - self.deltas[top_layer_idx], ograds = self.layers[top_layer_idx - 1].\ - bprop_cost(self.activations[top_layer_idx], cost_grad, self.cost) - - p_hid_scaler = 1.0 - if dp_scheduler is not None: - p_inp, p_hid = dp_scheduler.get_rate() - p_hid_scaler /= p_hid - - # then back-prop through remaining layers - for i in xrange(top_layer_idx - 1, 0, -1): - self.deltas[i], ograds = self.layers[i - 1].\ - bprop(self.activations[i], ograds*p_hid_scaler) - - def add_layer(self, layer): - self.layers.append(layer) - - def set_layers(self, layers): - self.layers = layers - - def get_name(self): - return 'mlp' +import numpy as np +import mlp.initialisers as init class Layer(object): + """Abstract class defining the 
interface for a layer."""
+
+    def fprop(self, inputs):
+        """Forward propagates activations through the layer transformation.
+
+        Args:
+            inputs: Array of layer inputs of shape (batch_size, input_dim).
+
+        Returns:
+            outputs: Array of layer outputs of shape (batch_size, output_dim).
+        """
+        raise NotImplementedError()
+
+    def bprop(self, inputs, outputs, grads_wrt_outputs):
+        """Back propagates gradients through a layer.
+
+        Given gradients with respect to the outputs of the layer calculates the
+        gradients with respect to the layer inputs.
+
+        Args:
+            inputs: Array of layer inputs of shape (batch_size, input_dim).
+            outputs: Array of layer outputs calculated in forward pass of
+                shape (batch_size, output_dim).
+            grads_wrt_outputs: Array of gradients with respect to the layer
+                outputs of shape (batch_size, output_dim).
+
+        Returns:
+            Array of gradients with respect to the layer inputs of shape
+            (batch_size, input_dim).
+        """
+        raise NotImplementedError()
+
+
+class LayerWithParameters(Layer):
+    """Abstract class defining the interface for a layer with parameters."""
+
+    def grads_wrt_params(self, inputs, grads_wrt_outputs):
+        """Calculates gradients with respect to layer parameters.
+
+        Args:
+            inputs: Array of inputs to layer of shape (batch_size, input_dim).
+            grads_wrt_outputs: Array of gradients with respect to the layer
+                outputs of shape (batch_size, output_dim).
+
+        Returns:
+            List of arrays of gradients with respect to the layer parameters
+            with parameter gradients appearing in same order in tuple as
+            returned by the `params` property.
+        """
+        raise NotImplementedError()
+
+    def params_cost(self):
+        """Returns the parameter dependent cost term for this layer.
+
+        If no parameter-dependent cost terms are set this returns zero.
+        """
+        raise NotImplementedError()
+
+    @property
+    def params(self):
+        """Returns a list of parameters of layer.
+
+        Returns:
+            List of current parameter values. This list should be in the
+            corresponding order to the `values` argument to the `params` setter.
+        """
+        raise NotImplementedError()
+
+    @params.setter
+    def params(self, values):
+        """Sets layer parameters from a list of values.
+
+        Args:
+            values: List of values to set parameters to. This list should be
+                in the corresponding order to what the `params` property returns.
+        """
+        raise NotImplementedError()
+
+
+class AffineLayer(LayerWithParameters):
+    """Layer implementing an affine transformation of its inputs.
+
+    This layer is parameterised by a weight matrix and bias vector.
     """
-    Abstract class defining an interface for
-    other transforms.
-    """
-    def __init__(self, rng=None):
-        if rng is None:
-            seed=[2015, 10, 1]
-            self.rng = numpy.random.RandomState(seed)
-        else:
-            self.rng = rng
+    def __init__(self, input_dim, output_dim,
+                 weights_initialiser=init.UniformInit(-0.1, 0.1),
+                 biases_initialiser=init.ConstantInit(0.),
+                 weights_cost=None, biases_cost=None):
+        """Initialises a parameterised affine layer.
+
+        Args:
+            input_dim (int): Dimension of inputs to the layer.
+            output_dim (int): Dimension of the layer outputs.
+            weights_initialiser: Initialiser for the weight parameters.
+            biases_initialiser: Initialiser for the bias parameters.
+            weights_cost: Weights-dependent cost term.
+            biases_cost: Biases-dependent cost term.
+ """ + self.input_dim = input_dim + self.output_dim = output_dim + self.weights = weights_initialiser((self.output_dim, self.input_dim)) + self.biases = biases_initialiser(self.output_dim) + self.weights_cost = weights_cost + self.biases_cost = biases_cost def fprop(self, inputs): - """ - Implements a forward propagation through the i-th layer, that is - some form of: - a^i = xW^i + b^i - h^i = f^i(a^i) - with f^i, W^i, b^i denoting a non-linearity, weight matrix and - biases at the i-th layer, respectively and x denoting inputs. + """Forward propagates activations through the layer transformation. - :param inputs: matrix of features (x) or the output of the previous layer h^{i-1} - :return: h^i, matrix of transformed by layer features - """ - raise NotImplementedError() - - def bprop(self, h, igrads): - """ - Implements a backward propagation through the layer, that is, given - h^i denotes the output of the layer and x^i the input, we compute: - dh^i/dx^i which by chain rule is dh^i/da^i da^i/dx^i - x^i could be either features (x) or the output of the lower layer h^{i-1} - :param h: it's an activation produced in forward pass - :param igrads, error signal (or gradient) flowing to the layer, note, - this in general case does not corresponds to 'deltas' used to update - the layer's parameters, to get deltas ones need to multiply it with - the dh^i/da^i derivative - :return: a tuple (deltas, ograds) where: - deltas = igrads * dh^i/da^i - ograds = deltas \times da^i/dx^i - """ - raise NotImplementedError() + For inputs `x`, outputs `y`, weights `W` and biases `b` the layer + corresponds to `y = W.dot(x) + b`. - def bprop_cost(self, h, igrads, cost=None): + Args: + inputs: Array of layer inputs of shape (batch_size, input_dim). + + Returns: + outputs: Array of layer outputs of shape (batch_size, output_dim). """ - Implements a backward propagation in case the layer directly - deals with the optimised cost (i.e. the top layer) - By default, method should implement a back-prop for default cost, that is - the one that is natural to the layer's output, i.e.: - linear -> mse, softmax -> cross-entropy, sigmoid -> binary cross-entropy - :param h: it's an activation produced in forward pass - :param igrads, error signal (or gradient) flowing to the layer, note, - this in general case does not corresponds to 'deltas' used to update - the layer's parameters, to get deltas ones need to multiply it with - the dh^i/da^i derivative - :return: a tuple (deltas, ograds) where: - deltas = igrads * dh^i/da^i - ograds = deltas \times da^i/dx^i + return self.weights.dot(inputs.T).T + self.biases + + def bprop(self, inputs, outputs, grads_wrt_outputs): + """Back propagates gradients through a layer. + + Given gradients with respect to the outputs of the layer calculates the + gradients with respect to the layer inputs. + + Args: + inputs: Array of layer inputs of shape (batch_size, input_dim). + outputs: Array of layer outputs calculated in forward pass of + shape (batch_size, output_dim). + grads_wrt_outputs: Array of gradients with respect to the layer + outputs of shape (batch_size, output_dim). + + Returns: + Array of gradients with respect to the layer inputs of shape + (batch_size, input_dim). + """ + return grads_wrt_outputs.dot(self.weights) + + def grads_wrt_params(self, inputs, grads_wrt_outputs): + """Calculates gradients with respect to layer parameters. 
+
+        Args:
+            inputs: array of inputs to layer of shape (batch_size, input_dim)
+            grads_wrt_outputs: array of gradients with respect to the layer
+                outputs of shape (batch_size, output_dim)
+
+        Returns:
+            list of arrays of gradients with respect to the layer parameters
+            `[grads_wrt_weights, grads_wrt_biases]`.
         """
-        raise NotImplementedError()
+        grads_wrt_weights = np.dot(grads_wrt_outputs.T, inputs)
+        grads_wrt_biases = np.sum(grads_wrt_outputs, axis=0)
 
-    def pgrads(self, inputs, deltas, **kwargs):
+        if self.weights_cost is not None:
+            grads_wrt_weights += self.weights_cost.grad(self.weights)
+
+        if self.biases_cost is not None:
+            grads_wrt_biases += self.biases_cost.grad(self.biases)
+
+        return [grads_wrt_weights, grads_wrt_biases]
+
+    def params_cost(self):
+        """Returns the parameter dependent cost term for this layer.
+
+        If no parameter-dependent cost terms are set this returns zero.
         """
-        Return gradients w.r.t parameters
-        """
-        raise NotImplementedError()
+        params_cost = 0
+        if self.weights_cost is not None:
+            params_cost += self.weights_cost(self.weights)
+        if self.biases_cost is not None:
+            params_cost += self.biases_cost(self.biases)
+        return params_cost
 
-    def get_params(self):
-        raise NotImplementedError()
+    @property
+    def params(self):
+        """A list of layer parameter values: `[weights, biases]`."""
+        return [self.weights, self.biases]
 
-    def set_params(self):
-        raise NotImplementedError()
+    @params.setter
+    def params(self, values):
+        self.weights = values[0]
+        self.biases = values[1]
 
-    def get_name(self):
-        return 'abstract_layer'
+    def __repr__(self):
+        return 'AffineLayer(input_dim={0}, output_dim={1})'.format(
+            self.input_dim, self.output_dim)
 
 
-class Linear(Layer):
-
-    def __init__(self, idim, odim,
-                 rng=None,
-                 irange=0.1):
-
-        super(Linear, self).__init__(rng=rng)
-
-        self.idim = idim
-        self.odim = odim
-
-        self.W = self.rng.uniform(
-            -irange, irange,
-            (self.idim, self.odim))
-
-        self.b = numpy.zeros((self.odim,), dtype=numpy.float32)
+class SigmoidLayer(Layer):
+    """Layer implementing an element-wise logistic sigmoid transformation."""
 
     def fprop(self, inputs):
+        """Forward propagates activations through the layer transformation.
+
+        For inputs `x` and outputs `y` this corresponds to
+        `y = 1 / (1 + exp(-x))`.
+
+        Args:
+            inputs: Array of layer inputs of shape (batch_size, input_dim).
+
+        Returns:
+            outputs: Array of layer outputs of shape (batch_size, output_dim).
         """
-        Implements a forward propagation through the i-th layer, that is
-        some form of:
-        a^i = xW^i + b^i
-        h^i = f^i(a^i)
-        with f^i, W^i, b^i denoting a non-linearity, weight matrix and
-        biases of this (i-th) layer, respectively and x denoting inputs.
+        return 1. / (1. + np.exp(-inputs))
 
-        :param inputs: matrix of features (x) or the output of the previous layer h^{i-1}
-        :return: h^i, matrix of transformed by layer features
+    def bprop(self, inputs, outputs, grads_wrt_outputs):
+        """Back propagates gradients through a layer.
+
+        Given gradients with respect to the outputs of the layer calculates the
+        gradients with respect to the layer inputs.
+
+        Args:
+            inputs: Array of layer inputs of shape (batch_size, input_dim).
+            outputs: Array of layer outputs calculated in forward pass of
+                shape (batch_size, output_dim).
+            grads_wrt_outputs: Array of gradients with respect to the layer
+                outputs of shape (batch_size, output_dim).
+
+        Returns:
+            Array of gradients with respect to the layer inputs of shape
+            (batch_size, input_dim).
         """
+        return grads_wrt_outputs * outputs * (1.
- outputs) - #input comes from 4D convolutional tensor, reshape to expected shape - if inputs.ndim == 4: - inputs = inputs.reshape(inputs.shape[0], -1) - - a = numpy.dot(inputs, self.W) + self.b - # here f() is an identity function, so just return a linear transformation - return a - - def bprop(self, h, igrads): - """ - Implements a backward propagation through the layer, that is, given - h^i denotes the output of the layer and x^i the input, we compute: - dh^i/dx^i which by chain rule is dh^i/da^i da^i/dx^i - x^i could be either features (x) or the output of the lower layer h^{i-1} - :param h: it's an activation produced in forward pass - :param igrads, error signal (or gradient) flowing to the layer, note, - this in general case does not corresponds to 'deltas' used to update - the layer's parameters, to get deltas ones need to multiply it with - the dh^i/da^i derivative - :return: a tuple (deltas, ograds) where: - deltas = igrads * dh^i/da^i - ograds = deltas \times da^i/dx^i - """ - - # since df^i/da^i = 1 (f is assumed identity function), - # deltas are in fact the same as igrads - ograds = numpy.dot(igrads, self.W.T) - return igrads, ograds - - def bprop_cost(self, h, igrads, cost): - """ - Implements a backward propagation in case the layer directly - deals with the optimised cost (i.e. the top layer) - By default, method should implement a bprop for default cost, that is - the one that is natural to the layer's output, i.e.: - here we implement linear -> mse scenario - :param h: it's an activation produced in forward pass - :param igrads, error signal (or gradient) flowing to the layer, note, - this in general case does not corresponds to 'deltas' used to update - the layer's parameters, to get deltas ones need to multiply it with - the dh^i/da^i derivative - :param cost, mlp.costs.Cost instance defining the used cost - :return: a tuple (deltas, ograds) where: - deltas = igrads * dh^i/da^i - ograds = deltas \times da^i/dx^i - """ - - if cost is None or cost.get_name() == 'mse': - # for linear layer and mean square error cost, - # cost back-prop is the same as standard back-prop - return self.bprop(h, igrads) - else: - raise NotImplementedError('Linear.bprop_cost method not implemented ' - 'for the %s cost' % cost.get_name()) - - def pgrads(self, inputs, deltas, l1_weight=0, l2_weight=0): - """ - Return gradients w.r.t parameters - - :param inputs, input to the i-th layer - :param deltas, deltas computed in bprop stage up to -ith layer - :param kwargs, key-value optional arguments - :return list of grads w.r.t parameters dE/dW and dE/db in *exactly* - the same order as the params are returned by get_params() - - Note: deltas here contain the whole chain rule leading - from the cost up to the the i-th layer, i.e. - dE/dy^L dy^L/da^L da^L/dh^{L-1} dh^{L-1}/da^{L-1} ... 
dh^{i}/da^{i} - and here we are just asking about - 1) da^i/dW^i and 2) da^i/db^i - since W and b are only layer's parameters - """ - - #input comes from 4D convolutional tensor, reshape to expected shape - if inputs.ndim == 4: - inputs = inputs.reshape(inputs.shape[0], -1) - - #you could basically use different scalers for biases - #and weights, but it is not implemented here like this - l2_W_penalty, l2_b_penalty = 0, 0 - if l2_weight > 0: - l2_W_penalty = l2_weight*self.W - l2_b_penalty = l2_weight*self.b - - l1_W_penalty, l1_b_penalty = 0, 0 - if l1_weight > 0: - l1_W_penalty = l1_weight*numpy.sign(self.W) - l1_b_penalty = l1_weight*numpy.sign(self.b) - - grad_W = numpy.dot(inputs.T, deltas) + l2_W_penalty + l1_W_penalty - grad_b = numpy.sum(deltas, axis=0) + l2_b_penalty + l1_b_penalty - - return [grad_W, grad_b] - - def get_params(self): - return [self.W, self.b] - - def set_params(self, params): - #we do not make checks here, but the order on the list - #is assumed to be exactly the same as get_params() returns - self.W = params[0] - self.b = params[1] - - def get_name(self): - return 'linear' + def __repr__(self): + return 'SigmoidLayer' -class Sigmoid(Linear): - def __init__(self, idim, odim, - rng=None, - irange=0.1): - - super(Sigmoid, self).__init__(idim, odim, rng, irange) - - def fprop(self, inputs): - #get the linear activations - a = super(Sigmoid, self).fprop(inputs) - #stabilise the exp() computation in case some values in - #'a' get very negative. We limit both tails, however only - #negative values may lead to numerical issues -- exp(-a) - #clip() function does the following operation faster: - # a[a < -30.] = -30, - # a[a > 30.] = 30. - numpy.clip(a, -30.0, 30.0, out=a) - h = 1.0/(1 + numpy.exp(-a)) - return h - - def bprop(self, h, igrads): - dsigm = h * (1.0 - h) - deltas = igrads * dsigm - ___, ograds = super(Sigmoid, self).bprop(h=None, igrads=deltas) - return deltas, ograds - - def bprop_cost(self, h, igrads, cost): - if cost is None or cost.get_name() == 'bce': - return super(Sigmoid, self).bprop(h=h, igrads=igrads) - else: - raise NotImplementedError('Sigmoid.bprop_cost method not implemented ' - 'for the %s cost' % cost.get_name()) - - def get_name(self): - return 'sigmoid' - - -class Softmax(Linear): - - def __init__(self,idim, odim, - rng=None, - irange=0.1): - - super(Softmax, self).__init__(idim, - odim, - rng=rng, - irange=irange) +class ReluLayer(Layer): + """Layer implementing an element-wise rectified linear transformation.""" def fprop(self, inputs): + """Forward propagates activations through the layer transformation. - # compute the linear outputs - a = super(Softmax, self).fprop(inputs) - # apply numerical stabilisation by subtracting max - # from each row (not required for the coursework) - # then compute exponent - assert a.ndim in [1, 2], ( - "Expected the linear activation in Softmax layer to be either " - "vector or matrix, got %ith dimensional tensor" % a.ndim - ) - axis = a.ndim - 1 - exp_a = numpy.exp(a - numpy.max(a, axis=axis, keepdims=True)) - # finally, normalise by the sum within each example - y = exp_a/numpy.sum(exp_a, axis=axis, keepdims=True) + For inputs `x` and outputs `y` this corresponds to `y = max(0, x)`. - return y + Args: + inputs: Array of layer inputs of shape (batch_size, input_dim). - def bprop(self, h, igrads): - raise NotImplementedError('Softmax.bprop not implemented for hidden layer.') + Returns: + outputs: Array of layer outputs of shape (batch_size, output_dim). + """ + return np.maximum(inputs, 0.) 
- def bprop_cost(self, h, igrads, cost): + def bprop(self, inputs, outputs, grads_wrt_outputs): + """Back propagates gradients through a layer. - if cost is None or cost.get_name() == 'ce': - return super(Softmax, self).bprop(h=h, igrads=igrads) - else: - raise NotImplementedError('Softmax.bprop_cost method not implemented ' - 'for %s cost' % cost.get_name()) + Given gradients with respect to the outputs of the layer calculates the + gradients with respect to the layer inputs. - def get_name(self): - return 'softmax' + Args: + inputs: Array of layer inputs of shape (batch_size, input_dim). + outputs: Array of layer outputs calculated in forward pass of + shape (batch_size, output_dim). + grads_wrt_outputs: Array of gradients with respect to the layer + outputs of shape (batch_size, output_dim). + + Returns: + Array of gradients with respect to the layer inputs of shape + (batch_size, input_dim). + """ + return (outputs > 0) * grads_wrt_outputs + + def __repr__(self): + return 'ReluLayer' -class Relu(Linear): - def __init__(self, idim, odim, - rng=None, - irange=0.1): - - super(Relu, self).__init__(idim, odim, rng, irange) +class TanhLayer(Layer): + """Layer implementing an element-wise hyperbolic tangent transformation.""" def fprop(self, inputs): - #get the linear activations - a = super(Relu, self).fprop(inputs) - h = numpy.clip(a, 0, 20.0) - #h = numpy.maximum(a, 0) - return h + """Forward propagates activations through the layer transformation. - def bprop(self, h, igrads): - deltas = (h > 0)*igrads - ___, ograds = super(Relu, self).bprop(h=None, igrads=deltas) - return deltas, ograds + For inputs `x` and outputs `y` this corresponds to `y = tanh(x)`. - def bprop_cost(self, h, igrads, cost): - raise NotImplementedError('Relu.bprop_cost method not implemented ' - 'for the %s cost' % cost.get_name()) + Args: + inputs: Array of layer inputs of shape (batch_size, input_dim). - def get_name(self): - return 'relu' + Returns: + outputs: Array of layer outputs of shape (batch_size, output_dim). + """ + return np.tanh(inputs) + def bprop(self, inputs, outputs, grads_wrt_outputs): + """Back propagates gradients through a layer. -class Tanh(Linear): - def __init__(self, idim, odim, - rng=None, - irange=0.1): + Given gradients with respect to the outputs of the layer calculates the + gradients with respect to the layer inputs. - super(Tanh, self).__init__(idim, odim, rng, irange) + Args: + inputs: Array of layer inputs of shape (batch_size, input_dim). + outputs: Array of layer outputs calculated in forward pass of + shape (batch_size, output_dim). + grads_wrt_outputs: Array of gradients with respect to the layer + outputs of shape (batch_size, output_dim). - def fprop(self, inputs): - #get the linear activations - a = super(Tanh, self).fprop(inputs) - numpy.clip(a, -30.0, 30.0, out=a) - h = numpy.tanh(a) - return h + Returns: + Array of gradients with respect to the layer inputs of shape + (batch_size, input_dim). + """ + return (1. 
- outputs**2) * grads_wrt_outputs
-    def bprop(self, h, igrads):
-        deltas = (1.0 - h**2) * igrads
-        ___, ograds = super(Tanh, self).bprop(h=None, igrads=deltas)
-        return deltas, ograds
-
-    def bprop_cost(self, h, igrads, cost):
-        raise NotImplementedError('Tanh.bprop_cost method not implemented '
-                                  'for the %s cost' % cost.get_name())
-
-    def get_name(self):
-        return 'tanh'
-
-
-class Maxout(Linear):
-    def __init__(self, idim, odim, k,
-                 rng=None,
-                 irange=0.05):
-
-        super(Maxout, self).__init__(idim, odim*k, rng, irange)
-
-        self.max_odim = odim
-        self.k = k
-
-    def fprop(self, inputs):
-        #get the linear activations
-        a = super(Maxout, self).fprop(inputs)
-        ar = a.reshape(a.shape[0], self.max_odim, self.k)
-        h, h_argmax = max_and_argmax(ar, axes=2, keepdims_max=True, keepdims_argmax=True)
-        self.h_argmax = h_argmax
-        return h[:, :, 0] #get rid of the last reduced dimensison (of size 1)
-
-    def bprop(self, h, igrads):
-        #hack for dropout backprop (ignore dropped neurons). Note, this is not
-        #entirely correct when h fires at 0 exaclty (but is not dropped, in which case
-        #derivative should be 1). However, this is rather unlikely to happen (that h fires as 0)
-        #and probably can be ignored for now. Otherwise, one would have to keep the dropped unit
-        #indexes and zero grads according to them.
-        igrads = (h != 0)*igrads
-        #convert into the shape where upsampling is easier
-        igrads_up = igrads.reshape(igrads.shape[0], self.max_odim, 1)
-        #upsample to the linear dimension (but reshaped to (batch_size, maxed_num (1), pool_size)
-        igrads_up = numpy.tile(igrads_up, (1, 1, self.k))
-        #generate mask matrix and set to 1 maxed elements
-        mask = numpy.zeros_like(igrads_up)
-        mask[self.h_argmax] = 1.0
-        #do bprop through max operator and then reshape into 2D
-        deltas = (igrads_up * mask).reshape(igrads_up.shape[0], -1)
-        #and then do bprop thorough linear part
-        ___, ograds = super(Maxout, self).bprop(h=None, igrads=deltas)
-        return deltas, ograds
-
-    def bprop_cost(self, h, igrads, cost):
-        raise NotImplementedError('Maxout.bprop_cost method not implemented '
-                                  'for the %s cost' % cost.get_name())
-
-    def get_name(self):
-        return 'maxout'
+    def __repr__(self):
+        return 'TanhLayer'
diff --git a/mlp/learning_rules.py b/mlp/learning_rules.py
new file mode 100644
index 0000000..4c771cd
--- /dev/null
+++ b/mlp/learning_rules.py
@@ -0,0 +1,42 @@
+"""Learning rules."""
+
+import numpy as np
+
+
+class GradientDescentLearningRule(object):
+
+    def __init__(self, learning_rate=1e-3):
+        self.learning_rate = learning_rate
+
+    def initialise(self, params):
+        self.params = params
+
+    def reset(self):
+        pass
+
+    def update_params(self, grads_wrt_params):
+        for param, grad in zip(self.params, grads_wrt_params):
+            param -= self.learning_rate * grad
+
+
+class MomentumLearningRule(object):
+
+    def __init__(self, learning_rate=1e-3, mom_coeff=0.9):
+        self.learning_rate = learning_rate
+        self.mom_coeff = mom_coeff
+
+    def initialise(self, params):
+        self.params = params
+        self.moms = []
+        for param in self.params:
+            self.moms.append(np.zeros_like(param))
+
+    def reset(self):
+        for mom in self.moms:
+            mom *= 0.
+
+    def update_params(self, grads_wrt_params):
+        for param, mom, grad in zip(self.params, self.moms, grads_wrt_params):
+            mom *= self.mom_coeff
+            mom -= self.learning_rate * grad
+            param += mom
diff --git a/mlp/models.py b/mlp/models.py
new file mode 100644
index 0000000..18fff11
--- /dev/null
+++ b/mlp/models.py
@@ -0,0 +1,92 @@
+"""
+Model definitions.
+"""
+
+from mlp.layers import LayerWithParameters
+
+
+class SingleLayerModel(object):
+    """A model consisting of a single transformation layer.
+    """
+    def __init__(self, layer):
+        self.layer = layer
+
+    @property
+    def params(self):
+        """A list of all of the parameters of the model.
+        """
+        return self.layer.params
+
+    def fprop(self, inputs):
+        """Forward propagates a batch of inputs through the model.
+        """
+        activations = [inputs, self.layer.fprop(inputs)]
+        return activations
+
+    def grads_wrt_params(self, activations, grads_wrt_outputs):
+        """Calculates gradients with respect to the layer parameters.
+        """
+        return self.layer.grads_wrt_params(activations[0], grads_wrt_outputs)
+
+    def params_cost(self):
+        """Calculates the parameter dependent cost term of the model.
+        """
+        return self.layer.params_cost()
+
+    def __repr__(self):
+        return 'SingleLayerModel(' + str(self.layer) + ')'
+
+
+class MultipleLayerModel(object):
+    """A model consisting of multiple layers applied sequentially.
+    """
+    def __init__(self, layers):
+        self.layers = layers
+
+    @property
+    def params(self):
+        """A list of all of the parameters of the model.
+        """
+        params = []
+        for layer in self.layers:
+            if isinstance(layer, LayerWithParameters):
+                params += layer.params
+        return params
+
+    def fprop(self, inputs):
+        """Forward propagates a batch of inputs through the model.
+        """
+        activations = [inputs]
+        for i, layer in enumerate(self.layers):
+            activations.append(self.layers[i].fprop(activations[i]))
+        return activations
+
+    def grads_wrt_params(self, activations, grads_wrt_outputs):
+        """Calculates gradients with respect to all model parameters.
+        """
+        grads_wrt_params = []
+        for i, layer in enumerate(self.layers[::-1]):
+            inputs = activations[-i - 2]
+            outputs = activations[-i - 1]
+            grads_wrt_inputs = layer.bprop(inputs, outputs, grads_wrt_outputs)
+            if isinstance(layer, LayerWithParameters):
+                grads_wrt_params += layer.grads_wrt_params(
+                    inputs, grads_wrt_outputs)[::-1]
+            grads_wrt_outputs = grads_wrt_inputs
+        return grads_wrt_params[::-1]
+
+    def params_cost(self):
+        """Calculates the parameter dependent cost term of the model.
+        """
+        params_cost = 0.
+        for layer in self.layers:
+            if isinstance(layer, LayerWithParameters):
+                params_cost += layer.params_cost()
+        return params_cost
+
+    def __repr__(self):
+        return (
+            'MultipleLayerModel(\n    ' +
+            '\n    '.join([str(layer) for layer in self.layers]) +
+            '\n)'
+        )
diff --git a/mlp/trainers.py b/mlp/trainers.py
new file mode 100644
index 0000000..a8c5e23
--- /dev/null
+++ b/mlp/trainers.py
@@ -0,0 +1,66 @@
+"""Model trainers."""
+
+import time
+import logging
+from collections import OrderedDict
+import numpy as np
+
+
+logger = logging.getLogger(__name__)
+
+
+class Trainer(object):
+
+    def __init__(self, model, cost, learning_rule, train_dataset,
+                 valid_dataset=None):
+        self.model = model
+        self.cost = cost
+        self.learning_rule = learning_rule
+        self.learning_rule.initialise(self.model.params)
+        self.train_dataset = train_dataset
+        self.valid_dataset = valid_dataset
+
+    def do_training_epoch(self):
+        for inputs_batch, targets_batch in self.train_dataset:
+            activations = self.model.fprop(inputs_batch)
+            grads_wrt_outputs = self.cost.grad(activations[-1], targets_batch)
+            grads_wrt_params = self.model.grads_wrt_params(
+                activations, grads_wrt_outputs)
+            self.learning_rule.update_params(grads_wrt_params)
+        self.train_dataset.reset()
+
+    def data_cost(self, dataset):
+        cost = 0.
+        for inputs_batch, targets_batch in dataset:
+            activations = self.model.fprop(inputs_batch)
+            cost += self.cost(activations[-1], targets_batch)
+        dataset.reset()
+        return cost
+
+    def get_epoch_stats(self):
+        epoch_stats = OrderedDict()
+        epoch_stats['cost(train)'] = self.data_cost(
+            self.train_dataset)
+        epoch_stats['cost(valid)'] = self.data_cost(
+            self.valid_dataset)
+        epoch_stats['cost(param)'] = self.model.params_cost()
+        return epoch_stats
+
+    def log_stats(self, epoch, stats):
+        logger.info('Epoch {0}: {1}'.format(
+            epoch,
+            ', '.join(['{0}={1:.3f}'.format(k, v) for (k, v) in stats.items()])
+        ))
+
+    def train(self, n_epochs, stats_interval=5):
+        run_stats = []
+        for epoch in range(n_epochs):
+            start_time = time.time()
+            self.do_training_epoch()
+            epoch_time = time.time() - start_time
+            if epoch % stats_interval == 0:
+                stats = self.get_epoch_stats()
+                stats['time'] = epoch_time
+                self.log_stats(epoch, stats)
+                run_stats.append(list(stats.values()))
+        return np.array(run_stats), list(stats.keys())
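Usage sketch (not part of the changeset): the snippet below wires the new modules together end to end, as a quick check that the interfaces introduced above fit together. It is a minimal sketch under stated assumptions -- the `ArrayDataProvider` class and the synthetic random data are stand-ins for whatever data provider the course code actually supplies (the `Trainer` only needs an object that yields `(inputs, targets)` mini-batches when iterated over and exposes a `reset()` method), and the layer sizes, batch size and learning rate are arbitrary placeholder values.

import logging

import numpy as np

from mlp.costs import CrossEntropySoftmaxCost
from mlp.layers import AffineLayer, ReluLayer
from mlp.learning_rules import GradientDescentLearningRule
from mlp.models import MultipleLayerModel
from mlp.trainers import Trainer

logging.basicConfig(level=logging.INFO)


class ArrayDataProvider(object):
    """Minimal in-memory batch provider (assumption -- not defined in this diff)."""

    def __init__(self, inputs, targets, batch_size=50):
        self.inputs = inputs
        self.targets = targets
        self.batch_size = batch_size

    def reset(self):
        # Nothing to rewind for a plain array-backed provider.
        pass

    def __iter__(self):
        for i in range(0, self.inputs.shape[0], self.batch_size):
            yield (self.inputs[i:i + self.batch_size],
                   self.targets[i:i + self.batch_size])


# Synthetic stand-in data: 784-dimensional inputs, 10 one-hot coded classes.
rng = np.random.RandomState(123456)
inputs = rng.normal(size=(1000, 784))
targets = np.eye(10)[rng.randint(0, 10, size=1000)]
train_data = ArrayDataProvider(inputs[:800], targets[:800])
valid_data = ArrayDataProvider(inputs[800:], targets[800:])

# Two affine layers with a ReLU in between; the final layer outputs are
# logits, so they are paired with the softmax + cross-entropy cost.
model = MultipleLayerModel([
    AffineLayer(784, 100),
    ReluLayer(),
    AffineLayer(100, 10),
])
cost = CrossEntropySoftmaxCost()
learning_rule = GradientDescentLearningRule(learning_rate=0.1)

trainer = Trainer(model, cost, learning_rule, train_data, valid_data)
stats, keys = trainer.train(n_epochs=20, stats_interval=5)

Because the default `UniformInit`/`ConstantInit` initialisers fall back to `DEFAULT_SEED`, the run is repeatable without passing an `rng` to the layers explicitly.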