Initial refactor of framework.
This commit is contained in:
parent 58452a0de0
commit 89beb62fce

mlp/costs.py — 141 changed lines
@@ -1,64 +1,105 @@
"""Model costs."""

import numpy as np


class MeanSquaredErrorCost(object):
    """Mean squared error cost."""

    def __call__(self, outputs, targets):
        return 0.5 * np.mean(np.sum((outputs - targets)**2, axis=1))

    def grad(self, outputs, targets):
        return outputs - targets

    def __repr__(self):
        return 'MeanSquaredErrorCost'


class BinaryCrossEntropyCost(object):
    """Binary cross entropy cost."""

    def __call__(self, outputs, targets):
        return -np.mean(
            targets * np.log(outputs) + (1. - targets) * np.log(1. - outputs))

    def grad(self, outputs, targets):
        return (1. - targets) / (1. - outputs) - (targets / outputs)

    def __repr__(self):
        return 'BinaryCrossEntropyCost'


class BinaryCrossEntropySigmoidCost(object):
    """Binary cross entropy cost with a logistic sigmoid applied to outputs."""

    def __call__(self, outputs, targets):
        probs = 1. / (1. + np.exp(-outputs))
        return -np.mean(
            targets * np.log(probs) + (1. - targets) * np.log(1. - probs))

    def grad(self, outputs, targets):
        probs = 1. / (1. + np.exp(-outputs))
        return probs - targets

    def __repr__(self):
        return 'BinaryCrossEntropySigmoidCost'


class BinaryAccuracySigmoidCost(object):
    """Binary classification accuracy with a logistic sigmoid applied to outputs."""

    def __call__(self, outputs, targets):
        return ((outputs > 0) == targets).mean()

    def __repr__(self):
        return 'BinaryAccuracySigmoidCost'


class CrossEntropyCost(object):
    """Multi-class cross entropy cost."""

    def __call__(self, outputs, targets):
        return -np.mean(np.sum(targets * np.log(outputs), axis=1))

    def grad(self, outputs, targets):
        return -targets / outputs

    def __repr__(self):
        return 'CrossEntropyCost'


class CrossEntropySoftmaxCost(object):
    """Multi-class cross entropy cost with a softmax applied to outputs."""

    def __call__(self, outputs, targets):
        # Normalise exponentiated outputs to softmax probabilities per row.
        probs = np.exp(outputs)
        probs /= probs.sum(-1)[:, None]
        return -np.mean(np.sum(targets * np.log(probs), axis=1))

    def grad(self, outputs, targets):
        probs = np.exp(outputs)
        probs /= probs.sum(-1)[:, None]
        return probs - targets

    def __repr__(self):
        return 'CrossEntropySoftmaxCost'


class MulticlassAccuracySoftmaxCost(object):
    """Multi-class classification accuracy with a softmax applied to outputs."""

    def __call__(self, outputs, targets):
        probs = np.exp(outputs)
        return np.mean(np.argmax(probs, -1) == np.argmax(targets, -1))

    def __repr__(self):
        return 'MulticlassAccuracySoftmaxCost'
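All cost classes above share the same small interface: calling the object with `(outputs, targets)` returns a scalar cost for the batch, and `grad(outputs, targets)` returns the gradient array fed back into the model. The sketch below exercises that interface for `MeanSquaredErrorCost` and checks `grad` against a finite-difference estimate of `__call__`; the batch values are arbitrary illustration, not part of the commit.

import numpy as np
from mlp.costs import MeanSquaredErrorCost

# Toy batch: 3 examples, 2 output dimensions (values chosen arbitrarily).
outputs = np.array([[0.2, -0.4], [1.5, 0.3], [-0.7, 0.9]])
targets = np.array([[0.0, -0.5], [1.0, 0.5], [-1.0, 1.0]])

cost = MeanSquaredErrorCost()
print(cost(outputs, targets))          # scalar cost averaged over the batch
analytic = cost.grad(outputs, targets)

# Finite-difference check of grad() against __call__().
eps = 1e-6
numeric = np.zeros_like(outputs)
for idx in np.ndindex(*outputs.shape):
    shift = np.zeros_like(outputs)
    shift[idx] = eps
    numeric[idx] = (cost(outputs + shift, targets) -
                    cost(outputs - shift, targets)) / (2 * eps)

# grad() returns the per-example gradient (outputs - targets), while the
# scalar cost averages over the batch, so the numerical estimate is smaller
# by a factor of batch_size.
assert np.allclose(analytic / outputs.shape[0], numeric, atol=1e-6)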
mlp/initialisers.py — 66 added lines (new file)
@@ -0,0 +1,66 @@
# -*- coding: utf-8 -*-
"""Parameter initialisers.

This module defines classes to initialise the parameters in a layer.
"""

import numpy as np


DEFAULT_SEED = 123456  # Default random number generator seed if none provided.


class ConstantInit(object):
    """Constant parameter initialiser."""

    def __init__(self, value):
        """Construct a constant parameter initialiser.

        Args:
            value: Value to initialise parameter to.
        """
        self.value = value

    def __call__(self, shape):
        return np.ones(shape=shape) * self.value


class UniformInit(object):
    """Random uniform parameter initialiser."""

    def __init__(self, low, high, rng=None):
        """Construct a random uniform parameter initialiser.

        Args:
            low: Lower bound of interval to sample from.
            high: Upper bound of interval to sample from.
            rng (RandomState): Seeded random number generator.
        """
        self.low = low
        self.high = high
        if rng is None:
            rng = np.random.RandomState(DEFAULT_SEED)
        self.rng = rng

    def __call__(self, shape):
        return self.rng.uniform(low=self.low, high=self.high, size=shape)


class NormalInit(object):
    """Random normal parameter initialiser."""

    def __init__(self, mean, std, rng=None):
        """Construct a random normal parameter initialiser.

        Args:
            mean: Mean of distribution to sample from.
            std: Standard deviation of distribution to sample from.
            rng (RandomState): Seeded random number generator.
        """
        self.mean = mean
        self.std = std
        if rng is None:
            rng = np.random.RandomState(DEFAULT_SEED)
        self.rng = rng

    def __call__(self, shape):
        return self.rng.normal(loc=self.mean, scale=self.std, size=shape)
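Initialisers are plain callables: given a shape they return an array, and passing an explicitly seeded `RandomState` makes the draw reproducible. A minimal usage sketch (the shapes and seed below are only for illustration):

import numpy as np
from mlp.initialisers import ConstantInit, UniformInit, NormalInit

# Weight matrices in AffineLayer are created with shape (output_dim, input_dim).
weights_init = UniformInit(-0.1, 0.1, rng=np.random.RandomState(123456))
biases_init = ConstantInit(0.)

weights = weights_init((5, 10))
biases = biases_init(5)

# Re-creating the initialiser with the same seed reproduces the same values.
same_weights = UniformInit(-0.1, 0.1, rng=np.random.RandomState(123456))((5, 10))
assert np.allclose(weights, same_weights)

# NormalInit draws from a Gaussian instead of a uniform interval.
print(NormalInit(mean=0., std=0.01)((2, 3)).shape)  # (2, 3)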
mlp/layers.py — 784 changed lines
@@ -1,561 +1,325 @@
# -*- coding: utf-8 -*-
"""Layer definitions.

This module defines classes which encapsulate a single layer.

These layers map input activations to output activations with the `fprop`
method and map gradients with respect to outputs to gradients with respect
to their inputs with the `bprop` method.

Some layers will have learnable parameters and so will additionally define
methods for getting and setting parameters and for calculating gradients
with respect to the layer parameters.
"""

import numpy as np

import mlp.initialisers as init


class Layer(object):
    """Abstract class defining the interface for a layer."""

    def fprop(self, inputs):
        """Forward propagates activations through the layer transformation.

        Args:
            inputs: Array of layer inputs of shape (batch_size, input_dim).

        Returns:
            outputs: Array of layer outputs of shape (batch_size, output_dim).
        """
        raise NotImplementedError()

    def bprop(self, inputs, outputs, grads_wrt_outputs):
        """Back propagates gradients through a layer.

        Given gradients with respect to the outputs of the layer calculates
        the gradients with respect to the layer inputs.

        Args:
            inputs: Array of layer inputs of shape (batch_size, input_dim).
            outputs: Array of layer outputs calculated in forward pass of
                shape (batch_size, output_dim).
            grads_wrt_outputs: Array of gradients with respect to the layer
                outputs of shape (batch_size, output_dim).

        Returns:
            Array of gradients with respect to the layer inputs of shape
            (batch_size, input_dim).
        """
        raise NotImplementedError()


class LayerWithParameters(Layer):
    """Abstract class defining the interface for a layer with parameters."""

    def grads_wrt_params(self, inputs, grads_wrt_outputs):
        """Calculates gradients with respect to layer parameters.

        Args:
            inputs: Array of inputs to layer of shape (batch_size, input_dim).
            grads_wrt_outputs: Array of gradients with respect to the layer
                outputs of shape (batch_size, output_dim).

        Returns:
            List of arrays of gradients with respect to the layer parameters,
            with parameter gradients appearing in the same order as returned
            by the `params` property.
        """
        raise NotImplementedError()

    def params_cost(self):
        """Returns the parameter dependent cost term for this layer.

        If no parameter-dependent cost terms are set this returns zero.
        """
        raise NotImplementedError()

    @property
    def params(self):
        """Returns a list of parameters of layer.

        Returns:
            List of current parameter values. This list should be in the
            corresponding order to the `values` argument to the `params`
            setter.
        """
        raise NotImplementedError()

    @params.setter
    def params(self, values):
        """Sets layer parameters from a list of values.

        Args:
            values: List of values to set parameters to. This list should be
                in the corresponding order to what is returned by the
                `params` property.
        """
        raise NotImplementedError()


class AffineLayer(LayerWithParameters):
    """Layer implementing an affine transformation of its inputs.

    This layer is parameterised by a weight matrix and bias vector.
    """

    def __init__(self, input_dim, output_dim,
                 weights_initialiser=init.UniformInit(-0.1, 0.1),
                 biases_initialiser=init.ConstantInit(0.),
                 weights_cost=None, biases_cost=None):
        """Initialises a parameterised affine layer.

        Args:
            input_dim (int): Dimension of inputs to the layer.
            output_dim (int): Dimension of the layer outputs.
            weights_initialiser: Initialiser for the weight parameters.
            biases_initialiser: Initialiser for the bias parameters.
            weights_cost: Weights-dependent cost term.
            biases_cost: Biases-dependent cost term.
        """
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.weights = weights_initialiser((self.output_dim, self.input_dim))
        self.biases = biases_initialiser(self.output_dim)
        self.weights_cost = weights_cost
        self.biases_cost = biases_cost

    def fprop(self, inputs):
        """Forward propagates activations through the layer transformation.

        For inputs `x`, outputs `y`, weights `W` and biases `b` the layer
        corresponds to `y = W.dot(x) + b`.

        Args:
            inputs: Array of layer inputs of shape (batch_size, input_dim).

        Returns:
            outputs: Array of layer outputs of shape (batch_size, output_dim).
        """
        return self.weights.dot(inputs.T).T + self.biases

    def bprop(self, inputs, outputs, grads_wrt_outputs):
        """Back propagates gradients through a layer.

        Given gradients with respect to the outputs of the layer calculates
        the gradients with respect to the layer inputs.

        Args:
            inputs: Array of layer inputs of shape (batch_size, input_dim).
            outputs: Array of layer outputs calculated in forward pass of
                shape (batch_size, output_dim).
            grads_wrt_outputs: Array of gradients with respect to the layer
                outputs of shape (batch_size, output_dim).

        Returns:
            Array of gradients with respect to the layer inputs of shape
            (batch_size, input_dim).
        """
        return grads_wrt_outputs.dot(self.weights)

    def grads_wrt_params(self, inputs, grads_wrt_outputs):
        """Calculates gradients with respect to layer parameters.

        Args:
            inputs: Array of inputs to layer of shape (batch_size, input_dim).
            grads_wrt_outputs: Array of gradients with respect to the layer
                outputs of shape (batch_size, output_dim).

        Returns:
            List of arrays of gradients with respect to the layer parameters
            `[grads_wrt_weights, grads_wrt_biases]`.
        """
        grads_wrt_weights = np.dot(grads_wrt_outputs.T, inputs)
        grads_wrt_biases = np.sum(grads_wrt_outputs, axis=0)

        if self.weights_cost is not None:
            grads_wrt_weights += self.weights_cost.grad(self.weights)

        if self.biases_cost is not None:
            grads_wrt_biases += self.biases_cost.grad(self.biases)

        return [grads_wrt_weights, grads_wrt_biases]

    def params_cost(self):
        """Returns the parameter dependent cost term for this layer.

        If no parameter-dependent cost terms are set this returns zero.
        """
        params_cost = 0
        if self.weights_cost is not None:
            params_cost += self.weights_cost(self.weights)
        if self.biases_cost is not None:
            params_cost += self.biases_cost(self.biases)
        return params_cost

    @property
    def params(self):
        """A list of layer parameter values: `[weights, biases]`."""
        return [self.weights, self.biases]

    @params.setter
    def params(self, values):
        self.weights = values[0]
        self.biases = values[1]

    def __repr__(self):
        return 'AffineLayer(input_dim={0}, output_dim={1})'.format(
            self.input_dim, self.output_dim)


class SigmoidLayer(Layer):
    """Layer implementing an element-wise logistic sigmoid transformation."""

    def fprop(self, inputs):
        """Forward propagates activations through the layer transformation.

        For inputs `x` and outputs `y` this corresponds to
        `y = 1 / (1 + exp(-x))`.
        """
        return 1. / (1. + np.exp(-inputs))

    def bprop(self, inputs, outputs, grads_wrt_outputs):
        """Back propagates gradients through a layer.

        Given gradients with respect to the outputs of the layer calculates
        the gradients with respect to the layer inputs.
        """
        return grads_wrt_outputs * outputs * (1. - outputs)

    def __repr__(self):
        return 'SigmoidLayer'


class ReluLayer(Layer):
    """Layer implementing an element-wise rectified linear transformation."""

    def fprop(self, inputs):
        """Forward propagates activations through the layer transformation.

        For inputs `x` and outputs `y` this corresponds to `y = max(0, x)`.
        """
        return np.maximum(inputs, 0.)

    def bprop(self, inputs, outputs, grads_wrt_outputs):
        """Back propagates gradients through a layer.

        Given gradients with respect to the outputs of the layer calculates
        the gradients with respect to the layer inputs.
        """
        return (outputs > 0) * grads_wrt_outputs

    def __repr__(self):
        return 'ReluLayer'


class TanhLayer(Layer):
    """Layer implementing an element-wise hyperbolic tangent transformation."""

    def fprop(self, inputs):
        """Forward propagates activations through the layer transformation.

        For inputs `x` and outputs `y` this corresponds to `y = tanh(x)`.
        """
        return np.tanh(inputs)

    def bprop(self, inputs, outputs, grads_wrt_outputs):
        """Back propagates gradients through a layer.

        Given gradients with respect to the outputs of the layer calculates
        the gradients with respect to the layer inputs.
        """
        return (1. - outputs**2) * grads_wrt_outputs

    def __repr__(self):
        return 'TanhLayer'
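The layer interface composes by hand exactly as the docstrings describe: `fprop` maps inputs to outputs, `bprop` pushes output gradients back to input gradients using the cached forward-pass arrays, and `grads_wrt_params` pairs the layer inputs with the output gradients. A minimal sketch chaining an `AffineLayer` and a `SigmoidLayer` (dimensions and the all-ones output gradient are arbitrary stand-ins):

import numpy as np
from mlp.layers import AffineLayer, SigmoidLayer

batch_size, input_dim, output_dim = 4, 3, 2
rng = np.random.RandomState(123456)
inputs = rng.normal(size=(batch_size, input_dim))

affine = AffineLayer(input_dim, output_dim)
sigmoid = SigmoidLayer()

# Forward pass: each layer maps (batch_size, dim_in) -> (batch_size, dim_out).
affine_out = affine.fprop(inputs)
sigmoid_out = sigmoid.fprop(affine_out)

# Backward pass: start from a gradient with respect to the final outputs
# (ones here, standing in for a cost gradient) and push it back layer by layer.
grads_wrt_sigmoid_out = np.ones((batch_size, output_dim))
grads_wrt_affine_out = sigmoid.bprop(
    affine_out, sigmoid_out, grads_wrt_sigmoid_out)
grads_wrt_inputs = affine.bprop(inputs, affine_out, grads_wrt_affine_out)

# Parameter gradients use the layer *inputs* and the gradients with respect
# to the layer *outputs*, in the same order as `affine.params`.
grads_wrt_weights, grads_wrt_biases = affine.grads_wrt_params(
    inputs, grads_wrt_affine_out)
print(grads_wrt_weights.shape, grads_wrt_biases.shape)  # (2, 3) and (2,)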
mlp/learning_rules.py — 42 added lines (new file)
@@ -0,0 +1,42 @@
"""Learning rules."""

import numpy as np


class GradientDescentLearningRule(object):
    """Basic gradient descent with a fixed learning rate."""

    def __init__(self, learning_rate=1e-3):
        self.learning_rate = learning_rate

    def initialise(self, params):
        self.params = params

    def reset(self):
        pass

    def update_params(self, grads_wrt_params):
        for param, grad in zip(self.params, grads_wrt_params):
            param -= self.learning_rate * grad


class MomentumLearningRule(object):
    """Gradient descent with momentum."""

    def __init__(self, learning_rate=1e-3, mom_coeff=0.9):
        self.learning_rate = learning_rate
        self.mom_coeff = mom_coeff

    def initialise(self, params):
        self.params = params
        self.moms = []
        for param in self.params:
            self.moms.append(np.zeros_like(param))

    def reset(self):
        for mom in self.moms:
            mom *= 0.

    def update_params(self, grads_wrt_params):
        for param, mom, grad in zip(self.params, self.moms, grads_wrt_params):
            mom *= self.mom_coeff
            mom -= self.learning_rate * grad
            param += mom
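Both rules mutate the parameter arrays in place with `-=` and `+=`, so `initialise` must be passed the actual arrays the model holds (e.g. `model.params`), and `update_params` expects gradients in the same order. A short sketch with two stand-in parameter arrays:

import numpy as np
from mlp.learning_rules import GradientDescentLearningRule, MomentumLearningRule

# Two parameter arrays standing in for a layer's weights and biases.
params = [np.ones((2, 3)), np.zeros(2)]
grads = [0.5 * np.ones((2, 3)), np.ones(2)]

rule = GradientDescentLearningRule(learning_rate=0.1)
rule.initialise(params)      # the rule updates these arrays in place
rule.update_params(grads)
print(params[0][0, 0])       # 1.0 - 0.1 * 0.5 = 0.95

# MomentumLearningRule keeps one momentum array per parameter and applies a
# decaying accumulation of past gradient steps.
mom_rule = MomentumLearningRule(learning_rate=0.1, mom_coeff=0.9)
mom_rule.initialise(params)
mom_rule.update_params(grads)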
mlp/models.py — 92 added lines (new file)
@@ -0,0 +1,92 @@
"""Model definitions."""

from mlp.layers import LayerWithParameters


class SingleLayerModel(object):
    """A model consisting of a single transformation layer."""

    def __init__(self, layer):
        self.layer = layer

    @property
    def params(self):
        """A list of all of the parameters of the model."""
        return self.layer.params

    def fprop(self, inputs):
        """Calculates the model outputs for a batch of inputs.

        Returns the list of activations `[inputs, outputs]`.
        """
        activations = [inputs, self.layer.fprop(inputs)]
        return activations

    def grads_wrt_params(self, activations, grads_wrt_outputs):
        """Calculates gradients with respect to the model parameters."""
        return self.layer.grads_wrt_params(activations[0], grads_wrt_outputs)

    def params_cost(self):
        """Calculates the parameter-dependent cost term of the model."""
        return self.layer.params_cost()

    def __repr__(self):
        return 'SingleLayerModel(' + str(self.layer) + ')'


class MultipleLayerModel(object):
    """A model consisting of multiple layers applied sequentially."""

    def __init__(self, layers):
        self.layers = layers

    @property
    def params(self):
        """A list of all of the parameters of the model."""
        params = []
        for layer in self.layers:
            if isinstance(layer, LayerWithParameters):
                params += layer.params
        return params

    def fprop(self, inputs):
        """Calculates the model outputs for a batch of inputs.

        Returns the list of activations at each stage, with the inputs at
        index 0 and the final outputs at index -1.
        """
        activations = [inputs]
        for i, layer in enumerate(self.layers):
            activations.append(self.layers[i].fprop(activations[i]))
        return activations

    def grads_wrt_params(self, activations, grads_wrt_outputs):
        """Calculates gradients with respect to all model parameters.

        Back propagates through the layers in reverse order, collecting the
        parameter gradients in the same order as the `params` property.
        """
        grads_wrt_params = []
        for i, layer in enumerate(self.layers[::-1]):
            inputs = activations[-i - 2]
            outputs = activations[-i - 1]
            grads_wrt_inputs = layer.bprop(inputs, outputs, grads_wrt_outputs)
            if isinstance(layer, LayerWithParameters):
                grads_wrt_params += layer.grads_wrt_params(
                    inputs, grads_wrt_outputs)[::-1]
            grads_wrt_outputs = grads_wrt_inputs
        return grads_wrt_params[::-1]

    def params_cost(self):
        """Calculates the parameter-dependent cost term of the model."""
        params_cost = 0.
        for layer in self.layers:
            if isinstance(layer, LayerWithParameters):
                params_cost += layer.params_cost()
        return params_cost

    def __repr__(self):
        return (
            'MultiLayerModel(\n    ' +
            '\n    '.join([str(layer) for layer in self.layers]) +
            '\n)'
        )
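A multi-layer model ties the pieces above together: `fprop` returns the full activation list, a cost object supplies the gradient at the output, and `grads_wrt_params` returns parameter gradients flattened in the same order as `model.params`. A small sketch under those assumptions, with arbitrary layer sizes:

import numpy as np
from mlp.layers import AffineLayer, SigmoidLayer
from mlp.models import MultipleLayerModel
from mlp.costs import MeanSquaredErrorCost

rng = np.random.RandomState(123456)
inputs = rng.normal(size=(4, 3))
targets = rng.normal(size=(4, 2))

model = MultipleLayerModel([
    AffineLayer(3, 5),
    SigmoidLayer(),
    AffineLayer(5, 2),
])

# fprop returns the full list of activations: activations[0] is the input
# batch and activations[-1] is the model output.
activations = model.fprop(inputs)

cost = MeanSquaredErrorCost()
grads_wrt_outputs = cost.grad(activations[-1], targets)

# Parameter gradients come back in the same order as model.params:
# [weights_0, biases_0, weights_1, biases_1].
grads_wrt_params = model.grads_wrt_params(activations, grads_wrt_outputs)
assert len(grads_wrt_params) == len(model.params)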
mlp/trainers.py — 65 added lines (new file)
@@ -0,0 +1,65 @@
"""Model trainers."""

import time
import logging
from collections import OrderedDict

import numpy as np


logger = logging.getLogger(__name__)


class Trainer(object):
    """Basic model trainer: runs training epochs and logs cost statistics."""

    def __init__(self, model, cost, learning_rule, train_dataset,
                 valid_dataset=None):
        self.model = model
        self.cost = cost
        self.learning_rule = learning_rule
        self.learning_rule.initialise(self.model.params)
        self.train_dataset = train_dataset
        self.valid_dataset = valid_dataset

    def do_training_epoch(self):
        """Performs one pass over the training data, updating parameters."""
        for inputs_batch, targets_batch in self.train_dataset:
            activations = self.model.fprop(inputs_batch)
            grads_wrt_outputs = self.cost.grad(activations[-1], targets_batch)
            grads_wrt_params = self.model.grads_wrt_params(
                activations, grads_wrt_outputs)
            self.learning_rule.update_params(grads_wrt_params)
        self.train_dataset.reset()

    def data_cost(self, dataset):
        """Accumulates the cost over all batches in a dataset."""
        cost = 0.
        for inputs_batch, targets_batch in dataset:
            activations = self.model.fprop(inputs_batch)
            cost += self.cost(activations[-1], targets_batch)
        dataset.reset()
        return cost

    def get_epoch_stats(self):
        epoch_stats = OrderedDict()
        epoch_stats['cost(train)'] = self.data_cost(self.train_dataset)
        epoch_stats['cost(valid)'] = self.data_cost(self.valid_dataset)
        epoch_stats['cost(param)'] = self.model.params_cost()
        return epoch_stats

    def log_stats(self, epoch, stats):
        logger.info('Epoch {0}: {1}'.format(
            epoch,
            ', '.join(['{0}={1:.3f}'.format(k, v) for (k, v) in stats.items()])
        ))

    def train(self, n_epochs, stats_interval=5):
        run_stats = []
        for epoch in range(n_epochs):
            start_time = time.clock()
            self.do_training_epoch()
            epoch_time = time.clock() - start_time
            if epoch % stats_interval == 0:
                stats = self.get_epoch_stats()
                stats['time'] = epoch_time
                self.log_stats(epoch, stats)
                run_stats.append(stats.items())
        return np.array(run_stats), stats.keys()
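The Trainer only requires that its datasets are iterable over `(inputs_batch, targets_batch)` pairs and expose a `reset()` method; the actual data provider classes are not part of this commit. The sketch below wires a full training run using a hypothetical `ArrayBatchIterator` stand-in and random data, just to show how model, cost, learning rule and trainer fit together:

import numpy as np
from mlp.layers import AffineLayer
from mlp.models import SingleLayerModel
from mlp.costs import MeanSquaredErrorCost
from mlp.learning_rules import GradientDescentLearningRule
from mlp.trainers import Trainer


class ArrayBatchIterator(object):
    """Hypothetical stand-in data provider yielding fixed-size batches."""

    def __init__(self, inputs, targets, batch_size=10):
        self.inputs, self.targets, self.batch_size = inputs, targets, batch_size

    def __iter__(self):
        for i in range(0, self.inputs.shape[0], self.batch_size):
            yield (self.inputs[i:i + self.batch_size],
                   self.targets[i:i + self.batch_size])

    def reset(self):
        pass  # nothing to rewind for a plain array-backed iterator


rng = np.random.RandomState(123456)
train_data = ArrayBatchIterator(rng.normal(size=(100, 3)),
                                rng.normal(size=(100, 2)))
valid_data = ArrayBatchIterator(rng.normal(size=(20, 3)),
                                rng.normal(size=(20, 2)))

model = SingleLayerModel(AffineLayer(3, 2))
trainer = Trainer(model, MeanSquaredErrorCost(),
                  GradientDescentLearningRule(learning_rate=0.01),
                  train_data, valid_data)
stats, keys = trainer.train(n_epochs=5, stats_interval=1)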