From 04fe5a72798eb8a9ddcbd9fbaa55587c2968bb6e Mon Sep 17 00:00:00 2001 From: Matt Graham Date: Thu, 22 Sep 2016 15:03:29 +0100 Subject: [PATCH] Removing files not relevant to first lab. --- mlp/costs.py | 173 ------- mlp/initialisers.py | 65 --- mlp/layers.py | 325 ------------- mlp/learning_rules.py | 161 ------- mlp/models.py | 145 ------ mlp/optimisers.py | 134 ------ mlp/schedulers.py | 172 ------- mlp/utils.py | 361 -------------- notebooks/02_Linear_models.ipynb | 650 -------------------------- notebooks/03_Multi_layer_models.ipynb | 303 ------------ notebooks/04_Regularisation.ipynb | 293 ------------ notebooks/05_Transfer_functions.ipynb | 238 ---------- 12 files changed, 3020 deletions(-) delete mode 100644 mlp/costs.py delete mode 100644 mlp/initialisers.py delete mode 100644 mlp/layers.py delete mode 100644 mlp/learning_rules.py delete mode 100644 mlp/models.py delete mode 100644 mlp/optimisers.py delete mode 100644 mlp/schedulers.py delete mode 100644 mlp/utils.py delete mode 100644 notebooks/02_Linear_models.ipynb delete mode 100644 notebooks/03_Multi_layer_models.ipynb delete mode 100644 notebooks/04_Regularisation.ipynb delete mode 100644 notebooks/05_Transfer_functions.ipynb diff --git a/mlp/costs.py b/mlp/costs.py deleted file mode 100644 index bd103b3..0000000 --- a/mlp/costs.py +++ /dev/null @@ -1,173 +0,0 @@ -# -*- coding: utf-8 -*- -"""Model costs. - -This module defines cost functions, with the aim of model training being to -minimise the cost function given a set of inputs and target outputs. The cost -functions typically measure some concept of distance between the model outputs -and target outputs. -""" - -import numpy as np - - -class MeanSquaredErrorCost(object): - """Mean squared error cost.""" - - def __call__(self, outputs, targets): - """Calculates cost function given a batch of outputs and targets. - - Args: - outputs: Array of model outputs of shape (batch_size, output_dim). - targets: Array of target outputs of shape (batch_size, output_dim). - - Returns: - Scalar cost function value. - """ - return 0.5 * np.mean(np.sum((outputs - targets)**2, axis=1)) - - def grad(self, outputs, targets): - """Calculates gradient of cost function with respect to outputs. - - Args: - outputs: Array of model outputs of shape (batch_size, output_dim). - targets: Array of target outputs of shape (batch_size, output_dim). - - Returns: - Gradient of cost function with respect to outputs. - """ - return outputs - targets - - def __repr__(self): - return 'MeanSquaredErrorCost' - - -class BinaryCrossEntropyCost(object): - """Binary cross entropy cost.""" - - def __call__(self, outputs, targets): - """Calculates cost function given a batch of outputs and targets. - - Args: - outputs: Array of model outputs of shape (batch_size, output_dim). - targets: Array of target outputs of shape (batch_size, output_dim). - - Returns: - Scalar cost function value. - """ - return -np.mean( - targets * np.log(outputs) + (1. - targets) * np.log(1. - ouputs)) - - def grad(self, outputs, targets): - """Calculates gradient of cost function with respect to outputs. - - Args: - outputs: Array of model outputs of shape (batch_size, output_dim). - targets: Array of target outputs of shape (batch_size, output_dim). - - Returns: - Gradient of cost function with respect to outputs. - """ - return (1. - targets) / (1. 
- outputs) - (targets / outputs) - - def __repr__(self): - return 'BinaryCrossEntropyCost' - - -class BinaryCrossEntropySigmoidCost(object): - """Binary cross entropy cost with logistic sigmoid applied to outputs.""" - - def __call__(self, outputs, targets): - """Calculates cost function given a batch of outputs and targets. - - Args: - outputs: Array of model outputs of shape (batch_size, output_dim). - targets: Array of target outputs of shape (batch_size, output_dim). - - Returns: - Scalar cost function value. - """ - probs = 1. / (1. + np.exp(-outputs)) - return -np.mean( - targets * np.log(probs) + (1. - targets) * np.log(1. - probs)) - - def grad(self, outputs, targets): - """Calculates gradient of cost function with respect to outputs. - - Args: - outputs: Array of model outputs of shape (batch_size, output_dim). - targets: Array of target outputs of shape (batch_size, output_dim). - - Returns: - Gradient of cost function with respect to outputs. - """ - probs = 1. / (1. + np.exp(-outputs)) - return probs - targets - - def __repr__(self): - return 'BinaryCrossEntropySigmoidCost' - - -class CrossEntropyCost(object): - """Multi-class cross entropy cost.""" - - def __call__(self, outputs, targets): - """Calculates cost function given a batch of outputs and targets. - - Args: - outputs: Array of model outputs of shape (batch_size, output_dim). - targets: Array of target outputs of shape (batch_size, output_dim). - - Returns: - Scalar cost function value. - """ - return -np.mean(np.sum(targets * np.log(outputs), axis=1)) - - def grad(self, outputs, targets): - """Calculates gradient of cost function with respect to outputs. - - Args: - outputs: Array of model outputs of shape (batch_size, output_dim). - targets: Array of target outputs of shape (batch_size, output_dim). - - Returns: - Gradient of cost function with respect to outputs. - """ - return -targets / outputs - - def __repr__(self): - return 'CrossEntropyCost' - - -class CrossEntropySoftmaxCost(object): - """Multi-class cross entropy cost with Softmax applied to outputs.""" - - def __call__(self, outputs, targets): - """Calculates cost function given a batch of outputs and targets. - - Args: - outputs: Array of model outputs of shape (batch_size, output_dim). - targets: Array of target outputs of shape (batch_size, output_dim). - - Returns: - Scalar cost function value. - """ - probs = np.exp(outputs) - probs /= probs.sum(-1)[:, None] - return -np.mean(np.sum(targets * np.log(probs), axis=1)) - - def grad(self, outputs, targets): - """Calculates gradient of cost function with respect to outputs. - - Args: - outputs: Array of model outputs of shape (batch_size, output_dim). - targets: Array of target outputs of shape (batch_size, output_dim). - - Returns: - Gradient of cost function with respect to outputs. - """ - probs = np.exp(outputs) - probs /= probs.sum(-1)[:, None] - return probs - targets - - def __repr__(self): - return 'CrossEntropySoftmaxCost' diff --git a/mlp/initialisers.py b/mlp/initialisers.py deleted file mode 100644 index 243adc2..0000000 --- a/mlp/initialisers.py +++ /dev/null @@ -1,65 +0,0 @@ -# -*- coding: utf-8 -*- -"""Parameter initialisers. - -This module defines classes to initialise the parameters in a layer. -""" - -import numpy as np -from mlp import DEFAULT_SEED - - -class ConstantInit(object): - """Constant parameter initialiser.""" - - def __init__(self, value): - """Construct a constant parameter initialiser. - - Args: - value: Value to initialise parameter to. 
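A standalone check of the softmax cross-entropy gradient used in mlp/costs.py above: for the summed per-example cross-entropy, the gradient with respect to the pre-softmax outputs is exactly `probs - targets`. The sketch below assumes only numpy; the toy batch size, class count and seed are invented for illustration.

    import numpy as np

    rng = np.random.RandomState(42)
    outputs = rng.normal(size=(4, 3))                 # toy pre-softmax outputs
    targets = np.eye(3)[rng.randint(0, 3, size=4)]    # toy one-hot targets

    def summed_cross_entropy(outs):
        probs = np.exp(outs)
        probs /= probs.sum(-1)[:, None]
        return -np.sum(targets * np.log(probs))

    probs = np.exp(outputs)
    probs /= probs.sum(-1)[:, None]
    analytic_grad = probs - targets       # as returned by CrossEntropySoftmaxCost.grad

    # Central-difference estimate of the same gradient, element by element.
    eps = 1e-6
    numeric_grad = np.zeros_like(outputs)
    for idx in np.ndindex(*outputs.shape):
        delta = np.zeros_like(outputs)
        delta[idx] = eps
        numeric_grad[idx] = (summed_cross_entropy(outputs + delta) -
                             summed_cross_entropy(outputs - delta)) / (2 * eps)

    print(np.allclose(analytic_grad, numeric_grad, atol=1e-5))  # expect True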
- """ - self.value = value - - def __call__(self, shape): - return np.ones(shape=shape) * self.value - - -class UniformInit(object): - """Random uniform parameter initialiser.""" - - def __init__(self, low, high, rng=None): - """Construct a random uniform parameter initialiser. - - Args: - low: Lower bound of interval to sample from. - high: Upper bound of interval to sample from. - rng (RandomState): Seeded random number generator. - """ - self.low = low - self.high = high - if rng is None: - rng = np.random.RandomState(DEFAULT_SEED) - self.rng = rng - - def __call__(self, shape): - return self.rng.uniform(low=self.low, high=self.high, size=shape) - - -class NormalInit(object): - """Random normal parameter initialiser.""" - - def __init__(self, mean, std, rng=None): - """Construct a random uniform parameter initialiser. - - Args: - mean: Mean of distribution to sample from. - std: Standard deviation of distribution to sample from. - rng (RandomState): Seeded random number generator. - """ - self.mean = mean - self.std = std - if rng is None: - rng = np.random.RandomState(DEFAULT_SEED) - self.rng = rng - - def __call__(self, shape): - return self.rng.normal(loc=self.mean, scale=self.std, size=shape) diff --git a/mlp/layers.py b/mlp/layers.py deleted file mode 100644 index 760a01c..0000000 --- a/mlp/layers.py +++ /dev/null @@ -1,325 +0,0 @@ -# -*- coding: utf-8 -*- -"""Layer definitions. - -This module defines classes which encapsulate a single layer. - -These layers map input activations to output activation with the `fprop` -method and map gradients with repsect to outputs to gradients with respect to -their inputs with the `bprop` method. - -Some layers will have learnable parameters and so will additionally define -methods for getting and setting parameter and calculating gradients with -respect to the layer parameters. -""" - -import numpy as np -import mlp.initialisers as init - - -class Layer(object): - """Abstract class defining the interface for a layer.""" - - def fprop(self, inputs): - """Forward propagates activations through the layer transformation. - - Args: - inputs: Array of layer inputs of shape (batch_size, input_dim). - - Returns: - outputs: Array of layer outputs of shape (batch_size, output_dim). - """ - raise NotImplementedError() - - def bprop(self, inputs, outputs, grads_wrt_outputs): - """Back propagates gradients through a layer. - - Given gradients with respect to the outputs of the layer calculates the - gradients with respect to the layer inputs. - - Args: - inputs: Array of layer inputs of shape (batch_size, input_dim). - outputs: Array of layer outputs calculated in forward pass of - shape (batch_size, output_dim). - grads_wrt_outputs: Array of gradients with respect to the layer - outputs of shape (batch_size, output_dim). - - Returns: - Array of gradients with respect to the layer inputs of shape - (batch_size, input_dim). - """ - raise NotImplementedError() - - -class LayerWithParameters(Layer): - """Abstract class defining the interface for a layer with parameters.""" - - def grads_wrt_params(self, inputs, grads_wrt_outputs): - """Calculates gradients with respect to layer parameters. - - Args: - inputs: Array of inputs to layer of shape (batch_size, input_dim). - grads_wrt_to_outputs: Array of gradients with respect to the layer - outputs of shape (batch_size, output_dim). - - Returns: - List of arrays of gradients with respect to the layer parameters - with parameter gradients appearing in same order in tuple as - returned from `get_params` method. 
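For reference, the three initialisers above reduce to one-line numpy calls. A minimal sketch follows; the seed, shape and ranges are arbitrary stand-ins, and `DEFAULT_SEED` here is a hypothetical placeholder for the value imported from the `mlp` package.

    import numpy as np

    DEFAULT_SEED = 123456  # hypothetical stand-in for mlp.DEFAULT_SEED
    rng = np.random.RandomState(DEFAULT_SEED)
    shape = (3, 5)  # e.g. an (output_dim, input_dim) weight matrix

    constant_params = np.ones(shape) * 0.            # ConstantInit(0.)(shape)
    uniform_params = rng.uniform(-0.1, 0.1, shape)   # UniformInit(-0.1, 0.1, rng)(shape)
    normal_params = rng.normal(0., 0.01, shape)      # NormalInit(0., 0.01, rng)(shape)
    print(constant_params.shape, uniform_params.shape, normal_params.shape)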
- """ - raise NotImplementedError() - - def params_cost(self): - """Returns the parameter dependent cost term for this layer. - - If no parameter-dependent cost terms are set this returns zero. - """ - raise NotImplementedError() - - @property - def params(self): - """Returns a list of parameters of layer. - - Returns: - List of current parameter values. This list should be in the - corresponding order to the `values` argument to `set_params`. - """ - raise NotImplementedError() - - @params.setter - def params(self, values): - """Sets layer parameters from a list of values. - - Args: - values: List of values to set parameters to. This list should be - in the corresponding order to what is returned by `get_params`. - """ - raise NotImplementedError() - - -class AffineLayer(LayerWithParameters): - """Layer implementing an affine tranformation of its inputs. - - This layer is parameterised by a weight matrix and bias vector. - """ - - def __init__(self, input_dim, output_dim, - weights_initialiser=init.UniformInit(-0.1, 0.1), - biases_initialiser=init.ConstantInit(0.), - weights_cost=None, biases_cost=None): - """Initialises a parameterised affine layer. - - Args: - input_dim (int): Dimension of inputs to the layer. - output_dim (int): Dimension of the layer outputs. - weights_initialiser: Initialiser for the weight parameters. - biases_initialiser: Initialiser for the bias parameters. - weights_cost: Weights-dependent cost term. - biases_cost: Biases-dependent cost term. - """ - self.input_dim = input_dim - self.output_dim = output_dim - self.weights = weights_initialiser((self.output_dim, self.input_dim)) - self.biases = biases_initialiser(self.output_dim) - self.weights_cost = weights_cost - self.biases_cost = biases_cost - - def fprop(self, inputs): - """Forward propagates activations through the layer transformation. - - For inputs `x`, outputs `y`, weights `W` and biases `b` the layer - corresponds to `y = W.dot(x) + b`. - - Args: - inputs: Array of layer inputs of shape (batch_size, input_dim). - - Returns: - outputs: Array of layer outputs of shape (batch_size, output_dim). - """ - return self.weights.dot(inputs.T).T + self.biases - - def bprop(self, inputs, outputs, grads_wrt_outputs): - """Back propagates gradients through a layer. - - Given gradients with respect to the outputs of the layer calculates the - gradients with respect to the layer inputs. - - Args: - inputs: Array of layer inputs of shape (batch_size, input_dim). - outputs: Array of layer outputs calculated in forward pass of - shape (batch_size, output_dim). - grads_wrt_outputs: Array of gradients with respect to the layer - outputs of shape (batch_size, output_dim). - - Returns: - Array of gradients with respect to the layer inputs of shape - (batch_size, input_dim). - """ - return grads_wrt_outputs.dot(self.weights) - - def grads_wrt_params(self, inputs, grads_wrt_outputs): - """Calculates gradients with respect to layer parameters. - - Args: - inputs: array of inputs to layer of shape (batch_size, input_dim) - grads_wrt_to_outputs: array of gradients with respect to the layer - outputs of shape (batch_size, output_dim) - - Returns: - list of arrays of gradients with respect to the layer parameters - `[grads_wrt_weights, grads_wrt_biases]`. 
- """ - - grads_wrt_weights = np.dot(grads_wrt_outputs.T, inputs) - grads_wrt_biases = np.sum(grads_wrt_outputs, axis=0) - - if self.weights_cost is not None: - grads_wrt_weights += self.weights_cost.grad(self.weights) - - if self.biases_cost is not None: - grads_wrt_biases += self.biases_cost.grads(self.biases) - - return [grads_wrt_weights, grads_wrt_biases] - - def params_cost(self): - """Returns the parameter dependent cost term for this layer. - - If no parameter-dependent cost terms are set this returns zero. - """ - params_cost = 0 - if self.weights_cost is not None: - params_cost += self.weights_cost(self.weights) - if self.biases_cost is not None: - params_cost += self.biases_cost(self.biases) - return params_cost - - @property - def params(self): - """A list of layer parameter values: `[weights, biases]`.""" - return [self.weights, self.biases] - - @params.setter - def params(self, values): - self.weights = values[0] - self.biases = values[1] - - def __repr__(self): - return 'AffineLayer(input_dim={0}, output_dim={1})'.format( - self.input_dim, self.output_dim) - - -class SigmoidLayer(Layer): - """Layer implementing an element-wise logistic sigmoid transformation.""" - - def fprop(self, inputs): - """Forward propagates activations through the layer transformation. - - For inputs `x` and outputs `y` this corresponds to - `y = 1 / (1 + exp(-x))`. - - Args: - inputs: Array of layer inputs of shape (batch_size, input_dim). - - Returns: - outputs: Array of layer outputs of shape (batch_size, output_dim). - """ - return 1. / (1. + np.exp(-inputs)) - - def bprop(self, inputs, outputs, grads_wrt_outputs): - """Back propagates gradients through a layer. - - Given gradients with respect to the outputs of the layer calculates the - gradients with respect to the layer inputs. - - Args: - inputs: Array of layer inputs of shape (batch_size, input_dim). - outputs: Array of layer outputs calculated in forward pass of - shape (batch_size, output_dim). - grads_wrt_outputs: Array of gradients with respect to the layer - outputs of shape (batch_size, output_dim). - - Returns: - Array of gradients with respect to the layer inputs of shape - (batch_size, input_dim). - """ - return grads_wrt_outputs * outputs * (1. - outputs) - - def __repr__(self): - return 'SigmoidLayer' - - -class ReluLayer(Layer): - """Layer implementing an element-wise rectified linear transformation.""" - - def fprop(self, inputs): - """Forward propagates activations through the layer transformation. - - For inputs `x` and outputs `y` this corresponds to `y = max(0, x)`. - - Args: - inputs: Array of layer inputs of shape (batch_size, input_dim). - - Returns: - outputs: Array of layer outputs of shape (batch_size, output_dim). - """ - return np.maximum(inputs, 0.) - - def bprop(self, inputs, outputs, grads_wrt_outputs): - """Back propagates gradients through a layer. - - Given gradients with respect to the outputs of the layer calculates the - gradients with respect to the layer inputs. - - Args: - inputs: Array of layer inputs of shape (batch_size, input_dim). - outputs: Array of layer outputs calculated in forward pass of - shape (batch_size, output_dim). - grads_wrt_outputs: Array of gradients with respect to the layer - outputs of shape (batch_size, output_dim). - - Returns: - Array of gradients with respect to the layer inputs of shape - (batch_size, input_dim). 
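A compact recap of the shape bookkeeping in `AffineLayer` above, where `weights` is stored as `(output_dim, input_dim)`. This is a standalone sketch using plain numpy; the batch size and dimensions are invented.

    import numpy as np

    rng = np.random.RandomState(0)
    batch_size, input_dim, output_dim = 4, 5, 3

    inputs = rng.normal(size=(batch_size, input_dim))
    weights = rng.normal(size=(output_dim, input_dim))
    biases = np.zeros(output_dim)

    # fprop: y = W.dot(x) + b applied row-wise, as in AffineLayer.fprop.
    outputs = weights.dot(inputs.T).T + biases               # (batch_size, output_dim)

    # bprop and grads_wrt_params, given gradients w.r.t. the layer outputs.
    grads_wrt_outputs = rng.normal(size=(batch_size, output_dim))
    grads_wrt_inputs = grads_wrt_outputs.dot(weights)        # (batch_size, input_dim)
    grads_wrt_weights = np.dot(grads_wrt_outputs.T, inputs)  # (output_dim, input_dim)
    grads_wrt_biases = np.sum(grads_wrt_outputs, axis=0)     # (output_dim,)

    print(outputs.shape, grads_wrt_inputs.shape,
          grads_wrt_weights.shape, grads_wrt_biases.shape)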
- """ - return (outputs > 0) * grads_wrt_outputs - - def __repr__(self): - return 'ReluLayer' - - -class TanhLayer(Layer): - """Layer implementing an element-wise hyperbolic tangent transformation.""" - - def fprop(self, inputs): - """Forward propagates activations through the layer transformation. - - For inputs `x` and outputs `y` this corresponds to `y = tanh(x)`. - - Args: - inputs: Array of layer inputs of shape (batch_size, input_dim). - - Returns: - outputs: Array of layer outputs of shape (batch_size, output_dim). - """ - return np.tanh(inputs) - - def bprop(self, inputs, outputs, grads_wrt_outputs): - """Back propagates gradients through a layer. - - Given gradients with respect to the outputs of the layer calculates the - gradients with respect to the layer inputs. - - Args: - inputs: Array of layer inputs of shape (batch_size, input_dim). - outputs: Array of layer outputs calculated in forward pass of - shape (batch_size, output_dim). - grads_wrt_outputs: Array of gradients with respect to the layer - outputs of shape (batch_size, output_dim). - - Returns: - Array of gradients with respect to the layer inputs of shape - (batch_size, input_dim). - """ - return (1. - outputs**2) * grads_wrt_outputs - - def __repr__(self): - return 'TanhLayer' diff --git a/mlp/learning_rules.py b/mlp/learning_rules.py deleted file mode 100644 index 4156c23..0000000 --- a/mlp/learning_rules.py +++ /dev/null @@ -1,161 +0,0 @@ -# -*- coding: utf-8 -*- -"""Learning rules. - -This module contains classes implementing gradient based learning rules. -""" - -import numpy as np - - -class GradientDescentLearningRule(object): - """Simple (stochastic) gradient descent learning rule. - - For a scalar loss function `L(p[0], p_[1] ... )` of some set of potentially - multidimensional parameters this attempts to find a local minimum of the - loss function by applying updates to each parameter of the form - - p[i] := p[i] - learning_rate * dL/dp[i] - - With `learning_rate` a positive scaling parameter. - - The loss function used in successive applications of these updates may be a - stochastic estimator of the true loss function (e.g. when the loss with - respect to only a subset of data-points is calculated) in which case this - will correspond to a stochastic gradient descent learning rule. - """ - - def __init__(self, learning_rate=1e-3): - """Creates a new learning rule object. - - Args: - learning_rate: A postive scalar to scale gradient updates to the - parameters by. This needs to be carefully set - if too large - the learning dynamic will be unstable and may diverge, while - if set too small learning will proceed very slowly. - - """ - assert learning_rate > 0., 'learning_rate should be positive.' - self.learning_rate = learning_rate - - def initialise(self, params): - """Initialises the state of the learning rule for a set or parameters. - - This must be called before `update_params` is first called. - - Args: - params: A list of the parameters to be optimised. Note these will - be updated *in-place* to avoid reallocating arrays on each - update. - """ - self.params = params - - def reset(self): - """Resets any additional state variables to their intial values. - - For this learning rule there are no additional state variables so we - do nothing here. - """ - pass - - def update_params(self, grads_wrt_params): - """Applies a single gradient descent update to all parameters. - - All parameter updates are performed using in-place operations and so - nothing is returned. 
- - Args: - grads_wrt_params: A list of gradients of the scalar loss function - with respect to each of the parameters passed to `initialise` - previously, with this list expected to be in the same order. - """ - for param, grad in zip(self.params, grads_wrt_params): - param -= self.learning_rate * grad - - -class MomentumLearningRule(GradientDescentLearningRule): - """Gradient descent with momentum learning rule. - - This extends the basic gradient learning rule by introducing extra - momentum state variables for each parameter. These can help the learning - dynamic help overcome shallow local minima and speed convergence when - making multiple successive steps in a similar direction in parameter space. - - For parameter p[i] and corresponding momentum m[i] the updates for a - scalar loss function `L` are of the form - - m[i] := mom_coeff * m[i] - learning_rate * dL/dp[i] - p[i] := p[i] + m[i] - - with `learning_rate` a positive scaling parameter for the gradient updates - and `mom_coeff` a value in [0, 1] that determines how much 'friction' there - is the system and so how quickly previous momentum contributions decay. - """ - - def __init__(self, learning_rate=1e-3, mom_coeff=0.9): - """Creates a new learning rule object. - - Args: - learning_rate: A postive scalar to scale gradient updates to the - parameters by. This needs to be carefully set - if too large - the learning dynamic will be unstable and may diverge, while - if set too small learning will proceed very slowly. - mom_coeff: A scalar in the range [0, 1] inclusive. This determines - the contribution of the previous momentum value to the value - after each update. If equal to 0 the momentum is set to exactly - the negative scaled gradient each update and so this rule - collapses to standard gradient descent. If equal to 1 the - momentum will just be decremented by the scaled gradient at - each update. This is equivalent to simulating the dynamic in - a frictionless system. Due to energy conservation the loss - of 'potential energy' as the dynamics moves down the loss - function surface will lead to an increasingly large 'kinetic - energy' and so speed, meaning the updates will become - increasingly large, potentially unstably so. Typically a value - less than but close to 1 will avoid these issues and cause the - dynamic to converge to a local minima where the gradients are - by definition zero. - """ - super(MomentumLearningRule, self).__init__(learning_rate) - assert mom_coeff >= 0. and mom_coeff <= 1., ( - 'mom_coeff should be in the range [0, 1].' - ) - self.mom_coeff = mom_coeff - - def initialise(self, params): - """Initialises the state of the learning rule for a set or parameters. - - This must be called before `update_params` is first called. - - Args: - params: A list of the parameters to be optimised. Note these will - be updated *in-place* to avoid reallocating arrays on each - update. - """ - super(MomentumLearningRule, self).initialise(params) - self.moms = [] - for param in self.params: - self.moms.append(np.zeros_like(param)) - - def reset(self): - """Resets any additional state variables to their intial values. - - For this learning rule this corresponds to zeroing all the momenta. - """ - for mom in zip(self.moms): - mom *= 0. - - def update_params(self, grads_wrt_params): - """Applies a single update to all parameters. - - All parameter updates are performed using in-place operations and so - nothing is returned. 
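The gradient descent and momentum updates documented above are easy to sanity check on a toy problem. Below is a standalone sketch of the in-place update pattern; the learning rate, momentum coefficient and quadratic loss are arbitrary choices for illustration.

    import numpy as np

    learning_rate, mom_coeff = 0.1, 0.9
    param = np.array([5.0, -3.0])    # toy parameters
    mom = np.zeros_like(param)       # momentum state, one per parameter array

    for step in range(200):
        grad = 2.0 * param           # gradient of the toy loss L(p) = sum(p ** 2)
        # m := mom_coeff * m - learning_rate * dL/dp ;  p := p + m
        mom *= mom_coeff
        mom -= learning_rate * grad
        param += mom

    print(param)  # converges towards the minimum at [0., 0.]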
- - Args: - grads_wrt_params: A list of gradients of the scalar loss function - with respect to each of the parameters passed to `initialise` - previously, with this list expected to be in the same order. - """ - for param, mom, grad in zip(self.params, self.moms, grads_wrt_params): - mom *= self.mom_coeff - mom -= self.learning_rate * grad - param += mom diff --git a/mlp/models.py b/mlp/models.py deleted file mode 100644 index f4b1f55..0000000 --- a/mlp/models.py +++ /dev/null @@ -1,145 +0,0 @@ -# -*- coding: utf-8 -*- -"""Model definitions. - -This module implements objects encapsulating learnable models of input-output -relationships. The model objects implement methods for forward propagating -the inputs through the transformation(s) defined by the model to produce -outputs (and intermediate states) and for calculating gradients of scalar -functions of the outputs with respect to the model parameters. -""" - -from mlp.layers import LayerWithParameters - - -class SingleLayerModel(object): - """A model consisting of a single transformation layer.""" - - def __init__(self, layer): - """Create a new single layer model instance. - - Args: - layer: The layer object defining the model architecture. - """ - self.layer = layer - - @property - def params(self): - """A list of all of the parameters of the model.""" - return self.layer.params - - def fprop(self, inputs): - """Calculate the model outputs corresponding to a batch of inputs. - - Args: - inputs: Batch of inputs to the model. - - Returns: - List which is a concatenation of the model inputs and model - outputs, this being done for consistency of the interface with - multi-layer models for which `fprop` returns a list of - activations through all immediate layers of the model and including - the inputs and outputs. - """ - activations = [inputs, self.layer.fprop(inputs)] - return activations - - def grads_wrt_params(self, activations, grads_wrt_outputs): - """Calculates gradients with respect to the model parameters. - - Args: - activations: List of all activations from forward pass through - model using `fprop`. - grads_wrt_outputs: Gradient with respect to the model outputs of - the scalar function parameter gradients are being calculated - for. - - Returns: - List of gradients of the scalar function with respect to all model - parameters. - """ - return self.layer.grads_wrt_params(activations[0], grads_wrt_outputs) - - def params_cost(self): - """Calculates the parameter dependent cost term of the model.""" - return self.layer.params_cost() - - def __repr__(self): - return 'SingleLayerModel(' + str(layer) + ')' - - -class MultipleLayerModel(object): - """A model consisting of multiple layers applied sequentially.""" - - def __init__(self, layers): - """Create a new multiple layer model instance. - - Args: - layers: List of the the layer objecst defining the model in the - order they should be applied from inputs to outputs. - """ - self.layers = layers - - @property - def params(self): - """A list of all of the parameters of the model.""" - params = [] - for layer in self.layers: - if isinstance(layer, LayerWithParameters): - params += layer.params - return params - - def fprop(self, inputs): - """Forward propagates a batch of inputs through the model. - - Args: - inputs: Batch of inputs to the model. - - Returns: - List of the activations at the output of all layers of the model - plus the inputs (to the first layer) as the first element. The - last element of the list corresponds to the model outputs. 
- """ - activations = [inputs] - for i, layer in enumerate(self.layers): - activations.append(self.layers[i].fprop(activations[i])) - return activations - - def grads_wrt_params(self, activations, grads_wrt_outputs): - """Calculates gradients with respect to the model parameters. - - Args: - activations: List of all activations from forward pass through - model using `fprop`. - grads_wrt_outputs: Gradient with respect to the model outputs of - the scalar function parameter gradients are being calculated - for. - - Returns: - List of gradients of the scalar function with respect to all model - parameters. - """ - grads_wrt_params = [] - for i, layer in enumerate(self.layers[::-1]): - inputs = activations[-i - 2] - outputs = activations[-i - 1] - grads_wrt_inputs = layer.bprop(inputs, outputs, grads_wrt_outputs) - if isinstance(layer, LayerWithParameters): - grads_wrt_params += layer.grads_wrt_params( - inputs, grads_wrt_outputs)[::-1] - grads_wrt_outputs = grads_wrt_inputs - return grads_wrt_params[::-1] - - def params_cost(self): - """Calculates the parameter dependent cost term of the model.""" - params_cost = 0. - for layer in self.layers: - if isinstance(layer, LayerWithParameters): - params_cost += layer.params_cost() - return params_cost - - def __repr__(self): - return ( - 'MultiLayerModel(\n ' + - '\n '.join([str(layer) for layer in self.layers]) + - '\n)' - ) diff --git a/mlp/optimisers.py b/mlp/optimisers.py deleted file mode 100644 index 4ce9e4d..0000000 --- a/mlp/optimisers.py +++ /dev/null @@ -1,134 +0,0 @@ -# -*- coding: utf-8 -*- -"""Model optimisers. - -This module contains objects implementing (batched) stochastic gradient descent -based optimisation of models. -""" - -import time -import logging -from collections import OrderedDict -import numpy as np - - -logger = logging.getLogger(__name__) - - -class Optimiser(object): - """Basic model optimiser.""" - - def __init__(self, model, cost, learning_rule, train_dataset, - valid_dataset=None, data_monitors=None): - """Create a new optimiser instance. - - Args: - model: The model to optimise. - cost: The scalar cost function to minimise. - learning_rule: Gradient based learning rule to use to minimise - cost. - train_dataset: Data provider for training set data batches. - valid_dataset: Data provider for validation set data batches. - data_monitors: Dictionary of functions evaluated on targets and - model outputs (averaged across both full training and - validation data sets) to monitor during training in addition - to the cost. Keys should correspond to a string label for - the statistic being evaluated. - """ - self.model = model - self.cost = cost - self.learning_rule = learning_rule - self.learning_rule.initialise(self.model.params) - self.train_dataset = train_dataset - self.valid_dataset = valid_dataset - self.data_monitors = OrderedDict([('cost', cost)]) - if data_monitors is not None: - self.data_monitors.update(data_monitors) - - def do_training_epoch(self): - """Do a single training epoch. - - This iterates through all batches in training dataset, for each - calculating the gradient of the estimated loss given the batch with - respect to all the model parameters and then updates the model - parameters according to the learning rule. 
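The pattern implemented by `MultipleLayerModel` above (collect every intermediate activation on the forward pass, then walk the layers in reverse, turning gradients with respect to outputs into gradients with respect to inputs) can be illustrated with plain numpy functions. The two-layer architecture, dimensions and seed below are invented.

    import numpy as np

    rng = np.random.RandomState(0)
    W1, b1 = rng.normal(size=(6, 4)), np.zeros(6)   # affine layer 1: 4 -> 6
    W2, b2 = rng.normal(size=(2, 6)), np.zeros(2)   # affine layer 2: 6 -> 2

    def affine_fprop(x, W, b):
        return W.dot(x.T).T + b

    def sigmoid_fprop(x):
        return 1. / (1. + np.exp(-x))

    inputs = rng.normal(size=(3, 4))

    # Forward pass: inputs first, then the output of each layer in turn.
    activations = [inputs]
    activations.append(affine_fprop(activations[-1], W1, b1))
    activations.append(sigmoid_fprop(activations[-1]))
    activations.append(affine_fprop(activations[-1], W2, b2))

    # Backward pass: start from gradients w.r.t. the final outputs, walk back.
    grads = np.ones_like(activations[-1])
    grads = grads.dot(W2)                                   # bprop affine 2
    grads = grads * activations[2] * (1. - activations[2])  # bprop sigmoid
    grads = grads.dot(W1)                                   # bprop affine 1
    print(grads.shape)  # (3, 4): gradients w.r.t. the original model inputs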
- """ - for inputs_batch, targets_batch in self.train_dataset: - activations = self.model.fprop(inputs_batch) - grads_wrt_outputs = self.cost.grad(activations[-1], targets_batch) - grads_wrt_params = self.model.grads_wrt_params( - activations, grads_wrt_outputs) - self.learning_rule.update_params(grads_wrt_params) - - def eval_monitors(self, dataset, label): - """Evaluates the monitors for the given dataset. - - Args: - dataset: Dataset to perform evaluation with. - label: Tag to add to end of monitor keys to identify dataset. - - Returns: - OrderedDict of monitor values evaluated on dataset. - """ - data_mon_vals = OrderedDict([(key + label, 0.) for key - in self.data_monitors.keys()]) - for inputs_batch, targets_batch in dataset: - activations = self.model.fprop(inputs_batch) - for key, data_monitor in self.data_monitors.items(): - data_mon_vals[key + label] += data_monitor( - activations[-1], targets_batch) - for key, data_monitor in self.data_monitors.items(): - data_mon_vals[key + label] /= dataset.num_batches - return data_mon_vals - - def get_epoch_stats(self): - """Computes training statistics for an epoch. - - Returns: - An OrderedDict with keys corresponding to the statistic labels and - values corresponding to the value of the statistic. - """ - epoch_stats = OrderedDict() - epoch_stats.update(self.eval_monitors(self.train_dataset, '(train)')) - if self.valid_dataset is not None: - epoch_stats.update(self.eval_monitors( - self.valid_dataset, '(valid)')) - epoch_stats['cost(param)'] = self.model.params_cost() - return epoch_stats - - def log_stats(self, epoch, epoch_time, stats): - """Outputs stats for a training epoch to a logger. - - Args: - epoch (int): Epoch counter. - epoch_time: Time taken in seconds for the epoch to complete. - stats: Monitored stats for the epoch. - """ - logger.info('Epoch {0}: {1:.1f}s to complete\n {2}'.format( - epoch, epoch_time, - ', '.join(['{0}={1:.2e}'.format(k, v) for (k, v) in stats.items()]) - )) - - def train(self, num_epochs, stats_interval=5): - """Trains a model for a set number of epochs. - - Args: - num_epochs: Number of epochs (complete passes through trainin - dataset) to train for. - stats_interval: Training statistics will be recorded and logged - every `stats_interval` epochs. - - Returns: - Tuple with first value being an array of training run statistics - and the second being a dict mapping the labels for the statistics - recorded to their column index in the array. 
- """ - run_stats = [] - for epoch in range(1, num_epochs + 1): - start_time = time.clock() - self.do_training_epoch() - epoch_time = time.clock() - start_time - if epoch % stats_interval == 0: - stats = self.get_epoch_stats() - self.log_stats(epoch, epoch_time, stats) - run_stats.append(stats.values()) - return np.array(run_stats), {k: i for i, k in enumerate(stats.keys())} diff --git a/mlp/schedulers.py b/mlp/schedulers.py deleted file mode 100644 index 6ae9597..0000000 --- a/mlp/schedulers.py +++ /dev/null @@ -1,172 +0,0 @@ -# Machine Learning Practical (INFR11119), -# Pawel Swietojanski, University of Edinburgh - -import logging - - -class LearningRateScheduler(object): - """ - Define an interface for determining learning rates - """ - def __init__(self, max_epochs=100): - self.epoch = 0 - self.max_epochs = max_epochs - - def get_rate(self): - raise NotImplementedError() - - def get_next_rate(self, current_accuracy=None): - self.epoch += 1 - - -class LearningRateList(LearningRateScheduler): - def __init__(self, learning_rates_list, max_epochs): - - super(LearningRateList, self).__init__(max_epochs) - - assert isinstance(learning_rates_list, list), ( - "The learning_rates_list argument expected" - " to be of type list, got %s" % type(learning_rates_list) - ) - self.lr_list = learning_rates_list - - def get_rate(self): - if self.epoch < len(self.lr_list): - return self.lr_list[self.epoch] - return 0.0 - - def get_next_rate(self, current_accuracy=None): - super(LearningRateList, self).get_next_rate(current_accuracy=None) - return self.get_rate() - - -class LearningRateFixed(LearningRateList): - - def __init__(self, learning_rate, max_epochs): - assert learning_rate > 0, ( - "learning rate expected to be > 0, got %f" % learning_rate - ) - super(LearningRateFixed, self).__init__([learning_rate], max_epochs) - - def get_rate(self): - if self.epoch < self.max_epochs: - return self.lr_list[0] - return 0.0 - - def get_next_rate(self, current_accuracy=None): - super(LearningRateFixed, self).get_next_rate(current_accuracy=None) - return self.get_rate() - - -class LearningRateNewBob(LearningRateScheduler): - """ - newbob learning rate schedule. - - Fixed learning rate until validation set stops improving then exponential - decay. 
- """ - - def __init__(self, start_rate, scale_by=.5, max_epochs=99, - min_derror_ramp_start=.5, min_derror_stop=.5, init_error=100.0, - patience=0, zero_rate=None, ramping=False): - """ - :type start_rate: float - :param start_rate: - - :type scale_by: float - :param scale_by: - - :type max_epochs: int - :param max_epochs: - - :type min_error_start: float - :param min_error_start: - - :type min_error_stop: float - :param min_error_stop: - - :type init_error: float - :param init_error: - """ - self.start_rate = start_rate - self.init_error = init_error - self.init_patience = patience - - self.rate = start_rate - self.scale_by = scale_by - self.max_epochs = max_epochs - self.min_derror_ramp_start = min_derror_ramp_start - self.min_derror_stop = min_derror_stop - self.lowest_error = init_error - - self.epoch = 1 - self.ramping = ramping - self.patience = patience - self.zero_rate = zero_rate - - def reset(self): - self.rate = self.start_rate - self.lowest_error = self.init_error - self.epoch = 1 - self.ramping = False - self.patience = self.init_patience - - def get_rate(self): - if (self.epoch==1 and self.zero_rate!=None): - return self.zero_rate - return self.rate - - def get_next_rate(self, current_accuracy): - """ - :type current_accuracy: float - :param current_accuracy: current proportion correctly classified - - """ - - current_error = 1. - current_accuracy - diff_error = 0.0 - - if ( (self.max_epochs > 10000) or (self.epoch >= self.max_epochs) ): - #logging.debug('Setting rate to 0.0. max_epochs or epoch>=max_epochs') - self.rate = 0.0 - else: - diff_error = self.lowest_error - current_error - - if (current_error < self.lowest_error): - self.lowest_error = current_error - - if (self.ramping): - if (diff_error < self.min_derror_stop): - if (self.patience > 0): - #logging.debug('Patience decreased to %f' % self.patience) - self.patience -= 1 - self.rate *= self.scale_by - else: - #logging.debug('diff_error (%f) < min_derror_stop (%f)' % (diff_error, self.min_derror_stop)) - self.rate = 0.0 - else: - self.rate *= self.scale_by - else: - if (diff_error < self.min_derror_ramp_start): - #logging.debug('Start ramping.') - self.ramping = True - self.rate *= self.scale_by - - self.epoch += 1 - - return self.rate - - -class DropoutFixed(LearningRateList): - - def __init__(self, p_inp_keep, p_hid_keep): - assert 0 < p_inp_keep <= 1 and 0 < p_hid_keep <= 1, ( - "Dropout 'keep' probabilites are suppose to be in (0, 1] range" - ) - super(DropoutFixed, self).__init__([(p_inp_keep, p_hid_keep)], max_epochs=999) - - def get_rate(self): - return self.lr_list[0] - - def get_next_rate(self, current_accuracy=None): - return self.get_rate() diff --git a/mlp/utils.py b/mlp/utils.py deleted file mode 100644 index 34d62e5..0000000 --- a/mlp/utils.py +++ /dev/null @@ -1,361 +0,0 @@ -# Machine Learning Practical (INFR11119), -# Pawel Swietojanski, University of Edinburgh - -import numpy -from mlp.layers import Layer - - -def numerical_gradient(f, x, eps=1e-4, **kwargs): - """ - Implements the following numerical gradient rule - df(x)/dx = (f(x+eps)-f(x-eps))/(2eps) - """ - - xc = x.copy() - g = numpy.zeros_like(xc) - xf = xc.ravel() - gf = g.ravel() - - for i in xrange(xf.shape[0]): - xx = xf[i] - xf[i] = xx + eps - fp_eps, ___ = f(xc, **kwargs) - xf[i] = xx - eps - fm_eps, ___ = f(xc, **kwargs) - xf[i] = xx - gf[i] = (fp_eps - fm_eps)/(2*eps) - - return g - - -def verify_gradient(f, x, eps=1e-4, tol=1e-6, **kwargs): - """ - Compares the numerical and analytical gradients. 
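A standalone example of the central-difference rule quoted above, df(x)/dx = (f(x+eps) - f(x-eps)) / (2*eps), applied to a toy function with a known gradient; the function and test point are invented and nothing here depends on the helpers being removed.

    import numpy as np

    def f(x):
        return np.sum(x ** 2)       # known gradient: 2 * x

    x = np.array([1.0, -2.0, 0.5])
    eps = 1e-4

    numeric_grad = np.zeros_like(x)
    for i in range(x.shape[0]):
        x_plus, x_minus = x.copy(), x.copy()
        x_plus[i] += eps
        x_minus[i] -= eps
        numeric_grad[i] = (f(x_plus) - f(x_minus)) / (2 * eps)

    print(np.allclose(numeric_grad, 2 * x))  # expect True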
- """ - fval, fgrad = f(x=x, **kwargs) - ngrad = numerical_gradient(f=f, x=x, eps=eps, tol=tol, **kwargs) - - fgradnorm = numpy.sqrt(numpy.sum(fgrad**2)) - ngradnorm = numpy.sqrt(numpy.sum(ngrad**2)) - diffnorm = numpy.sqrt(numpy.sum((fgrad-ngrad)**2)) - - if fgradnorm > 0 or ngradnorm > 0: - norm = numpy.maximum(fgradnorm, ngradnorm) - if not (diffnorm < tol or diffnorm/norm < tol): - raise Exception("Numerical and analytical gradients " - "are different: %s != %s!" % (ngrad, fgrad)) - else: - if not (diffnorm < tol): - raise Exception("Numerical and analytical gradients " - "are different: %s != %s!" % (ngrad, fgrad)) - return True - - -def verify_layer_gradient(layer, x, eps=1e-4, tol=1e-6): - - assert isinstance(layer, Layer), ( - "Expected to get the instance of Layer class, got" - " %s " % type(layer) - ) - - def grad_layer_wrapper(x, **kwargs): - h = layer.fprop(x) - deltas, ograds = layer.bprop(h=h, igrads=numpy.ones_like(h)) - return numpy.sum(h), ograds - - return verify_gradient(f=grad_layer_wrapper, x=x, eps=eps, tol=tol, layer=layer) - - -def test_conv_linear_fprop(layer, kernel_order='ioxy', kernels_first=True, - dtype=numpy.float): - """ - Tests forward propagation method of a convolutional layer. - - Checks the outputs of `fprop` method for a fixed input against known - reference values for the outputs and raises an AssertionError if - the outputted values are not consistent with the reference values. If - tests are all passed returns True. - - Parameters - ---------- - layer : instance of Layer subclass - Convolutional (linear only) layer implementation. It must implement - the methods `get_params`, `set_params` and `fprop`. - kernel_order : string - Specifes dimension ordering assumed for convolutional kernels - passed to `layer`. Default is `ioxy` which corresponds to: - input channels, output channels, image x, image y - The other option is 'oixy' which corresponds to - output channels, input channels, image x, image y - Any other value will raise a ValueError exception. - kernels_first : boolean - Specifies order in which parameters are passed to and returned from - `get_params` and `set_params`. Default is True which corresponds - to signatures of `get_params` and `set_params` being: - kernels, biases = layer.get_params() - layer.set_params([kernels, biases]) - If False this corresponds to signatures of `get_params` and - `set_params` being: - biases, kernels = layer.get_params() - layer.set_params([biases, kernels]) - dtype : numpy data type - Data type to use in numpy arrays passed to layer methods. Default - is `numpy.float`. - - Raises - ------ - AssertionError - Raised if output of `layer.fprop` is inconsistent with reference - values either in shape or values. - ValueError - Raised if `kernel_order` is not a valid order string. 
- """ - inputs = numpy.arange(96).reshape((2, 3, 4, 4)).astype(dtype) - kernels = numpy.arange(-12, 12).reshape((3, 2, 2, 2)).astype(dtype) - if kernel_order == 'oixy': - kernels = kernels.swapaxes(0, 1) - elif kernel_order != 'ioxy': - raise ValueError('kernel_order must be one of "ioxy" and "oixy"') - biases = numpy.arange(2).astype(dtype) - true_output = numpy.array( - [[[[ 496., 466., 436.], - [ 376., 346., 316.], - [ 256., 226., 196.]], - [[ 1385., 1403., 1421.], - [ 1457., 1475., 1493.], - [ 1529., 1547., 1565.]]], - [[[ -944., -974., -1004.], - [-1064., -1094., -1124.], - [-1184., -1214., -1244.]], - [[ 2249., 2267., 2285.], - [ 2321., 2339., 2357.], - [ 2393., 2411., 2429.]]]], dtype=dtype) - try: - orig_params = layer.get_params() - if kernels_first: - layer.set_params([kernels, biases]) - else: - layer.set_params([biases, kernels]) - layer_output = layer.fprop(inputs) - assert layer_output.shape == true_output.shape, ( - 'Layer fprop gives incorrect shaped output. ' - 'Correct shape is {0} but returned shape is {1}.' - .format(true_output.shape, layer_output.shape) - ) - assert numpy.allclose(layer_output, true_output), ( - 'Layer fprop does not give correct output. ' - 'Correct output is {0}\n but returned output is {1}.' - .format(true_output, layer_output) - ) - finally: - layer.set_params(orig_params) - return True - - -def test_conv_linear_bprop(layer, kernel_order='ioxy', kernels_first=True, - dtype=numpy.float): - """ - Tests input gradients backpropagation method of a convolutional layer. - - Checks the outputs of `bprop` method for a fixed input against known - reference values for the outputs and raises an AssertionError if - the outputted values are not consistent with the reference values. If - tests are all passed returns True. - - Parameters - ---------- - layer : instance of Layer subclass - Convolutional (linear only) layer implementation. It must implement - the methods `get_params`, `set_params` and `bprop`. - kernel_order : string - Specifes dimension ordering assumed for convolutional kernels - passed to `layer`. Default is `ioxy` which corresponds to: - input channels, output channels, image x, image y - The other option is 'oixy' which corresponds to - output channels, input channels, image x, image y - Any other value will raise a ValueError exception. - kernels_first : boolean - Specifies order in which parameters are passed to and returned from - `get_params` and `set_params`. Default is True which corresponds - to signatures of `get_params` and `set_params` being: - kernels, biases = layer.get_params() - layer.set_params([kernels, biases]) - If False this corresponds to signatures of `get_params` and - `set_params` being: - biases, kernels = layer.get_params() - layer.set_params([biases, kernels]) - dtype : numpy data type - Data type to use in numpy arrays passed to layer methods. Default - is `numpy.float`. - - Raises - ------ - AssertionError - Raised if output of `layer.bprop` is inconsistent with reference - values either in shape or values. - ValueError - Raised if `kernel_order` is not a valid order string. 
- """ - inputs = numpy.arange(96).reshape((2, 3, 4, 4)).astype(dtype) - kernels = numpy.arange(-12, 12).reshape((3, 2, 2, 2)).astype(dtype) - if kernel_order == 'oixy': - kernels = kernels.swapaxes(0, 1) - elif kernel_order != 'ioxy': - raise ValueError('kernel_order must be one of "ioxy" and "oixy"') - biases = numpy.arange(2).astype(dtype) - igrads = numpy.arange(-20, 16).reshape((2, 2, 3, 3)).astype(dtype) - true_ograds = numpy.array( - [[[[ 328., 605., 567., 261.], - [ 534., 976., 908., 414.], - [ 426., 772., 704., 318.], - [ 170., 305., 275., 123.]], - [[ 80., 125., 119., 45.], - [ 86., 112., 108., 30.], - [ 74., 100., 96., 30.], - [ 18., 17., 19., 3.]], - [[-168., -355., -329., -171.], - [-362., -752., -692., -354.], - [-278., -572., -512., -258.], - [-134., -271., -237., -117.]]], - [[[ -32., -79., -117., -63.], - [-114., -248., -316., -162.], - [-222., -452., -520., -258.], - [-118., -235., -265., -129.]], - [[ 8., 17., 11., 9.], - [ 14., 40., 36., 30.], - [ 2., 28., 24., 30.], - [ 18., 53., 55., 39.]], - [[ 48., 113., 139., 81.], - [ 142., 328., 388., 222.], - [ 226., 508., 568., 318.], - [ 154., 341., 375., 207.]]]], dtype=dtype) - try: - orig_params = layer.get_params() - if kernels_first: - layer.set_params([kernels, biases]) - else: - layer.set_params([biases, kernels]) - layer_deltas, layer_ograds = layer.bprop(None, igrads) - assert layer_deltas.shape == igrads.shape, ( - 'Layer bprop give incorrectly shaped deltas output.' - 'Correct shape is {0} but returned shape is {1}.' - .format(igrads.shape, layer_deltas.shape) - ) - assert numpy.allclose(layer_deltas, igrads), ( - 'Layer bprop does not give correct deltas output. ' - 'Correct output is {0}\n but returned output is {1}.' - .format(igrads, layer_deltas) - ) - assert layer_ograds.shape == true_ograds.shape, ( - 'Layer bprop gives incorrect shaped ograds output. ' - 'Correct shape is {0} but returned shape is {1}.' - .format(true_ograds.shape, layer_ograds.shape) - ) - assert numpy.allclose(layer_ograds, true_ograds), ( - 'Layer bprop does not give correct ograds output. ' - 'Correct output is {0}\n but returned output is {1}.' - .format(true_ograds, layer_ograds) - ) - finally: - layer.set_params(orig_params) - return True - - -def test_conv_linear_pgrads(layer, kernel_order='ioxy', kernels_first=True, - dtype=numpy.float): - """ - Tests parameter gradients backpropagation method of a convolutional layer. - - Checks the outputs of `pgrads` method for a fixed input against known - reference values for the outputs and raises an AssertionError if - the outputted values are not consistent with the reference values. If - tests are all passed returns True. - - Parameters - ---------- - layer : instance of Layer subclass - Convolutional (linear only) layer implementation. It must implement - the methods `get_params`, `set_params` and `pgrads`. - kernel_order : string - Specifes dimension ordering assumed for convolutional kernels - passed to `layer`. Default is `ioxy` which corresponds to: - input channels, output channels, image x, image y - The other option is 'oixy' which corresponds to - output channels, input channels, image x, image y - Any other value will raise a ValueError exception. - kernels_first : boolean - Specifies order in which parameters are passed to and returned from - `get_params` and `set_params`. 
Default is True which corresponds - to signatures of `get_params` and `set_params` being: - kernels, biases = layer.get_params() - layer.set_params([kernels, biases]) - If False this corresponds to signatures of `get_params` and - `set_params` being: - biases, kernels = layer.get_params() - layer.set_params([biases, kernels]) - dtype : numpy data type - Data type to use in numpy arrays passed to layer methods. Default - is `numpy.float`. - - Raises - ------ - AssertionError - Raised if output of `layer.pgrads` is inconsistent with reference - values either in shape or values. - ValueError - Raised if `kernel_order` is not a valid order string. - """ - inputs = numpy.arange(96).reshape((2, 3, 4, 4)).astype(dtype) - kernels = numpy.arange(-12, 12).reshape((3, 2, 2, 2)).astype(dtype) - biases = numpy.arange(2).astype(dtype) - deltas = numpy.arange(-20, 16).reshape((2, 2, 3, 3)).astype(dtype) - true_kernel_grads = numpy.array( - [[[[ 390., 264.], - [ -114., -240.]], - [[ 5088., 5124.], - [ 5232., 5268.]]], - [[[-1626., -1752.], - [-2130., -2256.]], - [[ 5664., 5700.], - [ 5808., 5844.]]], - [[[-3642., -3768.], - [-4146., -4272.]], - [[ 6240., 6276.], - [ 6384., 6420.]]]], dtype=dtype) - if kernel_order == 'oixy': - kernels = kernels.swapaxes(0, 1) - true_kernel_grads = true_kernel_grads.swapaxes(0, 1) - elif kernel_order != 'ioxy': - raise ValueError('kernel_order must be one of "ioxy" and "oixy"') - true_bias_grads = numpy.array([-126., 36.], dtype=dtype) - try: - orig_params = layer.get_params() - if kernels_first: - layer.set_params([kernels, biases]) - else: - layer.set_params([biases, kernels]) - layer_kernel_grads, layer_bias_grads = layer.pgrads(inputs, deltas) - assert layer_kernel_grads.shape == true_kernel_grads.shape, ( - 'Layer pgrads gives incorrect shaped kernel gradients output. ' - 'Correct shape is {0} but returned shape is {1}.' - .format(true_kernel_grads.shape, layer_kernel_grads.shape) - ) - assert numpy.allclose(layer_kernel_grads, true_kernel_grads), ( - 'Layer pgrads does not give correct kernel gradients output. ' - 'Correct output is {0}\n but returned output is {1}.' - .format(true_kernel_grads, layer_kernel_grads) - ) - assert layer_bias_grads.shape == true_bias_grads.shape, ( - 'Layer pgrads gives incorrect shaped bias gradients output. ' - 'Correct shape is {0} but returned shape is {1}.' - .format(true_bias_grads.shape, layer_bias_grads.shape) - ) - assert numpy.allclose(layer_bias_grads, true_bias_grads), ( - 'Layer pgrads does not give correct bias gradients output. ' - 'Correct output is {0}\n but returned output is {1}.' 
- .format(true_bias_grads, layer_bias_grads) - ) - finally: - layer.set_params(orig_params) - return True - diff --git a/notebooks/02_Linear_models.ipynb b/notebooks/02_Linear_models.ipynb deleted file mode 100644 index 004f3cd..0000000 --- a/notebooks/02_Linear_models.ipynb +++ /dev/null @@ -1,650 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "-" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Single Layer Models\n", - "\n", - "***\n", - "### Note on storing matrices in computer memory\n", - "\n", - "Suppose you want to store the following matrix in memory: $\\left[ \\begin{array}{ccc}\n", - "1 & 2 & 3 \\\\\n", - "4 & 5 & 6 \\\\\n", - "7 & 8 & 9 \\end{array} \\right]$ \n", - "\n", - "If you allocate the memory at once for the whole matrix, then the above matrix would be organised as a vector in one of two possible forms:\n", - "\n", - "* Row-wise layout where the order would look like: $\\left [ \\begin{array}{ccccccccc}\n", - "1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 \\end{array} \\right ]$\n", - "* Column-wise layout where the order would look like: $\\left [ \\begin{array}{ccccccccc}\n", - "1 & 4 & 7 & 2 & 5 & 8 & 3 & 6 & 9 \\end{array} \\right ]$\n", - "\n", - "Although `numpy` can easily handle both formats (possibly with some computational overhead), in our code we will stick with the more modern (and default) `C`-like approach and use the row-wise format (in contrast to Fortran that used a column-wise approach). \n", - "\n", - "This means, that in this tutorial:\n", - "* vectors are kept row-wise $\\mathbf{x} = (x_1, x_1, \\ldots, x_D) $ (rather than $\\mathbf{x} = (x_1, x_1, \\ldots, x_D)^T$)\n", - "* similarly, in case of matrices we will stick to: $\\left[ \\begin{array}{cccc}\n", - "x_{11} & x_{12} & \\ldots & x_{1D} \\\\\n", - "x_{21} & x_{22} & \\ldots & x_{2D} \\\\\n", - "x_{31} & x_{32} & \\ldots & x_{3D} \\\\ \\end{array} \\right]$ and each row (i.e. $\\left[ \\begin{array}{cccc} x_{11} & x_{12} & \\ldots & x_{1D} \\end{array} \\right]$) represents a single data-point (like one MNIST image or one window of observations)\n", - "\n", - "In lecture slides you will find the equations following the conventional mathematical approach, using column vectors, but you can easily map between column-major and row-major organisations using a matrix transpose.\n", - "\n", - "***\n", - "\n", - "## Linear and Affine Transforms\n", - "\n", - "The basis of all linear models is the so called affine transform, which is a transform that implements a linear transformation and translation of the input features. The transforms we are going to use are parameterised by:\n", - "\n", - " * A weight matrix $\\mathbf{W} \\in \\mathbb{R}^{D\\times K}$: where element $w_{ik}$ is the weight from input $x_i$ to output $y_k$\n", - " * A bias vector $\\mathbf{b}\\in R^{K}$ : where element $b_{k}$ is the bias for output $k$\n", - "\n", - "Note, the bias is simply some additive term, and can be easily incorporated into an additional row in weight matrix and an additional input in the inputs which is set to $1.0$ (as in the below picture taken from the lecture slides). 
However, here (and in the code) we will keep them separate.\n", - "\n", - "![Making Predictions](res/singleLayerNetWts-1.png)\n", - "\n", - "For instance, for the above example of 5-dimensional input vector by $\\mathbf{x} = (x_1, x_2, x_3, x_4, x_5)$, weight matrix $\\mathbf{W}=\\left[ \\begin{array}{ccc}\n", - "w_{11} & w_{12} & w_{13} \\\\\n", - "w_{21} & w_{22} & w_{23} \\\\\n", - "w_{31} & w_{32} & w_{33} \\\\\n", - "w_{41} & w_{42} & w_{43} \\\\\n", - "w_{51} & w_{52} & w_{53} \\\\ \\end{array} \\right]$, bias vector $\\mathbf{b} = (b_1, b_2, b_3)$ and outputs $\\mathbf{y} = (y_1, y_2, y_3)$, one can write the transformation as follows:\n", - "\n", - "(for the $i$-th output)\n", - "\n", - "(1) $\n", - "\\begin{equation}\n", - " y_i = b_i + \\sum_j x_jw_{ji}\n", - "\\end{equation}\n", - "$\n", - "\n", - "or the equivalent vector form (where $\\mathbf w_i$ is the $i$-th column of $\\mathbf W$, but note, when we **slice** the $i$th column we will get a **vector** $\\mathbf w_i = (w_{1i}, w_{2i}, w_{3i}, w_{4i}, w_{5i})$, hence the transpose for $\\mathbf w_i$ in the below equation):\n", - "\n", - "(2) $\n", - "\\begin{equation}\n", - " y_i = b_i + \\mathbf x \\mathbf w_i^T\n", - "\\end{equation}\n", - "$\n", - "\n", - "The same operation can be also written in matrix form, to compute all the outputs $\\mathbf{y}$ at the same time:\n", - "\n", - "(3) $\n", - "\\begin{equation}\n", - " \\mathbf y=\\mathbf x\\mathbf W + \\mathbf b\n", - "\\end{equation}\n", - "$\n", - "\n", - "This is equivalent to slides 12/13 in lecture 1, except we are using row vectors.\n", - "\n", - "When $\\mathbf{x}$ is a mini-batch (contains $B$ data-points of dimension $D$ each), i.e. $\\left[ \\begin{array}{cccc}\n", - "x_{11} & x_{12} & \\ldots & x_{1D} \\\\\n", - "x_{21} & x_{22} & \\ldots & x_{2D} \\\\\n", - "\\cdots \\\\\n", - "x_{B1} & x_{B2} & \\ldots & x_{BD} \\\\ \\end{array} \\right]$ equation (3) effectively becomes to be\n", - "\n", - "(4) $\n", - "\\begin{equation}\n", - " \\mathbf Y=\\mathbf X\\mathbf W + \\mathbf b\n", - "\\end{equation}\n", - "$\n", - "\n", - "where $\\mathbf{W} \\in \\mathbb{R}^{D\\times K}$ and both $\\mathbf{X}\\in\\mathbb{R}^{B\\times D}$ and $\\mathbf{Y}\\in\\mathbb{R}^{B\\times K}$ are matrices, and $\\mathbf{b}\\in\\mathbb{R}^{1\\times K}$ needs to be broadcasted $B$ times (numpy will do this by default). However, we will not make an explicit distinction between a special case for $B=1$ and $B>1$ and simply use equation (3) instead, although $\\mathbf{x}$ and hence $\\mathbf{y}$ could be matrices. From an implementation point of view, it does not matter.\n", - "\n", - "The desired functionality for matrix multiplication in numpy is provided by numpy.dot function. If you haven't use it so far, get familiar with it as we will use it extensively." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### A general note on random number generators\n", - "\n", - "It is generally a good practice (for machine learning applications **not** for cryptography!) to seed a pseudo-random number generator once at the beginning of the experiment, and use it later through the code where necesarry. This makes it easier to reproduce results since random initialisations can be replicated. 
As such, within this course we are going use a single random generator object, similar to the below:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "import numpy\n", - "\n", - "#initialise the random generator to be used later\n", - "seed=[2015, 10, 1]\n", - "random_generator = numpy.random.RandomState(seed)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Exercise 1 \n", - "\n", - "Using `numpy.dot`, implement **forward** propagation through the linear transform defined by equations (3) and (4) for $B=1$ and $B>1$ i.e. use parameters $\\mathbf{W}$ and $\\mathbf{b}$ with data $\\mathbf{X}$ to determine $\\mathbf{Y}$. Use `MNISTDataProvider` (introduced last week) to generate $\\mathbf{X}$. We are going to write a function for each equation:\n", - "1. `y1_equation_1`: Return the value of the $1^{st}$ dimension of $\\mathbf{y}$ (the output of the first output node) given a single training data point $\\mathbf{x}$ using a sum\n", - "1. `y1_equation_2`: Repeat above using vector multiplication (use `numpy.dot()`)\n", - "1. `y_equation_3`: Return the value of $\\mathbf{y}$ (the whole output layer) given a single training data point $\\mathbf{x}$\n", - "1. `Y_equation_4`: Return the value of $\\mathbf{Y}$ given $\\mathbf{X}$\n", - "\n", - "We have initialised $\\mathbf{b}$ to zeros and randomly generated $\\mathbf{W}$ for you. The constants introduced above are:\n", - "* The number of data points $B = 3$\n", - "* The dimensionality of the input $D = 784$\n", - "* The dimensionality of the output $K = 10$" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "from mlp.dataset import MNISTDataProvider\n", - "\n", - "mnist_dp = MNISTDataProvider(dset='valid', batch_size=3, max_num_batches=1, randomize=False)\n", - "B = 3\n", - "D = 784\n", - "K = 10\n", - "irange = 0.1\n", - "W = random_generator.uniform(-irange, irange, (D, K)) \n", - "b = numpy.zeros((10,))\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "\n", - "mnist_dp.reset()\n", - "\n", - "#implement following functions, then run the cell\n", - "def y1_equation_1(x, W, b):\n", - " raise NotImplementedError()\n", - " \n", - "def y1_equation_2(x, W, b):\n", - " raise NotImplementedError()\n", - "\n", - "def y_equation_3(x, W, b):\n", - " #use numpy.dot\n", - " raise NotImplementedError()\n", - "\n", - "def Y_equation_4(x, W, b):\n", - " #use numpy.dot\n", - " raise NotImplementedError()\n", - "\n", - "for X, t in mnist_dp:\n", - " n = 0\n", - " y1e1 = y1_equation_1(x[n], W, b)\n", - " y1e2 = y1_equation_2(x[n], W, b)\n", - " ye3 = y_equation_3(x[n], W, b)\n", - " Ye4 = Y_equation_4(x, W, b)\n", - "\n", - "print 'y1e1', y1e1\n", - "print 'y1e1', y1e1\n", - "print 'ye3', ye3\n", - "print 'Ye4', ye4\n", - " " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "## Exercise 2\n", - "\n", - "Modify the examples from Exercise 1 to perform **backward** propagation, that is, given $\\mathbf{y}$ (obtained in the previous step) and weight matrix $\\mathbf{W}$, project $\\mathbf{y}$ onto the input space $\\mathbf{x}$ (ignore or set to zero the biases towards $\\mathbf{x}$ in backward pass, and note, we are **not** trying to reconstruct the original $\\mathbf{x}$). 
Mathematically, we are interested in the following transformation: $\\mathbf{z}=\\mathbf{y}\\mathbf{W}^T$" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "***\n", - "## Exercise 3 (optional)\n", - "\n", - "In case you do not fully understand how matrix-vector and/or matrix-matrix products work, consider implementing `my_dot_mat_mat` function (you have been given `my_dot_vec_mat` code to look at as an example) which takes as the input the following arguments:\n", - "\n", - "* D-dimensional input vector $\\mathbf{x} = (x_1, x_2, \\ldots, x_D) $.\n", - "* Weight matrix $\\mathbf{W}\\in\\mathbb{R}^{D\\times K}$:\n", - "\n", - "and returns:\n", - "\n", - "* K-dimensional output vector $\\mathbf{y} = (y_1, \\ldots, y_K) $\n", - "\n", - "Your job is to write a variant that works in a mini-batch mode where both $\\mathbf{x}\\in\\mathbb{R}^{B\\times D}$ and $\\mathbf{y}\\in\\mathbb{R}^{B\\times K}$ are matrices in which each rows contain one of $B$ data-points from mini-batch (rather than $\\mathbf{x}\\in\\mathbb{R}^{D}$ and $\\mathbf{y}\\in\\mathbb{R}^{K}$)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "def my_dot_vec_mat(x, W):\n", - " J = x.shape[0]\n", - " K = W.shape[1]\n", - " assert (J == W.shape[0]), (\n", - " \"Number of columns of x expected to \"\n", - " \" to be equal to the number of rows in \"\n", - " \"W, bot got shapes %s, %s\" % (x.shape, W.shape)\n", - " )\n", - " y = numpy.zeros((K,))\n", - " for k in xrange(0, K):\n", - " for j in xrange(0, J):\n", - " y[k] += x[j] * W[j,k]\n", - " \n", - " return y" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "irange = 0.1 #+-range from which we draw the random numbers\n", - "\n", - "x = random_generator.uniform(-irange, irange, (5,)) \n", - "W = random_generator.uniform(-irange, irange, (5,3)) \n", - "\n", - "y_my = my_dot_vec_mat(x, W)\n", - "y_np = numpy.dot(x, W)\n", - "\n", - "same = numpy.allclose(y_my, y_np)\n", - "\n", - "if same:\n", - " print 'Well done!'\n", - "else:\n", - " print 'Matrices are different:'\n", - " print 'y_my is: ', y_my\n", - " print 'y_np is: ', y_np" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "def my_dot_mat_mat(x, W):\n", - " I = x.shape[0]\n", - " J = x.shape[1]\n", - " K = W.shape[1]\n", - " assert (J == W.shape[0]), (\n", - " \"Number of columns in of x expected to \"\n", - " \" to be the same as rows in W, got\"\n", - " )\n", - " #allocate the output container\n", - " y = numpy.zeros((I, K))\n", - " \n", - " #implement here matrix-matrix inner product here\n", - " raise NotImplementedError('Write me!')\n", - " \n", - " return y" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Test whether you get comparable numbers to what numpy is producing:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "irange = 0.1 #+-range from which we draw the random numbers\n", - "\n", - "x = random_generator.uniform(-irange, irange, (2,5)) \n", - "W = random_generator.uniform(-irange, irange, (5,3)) \n", - "\n", - "y_my = my_dot_mat_mat(x, W)\n", - "y_np = numpy.dot(x, W)\n", - "\n", 
- "same = numpy.allclose(y_my, y_np)\n", - "\n", - "if same:\n", - " print 'Well done!'\n", - "else:\n", - " print 'Matrices are different:'\n", - " print 'y_my is: ', y_my\n", - " print 'y_np is: ', y_np" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we benchmark each approach (we do it in separate cells, as timeit currently can measure whole cell execuiton only)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "#generate bit bigger matrices, to better evaluate timings\n", - "x = random_generator.uniform(-irange, irange, (10, 1000))\n", - "W = random_generator.uniform(-irange, irange, (1000, 100))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "print 'my_dot timings:'\n", - "%timeit -n10 my_dot_mat_mat(x, W)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "print 'numpy.dot timings:'\n", - "%timeit -n10 numpy.dot(x, W)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Optional section ends here**\n", - "***" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Iterative learning of linear models\n", - "\n", - "We will learn the model with stochastic gradient descent on N data-points using mean square error (MSE) loss function, which is defined as follows:\n", - "\n", - "(5) $\n", - "E = \\frac{1}{2} \\sum_{n=1}^N ||\\mathbf{y}^n - \\mathbf{t}^n||^2 = \\sum_{n=1}^N E^n \\\\\n", - " E^n = \\frac{1}{2} ||\\mathbf{y}^n - \\mathbf{t}^n||^2\n", - "$\n", - "\n", - "(6) $ E^n = \\frac{1}{2} \\sum_{k=1}^K (y_k^n - t_k^n)^2 $\n", - " \n", - "Hence, the gradient w.r.t (with respect to) the $r$ output y of the model is defined as, so called delta function, $\\delta_r$: \n", - "\n", - "(8) $\\frac{\\partial{E^n}}{\\partial{y_{r}}} = (y^n_r - t^n_r) = \\delta^n_r \\quad ; \\quad\n", - " \\delta^n_r = y^n_r - t^n_r \\\\\n", - " \\frac{\\partial{E}}{\\partial{y_{r}}} = \\sum_{n=1}^N \\frac{\\partial{E^n}}{\\partial{y_{r}}} = \\sum_{n=1}^N \\delta^n_r\n", - "$\n", - "\n", - "Similarly, using the above $\\delta^n_r$ one can express the gradient of the weight $w_{sr}$ (from the s-th input to the r-th output) for linear model and MSE cost as follows:\n", - "\n", - "(9) $\n", - " \\frac{\\partial{E^n}}{\\partial{w_{sr}}} = (y^n_r - t^n_r)x_s^n = \\delta^n_r x_s^n \\quad\\\\\n", - " \\frac{\\partial{E}}{\\partial{w_{sr}}} = \\sum_{n=1}^N \\frac{\\partial{E^n}}{\\partial{w_{rs}}} = \\sum_{n=1}^N \\delta^n_r x_s^n\n", - "$\n", - "\n", - "and the gradient for bias parameter at the $r$-th output is:\n", - "\n", - "(10) $\n", - " \\frac{\\partial{E}}{\\partial{b_{r}}} = \\sum_{n=1}^N \\frac{\\partial{E^n}}{\\partial{b_{r}}} = \\sum_{n=1}^N \\delta^n_r\n", - "$" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "![Making Predictions](res/singleLayerNetPredict.png)\n", - " \n", - " * Input vector $\\mathbf{x} = (x_1, x_2, \\ldots, x_D) $\n", - " * Output scalar $y_1$\n", - " * Weight matrix $\\mathbf{W}$: $w_{ik}$ is the weight from input $x_i$ to output $y_k$. 
Note, here this is really a vector since a single scalar output, y_1.\n", - " * Scalar bias $b$ for the only output in our model \n", - " * Scalar target $t$ for the only output in out model\n", - " \n", - "First, ensure you can make use of the data provider (note, for training data has been normalised to zero mean and unit variance, hence different effective range than one can find in file):" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "from mlp.dataset import MetOfficeDataProvider\n", - "\n", - "modp = MetOfficeDataProvider(10, batch_size=10, max_num_batches=2, randomize=False)\n", - "\n", - "%precision 2\n", - "for x, t in modp:\n", - " print 'Observations: ', x\n", - " print 'To predict: ', t" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Exercise 4\n", - "\n", - "The below code implements a very simple variant of stochastic gradient descent for the rainfall prediction example. Your task is to implement 5 functions in the next cell and then run two next cells that 1) build sgd functions and 2) run the actual training." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "\n", - "#When implementing those, take into account the mini-batch case, for which one is\n", - "#expected to sum the errors for each example\n", - "\n", - "def fprop(x, W, b):\n", - " #code implementing eq. (3)\n", - " raise NotImplementedError('Write me!')\n", - "\n", - "def cost(y, t):\n", - " #Mean Square Error cost, equation (5)\n", - " raise NotImplementedError('Write me!')\n", - "\n", - "def cost_grad(y, t):\n", - " #Gradient of the cost w.r.t y equation (8)\n", - " raise NotImplementedError('Write me!')\n", - "\n", - "def cost_wrt_W(cost_grad, x):\n", - " #Gradient of the cost w.r.t W, equation (9)\n", - " raise NotImplementedError('Write me!')\n", - " \n", - "def cost_wrt_b(cost_grad):\n", - " #Gradient of the cost w.r.t to b, equation (10)\n", - " raise NotImplementedError('Write me!')\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "\n", - "def sgd_epoch(data_provider, W, b, learning_rate):\n", - " mse_stats = []\n", - " \n", - " #get the minibatch of data\n", - " for x, t in data_provider:\n", - " \n", - " #1. get the estimate of y\n", - " y = fprop(x, W, b)\n", - "\n", - " #2. compute the loss function\n", - " tmp = cost(y, t)\n", - " mse_stats.append(tmp)\n", - " \n", - " #3. compute the grad of the cost w.r.t the output layer activation y\n", - " #i.e. how the cost changes when output y changes\n", - " cost_grad_deltas = cost_grad(y, t)\n", - "\n", - " #4. compute the gradients w.r.t model's parameters\n", - " grad_W = cost_wrt_W(cost_grad_deltas, x)\n", - " grad_b = cost_wrt_b(cost_grad_deltas)\n", - "\n", - " #4. 
Update the model, we update with the mean gradient\n", - " # over the minibatch, rather than sum of particular gradients\n", - " # in a minibatch, to do so we scale the learning rate by batch_size\n", - " batch_size = x.shape[0]\n", - " effect_learn_rate = learning_rate / batch_size\n", - "\n", - " W = W - effect_learn_rate * grad_W\n", - " b = b - effect_learn_rate * grad_b\n", - " \n", - " return W, b, numpy.mean(mse_stats)\n", - "\n", - "def sgd(data_provider, W, b, learning_rate=0.1, max_epochs=10):\n", - " \n", - " for epoch in xrange(0, max_epochs):\n", - " #reset the data provider\n", - " data_provider.reset()\n", - " \n", - " #train for one epoch\n", - " W, b, mean_cost = \\\n", - " sgd_epoch(data_provider, W, b, learning_rate)\n", - " \n", - " print \"MSE training cost after %d-th epoch is %f\" % (epoch + 1, mean_cost)\n", - " \n", - " return W, b\n", - " \n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "\n", - "#some hyper-parameters\n", - "window_size = 12\n", - "irange = 0.1\n", - "learning_rate = 0.01\n", - "max_epochs=40\n", - "\n", - "# note, while developing you can set max_num_batches to some positive number to limit\n", - "# the number of training data-points (you will get feedback faster)\n", - "mdp = MetOfficeDataProvider(window_size, batch_size=10, max_num_batches=-100, randomize=False)\n", - "\n", - "#initialise the parameters\n", - "W = random_generator.uniform(-irange, irange, (window_size, 1))\n", - "b = random_generator.uniform(-irange, irange, (1, ))\n", - "\n", - "#train the model\n", - "sgd(mdp, W, b, learning_rate=learning_rate, max_epochs=max_epochs)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "## Exercise 5\n", - "\n", - "Modify the above prediction (regression) problem so the model makes a binary classification whether the the weather is going to be one of those \\{rainy, not-rainy} (look at slide 12 of the 2nd lecture)\n", - "\n", - "Tip: You need to introduce the following changes:\n", - "1. Modify `MetOfficeDataProvider` (for example, inherit from MetOfficeDataProvider to create a new class MetOfficeDataProviderBin) and modify `next()` function so it returns as `targets` either 0 (not-rainy - if the the amount of rain [before mean/variance normalisation] is equal to 0) or 1 (rainy -- otherwise).\n", - "2. Modify the functions from previous exercise so the fprop implements `sigmoid` on top of affine transform.\n", - "3. Modify cost function to binary cross-entropy\n", - "4. 
Make sure you compute the gradients correctly (as you have changed both the output and the cost)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 2", - "language": "python", - "name": "python2" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.10" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/notebooks/03_Multi_layer_models.ipynb b/notebooks/03_Multi_layer_models.ipynb deleted file mode 100644 index d5230ac..0000000 --- a/notebooks/03_Multi_layer_models.ipynb +++ /dev/null @@ -1,303 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Introduction\n", - "\n", - "This tutorial is an introduction to the first coursework about multi-layer networks (also known as Multi-Layer Perceptrons - MLPs - or Deep Neural Networks - DNNs). Here, we will show how to build a single layer linear model (similar to the one from the previous lab) for MNIST digit classification using the provided code-base. \n", - "\n", - "The principal purpose of this introduction is to get you familiar with how to connect the code blocks (and what operations each of them implements) in order to set up an experiment that includes 1) building the model structure 2) optimising the model's parameters (weights) and 3) evaluating the model on test data. \n", - "\n", - "## For those affected by notebook kernel issues\n", - "\n", - "In case you are still having issues with running notebook kernels, have a look at [this note](https://github.com/CSTR-Edinburgh/mlpractical/blob/master/kernel_issue_fix.md) on the GitHub.\n", - "\n", - "## Virtual environments\n", - "\n", - "Before you proceed onwards, remember to activate your virtual environment:\n", - " * If you were in last week's Tuesday or Wednesday group type `activate_mlp` or `source ~/mlpractical/venv/bin/activate`\n", - " * If you were in the Monday group:\n", - " + and if you have chosen the **comfy** way type: `workon mlpractical`\n", - " + and if you have chosen the **generic** way, `source` your virutal environment using `source` and specyfing the path to the activate script (you need to localise it yourself, there were not any general recommendations w.r.t dir structure and people have installed it in different places, usually somewhere in the home directories. If you cannot easily find it by yourself, use something like: `find . -iname activate` ):\n", - "\n", - "## Syncing the git repository\n", - "\n", - "Look here for more details. But in short, we recommend to create a separate branch for the coursework, as follows:\n", - "\n", - "1. Enter the mlpractical directory `cd ~/mlpractical/repo-mlp`\n", - "2. List the branches and check which is currently active by typing: `git checkout`\n", - "3. If you are not in `master` branch, switch to it by typing: \n", - "```\n", - "git checkout master\n", - " ```\n", - "4. Then update the repository (note, assuming master does not have any conflicts), if there are some, have a look here\n", - "```\n", - "git pull\n", - "```\n", - "5. 
And now, create the new branch & swith to it by typing:\n", - "```\n", - "git checkout -b coursework1\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Multi Layer Models\n", - "\n", - "Today, we shall build models which can have an arbitrary number of hidden layers. Please have a look at the diagram below, and the corresponding computations (which have an *exact* matrix form as expected by numpy, and row-wise orientation; note that $\\circ$ denotes an element-wise product). In the diagram, we briefly describe how each comptation relates to the code we have provided.\n", - "\n", - "![Making Predictions](res/code_scheme.svg)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "1. Structuring the model\n", - " * The model (for now) is allowed to have a sequence of layers, mapping inputs $\\mathbf{x}$ to outputs $\\mathbf{y}$. \n", - " * This operation is implemented as a special type of a layer in `mlp.layers.MLP` class. It keeps a sequence of other layers (of various typyes like Linear, Sigmoid, Softmax, etc.) as well as the internal state of a model for a mini-batch, that is, the intermediate data produced in *forward* and *backward* passes.\n", - "2. Forward computation\n", - " * `mlp.layers.MLP` provides an `fprop()` method that iterates over defined layers propagates $\\mathbf{x}$ to $\\mathbf{y}$. \n", - " * Each layer (look at `mlp.layers.Linear` attached below) also implements an `fprop()` method, which performs an atomic, for the given layer, operation. Most often, for the $i$-th layer, we want to obtain a linear transform $\\mathbf a^i$ of the inputs, and apply some non-linear transfer function $f^i(\\mathbf a^i)$ to produce the output $\\mathbf h^i$. Note, in general each layer may implement different activation functions $f^i()$, however for now we will use only `sigmoid` and `softmax`\n", - "3. Backward computation\n", - " * Similarly, `mlp.layers.MLP` also implements a `bprop()` function, to back-propagate the errors from the top to the bottom layer. This class also keeps the back-propagated statistics ($\\delta$) to be used later when computing the gradients with respect to the parameters.\n", - " * This functionality is also re-implemented by particular layers (again, have a look at the `bprop` function of `mlp.layers.Linear`). `bprop()` returns both $\\delta$ (needed to update the parameters) but also back-progapates the gradient down to the inputs. Also note, that depending on whether the layer is the top or not (i.e. if it deals directly with the cost function or not) some simplifications may apply ( as with cross-entropy and softmax). That's why when implementing a new type of layer that may be used as an output layer one also need to specify the implementation of `bprop_cost()`.\n", - "4. Learning the model\n", - " * The actual evaluation of the cost as well as the *forward* and *backward* passes may be found in the `train_epoch()` method of `mlp.optimisers.SGDOptimiser`\n", - " * This function also calls the `pgrads()` method on each layer, that given activations and deltas, returns the list of the gradients of the cost with respect to the model parameters, i.e. 
$\\frac{\\partial{\\mathbf{E}}}{\\partial{\\mathbf{W^i}}}$ and $\\frac{\\partial{\\mathbf{E}}}{\\partial{\\mathbf{b}^i}}$ at the above diagram (look at an example implementation in `mlp.layers.Linear`)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "Example code for the above\n", - "```python\n", - "# %load -s Linear mlp/layers.py\n", - "class Linear(Layer):\n", - "\n", - " def __init__(self, idim, odim,\n", - " rng=None,\n", - " irange=0.1):\n", - "\n", - " super(Linear, self).__init__(rng=rng)\n", - "\n", - " self.idim = idim\n", - " self.odim = odim\n", - "\n", - " self.W = self.rng.uniform(\n", - " -irange, irange,\n", - " (self.idim, self.odim))\n", - "\n", - " self.b = numpy.zeros((self.odim,), dtype=numpy.float32)\n", - "\n", - " def fprop(self, inputs):\n", - " \"\"\"\n", - " Implements a forward propagation through the i-th layer, that is\n", - " some form of:\n", - " a^i = xW^i + b^i\n", - " h^i = f^i(a^i)\n", - " with f^i, W^i, b^i denoting a non-linearity, weight matrix and\n", - " biases of this (i-th) layer, respectively and x denoting inputs.\n", - "\n", - " :param inputs: matrix of features (x) or the output of the previous layer h^{i-1}\n", - " :return: h^i, matrix of transformed by layer features\n", - " \"\"\"\n", - " a = numpy.dot(inputs, self.W) + self.b\n", - " # here f() is an identity function, so just return a linear transformation\n", - " return a\n", - "\n", - " def bprop(self, h, igrads):\n", - " \"\"\"\n", - " Implements a backward propagation through the layer, that is, given\n", - " h^i denotes the output of the layer and x^i the input, we compute:\n", - " dh^i/dx^i which by chain rule is dh^i/da^i da^i/dx^i\n", - " x^i could be either features (x) or the output of the lower layer h^{i-1}\n", - " :param h: it's an activation produced in forward pass\n", - " :param igrads, error signal (or gradient) flowing to the layer, note,\n", - " this in general case does not corresponds to 'deltas' used to update\n", - " the layer's parameters, to get deltas ones need to multiply it with\n", - " the dh^i/da^i derivative\n", - " :return: a tuple (deltas, ograds) where:\n", - " deltas = igrads * dh^i/da^i\n", - " ograds = deltas \\times da^i/dx^i\n", - " \"\"\"\n", - "\n", - " # since df^i/da^i = 1 (f is assumed identity function),\n", - " # deltas are in fact the same as igrads\n", - " ograds = numpy.dot(igrads, self.W.T)\n", - " return igrads, ograds\n", - "\n", - " def bprop_cost(self, h, igrads, cost):\n", - " \"\"\"\n", - " Implements a backward propagation in case the layer directly\n", - " deals with the optimised cost (i.e. 
the top layer)\n", - " By default, method should implement a bprop for default cost, that is\n", - " the one that is natural to the layer's output, i.e.:\n", - " here we implement linear -> mse scenario\n", - " :param h: it's an activation produced in forward pass\n", - " :param igrads, error signal (or gradient) flowing to the layer, note,\n", - " this in general case does not corresponds to 'deltas' used to update\n", - " the layer's parameters, to get deltas ones need to multiply it with\n", - " the dh^i/da^i derivative\n", - " :param cost, mlp.costs.Cost instance defining the used cost\n", - " :return: a tuple (deltas, ograds) where:\n", - " deltas = igrads * dh^i/da^i\n", - " ograds = deltas \\times da^i/dx^i\n", - " \"\"\"\n", - "\n", - " if cost is None or cost.get_name() == 'mse':\n", - " # for linear layer and mean square error cost,\n", - " # cost back-prop is the same as standard back-prop\n", - " return self.bprop(h, igrads)\n", - " else:\n", - " raise NotImplementedError('Linear.bprop_cost method not implemented '\n", - " 'for the %s cost' % cost.get_name())\n", - "\n", - " def pgrads(self, inputs, deltas):\n", - " \"\"\"\n", - " Return gradients w.r.t parameters\n", - "\n", - " :param inputs, input to the i-th layer\n", - " :param deltas, deltas computed in bprop stage up to -ith layer\n", - " :return list of grads w.r.t parameters dE/dW and dE/db in *exactly*\n", - " the same order as the params are returned by get_params()\n", - "\n", - " Note: deltas here contain the whole chain rule leading\n", - " from the cost up to the the i-th layer, i.e.\n", - " dE/dy^L dy^L/da^L da^L/dh^{L-1} dh^{L-1}/da^{L-1} ... dh^{i}/da^{i}\n", - " and here we are just asking about\n", - " 1) da^i/dW^i and 2) da^i/db^i\n", - " since W and b are only layer's parameters\n", - " \"\"\"\n", - "\n", - " grad_W = numpy.dot(inputs.T, deltas)\n", - " grad_b = numpy.sum(deltas, axis=0)\n", - "\n", - " return [grad_W, grad_b]\n", - "\n", - " def get_params(self):\n", - " return [self.W, self.b]\n", - "\n", - " def set_params(self, params):\n", - " #we do not make checks here, but the order on the list\n", - " #is assumed to be exactly the same as get_params() returns\n", - " self.W = params[0]\n", - " self.b = params[1]\n", - "\n", - " def get_name(self):\n", - " return 'linear'\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Example 1: Experiment with linear models and MNIST\n", - "\n", - "The below snippet demonstrates how to use the code we have provided for the coursework 1. Get familiar with it, as from now on we will use till the end of the course, including the 2nd coursework.\n", - "\n", - "It should be straightforward to extend the following code to more complex models, like stack more layers, change the cost, the optimiser, learning rate schedules, etc.. But **ask** in case something is not clear.\n", - "\n", - "In this particular example, we use the following components:\n", - " * One layer mapping data-points ($\\mathbf x$) straight to 10 digits classes represented as 10 (linear) outputs ($\\mathbf y$). This operation is implemented as a linear layer in `mlp.layers.Linear`. Get familiar with this class (read the comments, etc.) 
as it is going to be a building block for the coursework.\n", - " * One can stack as many different layers as required through the container `mlp.layers.MLP`\n", - " * As an objective here we use the Mean Square Error cost defined in `mlp.costs.MSECost`\n", - " * Our *Stochastic Gradient Descent* optimiser can be found in `mlp.optimisers.SGDOptimiser`. Its parent `mlp.optimisers.Optimiser` implements validation functionality (and an interface in case one need to implement a different optimiser)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "import numpy\n", - "import logging\n", - "\n", - "logger = logging.getLogger()\n", - "logger.setLevel(logging.INFO)\n", - "\n", - "from mlp.layers import MLP, Linear #import required layer types\n", - "from mlp.optimisers import SGDOptimiser #import the optimiser\n", - "from mlp.dataset import MNISTDataProvider #import data provider\n", - "from mlp.costs import MSECost #import the cost we want to use for optimisation\n", - "from mlp.schedulers import LearningRateFixed\n", - "\n", - "rng = numpy.random.RandomState([2015,10,10])\n", - "\n", - "# define the model structure, here just one linear layer\n", - "# and mean square error cost\n", - "cost = MSECost()\n", - "model = MLP(cost=cost)\n", - "model.add_layer(Linear(idim=784, odim=10, rng=rng))\n", - "#one can stack more layers here\n", - "\n", - "# define the optimiser, here stochasitc gradient descent\n", - "# with fixed learning rate and max_epochs as stopping criterion\n", - "lr_scheduler = LearningRateFixed(learning_rate=0.01, max_epochs=20)\n", - "optimiser = SGDOptimiser(lr_scheduler=lr_scheduler)\n", - "\n", - "logger.info('Initialising data providers...')\n", - "train_dp = MNISTDataProvider(dset='train', batch_size=100, max_num_batches=-10, randomize=True)\n", - "valid_dp = MNISTDataProvider(dset='valid', batch_size=100, max_num_batches=-10, randomize=False)\n", - "\n", - "logger.info('Training started...')\n", - "optimiser.train(model, train_dp, valid_dp)\n", - "\n", - "logger.info('Testing the model on test set:')\n", - "test_dp = MNISTDataProvider(dset='eval', batch_size=100, max_num_batches=-10, randomize=False)\n", - "cost, accuracy = optimiser.validate(model, test_dp)\n", - "logger.info('MNIST test set accuracy is %.2f %% (cost is %.3f)'%(accuracy*100., cost))\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Exercise\n", - "\n", - "Modify the above code by adding an intemediate linear layer of size 200 hidden units between input and output layers." 
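
One possible solution sketch for the above exercise, assuming the same `mlp` API, imports, random generator and data providers as in Example 1 (the layer sizes follow the exercise statement; everything else is unchanged):

```python
# Sketch only: reuses rng, lr_scheduler, optimiser, train_dp and valid_dp from
# Example 1 above. The single 784 -> 10 Linear layer is replaced by two Linear
# layers with a 200-dimensional intermediate representation.
cost = MSECost()
model = MLP(cost=cost)
model.add_layer(Linear(idim=784, odim=200, rng=rng))  # inputs (784) -> 200 intermediate units
model.add_layer(Linear(idim=200, odim=10, rng=rng))   # 200 intermediate units -> 10 outputs

optimiser.train(model, train_dp, valid_dp)
```

Note that stacking two purely linear layers still gives an overall linear (affine) map from inputs to outputs, so this exercise changes the parameterisation rather than the class of functions the model can represent; non-linear transfer functions are introduced later.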
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 2", - "language": "python", - "name": "python2" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.10" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/notebooks/04_Regularisation.ipynb b/notebooks/04_Regularisation.ipynb deleted file mode 100644 index 24f2349..0000000 --- a/notebooks/04_Regularisation.ipynb +++ /dev/null @@ -1,293 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Introduction\n", - "\n", - "This tutorial focuses on implementation of three reqularisaion techniques: two of them add a regularisation term to the cost function based on the *L1* and *L2* norms; the third technique, called *Dropout*, is a form of noise injection by random corruption of information carried by the hidden units during training.\n", - "\n", - "\n", - "## Virtual environments\n", - "\n", - "Before you proceed onwards, remember to activate your virtual environment by typing `activate_mlp` or `source ~/mlpractical/venv/bin/activate` (or if you did the original install the \"comfy way\" type: `workon mlpractical`).\n", - "\n", - "\n", - "## Syncing the git repository\n", - "\n", - "Look here for more details. But in short, we recommend to create a separate branch for this lab, as follows:\n", - "\n", - "1. Enter the mlpractical directory `cd ~/mlpractical/repo-mlp`\n", - "2. List the branches and check which are currently active by typing: `git branch`\n", - "3. If you have followed our recommendations, you should be in the `coursework1` branch, please commit your local changed to the repo index by typing:\n", - "```\n", - "git commit -am \"finished coursework\"\n", - "```\n", - "4. Now you can switch to `master` branch by typing: \n", - "```\n", - "git checkout master\n", - " ```\n", - "5. To update the repository (note, assuming master does not have any conflicts), if there are some, have a look here\n", - "```\n", - "git pull\n", - "```\n", - "6. And now, create the new branch & swith to it by typing:\n", - "```\n", - "git checkout -b lab4\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Regularisation\n", - "\n", - "Regularisation add a *complexity term* to the cost function. Its purpose is to put some prior on the model's parameters, which will penalise complexity. The most common prior is perhaps the one which assumes smoother solutions (the one which are not able to fit training data too well) are better as they are more likely to better generalise to unseen data. \n", - "\n", - "A way to incorporate such a prior in the model is to add some term that penalise certain configurations of the parameters -- either from growing too large ($L_2$) or the one that prefers a solution that could be modelled with fewer parameters ($L_1$), hence encouraging some parameters to become 0. 
One can, of course, combine many such priors when optimising the model, however, in the lab we shall use $L_1$ and/or $L_2$ priors.\n", - "\n", - "$L_1$ and $L_2$ priors can be easily incorporated into the training objective through additive terms, as follows:\n", - "\n", - "(1) $\n", - " \\begin{align*}\n", - " E^n &= \\underbrace{E^n_{\\text{train}}}_{\\text{data term}} + \n", - " \\underbrace{\\beta_{L_1} E^n_{L_1}}_{\\text{prior term}} + \\underbrace{\\beta_{L_2} E^n_{L_2}}_{\\text{prior term}}\n", - "\\end{align*}\n", - "$\n", - "\n", - "where $ E^n_{\\text{train}} = - \\sum_{k=1}^K t^n_k \\ln y^n_k $ is the cross-entropy cost function, $\\beta_{L_1}$ and $\\beta_{L_2}$ are non-negative constants specified in advance (hyper-parameters) and $E^n_{L_1}$ and $E^n_{L_2}$ are norm metrics specifying certain properties of the parameters:\n", - "\n", - "(2) $\n", - " \\begin{align*}\n", - " E^n_{L_p}(\\mathbf{W}) = ||\\mathbf{W}||_p = \\left ( \\sum_{i,j \\in \\mathbf{W}} |w_{i,j}|^p \\right )^{\\frac{1}{p}}\n", - "\\end{align*}\n", - "$\n", - "\n", - "where $p$ denotes the norm-order (for regularisation either 1 or 2). Notice, in practice for computational purposes we will rather compute squared $L_{p=2}$ norm, which omits the square root in (2), that is:\n", - "\n", - "(3)$ \\begin{align*}\n", - " E^n_{L_{p=2}}(\\mathbf{W}) = ||\\mathbf{W}||^2_2 = \\left ( \\left ( \\sum_{i,j \\in \\mathbf{W}} |w_{i,j}|^2 \\right )^{\\frac{1}{2}} \\right )^2 = \\sum_{i,j \\in \\mathbf{W}} |w_{i,j}|^2\n", - "\\end{align*}\n", - "$\n", - "\n", - "## $L_{p=2}$ (Weight Decay)\n", - "\n", - "Our cost with $L_{2}$ regulariser then becomes ($\\frac{1}{2}$ simplifies a derivative later):\n", - "\n", - "(4) $\n", - " \\begin{align*}\n", - " E^n &= \\underbrace{E^n_{\\text{train}}}_{\\text{data term}} + \n", - " \\underbrace{\\beta_{L_2} \\frac{1}{2} E^n_{L_2}}_{\\text{prior term}}\n", - "\\end{align*}\n", - "$\n", - "\n", - "Hence, the gradient of the cost w.r.t parameter $w_i$ is given as follows:\n", - "\n", - "(5) $\n", - "\\begin{align*}\\frac{\\partial E^n}{\\partial w_i} &= \\frac{\\partial (E^n_{\\text{train}} + \\beta_{L_2} 0.5 E^n_{L_2}) }{\\partial w_i} \n", - " = \\left( \\frac{\\partial E^n_{\\text{train}}}{\\partial w_i} + \\beta_{L_2} 0.5 \\frac{\\partial\n", - " E^n_{L_2}}{\\partial w_i} \\right) \n", - " = \\left( \\frac{\\partial E^n_{\\text{train}}}{\\partial w_i} + \\beta_{L_2} w_i \\right)\n", - "\\end{align*}\n", - "$\n", - "\n", - "And the actual update we to the $W_i$ parameter is:\n", - "\n", - "(6) $\n", - "\\begin{align*}\n", - " \\Delta w_i &= -\\eta \\left( \\frac{\\partial E^n_{\\text{train}}}{\\partial w_i} + \\beta_{L_2} w_i \\right) \n", - "\\end{align*}\n", - "$\n", - "\n", - "where $\\eta$ is learning rate. \n", - "\n", - "Exercise 1 gives some more implementational suggestions on how to incorporate this technique into the lab code, the cost related prior contributions (equation (1)) are computed in mlp.optimisers.Optimiser.compute_prior_costs() and your job is to add the relevant optimisation related code when computing the gradients w.r.t parameters. 
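
As a concrete illustration of equations (5) and (6), here is a minimal sketch of how the $L_2$ term could enter the parameter gradients, written in the style of the `pgrads()` method of `mlp.layers.Linear` shown earlier. The function name and the `l2_weight` argument (playing the role of $\beta_{L_2}$) are illustrative assumptions, not the required place in the provided code-base:

```python
import numpy

def pgrads_with_l2(inputs, deltas, W, l2_weight):
    """Sketch: gradients of (data cost + 0.5 * l2_weight * ||W||_2^2) w.r.t. W and b.

    grad_W follows equation (5): the data-term gradient plus beta_{L2} * W.
    Biases are left unregularised here, as is usual for weight decay.
    """
    grad_W = numpy.dot(inputs.T, deltas) + l2_weight * W  # x^T delta + beta_{L2} W
    grad_b = numpy.sum(deltas, axis=0)
    return [grad_W, grad_b]
```

An SGD step of the form $w \leftarrow w - \eta \cdot \mathrm{grad}$ applied to the returned gradients then reproduces the update in equation (6).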
\n", - "\n", - "## $L_{p=1}$ (Sparsity)\n", - "\n", - "Our cost with $L_{1}$ regulariser then becomes:\n", - "\n", - "(7) $\n", - " \\begin{align*}\n", - " E^n &= \\underbrace{E^n_{\\text{train}}}_{\\text{data term}} + \n", - " \\underbrace{\\beta_{L_1} E^n_{L_1}}_{\\text{prior term}} \n", - "\\end{align*}\n", - "$\n", - "\n", - "Hence, the gradient of the cost w.r.t parameter $w_i$ is given as follows:\n", - "\n", - "(8) $\\begin{align*}\n", - " \\frac{\\partial E^n}{\\partial w_i} = \\frac{\\partial E^n_{\\text{train}}}{\\partial w_i} + \\beta_{L_1} \\frac{\\partial E_{L_1}}{\\partial w_i} = \\frac{\\partial E^n_{\\text{train}}}{\\partial w_i} + \\beta_{L_1} \\mbox{sgn}(w_i)\n", - "\\end{align*}\n", - "$\n", - "\n", - "And the actual update we to the $W_i$ parameter is:\n", - "\n", - "(9) $\\begin{align*}\n", - " \\Delta w_i &= -\\eta \\left( \\frac{\\partial E^n_{\\text{train}}}{\\partial w_i} + \\beta_{L_1} \\mbox{sgn}(w_i) \\right) \n", - "\\end{align*}$\n", - "\n", - "Where $\\mbox{sgn}(w_i)$ is the sign of $w_i$: $\\mbox{sgn}(w_i) = 1$ if $w_i>0$ and $\\mbox{sgn}(w_i) = -1$ if $w_i<0$\n", - "\n", - "One can also easily apply those penalty terms for biases, however, this is usually not necessary as biases do not affect the smoothness of the solution (given data).\n", - "\n", - "## Dropout\n", - "\n", - "For a given layer's output $\\mathbf{h}^i \\in \\mathbb{R}^{BxH^l}$ (where $B$ is batch size and $H^l$ is the $l$-th layer output dimensionality), Dropout implements the following transformation:\n", - "\n", - "(10) $\\mathbf{\\hat h}^l = \\mathbf{d}^l\\circ\\mathbf{h}^l$\n", - "\n", - "where $\\circ$ denotes an elementwise product and $\\mathbf{d}^l \\in \\{0,1\\}^{BxH^i}$ is a matrix in which element $d^l_{ij}$ is sampled from the Bernoulli distribution:\n", - "\n", - "(11) $d^l_{ij} \\sim \\mbox{Bernoulli}(p^l_d)$\n", - "\n", - "with $0