diff --git a/notebooks/Convolutional layer tests (with implementation).ipynb b/notebooks/Convolutional layer tests (with implementation).ipynb
new file mode 100644
index 0000000..73a6bde
--- /dev/null
+++ b/notebooks/Convolutional layer tests (with implementation).ipynb	
@@ -0,0 +1,997 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For those who decide to implement and experiment with convolutional layers for the second coursework, below a skeleton class and associated test functions for the `fprop`, `bprop` and `grads_wrt_params` methods of the class are included.\n",
+    "\n",
+    "The test functions assume that in your implementation of `fprop` for the convolutional layer, outputs are calculated only for 'valid' overlaps of the kernel filters with the input - i.e. without any padding.\n",
+    "\n",
+    "It is also assumed that, if convolutions with non-unit strides are implemented, the default behaviour is to use unit strides; the test cases are only correct for unit strides in both directions."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "import mlp.layers as layers\n",
+    "import mlp.initialisers as init\n",
+    "\n",
+    "from scipy.ndimage.filters import convolve\n",
+    "from scipy.signal import convolve2d, correlate2d\n",
+    "\n",
+    "class ConvolutionalLayer(layers.LayerWithParameters):\n",
+    "    \"\"\"Layer implementing a 2D convolution-based transformation of its inputs.\n",
+    "\n",
+    "    The layer is parameterised by a set of 2D convolutional kernels, a four\n",
+    "    dimensional array of shape\n",
+    "        (num_output_channels, num_input_channels, kernel_dim_1, kernel_dim_2)\n",
+    "    and a bias vector, a one dimensional array of shape\n",
+    "        (num_output_channels,)\n",
+    "    i.e. one shared bias per output channel.\n",
+    "\n",
+    "    Assuming no-padding is applied to the inputs so that outputs are only\n",
+    "    calculated for positions where the kernel filters fully overlap with the\n",
+    "    inputs, and that unit strides are used the outputs will have spatial extent\n",
+    "        output_dim_1 = input_dim_1 - kernel_dim_1 + 1\n",
+    "        output_dim_2 = input_dim_2 - kernel_dim_2 + 1\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(self, num_input_channels, num_output_channels,\n",
+    "                 input_dim_1, input_dim_2,\n",
+    "                 kernel_dim_1, kernel_dim_2,\n",
+    "                 kernels_init=init.UniformInit(-0.01, 0.01),\n",
+    "                 biases_init=init.ConstantInit(0.),\n",
+    "                 kernels_penalty=None, biases_penalty=None):\n",
+    "        \"\"\"Initialises a parameterised convolutional layer.\n",
+    "\n",
+    "        Args:\n",
+    "            num_input_channels (int): Number of channels in inputs to\n",
+    "                layer (this may be number of colour channels in the input\n",
+    "                images if used as the first layer in a model, or the\n",
+    "                number of output channels, a.k.a. feature maps, from a\n",
+    "                previous convolutional layer).\n",
+    "            num_output_channels (int): Number of channels in outputs\n",
+    "                from the layer, a.k.a. number of feature maps.\n",
+    "            input_dim_1 (int): Size of first input dimension of each 2D\n",
+    "                channel of inputs.\n",
+    "            input_dim_2 (int): Size of second input dimension of each 2D\n",
+    "                channel of inputs.\n",
+    "            kernel_dim_1 (int): Size of first dimension of each 2D channel of\n",
+    "                kernels.\n",
+    "            kernel_dim_2 (int): Size of second dimension of each 2D channel of\n",
+    "                kernels.\n",
+    "            kernels_init: Initialiser for the kernel parameters.\n",
+    "            biases_init: Initialiser for the bias parameters.\n",
+    "            kernels_penalty: Kernel-dependent penalty term (regulariser) or\n",
+    "                None if no regularisation is to be applied to the kernels.\n",
+    "            biases_penalty: Biases-dependent penalty term (regulariser) or\n",
+    "                None if no regularisation is to be applied to the biases.\n",
+    "        \"\"\"\n",
+    "        self.num_input_channels = num_input_channels\n",
+    "        self.num_output_channels = num_output_channels\n",
+    "        self.input_dim_1 = input_dim_1\n",
+    "        self.input_dim_2 = input_dim_2\n",
+    "        self.kernel_dim_1 = kernel_dim_1\n",
+    "        self.kernel_dim_2 = kernel_dim_2\n",
+    "        self.kernels_init = kernels_init\n",
+    "        self.biases_init = biases_init\n",
+    "        self.kernels_shape = (\n",
+    "            num_output_channels, num_input_channels, kernel_dim_1, kernel_dim_2\n",
+    "        )\n",
+    "        self.inputs_shape = (\n",
+    "            None, num_input_channels, input_dim_1, input_dim_2\n",
+    "        )\n",
+    "        self.kernels = self.kernels_init(self.kernels_shape)\n",
+    "        self.biases = self.biases_init(num_output_channels)\n",
+    "        self.kernels_penalty = kernels_penalty\n",
+    "        self.biases_penalty = biases_penalty\n",
+    "\n",
+    "    def fprop(self, inputs):\n",
+    "        \"\"\"Forward propagates activations through the layer transformation.\n",
+    "\n",
+    "        For inputs `x`, outputs `y`, kernels `K` and biases `b` the layer\n",
+    "        corresponds to `y = conv2d(x, K) + b`.\n",
+    "\n",
+    "        Args:\n",
+    "            inputs: Array of layer inputs of shape \n",
+    "                (batch_size, num_input_channels, input_dim_1, input_dim_2).\n",
+    "\n",
+    "        Returns:\n",
+    "            outputs: Array of layer outputs of shape \n",
+    "                (batch_size, num_output_channels, output_dim_1, output_dim_2).\n",
+    "        \"\"\"\n",
+    "        output_dim_1 = self.input_dim_1 - self.kernel_dim_1 + 1\n",
+    "        output_dim_2 = self.input_dim_2 - self.kernel_dim_2 + 1\n",
+    "        batch_size = inputs.shape[0]\n",
+    "        outputs = np.zeros((batch_size, self.num_output_channels, \n",
+    "                            output_dim_1, output_dim_2))\n",
+    "        for b in range(batch_size):\n",
+    "            for o in range(self.num_output_channels):\n",
+    "                for i in range(self.num_input_channels):\n",
+    "                    outputs[b, o] += convolve2d(\n",
+    "                        inputs[b, i], self.kernels[o, i], mode='valid')\n",
+    "                outputs[b, o] += self.biases[o]\n",
+    "        return outputs\n",
+    "\n",
+    "    def bprop(self, inputs, outputs, grads_wrt_outputs):\n",
+    "        \"\"\"Back propagates gradients through a layer.\n",
+    "\n",
+    "        Given gradients with respect to the outputs of the layer calculates the\n",
+    "        gradients with respect to the layer inputs.\n",
+    "\n",
+    "        Args:\n",
+    "            inputs: Array of layer inputs of shape\n",
+    "                (batch_size, num_input_channels, input_dim_1, input_dim_2).\n",
+    "            outputs: Array of layer outputs calculated in forward pass of\n",
+    "                shape\n",
+    "                (batch_size, num_output_channels, output_dim_1, output_dim_2).\n",
+    "            grads_wrt_outputs: Array of gradients with respect to the layer\n",
+    "                outputs of shape\n",
+    "                (batch_size, num_output_channels, output_dim_1, output_dim_2).\n",
+    "\n",
+    "        Returns:\n",
+    "            Array of gradients with respect to the layer inputs of shape\n",
+    "            (batch_size, num_input_channels, input_dim_1, input_dim_2).\n",
+    "        \"\"\"\n",
+    "        output_dim_1, output_dim_2 = grads_wrt_outputs.shape[-2:]\n",
+    "        batch_size = inputs.shape[0]\n",
+    "        pad_1 = self.kernel_dim_1 - 1\n",
+    "        pad_2 = self.kernel_dim_2 - 1\n",
+    "        padded_grads_wrt_outputs = np.zeros(\n",
+    "            (batch_size, self.num_output_channels, \n",
+    "             output_dim_1 + 2 * pad_1, output_dim_2 + 2 * pad_2)\n",
+    "        )\n",
+    "        padded_grads_wrt_outputs[\n",
+    "            :, :, pad_1:pad_1 + output_dim_1, pad_2:pad_2 + output_dim_2] = grads_wrt_outputs\n",
+    "        grads_wrt_inputs = np.zeros(\n",
+    "            (batch_size, self.num_input_channels, self.input_dim_1, self.input_dim_2))\n",
+    "        for b in range(batch_size):\n",
+    "            for o in range(self.num_output_channels):\n",
+    "                for i in range(self.num_input_channels):\n",
+    "                    grads_wrt_inputs[b, i] += correlate2d(\n",
+    "                        padded_grads_wrt_outputs[b, o], self.kernels[o, i], mode='valid')\n",
+    "        return grads_wrt_inputs\n",
+    "\n",
+    "    def grads_wrt_params(self, inputs, grads_wrt_outputs):\n",
+    "        \"\"\"Calculates gradients with respect to layer parameters.\n",
+    "\n",
+    "        Args:\n",
+    "            inputs: array of inputs to layer of shape\n",
+    "                (batch_size, num_input_channels, input_dim_1, input_dim_2).\n",
+    "            grads_wrt_outputs: array of gradients with respect to the layer\n",
+    "                outputs of shape\n",
+    "                (batch_size, num_output_channels, output_dim_1, output_dim_2).\n",
+    "\n",
+    "        Returns:\n",
+    "            list of arrays of gradients with respect to the layer parameters\n",
+    "            `[grads_wrt_kernels, grads_wrt_biases]`.\n",
+    "        \"\"\"\n",
+    "        output_dim_1, output_dim_2 = grads_wrt_outputs.shape[-2:]\n",
+    "        batch_size = inputs.shape[0]\n",
+    "        grads_wrt_kernels = np.zeros(self.kernels_shape)\n",
+    "        for b in range(batch_size):\n",
+    "            for o in range(self.num_output_channels):\n",
+    "                for i in range(self.num_input_channels):\n",
+    "                    grads_wrt_kernels[o, i] += correlate2d(\n",
+    "                        grads_wrt_outputs[b, o], inputs[b, i], mode='valid')\n",
+    "        grads_wrt_biases = grads_wrt_outputs.sum((0, 2, 3))\n",
+    "        return grads_wrt_kernels, grads_wrt_biases\n",
+    "\n",
+    "    def params_penalty(self):\n",
+    "        \"\"\"Returns the parameter dependent penalty term for this layer.\n",
+    "\n",
+    "        If no parameter-dependent penalty terms are set this returns zero.\n",
+    "        \"\"\"\n",
+    "        params_penalty = 0\n",
+    "        if self.kernels_penalty is not None:\n",
+    "            params_penalty += self.kernels_penalty(self.kernels)\n",
+    "        if self.biases_penalty is not None:\n",
+    "            params_penalty += self.biases_penalty(self.biases)\n",
+    "        return params_penalty\n",
+    "\n",
+    "    @property\n",
+    "    def params(self):\n",
+    "        \"\"\"A list of layer parameter values: `[kernels, biases]`.\"\"\"\n",
+    "        return [self.kernels, self.biases]\n",
+    "\n",
+    "    @params.setter\n",
+    "    def params(self, values):\n",
+    "        self.kernels = values[0]\n",
+    "        self.biases = values[1]\n",
+    "\n",
+    "    def __repr__(self):\n",
+    "        return (\n",
+    "            'ConvolutionalLayer(\\n'\n",
+    "            '    num_input_channels={0}, num_output_channels={1},\\n'\n",
+    "            '    input_dim_1={2}, input_dim_2={3},\\n'\n",
+    "            '    kernel_dim_1={4}, kernel_dim_2={5}\\n'\n",
+    "            ')'\n",
+    "            .format(self.num_input_channels, self.num_output_channels,\n",
+    "                    self.input_dim_1, self.input_dim_2, self.kernel_dim_1,\n",
+    "                    self.kernel_dim_2)\n",
+    "        )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "%load_ext cython"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "%%cython --compile-args=-fopenmp --link-args=-fopenmp\n",
+    "#!python\n",
+    "#cython: embedsignature=True\n",
+    "\n",
+    "import numpy as np\n",
+    "cimport numpy as np\n",
+    "cimport cython\n",
+    "from cython.view cimport array\n",
+    "from cython.parallel import prange\n",
+    "\n",
+    "DTYPE = np.float64\n",
+    "ctypedef np.float64_t DTYPE_t\n",
+    "\n",
+    "@cython.boundscheck(False)\n",
+    "@cython.wraparound(False)\n",
+    "@cython.initializedcheck(False)\n",
+    "def conv2d_fprop(\n",
+    "        double[:, :, :, :] inputs, double[:, :, :, :] kernels,\n",
+    "        int stride_x=1, int stride_y=1):\n",
+    "    \"\"\"Strided 'valid' 2D cross-correlation of a batch of inputs with kernels.\n",
+    "\n",
+    "    Kernels are slid over the inputs without being flipped and without any\n",
+    "    padding; no bias term is added.\n",
+    "\n",
+    "    Args:\n",
+    "        inputs: Array of shape\n",
+    "            (batch_size, n_in_channels, in_shape_x, in_shape_y).\n",
+    "        kernels: Array of shape\n",
+    "            (n_out_channels, n_in_channels, kernel_shape_x, kernel_shape_y).\n",
+    "        stride_x: Step between kernel positions along the first spatial axis.\n",
+    "        stride_y: Step between kernel positions along the second spatial axis.\n",
+    "\n",
+    "    Returns:\n",
+    "        Memoryview of shape\n",
+    "            (batch_size, n_out_channels, out_shape_x, out_shape_y)\n",
+    "        where out_shape = (in_shape - kernel_shape) // stride + 1 per axis.\n",
+    "\n",
+    "    Note:\n",
+    "        Bounds checking is disabled, so `inputs` must have at least\n",
+    "        `kernels.shape[1]` channels.\n",
+    "    \"\"\"\n",
+    "    cdef int i, k, x, y, c, m, n, l, t\n",
+    "    cdef int batch_size = inputs.shape[0]\n",
+    "    cdef int in_shape_x = inputs.shape[2]\n",
+    "    cdef int in_shape_y = inputs.shape[3]\n",
+    "    cdef int n_out_channels = kernels.shape[0]\n",
+    "    cdef int n_in_channels = kernels.shape[1]\n",
+    "    cdef int kernel_shape_x = kernels.shape[2]\n",
+    "    cdef int kernel_shape_y = kernels.shape[3]\n",
+    "    # Number of kernel positions that fit entirely inside the input: the last\n",
+    "    # valid offset is in_shape - kernel_shape, visited in steps of stride.\n",
+    "    # (The previous (in - kernel + 1) // stride form under-counted, e.g. a\n",
+    "    # 4-wide input with a 2-wide kernel and stride 2 has 2 positions, not 1.)\n",
+    "    cdef int out_shape_x = (in_shape_x - kernel_shape_x) // stride_x + 1\n",
+    "    cdef int out_shape_y = (in_shape_y - kernel_shape_y) // stride_y + 1\n",
+    "    cdef double [:, :, :, :] outputs = np.zeros(\n",
+    "        (batch_size, n_out_channels, out_shape_x, out_shape_y))\n",
+    "    for i in range(batch_size):\n",
+    "        for k in range(n_out_channels):\n",
+    "            for x in range(out_shape_x):\n",
+    "                for y in range(out_shape_y):\n",
+    "                    # Top-left corner of the input patch for output (x, y).\n",
+    "                    l = x * stride_x\n",
+    "                    t = y * stride_y\n",
+    "                    for c in range(n_in_channels):\n",
+    "                        for m in range(kernel_shape_x):\n",
+    "                            for n in range(kernel_shape_y):\n",
+    "                                outputs[i, k, x, y] += (\n",
+    "                                    kernels[k, c, m, n] * inputs[i, c, l + m, t + n])\n",
+    "    return outputs\n",
+    "\n",
+    "@cython.boundscheck(False)\n",
+    "@cython.wraparound(False)\n",
+    "@cython.initializedcheck(False)\n",
+    "def conv2d_bprop(\n",
+    "           double[:, :, :, :] grads_at_output, \n",
+    "           double[:, :, :, :] kernels,\n",
+    "           int stride_x=1, int stride_y=1):\n",
+    "    cdef int i, k, x, y, c, m, n, l, t\n",
+    "    cdef int batch_size = grads_at_output.shape[0]\n",
+    "    cdef int n_out_channels = kernels.shape[0]\n",
+    "    cdef int n_in_channels = kernels.shape[1]\n",
+    "    cdef int kernel_shape_x = kernels.shape[2]\n",
+    "    cdef int kernel_shape_y = kernels.shape[3]\n",
+    "    cdef int out_shape_x = grads_at_output.shape[2]\n",
+    "    cdef int out_shape_y = grads_at_output.shape[3]\n",
+    "    cdef int in_shape_x = out_shape_x * stride_x + kernel_shape_x - 1\n",
+    "    cdef int in_shape_y = out_shape_y * stride_y + kernel_shape_y - 1\n",
+    "    cdef double [:, :, :, :] grads_at_input = np.zeros(\n",
+    "        (batch_size, n_in_channels, in_shape_x, in_shape_y))\n",
+    "    for i in range(batch_size):\n",
+    "        for k in range(n_out_channels):\n",
+    "            for x in range(out_shape_x):\n",
+    "                for y in range(out_shape_y):\n",
+    "                    l = x * stride_x\n",
+    "                    t = y * stride_y\n",
+    "                    for c in range(n_in_channels):\n",
+    "                        for m in range(kernel_shape_x):\n",
+    "                            for n in range(kernel_shape_y):\n",
+    "                                grads_at_input[i, c, l + m, t + n] += (\n",
+    "                                    kernels[k, c, m, n] * grads_at_output[i, k, x, y])\n",
+    "    return grads_at_input\n",
+    "\n",
+    "@cython.boundscheck(False)\n",
+    "@cython.wraparound(False)\n",
+    "@cython.initializedcheck(False)\n",
+    "def conv2d_grads_wrt_kernels(\n",
+    "           double[:, :, :, :] grads_at_output, \n",
+    "           double[:, :, :, :] inputs,\n",
+    "           int kernel_shape_x, int kernel_shape_y,\n",
+    "           int stride_x=1, int stride_y=1):\n",
+    "    cdef int i, k, x, y, c, m, n, l, t\n",
+    "    cdef int batch_size = grads_at_output.shape[0]\n",
+    "    cdef int n_out_channels = grads_at_output.shape[1]\n",
+    "    cdef int n_in_channels = inputs.shape[1]\n",
+    "    cdef int out_shape_x = grads_at_output.shape[2]\n",
+    "    cdef int out_shape_y = grads_at_output.shape[3]\n",
+    "    cdef int in_shape_x = inputs.shape[2]\n",
+    "    cdef int in_shape_y = inputs.shape[3]\n",
+    "    cdef double [:, :, :, :] kernel_grads = np.zeros(\n",
+    "        (n_out_channels, n_in_channels, kernel_shape_x, kernel_shape_y))\n",
+    "    for i in range(batch_size):\n",
+    "        for k in range(n_out_channels):\n",
+    "            for x in range(out_shape_x):\n",
+    "                for y in range(out_shape_y):\n",
+    "                    l = x * stride_x\n",
+    "                    t = y * stride_y\n",
+    "                    for c in range(n_in_channels):\n",
+    "                        for m in range(kernel_shape_x):\n",
+    "                            for n in range(kernel_shape_y):\n",
+    "                                kernel_grads[k, c, m, n] += (\n",
+    "                                    inputs[i, c, l + m, t + n] * grads_at_output[i, k, x, y])\n",
+    "    return kernel_grads"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "class CythonConvolutionalLayer(ConvolutionalLayer):\n",
+    "    \n",
+    "    def __init__(self, num_input_channels, num_output_channels,\n",
+    "                 input_dim_1, input_dim_2,\n",
+    "                 kernel_dim_1, kernel_dim_2,\n",
+    "                 kernels_init=init.UniformInit(-0.01, 0.01),\n",
+    "                 biases_init=init.ConstantInit(0.),\n",
+    "                 kernels_penalty=None, biases_penalty=None):\n",
+    "        super(CythonConvolutionalLayer, self).__init__(\n",
+    "            num_input_channels, num_output_channels,\n",
+    "            input_dim_1, input_dim_2,\n",
+    "            kernel_dim_1, kernel_dim_2,\n",
+    "            kernels_init, biases_init, \n",
+    "            kernels_penalty, biases_penalty\n",
+    "        )\n",
+    "        self.kernels = self.kernels.astype(np.double)\n",
+    "        self.biases = self.biases.astype(np.double)\n",
+    "\n",
+    "    def fprop(self, inputs):\n",
+    "        \"\"\"Forward propagates activations through the layer transformation.\n",
+    "\n",
+    "        For inputs `x`, outputs `y`, kernels `K` and biases `b` the layer\n",
+    "        corresponds to `y = conv2d(x, K) + b`.\n",
+    "\n",
+    "        Args:\n",
+    "            inputs: Array of layer inputs of shape \n",
+    "                (batch_size, num_input_channels, input_dim_1, input_dim_2).\n",
+    "\n",
+    "        Returns:\n",
+    "            outputs: Array of layer outputs of shape \n",
+    "                (batch_size, num_output_channels, output_dim_1, output_dim_2).\n",
+    "        \"\"\"\n",
+    "        return np.array(conv2d_fprop(inputs, self.kernels)) + self.biases[None, :, None, None]\n",
+    "\n",
+    "    def bprop(self, inputs, outputs, grads_wrt_outputs):\n",
+    "        \"\"\"Back propagates gradients through a layer.\n",
+    "\n",
+    "        Given gradients with respect to the outputs of the layer calculates the\n",
+    "        gradients with respect to the layer inputs.\n",
+    "\n",
+    "        Args:\n",
+    "            inputs: Array of layer inputs of shape\n",
+    "                (batch_size, num_input_channels, input_dim_1, input_dim_2).\n",
+    "            outputs: Array of layer outputs calculated in forward pass of\n",
+    "                shape\n",
+    "                (batch_size, num_output_channels, output_dim_1, output_dim_2).\n",
+    "            grads_wrt_outputs: Array of gradients with respect to the layer\n",
+    "                outputs of shape\n",
+    "                (batch_size, num_output_channels, output_dim_1, output_dim_2).\n",
+    "\n",
+    "        Returns:\n",
+    "            Array of gradients with respect to the layer inputs of shape\n",
+    "            (batch_size, num_input_channels, input_dim_1, input_dim_2).\n",
+    "        \"\"\"\n",
+    "        return np.array(conv2d_bprop(grads_wrt_outputs, self.kernels, 1, 1))\n",
+    "\n",
+    "    def grads_wrt_params(self, inputs, grads_wrt_outputs):\n",
+    "        \"\"\"Calculates gradients with respect to layer parameters.\n",
+    "\n",
+    "        Args:\n",
+    "            inputs: array of inputs to layer of shape\n",
+    "                (batch_size, num_input_channels, input_dim_1, input_dim_2).\n",
+    "            grads_wrt_outputs: array of gradients with respect to the layer\n",
+    "                outputs of shape\n",
+    "                (batch_size, num_output_channels, output_dim_1, output_dim_2).\n",
+    "\n",
+    "        Returns:\n",
+    "            list of arrays of gradients with respect to the layer parameters\n",
+    "            `[grads_wrt_kernels, grads_wrt_biases]`.\n",
+    "        \"\"\"\n",
+    "        grads_wrt_kernels = conv2d_grads_wrt_kernels(\n",
+    "            grads_wrt_outputs, inputs, self.kernel_dim_1, self.kernel_dim_2, 1, 1)\n",
+    "        grads_wrt_biases = grads_wrt_outputs.sum((0, 2, 3))\n",
+    "        return np.array(grads_wrt_kernels), grads_wrt_biases\n",
+    "\n",
+    "\n",
+    "    def __repr__(self):\n",
+    "        return (\n",
+    "            'CythonConvolutionalLayer(\\n'\n",
+    "            '    num_input_channels={0}, num_output_channels={1},\\n'\n",
+    "            '    input_dim_1={2}, input_dim_2={3},\\n'\n",
+    "            '    kernel_dim_1={4}, kernel_dim_2={5}\\n'\n",
+    "            ')'\n",
+    "            .format(self.num_input_channels, self.num_output_channels,\n",
+    "                    self.input_dim_1, self.input_dim_2, self.kernel_dim_1,\n",
+    "                    self.kernel_dim_2)\n",
+    "        )"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The three test functions are defined in the cell below. All the functions take as first argument the *class* corresponding to the convolutional layer implementation to be tested (**not** an instance of the class). It is assumed the class being tested has an `__init__` method with at least all of the arguments defined in the skeleton definition above. A boolean second argument to each function can be used to specify if the layer implements a cross-correlation or convolution based operation (see note in [seventh lecture slides](http://www.inf.ed.ac.uk/teaching/courses/mlp/2016/mlp07-cnn.pdf))."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "\n",
+    "def test_conv_layer_fprop(layer_class, do_cross_correlation=False):\n",
+    "    \"\"\"Tests `fprop` method of a convolutional layer.\n",
+    "    \n",
+    "    Checks the outputs of `fprop` method for a fixed input against known\n",
+    "    reference values for the outputs and raises an AssertionError if\n",
+    "    the outputted values are not consistent with the reference values. If\n",
+    "    tests are all passed returns True.\n",
+    "    \n",
+    "    Args:\n",
+    "        layer_class: Convolutional layer implementation following the \n",
+    "            interface defined in the provided skeleton class.\n",
+    "        do_cross_correlation: Whether the layer implements an operation\n",
+    "            corresponding to cross-correlation (True) i.e. kernels are\n",
+    "            not flipped before sliding over inputs, or convolution\n",
+    "            (False) with filters being flipped.\n",
+    "\n",
+    "    Raises:\n",
+    "        AssertionError: Raised if output of `layer.fprop` is inconsistent \n",
+    "            with reference values either in shape or values.\n",
+    "    \"\"\"\n",
+    "    inputs = np.arange(96).reshape((2, 3, 4, 4)).astype(np.double)\n",
+    "    kernels = np.arange(-12, 12).reshape((2, 3, 2, 2)).astype(np.double)\n",
+    "    if do_cross_correlation:\n",
+    "        kernels = kernels[:, :, ::-1, ::-1]\n",
+    "    biases = np.arange(2).astype(np.double)\n",
+    "    true_output = np.array(\n",
+    "        [[[[ -958., -1036., -1114.],\n",
+    "           [-1270., -1348., -1426.],\n",
+    "           [-1582., -1660., -1738.]],\n",
+    "          [[ 1707.,  1773.,  1839.],\n",
+    "           [ 1971.,  2037.,  2103.],\n",
+    "           [ 2235.,  2301.,  2367.]]],\n",
+    "         [[[-4702., -4780., -4858.],\n",
+    "           [-5014., -5092., -5170.],\n",
+    "           [-5326., -5404., -5482.]],\n",
+    "          [[ 4875.,  4941.,  5007.],\n",
+    "           [ 5139.,  5205.,  5271.],\n",
+    "           [ 5403.,  5469.,  5535.]]]]\n",
+    "    )\n",
+    "    layer = layer_class(\n",
+    "        num_input_channels=kernels.shape[1], \n",
+    "        num_output_channels=kernels.shape[0], \n",
+    "        input_dim_1=inputs.shape[2], \n",
+    "        input_dim_2=inputs.shape[3],\n",
+    "        kernel_dim_1=kernels.shape[2],\n",
+    "        kernel_dim_2=kernels.shape[3]\n",
+    "    )\n",
+    "    layer.params = [kernels, biases]\n",
+    "    layer_output = layer.fprop(inputs)\n",
+    "    assert layer_output.shape == true_output.shape, (\n",
+    "        'Layer fprop gives incorrect shaped output. '\n",
+    "        'Correct shape is \\n\\n{0}\\n\\n but returned shape is \\n\\n{1}.'\n",
+    "        .format(true_output.shape, layer_output.shape)\n",
+    "    )\n",
+    "    assert np.allclose(layer_output, true_output), (\n",
+    "        'Layer fprop does not give correct output. '\n",
+    "        'Correct output is \\n\\n{0}\\n\\n but returned output is \\n\\n{1}.'\n",
+    "        .format(true_output, layer_output)\n",
+    "    )\n",
+    "    return True\n",
+    "\n",
+    "def test_conv_layer_bprop(layer_class, do_cross_correlation=False):\n",
+    "    \"\"\"Tests `bprop` method of a convolutional layer.\n",
+    "    \n",
+    "    Checks the outputs of `bprop` method for a fixed input against known\n",
+    "    reference values for the gradients with respect to inputs and raises \n",
+    "    an AssertionError if the returned values are not consistent with the\n",
+    "    reference values. If tests are all passed returns True.\n",
+    "    \n",
+    "    Args:\n",
+    "        layer_class: Convolutional layer implementation following the \n",
+    "            interface defined in the provided skeleton class.\n",
+    "        do_cross_correlation: Whether the layer implements an operation\n",
+    "            corresponding to cross-correlation (True) i.e. kernels are\n",
+    "            not flipped before sliding over inputs, or convolution\n",
+    "            (False) with filters being flipped.\n",
+    "\n",
+    "    Raises:\n",
+    "        AssertionError: Raised if output of `layer.bprop` is inconsistent \n",
+    "            with reference values either in shape or values.\n",
+    "    \"\"\"\n",
+    "    inputs = np.arange(96).reshape((2, 3, 4, 4)).astype(np.double)\n",
+    "    kernels = np.arange(-12, 12).reshape((2, 3, 2, 2)).astype(np.double)\n",
+    "    if do_cross_correlation:\n",
+    "        kernels = kernels[:, :, ::-1, ::-1]\n",
+    "    biases = np.arange(2).astype(np.double)\n",
+    "    grads_wrt_outputs = np.arange(-20, 16).reshape((2, 2, 3, 3)).astype(np.double)\n",
+    "    outputs = np.array(\n",
+    "        [[[[ -958., -1036., -1114.],\n",
+    "           [-1270., -1348., -1426.],\n",
+    "           [-1582., -1660., -1738.]],\n",
+    "          [[ 1707.,  1773.,  1839.],\n",
+    "           [ 1971.,  2037.,  2103.],\n",
+    "           [ 2235.,  2301.,  2367.]]],\n",
+    "         [[[-4702., -4780., -4858.],\n",
+    "           [-5014., -5092., -5170.],\n",
+    "           [-5326., -5404., -5482.]],\n",
+    "          [[ 4875.,  4941.,  5007.],\n",
+    "           [ 5139.,  5205.,  5271.],\n",
+    "           [ 5403.,  5469.,  5535.]]]]\n",
+    "    )\n",
+    "    true_grads_wrt_inputs = np.array(\n",
+    "      [[[[ 147.,  319.,  305.,  162.],\n",
+    "         [ 338.,  716.,  680.,  354.],\n",
+    "         [ 290.,  608.,  572.,  294.],\n",
+    "         [ 149.,  307.,  285.,  144.]],\n",
+    "        [[  23.,   79.,   81.,   54.],\n",
+    "         [ 114.,  284.,  280.,  162.],\n",
+    "         [ 114.,  272.,  268.,  150.],\n",
+    "         [  73.,  163.,  157.,   84.]],\n",
+    "        [[-101., -161., -143.,  -54.],\n",
+    "         [-110., -148., -120.,  -30.],\n",
+    "         [ -62.,  -64.,  -36.,    6.],\n",
+    "         [  -3.,   19.,   29.,   24.]]],\n",
+    "       [[[  39.,   67.,   53.,   18.],\n",
+    "         [  50.,   68.,   32.,   -6.],\n",
+    "         [   2.,  -40.,  -76.,  -66.],\n",
+    "         [ -31.,  -89., -111.,  -72.]],\n",
+    "        [[  59.,  115.,  117.,   54.],\n",
+    "         [ 114.,  212.,  208.,   90.],\n",
+    "         [ 114.,  200.,  196.,   78.],\n",
+    "         [  37.,   55.,   49.,   12.]],\n",
+    "        [[  79.,  163.,  181.,   90.],\n",
+    "         [ 178.,  356.,  384.,  186.],\n",
+    "         [ 226.,  440.,  468.,  222.],\n",
+    "         [ 105.,  199.,  209.,   96.]]]])\n",
+    "    layer = layer_class(\n",
+    "        num_input_channels=kernels.shape[1], \n",
+    "        num_output_channels=kernels.shape[0], \n",
+    "        input_dim_1=inputs.shape[2], \n",
+    "        input_dim_2=inputs.shape[3],\n",
+    "        kernel_dim_1=kernels.shape[2],\n",
+    "        kernel_dim_2=kernels.shape[3]\n",
+    "    )\n",
+    "    layer.params = [kernels, biases]\n",
+    "    layer_grads_wrt_inputs = layer.bprop(inputs, outputs, grads_wrt_outputs)\n",
+    "    assert layer_grads_wrt_inputs.shape == true_grads_wrt_inputs.shape, (\n",
+    "        'Layer bprop returns incorrect shaped array. '\n",
+    "        'Correct shape is \\n\\n{0}\\n\\n but returned shape is \\n\\n{1}.'\n",
+    "        .format(true_grads_wrt_inputs.shape, layer_grads_wrt_inputs.shape)\n",
+    "    )\n",
+    "    assert np.allclose(layer_grads_wrt_inputs, true_grads_wrt_inputs), (\n",
+    "        'Layer bprop does not return correct values. '\n",
+    "        'Correct output is \\n\\n{0}\\n\\n but returned output is \\n\\n{1}'\n",
+    "        .format(true_grads_wrt_inputs, layer_grads_wrt_inputs)\n",
+    "    )\n",
+    "    return True\n",
+    "\n",
+    "def test_conv_layer_grad_wrt_params(\n",
+    "        layer_class, do_cross_correlation=False):\n",
+    "    \"\"\"Tests `grads_wrt_params` method of a convolutional layer.\n",
+    "    \n",
+    "    Checks the outputs of `grads_wrt_params` method for fixed inputs \n",
+    "    against known reference values for the gradients with respect to \n",
+    "    kernels and biases, and raises an AssertionError if the returned\n",
+    "    values are not consistent with the reference values. If tests\n",
+    "    are all passed returns True.\n",
+    "    \n",
+    "    Args:\n",
+    "        layer_class: Convolutional layer implementation following the \n",
+    "            interface defined in the provided skeleton class.\n",
+    "        do_cross_correlation: Whether the layer implements an operation\n",
+    "            corresponding to cross-correlation (True) i.e kernels are\n",
+    "            not flipped before sliding over inputs, or convolution\n",
+    "            (False) with filters being flipped.\n",
+    "\n",
+    "    Raises:\n",
+    "        AssertionError: Raised if output of `layer.bprop` is inconsistent \n",
+    "            with reference values either in shape or values.\n",
+    "    \"\"\"\n",
+    "    inputs = np.arange(96).reshape((2, 3, 4, 4)).astype(np.double)\n",
+    "    kernels = np.arange(-12, 12).reshape((2, 3, 2, 2)).astype(np.double)\n",
+    "    biases = np.arange(2).astype(np.double)\n",
+    "    grads_wrt_outputs = np.arange(-20, 16).reshape((2, 2, 3, 3)).astype(np.double)\n",
+    "    true_kernel_grads = np.array(\n",
+    "        [[[[ -240.,  -114.],\n",
+    "         [  264.,   390.]],\n",
+    "        [[-2256., -2130.],\n",
+    "         [-1752., -1626.]],\n",
+    "        [[-4272., -4146.],\n",
+    "         [-3768., -3642.]]],\n",
+    "       [[[ 5268.,  5232.],\n",
+    "         [ 5124.,  5088.]],\n",
+    "        [[ 5844.,  5808.],\n",
+    "         [ 5700.,  5664.]],\n",
+    "        [[ 6420.,  6384.],\n",
+    "         [ 6276.,  6240.]]]])\n",
+    "    if do_cross_correlation:\n",
+    "        kernels = kernels[:, :, ::-1, ::-1]\n",
+    "        true_kernel_grads = true_kernel_grads[:, :, ::-1, ::-1]\n",
+    "    true_bias_grads = np.array([-126.,   36.])\n",
+    "    layer = layer_class(\n",
+    "        num_input_channels=kernels.shape[1], \n",
+    "        num_output_channels=kernels.shape[0], \n",
+    "        input_dim_1=inputs.shape[2], \n",
+    "        input_dim_2=inputs.shape[3],\n",
+    "        kernel_dim_1=kernels.shape[2],\n",
+    "        kernel_dim_2=kernels.shape[3]\n",
+    "    )\n",
+    "    layer.params = [kernels, biases]\n",
+    "    layer_kernel_grads, layer_bias_grads = (\n",
+    "        layer.grads_wrt_params(inputs, grads_wrt_outputs))\n",
+    "    assert layer_kernel_grads.shape == true_kernel_grads.shape, (\n",
+    "        'grads_wrt_params gives incorrect shaped kernel gradients output. '\n",
+    "        'Correct shape is \\n\\n{0}\\n\\n but returned shape is \\n\\n{1}.'\n",
+    "        .format(true_kernel_grads.shape, layer_kernel_grads.shape)\n",
+    "    )\n",
+    "    assert np.allclose(layer_kernel_grads, true_kernel_grads), (\n",
+    "        'grads_wrt_params does not give correct kernel gradients output. '\n",
+    "        'Correct output is \\n\\n{0}\\n\\n but returned output is \\n\\n{1}.'\n",
+    "        .format(true_kernel_grads, layer_kernel_grads)\n",
+    "    )\n",
+    "    assert layer_bias_grads.shape == true_bias_grads.shape, (\n",
+    "        'grads_wrt_params gives incorrect shaped bias gradients output. '\n",
+    "        'Correct shape is \\n\\n{0}\\n\\n but returned shape is \\n\\n{1}.'\n",
+    "        .format(true_bias_grads.shape, layer_bias_grads.shape)\n",
+    "    )\n",
+    "    assert np.allclose(layer_bias_grads, true_bias_grads), (\n",
+    "        'grads_wrt_params does not give correct bias gradients output. '\n",
+    "        'Correct output is \\n\\n{0}\\n\\n but returned output is \\n\\n{1}.'\n",
+    "        .format(true_bias_grads, layer_bias_grads)\n",
+    "    )\n",
+    "    return True"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "An example of using the test functions is given in the cell below. This assumes you implement a convolution (rather than cross-correlation) operation. If the implementation is correct, the message `All tests passed.` will be printed."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "All tests passed.\n"
+     ]
+    }
+   ],
+   "source": [
+    "all_correct = (test_conv_layer_fprop(ConvolutionalLayer, False) &\n",
+    "               test_conv_layer_bprop(ConvolutionalLayer, False) &\n",
+    "               test_conv_layer_grad_wrt_params(ConvolutionalLayer, False))\n",
+    "if all_correct:  # failing tests raise AssertionError before reaching here\n",
+    "    print('All tests passed.')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "All tests passed.\n"
+     ]
+    }
+   ],
+   "source": [
+    "all_correct = (test_conv_layer_fprop(CythonConvolutionalLayer, True) &\n",
+    "               test_conv_layer_bprop(CythonConvolutionalLayer, True) &\n",
+    "               test_conv_layer_grad_wrt_params(CythonConvolutionalLayer, True))\n",
+    "if all_correct:  # failing tests raise AssertionError before reaching here\n",
+    "    print('All tests passed.')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "import logging\n",
+    "from mlp.layers import ReluLayer, ReshapeLayer, AffineLayer\n",
+    "from mlp.errors import CrossEntropySoftmaxError\n",
+    "from mlp.models import MultipleLayerModel\n",
+    "from mlp.initialisers import UniformInit, ConstantInit, GlorotUniformInit\n",
+    "from mlp.learning_rules import GradientDescentLearningRule\n",
+    "from mlp.data_providers import MNISTDataProvider\n",
+    "from mlp.optimisers import Optimiser\n",
+    "%matplotlib inline\n",
+    "plt.style.use('ggplot')\n",
+    "\n",
+    "# Fixed seed so that the random number generator state is reproducible\n",
+    "seed = 6102016\n",
+    "rng = np.random.RandomState(seed)\n",
+    "\n",
+    "# Root logger reports INFO messages via one stream handler (stderr by default)\n",
+    "logger = logging.getLogger()\n",
+    "logger.handlers = [logging.StreamHandler()]\n",
+    "logger.setLevel(logging.INFO)\n",
+    "\n",
+    "# MNIST training and validation data providers sharing the generator above\n",
+    "train_data = MNISTDataProvider('train', rng=rng)\n",
+    "valid_data = MNISTDataProvider('valid', rng=rng)\n",
+    "output_dim, input_dim = 10, 784"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "train_data.inputs, valid_data.inputs = (  # cast both input arrays to double\n",
+    "    train_data.inputs.astype(np.double), valid_data.inputs.astype(np.double))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Epoch 0:\n",
+      "  error(train)=2.30e+00, acc(train)=1.40e-01, error(valid)=2.30e+00, acc(valid)=1.34e-01, params_penalty=0.00e+00\n",
+      "Epoch 1: 20.20s to complete\n",
+      "  error(train)=3.70e-01, acc(train)=8.91e-01, error(valid)=3.38e-01, acc(valid)=9.03e-01, params_penalty=0.00e+00\n",
+      "Epoch 2: 23.26s to complete\n",
+      "  error(train)=3.38e-01, acc(train)=9.03e-01, error(valid)=3.11e-01, acc(valid)=9.12e-01, params_penalty=0.00e+00\n",
+      "Epoch 3: 23.27s to complete\n",
+      "  error(train)=3.18e-01, acc(train)=9.10e-01, error(valid)=3.00e-01, acc(valid)=9.16e-01, params_penalty=0.00e+00\n",
+      "Epoch 4: 25.56s to complete\n",
+      "  error(train)=3.14e-01, acc(train)=9.10e-01, error(valid)=2.96e-01, acc(valid)=9.17e-01, params_penalty=0.00e+00\n",
+      "Epoch 5: 25.17s to complete\n",
+      "  error(train)=3.06e-01, acc(train)=9.13e-01, error(valid)=2.85e-01, acc(valid)=9.19e-01, params_penalty=0.00e+00\n",
+      "Epoch 6: 25.59s to complete\n",
+      "  error(train)=3.07e-01, acc(train)=9.14e-01, error(valid)=2.87e-01, acc(valid)=9.20e-01, params_penalty=0.00e+00\n",
+      "Epoch 7: 28.70s to complete\n",
+      "  error(train)=3.00e-01, acc(train)=9.15e-01, error(valid)=2.82e-01, acc(valid)=9.19e-01, params_penalty=0.00e+00\n",
+      "Epoch 8: 24.72s to complete\n",
+      "  error(train)=2.91e-01, acc(train)=9.18e-01, error(valid)=2.75e-01, acc(valid)=9.24e-01, params_penalty=0.00e+00\n",
+      "Epoch 9: 29.81s to complete\n",
+      "  error(train)=2.91e-01, acc(train)=9.19e-01, error(valid)=2.75e-01, acc(valid)=9.24e-01, params_penalty=0.00e+00\n",
+      "Epoch 10: 26.75s to complete\n",
+      "  error(train)=2.88e-01, acc(train)=9.19e-01, error(valid)=2.72e-01, acc(valid)=9.26e-01, params_penalty=0.00e+00\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Hyperparameters of the model and of the training run\n",
+    "batch_size = 50\n",
+    "kernel_dim_1 = 4\n",
+    "kernel_dim_2 = 4\n",
+    "input_dim_1 = 28\n",
+    "input_dim_2 = 28\n",
+    "num_output_channels = 1\n",
+    "num_input_channels = 1\n",
+    "learning_rate = 0.01\n",
+    "num_epochs = 10\n",
+    "stats_interval = 1\n",
+    "\n",
+    "# Reset random number generator and data provider states on each run\n",
+    "# to ensure reproducibility of results\n",
+    "rng.seed(seed)\n",
+    "train_data.reset()\n",
+    "valid_data.reset()\n",
+    "\n",
+    "# Alter data-provider batch size\n",
+    "train_data.batch_size = batch_size\n",
+    "valid_data.batch_size = batch_size\n",
+    "\n",
+    "# Kernels are sampled uniformly from [-0.01, 0.01], affine layer weights\n",
+    "# use Glorot uniform initialisation and all biases start at zero\n",
+    "kernels_init = UniformInit(-0.01, 0.01, rng=rng)\n",
+    "weights_init = GlorotUniformInit(rng=rng)\n",
+    "biases_init = ConstantInit(0.)\n",
+    "\n",
+    "# hidden_dim is the number of values output by the 'valid' convolution\n",
+    "hidden_dim = (input_dim_1 - kernel_dim_1 + 1) * (input_dim_2 - kernel_dim_2 + 1) * num_output_channels\n",
+    "model = MultipleLayerModel([\n",
+    "    ReshapeLayer((num_input_channels, input_dim_1, input_dim_2)),\n",
+    "    ConvolutionalLayer(\n",
+    "        num_input_channels, num_output_channels,\n",
+    "        input_dim_1, input_dim_2, kernel_dim_1, kernel_dim_2,\n",
+    "        kernels_init, biases_init\n",
+    "    ),\n",
+    "    ReluLayer(),\n",
+    "    ReshapeLayer(),\n",
+    "    AffineLayer(hidden_dim, output_dim, weights_init, biases_init)\n",
+    "])\n",
+    "\n",
+    "# Initialise a cross entropy error object\n",
+    "error = CrossEntropySoftmaxError()\n",
+    "\n",
+    "# Use a basic gradient descent learning rule\n",
+    "learning_rule = GradientDescentLearningRule(learning_rate=learning_rate)\n",
+    "\n",
+    "# Monitor classification accuracy during training\n",
+    "data_monitors = {'acc': lambda y, t: (y.argmax(-1) == t.argmax(-1)).mean()}\n",
+    "\n",
+    "optimiser = Optimiser(\n",
+    "    model, error, learning_rule, train_data, valid_data, data_monitors)\n",
+    "\n",
+    "stats, keys, run_time = optimiser.train(num_epochs=num_epochs, stats_interval=stats_interval)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Epoch 0:\n",
+      "  error(train)=2.30e+00, acc(train)=1.09e-01, error(valid)=2.30e+00, acc(valid)=1.02e-01, params_penalty=0.00e+00\n",
+      "Epoch 1: 9.53s to complete\n",
+      "  error(train)=3.74e-01, acc(train)=8.90e-01, error(valid)=3.41e-01, acc(valid)=9.02e-01, params_penalty=0.00e+00\n",
+      "Epoch 2: 8.50s to complete\n",
+      "  error(train)=3.39e-01, acc(train)=9.03e-01, error(valid)=3.11e-01, acc(valid)=9.11e-01, params_penalty=0.00e+00\n",
+      "Epoch 3: 6.74s to complete\n",
+      "  error(train)=3.19e-01, acc(train)=9.10e-01, error(valid)=3.01e-01, acc(valid)=9.15e-01, params_penalty=0.00e+00\n",
+      "Epoch 4: 10.08s to complete\n",
+      "  error(train)=3.15e-01, acc(train)=9.11e-01, error(valid)=2.96e-01, acc(valid)=9.17e-01, params_penalty=0.00e+00\n",
+      "Epoch 5: 8.82s to complete\n",
+      "  error(train)=3.06e-01, acc(train)=9.14e-01, error(valid)=2.85e-01, acc(valid)=9.21e-01, params_penalty=0.00e+00\n",
+      "Epoch 6: 10.37s to complete\n",
+      "  error(train)=3.07e-01, acc(train)=9.13e-01, error(valid)=2.87e-01, acc(valid)=9.21e-01, params_penalty=0.00e+00\n",
+      "Epoch 7: 7.81s to complete\n",
+      "  error(train)=2.99e-01, acc(train)=9.16e-01, error(valid)=2.82e-01, acc(valid)=9.20e-01, params_penalty=0.00e+00\n",
+      "Epoch 8: 6.83s to complete\n",
+      "  error(train)=2.91e-01, acc(train)=9.18e-01, error(valid)=2.75e-01, acc(valid)=9.25e-01, params_penalty=0.00e+00\n",
+      "Epoch 9: 10.23s to complete\n",
+      "  error(train)=2.91e-01, acc(train)=9.19e-01, error(valid)=2.74e-01, acc(valid)=9.24e-01, params_penalty=0.00e+00\n",
+      "Epoch 10: 7.87s to complete\n",
+      "  error(train)=2.88e-01, acc(train)=9.20e-01, error(valid)=2.71e-01, acc(valid)=9.25e-01, params_penalty=0.00e+00\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Hyperparameters of the model and of the training run\n",
+    "batch_size = 50\n",
+    "kernel_dim_1 = 4\n",
+    "kernel_dim_2 = 4\n",
+    "input_dim_1 = 28\n",
+    "input_dim_2 = 28\n",
+    "num_output_channels = 1\n",
+    "num_input_channels = 1\n",
+    "learning_rate = 0.01\n",
+    "num_epochs = 10\n",
+    "stats_interval = 1\n",
+    "\n",
+    "# Reset random number generator and data provider states on each run\n",
+    "# to ensure reproducibility of results\n",
+    "rng.seed(seed)\n",
+    "train_data.reset()\n",
+    "valid_data.reset()\n",
+    "\n",
+    "# Alter data-provider batch size\n",
+    "train_data.batch_size = batch_size\n",
+    "valid_data.batch_size = batch_size\n",
+    "\n",
+    "# Kernels are sampled uniformly from [-0.01, 0.01], affine layer weights\n",
+    "# use Glorot uniform initialisation and all biases start at zero\n",
+    "kernels_init = UniformInit(-0.01, 0.01, rng=rng)\n",
+    "weights_init = GlorotUniformInit(rng=rng)\n",
+    "biases_init = ConstantInit(0.)\n",
+    "\n",
+    "# hidden_dim is the number of values output by the 'valid' convolution\n",
+    "hidden_dim = (input_dim_1 - kernel_dim_1 + 1) * (input_dim_2 - kernel_dim_2 + 1) * num_output_channels\n",
+    "model = MultipleLayerModel([\n",
+    "    ReshapeLayer((num_input_channels, input_dim_1, input_dim_2)),\n",
+    "    CythonConvolutionalLayer(\n",
+    "        num_input_channels, num_output_channels,\n",
+    "        input_dim_1, input_dim_2, kernel_dim_1, kernel_dim_2,\n",
+    "        kernels_init, biases_init\n",
+    "    ),\n",
+    "    ReluLayer(),\n",
+    "    ReshapeLayer(),\n",
+    "    AffineLayer(hidden_dim, output_dim, weights_init, biases_init)\n",
+    "])\n",
+    "\n",
+    "# Initialise a cross entropy error object\n",
+    "error = CrossEntropySoftmaxError()\n",
+    "\n",
+    "# Use a basic gradient descent learning rule\n",
+    "learning_rule = GradientDescentLearningRule(learning_rate=learning_rate)\n",
+    "\n",
+    "# Monitor classification accuracy during training\n",
+    "data_monitors = {'acc': lambda y, t: (y.argmax(-1) == t.argmax(-1)).mean()}\n",
+    "\n",
+    "optimiser = Optimiser(\n",
+    "    model, error, learning_rule, train_data, valid_data, data_monitors)\n",
+    "\n",
+    "stats, keys, run_time = optimiser.train(num_epochs=num_epochs, stats_interval=stats_interval)"
+   ]
+  }
+ ],
+ "metadata": {
+  "anaconda-cloud": {},
+  "kernelspec": {
+   "display_name": "Python [conda env:mlp]",
+   "language": "python",
+   "name": "conda-env-mlp-py"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}