Adding notebook with convolutional layer implementation.

Matt Graham 2017-01-27 18:25:12 +00:00
parent 1e4a198711
commit 43eaa03bcf


{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"For those who decide to implement and experiment with convolutional layers for the second coursework, below a skeleton class and associated test functions for the `fprop`, `bprop` and `grads_wrt_params` methods of the class are included.\n",
"\n",
"The test functions assume that in your implementation of `fprop` for the convolutional layer, outputs are calculated only for 'valid' overlaps of the kernel filters with the input - i.e. without any padding.\n",
"\n",
"It is also assumed that if convolutions with non-unit strides are implemented the default behaviour is to take unit-strides, with the test cases only correct for unit strides in both directions."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import mlp.layers as layers\n",
"import mlp.initialisers as init\n",
"\n",
"from scipy.ndimage.filters import convolve\n",
"from scipy.signal import convolve2d, correlate2d\n",
"\n",
"class ConvolutionalLayer(layers.LayerWithParameters):\n",
" \"\"\"Layer implementing a 2D convolution-based transformation of its inputs.\n",
"\n",
" The layer is parameterised by a set of 2D convolutional kernels, a four\n",
" dimensional array of shape\n",
" (num_output_channels, num_input_channels, kernel_dim_1, kernel_dim_2)\n",
" and a bias vector, a one dimensional array of shape\n",
" (num_output_channels,)\n",
" i.e. one shared bias per output channel.\n",
"\n",
" Assuming no-padding is applied to the inputs so that outputs are only\n",
" calculated for positions where the kernel filters fully overlap with the\n",
" inputs, and that unit strides are used the outputs will have spatial extent\n",
" output_dim_1 = input_dim_1 - kernel_dim_1 + 1\n",
" output_dim_2 = input_dim_2 - kernel_dim_2 + 1\n",
" \"\"\"\n",
"\n",
" def __init__(self, num_input_channels, num_output_channels,\n",
" input_dim_1, input_dim_2,\n",
" kernel_dim_1, kernel_dim_2,\n",
" kernels_init=init.UniformInit(-0.01, 0.01),\n",
" biases_init=init.ConstantInit(0.),\n",
" kernels_penalty=None, biases_penalty=None):\n",
" \"\"\"Initialises a parameterised convolutional layer.\n",
"\n",
" Args:\n",
" num_input_channels (int): Number of channels in inputs to\n",
" layer (this may be number of colour channels in the input\n",
" images if used as the first layer in a model, or the\n",
" number of output channels, a.k.a. feature maps, from a\n",
" a previous convolutional layer).\n",
" num_output_channels (int): Number of channels in outputs\n",
" from the layer, a.k.a. number of feature maps.\n",
" input_dim_1 (int): Size of first input dimension of each 2D\n",
" channel of inputs.\n",
" input_dim_2 (int): Size of second input dimension of each 2D\n",
" channel of inputs.\n",
" kernel_dim_x (int): Size of first dimension of each 2D channel of\n",
" kernels.\n",
" kernel_dim_y (int): Size of second dimension of each 2D channel of\n",
" kernels.\n",
" kernels_intialiser: Initialiser for the kernel parameters.\n",
" biases_initialiser: Initialiser for the bias parameters.\n",
" kernels_penalty: Kernel-dependent penalty term (regulariser) or\n",
" None if no regularisation is to be applied to the kernels.\n",
" biases_penalty: Biases-dependent penalty term (regulariser) or\n",
" None if no regularisation is to be applied to the biases.\n",
" \"\"\"\n",
" self.num_input_channels = num_input_channels\n",
" self.num_output_channels = num_output_channels\n",
" self.input_dim_1 = input_dim_1\n",
" self.input_dim_2 = input_dim_2\n",
" self.kernel_dim_1 = kernel_dim_1\n",
" self.kernel_dim_2 = kernel_dim_2\n",
" self.kernels_init = kernels_init\n",
" self.biases_init = biases_init\n",
" self.kernels_shape = (\n",
" num_output_channels, num_input_channels, kernel_dim_1, kernel_dim_2\n",
" )\n",
" self.inputs_shape = (\n",
" None, num_input_channels, input_dim_1, input_dim_2\n",
" )\n",
" self.kernels = self.kernels_init(self.kernels_shape)\n",
" self.biases = self.biases_init(num_output_channels)\n",
" self.kernels_penalty = kernels_penalty\n",
" self.biases_penalty = biases_penalty\n",
"\n",
" def fprop(self, inputs):\n",
" \"\"\"Forward propagates activations through the layer transformation.\n",
"\n",
" For inputs `x`, outputs `y`, kernels `K` and biases `b` the layer\n",
" corresponds to `y = conv2d(x, K) + b`.\n",
"\n",
" Args:\n",
" inputs: Array of layer inputs of shape \n",
" (batch_size, num_input_channels, input_dim_1, input_dim_2).\n",
"\n",
" Returns:\n",
" outputs: Array of layer outputs of shape \n",
" (batch_size, num_output_channels, output_dim_1, output_dim_2).\n",
" \"\"\"\n",
" output_dim_1 = self.input_dim_1 - self.kernel_dim_1 + 1\n",
" output_dim_2 = self.input_dim_2 - self.kernel_dim_2 + 1\n",
" batch_size = inputs.shape[0]\n",
" outputs = np.zeros((batch_size, self.num_output_channels, \n",
" output_dim_1, output_dim_2))\n",
" for b in range(batch_size):\n",
" for o in range(self.num_output_channels):\n",
" for i in range(self.num_input_channels):\n",
" outputs[b, o] += convolve2d(\n",
" inputs[b, i], self.kernels[o, i], mode='valid')\n",
" outputs[b, o] += self.biases[o]\n",
" return outputs\n",
"\n",
" def bprop(self, inputs, outputs, grads_wrt_outputs):\n",
" \"\"\"Back propagates gradients through a layer.\n",
"\n",
" Given gradients with respect to the outputs of the layer calculates the\n",
" gradients with respect to the layer inputs.\n",
"\n",
" Args:\n",
" inputs: Array of layer inputs of shape\n",
" (batch_size, num_input_channels, input_dim_1, input_dim_2).\n",
" outputs: Array of layer outputs calculated in forward pass of\n",
" shape\n",
" (batch_size, num_output_channels, output_dim_1, output_dim_2).\n",
" grads_wrt_outputs: Array of gradients with respect to the layer\n",
" outputs of shape\n",
" (batch_size, num_output_channels, output_dim_1, output_dim_2).\n",
"\n",
" Returns:\n",
" Array of gradients with respect to the layer inputs of shape\n",
" (batch_size, num_input_channels, input_dim_1, input_dim_2).\n",
" \"\"\"\n",
" output_dim_1, output_dim_2 = grads_wrt_outputs.shape[-2:]\n",
" batch_size = inputs.shape[0]\n",
" pad_1 = self.kernel_dim_1 - 1\n",
" pad_2 = self.kernel_dim_2 - 1\n",
" padded_grads_wrt_outputs = np.zeros(\n",
" (batch_size, self.num_output_channels, \n",
" output_dim_1 + 2 * pad_1, output_dim_2 + 2 * pad_2)\n",
" )\n",
" padded_grads_wrt_outputs[\n",
" :, :, pad_1:pad_1 + output_dim_1, pad_2:pad_2 + output_dim_2] = grads_wrt_outputs\n",
" grads_wrt_inputs = np.zeros(\n",
" (batch_size, self.num_input_channels, self.input_dim_1, self.input_dim_2))\n",
" for b in range(batch_size):\n",
" for o in range(self.num_output_channels):\n",
" for i in range(self.num_input_channels):\n",
" grads_wrt_inputs[b, i] += correlate2d(\n",
" padded_grads_wrt_outputs[b, o], self.kernels[o, i], mode='valid')\n",
" return grads_wrt_inputs\n",
"\n",
" def grads_wrt_params(self, inputs, grads_wrt_outputs):\n",
" \"\"\"Calculates gradients with respect to layer parameters.\n",
"\n",
" Args:\n",
" inputs: array of inputs to layer of shape (batch_size, input_dim)\n",
" grads_wrt_to_outputs: array of gradients with respect to the layer\n",
" outputs of shape\n",
" (batch_size, num_output_channels, output_dim_1, output_dim_2).\n",
"\n",
" Returns:\n",
" list of arrays of gradients with respect to the layer parameters\n",
" `[grads_wrt_kernels, grads_wrt_biases]`.\n",
" \"\"\"\n",
" output_dim_1, output_dim_2 = grads_wrt_outputs.shape[-2:]\n",
" batch_size = inputs.shape[0]\n",
" grads_wrt_kernels = np.zeros(self.kernels_shape)\n",
" for b in range(batch_size):\n",
" for o in range(self.num_output_channels):\n",
" for i in range(self.num_input_channels):\n",
" grads_wrt_kernels[o, i] += correlate2d(\n",
" grads_wrt_outputs[b, o], inputs[b, i], mode='valid')\n",
" grads_wrt_biases = grads_wrt_outputs.sum((0, 2, 3))\n",
" return grads_wrt_kernels, grads_wrt_biases\n",
"\n",
" def params_penalty(self):\n",
" \"\"\"Returns the parameter dependent penalty term for this layer.\n",
"\n",
" If no parameter-dependent penalty terms are set this returns zero.\n",
" \"\"\"\n",
" params_penalty = 0\n",
" if self.kernels_penalty is not None:\n",
" params_penalty += self.kernels_penalty(self.kernels)\n",
" if self.biases_penalty is not None:\n",
" params_penalty += self.biases_penalty(self.biases)\n",
" return params_penalty\n",
"\n",
" @property\n",
" def params(self):\n",
" \"\"\"A list of layer parameter values: `[kernels, biases]`.\"\"\"\n",
" return [self.kernels, self.biases]\n",
"\n",
" @params.setter\n",
" def params(self, values):\n",
" self.kernels = values[0]\n",
" self.biases = values[1]\n",
"\n",
" def __repr__(self):\n",
" return (\n",
" 'ConvolutionalLayer(\\n'\n",
" ' num_input_channels={0}, num_output_channels={1},\\n'\n",
" ' input_dim_1={2}, input_dim_2={3},\\n'\n",
" ' kernel_dim_1={4}, kernel_dim_2={5}\\n'\n",
" ')'\n",
" .format(self.num_input_channels, self.num_output_channels,\n",
" self.input_dim_1, self.input_dim_2, self.kernel_dim_1,\n",
" self.kernel_dim_2)\n",
" )"
]
},
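{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before running the full test functions defined further below, the next cell gives a minimal sanity-check sketch (added for illustration, not part of the original skeleton): it forward propagates a random batch through the layer and checks the output has the expected shape. The layer sizes used are arbitrary illustrative choices."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Forward propagate a random batch and check the output shape.\n",
"check_rng = np.random.RandomState(1234)\n",
"check_layer = ConvolutionalLayer(\n",
"    num_input_channels=3, num_output_channels=2,\n",
"    input_dim_1=5, input_dim_2=6,\n",
"    kernel_dim_1=2, kernel_dim_2=3\n",
")\n",
"check_inputs = check_rng.normal(size=(10, 3, 5, 6))\n",
"check_outputs = check_layer.fprop(check_inputs)\n",
"# 'valid' overlaps with unit strides: output_dim = input_dim - kernel_dim + 1\n",
"assert check_outputs.shape == (10, 2, 4, 4)"
]
},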
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"%load_ext cython"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%%cython --compile-args=-fopenmp --link-args=-fopenmp\n",
"#!python\n",
"#cython: embedsignature=True\n",
"\n",
"import numpy as np\n",
"cimport numpy as np\n",
"cimport cython\n",
"from cython.view cimport array\n",
"from cython.parallel import prange\n",
"\n",
"DTYPE = np.float64\n",
"ctypedef np.float64_t DTYPE_t\n",
"\n",
"@cython.boundscheck(False)\n",
"@cython.wraparound(False)\n",
"@cython.initializedcheck(False)\n",
"def conv2d_fprop(\n",
" double[:, :, :, :] inputs, double[:, :, :, :] kernels, \n",
" int stride_x=1, int stride_y=1):\n",
" cdef int i, k, x, y, c, m, n, l, t\n",
" cdef int batch_size = inputs.shape[0] \n",
" cdef int in_shape_x = inputs.shape[2]\n",
" cdef int in_shape_y = inputs.shape[3]\n",
" cdef int n_out_channels = kernels.shape[0]\n",
" cdef int n_in_channels = kernels.shape[1]\n",
" cdef int kernel_shape_x = kernels.shape[2]\n",
" cdef int kernel_shape_y = kernels.shape[3]\n",
" cdef int out_shape_x = (in_shape_x - kernel_shape_x + 1) // stride_x\n",
" cdef int out_shape_y = (in_shape_y - kernel_shape_y + 1) // stride_y\n",
" cdef double [:, :, :, :] outputs = np.zeros(\n",
" (batch_size, n_out_channels, out_shape_x, out_shape_y))\n",
" for i in range(batch_size):\n",
" for k in range(n_out_channels):\n",
" for x in range(out_shape_x):\n",
" for y in range(out_shape_y):\n",
" l = x * stride_x\n",
" t = y * stride_y\n",
" for c in range(n_in_channels):\n",
" for m in range(kernel_shape_x):\n",
" for n in range(kernel_shape_y):\n",
" outputs[i, k, x, y] += (\n",
" kernels[k, c, m, n] * inputs[i, c, l + m, t + n])\n",
" return outputs\n",
"\n",
"@cython.boundscheck(False)\n",
"@cython.wraparound(False)\n",
"@cython.initializedcheck(False)\n",
"def conv2d_bprop(\n",
" double[:, :, :, :] grads_at_output, \n",
" double[:, :, :, :] kernels,\n",
" int stride_x=1, int stride_y=1):\n",
" cdef int i, k, x, y, c, m, n, l, t\n",
" cdef int batch_size = grads_at_output.shape[0]\n",
" cdef int n_out_channels = kernels.shape[0]\n",
" cdef int n_in_channels = kernels.shape[1]\n",
" cdef int kernel_shape_x = kernels.shape[2]\n",
" cdef int kernel_shape_y = kernels.shape[3]\n",
" cdef int out_shape_x = grads_at_output.shape[2]\n",
" cdef int out_shape_y = grads_at_output.shape[3]\n",
" cdef int in_shape_x = out_shape_x * stride_x + kernel_shape_x - 1\n",
" cdef int in_shape_y = out_shape_y * stride_y + kernel_shape_y - 1\n",
" cdef double [:, :, :, :] grads_at_input = np.zeros(\n",
" (batch_size, n_in_channels, in_shape_x, in_shape_y))\n",
" for i in range(batch_size):\n",
" for k in range(n_out_channels):\n",
" for x in range(out_shape_x):\n",
" for y in range(out_shape_y):\n",
" l = x * stride_x\n",
" t = y * stride_y\n",
" for c in range(n_in_channels):\n",
" for m in range(kernel_shape_x):\n",
" for n in range(kernel_shape_y):\n",
" grads_at_input[i, c, l + m, t + n] += (\n",
" kernels[k, c, m, n] * grads_at_output[i, k, x, y])\n",
" return grads_at_input\n",
"\n",
"@cython.boundscheck(False)\n",
"@cython.wraparound(False)\n",
"@cython.initializedcheck(False)\n",
"def conv2d_grads_wrt_kernels(\n",
" double[:, :, :, :] grads_at_output, \n",
" double[:, :, :, :] inputs,\n",
" int kernel_shape_x, int kernel_shape_y,\n",
" int stride_x=1, int stride_y=1):\n",
" cdef int i, k, x, y, c, m, n, l, t\n",
" cdef int batch_size = grads_at_output.shape[0]\n",
" cdef int n_out_channels = grads_at_output.shape[1]\n",
" cdef int n_in_channels = inputs.shape[1]\n",
" cdef int out_shape_x = grads_at_output.shape[2]\n",
" cdef int out_shape_y = grads_at_output.shape[3]\n",
" cdef int in_shape_x = inputs.shape[2]\n",
" cdef int in_shape_y = inputs.shape[3]\n",
" cdef double [:, :, :, :] kernel_grads = np.zeros(\n",
" (n_out_channels, n_in_channels, kernel_shape_x, kernel_shape_y))\n",
" for i in range(batch_size):\n",
" for k in range(n_out_channels):\n",
" for x in range(out_shape_x):\n",
" for y in range(out_shape_y):\n",
" l = x * stride_x\n",
" t = y * stride_y\n",
" for c in range(n_in_channels):\n",
" for m in range(kernel_shape_x):\n",
" for n in range(kernel_shape_y):\n",
" kernel_grads[k, c, m, n] += (\n",
" inputs[i, c, l + m, t + n] * grads_at_output[i, k, x, y])\n",
" return kernel_grads"
]
},
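{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a small illustrative check (not part of the original skeleton), the cell below compares `conv2d_fprop` against an equivalent channel-by-channel computation with `scipy.signal.correlate2d` on random arrays; as noted above, the Cython routines compute cross-correlations."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Compare the Cython forward propagation to a scipy reference.\n",
"ref_rng = np.random.RandomState(4321)\n",
"ref_inputs = ref_rng.normal(size=(4, 3, 7, 7))\n",
"ref_kernels = ref_rng.normal(size=(2, 3, 3, 3))\n",
"cy_outputs = np.array(conv2d_fprop(ref_inputs, ref_kernels))\n",
"sp_outputs = np.zeros_like(cy_outputs)\n",
"for b in range(ref_inputs.shape[0]):\n",
"    for o in range(ref_kernels.shape[0]):\n",
"        for i in range(ref_inputs.shape[1]):\n",
"            # sum over input channels of valid-mode cross-correlations\n",
"            sp_outputs[b, o] += correlate2d(\n",
"                ref_inputs[b, i], ref_kernels[o, i], mode='valid')\n",
"assert np.allclose(cy_outputs, sp_outputs)"
]
},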
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"class CythonConvolutionalLayer(ConvolutionalLayer):\n",
" \n",
" def __init__(self, num_input_channels, num_output_channels,\n",
" input_dim_1, input_dim_2,\n",
" kernel_dim_1, kernel_dim_2,\n",
" kernels_init=init.UniformInit(-0.01, 0.01),\n",
" biases_init=init.ConstantInit(0.),\n",
" kernels_penalty=None, biases_penalty=None):\n",
" super(CythonConvolutionalLayer, self).__init__(\n",
" num_input_channels, num_output_channels,\n",
" input_dim_1, input_dim_2,\n",
" kernel_dim_1, kernel_dim_2,\n",
" kernels_init, biases_init, \n",
" kernels_penalty, biases_penalty\n",
" )\n",
" self.kernels = self.kernels.astype(np.double)\n",
" self.biases = self.biases.astype(np.double)\n",
"\n",
" def fprop(self, inputs):\n",
" \"\"\"Forward propagates activations through the layer transformation.\n",
"\n",
" For inputs `x`, outputs `y`, kernels `K` and biases `b` the layer\n",
" corresponds to `y = conv2d(x, K) + b`.\n",
"\n",
" Args:\n",
" inputs: Array of layer inputs of shape \n",
" (batch_size, num_input_channels, input_dim_1, input_dim_2).\n",
"\n",
" Returns:\n",
" outputs: Array of layer outputs of shape \n",
" (batch_size, num_output_channels, output_dim_1, output_dim_2).\n",
" \"\"\"\n",
" return np.array(conv2d_fprop(inputs, self.kernels)) + self.biases[None, :, None, None]\n",
"\n",
" def bprop(self, inputs, outputs, grads_wrt_outputs):\n",
" \"\"\"Back propagates gradients through a layer.\n",
"\n",
" Given gradients with respect to the outputs of the layer calculates the\n",
" gradients with respect to the layer inputs.\n",
"\n",
" Args:\n",
" inputs: Array of layer inputs of shape\n",
" (batch_size, num_input_channels, input_dim_1, input_dim_2).\n",
" outputs: Array of layer outputs calculated in forward pass of\n",
" shape\n",
" (batch_size, num_output_channels, output_dim_1, output_dim_2).\n",
" grads_wrt_outputs: Array of gradients with respect to the layer\n",
" outputs of shape\n",
" (batch_size, num_output_channels, output_dim_1, output_dim_2).\n",
"\n",
" Returns:\n",
" Array of gradients with respect to the layer inputs of shape\n",
" (batch_size, num_input_channels, input_dim_1, input_dim_2).\n",
" \"\"\"\n",
" return np.array(conv2d_bprop(grads_wrt_outputs, self.kernels, 1, 1))\n",
"\n",
" def grads_wrt_params(self, inputs, grads_wrt_outputs):\n",
" \"\"\"Calculates gradients with respect to layer parameters.\n",
"\n",
" Args:\n",
" inputs: array of inputs to layer of shape (batch_size, input_dim)\n",
" grads_wrt_to_outputs: array of gradients with respect to the layer\n",
" outputs of shape\n",
" (batch_size, num_output_channels, output_dim_1, output_dim_2).\n",
"\n",
" Returns:\n",
" list of arrays of gradients with respect to the layer parameters\n",
" `[grads_wrt_kernels, grads_wrt_biases]`.\n",
" \"\"\"\n",
" grads_wrt_kernels = conv2d_grads_wrt_kernels(\n",
" grads_wrt_outputs, inputs, self.kernel_dim_1, self.kernel_dim_2, 1, 1)\n",
" grads_wrt_biases = grads_wrt_outputs.sum((0, 2, 3))\n",
" return np.array(grads_wrt_kernels), grads_wrt_biases\n",
"\n",
"\n",
" def __repr__(self):\n",
" return (\n",
" 'CythonConvolutionalLayer(\\n'\n",
" ' num_input_channels={0}, num_output_channels={1},\\n'\n",
" ' input_dim_1={2}, input_dim_2={3},\\n'\n",
" ' kernel_dim_1={4}, kernel_dim_2={5}\\n'\n",
" ')'\n",
" .format(self.num_input_channels, self.num_output_channels,\n",
" self.input_dim_1, self.input_dim_2, self.kernel_dim_1,\n",
" self.kernel_dim_2)\n",
" )"
]
},
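{
"cell_type": "markdown",
"metadata": {},
"source": [
"The cell below is an optional, illustrative sketch comparing the speed of the two `fprop` implementations on a random batch of MNIST-sized inputs; exact timings will of course vary between machines."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Rough timing comparison of the scipy-based and Cython implementations.\n",
"timing_rng = np.random.RandomState(2017)\n",
"timing_inputs = timing_rng.normal(size=(50, 1, 28, 28))\n",
"scipy_layer = ConvolutionalLayer(1, 2, 28, 28, 4, 4)\n",
"cython_layer = CythonConvolutionalLayer(1, 2, 28, 28, 4, 4)\n",
"%timeit scipy_layer.fprop(timing_inputs)\n",
"%timeit cython_layer.fprop(timing_inputs)"
]
},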
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The three test functions are defined in the cell below. All the functions take as first argument the *class* corresponding to the convolutional layer implementation to be tested (**not** an instance of the class). It is assumed the class being tested has an `__init__` method with at least all of the arguments defined in the skeleton definition above. A boolean second argument to each function can be used to specify if the layer implements a cross-correlation or convolution based operation (see note in [seventh lecture slides](http://www.inf.ed.ac.uk/teaching/courses/mlp/2016/mlp07-cnn.pdf))."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"def test_conv_layer_fprop(layer_class, do_cross_correlation=False):\n",
" \"\"\"Tests `fprop` method of a convolutional layer.\n",
" \n",
" Checks the outputs of `fprop` method for a fixed input against known\n",
" reference values for the outputs and raises an AssertionError if\n",
" the outputted values are not consistent with the reference values. If\n",
" tests are all passed returns True.\n",
" \n",
" Args:\n",
" layer_class: Convolutional layer implementation following the \n",
" interface defined in the provided skeleton class.\n",
" do_cross_correlation: Whether the layer implements an operation\n",
" corresponding to cross-correlation (True) i.e kernels are\n",
" not flipped before sliding over inputs, or convolution\n",
" (False) with filters being flipped.\n",
"\n",
" Raises:\n",
" AssertionError: Raised if output of `layer.fprop` is inconsistent \n",
" with reference values either in shape or values.\n",
" \"\"\"\n",
" inputs = np.arange(96).reshape((2, 3, 4, 4)).astype(np.double)\n",
" kernels = np.arange(-12, 12).reshape((2, 3, 2, 2)).astype(np.double)\n",
" if do_cross_correlation:\n",
" kernels = kernels[:, :, ::-1, ::-1]\n",
" biases = np.arange(2).astype(np.double)\n",
" true_output = np.array(\n",
" [[[[ -958., -1036., -1114.],\n",
" [-1270., -1348., -1426.],\n",
" [-1582., -1660., -1738.]],\n",
" [[ 1707., 1773., 1839.],\n",
" [ 1971., 2037., 2103.],\n",
" [ 2235., 2301., 2367.]]],\n",
" [[[-4702., -4780., -4858.],\n",
" [-5014., -5092., -5170.],\n",
" [-5326., -5404., -5482.]],\n",
" [[ 4875., 4941., 5007.],\n",
" [ 5139., 5205., 5271.],\n",
" [ 5403., 5469., 5535.]]]]\n",
" )\n",
" layer = layer_class(\n",
" num_input_channels=kernels.shape[1], \n",
" num_output_channels=kernels.shape[0], \n",
" input_dim_1=inputs.shape[2], \n",
" input_dim_2=inputs.shape[3],\n",
" kernel_dim_1=kernels.shape[2],\n",
" kernel_dim_2=kernels.shape[3]\n",
" )\n",
" layer.params = [kernels, biases]\n",
" layer_output = layer.fprop(inputs)\n",
" assert layer_output.shape == true_output.shape, (\n",
" 'Layer fprop gives incorrect shaped output. '\n",
" 'Correct shape is \\n\\n{0}\\n\\n but returned shape is \\n\\n{1}.'\n",
" .format(true_output.shape, layer_output.shape)\n",
" )\n",
" assert np.allclose(layer_output, true_output), (\n",
" 'Layer fprop does not give correct output. '\n",
" 'Correct output is \\n\\n{0}\\n\\n but returned output is \\n\\n{1}.'\n",
" .format(true_output, layer_output)\n",
" )\n",
" return True\n",
"\n",
"def test_conv_layer_bprop(layer_class, do_cross_correlation=False):\n",
" \"\"\"Tests `bprop` method of a convolutional layer.\n",
" \n",
" Checks the outputs of `bprop` method for a fixed input against known\n",
" reference values for the gradients with respect to inputs and raises \n",
" an AssertionError if the returned values are not consistent with the\n",
" reference values. If tests are all passed returns True.\n",
" \n",
" Args:\n",
" layer_class: Convolutional layer implementation following the \n",
" interface defined in the provided skeleton class.\n",
" do_cross_correlation: Whether the layer implements an operation\n",
" corresponding to cross-correlation (True) i.e kernels are\n",
" not flipped before sliding over inputs, or convolution\n",
" (False) with filters being flipped.\n",
"\n",
" Raises:\n",
" AssertionError: Raised if output of `layer.bprop` is inconsistent \n",
" with reference values either in shape or values.\n",
" \"\"\"\n",
" inputs = np.arange(96).reshape((2, 3, 4, 4)).astype(np.double)\n",
" kernels = np.arange(-12, 12).reshape((2, 3, 2, 2)).astype(np.double)\n",
" if do_cross_correlation:\n",
" kernels = kernels[:, :, ::-1, ::-1]\n",
" biases = np.arange(2).astype(np.double)\n",
" grads_wrt_outputs = np.arange(-20, 16).reshape((2, 2, 3, 3)).astype(np.double)\n",
" outputs = np.array(\n",
" [[[[ -958., -1036., -1114.],\n",
" [-1270., -1348., -1426.],\n",
" [-1582., -1660., -1738.]],\n",
" [[ 1707., 1773., 1839.],\n",
" [ 1971., 2037., 2103.],\n",
" [ 2235., 2301., 2367.]]],\n",
" [[[-4702., -4780., -4858.],\n",
" [-5014., -5092., -5170.],\n",
" [-5326., -5404., -5482.]],\n",
" [[ 4875., 4941., 5007.],\n",
" [ 5139., 5205., 5271.],\n",
" [ 5403., 5469., 5535.]]]]\n",
" )\n",
" true_grads_wrt_inputs = np.array(\n",
" [[[[ 147., 319., 305., 162.],\n",
" [ 338., 716., 680., 354.],\n",
" [ 290., 608., 572., 294.],\n",
" [ 149., 307., 285., 144.]],\n",
" [[ 23., 79., 81., 54.],\n",
" [ 114., 284., 280., 162.],\n",
" [ 114., 272., 268., 150.],\n",
" [ 73., 163., 157., 84.]],\n",
" [[-101., -161., -143., -54.],\n",
" [-110., -148., -120., -30.],\n",
" [ -62., -64., -36., 6.],\n",
" [ -3., 19., 29., 24.]]],\n",
" [[[ 39., 67., 53., 18.],\n",
" [ 50., 68., 32., -6.],\n",
" [ 2., -40., -76., -66.],\n",
" [ -31., -89., -111., -72.]],\n",
" [[ 59., 115., 117., 54.],\n",
" [ 114., 212., 208., 90.],\n",
" [ 114., 200., 196., 78.],\n",
" [ 37., 55., 49., 12.]],\n",
" [[ 79., 163., 181., 90.],\n",
" [ 178., 356., 384., 186.],\n",
" [ 226., 440., 468., 222.],\n",
" [ 105., 199., 209., 96.]]]])\n",
" layer = layer_class(\n",
" num_input_channels=kernels.shape[1], \n",
" num_output_channels=kernels.shape[0], \n",
" input_dim_1=inputs.shape[2], \n",
" input_dim_2=inputs.shape[3],\n",
" kernel_dim_1=kernels.shape[2],\n",
" kernel_dim_2=kernels.shape[3]\n",
" )\n",
" layer.params = [kernels, biases]\n",
" layer_grads_wrt_inputs = layer.bprop(inputs, outputs, grads_wrt_outputs)\n",
" assert layer_grads_wrt_inputs.shape == true_grads_wrt_inputs.shape, (\n",
" 'Layer bprop returns incorrect shaped array. '\n",
" 'Correct shape is \\n\\n{0}\\n\\n but returned shape is \\n\\n{1}.'\n",
" .format(true_grads_wrt_inputs.shape, layer_grads_wrt_inputs.shape)\n",
" )\n",
" assert np.allclose(layer_grads_wrt_inputs, true_grads_wrt_inputs), (\n",
" 'Layer bprop does not return correct values. '\n",
" 'Correct output is \\n\\n{0}\\n\\n but returned output is \\n\\n{1}'\n",
" .format(true_grads_wrt_inputs, layer_grads_wrt_inputs)\n",
" )\n",
" return True\n",
"\n",
"def test_conv_layer_grad_wrt_params(\n",
" layer_class, do_cross_correlation=False):\n",
" \"\"\"Tests `grad_wrt_params` method of a convolutional layer.\n",
" \n",
" Checks the outputs of `grad_wrt_params` method for fixed inputs \n",
" against known reference values for the gradients with respect to \n",
" kernels and biases, and raises an AssertionError if the returned\n",
" values are not consistent with the reference values. If tests\n",
" are all passed returns True.\n",
" \n",
" Args:\n",
" layer_class: Convolutional layer implementation following the \n",
" interface defined in the provided skeleton class.\n",
" do_cross_correlation: Whether the layer implements an operation\n",
" corresponding to cross-correlation (True) i.e kernels are\n",
" not flipped before sliding over inputs, or convolution\n",
" (False) with filters being flipped.\n",
"\n",
" Raises:\n",
" AssertionError: Raised if output of `layer.bprop` is inconsistent \n",
" with reference values either in shape or values.\n",
" \"\"\"\n",
" inputs = np.arange(96).reshape((2, 3, 4, 4)).astype(np.double)\n",
" kernels = np.arange(-12, 12).reshape((2, 3, 2, 2)).astype(np.double)\n",
" biases = np.arange(2).astype(np.double)\n",
" grads_wrt_outputs = np.arange(-20, 16).reshape((2, 2, 3, 3)).astype(np.double)\n",
" true_kernel_grads = np.array(\n",
" [[[[ -240., -114.],\n",
" [ 264., 390.]],\n",
" [[-2256., -2130.],\n",
" [-1752., -1626.]],\n",
" [[-4272., -4146.],\n",
" [-3768., -3642.]]],\n",
" [[[ 5268., 5232.],\n",
" [ 5124., 5088.]],\n",
" [[ 5844., 5808.],\n",
" [ 5700., 5664.]],\n",
" [[ 6420., 6384.],\n",
" [ 6276., 6240.]]]])\n",
" if do_cross_correlation:\n",
" kernels = kernels[:, :, ::-1, ::-1]\n",
" true_kernel_grads = true_kernel_grads[:, :, ::-1, ::-1]\n",
" true_bias_grads = np.array([-126., 36.])\n",
" layer = layer_class(\n",
" num_input_channels=kernels.shape[1], \n",
" num_output_channels=kernels.shape[0], \n",
" input_dim_1=inputs.shape[2], \n",
" input_dim_2=inputs.shape[3],\n",
" kernel_dim_1=kernels.shape[2],\n",
" kernel_dim_2=kernels.shape[3]\n",
" )\n",
" layer.params = [kernels, biases]\n",
" layer_kernel_grads, layer_bias_grads = (\n",
" layer.grads_wrt_params(inputs, grads_wrt_outputs))\n",
" assert layer_kernel_grads.shape == true_kernel_grads.shape, (\n",
" 'grads_wrt_params gives incorrect shaped kernel gradients output. '\n",
" 'Correct shape is \\n\\n{0}\\n\\n but returned shape is \\n\\n{1}.'\n",
" .format(true_kernel_grads.shape, layer_kernel_grads.shape)\n",
" )\n",
" assert np.allclose(layer_kernel_grads, true_kernel_grads), (\n",
" 'grads_wrt_params does not give correct kernel gradients output. '\n",
" 'Correct output is \\n\\n{0}\\n\\n but returned output is \\n\\n{1}.'\n",
" .format(true_kernel_grads, layer_kernel_grads)\n",
" )\n",
" assert layer_bias_grads.shape == true_bias_grads.shape, (\n",
" 'grads_wrt_params gives incorrect shaped bias gradients output. '\n",
" 'Correct shape is \\n\\n{0}\\n\\n but returned shape is \\n\\n{1}.'\n",
" .format(true_bias_grads.shape, layer_bias_grads.shape)\n",
" )\n",
" assert np.allclose(layer_bias_grads, true_bias_grads), (\n",
" 'grads_wrt_params does not give correct bias gradients output. '\n",
" 'Correct output is \\n\\n{0}\\n\\n but returned output is \\n\\n{1}.'\n",
" .format(true_bias_grads, layer_bias_grads)\n",
" )\n",
" return True"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"An example of using the test functions if given in the cell below. This assumes you implement a convolution (rather than cross-correlation) operation. If the implementation is correct "
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"All tests passed.\n"
]
}
],
"source": [
"all_correct = test_conv_layer_fprop(ConvolutionalLayer, False)\n",
"all_correct &= test_conv_layer_bprop(ConvolutionalLayer, False)\n",
"all_correct &= test_conv_layer_grad_wrt_params(ConvolutionalLayer, False)\n",
"if all_correct:\n",
" print('All tests passed.')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"All tests passed.\n"
]
}
],
"source": [
"all_correct = test_conv_layer_fprop(CythonConvolutionalLayer, True)\n",
"all_correct &= test_conv_layer_bprop(CythonConvolutionalLayer, True)\n",
"all_correct &= test_conv_layer_grad_wrt_params(CythonConvolutionalLayer, True)\n",
"if all_correct:\n",
" print('All tests passed.')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import logging\n",
"from mlp.layers import ReluLayer, ReshapeLayer, AffineLayer\n",
"from mlp.errors import CrossEntropySoftmaxError\n",
"from mlp.models import MultipleLayerModel\n",
"from mlp.initialisers import UniformInit, ConstantInit, GlorotUniformInit\n",
"from mlp.learning_rules import GradientDescentLearningRule\n",
"from mlp.data_providers import MNISTDataProvider\n",
"from mlp.optimisers import Optimiser\n",
"%matplotlib inline\n",
"plt.style.use('ggplot')\n",
"\n",
"# Seed a random number generator\n",
"seed = 6102016 \n",
"rng = np.random.RandomState(seed)\n",
"\n",
"# Set up a logger object to print info about the training run to stdout\n",
"logger = logging.getLogger()\n",
"logger.setLevel(logging.INFO)\n",
"logger.handlers = [logging.StreamHandler()]\n",
"\n",
"# Create data provider objects for the MNIST data set\n",
"train_data = MNISTDataProvider('train', rng=rng)\n",
"valid_data = MNISTDataProvider('valid', rng=rng)\n",
"input_dim, output_dim = 784, 10"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"train_data.inputs = train_data.inputs.astype(np.double)\n",
"valid_data.inputs = valid_data.inputs.astype(np.double)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Epoch 0:\n",
" error(train)=2.30e+00, acc(train)=1.40e-01, error(valid)=2.30e+00, acc(valid)=1.34e-01, params_penalty=0.00e+00\n",
"Epoch 1: 20.20s to complete\n",
" error(train)=3.70e-01, acc(train)=8.91e-01, error(valid)=3.38e-01, acc(valid)=9.03e-01, params_penalty=0.00e+00\n",
"Epoch 2: 23.26s to complete\n",
" error(train)=3.38e-01, acc(train)=9.03e-01, error(valid)=3.11e-01, acc(valid)=9.12e-01, params_penalty=0.00e+00\n",
"Epoch 3: 23.27s to complete\n",
" error(train)=3.18e-01, acc(train)=9.10e-01, error(valid)=3.00e-01, acc(valid)=9.16e-01, params_penalty=0.00e+00\n",
"Epoch 4: 25.56s to complete\n",
" error(train)=3.14e-01, acc(train)=9.10e-01, error(valid)=2.96e-01, acc(valid)=9.17e-01, params_penalty=0.00e+00\n",
"Epoch 5: 25.17s to complete\n",
" error(train)=3.06e-01, acc(train)=9.13e-01, error(valid)=2.85e-01, acc(valid)=9.19e-01, params_penalty=0.00e+00\n",
"Epoch 6: 25.59s to complete\n",
" error(train)=3.07e-01, acc(train)=9.14e-01, error(valid)=2.87e-01, acc(valid)=9.20e-01, params_penalty=0.00e+00\n",
"Epoch 7: 28.70s to complete\n",
" error(train)=3.00e-01, acc(train)=9.15e-01, error(valid)=2.82e-01, acc(valid)=9.19e-01, params_penalty=0.00e+00\n",
"Epoch 8: 24.72s to complete\n",
" error(train)=2.91e-01, acc(train)=9.18e-01, error(valid)=2.75e-01, acc(valid)=9.24e-01, params_penalty=0.00e+00\n",
"Epoch 9: 29.81s to complete\n",
" error(train)=2.91e-01, acc(train)=9.19e-01, error(valid)=2.75e-01, acc(valid)=9.24e-01, params_penalty=0.00e+00\n",
"Epoch 10: 26.75s to complete\n",
" error(train)=2.88e-01, acc(train)=9.19e-01, error(valid)=2.72e-01, acc(valid)=9.26e-01, params_penalty=0.00e+00\n"
]
}
],
"source": [
"batch_size = 50\n",
"kernel_dim_1 = 4\n",
"kernel_dim_2 = 4\n",
"input_dim_1 = 28\n",
"input_dim_2 = 28\n",
"num_output_channels = 1\n",
"num_input_channels = 1\n",
"learning_rate = 0.01\n",
"num_epochs = 10\n",
"stats_interval = 1\n",
"\n",
"# Reset random number generator and data provider states on each run\n",
"# to ensure reproducibility of results\n",
"rng.seed(seed)\n",
"train_data.reset()\n",
"valid_data.reset()\n",
"\n",
"# Alter data-provider batch size\n",
"train_data.batch_size = batch_size \n",
"valid_data.batch_size = batch_size\n",
"\n",
"# Create a parameter initialiser which will sample random uniform values\n",
"# from [-init_scale, init_scale]\n",
"kernels_init = UniformInit(-0.01, 0.01, rng=rng)\n",
"weights_init = GlorotUniformInit(rng=rng)\n",
"biases_init = ConstantInit(0.)\n",
"\n",
"# Create a model with two affine layers\n",
"hidden_dim = (input_dim_1 - kernel_dim_1 + 1) * (input_dim_2 - kernel_dim_1 + 1) * num_output_channels\n",
"model = MultipleLayerModel([\n",
" ReshapeLayer((num_input_channels, input_dim_1, input_dim_1)),\n",
" ConvolutionalLayer(\n",
" num_input_channels, num_output_channels, \n",
" input_dim_1, input_dim_2, \n",
" kernel_dim_1, kernel_dim_2, \n",
" kernels_init, biases_init\n",
" ),\n",
" ReluLayer(),\n",
" ReshapeLayer(),\n",
" AffineLayer(hidden_dim, output_dim, weights_init, biases_init)\n",
"])\n",
"\n",
"# Initialise a cross entropy error object\n",
"error = CrossEntropySoftmaxError()\n",
"\n",
"# Use a basic gradient descent learning rule\n",
"learning_rule = GradientDescentLearningRule(learning_rate=learning_rate)\n",
"\n",
"# Monitor classification accuracy during training\n",
"data_monitors={'acc': lambda y, t: (y.argmax(-1) == t.argmax(-1)).mean()}\n",
"\n",
"optimiser = Optimiser(\n",
" model, error, learning_rule, train_data, valid_data, data_monitors)\n",
"\n",
"stats, keys, run_time = optimiser.train(num_epochs=num_epochs, stats_interval=stats_interval)"
]
},
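{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a rough sketch of how the recorded statistics can be visualised (assuming, as in the course framework, that `keys` maps statistic names to column indices of the `stats` array returned by `Optimiser.train`), the training and validation errors can be plotted against epoch number."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Plot training and validation set errors from the run above.\n",
"fig = plt.figure(figsize=(8, 4))\n",
"ax = fig.add_subplot(111)\n",
"for key in ['error(train)', 'error(valid)']:\n",
"    ax.plot(np.arange(1, stats.shape[0]) * stats_interval,\n",
"            stats[1:, keys[key]], label=key)\n",
"ax.legend(loc=0)\n",
"ax.set_xlabel('Epoch number')\n",
"_ = ax.set_ylabel('Error')"
]
},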
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Epoch 0:\n",
" error(train)=2.30e+00, acc(train)=1.09e-01, error(valid)=2.30e+00, acc(valid)=1.02e-01, params_penalty=0.00e+00\n",
"Epoch 1: 9.53s to complete\n",
" error(train)=3.74e-01, acc(train)=8.90e-01, error(valid)=3.41e-01, acc(valid)=9.02e-01, params_penalty=0.00e+00\n",
"Epoch 2: 8.50s to complete\n",
" error(train)=3.39e-01, acc(train)=9.03e-01, error(valid)=3.11e-01, acc(valid)=9.11e-01, params_penalty=0.00e+00\n",
"Epoch 3: 6.74s to complete\n",
" error(train)=3.19e-01, acc(train)=9.10e-01, error(valid)=3.01e-01, acc(valid)=9.15e-01, params_penalty=0.00e+00\n",
"Epoch 4: 10.08s to complete\n",
" error(train)=3.15e-01, acc(train)=9.11e-01, error(valid)=2.96e-01, acc(valid)=9.17e-01, params_penalty=0.00e+00\n",
"Epoch 5: 8.82s to complete\n",
" error(train)=3.06e-01, acc(train)=9.14e-01, error(valid)=2.85e-01, acc(valid)=9.21e-01, params_penalty=0.00e+00\n",
"Epoch 6: 10.37s to complete\n",
" error(train)=3.07e-01, acc(train)=9.13e-01, error(valid)=2.87e-01, acc(valid)=9.21e-01, params_penalty=0.00e+00\n",
"Epoch 7: 7.81s to complete\n",
" error(train)=2.99e-01, acc(train)=9.16e-01, error(valid)=2.82e-01, acc(valid)=9.20e-01, params_penalty=0.00e+00\n",
"Epoch 8: 6.83s to complete\n",
" error(train)=2.91e-01, acc(train)=9.18e-01, error(valid)=2.75e-01, acc(valid)=9.25e-01, params_penalty=0.00e+00\n",
"Epoch 9: 10.23s to complete\n",
" error(train)=2.91e-01, acc(train)=9.19e-01, error(valid)=2.74e-01, acc(valid)=9.24e-01, params_penalty=0.00e+00\n",
"Epoch 10: 7.87s to complete\n",
" error(train)=2.88e-01, acc(train)=9.20e-01, error(valid)=2.71e-01, acc(valid)=9.25e-01, params_penalty=0.00e+00\n"
]
}
],
"source": [
"batch_size = 50\n",
"kernel_dim_1 = 4\n",
"kernel_dim_2 = 4\n",
"input_dim_1 = 28\n",
"input_dim_2 = 28\n",
"num_output_channels = 1\n",
"num_input_channels = 1\n",
"learning_rate = 0.01\n",
"num_epochs = 10\n",
"stats_interval = 1\n",
"\n",
"# Reset random number generator and data provider states on each run\n",
"# to ensure reproducibility of results\n",
"rng.seed(seed)\n",
"train_data.reset()\n",
"valid_data.reset()\n",
"\n",
"# Alter data-provider batch size\n",
"train_data.batch_size = batch_size \n",
"valid_data.batch_size = batch_size\n",
"\n",
"# Create a parameter initialiser which will sample random uniform values\n",
"# from [-init_scale, init_scale]\n",
"kernels_init = UniformInit(-0.01, 0.01, rng=rng)\n",
"weights_init = GlorotUniformInit(rng=rng)\n",
"biases_init = ConstantInit(0.)\n",
"\n",
"# Create a model with two affine layers\n",
"hidden_dim = (input_dim_1 - kernel_dim_1 + 1) * (input_dim_2 - kernel_dim_1 + 1) * num_output_channels\n",
"model = MultipleLayerModel([\n",
" ReshapeLayer((num_input_channels, input_dim_1, input_dim_1)),\n",
" CythonConvolutionalLayer(\n",
" num_input_channels, num_output_channels, \n",
" input_dim_1, input_dim_2, \n",
" kernel_dim_1, kernel_dim_2, \n",
" kernels_init, biases_init\n",
" ),\n",
" ReluLayer(),\n",
" ReshapeLayer(),\n",
" AffineLayer(hidden_dim, output_dim, weights_init, biases_init)\n",
"])\n",
"\n",
"# Initialise a cross entropy error object\n",
"error = CrossEntropySoftmaxError()\n",
"\n",
"# Use a basic gradient descent learning rule\n",
"learning_rule = GradientDescentLearningRule(learning_rate=learning_rate)\n",
"\n",
"# Monitor classification accuracy during training\n",
"data_monitors={'acc': lambda y, t: (y.argmax(-1) == t.argmax(-1)).mean()}\n",
"\n",
"optimiser = Optimiser(\n",
" model, error, learning_rule, train_data, valid_data, data_monitors)\n",
"\n",
"stats, keys, run_time = optimiser.train(num_epochs=num_epochs, stats_interval=stats_interval)"
]
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [conda env:mlp]",
"language": "python",
"name": "conda-env-mlp-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 1
}