For those who decide to implement and experiment with convolutional layers for the second coursework, a skeleton class and associated test functions for the `fprop`, `bprop` and `grads_wrt_params` methods of the class are included below.

The test functions assume that in your implementation of `fprop` for the convolutional layer, outputs are calculated only for 'valid' overlaps of the kernel filters with the input - i.e. without any padding.

It is also assumed that, if convolutions with non-unit strides are implemented, the default behaviour is to use unit strides; the test cases are only correct for unit strides in both directions.

In [1]:
import mlp.layers as layers
import mlp.initialisers as init

from scipy.ndimage.filters import convolve
from scipy.signal import convolve2d, correlate2d

class ConvolutionalLayer(layers.LayerWithParameters):
    """Layer applying a bank of 2D convolution kernels to multi-channel inputs.

    The parameters are a kernel array of shape
        (num_output_channels, num_input_channels, kernel_dim_1, kernel_dim_2)
    and a bias vector of shape
        (num_output_channels,)
    so each output channel has a single bias shared over all positions.

    No padding is applied and unit strides are used: outputs exist only
    where a kernel lies completely inside the input, giving
        output_dim_1 = input_dim_1 - kernel_dim_1 + 1
        output_dim_2 = input_dim_2 - kernel_dim_2 + 1
    """

    def __init__(self, num_input_channels, num_output_channels,
                 input_dim_1, input_dim_2,
                 kernel_dim_1, kernel_dim_2,
                 kernels_init=init.UniformInit(-0.01, 0.01),
                 biases_init=init.ConstantInit(0.),
                 kernels_penalty=None, biases_penalty=None):
        """Sets up the layer dimensions and initialises its parameters.

        Args:
            num_input_channels (int): Number of channels in the inputs to
                the layer (e.g. colour channels of an image, or the
                feature maps produced by a preceding convolutional layer).
            num_output_channels (int): Number of channels (feature maps)
                produced by the layer.
            input_dim_1 (int): Size of the first dimension of each 2D
                input channel.
            input_dim_2 (int): Size of the second dimension of each 2D
                input channel.
            kernel_dim_1 (int): Size of the first dimension of each 2D
                kernel.
            kernel_dim_2 (int): Size of the second dimension of each 2D
                kernel.
            kernels_init: Initialiser called with the kernel array shape.
            biases_init: Initialiser called with the number of output
                channels.
            kernels_penalty: Kernel-dependent penalty term (regulariser) or
                None if the kernels are not regularised.
            biases_penalty: Biases-dependent penalty term (regulariser) or
                None if the biases are not regularised.
        """
        # Channel counts and spatial extents.
        self.num_input_channels = num_input_channels
        self.num_output_channels = num_output_channels
        self.input_dim_1, self.input_dim_2 = input_dim_1, input_dim_2
        self.kernel_dim_1, self.kernel_dim_2 = kernel_dim_1, kernel_dim_2
        # Optional regularisers.
        self.kernels_penalty = kernels_penalty
        self.biases_penalty = biases_penalty
        # Array shapes; the batch dimension of the inputs is left open.
        self.kernels_shape = (
            num_output_channels, num_input_channels,
            kernel_dim_1, kernel_dim_2,
        )
        self.inputs_shape = (
            None, num_input_channels, input_dim_1, input_dim_2,
        )
        # Parameter values: kernels are drawn before biases.
        self.kernels_init = kernels_init
        self.biases_init = biases_init
        self.kernels = self.kernels_init(self.kernels_shape)
        self.biases = self.biases_init(num_output_channels)

    def fprop(self, inputs):
        """Computes the layer outputs for a batch of inputs.

        Each output channel is the sum over input channels of
        `scipy.signal.convolve2d(..., mode='valid')` of the input channel
        with the matching kernel, plus the bias of that output channel.

        Args:
            inputs: Array of layer inputs of shape
                (batch_size, num_input_channels, input_dim_1, input_dim_2).

        Returns:
            outputs: Array of layer outputs of shape
                (batch_size, num_output_channels, output_dim_1, output_dim_2).
        """
        num_examples = inputs.shape[0]
        outputs = np.zeros((
            num_examples,
            self.num_output_channels,
            self.input_dim_1 - self.kernel_dim_1 + 1,
            self.input_dim_2 - self.kernel_dim_2 + 1,
        ))
        for example in range(num_examples):
            for out_ch in range(self.num_output_channels):
                # View into `outputs`; in-place additions write through.
                feature_map = outputs[example, out_ch]
                for in_ch in range(self.num_input_channels):
                    feature_map += convolve2d(
                        inputs[example, in_ch],
                        self.kernels[out_ch, in_ch],
                        mode='valid')
                feature_map += self.biases[out_ch]
        return outputs

    def bprop(self, inputs, outputs, grads_wrt_outputs):
        """Back propagates gradients from the layer outputs to its inputs.

        The output gradients are zero-padded by `kernel_dim - 1` on every
        side and then cross-correlated ('valid' mode) with each kernel, so
        the result has the spatial extent of the inputs.

        Args:
            inputs: Array of layer inputs of shape
                (batch_size, num_input_channels, input_dim_1, input_dim_2).
            outputs: Array of layer outputs calculated in forward pass of
                shape
                (batch_size, num_output_channels, output_dim_1, output_dim_2).
            grads_wrt_outputs: Array of gradients with respect to the layer
                outputs of shape
                (batch_size, num_output_channels, output_dim_1, output_dim_2).

        Returns:
            Array of gradients with respect to the layer inputs of shape
            (batch_size, num_input_channels, input_dim_1, input_dim_2).
        """
        num_examples = inputs.shape[0]
        out_dim_1, out_dim_2 = grads_wrt_outputs.shape[-2:]
        border_1 = self.kernel_dim_1 - 1
        border_2 = self.kernel_dim_2 - 1
        padded = np.zeros((
            num_examples,
            self.num_output_channels,
            out_dim_1 + 2 * border_1,
            out_dim_2 + 2 * border_2,
        ))
        # Copy the gradients into the centre of the zero-padded array.
        interior = (
            slice(None), slice(None),
            slice(border_1, border_1 + out_dim_1),
            slice(border_2, border_2 + out_dim_2),
        )
        padded[interior] = grads_wrt_outputs
        grads_wrt_inputs = np.zeros((
            num_examples, self.num_input_channels,
            self.input_dim_1, self.input_dim_2,
        ))
        for example in range(num_examples):
            for out_ch in range(self.num_output_channels):
                for in_ch in range(self.num_input_channels):
                    grads_wrt_inputs[example, in_ch] += correlate2d(
                        padded[example, out_ch],
                        self.kernels[out_ch, in_ch],
                        mode='valid')
        return grads_wrt_inputs

    def grads_wrt_params(self, inputs, grads_wrt_outputs):
        """Calculates gradients with respect to the kernels and biases.

        Args:
            inputs: Array of layer inputs of shape
                (batch_size, num_input_channels, input_dim_1, input_dim_2).
            grads_wrt_outputs: Array of gradients with respect to the layer
                outputs of shape
                (batch_size, num_output_channels, output_dim_1, output_dim_2).

        Returns:
            Tuple `(grads_wrt_kernels, grads_wrt_biases)` of gradient
            arrays, in the same order as the `params` property.
        """
        grads_wrt_kernels = np.zeros(self.kernels_shape)
        for example in range(inputs.shape[0]):
            for out_ch in range(self.num_output_channels):
                for in_ch in range(self.num_input_channels):
                    grads_wrt_kernels[out_ch, in_ch] += correlate2d(
                        grads_wrt_outputs[example, out_ch],
                        inputs[example, in_ch],
                        mode='valid')
        # Each bias is shared over the batch and both spatial axes.
        grads_wrt_biases = grads_wrt_outputs.sum((0, 2, 3))
        return grads_wrt_kernels, grads_wrt_biases

    def params_penalty(self):
        """Returns the summed penalty terms of the kernels and biases.

        Penalties which are None contribute nothing, so the result is zero
        when neither penalty is set.
        """
        total_penalty = 0
        penalised = (
            (self.kernels_penalty, self.kernels),
            (self.biases_penalty, self.biases),
        )
        for penalty, values in penalised:
            if penalty is not None:
                total_penalty += penalty(values)
        return total_penalty

    @property
    def params(self):
        """A list of layer parameter values: `[kernels, biases]`."""
        return [self.kernels, self.biases]

    @params.setter
    def params(self, values):
        self.kernels = values[0]
        self.biases = values[1]

    def __repr__(self):
        template = (
            'ConvolutionalLayer(\n'
            '    num_input_channels={0}, num_output_channels={1},\n'
            '    input_dim_1={2}, input_dim_2={3},\n'
            '    kernel_dim_1={4}, kernel_dim_2={5}\n'
            ')'
        )
        return template.format(
            self.num_input_channels, self.num_output_channels,
            self.input_dim_1, self.input_dim_2,
            self.kernel_dim_1, self.kernel_dim_2)

In [2]:
%load_ext cython

In [3]:
%%cython --compile-args=-fopenmp --link-args=-fopenmp
#!python
#cython: embedsignature=True

import numpy as np
cimport numpy as np
cimport cython
from cython.view cimport array
from cython.parallel import prange

# Double-precision element type: Python-level NumPy dtype and the matching
# C-level typedef. NOTE(review): neither name is referenced by the functions
# visible in this cell (they use `double` memoryviews directly) - confirm
# whether they are still needed.
DTYPE = np.float64
ctypedef np.float64_t DTYPE_t

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.initializedcheck(False)
def conv2d_fprop(
        double[:, :, :, :] inputs, double[:, :, :, :] kernels,
        int stride_x=1, int stride_y=1):
    """Forward pass of a 'valid' 2D cross-correlation over a batch.

    The kernels are slid over the inputs without being flipped, i.e.
        outputs[i, k, x, y] = sum_{c, m, n}
            kernels[k, c, m, n] * inputs[i, c, x * stride_x + m, y * stride_y + n]

    Args:
        inputs: Array of shape (batch_size, n_in_channels, in_x, in_y).
        kernels: Array of shape
            (n_out_channels, n_in_channels, kernel_x, kernel_y).
        stride_x: Step between kernel positions along the first spatial
            axis (must be at least 1).
        stride_y: Step between kernel positions along the second spatial
            axis (must be at least 1).

    Returns:
        Memoryview of shape (batch_size, n_out_channels, out_x, out_y) with
        out = (in - kernel) // stride + 1 along each spatial axis (zero if
        the kernel is larger than the input along that axis).

    Raises:
        ValueError: If a stride is smaller than 1 or the channel counts of
            `inputs` and `kernels` disagree.
    """
    cdef int i, k, x, y, c, m, n, l, t
    cdef double acc
    cdef int batch_size = inputs.shape[0]
    cdef int in_shape_x = inputs.shape[2]
    cdef int in_shape_y = inputs.shape[3]
    cdef int n_out_channels = kernels.shape[0]
    cdef int n_in_channels = kernels.shape[1]
    cdef int kernel_shape_x = kernels.shape[2]
    cdef int kernel_shape_y = kernels.shape[3]
    cdef int out_shape_x = 0
    cdef int out_shape_y = 0
    cdef double [:, :, :, :] outputs
    if stride_x < 1 or stride_y < 1:
        raise ValueError(
            'Strides must be positive, got ({0}, {1}).'
            .format(stride_x, stride_y))
    # Bounds checking is disabled, so a channel mismatch would otherwise
    # read memory outside `inputs`.
    if inputs.shape[1] != n_in_channels:
        raise ValueError(
            'inputs have {0} channels but kernels expect {1}.'
            .format(inputs.shape[1], n_in_channels))
    # Number of kernel positions fully inside the input: the last valid
    # start offset is (in - kernel), visited in steps of `stride`.
    if in_shape_x >= kernel_shape_x:
        out_shape_x = (in_shape_x - kernel_shape_x) // stride_x + 1
    if in_shape_y >= kernel_shape_y:
        out_shape_y = (in_shape_y - kernel_shape_y) // stride_y + 1
    outputs = np.zeros(
        (batch_size, n_out_channels, out_shape_x, out_shape_y))
    for i in range(batch_size):
        for k in range(n_out_channels):
            for x in range(out_shape_x):
                l = x * stride_x
                for y in range(out_shape_y):
                    t = y * stride_y
                    acc = 0.
                    for c in range(n_in_channels):
                        for m in range(kernel_shape_x):
                            for n in range(kernel_shape_y):
                                acc += (
                                    kernels[k, c, m, n] *
                                    inputs[i, c, l + m, t + n])
                    outputs[i, k, x, y] = acc
    return outputs

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.initializedcheck(False)
def conv2d_bprop(
           double[:, :, :, :] grads_at_output,
           double[:, :, :, :] kernels,
           int stride_x=1, int stride_y=1):
    """Backward pass of the 'valid' cross-correlation w.r.t. its inputs.

    Every output gradient is scattered back onto the input positions that
    contributed to it, weighted by the corresponding kernel element.

    Args:
        grads_at_output: Gradients with respect to the outputs, of shape
            (batch_size, n_out_channels, out_x, out_y).
        kernels: Array of shape
            (n_out_channels, n_in_channels, kernel_x, kernel_y).
        stride_x: Stride used in the forward pass along the first spatial
            axis (must be at least 1).
        stride_y: Stride used in the forward pass along the second spatial
            axis (must be at least 1).

    Returns:
        Memoryview of gradients with respect to the inputs of shape
        (batch_size, n_in_channels, in_x, in_y), where the input extent is
        reconstructed as in = out * stride + kernel - 1. For unit strides
        this is exactly the forward-pass input extent; for larger strides
        the input extent is not uniquely determined by the output extent,
        so the caller may need to crop or pad the result.

    Raises:
        ValueError: If a stride is smaller than 1 or the output-channel
            counts of `grads_at_output` and `kernels` disagree.
    """
    cdef int i, k, x, y, c, m, n, l, t
    cdef double grad
    cdef int batch_size = grads_at_output.shape[0]
    cdef int n_out_channels = kernels.shape[0]
    cdef int n_in_channels = kernels.shape[1]
    cdef int kernel_shape_x = kernels.shape[2]
    cdef int kernel_shape_y = kernels.shape[3]
    cdef int out_shape_x = grads_at_output.shape[2]
    cdef int out_shape_y = grads_at_output.shape[3]
    cdef int in_shape_x = out_shape_x * stride_x + kernel_shape_x - 1
    cdef int in_shape_y = out_shape_y * stride_y + kernel_shape_y - 1
    cdef double [:, :, :, :] grads_at_input
    if stride_x < 1 or stride_y < 1:
        raise ValueError(
            'Strides must be positive, got ({0}, {1}).'
            .format(stride_x, stride_y))
    # Bounds checking is disabled, so a channel mismatch would otherwise
    # read memory outside `grads_at_output`.
    if grads_at_output.shape[1] != n_out_channels:
        raise ValueError(
            'grads_at_output has {0} channels but kernels produce {1}.'
            .format(grads_at_output.shape[1], n_out_channels))
    grads_at_input = np.zeros(
        (batch_size, n_in_channels, in_shape_x, in_shape_y))
    for i in range(batch_size):
        for k in range(n_out_channels):
            for x in range(out_shape_x):
                l = x * stride_x
                for y in range(out_shape_y):
                    t = y * stride_y
                    grad = grads_at_output[i, k, x, y]
                    for c in range(n_in_channels):
                        for m in range(kernel_shape_x):
                            for n in range(kernel_shape_y):
                                grads_at_input[i, c, l + m, t + n] += (
                                    kernels[k, c, m, n] * grad)
    return grads_at_input

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.initializedcheck(False)
def conv2d_grads_wrt_kernels(
           double[:, :, :, :] grads_at_output,
           double[:, :, :, :] inputs,
           int kernel_shape_x, int kernel_shape_y,
           int stride_x=1, int stride_y=1):
    """Gradients of the 'valid' cross-correlation w.r.t. its kernels.

    Computes
        kernel_grads[k, c, m, n] = sum_{i, x, y}
            inputs[i, c, x * stride_x + m, y * stride_y + n] *
            grads_at_output[i, k, x, y]

    Args:
        grads_at_output: Gradients with respect to the outputs, of shape
            (batch_size, n_out_channels, out_x, out_y).
        inputs: Forward-pass inputs of shape
            (batch_size, n_in_channels, in_x, in_y).
        kernel_shape_x: Kernel size along the first spatial axis.
        kernel_shape_y: Kernel size along the second spatial axis.
        stride_x: Stride used in the forward pass along the first spatial
            axis (must be at least 1).
        stride_y: Stride used in the forward pass along the second spatial
            axis (must be at least 1).

    Returns:
        Memoryview of kernel gradients of shape
        (n_out_channels, n_in_channels, kernel_shape_x, kernel_shape_y).

    Raises:
        ValueError: If a stride is smaller than 1, the batch sizes of the
            two arrays differ, or `inputs` is too small for the requested
            output extent, kernel size and stride.
    """
    cdef int i, k, x, y, c, m, n, l, t
    cdef double grad
    cdef int batch_size = grads_at_output.shape[0]
    cdef int n_out_channels = grads_at_output.shape[1]
    cdef int n_in_channels = inputs.shape[1]
    cdef int out_shape_x = grads_at_output.shape[2]
    cdef int out_shape_y = grads_at_output.shape[3]
    cdef int in_shape_x = inputs.shape[2]
    cdef int in_shape_y = inputs.shape[3]
    cdef double [:, :, :, :] kernel_grads
    if stride_x < 1 or stride_y < 1:
        raise ValueError(
            'Strides must be positive, got ({0}, {1}).'
            .format(stride_x, stride_y))
    # Bounds checking is disabled, so every extent that is indexed below
    # has to be validated up front to avoid reading outside `inputs`.
    if inputs.shape[0] != batch_size:
        raise ValueError(
            'inputs have batch size {0} but grads_at_output has {1}.'
            .format(inputs.shape[0], batch_size))
    if ((out_shape_x > 0 and
         in_shape_x < (out_shape_x - 1) * stride_x + kernel_shape_x) or
        (out_shape_y > 0 and
         in_shape_y < (out_shape_y - 1) * stride_y + kernel_shape_y)):
        raise ValueError(
            'inputs of spatial shape ({0}, {1}) are too small for outputs '
            'of shape ({2}, {3}) with kernel ({4}, {5}) and strides '
            '({6}, {7}).'
            .format(in_shape_x, in_shape_y, out_shape_x, out_shape_y,
                    kernel_shape_x, kernel_shape_y, stride_x, stride_y))
    kernel_grads = np.zeros(
        (n_out_channels, n_in_channels, kernel_shape_x, kernel_shape_y))
    for i in range(batch_size):
        for k in range(n_out_channels):
            for x in range(out_shape_x):
                l = x * stride_x
                for y in range(out_shape_y):
                    t = y * stride_y
                    grad = grads_at_output[i, k, x, y]
                    for c in range(n_in_channels):
                        for m in range(kernel_shape_x):
                            for n in range(kernel_shape_y):
                                kernel_grads[k, c, m, n] += (
                                    inputs[i, c, l + m, t + n] * grad)
    return kernel_grads

In [4]:
class CythonConvolutionalLayer(ConvolutionalLayer):
    """Convolutional layer backed by the compiled `conv2d_*` Cython kernels.

    Unlike the parent class, which uses `scipy.signal.convolve2d`, the
    Cython kernels slide the filters over the inputs without flipping
    them, so this layer computes a cross-correlation. Only unit strides
    are used.

    The Cython kernels take `double` memoryviews, so every array handed to
    them is first converted to float64 (a no-op for arrays that already
    have that dtype).
    """

    def __init__(self, num_input_channels, num_output_channels,
                 input_dim_1, input_dim_2,
                 kernel_dim_1, kernel_dim_2,
                 kernels_init=init.UniformInit(-0.01, 0.01),
                 biases_init=init.ConstantInit(0.),
                 kernels_penalty=None, biases_penalty=None):
        """Initialises the layer; see `ConvolutionalLayer.__init__`.

        The initial kernels and biases are cast to double precision.
        """
        super(CythonConvolutionalLayer, self).__init__(
            num_input_channels, num_output_channels,
            input_dim_1, input_dim_2,
            kernel_dim_1, kernel_dim_2,
            kernels_init, biases_init,
            kernels_penalty, biases_penalty
        )
        self.kernels = self.kernels.astype(np.double)
        self.biases = self.biases.astype(np.double)

    @staticmethod
    def _as_double(values):
        """Returns `values` as a float64 array, copying only if needed."""
        return np.asarray(values, dtype=np.double)

    def fprop(self, inputs):
        """Forward propagates activations through the layer transformation.

        For inputs `x`, outputs `y`, kernels `K` and biases `b` the layer
        corresponds to `y = cross_correlate2d(x, K) + b` (kernels are not
        flipped).

        Args:
            inputs: Array of layer inputs of shape
                (batch_size, num_input_channels, input_dim_1, input_dim_2).

        Returns:
            outputs: Array of layer outputs of shape
                (batch_size, num_output_channels, output_dim_1, output_dim_2).
        """
        activations = conv2d_fprop(
            self._as_double(inputs), self._as_double(self.kernels))
        # Broadcast one bias per output channel over batch and positions.
        return np.array(activations) + self.biases[None, :, None, None]

    def bprop(self, inputs, outputs, grads_wrt_outputs):
        """Back propagates gradients through a layer.

        Given gradients with respect to the outputs of the layer calculates the
        gradients with respect to the layer inputs.

        Args:
            inputs: Array of layer inputs of shape
                (batch_size, num_input_channels, input_dim_1, input_dim_2).
                Not used by this implementation.
            outputs: Array of layer outputs calculated in forward pass of
                shape
                (batch_size, num_output_channels, output_dim_1, output_dim_2).
                Not used by this implementation.
            grads_wrt_outputs: Array of gradients with respect to the layer
                outputs of shape
                (batch_size, num_output_channels, output_dim_1, output_dim_2).

        Returns:
            Array of gradients with respect to the layer inputs of shape
            (batch_size, num_input_channels, input_dim_1, input_dim_2).
        """
        grads_wrt_inputs = conv2d_bprop(
            self._as_double(grads_wrt_outputs),
            self._as_double(self.kernels), 1, 1)
        return np.array(grads_wrt_inputs)

    def grads_wrt_params(self, inputs, grads_wrt_outputs):
        """Calculates gradients with respect to layer parameters.

        Args:
            inputs: Array of layer inputs of shape
                (batch_size, num_input_channels, input_dim_1, input_dim_2).
            grads_wrt_outputs: Array of gradients with respect to the layer
                outputs of shape
                (batch_size, num_output_channels, output_dim_1, output_dim_2).

        Returns:
            Tuple `(grads_wrt_kernels, grads_wrt_biases)` of gradient
            arrays, in the same order as the `params` property.
        """
        grads_wrt_kernels = conv2d_grads_wrt_kernels(
            self._as_double(grads_wrt_outputs), self._as_double(inputs),
            self.kernel_dim_1, self.kernel_dim_2, 1, 1)
        # Each bias is shared over the batch and both spatial axes.
        grads_wrt_biases = grads_wrt_outputs.sum((0, 2, 3))
        return np.array(grads_wrt_kernels), grads_wrt_biases

    def __repr__(self):
        return (
            'CythonConvolutionalLayer(\n'
            '    num_input_channels={0}, num_output_channels={1},\n'
            '    input_dim_1={2}, input_dim_2={3},\n'
            '    kernel_dim_1={4}, kernel_dim_2={5}\n'
            ')'
            .format(self.num_input_channels, self.num_output_channels,
                    self.input_dim_1, self.input_dim_2, self.kernel_dim_1,
                    self.kernel_dim_2)
        )

The three test functions are defined in the cell below. All the functions take as first argument the *class* corresponding to the convolutional layer implementation to be tested (**not** an instance of the class). It is assumed the class being tested has an `__init__` method with at least all of the arguments defined in the skeleton definition above. A boolean second argument to each function can be used to specify if the layer implements a cross-correlation or convolution based operation (see note in [seventh lecture slides](http://www.inf.ed.ac.uk/teaching/courses/mlp/2016/mlp07-cnn.pdf)).

In [5]:
import numpy as np

def test_conv_layer_fprop(layer_class, do_cross_correlation=False):
    """Tests `fprop` method of a convolutional layer.
    
    Checks the outputs of `fprop` method for a fixed input against known
    reference values for the outputs and raises an AssertionError if
    the outputted values are not consistent with the reference values. If
    tests are all passed returns True.
    
    Args:
        layer_class: Convolutional layer implementation following the 
            interface defined in the provided skeleton class.
        do_cross_correlation: Whether the layer implements an operation
            corresponding to cross-correlation (True) i.e kernels are
            not flipped before sliding over inputs, or convolution
            (False) with filters being flipped.

    Raises:
        AssertionError: Raised if output of `layer.fprop` is inconsistent 
            with reference values either in shape or values.
    """
    inputs = np.arange(96).reshape((2, 3, 4, 4)).astype(np.double)
    kernels = np.arange(-12, 12).reshape((2, 3, 2, 2)).astype(np.double)
    if do_cross_correlation:
        kernels = kernels[:, :, ::-1, ::-1]
    biases = np.arange(2).astype(np.double)
    true_output = np.array(
        [[[[ -958., -1036., -1114.],
           [-1270., -1348., -1426.],
           [-1582., -1660., -1738.]],
          [[ 1707.,  1773.,  1839.],
           [ 1971.,  2037.,  2103.],
           [ 2235.,  2301.,  2367.]]],
         [[[-4702., -4780., -4858.],
           [-5014., -5092., -5170.],
           [-5326., -5404., -5482.]],
          [[ 4875.,  4941.,  5007.],
           [ 5139.,  5205.,  5271.],
           [ 5403.,  5469.,  5535.]]]]
    )
    layer = layer_class(
        num_input_channels=kernels.shape[1], 
        num_output_channels=kernels.shape[0], 
        input_dim_1=inputs.shape[2], 
        input_dim_2=inputs.shape[3],
        kernel_dim_1=kernels.shape[2],
        kernel_dim_2=kernels.shape[3]
    )
    layer.params = [kernels, biases]
    layer_output = layer.fprop(inputs)
    assert layer_output.shape == true_output.shape, (
        'Layer fprop gives incorrect shaped output. '
        'Correct shape is \n\n{0}\n\n but returned shape is \n\n{1}.'
        .format(true_output.shape, layer_output.shape)
    )
    assert np.allclose(layer_output, true_output), (
        'Layer fprop does not give correct output. '
        'Correct output is \n\n{0}\n\n but returned output is \n\n{1}.'
        .format(true_output, layer_output)
    )
    return True

def test_conv_layer_bprop(layer_class, do_cross_correlation=False):
    """Tests `bprop` method of a convolutional layer.
    
    Checks the outputs of `bprop` method for a fixed input against known
    reference values for the gradients with respect to inputs and raises 
    an AssertionError if the returned values are not consistent with the
    reference values. If tests are all passed returns True.
    
    Args:
        layer_class: Convolutional layer implementation following the 
            interface defined in the provided skeleton class.
        do_cross_correlation: Whether the layer implements an operation
            corresponding to cross-correlation (True) i.e kernels are
            not flipped before sliding over inputs, or convolution
            (False) with filters being flipped.

    Raises:
        AssertionError: Raised if output of `layer.bprop` is inconsistent 
            with reference values either in shape or values.
    """
    inputs = np.arange(96).reshape((2, 3, 4, 4)).astype(np.double)
    kernels = np.arange(-12, 12).reshape((2, 3, 2, 2)).astype(np.double)
    if do_cross_correlation:
        kernels = kernels[:, :, ::-1, ::-1]
    biases = np.arange(2).astype(np.double)
    grads_wrt_outputs = np.arange(-20, 16).reshape((2, 2, 3, 3)).astype(np.double)
    outputs = np.array(
        [[[[ -958., -1036., -1114.],
           [-1270., -1348., -1426.],
           [-1582., -1660., -1738.]],
          [[ 1707.,  1773.,  1839.],
           [ 1971.,  2037.,  2103.],
           [ 2235.,  2301.,  2367.]]],
         [[[-4702., -4780., -4858.],
           [-5014., -5092., -5170.],
           [-5326., -5404., -5482.]],
          [[ 4875.,  4941.,  5007.],
           [ 5139.,  5205.,  5271.],
           [ 5403.,  5469.,  5535.]]]]
    )
    true_grads_wrt_inputs = np.array(
      [[[[ 147.,  319.,  305.,  162.],
         [ 338.,  716.,  680.,  354.],
         [ 290.,  608.,  572.,  294.],
         [ 149.,  307.,  285.,  144.]],
        [[  23.,   79.,   81.,   54.],
         [ 114.,  284.,  280.,  162.],
         [ 114.,  272.,  268.,  150.],
         [  73.,  163.,  157.,   84.]],
        [[-101., -161., -143.,  -54.],
         [-110., -148., -120.,  -30.],
         [ -62.,  -64.,  -36.,    6.],
         [  -3.,   19.,   29.,   24.]]],
       [[[  39.,   67.,   53.,   18.],
         [  50.,   68.,   32.,   -6.],
         [   2.,  -40.,  -76.,  -66.],
         [ -31.,  -89., -111.,  -72.]],
        [[  59.,  115.,  117.,   54.],
         [ 114.,  212.,  208.,   90.],
         [ 114.,  200.,  196.,   78.],
         [  37.,   55.,   49.,   12.]],
        [[  79.,  163.,  181.,   90.],
         [ 178.,  356.,  384.,  186.],
         [ 226.,  440.,  468.,  222.],
         [ 105.,  199.,  209.,   96.]]]])
    layer = layer_class(
        num_input_channels=kernels.shape[1], 
        num_output_channels=kernels.shape[0], 
        input_dim_1=inputs.shape[2], 
        input_dim_2=inputs.shape[3],
        kernel_dim_1=kernels.shape[2],
        kernel_dim_2=kernels.shape[3]
    )
    layer.params = [kernels, biases]
    layer_grads_wrt_inputs = layer.bprop(inputs, outputs, grads_wrt_outputs)
    assert layer_grads_wrt_inputs.shape == true_grads_wrt_inputs.shape, (
        'Layer bprop returns incorrect shaped array. '
        'Correct shape is \n\n{0}\n\n but returned shape is \n\n{1}.'
        .format(true_grads_wrt_inputs.shape, layer_grads_wrt_inputs.shape)
    )
    assert np.allclose(layer_grads_wrt_inputs, true_grads_wrt_inputs), (
        'Layer bprop does not return correct values. '
        'Correct output is \n\n{0}\n\n but returned output is \n\n{1}'
        .format(true_grads_wrt_inputs, layer_grads_wrt_inputs)
    )
    return True

def test_conv_layer_grad_wrt_params(
        layer_class, do_cross_correlation=False):
    """Tests `grad_wrt_params` method of a convolutional layer.
    
    Checks the outputs of `grad_wrt_params` method for fixed inputs 
    against known reference values for the gradients with respect to 
    kernels and biases, and raises an AssertionError if the returned
    values are not consistent with the reference values. If tests
    are all passed returns True.
    
    Args:
        layer_class: Convolutional layer implementation following the 
            interface defined in the provided skeleton class.
        do_cross_correlation: Whether the layer implements an operation
            corresponding to cross-correlation (True) i.e kernels are
            not flipped before sliding over inputs, or convolution
            (False) with filters being flipped.

    Raises:
        AssertionError: Raised if output of `layer.bprop` is inconsistent 
            with reference values either in shape or values.
    """
    inputs = np.arange(96).reshape((2, 3, 4, 4)).astype(np.double)
    kernels = np.arange(-12, 12).reshape((2, 3, 2, 2)).astype(np.double)
    biases = np.arange(2).astype(np.double)
    grads_wrt_outputs = np.arange(-20, 16).reshape((2, 2, 3, 3)).astype(np.double)
    true_kernel_grads = np.array(
        [[[[ -240.,  -114.],
         [  264.,   390.]],
        [[-2256., -2130.],
         [-1752., -1626.]],
        [[-4272., -4146.],
         [-3768., -3642.]]],
       [[[ 5268.,  5232.],
         [ 5124.,  5088.]],
        [[ 5844.,  5808.],
         [ 5700.,  5664.]],
        [[ 6420.,  6384.],
         [ 6276.,  6240.]]]])
    if do_cross_correlation:
        kernels = kernels[:, :, ::-1, ::-1]
        true_kernel_grads = true_kernel_grads[:, :, ::-1, ::-1]
    true_bias_grads = np.array([-126.,   36.])
    layer = layer_class(
        num_input_channels=kernels.shape[1], 
        num_output_channels=kernels.shape[0], 
        input_dim_1=inputs.shape[2], 
        input_dim_2=inputs.shape[3],
        kernel_dim_1=kernels.shape[2],
        kernel_dim_2=kernels.shape[3]
    )
    layer.params = [kernels, biases]
    layer_kernel_grads, layer_bias_grads = (
        layer.grads_wrt_params(inputs, grads_wrt_outputs))
    assert layer_kernel_grads.shape == true_kernel_grads.shape, (
        'grads_wrt_params gives incorrect shaped kernel gradients output. '
        'Correct shape is \n\n{0}\n\n but returned shape is \n\n{1}.'
        .format(true_kernel_grads.shape, layer_kernel_grads.shape)
    )
    assert np.allclose(layer_kernel_grads, true_kernel_grads), (
        'grads_wrt_params does not give correct kernel gradients output. '
        'Correct output is \n\n{0}\n\n but returned output is \n\n{1}.'
        .format(true_kernel_grads, layer_kernel_grads)
    )
    assert layer_bias_grads.shape == true_bias_grads.shape, (
        'grads_wrt_params gives incorrect shaped bias gradients output. '
        'Correct shape is \n\n{0}\n\n but returned shape is \n\n{1}.'
        .format(true_bias_grads.shape, layer_bias_grads.shape)
    )
    assert np.allclose(layer_bias_grads, true_bias_grads), (
        'grads_wrt_params does not give correct bias gradients output. '
        'Correct output is \n\n{0}\n\n but returned output is \n\n{1}.'
        .format(true_bias_grads, layer_bias_grads)
    )
    return True

An example of using the test functions is given in the cell below. This assumes you implement a convolution (rather than cross-correlation) operation. If the implementation is correct, the message `All tests passed.` will be printed.

In [6]:
# Run all three checks against the scipy-based layer, which performs a true
# convolution (second argument False). Each check raises AssertionError on
# failure, so reaching the print means every check passed.
all_correct = all([
    test_conv_layer_fprop(ConvolutionalLayer, False),
    test_conv_layer_bprop(ConvolutionalLayer, False),
    test_conv_layer_grad_wrt_params(ConvolutionalLayer, False),
])
if all_correct:
    print('All tests passed.')

All tests passed.


In [7]:
# Run all three checks against the Cython-based layer, which performs a
# cross-correlation (second argument True). Each check raises AssertionError
# on failure, so reaching the print means every check passed.
all_correct = all([
    test_conv_layer_fprop(CythonConvolutionalLayer, True),
    test_conv_layer_bprop(CythonConvolutionalLayer, True),
    test_conv_layer_grad_wrt_params(CythonConvolutionalLayer, True),
])
if all_correct:
    print('All tests passed.')

All tests passed.


In [8]:
import numpy as np
import matplotlib.pyplot as plt
import logging
from mlp.layers import ReluLayer, ReshapeLayer, AffineLayer
from mlp.errors import CrossEntropySoftmaxError
from mlp.models import MultipleLayerModel
from mlp.initialisers import UniformInit, ConstantInit, GlorotUniformInit
from mlp.learning_rules import GradientDescentLearningRule
from mlp.data_providers import MNISTDataProvider
from mlp.optimisers import Optimiser
%matplotlib inline
plt.style.use('ggplot')

# Seed a random number generator; the same `seed` is reused by later cells
# to reset `rng` before each training run.
seed = 6102016 
rng = np.random.RandomState(seed)

# Set up the root logger to print info about the training run. Any existing
# handlers are replaced by a single StreamHandler, which with no argument
# writes to sys.stderr.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.handlers = [logging.StreamHandler()]

# Create data provider objects for the MNIST training and validation sets,
# sharing the seeded random number generator.
train_data = MNISTDataProvider('train', rng=rng)
valid_data = MNISTDataProvider('valid', rng=rng)
# presumably 784 = 28 * 28 flattened pixels and 10 digit classes - confirm
# against the data provider
input_dim, output_dim = 784, 10

In [9]:
# Convert the input arrays to double precision: the Cython convolution
# functions declare their array arguments as `double` memoryviews.
train_data.inputs = train_data.inputs.astype(np.double)
valid_data.inputs = valid_data.inputs.astype(np.double)

In [10]:
# Experiment settings
batch_size = 50
kernel_dim_1 = 4
kernel_dim_2 = 4
input_dim_1 = 28
input_dim_2 = 28
num_output_channels = 1
num_input_channels = 1
learning_rate = 0.01
num_epochs = 10
stats_interval = 1

# Reset random number generator and data provider states on each run
# to ensure reproducibility of results
rng.seed(seed)
train_data.reset()
valid_data.reset()

# Alter data-provider batch size
train_data.batch_size = batch_size
valid_data.batch_size = batch_size

# Create the parameter initialisers: kernels are sampled uniformly from
# [-0.01, 0.01], affine weights use Glorot uniform initialisation and all
# biases start at zero
kernels_init = UniformInit(-0.01, 0.01, rng=rng)
weights_init = GlorotUniformInit(rng=rng)
biases_init = ConstantInit(0.)

# Number of values produced per example by the convolutional layer
# ('valid' outputs, unit strides); each spatial axis uses its own input and
# kernel extent
hidden_dim = (
    (input_dim_1 - kernel_dim_1 + 1) *
    (input_dim_2 - kernel_dim_2 + 1) *
    num_output_channels
)

# Create a model with one convolutional layer followed by a ReLU and an
# affine output layer; the reshape layers convert between the flat vectors
# used by the data provider / affine layer and the image-shaped arrays used
# by the convolutional layer
model = MultipleLayerModel([
    ReshapeLayer((num_input_channels, input_dim_1, input_dim_2)),
    ConvolutionalLayer(
            num_input_channels, num_output_channels,
            input_dim_1, input_dim_2,
            kernel_dim_1, kernel_dim_2,
            kernels_init, biases_init
    ),
    ReluLayer(),
    ReshapeLayer(),
    AffineLayer(hidden_dim, output_dim, weights_init, biases_init)
])

# Initialise a cross entropy error object
error = CrossEntropySoftmaxError()

# Use a basic gradient descent learning rule
learning_rule = GradientDescentLearningRule(learning_rate=learning_rate)

# Monitor classification accuracy during training
data_monitors={'acc': lambda y, t: (y.argmax(-1) == t.argmax(-1)).mean()}

optimiser = Optimiser(
    model, error, learning_rule, train_data, valid_data, data_monitors)

stats, keys, run_time = optimiser.train(num_epochs=num_epochs, stats_interval=stats_interval)

Epoch 0:
  error(train)=2.30e+00, acc(train)=1.40e-01, error(valid)=2.30e+00, acc(valid)=1.34e-01, params_penalty=0.00e+00
Epoch 1: 20.20s to complete
  error(train)=3.70e-01, acc(train)=8.91e-01, error(valid)=3.38e-01, acc(valid)=9.03e-01, params_penalty=0.00e+00
Epoch 2: 23.26s to complete
  error(train)=3.38e-01, acc(train)=9.03e-01, error(valid)=3.11e-01, acc(valid)=9.12e-01, params_penalty=0.00e+00
Epoch 3: 23.27s to complete
  error(train)=3.18e-01, acc(train)=9.10e-01, error(valid)=3.00e-01, acc(valid)=9.16e-01, params_penalty=0.00e+00
Epoch 4: 25.56s to complete
  error(train)=3.14e-01, acc(train)=9.10e-01, error(valid)=2.96e-01, acc(valid)=9.17e-01, params_penalty=0.00e+00
Epoch 5: 25.17s to complete
  error(train)=3.06e-01, acc(train)=9.13e-01, error(valid)=2.85e-01, acc(valid)=9.19e-01, params_penalty=0.00e+00
Epoch 6: 25.59s to complete
  error(train)=3.07e-01, acc(train)=9.14e-01, error(valid)=2.87e-01, acc(valid)=9.20e-01, params_penalty=0.00e+00
Epoch 7: 28.70s to comple

In [11]:
# Experiment settings
batch_size = 50
kernel_dim_1 = 4
kernel_dim_2 = 4
input_dim_1 = 28
input_dim_2 = 28
num_output_channels = 1
num_input_channels = 1
learning_rate = 0.01
num_epochs = 10
stats_interval = 1

# Reset random number generator and data provider states on each run
# to ensure reproducibility of results
rng.seed(seed)
train_data.reset()
valid_data.reset()

# Alter data-provider batch size
train_data.batch_size = batch_size
valid_data.batch_size = batch_size

# Create the parameter initialisers: kernels are sampled uniformly from
# [-0.01, 0.01], affine weights use Glorot uniform initialisation and all
# biases start at zero
kernels_init = UniformInit(-0.01, 0.01, rng=rng)
weights_init = GlorotUniformInit(rng=rng)
biases_init = ConstantInit(0.)

# Number of values produced per example by the convolutional layer
# ('valid' outputs, unit strides); each spatial axis uses its own input and
# kernel extent
hidden_dim = (
    (input_dim_1 - kernel_dim_1 + 1) *
    (input_dim_2 - kernel_dim_2 + 1) *
    num_output_channels
)

# Create a model with one Cython-backed convolutional layer followed by a
# ReLU and an affine output layer; the reshape layers convert between the
# flat vectors used by the data provider / affine layer and the
# image-shaped arrays used by the convolutional layer
model = MultipleLayerModel([
    ReshapeLayer((num_input_channels, input_dim_1, input_dim_2)),
    CythonConvolutionalLayer(
            num_input_channels, num_output_channels,
            input_dim_1, input_dim_2,
            kernel_dim_1, kernel_dim_2,
            kernels_init, biases_init
    ),
    ReluLayer(),
    ReshapeLayer(),
    AffineLayer(hidden_dim, output_dim, weights_init, biases_init)
])

# Initialise a cross entropy error object
error = CrossEntropySoftmaxError()

# Use a basic gradient descent learning rule
learning_rule = GradientDescentLearningRule(learning_rate=learning_rate)

# Monitor classification accuracy during training
data_monitors={'acc': lambda y, t: (y.argmax(-1) == t.argmax(-1)).mean()}

optimiser = Optimiser(
    model, error, learning_rule, train_data, valid_data, data_monitors)

stats, keys, run_time = optimiser.train(num_epochs=num_epochs, stats_interval=stats_interval)

Epoch 0:
  error(train)=2.30e+00, acc(train)=1.09e-01, error(valid)=2.30e+00, acc(valid)=1.02e-01, params_penalty=0.00e+00
Epoch 1: 9.53s to complete
  error(train)=3.74e-01, acc(train)=8.90e-01, error(valid)=3.41e-01, acc(valid)=9.02e-01, params_penalty=0.00e+00
Epoch 2: 8.50s to complete
  error(train)=3.39e-01, acc(train)=9.03e-01, error(valid)=3.11e-01, acc(valid)=9.11e-01, params_penalty=0.00e+00
Epoch 3: 6.74s to complete
  error(train)=3.19e-01, acc(train)=9.10e-01, error(valid)=3.01e-01, acc(valid)=9.15e-01, params_penalty=0.00e+00
Epoch 4: 10.08s to complete
  error(train)=3.15e-01, acc(train)=9.11e-01, error(valid)=2.96e-01, acc(valid)=9.17e-01, params_penalty=0.00e+00
Epoch 5: 8.82s to complete
  error(train)=3.06e-01, acc(train)=9.14e-01, error(valid)=2.85e-01, acc(valid)=9.21e-01, params_penalty=0.00e+00
Epoch 6: 10.37s to complete
  error(train)=3.07e-01, acc(train)=9.13e-01, error(valid)=2.87e-01, acc(valid)=9.21e-01, params_penalty=0.00e+00
Epoch 7: 7.81s to complete
  