CW2 init
This commit is contained in:
parent cfadae67b9
commit 8928c6b52d

BIN  data/emnist-test.npz   Normal file (binary file not shown)
BIN  data/emnist-train.npz  Normal file (binary file not shown)
BIN  data/emnist-valid.npz  Normal file (binary file not shown)
@@ -199,6 +199,75 @@ class MNISTDataProvider(DataProvider):
        one_of_k_targets[range(int_targets.shape[0]), int_targets] = 1
        return one_of_k_targets


class EMNISTDataProvider(DataProvider):
    """Data provider for EMNIST handwritten character images."""

    def __init__(self, which_set='train', batch_size=100, max_num_batches=-1,
                 shuffle_order=True, rng=None):
        """Create a new EMNIST data provider object.

        Args:
            which_set: One of 'train', 'valid' or 'test'. Determines which
                portion of the EMNIST data this object should provide.
            batch_size (int): Number of data points to include in each batch.
            max_num_batches (int): Maximum number of batches to iterate over
                in an epoch. If `max_num_batches * batch_size > num_data` then
                only as many batches as the data can be split into will be
                used. If set to -1 all of the data will be used.
            shuffle_order (bool): Whether to randomly permute the order of
                the data before each epoch.
            rng (RandomState): A seeded random number generator.
        """
        # check a valid which_set was provided
        assert which_set in ['train', 'valid', 'test'], (
            'Expected which_set to be either train, valid or test. '
            'Got {0}'.format(which_set)
        )
        self.which_set = which_set
        self.num_classes = 47
        # construct path to data using os.path.join to ensure the correct path
        # separator for the current platform / OS is used
        # MLP_DATA_DIR environment variable should point to the data directory
        data_path = os.path.join(
            os.environ['MLP_DATA_DIR'], 'emnist-{0}.npz'.format(which_set))
        assert os.path.isfile(data_path), (
            'Data file does not exist at expected path: ' + data_path
        )
        # load data from compressed numpy file
        loaded = np.load(data_path)
        inputs, targets = loaded['inputs'], loaded['targets']
        inputs = inputs.astype(np.float32)
        inputs = np.reshape(inputs, newshape=(-1, 28*28))
        inputs = inputs / 255.0
        # pass the loaded data to the parent class __init__
        super(EMNISTDataProvider, self).__init__(
            inputs, targets, batch_size, max_num_batches, shuffle_order, rng)

    def next(self):
        """Returns next data batch or raises `StopIteration` if at end."""
        inputs_batch, targets_batch = super(EMNISTDataProvider, self).next()
        return inputs_batch, self.to_one_of_k(targets_batch)

    def to_one_of_k(self, int_targets):
        """Converts integer coded class target to 1 of K coded targets.

        Args:
            int_targets (ndarray): Array of integer coded class targets (i.e.
                where an integer from 0 to `num_classes` - 1 is used to
                indicate which is the correct class). This should be of shape
                (num_data,).

        Returns:
            Array of 1 of K coded targets i.e. an array of shape
            (num_data, num_classes) where for each row all elements are equal
            to zero except for the column corresponding to the correct class
            which is equal to one.
        """
        one_of_k_targets = np.zeros((int_targets.shape[0], self.num_classes))
        one_of_k_targets[range(int_targets.shape[0]), int_targets] = 1
        return one_of_k_targets


class MetOfficeDataProvider(DataProvider):
    """South Scotland Met Office weather data provider."""
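For reference, a minimal usage sketch of the new provider. It assumes the MLP_DATA_DIR environment variable points at the directory holding the emnist-*.npz files added in this commit; the seed value is arbitrary.

import numpy as np
from mlp.data_providers import EMNISTDataProvider

train_data = EMNISTDataProvider('train', batch_size=100,
                                rng=np.random.RandomState(10102016))
inputs_batch, targets_batch = train_data.next()
print(inputs_batch.shape)   # (100, 784): flattened 28x28 images scaled to [0, 1]
print(targets_batch.shape)  # (100, 47): one-of-k coded targets via to_one_of_k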
521  mlp/layers.py
@@ -16,7 +16,6 @@ import numpy as np
import mlp.initialisers as init
from mlp import DEFAULT_SEED


class Layer(object):
    """Abstract class defining the interface for a layer."""

@@ -96,6 +95,76 @@ class LayerWithParameters(Layer):
        """
        raise NotImplementedError()


class StochasticLayerWithParameters(Layer):
    """Specialised layer with parameters which uses a stochastic forward propagation."""

    def __init__(self, rng=None):
        """Constructs a new StochasticLayerWithParameters object.

        Args:
            rng (RandomState): Seeded random number generator object.
        """
        if rng is None:
            rng = np.random.RandomState(DEFAULT_SEED)
        self.rng = rng

    def fprop(self, inputs, stochastic=True):
        """Forward propagates activations through the layer transformation.

        Args:
            inputs: Array of layer inputs of shape (batch_size, input_dim).
            stochastic: Flag allowing different deterministic
                forward-propagation mode in addition to default stochastic
                forward-propagation e.g. for use at test time. If False
                a deterministic forward-propagation transformation
                corresponding to the expected output of the stochastic
                forward-propagation is applied.

        Returns:
            outputs: Array of layer outputs of shape (batch_size, output_dim).
        """
        raise NotImplementedError()

    def grads_wrt_params(self, inputs, grads_wrt_outputs):
        """Calculates gradients with respect to layer parameters.

        Args:
            inputs: Array of inputs to layer of shape (batch_size, input_dim).
            grads_wrt_outputs: Array of gradients with respect to the layer
                outputs of shape (batch_size, output_dim).

        Returns:
            List of arrays of gradients with respect to the layer parameters
            with parameter gradients appearing in same order in tuple as
            returned from `get_params` method.
        """
        raise NotImplementedError()

    def params_penalty(self):
        """Returns the parameter dependent penalty term for this layer.

        If no parameter-dependent penalty terms are set this returns zero.
        """
        raise NotImplementedError()

    @property
    def params(self):
        """Returns a list of parameters of layer.

        Returns:
            List of current parameter values. This list should be in the
            corresponding order to the `values` argument to `set_params`.
        """
        raise NotImplementedError()

    @params.setter
    def params(self, values):
        """Sets layer parameters from a list of values.

        Args:
            values: List of values to set parameters to. This list should be
                in the corresponding order to what is returned by `get_params`.
        """
        raise NotImplementedError()


class StochasticLayer(Layer):
    """Specialised layer which uses a stochastic forward propagation."""
@@ -260,6 +329,94 @@ class AffineLayer(LayerWithParameters):
        return 'AffineLayer(input_dim={0}, output_dim={1})'.format(
            self.input_dim, self.output_dim)


class BatchNormalizationLayer(StochasticLayerWithParameters):
    """Layer implementing batch normalization of its inputs.

    This layer is parameterised by a scale vector `gamma` and a shift
    vector `beta`.
    """

    def __init__(self, input_dim, rng=None):
        """Initialises a parameterised batch normalization layer.

        Args:
            input_dim (int): Dimension of inputs to the layer.
            rng (RandomState): Seeded random number generator object.
        """
        super(BatchNormalizationLayer, self).__init__(rng)
        self.beta = self.rng.normal(size=(input_dim))
        self.gamma = self.rng.normal(size=(input_dim))
        self.epsilon = 0.00001
        self.cache = None
        self.input_dim = input_dim

    def fprop(self, inputs, stochastic=True):
        """Forward propagates inputs through a layer."""
        raise NotImplementedError

    def bprop(self, inputs, outputs, grads_wrt_outputs):
        """Back propagates gradients through a layer.

        Given gradients with respect to the outputs of the layer calculates the
        gradients with respect to the layer inputs.

        Args:
            inputs: Array of layer inputs of shape (batch_size, input_dim).
            outputs: Array of layer outputs calculated in forward pass of
                shape (batch_size, output_dim).
            grads_wrt_outputs: Array of gradients with respect to the layer
                outputs of shape (batch_size, output_dim).

        Returns:
            Array of gradients with respect to the layer inputs of shape
            (batch_size, input_dim).
        """
        raise NotImplementedError

    def grads_wrt_params(self, inputs, grads_wrt_outputs):
        """Calculates gradients with respect to layer parameters.

        Args:
            inputs: array of inputs to layer of shape (batch_size, input_dim)
            grads_wrt_outputs: array of gradients with respect to the layer
                outputs of shape (batch_size, output_dim)

        Returns:
            list of arrays of gradients with respect to the layer parameters
            `[grads_wrt_gammas, grads_wrt_betas]`.
        """
        raise NotImplementedError

    def params_penalty(self):
        """Returns the parameter dependent penalty term for this layer.

        If no parameter-dependent penalty terms are set this returns zero.
        """
        params_penalty = 0
        return params_penalty

    @property
    def params(self):
        """A list of layer parameter values: `[gammas, betas]`."""
        return [self.gamma, self.beta]

    @params.setter
    def params(self, values):
        self.gamma = values[0]
        self.beta = values[1]

    def __repr__(self):
        return 'BatchNormalizationLayer(input_dim={0})'.format(
            self.input_dim)
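For orientation, this is one standard way the fprop/bprop/grads_wrt_params skeleton above could be filled in: a sketch of the usual batch-normalization equations, not the committed solution. The helper names are illustrative only.

import numpy as np

def batch_norm_fprop(inputs, gamma, beta, epsilon=1e-5):
    # normalise each input dimension over the batch, then scale and shift
    mean = inputs.mean(axis=0)
    var = inputs.var(axis=0)
    x_hat = (inputs - mean) / np.sqrt(var + epsilon)
    return gamma * x_hat + beta

def batch_norm_bprop(inputs, grads_wrt_outputs, gamma, epsilon=1e-5):
    # chain rule through the normalisation, including the dependence of
    # the batch mean and variance on every input in the batch
    N = inputs.shape[0]
    inv_std = 1. / np.sqrt(inputs.var(axis=0) + epsilon)
    x_hat = (inputs - inputs.mean(axis=0)) * inv_std
    dx_hat = grads_wrt_outputs * gamma
    return (inv_std / N) * (N * dx_hat - dx_hat.sum(axis=0)
                            - x_hat * (dx_hat * x_hat).sum(axis=0))

def batch_norm_grads_wrt_params(inputs, grads_wrt_outputs, epsilon=1e-5):
    # gradients for the scale (gamma) and shift (beta) parameters
    inv_std = 1. / np.sqrt(inputs.var(axis=0) + epsilon)
    x_hat = (inputs - inputs.mean(axis=0)) * inv_std
    return [(grads_wrt_outputs * x_hat).sum(axis=0),
            grads_wrt_outputs.sum(axis=0)]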
class SigmoidLayer(Layer):
    """Layer implementing an element-wise logistic sigmoid transformation."""

@@ -300,6 +457,151 @@ class SigmoidLayer(Layer):
    def __repr__(self):
        return 'SigmoidLayer'

class ConvolutionalLayer(LayerWithParameters):
    """Layer implementing a 2D convolution-based transformation of its inputs.

    The layer is parameterised by a set of 2D convolutional kernels, a four
    dimensional array of shape
        (num_output_channels, num_input_channels, kernel_dim_1, kernel_dim_2)
    and a bias vector, a one dimensional array of shape
        (num_output_channels,)
    i.e. one shared bias per output channel.

    Assuming no padding is applied to the inputs, so that outputs are only
    calculated for positions where the kernel filters fully overlap with the
    inputs, and that unit strides are used, the outputs will have spatial extent
        output_dim_1 = input_dim_1 - kernel_dim_1 + 1
        output_dim_2 = input_dim_2 - kernel_dim_2 + 1
    """

    def __init__(self, num_input_channels, num_output_channels,
                 input_dim_1, input_dim_2,
                 kernel_dim_1, kernel_dim_2,
                 kernels_init=init.UniformInit(-0.01, 0.01),
                 biases_init=init.ConstantInit(0.),
                 kernels_penalty=None, biases_penalty=None):
        """Initialises a parameterised convolutional layer.

        Args:
            num_input_channels (int): Number of channels in inputs to
                layer (this may be number of colour channels in the input
                images if used as the first layer in a model, or the
                number of output channels, a.k.a. feature maps, from a
                previous convolutional layer).
            num_output_channels (int): Number of channels in outputs
                from the layer, a.k.a. number of feature maps.
            input_dim_1 (int): Size of first input dimension of each 2D
                channel of inputs.
            input_dim_2 (int): Size of second input dimension of each 2D
                channel of inputs.
            kernel_dim_1 (int): Size of first dimension of each 2D channel of
                kernels.
            kernel_dim_2 (int): Size of second dimension of each 2D channel of
                kernels.
            kernels_initialiser: Initialiser for the kernel parameters.
            biases_initialiser: Initialiser for the bias parameters.
            kernels_penalty: Kernel-dependent penalty term (regulariser) or
                None if no regularisation is to be applied to the kernels.
            biases_penalty: Biases-dependent penalty term (regulariser) or
                None if no regularisation is to be applied to the biases.
        """
        self.num_input_channels = num_input_channels
        self.num_output_channels = num_output_channels
        self.input_dim_1 = input_dim_1
        self.input_dim_2 = input_dim_2
        self.kernel_dim_1 = kernel_dim_1
        self.kernel_dim_2 = kernel_dim_2
        self.kernels_init = kernels_init
        self.biases_init = biases_init
        self.kernels_shape = (
            num_output_channels, num_input_channels, kernel_dim_1, kernel_dim_2
        )
        self.inputs_shape = (
            None, num_input_channels, input_dim_1, input_dim_2
        )
        self.kernels = self.kernels_init(self.kernels_shape)
        self.biases = self.biases_init(num_output_channels)
        self.kernels_penalty = kernels_penalty
        self.biases_penalty = biases_penalty

        self.cache = None

    def fprop(self, inputs):
        """Forward propagates activations through the layer transformation.

        For inputs `x`, outputs `y`, kernels `K` and biases `b` the layer
        corresponds to `y = conv2d(x, K) + b`.

        Args:
            inputs: Array of layer inputs of shape
                (batch_size, num_input_channels, input_dim_1, input_dim_2).

        Returns:
            outputs: Array of layer outputs of shape
                (batch_size, num_output_channels, output_dim_1, output_dim_2).
        """
        raise NotImplementedError

    def bprop(self, inputs, outputs, grads_wrt_outputs):
        """Back propagates gradients through a layer.

        Given gradients with respect to the outputs of the layer calculates the
        gradients with respect to the layer inputs.

        Args:
            inputs: Array of layer inputs of shape
                (batch_size, num_input_channels, input_dim_1, input_dim_2).
            outputs: Array of layer outputs calculated in forward pass of
                shape
                (batch_size, num_output_channels, output_dim_1, output_dim_2).
            grads_wrt_outputs: Array of gradients with respect to the layer
                outputs of shape
                (batch_size, num_output_channels, output_dim_1, output_dim_2).

        Returns:
            Array of gradients with respect to the layer inputs of shape
            (batch_size, num_input_channels, input_dim_1, input_dim_2).
        """
        # Pad the grads_wrt_outputs
        raise NotImplementedError

    def grads_wrt_params(self, inputs, grads_wrt_outputs):
        """Calculates gradients with respect to layer parameters.

        Args:
            inputs: array of inputs to layer of shape (batch_size, input_dim)
            grads_wrt_outputs: array of gradients with respect to the layer
                outputs of shape
                (batch_size, num_output_channels, output_dim_1, output_dim_2).

        Returns:
            list of arrays of gradients with respect to the layer parameters
            `[grads_wrt_kernels, grads_wrt_biases]`.
        """
        raise NotImplementedError

    def params_penalty(self):
        """Returns the parameter dependent penalty term for this layer.

        If no parameter-dependent penalty terms are set this returns zero.
        """
        params_penalty = 0
        if self.kernels_penalty is not None:
            params_penalty += self.kernels_penalty(self.kernels)
        if self.biases_penalty is not None:
            params_penalty += self.biases_penalty(self.biases)
        return params_penalty

    @property
    def params(self):
        """A list of layer parameter values: `[kernels, biases]`."""
        return [self.kernels, self.biases]

    @params.setter
    def params(self, values):
        self.kernels = values[0]
        self.biases = values[1]

    def __repr__(self):
        return (
            'ConvolutionalLayer(\n'
            '    num_input_channels={0}, num_output_channels={1},\n'
            '    input_dim_1={2}, input_dim_2={3},\n'
            '    kernel_dim_1={4}, kernel_dim_2={5}\n'
            ')'
            .format(self.num_input_channels, self.num_output_channels,
                    self.input_dim_1, self.input_dim_2, self.kernel_dim_1,
                    self.kernel_dim_2)
        )

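As a point of reference, a direct (slow but easy to verify against the notebook tests) way to realise the fprop contract above, using scipy.signal.convolve2d for a 'valid', unit-stride true convolution. This is a sketch, not the committed implementation; the function name is illustrative.

import numpy as np
from scipy.signal import convolve2d

def conv_fprop(inputs, kernels, biases):
    batch_size = inputs.shape[0]
    num_out, num_in, k1, k2 = kernels.shape
    out_dim_1 = inputs.shape[2] - k1 + 1
    out_dim_2 = inputs.shape[3] - k2 + 1
    outputs = np.empty((batch_size, num_out, out_dim_1, out_dim_2))
    for b in range(batch_size):
        for o in range(num_out):
            acc = np.zeros((out_dim_1, out_dim_2))
            for i in range(num_in):
                # convolve2d flips the kernel, i.e. a true convolution
                acc += convolve2d(inputs[b, i], kernels[o, i], mode='valid')
            outputs[b, o] = acc + biases[o]
    return outputs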
class ReluLayer(Layer):
    """Layer implementing an element-wise rectified linear transformation."""

@@ -339,6 +641,102 @@ class ReluLayer(Layer):
    def __repr__(self):
        return 'ReluLayer'


class LeakyReluLayer(Layer):
    """Layer implementing an element-wise leaky rectified linear transformation."""

    def __init__(self, alpha=0.01):
        self.alpha = alpha

    def fprop(self, inputs):
        """Forward propagates activations through the layer transformation.

        For inputs `x` and outputs `y` this corresponds to
        `y = max(alpha * x, x)` for `alpha` in (0, 1).
        """
        positive_inputs = np.maximum(inputs, 0.)

        negative_inputs = np.copy(inputs)
        negative_inputs[negative_inputs > 0] = 0.
        negative_inputs = negative_inputs * self.alpha

        outputs = positive_inputs + negative_inputs
        return outputs

    def bprop(self, inputs, outputs, grads_wrt_outputs):
        """Back propagates gradients through a layer.

        Given gradients with respect to the outputs of the layer calculates the
        gradients with respect to the layer inputs.
        """
        positive_gradients = (outputs > 0) * grads_wrt_outputs
        negative_gradients = self.alpha * (outputs < 0) * grads_wrt_outputs
        gradients = positive_gradients + negative_gradients
        return gradients

    def __repr__(self):
        return 'LeakyReluLayer'

class ELULayer(Layer):
    """Layer implementing an element-wise exponential linear unit (ELU) transformation."""

    def __init__(self, alpha=1.0):
        self.alpha = alpha

    def fprop(self, inputs):
        """Forward propagates activations through the layer transformation.

        For inputs `x` and outputs `y` this corresponds to `y = x` for
        `x > 0` and `y = alpha * (exp(x) - 1)` otherwise.
        """
        positive_inputs = np.maximum(inputs, 0.)

        negative_inputs = np.copy(inputs)
        negative_inputs[negative_inputs > 0] = 0.
        negative_inputs = self.alpha * (np.exp(negative_inputs) - 1)

        outputs = positive_inputs + negative_inputs
        return outputs

    def bprop(self, inputs, outputs, grads_wrt_outputs):
        """Back propagates gradients through a layer.

        Given gradients with respect to the outputs of the layer calculates the
        gradients with respect to the layer inputs. For negative inputs the
        local gradient is `alpha * exp(x)`, which equals `outputs + alpha`.
        """
        positive_gradients = (outputs >= 0) * grads_wrt_outputs
        outputs_to_use = (outputs < 0) * outputs
        negative_gradients = (outputs_to_use + self.alpha)
        negative_gradients[outputs >= 0] = 0.
        negative_gradients = negative_gradients * grads_wrt_outputs
        gradients = positive_gradients + negative_gradients
        return gradients

    def __repr__(self):
        return 'ELULayer'

class SELULayer(Layer):
    """Layer implementing an element-wise scaled exponential linear unit (SELU) transformation."""
    # fixed-point constants: alpha_01 ≈ 1.6733 and lambda_01 ≈ 1.0507

    def __init__(self):
        self.alpha = 1.6733
        self.lamda = 1.0507
        self.elu = ELULayer(alpha=self.alpha)

    def fprop(self, inputs):
        """Forward propagates activations through the layer transformation.

        For inputs `x` and outputs `y` this corresponds to
        `y = lambda * elu(x)` with the fixed SELU constants above.
        """
        outputs = self.lamda * self.elu.fprop(inputs)
        return outputs

    def bprop(self, inputs, outputs, grads_wrt_outputs):
        """Back propagates gradients through a layer.

        Given gradients with respect to the outputs of the layer calculates the
        gradients with respect to the layer inputs.
        """
        scaled_outputs = outputs / self.lamda
        gradients = self.lamda * self.elu.bprop(
            inputs=inputs, outputs=scaled_outputs,
            grads_wrt_outputs=grads_wrt_outputs)
        return gradients

    def __repr__(self):
        return 'SELULayer'

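Since the three activation layers above are element-wise, their bprop methods can be sanity-checked against a central finite-difference estimate. A quick sketch (step size and tolerance are arbitrary choices):

import numpy as np
from mlp.layers import LeakyReluLayer, ELULayer, SELULayer

rng = np.random.RandomState(0)
x = rng.normal(size=(5, 10))
eps = 1e-6
for layer in [LeakyReluLayer(), ELULayer(), SELULayer()]:
    y = layer.fprop(x)
    # with grads_wrt_outputs all ones, bprop returns the element-wise dy/dx
    g = layer.bprop(x, y, np.ones_like(x))
    num_g = (layer.fprop(x + eps) - layer.fprop(x - eps)) / (2 * eps)
    assert np.allclose(g, num_g, atol=1e-4), layer
    print(layer, 'bprop consistent with finite differences')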
class TanhLayer(Layer):
    """Layer implementing an element-wise hyperbolic tangent transformation."""

@@ -482,4 +880,123 @@ class RadialBasisFunctionLayer(Layer):
        ).sum(-1)

    def __repr__(self):
        return 'RadialBasisFunctionLayer(grid_dim={0})'.format(self.grid_dim)

class DropoutLayer(StochasticLayer):
    """Layer which stochastically drops input dimensions in its output."""

    def __init__(self, rng=None, incl_prob=0.5, share_across_batch=True):
        """Construct a new dropout layer.

        Args:
            rng (RandomState): Seeded random number generator.
            incl_prob: Scalar value in (0, 1] specifying the probability of
                each input dimension being included in the output.
            share_across_batch: Whether to use same dropout mask across
                all inputs in a batch or use per input masks.
        """
        super(DropoutLayer, self).__init__(rng)
        assert incl_prob > 0. and incl_prob <= 1.
        self.incl_prob = incl_prob
        self.share_across_batch = share_across_batch

    def fprop(self, inputs, stochastic=True):
        """Forward propagates activations through the layer transformation.

        Args:
            inputs: Array of layer inputs of shape (batch_size, input_dim).
            stochastic: Flag allowing different deterministic
                forward-propagation mode in addition to default stochastic
                forward-propagation e.g. for use at test time. If False
                a deterministic forward-propagation transformation
                corresponding to the expected output of the stochastic
                forward-propagation is applied.

        Returns:
            outputs: Array of layer outputs of shape (batch_size, output_dim).
        """
        if stochastic:
            mask_shape = (1,) + inputs.shape[1:] if self.share_across_batch else inputs.shape
            self._mask = (self.rng.uniform(size=mask_shape) < self.incl_prob)
            return inputs * self._mask
        else:
            # deterministic mode: scale by the expected inclusion probability
            return inputs * self.incl_prob

    def bprop(self, inputs, outputs, grads_wrt_outputs):
        """Back propagates gradients through a layer.

        Given gradients with respect to the outputs of the layer calculates the
        gradients with respect to the layer inputs. This should correspond to
        default stochastic forward-propagation.

        Args:
            inputs: Array of layer inputs of shape (batch_size, input_dim).
            outputs: Array of layer outputs calculated in forward pass of
                shape (batch_size, output_dim).
            grads_wrt_outputs: Array of gradients with respect to the layer
                outputs of shape (batch_size, output_dim).

        Returns:
            Array of gradients with respect to the layer inputs of shape
            (batch_size, input_dim).
        """
        return grads_wrt_outputs * self._mask

    def __repr__(self):
        return 'DropoutLayer(incl_prob={0:.1f})'.format(self.incl_prob)

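The deterministic branch scales inputs by incl_prob because that is the expected value of the random mask. A quick empirical check (the sample count is an arbitrary choice):

import numpy as np
from mlp.layers import DropoutLayer

layer = DropoutLayer(rng=np.random.RandomState(0), incl_prob=0.5)
x = np.ones((1, 4))
samples = np.mean([layer.fprop(x) for _ in range(10000)], axis=0)
print(samples)                           # approximately 0.5 everywhere
print(layer.fprop(x, stochastic=False))  # exactly 0.5 everywhere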
class ReshapeLayer(Layer):
    """Layer which reshapes dimensions of inputs."""

    def __init__(self, output_shape=None):
        """Create a new reshape layer object.

        Args:
            output_shape: Tuple specifying shape each input in batch should
                be reshaped to in outputs. This **excludes** the batch size
                so the shape of the final output array will be
                    (batch_size, ) + output_shape
                Similarly to numpy.reshape, one shape dimension can be -1. In
                this case, the value is inferred from the size of the input
                array and remaining dimensions. The shape specified must be
                compatible with the input array shape - i.e. the total number
                of values in the array cannot be changed. If set to `None` the
                output shape will be set to
                    (batch_size, -1)
                which will flatten all the inputs to vectors.
        """
        self.output_shape = (-1,) if output_shape is None else output_shape

    def fprop(self, inputs):
        """Forward propagates activations through the layer transformation.

        Args:
            inputs: Array of layer inputs of shape (batch_size, input_dim).

        Returns:
            outputs: Array of layer outputs of shape (batch_size, output_dim).
        """
        return inputs.reshape((inputs.shape[0],) + self.output_shape)

    def bprop(self, inputs, outputs, grads_wrt_outputs):
        """Back propagates gradients through a layer.

        Given gradients with respect to the outputs of the layer calculates the
        gradients with respect to the layer inputs.

        Args:
            inputs: Array of layer inputs of shape (batch_size, input_dim).
            outputs: Array of layer outputs calculated in forward pass of
                shape (batch_size, output_dim).
            grads_wrt_outputs: Array of gradients with respect to the layer
                outputs of shape (batch_size, output_dim).

        Returns:
            Array of gradients with respect to the layer inputs of shape
            (batch_size, input_dim).
        """
        return grads_wrt_outputs.reshape(inputs.shape)

    def __repr__(self):
        return 'ReshapeLayer(output_shape={0})'.format(self.output_shape)
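As an illustration of where ReshapeLayer fits, it can turn the flat 784-dimensional EMNIST vectors back into 1x28x28 image stacks ahead of a convolutional layer. Dimensions here are illustrative, and ConvolutionalLayer.fprop is still to be implemented above:

from mlp.layers import ReshapeLayer, ConvolutionalLayer

model_front = [
    ReshapeLayer(output_shape=(1, 28, 28)),  # (batch, 784) -> (batch, 1, 28, 28)
    ConvolutionalLayer(num_input_channels=1, num_output_channels=5,
                       input_dim_1=28, input_dim_2=28,
                       kernel_dim_1=5, kernel_dim_2=5),
]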
@@ -8,7 +8,7 @@ outputs (and intermediate states) and for calculating gradients of scalar
 functions of the outputs with respect to the model parameters.
 """

-from mlp.layers import LayerWithParameters
+from mlp.layers import LayerWithParameters, StochasticLayer, StochasticLayerWithParameters


 class SingleLayerModel(object):

@@ -60,7 +60,7 @@ class SingleLayerModel(object):
         return self.layer.grads_wrt_params(activations[0], grads_wrt_outputs)

     def __repr__(self):
-        return 'SingleLayerModel(' + str(layer) + ')'
+        return 'SingleLayerModel(' + str(self.layer) + ')'


 class MultipleLayerModel(object):

@@ -84,7 +84,7 @@ class MultipleLayerModel(object):
             params += layer.params
         return params

-    def fprop(self, inputs):
+    def fprop(self, inputs, evaluation=False):
         """Forward propagates a batch of inputs through the model.

         Args:
@@ -97,7 +97,19 @@ class MultipleLayerModel(object):
         """
         activations = [inputs]
         for i, layer in enumerate(self.layers):
-            activations.append(self.layers[i].fprop(activations[i]))
+            if isinstance(layer, (StochasticLayer, StochasticLayerWithParameters)):
+                # stochastic layers run deterministically when evaluating
+                current_activations = layer.fprop(
+                    activations[i], stochastic=not evaluation)
+            else:
+                current_activations = layer.fprop(activations[i])
+            activations.append(current_activations)
         return activations

     def grads_wrt_params(self, activations, grads_wrt_outputs):
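A sketch of how the new evaluation flag is meant to be used at the call site (model and inputs_batch are assumed to exist):

# training time: stochastic layers (e.g. DropoutLayer) sample their masks
activations = model.fprop(inputs_batch)
# evaluation time: stochastic layers use their deterministic mode instead
eval_activations = model.fprop(inputs_batch, evaluation=True)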
@@ -9,7 +9,7 @@ import time
 import logging
 from collections import OrderedDict
 import numpy as np
+import tqdm

 logger = logging.getLogger(__name__)

@@ -18,7 +18,7 @@ class Optimiser(object):
     """Basic model optimiser."""

     def __init__(self, model, error, learning_rule, train_dataset,
-                 valid_dataset=None, data_monitors=None):
+                 valid_dataset=None, data_monitors=None, notebook=False):
         """Create a new optimiser instance.

         Args:
@@ -43,6 +43,11 @@ class Optimiser(object):
         self.data_monitors = OrderedDict([('error', error)])
         if data_monitors is not None:
             self.data_monitors.update(data_monitors)
+        self.notebook = notebook
+        if notebook:
+            self.tqdm_progress = tqdm.tqdm_notebook
+        else:
+            self.tqdm_progress = tqdm.tqdm

     def do_training_epoch(self):
         """Do a single training epoch.

@@ -52,12 +57,15 @@ class Optimiser(object):
         respect to all the model parameters and then updates the model
         parameters according to the learning rule.
         """
-        for inputs_batch, targets_batch in self.train_dataset:
-            activations = self.model.fprop(inputs_batch)
-            grads_wrt_outputs = self.error.grad(activations[-1], targets_batch)
-            grads_wrt_params = self.model.grads_wrt_params(
-                activations, grads_wrt_outputs)
-            self.learning_rule.update_params(grads_wrt_params)
+        with self.tqdm_progress(total=self.train_dataset.num_batches) as train_progress_bar:
+            train_progress_bar.set_description("Epoch Progress")
+            for inputs_batch, targets_batch in self.train_dataset:
+                activations = self.model.fprop(inputs_batch)
+                grads_wrt_outputs = self.error.grad(activations[-1], targets_batch)
+                grads_wrt_params = self.model.grads_wrt_params(
+                    activations, grads_wrt_outputs)
+                self.learning_rule.update_params(grads_wrt_params)
+                train_progress_bar.update(1)

     def eval_monitors(self, dataset, label):
         """Evaluates the monitors for the given dataset.

@@ -121,17 +129,20 @@ class Optimiser(object):
         and the second being a dict mapping the labels for the statistics
         recorded to their column index in the array.
         """
-        start_train_time = time.clock()
+        start_train_time = time.time()
         run_stats = [list(self.get_epoch_stats().values())]
-        for epoch in range(1, num_epochs + 1):
-            start_time = time.clock()
-            self.do_training_epoch()
-            epoch_time = time.clock() - start_time
-            if epoch % stats_interval == 0:
-                stats = self.get_epoch_stats()
-                self.log_stats(epoch, epoch_time, stats)
-                run_stats.append(list(stats.values()))
-        finish_train_time = time.clock()
+        with self.tqdm_progress(total=num_epochs) as progress_bar:
+            progress_bar.set_description("Experiment Progress")
+            for epoch in range(1, num_epochs + 1):
+                start_time = time.time()
+                self.do_training_epoch()
+                epoch_time = time.time() - start_time
+                if epoch % stats_interval == 0:
+                    stats = self.get_epoch_stats()
+                    self.log_stats(epoch, epoch_time, stats)
+                    run_stats.append(list(stats.values()))
+                progress_bar.update(1)
+        finish_train_time = time.time()
         total_train_time = finish_train_time - start_train_time
         return np.array(run_stats), {k: i for i, k in enumerate(stats.keys())}, total_train_time

File diff suppressed because one or more lines are too long
155  notebooks/BatchNormalizationLayer_tests.ipynb  Normal file
@@ -0,0 +1,155 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from mlp.layers import BatchNormalizationLayer\n",
    "test_inputs = np.array([[-1.38066782, -0.94725498, -3.05585424,  2.28644454,  0.85520889,\n",
    "         0.10575624,  0.23618609,  0.84723205,  1.06569909, -2.21704034],\n",
    "       [ 0.11060968, -0.0747448 ,  0.56809029,  2.45926149, -2.28677816,\n",
    "        -0.9964566 ,  2.7356007 ,  1.98002308, -0.39032315,  1.46515481]])\n",
    "test_grads_wrt_outputs = np.array([[-0.43857052,  1.00380109, -1.18425494,  0.00486091,  0.21470207,\n",
    "        -0.12179054, -0.11508482,  0.738482  , -1.17249238,  0.69188295],\n",
    "       [ 1.07802015,  0.69901145,  0.81603688, -1.76743026, -1.24418692,\n",
    "        -0.65729963, -0.50834305, -0.49016145,  1.63749743, -0.71123104]])\n",
    "\n",
    "# produce BatchNorm fprop and bprop\n",
    "activation_layer = BatchNormalizationLayer(input_dim=10)\n",
    "\n",
    "beta = np.array(10*[0.3])\n",
    "gamma = np.array(10*[0.5])\n",
    "\n",
    "activation_layer.params = [gamma, beta]\n",
    "BN_fprop = activation_layer.fprop(test_inputs)\n",
    "BN_bprop = activation_layer.bprop(\n",
    "    test_inputs, BN_fprop, test_grads_wrt_outputs)\n",
    "BN_grads_wrt_params = activation_layer.grads_wrt_params(\n",
    "    test_inputs, test_grads_wrt_outputs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "true_fprop_outputs = np.array([[-0.1999955 , -0.19998686, -0.19999924, -0.1996655 ,  0.79999899,\n",
    "         0.79999177, -0.1999984 , -0.19999221,  0.79999528, -0.19999926],\n",
    "       [ 0.7999955 ,  0.79998686,  0.79999924,  0.7996655 , -0.19999899,\n",
    "        -0.19999177,  0.7999984 ,  0.79999221, -0.19999528,  0.79999926]])\n",
    "shape_test = BN_fprop.shape == true_fprop_outputs.shape\n",
    "assert shape_test, (\n",
    "    'Layer fprop returns incorrect shaped array. '\n",
    "    'Correct shape is \\n\\n{0}\\n\\n but returned shape is \\n\\n{1}.'\n",
    "    .format(true_fprop_outputs.shape, BN_fprop.shape)\n",
    ")\n",
    "numerical_test = np.allclose(np.round(BN_fprop, decimals=2), np.round(true_fprop_outputs, decimals=2))\n",
    "assert numerical_test, (\n",
    "    'Layer fprop does not return correct values. '\n",
    "    'Correct output is \\n\\n{0}\\n\\n but returned output is \\n\\n{1}\\n\\n difference is \\n\\n{2}'\n",
    "    .format(true_fprop_outputs, BN_fprop, BN_fprop-true_fprop_outputs)\n",
    ")\n",
    "\n",
    "if shape_test and numerical_test:\n",
    "    print(\"Batch Normalization fprop test passed\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "true_bprop_outputs = np.array([[ -9.14558020e-06,   9.17665617e-06,  -8.40575535e-07,\n",
    "          6.85384297e-03,   9.40668131e-07,   7.99795574e-06,\n",
    "          5.03719464e-07,   1.69038704e-05,  -1.82061629e-05,\n",
    "          5.62083224e-07],\n",
    "       [  9.14558020e-06,  -9.17665617e-06,   8.40575535e-07,\n",
    "         -6.85384297e-03,  -9.40668131e-07,  -7.99795574e-06,\n",
    "         -5.03719464e-07,  -1.69038704e-05,   1.82061629e-05,\n",
    "         -5.62083224e-07]])\n",
    "shape_test = BN_bprop.shape == true_bprop_outputs.shape\n",
    "assert shape_test, (\n",
    "    'Layer bprop returns incorrect shaped array. '\n",
    "    'Correct shape is \\n\\n{0}\\n\\n but returned shape is \\n\\n{1}.'\n",
    "    .format(true_bprop_outputs.shape, BN_bprop.shape)\n",
    ")\n",
    "numerical_test = np.allclose(np.round(BN_bprop, decimals=2), np.round(true_bprop_outputs, decimals=2))\n",
    "assert numerical_test, (\n",
    "    'Layer bprop does not return correct values. '\n",
    "    'Correct output is \\n\\n{0}\\n\\n but returned output is \\n\\n{1}\\n\\n difference is \\n\\n{2}'\n",
    "    .format(true_bprop_outputs, BN_bprop, BN_bprop-true_bprop_outputs)\n",
    ")\n",
    "\n",
    "if shape_test and numerical_test:\n",
    "    print(\"Batch Normalization bprop test passed\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "grads_wrt_gamma, grads_wrt_beta = BN_grads_wrt_params\n",
    "true_grads_wrt_gamma = np.array([ 1.51657703, -0.30478163,  2.00028878, -1.77110552,  1.45888603,\n",
    "        0.53550028, -0.39325697, -1.2286243 , -2.8099633 , -1.40311192])\n",
    "true_grads_wrt_beta = np.array([ 0.63944963,  1.70281254, -0.36821806, -1.76256935, -1.02948485,\n",
    "       -0.77909018, -0.62342786,  0.24832055,  0.46500505, -0.01934809])\n",
    "\n",
    "grads_gamma_shape_test = grads_wrt_gamma.shape == true_grads_wrt_gamma.shape\n",
    "assert grads_gamma_shape_test, (\n",
    "    'grads_wrt_params returns incorrect shaped gamma gradients array. '\n",
    "    'Correct shape is \\n\\n{0}\\n\\n but returned shape is \\n\\n{1}.'\n",
    "    .format(true_grads_wrt_gamma.shape, grads_wrt_gamma.shape)\n",
    ")\n",
    "grads_gamma_numerical_test = np.allclose(np.round(grads_wrt_gamma, decimals=2), np.round(true_grads_wrt_gamma, decimals=2))\n",
    "assert grads_gamma_numerical_test, (\n",
    "    'grads_wrt_params does not return correct gamma gradients. '\n",
    "    'Correct output is \\n\\n{0}\\n\\n but returned output is \\n\\n{1}\\n\\n difference is \\n\\n{2}'\n",
    "    .format(true_grads_wrt_gamma, grads_wrt_gamma, grads_wrt_gamma-true_grads_wrt_gamma)\n",
    ")\n",
    "\n",
    "grads_beta_shape_test = grads_wrt_beta.shape == true_grads_wrt_beta.shape\n",
    "assert grads_beta_shape_test, (\n",
    "    'grads_wrt_params returns incorrect shaped beta gradients array. '\n",
    "    'Correct shape is \\n\\n{0}\\n\\n but returned shape is \\n\\n{1}.'\n",
    "    .format(true_grads_wrt_beta.shape, grads_wrt_beta.shape)\n",
    ")\n",
    "grads_beta_numerical_test = np.allclose(np.round(grads_wrt_beta, decimals=2), np.round(true_grads_wrt_beta, decimals=2))\n",
    "assert grads_beta_numerical_test, (\n",
    "    'grads_wrt_params does not return correct beta gradients. '\n",
    "    'Correct output is \\n\\n{0}\\n\\n but returned output is \\n\\n{1}\\n\\n difference is \\n\\n{2}'\n",
    "    .format(true_grads_wrt_beta, grads_wrt_beta, grads_wrt_beta-true_grads_wrt_beta)\n",
    ")\n",
    "\n",
    "if grads_gamma_shape_test and grads_gamma_numerical_test and grads_beta_shape_test and grads_beta_numerical_test:\n",
    "    print(\"Batch Normalization grads wrt to params test passed\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
307  notebooks/Convolutional layer tests.ipynb  Normal file
@@ -0,0 +1,307 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Below, a skeleton class and associated test functions for the `fprop`, `bprop` and `grads_wrt_params` methods of the ConvolutionalLayer class are included.\n",
    "\n",
    "The test functions assume that in your implementation of `fprop` for the convolutional layer, outputs are calculated only for 'valid' overlaps of the kernel filters with the input - i.e. without any padding.\n",
    "\n",
    "It is also assumed that if convolutions with non-unit strides are implemented the default behaviour is to take unit strides, with the test cases only correct for unit strides in both directions."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The three test functions are defined in the cell below. All the functions take as first argument the *class* corresponding to the convolutional layer implementation to be tested (**not** an instance of the class). It is assumed the class being tested has an `__init__` method with at least all of the arguments defined in the skeleton definition above. A boolean second argument to each function can be used to specify if the layer implements a cross-correlation or convolution based operation (see note in [seventh lecture slides](http://www.inf.ed.ac.uk/teaching/courses/mlp/2016/mlp07-cnn.pdf))."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "def test_conv_layer_fprop(layer_class, do_cross_correlation=False):\n",
    "    \"\"\"Tests `fprop` method of a convolutional layer.\n",
    "    \n",
    "    Checks the outputs of `fprop` method for a fixed input against known\n",
    "    reference values for the outputs and raises an AssertionError if\n",
    "    the outputted values are not consistent with the reference values. If\n",
    "    tests are all passed returns True.\n",
    "    \n",
    "    Args:\n",
    "        layer_class: Convolutional layer implementation following the\n",
    "            interface defined in the provided skeleton class.\n",
    "        do_cross_correlation: Whether the layer implements an operation\n",
    "            corresponding to cross-correlation (True) i.e. kernels are\n",
    "            not flipped before sliding over inputs, or convolution\n",
    "            (False) with filters being flipped.\n",
    "\n",
    "    Raises:\n",
    "        AssertionError: Raised if output of `layer.fprop` is inconsistent\n",
    "            with reference values either in shape or values.\n",
    "    \"\"\"\n",
    "    inputs = np.arange(96).reshape((2, 3, 4, 4))\n",
    "    kernels = np.arange(-12, 12).reshape((2, 3, 2, 2))\n",
    "    if do_cross_correlation:\n",
    "        kernels = kernels[:, :, ::-1, ::-1]\n",
    "    biases = np.arange(2)\n",
    "    true_output = np.array(\n",
    "        [[[[ -958., -1036., -1114.],\n",
    "           [-1270., -1348., -1426.],\n",
    "           [-1582., -1660., -1738.]],\n",
    "          [[ 1707.,  1773.,  1839.],\n",
    "           [ 1971.,  2037.,  2103.],\n",
    "           [ 2235.,  2301.,  2367.]]],\n",
    "         [[[-4702., -4780., -4858.],\n",
    "           [-5014., -5092., -5170.],\n",
    "           [-5326., -5404., -5482.]],\n",
    "          [[ 4875.,  4941.,  5007.],\n",
    "           [ 5139.,  5205.,  5271.],\n",
    "           [ 5403.,  5469.,  5535.]]]]\n",
    "    )\n",
    "    \n",
    "    layer = layer_class(\n",
    "        num_input_channels=kernels.shape[1], \n",
    "        num_output_channels=kernels.shape[0], \n",
    "        input_dim_1=inputs.shape[2], \n",
    "        input_dim_2=inputs.shape[3],\n",
    "        kernel_dim_1=kernels.shape[2],\n",
    "        kernel_dim_2=kernels.shape[3]\n",
    "    )\n",
    "    layer.params = [kernels, biases]\n",
    "    layer_output = layer.fprop(inputs)\n",
    "    \n",
    "    assert layer_output.shape == true_output.shape, (\n",
    "        'Layer fprop gives incorrect shaped output. '\n",
    "        'Correct shape is \\n\\n{0}\\n\\n but returned shape is \\n\\n{1}.'\n",
    "        .format(true_output.shape, layer_output.shape)\n",
    "    )\n",
    "    assert np.allclose(layer_output, true_output), (\n",
    "        'Layer fprop does not give correct output. '\n",
    "        'Correct output is \\n\\n{0}\\n\\n but returned output is \\n\\n{1}\\n\\n difference is \\n\\n{2}.'\n",
    "        .format(true_output, layer_output, true_output-layer_output)\n",
    "    )\n",
    "    return True\n",
    "\n",
    "def test_conv_layer_bprop(layer_class, do_cross_correlation=False):\n",
    "    \"\"\"Tests `bprop` method of a convolutional layer.\n",
    "    \n",
    "    Checks the outputs of `bprop` method for a fixed input against known\n",
    "    reference values for the gradients with respect to inputs and raises\n",
    "    an AssertionError if the returned values are not consistent with the\n",
    "    reference values. If tests are all passed returns True.\n",
    "    \n",
    "    Args:\n",
    "        layer_class: Convolutional layer implementation following the\n",
    "            interface defined in the provided skeleton class.\n",
    "        do_cross_correlation: Whether the layer implements an operation\n",
    "            corresponding to cross-correlation (True) i.e. kernels are\n",
    "            not flipped before sliding over inputs, or convolution\n",
    "            (False) with filters being flipped.\n",
    "\n",
    "    Raises:\n",
    "        AssertionError: Raised if output of `layer.bprop` is inconsistent\n",
    "            with reference values either in shape or values.\n",
    "    \"\"\"\n",
    "    inputs = np.arange(96).reshape((2, 3, 4, 4))\n",
    "    kernels = np.arange(-12, 12).reshape((2, 3, 2, 2))\n",
    "    if do_cross_correlation:\n",
    "        kernels = kernels[:, :, ::-1, ::-1]\n",
    "    biases = np.arange(2)\n",
    "    grads_wrt_outputs = np.arange(-20, 16).reshape((2, 2, 3, 3))\n",
    "    outputs = np.array(\n",
    "        [[[[ -958., -1036., -1114.],\n",
    "           [-1270., -1348., -1426.],\n",
    "           [-1582., -1660., -1738.]],\n",
    "          [[ 1707.,  1773.,  1839.],\n",
    "           [ 1971.,  2037.,  2103.],\n",
    "           [ 2235.,  2301.,  2367.]]],\n",
    "         [[[-4702., -4780., -4858.],\n",
    "           [-5014., -5092., -5170.],\n",
    "           [-5326., -5404., -5482.]],\n",
    "          [[ 4875.,  4941.,  5007.],\n",
    "           [ 5139.,  5205.,  5271.],\n",
    "           [ 5403.,  5469.,  5535.]]]]\n",
    "    )\n",
    "    true_grads_wrt_inputs = np.array(\n",
    "      [[[[ 147.,  319.,  305.,  162.],\n",
    "         [ 338.,  716.,  680.,  354.],\n",
    "         [ 290.,  608.,  572.,  294.],\n",
    "         [ 149.,  307.,  285.,  144.]],\n",
    "        [[  23.,   79.,   81.,   54.],\n",
    "         [ 114.,  284.,  280.,  162.],\n",
    "         [ 114.,  272.,  268.,  150.],\n",
    "         [  73.,  163.,  157.,   84.]],\n",
    "        [[-101., -161., -143.,  -54.],\n",
    "         [-110., -148., -120.,  -30.],\n",
    "         [ -62.,  -64.,  -36.,    6.],\n",
    "         [  -3.,   19.,   29.,   24.]]],\n",
    "       [[[  39.,   67.,   53.,   18.],\n",
    "         [  50.,   68.,   32.,   -6.],\n",
    "         [   2.,  -40.,  -76.,  -66.],\n",
    "         [ -31.,  -89., -111.,  -72.]],\n",
    "        [[  59.,  115.,  117.,   54.],\n",
    "         [ 114.,  212.,  208.,   90.],\n",
    "         [ 114.,  200.,  196.,   78.],\n",
    "         [  37.,   55.,   49.,   12.]],\n",
    "        [[  79.,  163.,  181.,   90.],\n",
    "         [ 178.,  356.,  384.,  186.],\n",
    "         [ 226.,  440.,  468.,  222.],\n",
    "         [ 105.,  199.,  209.,   96.]]]])\n",
    "    layer = layer_class(\n",
    "        num_input_channels=kernels.shape[1], \n",
    "        num_output_channels=kernels.shape[0], \n",
    "        input_dim_1=inputs.shape[2], \n",
    "        input_dim_2=inputs.shape[3],\n",
    "        kernel_dim_1=kernels.shape[2],\n",
    "        kernel_dim_2=kernels.shape[3]\n",
    "    )\n",
    "    layer.params = [kernels, biases]\n",
    "    layer_grads_wrt_inputs = layer.bprop(inputs, outputs, grads_wrt_outputs)\n",
    "    assert layer_grads_wrt_inputs.shape == true_grads_wrt_inputs.shape, (\n",
    "        'Layer bprop returns incorrect shaped array. '\n",
    "        'Correct shape is \\n\\n{0}\\n\\n but returned shape is \\n\\n{1}.'\n",
    "        .format(true_grads_wrt_inputs.shape, layer_grads_wrt_inputs.shape)\n",
    "    )\n",
    "    assert np.allclose(layer_grads_wrt_inputs, true_grads_wrt_inputs), (\n",
    "        'Layer bprop does not return correct values. '\n",
    "        'Correct output is \\n\\n{0}\\n\\n but returned output is \\n\\n{1}\\n\\n difference is \\n\\n{2}'\n",
    "        .format(true_grads_wrt_inputs, layer_grads_wrt_inputs, layer_grads_wrt_inputs-true_grads_wrt_inputs)\n",
    "    )\n",
    "    return True\n",
    "\n",
    "def test_conv_layer_grad_wrt_params(\n",
    "        layer_class, do_cross_correlation=False):\n",
    "    \"\"\"Tests `grad_wrt_params` method of a convolutional layer.\n",
    "    \n",
    "    Checks the outputs of `grad_wrt_params` method for fixed inputs\n",
    "    against known reference values for the gradients with respect to\n",
    "    kernels and biases, and raises an AssertionError if the returned\n",
    "    values are not consistent with the reference values. If tests\n",
    "    are all passed returns True.\n",
    "    \n",
    "    Args:\n",
    "        layer_class: Convolutional layer implementation following the\n",
    "            interface defined in the provided skeleton class.\n",
    "        do_cross_correlation: Whether the layer implements an operation\n",
    "            corresponding to cross-correlation (True) i.e. kernels are\n",
    "            not flipped before sliding over inputs, or convolution\n",
    "            (False) with filters being flipped.\n",
    "\n",
    "    Raises:\n",
    "        AssertionError: Raised if output of `layer.grads_wrt_params` is\n",
    "            inconsistent with reference values either in shape or values.\n",
    "    \"\"\"\n",
    "    inputs = np.arange(96).reshape((2, 3, 4, 4))\n",
    "    kernels = np.arange(-12, 12).reshape((2, 3, 2, 2))\n",
    "    biases = np.arange(2)\n",
    "    grads_wrt_outputs = np.arange(-20, 16).reshape((2, 2, 3, 3))\n",
    "    true_kernel_grads = np.array(\n",
    "      [[[[ -240.,  -114.],\n",
    "         [  264.,   390.]],\n",
    "        [[-2256., -2130.],\n",
    "         [-1752., -1626.]],\n",
    "        [[-4272., -4146.],\n",
    "         [-3768., -3642.]]],\n",
    "       [[[ 5268.,  5232.],\n",
    "         [ 5124.,  5088.]],\n",
    "        [[ 5844.,  5808.],\n",
    "         [ 5700.,  5664.]],\n",
    "        [[ 6420.,  6384.],\n",
    "         [ 6276.,  6240.]]]])\n",
    "    if do_cross_correlation:\n",
    "        kernels = kernels[:, :, ::-1, ::-1]\n",
    "        true_kernel_grads = true_kernel_grads[:, :, ::-1, ::-1]\n",
    "    true_bias_grads = np.array([-126.,   36.])\n",
    "    layer = layer_class(\n",
    "        num_input_channels=kernels.shape[1], \n",
    "        num_output_channels=kernels.shape[0], \n",
    "        input_dim_1=inputs.shape[2], \n",
    "        input_dim_2=inputs.shape[3],\n",
    "        kernel_dim_1=kernels.shape[2],\n",
    "        kernel_dim_2=kernels.shape[3]\n",
    "    )\n",
    "    layer.params = [kernels, biases]\n",
    "    layer_kernel_grads, layer_bias_grads = (\n",
    "        layer.grads_wrt_params(inputs, grads_wrt_outputs))\n",
    "    assert layer_kernel_grads.shape == true_kernel_grads.shape, (\n",
    "        'grads_wrt_params gives incorrect shaped kernel gradients output. '\n",
    "        'Correct shape is \\n\\n{0}\\n\\n but returned shape is \\n\\n{1}.'\n",
    "        .format(true_kernel_grads.shape, layer_kernel_grads.shape)\n",
    "    )\n",
    "    assert np.allclose(layer_kernel_grads, true_kernel_grads), (\n",
    "        'grads_wrt_params does not give correct kernel gradients output. '\n",
    "        'Correct output is \\n\\n{0}\\n\\n but returned output is \\n\\n{1}.'\n",
    "        .format(true_kernel_grads, layer_kernel_grads)\n",
    "    )\n",
    "    assert layer_bias_grads.shape == true_bias_grads.shape, (\n",
    "        'grads_wrt_params gives incorrect shaped bias gradients output. '\n",
    "        'Correct shape is \\n\\n{0}\\n\\n but returned shape is \\n\\n{1}.'\n",
    "        .format(true_bias_grads.shape, layer_bias_grads.shape)\n",
    "    )\n",
    "    assert np.allclose(layer_bias_grads, true_bias_grads), (\n",
    "        'grads_wrt_params does not give correct bias gradients output. '\n",
    "        'Correct output is \\n\\n{0}\\n\\n but returned output is \\n\\n{1}.'\n",
    "        .format(true_bias_grads, layer_bias_grads)\n",
    "    )\n",
    "    return True"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "An example of using the test functions is given in the cell below. This assumes you implement a convolution (rather than cross-correlation) operation. If the implementation is correct, all three tests will pass and 'All tests passed.' will be printed."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from mlp.layers import ConvolutionalLayer\n",
    "fprop_correct = test_conv_layer_fprop(ConvolutionalLayer, False)\n",
    "bprop_correct = test_conv_layer_bprop(ConvolutionalLayer, False)\n",
    "grads_wrt_param_correct = test_conv_layer_grad_wrt_params(ConvolutionalLayer, False)\n",
    "if fprop_correct and grads_wrt_param_correct and bprop_correct:\n",
    "    print('All tests passed.')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
147  notebooks/Coursework_2.ipynb  Normal file
@ -0,0 +1,147 @@
|
||||
{
|
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Coursework 2\n",
    "\n",
    "This notebook is intended as a starting point for your experiments. The instructions can be found in spec/coursework2.pdf. The methods provided here are just helper functions; if you want more complex graphs, such as side-by-side comparisons of different experiments, you should learn more about matplotlib and implement them yourself. Before each experiment, remember to re-initialize the neural network weights and reset the data providers so that the run starts from a properly initialized state (a minimal reset cell is included after the model set-up below). For each experiment, try to keep all hyperparameters fixed except the one under investigation, so that you can isolate its effect."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "plt.style.use('ggplot')\n",
    "\n",
    "def train_model_and_plot_stats(\n",
    "        model, error, learning_rule, train_data, valid_data, num_epochs, stats_interval, notebook=True):\n",
    "\n",
    "    # As well as monitoring the error over training, also monitor classification\n",
    "    # accuracy, i.e. the proportion of most-probable predicted classes equal to the targets.\n",
    "    data_monitors = {'acc': lambda y, t: (y.argmax(-1) == t.argmax(-1)).mean()}\n",
    "\n",
    "    # Use the created objects to initialise a new Optimiser instance.\n",
    "    optimiser = Optimiser(\n",
    "        model, error, learning_rule, train_data, valid_data, data_monitors, notebook=notebook)\n",
    "\n",
    "    # Run the optimiser for num_epochs epochs (full passes through the training set),\n",
    "    # printing statistics every stats_interval epochs.\n",
    "    stats, keys, run_time = optimiser.train(num_epochs=num_epochs, stats_interval=stats_interval)\n",
    "\n",
    "    # Plot the change in the validation and training set error over training.\n",
    "    fig_1 = plt.figure(figsize=(8, 4))\n",
    "    ax_1 = fig_1.add_subplot(111)\n",
    "    for k in ['error(train)', 'error(valid)']:\n",
    "        ax_1.plot(np.arange(1, stats.shape[0]) * stats_interval,\n",
    "                  stats[1:, keys[k]], label=k)\n",
    "    ax_1.legend(loc=0)\n",
    "    ax_1.set_xlabel('Epoch number')\n",
    "\n",
    "    # Plot the change in the validation and training set accuracy over training.\n",
    "    fig_2 = plt.figure(figsize=(8, 4))\n",
    "    ax_2 = fig_2.add_subplot(111)\n",
    "    for k in ['acc(train)', 'acc(valid)']:\n",
    "        ax_2.plot(np.arange(1, stats.shape[0]) * stats_interval,\n",
    "                  stats[1:, keys[k]], label=k)\n",
    "    ax_2.legend(loc=0)\n",
    "    ax_2.set_xlabel('Epoch number')\n",
    "\n",
    "    return stats, keys, run_time, fig_1, ax_1, fig_2, ax_2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The code below sets up the data providers, random number\n",
    "# generator and logger objects needed for training runs. As\n",
    "# loading the data from file takes a little while, you generally\n",
    "# will not want to reload the data providers on every training\n",
    "# run. If you wish to reset their state you should instead use\n",
    "# the .reset() method of the data providers.\n",
    "import numpy as np\n",
    "import logging\n",
    "from mlp.data_providers import MNISTDataProvider, EMNISTDataProvider\n",
    "\n",
    "# Seed a random number generator\n",
    "seed = 10102016\n",
    "rng = np.random.RandomState(seed)\n",
    "batch_size = 100\n",
    "# Set up a logger object to print info about the training run to stdout\n",
    "logger = logging.getLogger()\n",
    "logger.setLevel(logging.INFO)\n",
    "logger.handlers = [logging.StreamHandler()]\n",
    "\n",
    "# Create data provider objects for the EMNIST data set\n",
    "train_data = EMNISTDataProvider('train', batch_size=batch_size, rng=rng)\n",
    "valid_data = EMNISTDataProvider('valid', batch_size=batch_size, rng=rng)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The model set-up code below is provided as a starting point.\n",
    "# You will probably want to add further code cells for the\n",
    "# different experiments you run.\n",
    "\n",
    "from mlp.layers import AffineLayer, SoftmaxLayer, SigmoidLayer, ReluLayer, LeakyReluLayer, ELULayer, SELULayer\n",
    "from mlp.errors import CrossEntropySoftmaxError\n",
    "from mlp.models import MultipleLayerModel\n",
    "from mlp.initialisers import ConstantInit, GlorotUniformInit\n",
    "from mlp.learning_rules import GradientDescentLearningRule\n",
    "from mlp.optimisers import Optimiser\n",
    "\n",
    "# Set up hyperparameters\n",
    "learning_rate = 0.1\n",
    "num_epochs = 100\n",
    "stats_interval = 1\n",
    "input_dim, output_dim, hidden_dim = 784, 47, 100\n",
    "\n",
    "weights_init = GlorotUniformInit(rng=rng)\n",
    "biases_init = ConstantInit(0.)\n",
    "model = MultipleLayerModel([\n",
    "    AffineLayer(input_dim, hidden_dim, weights_init, biases_init),\n",
    "    ReluLayer(),\n",
    "    AffineLayer(hidden_dim, hidden_dim, weights_init, biases_init),\n",
    "    ReluLayer(),\n",
    "    AffineLayer(hidden_dim, output_dim, weights_init, biases_init)\n",
    "])\n",
    "\n",
    "error = CrossEntropySoftmaxError()\n",
    "# Use a basic gradient descent learning rule\n",
    "learning_rule = GradientDescentLearningRule(learning_rate=learning_rate)\n",
    "\n",
    "# Remember to use notebook=False when you write a script to be run from a terminal\n",
    "_ = train_model_and_plot_stats(\n",
    "    model, error, learning_rule, train_data, valid_data, num_epochs, stats_interval, notebook=True)"
   ]
  },
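  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The cell below is a minimal sketch of the reset pattern mentioned in the introduction. It reuses the provider, initialiser and dimension objects defined above; adapt it to whatever architecture your own experiment uses."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Re-run this cell before starting a new experiment: it resets the data\n",
    "# providers to the start of an epoch and rebuilds the model so that the\n",
    "# weights are freshly initialized.\n",
    "train_data.reset()\n",
    "valid_data.reset()\n",
    "model = MultipleLayerModel([\n",
    "    AffineLayer(input_dim, hidden_dim, weights_init, biases_init),\n",
    "    ReluLayer(),\n",
    "    AffineLayer(hidden_dim, hidden_dim, weights_init, biases_init),\n",
    "    ReluLayer(),\n",
    "    AffineLayer(hidden_dim, output_dim, weights_init, biases_init)\n",
    "])"
   ]
  },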
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
79
report/algorithm.sty
Normal file
@@ -0,0 +1,79 @@
% ALGORITHM STYLE -- Released 8 April 1996
% for LaTeX-2e
% Copyright -- 1994 Peter Williams
% E-mail Peter.Williams@dsto.defence.gov.au
\NeedsTeXFormat{LaTeX2e}
\ProvidesPackage{algorithm}
\typeout{Document Style `algorithm' - floating environment}

\RequirePackage{float}
\RequirePackage{ifthen}
\newcommand{\ALG@within}{nothing}
\newboolean{ALG@within}
\setboolean{ALG@within}{false}
\newcommand{\ALG@floatstyle}{ruled}
\newcommand{\ALG@name}{Algorithm}
\newcommand{\listalgorithmname}{List of \ALG@name s}

% Declare Options
% first appearance
\DeclareOption{plain}{
  \renewcommand{\ALG@floatstyle}{plain}
}
\DeclareOption{ruled}{
  \renewcommand{\ALG@floatstyle}{ruled}
}
\DeclareOption{boxed}{
  \renewcommand{\ALG@floatstyle}{boxed}
}
% then numbering convention
\DeclareOption{part}{
  \renewcommand{\ALG@within}{part}
  \setboolean{ALG@within}{true}
}
\DeclareOption{chapter}{
  \renewcommand{\ALG@within}{chapter}
  \setboolean{ALG@within}{true}
}
\DeclareOption{section}{
  \renewcommand{\ALG@within}{section}
  \setboolean{ALG@within}{true}
}
\DeclareOption{subsection}{
  \renewcommand{\ALG@within}{subsection}
  \setboolean{ALG@within}{true}
}
\DeclareOption{subsubsection}{
  \renewcommand{\ALG@within}{subsubsection}
  \setboolean{ALG@within}{true}
}
\DeclareOption{nothing}{
  \renewcommand{\ALG@within}{nothing}
  \setboolean{ALG@within}{true}
}
\DeclareOption*{\edef\ALG@name{\CurrentOption}}

% ALGORITHM
%
\ProcessOptions
\floatstyle{\ALG@floatstyle}
\ifthenelse{\boolean{ALG@within}}{
  \ifthenelse{\equal{\ALG@within}{part}}
  {\newfloat{algorithm}{htbp}{loa}[part]}{}
  \ifthenelse{\equal{\ALG@within}{chapter}}
  {\newfloat{algorithm}{htbp}{loa}[chapter]}{}
  \ifthenelse{\equal{\ALG@within}{section}}
  {\newfloat{algorithm}{htbp}{loa}[section]}{}
  \ifthenelse{\equal{\ALG@within}{subsection}}
  {\newfloat{algorithm}{htbp}{loa}[subsection]}{}
  \ifthenelse{\equal{\ALG@within}{subsubsection}}
  {\newfloat{algorithm}{htbp}{loa}[subsubsection]}{}
  \ifthenelse{\equal{\ALG@within}{nothing}}
  {\newfloat{algorithm}{htbp}{loa}}{}
}{
  \newfloat{algorithm}{htbp}{loa}
}
\floatname{algorithm}{\ALG@name}

\newcommand{\listofalgorithms}{\listof{algorithm}{\listalgorithmname}}
201
report/algorithmic.sty
Normal file
@@ -0,0 +1,201 @@
% ALGORITHMIC STYLE -- Released 8 APRIL 1996
% for LaTeX version 2e
% Copyright -- 1994 Peter Williams
% E-mail PeterWilliams@dsto.defence.gov.au
%
% Modified by Alex Smola (08/2000)
% E-mail Alex.Smola@anu.edu.au
%
\NeedsTeXFormat{LaTeX2e}
\ProvidesPackage{algorithmic}
\typeout{Document Style `algorithmic' - environment}
%
\RequirePackage{ifthen}
\RequirePackage{calc}
\newboolean{ALC@noend}
\setboolean{ALC@noend}{false}
\newcounter{ALC@line}
\newcounter{ALC@rem}
\newlength{\ALC@tlm}
%
\DeclareOption{noend}{\setboolean{ALC@noend}{true}}
%
\ProcessOptions
%
% ALGORITHMIC
\newcommand{\algorithmicrequire}{\textbf{Require:}}
\newcommand{\algorithmicensure}{\textbf{Ensure:}}
\newcommand{\algorithmiccomment}[1]{\{#1\}}
\newcommand{\algorithmicend}{\textbf{end}}
\newcommand{\algorithmicif}{\textbf{if}}
\newcommand{\algorithmicthen}{\textbf{then}}
\newcommand{\algorithmicelse}{\textbf{else}}
\newcommand{\algorithmicelsif}{\algorithmicelse\ \algorithmicif}
\newcommand{\algorithmicendif}{\algorithmicend\ \algorithmicif}
\newcommand{\algorithmicfor}{\textbf{for}}
\newcommand{\algorithmicforall}{\textbf{for all}}
\newcommand{\algorithmicdo}{\textbf{do}}
\newcommand{\algorithmicendfor}{\algorithmicend\ \algorithmicfor}
\newcommand{\algorithmicwhile}{\textbf{while}}
\newcommand{\algorithmicendwhile}{\algorithmicend\ \algorithmicwhile}
\newcommand{\algorithmicloop}{\textbf{loop}}
\newcommand{\algorithmicendloop}{\algorithmicend\ \algorithmicloop}
\newcommand{\algorithmicrepeat}{\textbf{repeat}}
\newcommand{\algorithmicuntil}{\textbf{until}}

%changed by alex smola
\newcommand{\algorithmicinput}{\textbf{input}}
\newcommand{\algorithmicoutput}{\textbf{output}}
\newcommand{\algorithmicset}{\textbf{set}}
\newcommand{\algorithmictrue}{\textbf{true}}
\newcommand{\algorithmicfalse}{\textbf{false}}
\newcommand{\algorithmicand}{\textbf{and\ }}
\newcommand{\algorithmicor}{\textbf{or\ }}
\newcommand{\algorithmicfunction}{\textbf{function}}
\newcommand{\algorithmicendfunction}{\algorithmicend\ \algorithmicfunction}
\newcommand{\algorithmicmain}{\textbf{main}}
\newcommand{\algorithmicendmain}{\algorithmicend\ \algorithmicmain}
%end changed by alex smola

\def\ALC@item[#1]{%
\if@noparitem \@donoparitem
\else \if@inlabel \indent \par \fi
\ifhmode \unskip\unskip \par \fi
\if@newlist \if@nobreak \@nbitem \else
\addpenalty\@beginparpenalty
\addvspace\@topsep \addvspace{-\parskip}\fi
\else \addpenalty\@itempenalty \addvspace\itemsep
\fi
\global\@inlabeltrue
\fi
\everypar{\global\@minipagefalse\global\@newlistfalse
\if@inlabel\global\@inlabelfalse \hskip -\parindent \box\@labels
\penalty\z@ \fi
\everypar{}}\global\@nobreakfalse
\if@noitemarg \@noitemargfalse \if@nmbrlist \refstepcounter{\@listctr}\fi \fi
\sbox\@tempboxa{\makelabel{#1}}%
\global\setbox\@labels
\hbox{\unhbox\@labels \hskip \itemindent
\hskip -\labelwidth \hskip -\ALC@tlm
\ifdim \wd\@tempboxa >\labelwidth
\box\@tempboxa
\else \hbox to\labelwidth {\unhbox\@tempboxa}\fi
\hskip \ALC@tlm}\ignorespaces}
%
\newenvironment{algorithmic}[1][0]{
\let\@item\ALC@item
\newcommand{\ALC@lno}{%
\ifthenelse{\equal{\arabic{ALC@rem}}{0}}
{{\footnotesize \arabic{ALC@line}:}}{}%
}
\let\@listii\@listi
\let\@listiii\@listi
\let\@listiv\@listi
\let\@listv\@listi
\let\@listvi\@listi
\let\@listvii\@listi
\newenvironment{ALC@g}{
\begin{list}{\ALC@lno}{ \itemsep\z@ \itemindent\z@
\listparindent\z@ \rightmargin\z@
\topsep\z@ \partopsep\z@ \parskip\z@\parsep\z@
\leftmargin 1em
\addtolength{\ALC@tlm}{\leftmargin}
}
}
{\end{list}}
\newcommand{\ALC@it}{\addtocounter{ALC@line}{1}\addtocounter{ALC@rem}{1}\ifthenelse{\equal{\arabic{ALC@rem}}{#1}}{\setcounter{ALC@rem}{0}}{}\item}
\newcommand{\ALC@com}[1]{\ifthenelse{\equal{##1}{default}}%
{}{\ \algorithmiccomment{##1}}}
\newcommand{\REQUIRE}{\item[\algorithmicrequire]}
\newcommand{\ENSURE}{\item[\algorithmicensure]}
\newcommand{\STATE}{\ALC@it}
\newcommand{\COMMENT}[1]{\algorithmiccomment{##1}}
%changes by alex smola
\newcommand{\INPUT}{\item[\algorithmicinput]}
\newcommand{\OUTPUT}{\item[\algorithmicoutput]}
\newcommand{\SET}{\item[\algorithmicset]}
% \newcommand{\TRUE}{\algorithmictrue}
% \newcommand{\FALSE}{\algorithmicfalse}
\newcommand{\AND}{\algorithmicand}
\newcommand{\OR}{\algorithmicor}
\newenvironment{ALC@func}{\begin{ALC@g}}{\end{ALC@g}}
\newenvironment{ALC@main}{\begin{ALC@g}}{\end{ALC@g}}
%end changes by alex smola
\newenvironment{ALC@if}{\begin{ALC@g}}{\end{ALC@g}}
\newenvironment{ALC@for}{\begin{ALC@g}}{\end{ALC@g}}
\newenvironment{ALC@whl}{\begin{ALC@g}}{\end{ALC@g}}
\newenvironment{ALC@loop}{\begin{ALC@g}}{\end{ALC@g}}
\newenvironment{ALC@rpt}{\begin{ALC@g}}{\end{ALC@g}}
\renewcommand{\\}{\@centercr}
\newcommand{\IF}[2][default]{\ALC@it\algorithmicif\ ##2\ \algorithmicthen%
\ALC@com{##1}\begin{ALC@if}}
\newcommand{\SHORTIF}[2]{\ALC@it\algorithmicif\ ##1\
\algorithmicthen\ {##2}}
\newcommand{\ELSE}[1][default]{\end{ALC@if}\ALC@it\algorithmicelse%
\ALC@com{##1}\begin{ALC@if}}
\newcommand{\ELSIF}[2][default]%
{\end{ALC@if}\ALC@it\algorithmicelsif\ ##2\ \algorithmicthen%
\ALC@com{##1}\begin{ALC@if}}
\newcommand{\FOR}[2][default]{\ALC@it\algorithmicfor\ ##2\ \algorithmicdo%
\ALC@com{##1}\begin{ALC@for}}
\newcommand{\FORALL}[2][default]{\ALC@it\algorithmicforall\ ##2\ %
\algorithmicdo%
\ALC@com{##1}\begin{ALC@for}}
\newcommand{\SHORTFORALL}[2]{\ALC@it\algorithmicforall\ ##1\ %
\algorithmicdo\ {##2}}
\newcommand{\WHILE}[2][default]{\ALC@it\algorithmicwhile\ ##2\ %
\algorithmicdo%
\ALC@com{##1}\begin{ALC@whl}}
\newcommand{\LOOP}[1][default]{\ALC@it\algorithmicloop%
\ALC@com{##1}\begin{ALC@loop}}
%changed by alex smola
\newcommand{\FUNCTION}[2][default]{\ALC@it\algorithmicfunction\ ##2\ %
\ALC@com{##1}\begin{ALC@func}}
\newcommand{\MAIN}[2][default]{\ALC@it\algorithmicmain\ ##2\ %
\ALC@com{##1}\begin{ALC@main}}
%end changed by alex smola
\newcommand{\REPEAT}[1][default]{\ALC@it\algorithmicrepeat%
\ALC@com{##1}\begin{ALC@rpt}}
\newcommand{\UNTIL}[1]{\end{ALC@rpt}\ALC@it\algorithmicuntil\ ##1}
\ifthenelse{\boolean{ALC@noend}}{
\newcommand{\ENDIF}{\end{ALC@if}}
\newcommand{\ENDFOR}{\end{ALC@for}}
\newcommand{\ENDWHILE}{\end{ALC@whl}}
\newcommand{\ENDLOOP}{\end{ALC@loop}}
\newcommand{\ENDFUNCTION}{\end{ALC@func}}
\newcommand{\ENDMAIN}{\end{ALC@main}}
}{
\newcommand{\ENDIF}{\end{ALC@if}\ALC@it\algorithmicendif}
\newcommand{\ENDFOR}{\end{ALC@for}\ALC@it\algorithmicendfor}
\newcommand{\ENDWHILE}{\end{ALC@whl}\ALC@it\algorithmicendwhile}
\newcommand{\ENDLOOP}{\end{ALC@loop}\ALC@it\algorithmicendloop}
\newcommand{\ENDFUNCTION}{\end{ALC@func}\ALC@it\algorithmicendfunction}
\newcommand{\ENDMAIN}{\end{ALC@main}\ALC@it\algorithmicendmain}
}
\renewcommand{\@toodeep}{}
\begin{list}{\ALC@lno}{\setcounter{ALC@line}{0}\setcounter{ALC@rem}{0}%
\itemsep\z@ \itemindent\z@ \listparindent\z@%
\partopsep\z@ \parskip\z@ \parsep\z@%
\labelsep 0.5em \topsep 0.2em%
\ifthenelse{\equal{#1}{0}}
{\labelwidth 0.5em }
{\labelwidth 1.2em }
\leftmargin\labelwidth \addtolength{\leftmargin}{\labelsep}
\ALC@tlm\labelsep
}
}
{\end{list}}
75
report/example-refs.bib
Normal file
@@ -0,0 +1,75 @@
@inproceedings{langley00,
  author    = {P. Langley},
  title     = {Crafting Papers on Machine Learning},
  year      = {2000},
  pages     = {1207--1216},
  editor    = {Pat Langley},
  booktitle = {Proceedings of the 17th International Conference
               on Machine Learning (ICML 2000)},
  address   = {Stanford, CA},
  publisher = {Morgan Kaufmann}
}

@techreport{mitchell80,
  author      = {T. M. Mitchell},
  title       = {The Need for Biases in Learning Generalizations},
  institution = {Computer Science Department, Rutgers University},
  year        = {1980},
  address     = {New Brunswick, NJ}
}

@phdthesis{kearns89,
  author = {M. J. Kearns},
  title  = {Computational Complexity of Machine Learning},
  school = {Department of Computer Science, Harvard University},
  year   = {1989}
}

@book{MachineLearningI,
  editor    = {R. S. Michalski and J. G. Carbonell and T. M. Mitchell},
  title     = {Machine Learning: An Artificial Intelligence Approach, Vol. I},
  publisher = {Tioga},
  year      = {1983},
  address   = {Palo Alto, CA}
}

@book{DudaHart2nd,
  author    = {R. O. Duda and P. E. Hart and D. G. Stork},
  title     = {Pattern Classification},
  publisher = {John Wiley and Sons},
  edition   = {2nd},
  year      = {2000}
}

@misc{anonymous,
  title  = {Suppressed for Anonymity},
  author = {Author, N. N.},
  year   = {2011}
}

@incollection{Newell81,
  author    = {A. Newell and P. S. Rosenbloom},
  title     = {Mechanisms of Skill Acquisition and the Law of Practice},
  booktitle = {Cognitive Skills and Their Acquisition},
  pages     = {1--51},
  publisher = {Lawrence Erlbaum Associates, Inc.},
  year      = {1981},
  editor    = {J. R. Anderson},
  chapter   = {1},
  address   = {Hillsdale, NJ}
}

@article{Samuel59,
  author  = {A. L. Samuel},
  title   = {Some Studies in Machine Learning Using the Game of Checkers},
  journal = {IBM Journal of Research and Development},
  year    = {1959},
  volume  = {3},
  number  = {3},
  pages   = {211--229}
}
485
report/fancyhdr.sty
Normal file
@@ -0,0 +1,485 @@
% fancyhdr.sty version 3.2
% Fancy headers and footers for LaTeX.
% Piet van Oostrum,
% Dept of Computer and Information Sciences, University of Utrecht,
% Padualaan 14, P.O. Box 80.089, 3508 TB Utrecht, The Netherlands
% Telephone: +31 30 2532180. Email: piet@cs.uu.nl
% ========================================================================
% LICENCE:
% This file may be distributed under the terms of the LaTeX Project Public
% License, as described in lppl.txt in the base LaTeX distribution.
% Either version 1 or, at your option, any later version.
% ========================================================================
% MODIFICATION HISTORY:
% Sep 16, 1994
% version 1.4: Correction for use with \reversemargin
% Sep 29, 1994:
% version 1.5: Added the \iftopfloat, \ifbotfloat and \iffloatpage commands
% Oct 4, 1994:
% version 1.6: Reset single spacing in headers/footers for use with
% setspace.sty or doublespace.sty
% Oct 4, 1994:
% version 1.7: changed \let\@mkboth\markboth to
% \def\@mkboth{\protect\markboth} to make it more robust
% Dec 5, 1994:
% version 1.8: corrections for amsbook/amsart: define \@chapapp and (more
% importantly) use the \chapter/sectionmark definitions from ps@headings if
% they exist (which should be true for all standard classes).
% May 31, 1995:
% version 1.9: The proposed \renewcommand{\headrulewidth}{\iffloatpage...
% construction in the doc did not work properly with the fancyplain style.
% June 1, 1995:
% version 1.91: The definition of \@mkboth wasn't restored on subsequent
% \pagestyle{fancy}'s.
% June 1, 1995:
% version 1.92: The sequence \pagestyle{fancyplain} \pagestyle{plain}
% \pagestyle{fancy} would erroneously select the plain version.
% June 1, 1995:
% version 1.93: \fancypagestyle command added.
% Dec 11, 1995:
% version 1.94: suggested by Conrad Hughes <chughes@maths.tcd.ie>
% CJCH, Dec 11, 1995: added \footruleskip to allow control over footrule
% position (old hardcoded value of .3\normalbaselineskip is far too high
% when used with very small footer fonts).
% Jan 31, 1996:
% version 1.95: call \@normalsize in the reset code if that is defined,
% otherwise \normalsize.
% this is to solve a problem with ucthesis.cls, as this doesn't
% define \@currsize. Unfortunately for latex209 calling \normalsize doesn't
% work as this is optimized to do very little, so there \@normalsize should
% be called. Hopefully this code works for all versions of LaTeX known to
% mankind.
% April 25, 1996:
% version 1.96: initialize \headwidth to a magic (negative) value to catch
% most common cases that people change it before calling \pagestyle{fancy}.
% Note it can't be initialized when reading in this file, because
% \textwidth could be changed afterwards. This is quite probable.
% We also switch to \MakeUppercase rather than \uppercase and introduce a
% \nouppercase command for use in headers. and footers.
% May 3, 1996:
% version 1.97: Two changes:
% 1. Undo the change in version 1.8 (using the pagestyle{headings} defaults
% for the chapter and section marks. The current version of amsbook and
% amsart classes don't seem to need them anymore. Moreover the standard
% latex classes don't use \markboth if twoside isn't selected, and this is
% confusing as \leftmark doesn't work as expected.
% 2. include a call to \ps@empty in ps@@fancy. This is to solve a problem
% in the amsbook and amsart classes, that make global changes to \topskip,
% which are reset in \ps@empty. Hopefully this doesn't break other things.
% May 7, 1996:
% version 1.98:
% Added % after the line \def\nouppercase
% May 7, 1996:
% version 1.99: This is the alpha version of fancyhdr 2.0
% Introduced the new commands \fancyhead, \fancyfoot, and \fancyhf.
% Changed \headrulewidth, \footrulewidth, \footruleskip to
% macros rather than length parameters, In this way they can be
% conditionalized and they don't consume length registers. There is no need
% to have them as length registers unless you want to do calculations with
% them, which is unlikely. Note that this may make some uses of them
% incompatible (i.e. if you have a file that uses \setlength or \xxxx=)
% May 10, 1996:
% version 1.99a:
% Added a few more % signs
% May 10, 1996:
% version 1.99b:
% Changed the syntax of \f@nfor to be resistent to catcode changes of :=
% Removed the [1] from the defs of \lhead etc. because the parameter is
% consumed by the \@[xy]lhead etc. macros.
% June 24, 1997:
% version 1.99c:
% corrected \nouppercase to also include the protected form of \MakeUppercase
% \global added to manipulation of \headwidth.
% \iffootnote command added.
% Some comments added about \@fancyhead and \@fancyfoot.
% Aug 24, 1998
% version 1.99d
% Changed the default \ps@empty to \ps@@empty in order to allow
% \fancypagestyle{empty} redefinition.
% Oct 11, 2000
% version 2.0
% Added LPPL license clause.
%
% A check for \headheight is added. An errormessage is given (once) if the
% header is too large. Empty headers don't generate the error even if
% \headheight is very small or even 0pt.
% Warning added for the use of 'E' option when twoside option is not used.
% In this case the 'E' fields will never be used.
%
% Mar 10, 2002
% version 2.1beta
% New command: \fancyhfoffset[place]{length}
% defines offsets to be applied to the header/footer to let it stick into
% the margins (if length > 0).
% place is like in fancyhead, except that only E,O,L,R can be used.
% This replaces the old calculation based on \headwidth and the marginpar
% area.
% \headwidth will be dynamically calculated in the headers/footers when
% this is used.
%
% Mar 26, 2002
% version 2.1beta2
% \fancyhfoffset now also takes h,f as possible letters in the argument to
% allow the header and footer widths to be different.
% New commands \fancyheadoffset and \fancyfootoffset added comparable to
% \fancyhead and \fancyfoot.
% Errormessages and warnings have been made more informative.
%
% Dec 9, 2002
% version 2.1
% The defaults for \footrulewidth, \plainheadrulewidth and
% \plainfootrulewidth are changed from \z@skip to 0pt. In this way when
% someone inadvertantly uses \setlength to change any of these, the value
% of \z@skip will not be changed, rather an errormessage will be given.

% March 3, 2004
% Release of version 3.0

% Oct 7, 2004
% version 3.1
% Added '\endlinechar=13' to \fancy@reset to prevent problems with
% includegraphics in header when verbatiminput is active.

% March 22, 2005
% version 3.2
% reset \everypar (the real one) in \fancy@reset because spanish.ldf does
% strange things with \everypar between << and >>.

\def\ifancy@mpty#1{\def\temp@a{#1}\ifx\temp@a\@empty}

\def\fancy@def#1#2{\ifancy@mpty{#2}\fancy@gbl\def#1{\leavevmode}\else
\fancy@gbl\def#1{#2\strut}\fi}

\let\fancy@gbl\global

\def\@fancyerrmsg#1{%
\ifx\PackageError\undefined
\errmessage{#1}\else
\PackageError{Fancyhdr}{#1}{}\fi}
\def\@fancywarning#1{%
\ifx\PackageWarning\undefined
\errmessage{#1}\else
\PackageWarning{Fancyhdr}{#1}{}\fi}

% Usage: \@forc \var{charstring}{command to be executed for each char}
% This is similar to LaTeX's \@tfor, but expands the charstring.

\def\@forc#1#2#3{\expandafter\f@rc\expandafter#1\expandafter{#2}{#3}}
\def\f@rc#1#2#3{\def\temp@ty{#2}\ifx\@empty\temp@ty\else
\f@@rc#1#2\f@@rc{#3}\fi}
\def\f@@rc#1#2#3\f@@rc#4{\def#1{#2}#4\f@rc#1{#3}{#4}}

% Usage: \f@nfor\name:=list\do{body}
% Like LaTeX's \@for but an empty list is treated as a list with an empty
% element

\newcommand{\f@nfor}[3]{\edef\@fortmp{#2}%
\expandafter\@forloop#2,\@nil,\@nil\@@#1{#3}}

% Usage: \def@ult \cs{defaults}{argument}
% sets \cs to the characters from defaults appearing in argument
% or defaults if it would be empty. All characters are lowercased.

\newcommand\def@ult[3]{%
\edef\temp@a{\lowercase{\edef\noexpand\temp@a{#3}}}\temp@a
\def#1{}%
\@forc\tmpf@ra{#2}%
{\expandafter\if@in\tmpf@ra\temp@a{\edef#1{#1\tmpf@ra}}{}}%
\ifx\@empty#1\def#1{#2}\fi}
%
% \if@in <char><set><truecase><falsecase>
%
\newcommand{\if@in}[4]{%
\edef\temp@a{#2}\def\temp@b##1#1##2\temp@b{\def\temp@b{##1}}%
\expandafter\temp@b#2#1\temp@b\ifx\temp@a\temp@b #4\else #3\fi}

\newcommand{\fancyhead}{\@ifnextchar[{\f@ncyhf\fancyhead h}%
{\f@ncyhf\fancyhead h[]}}
\newcommand{\fancyfoot}{\@ifnextchar[{\f@ncyhf\fancyfoot f}%
{\f@ncyhf\fancyfoot f[]}}
\newcommand{\fancyhf}{\@ifnextchar[{\f@ncyhf\fancyhf{}}%
{\f@ncyhf\fancyhf{}[]}}

% New commands for offsets added

\newcommand{\fancyheadoffset}{\@ifnextchar[{\f@ncyhfoffs\fancyheadoffset h}%
{\f@ncyhfoffs\fancyheadoffset h[]}}
\newcommand{\fancyfootoffset}{\@ifnextchar[{\f@ncyhfoffs\fancyfootoffset f}%
{\f@ncyhfoffs\fancyfootoffset f[]}}
\newcommand{\fancyhfoffset}{\@ifnextchar[{\f@ncyhfoffs\fancyhfoffset{}}%
{\f@ncyhfoffs\fancyhfoffset{}[]}}

% The header and footer fields are stored in command sequences with
% names of the form: \f@ncy<x><y><z> with <x> for [eo], <y> from [lcr]
% and <z> from [hf].

\def\f@ncyhf#1#2[#3]#4{%
\def\temp@c{}%
\@forc\tmpf@ra{#3}%
{\expandafter\if@in\tmpf@ra{eolcrhf,EOLCRHF}%
{}{\edef\temp@c{\temp@c\tmpf@ra}}}%
\ifx\@empty\temp@c\else
\@fancyerrmsg{Illegal char `\temp@c' in \string#1 argument:
[#3]}%
\fi
\f@nfor\temp@c{#3}%
{\def@ult\f@@@eo{eo}\temp@c
\if@twoside\else
\if\f@@@eo e\@fancywarning
{\string#1's `E' option without twoside option is useless}\fi\fi
\def@ult\f@@@lcr{lcr}\temp@c
\def@ult\f@@@hf{hf}{#2\temp@c}%
\@forc\f@@eo\f@@@eo
{\@forc\f@@lcr\f@@@lcr
{\@forc\f@@hf\f@@@hf
{\expandafter\fancy@def\csname
f@ncy\f@@eo\f@@lcr\f@@hf\endcsname
{#4}}}}}}

\def\f@ncyhfoffs#1#2[#3]#4{%
\def\temp@c{}%
\@forc\tmpf@ra{#3}%
{\expandafter\if@in\tmpf@ra{eolrhf,EOLRHF}%
{}{\edef\temp@c{\temp@c\tmpf@ra}}}%
\ifx\@empty\temp@c\else
\@fancyerrmsg{Illegal char `\temp@c' in \string#1 argument:
[#3]}%
\fi
\f@nfor\temp@c{#3}%
{\def@ult\f@@@eo{eo}\temp@c
\if@twoside\else
\if\f@@@eo e\@fancywarning
{\string#1's `E' option without twoside option is useless}\fi\fi
\def@ult\f@@@lcr{lr}\temp@c
\def@ult\f@@@hf{hf}{#2\temp@c}%
\@forc\f@@eo\f@@@eo
{\@forc\f@@lcr\f@@@lcr
{\@forc\f@@hf\f@@@hf
{\expandafter\setlength\csname
f@ncyO@\f@@eo\f@@lcr\f@@hf\endcsname
{#4}}}}}%
\fancy@setoffs}

% Fancyheadings version 1 commands. These are more or less deprecated,
% but they continue to work.

\newcommand{\lhead}{\@ifnextchar[{\@xlhead}{\@ylhead}}
\def\@xlhead[#1]#2{\fancy@def\f@ncyelh{#1}\fancy@def\f@ncyolh{#2}}
\def\@ylhead#1{\fancy@def\f@ncyelh{#1}\fancy@def\f@ncyolh{#1}}

\newcommand{\chead}{\@ifnextchar[{\@xchead}{\@ychead}}
\def\@xchead[#1]#2{\fancy@def\f@ncyech{#1}\fancy@def\f@ncyoch{#2}}
\def\@ychead#1{\fancy@def\f@ncyech{#1}\fancy@def\f@ncyoch{#1}}

\newcommand{\rhead}{\@ifnextchar[{\@xrhead}{\@yrhead}}
\def\@xrhead[#1]#2{\fancy@def\f@ncyerh{#1}\fancy@def\f@ncyorh{#2}}
\def\@yrhead#1{\fancy@def\f@ncyerh{#1}\fancy@def\f@ncyorh{#1}}

\newcommand{\lfoot}{\@ifnextchar[{\@xlfoot}{\@ylfoot}}
\def\@xlfoot[#1]#2{\fancy@def\f@ncyelf{#1}\fancy@def\f@ncyolf{#2}}
\def\@ylfoot#1{\fancy@def\f@ncyelf{#1}\fancy@def\f@ncyolf{#1}}

\newcommand{\cfoot}{\@ifnextchar[{\@xcfoot}{\@ycfoot}}
\def\@xcfoot[#1]#2{\fancy@def\f@ncyecf{#1}\fancy@def\f@ncyocf{#2}}
\def\@ycfoot#1{\fancy@def\f@ncyecf{#1}\fancy@def\f@ncyocf{#1}}

\newcommand{\rfoot}{\@ifnextchar[{\@xrfoot}{\@yrfoot}}
\def\@xrfoot[#1]#2{\fancy@def\f@ncyerf{#1}\fancy@def\f@ncyorf{#2}}
\def\@yrfoot#1{\fancy@def\f@ncyerf{#1}\fancy@def\f@ncyorf{#1}}

\newlength{\fancy@headwidth}
\let\headwidth\fancy@headwidth
\newlength{\f@ncyO@elh}
\newlength{\f@ncyO@erh}
\newlength{\f@ncyO@olh}
\newlength{\f@ncyO@orh}
\newlength{\f@ncyO@elf}
\newlength{\f@ncyO@erf}
\newlength{\f@ncyO@olf}
\newlength{\f@ncyO@orf}
\newcommand{\headrulewidth}{0.4pt}
\newcommand{\footrulewidth}{0pt}
\newcommand{\footruleskip}{.3\normalbaselineskip}

% Fancyplain stuff shouldn't be used anymore (rather
% \fancypagestyle{plain} should be used), but it must be present for
% compatibility reasons.

\newcommand{\plainheadrulewidth}{0pt}
\newcommand{\plainfootrulewidth}{0pt}
\newif\if@fancyplain \@fancyplainfalse
\def\fancyplain#1#2{\if@fancyplain#1\else#2\fi}

\headwidth=-123456789sp %magic constant

% Command to reset various things in the headers:
% a.o. single spacing (taken from setspace.sty)
% and the catcode of ^^M (so that epsf files in the header work if a
% verbatim crosses a page boundary)
% It also defines a \nouppercase command that disables \uppercase and
% \Makeuppercase. It can only be used in the headers and footers.
\let\fnch@everypar\everypar% save real \everypar because of spanish.ldf
\def\fancy@reset{\fnch@everypar{}\restorecr\endlinechar=13
\def\baselinestretch{1}%
\def\nouppercase##1{{\let\uppercase\relax\let\MakeUppercase\relax
\expandafter\let\csname MakeUppercase \endcsname\relax##1}}%
\ifx\undefined\@newbaseline% NFSS not present; 2.09 or 2e
\ifx\@normalsize\undefined \normalsize % for ucthesis.cls
\else \@normalsize \fi
\else% NFSS (2.09) present
\@newbaseline%
\fi}

% Initialization of the head and foot text.

% The default values still contain \fancyplain for compatibility.
\fancyhf{} % clear all
% lefthead empty on ``plain'' pages, \rightmark on even, \leftmark on odd pages
% evenhead empty on ``plain'' pages, \leftmark on even, \rightmark on odd pages
\if@twoside
\fancyhead[el,or]{\fancyplain{}{\sl\rightmark}}
\fancyhead[er,ol]{\fancyplain{}{\sl\leftmark}}
\else
\fancyhead[l]{\fancyplain{}{\sl\rightmark}}
\fancyhead[r]{\fancyplain{}{\sl\leftmark}}
\fi
\fancyfoot[c]{\rm\thepage} % page number

% Use box 0 as a temp box and dimen 0 as temp dimen.
% This can be done, because this code will always
% be used inside another box, and therefore the changes are local.

\def\@fancyvbox#1#2{\setbox0\vbox{#2}\ifdim\ht0>#1\@fancywarning
{\string#1 is too small (\the#1): ^^J Make it at least \the\ht0.^^J
We now make it that large for the rest of the document.^^J
This may cause the page layout to be inconsistent, however\@gobble}%
\dimen0=#1\global\setlength{#1}{\ht0}\ht0=\dimen0\fi
\box0}

% Put together a header or footer given the left, center and
% right text, fillers at left and right and a rule.
% The \lap commands put the text into an hbox of zero size,
% so overlapping text does not generate an errormessage.
% These macros have 5 parameters:
% 1. LEFTSIDE BEARING % This determines at which side the header will stick
% out. When \fancyhfoffset is used this calculates \headwidth, otherwise
% it is \hss or \relax (after expansion).
% 2. \f@ncyolh, \f@ncyelh, \f@ncyolf or \f@ncyelf. This is the left component.
% 3. \f@ncyoch, \f@ncyech, \f@ncyocf or \f@ncyecf. This is the middle comp.
% 4. \f@ncyorh, \f@ncyerh, \f@ncyorf or \f@ncyerf. This is the right component.
% 5. RIGHTSIDE BEARING. This is always \relax or \hss (after expansion).

\def\@fancyhead#1#2#3#4#5{#1\hbox to\headwidth{\fancy@reset
\@fancyvbox\headheight{\hbox
{\rlap{\parbox[b]{\headwidth}{\raggedright#2}}\hfill
\parbox[b]{\headwidth}{\centering#3}\hfill
\llap{\parbox[b]{\headwidth}{\raggedleft#4}}}\headrule}}#5}

\def\@fancyfoot#1#2#3#4#5{#1\hbox to\headwidth{\fancy@reset
\@fancyvbox\footskip{\footrule
\hbox{\rlap{\parbox[t]{\headwidth}{\raggedright#2}}\hfill
\parbox[t]{\headwidth}{\centering#3}\hfill
\llap{\parbox[t]{\headwidth}{\raggedleft#4}}}}}#5}

\def\headrule{{\if@fancyplain\let\headrulewidth\plainheadrulewidth\fi
\hrule\@height\headrulewidth\@width\headwidth \vskip-\headrulewidth}}

\def\footrule{{\if@fancyplain\let\footrulewidth\plainfootrulewidth\fi
\vskip-\footruleskip\vskip-\footrulewidth
\hrule\@width\headwidth\@height\footrulewidth\vskip\footruleskip}}

\def\ps@fancy{%
\@ifundefined{@chapapp}{\let\@chapapp\chaptername}{}%for amsbook
%
% Define \MakeUppercase for old LaTeXen.
% Note: we used \def rather than \let, so that \let\uppercase\relax (from
% the version 1 documentation) will still work.
%
\@ifundefined{MakeUppercase}{\def\MakeUppercase{\uppercase}}{}%
\@ifundefined{chapter}{\def\sectionmark##1{\markboth
{\MakeUppercase{\ifnum \c@secnumdepth>\z@
\thesection\hskip 1em\relax \fi ##1}}{}}%
\def\subsectionmark##1{\markright {\ifnum \c@secnumdepth >\@ne
\thesubsection\hskip 1em\relax \fi ##1}}}%
{\def\chaptermark##1{\markboth {\MakeUppercase{\ifnum \c@secnumdepth>\m@ne
\@chapapp\ \thechapter. \ \fi ##1}}{}}%
\def\sectionmark##1{\markright{\MakeUppercase{\ifnum \c@secnumdepth >\z@
\thesection. \ \fi ##1}}}}%
%\csname ps@headings\endcsname % use \ps@headings defaults if they exist
\ps@@fancy
\gdef\ps@fancy{\@fancyplainfalse\ps@@fancy}%
% Initialize \headwidth if the user didn't
%
\ifdim\headwidth<0sp
%
% This catches the case that \headwidth hasn't been initialized and the
% case that the user added something to \headwidth in the expectation that
% it was initialized to \textwidth. We compensate this now. This loses if
% the user intended to multiply it by a factor. But that case is more
% likely done by saying something like \headwidth=1.2\textwidth.
% The doc says you have to change \headwidth after the first call to
% \pagestyle{fancy}. This code is just to catch the most common cases were
% that requirement is violated.
%
\global\advance\headwidth123456789sp\global\advance\headwidth\textwidth
\fi}
\def\ps@fancyplain{\ps@fancy \let\ps@plain\ps@plain@fancy}
\def\ps@plain@fancy{\@fancyplaintrue\ps@@fancy}
\let\ps@@empty\ps@empty
\def\ps@@fancy{%
\ps@@empty % This is for amsbook/amsart, which do strange things with \topskip
\def\@mkboth{\protect\markboth}%
\def\@oddhead{\@fancyhead\fancy@Oolh\f@ncyolh\f@ncyoch\f@ncyorh\fancy@Oorh}%
\def\@oddfoot{\@fancyfoot\fancy@Oolf\f@ncyolf\f@ncyocf\f@ncyorf\fancy@Oorf}%
\def\@evenhead{\@fancyhead\fancy@Oelh\f@ncyelh\f@ncyech\f@ncyerh\fancy@Oerh}%
\def\@evenfoot{\@fancyfoot\fancy@Oelf\f@ncyelf\f@ncyecf\f@ncyerf\fancy@Oerf}%
}
% Default definitions for compatibility mode:
% These cause the header/footer to take the defined \headwidth as width
% And to shift in the direction of the marginpar area

\def\fancy@Oolh{\if@reversemargin\hss\else\relax\fi}
\def\fancy@Oorh{\if@reversemargin\relax\else\hss\fi}
\let\fancy@Oelh\fancy@Oorh
\let\fancy@Oerh\fancy@Oolh

\let\fancy@Oolf\fancy@Oolh
\let\fancy@Oorf\fancy@Oorh
\let\fancy@Oelf\fancy@Oelh
\let\fancy@Oerf\fancy@Oerh

% New definitions for the use of \fancyhfoffset
% These calculate the \headwidth from \textwidth and the specified offsets.

\def\fancy@offsolh{\headwidth=\textwidth\advance\headwidth\f@ncyO@olh
\advance\headwidth\f@ncyO@orh\hskip-\f@ncyO@olh}
\def\fancy@offselh{\headwidth=\textwidth\advance\headwidth\f@ncyO@elh
\advance\headwidth\f@ncyO@erh\hskip-\f@ncyO@elh}

\def\fancy@offsolf{\headwidth=\textwidth\advance\headwidth\f@ncyO@olf
\advance\headwidth\f@ncyO@orf\hskip-\f@ncyO@olf}
\def\fancy@offself{\headwidth=\textwidth\advance\headwidth\f@ncyO@elf
\advance\headwidth\f@ncyO@erf\hskip-\f@ncyO@elf}

\def\fancy@setoffs{%
% Just in case \let\headwidth\textwidth was used
\fancy@gbl\let\headwidth\fancy@headwidth
\fancy@gbl\let\fancy@Oolh\fancy@offsolh
\fancy@gbl\let\fancy@Oelh\fancy@offselh
\fancy@gbl\let\fancy@Oorh\hss
\fancy@gbl\let\fancy@Oerh\hss
\fancy@gbl\let\fancy@Oolf\fancy@offsolf
\fancy@gbl\let\fancy@Oelf\fancy@offself
\fancy@gbl\let\fancy@Oorf\hss
\fancy@gbl\let\fancy@Oerf\hss}

\newif\iffootnote
\let\latex@makecol\@makecol
\def\@makecol{\ifvoid\footins\footnotetrue\else\footnotefalse\fi
\let\topfloat\@toplist\let\botfloat\@botlist\latex@makecol}
\def\iftopfloat#1#2{\ifx\topfloat\empty #2\else #1\fi}
\def\ifbotfloat#1#2{\ifx\botfloat\empty #2\else #1\fi}
\def\iffloatpage#1#2{\if@fcolmade #1\else #2\fi}

\newcommand{\fancypagestyle}[2]{%
\@namedef{ps@#1}{\let\fancy@gbl\relax#2\relax\ps@fancy}}
1441
report/icml2017.bst
Normal file
File diff suppressed because it is too large
BIN
report/icml_numpapers.pdf
Normal file
Binary file not shown.
BIN
report/mlp-cw1-template.pdf
Normal file
Binary file not shown.
207
report/mlp-cw1-template.tex
Normal file
@@ -0,0 +1,207 @@
%% Template for MLP Coursework 1 / 16 October 2017

%% Based on LaTeX template for ICML 2017 - example_paper.tex at
%% https://2017.icml.cc/Conferences/2017/StyleAuthorInstructions

\documentclass{article}

\usepackage[T1]{fontenc}
\usepackage{amssymb,amsmath}
\usepackage{txfonts}
\usepackage{microtype}

% For figures
\usepackage{graphicx}
\usepackage{subfigure}

% For citations
\usepackage{natbib}

% For algorithms
\usepackage{algorithm}
\usepackage{algorithmic}

% the hyperref package is used to produce hyperlinks in the
% resulting PDF. If this breaks your system, please comment out the
% following usepackage line and replace \usepackage{mlp2017} with
% \usepackage[nohyperref]{mlp2017} below.
\usepackage{hyperref}
\usepackage{url}
\urlstyle{same}

% Packages hyperref and algorithmic misbehave sometimes. We can fix
% this with the following command.
\newcommand{\theHalgorithm}{\arabic{algorithm}}


% Set up MLP coursework style (based on ICML style)
\usepackage{mlp2017}
\mlptitlerunning{MLP Coursework 1 (\studentNumber)}
\bibliographystyle{icml2017}


\DeclareMathOperator{\softmax}{softmax}
\DeclareMathOperator{\sigmoid}{sigmoid}
\DeclareMathOperator{\sgn}{sgn}
\DeclareMathOperator{\relu}{relu}
\DeclareMathOperator{\lrelu}{lrelu}
\DeclareMathOperator{\elu}{elu}
\DeclareMathOperator{\selu}{selu}
\DeclareMathOperator{\maxout}{maxout}

%% You probably do not need to change anything above this comment

%% REPLACE this with your student number
\def\studentNumber{s1754321}

\begin{document}

\twocolumn[
\mlptitle{MLP Coursework 1: Activation Functions}

\centerline{\studentNumber}

\vskip 7mm
]

\begin{abstract}
The abstract should be 100--200 words long, providing a concise summary of the contents of your report.
\end{abstract}

\section{Introduction}
\label{sec:intro}
This document provides a template for the MLP coursework 1 report. In particular, it structures the document into five sections (plus an abstract and the references) -- you should keep to this structure for your report. If you want to use subsections within a section that is fine, but please do not use any deeper structuring. In this template the text in each section will include an outline of what you should include in each section, along with some practical LaTeX examples (for example figures, tables, algorithms). Your document should be no longer than \textbf{six pages}, with an additional page allowed for references.

The introduction should place your work in context, giving the overall motivation for the work, and clearly outlining the research questions you have explored -- in this case comparison of the behaviour of the different activation functions, experimental investigation of the impact of the depth of the network with respect to accuracy, and experimental investigation of different approaches to weight initialisation. This section should also include a concise description of the MNIST task and data -- be precise: for example state the size of the training and validation sets.


\section{Activation functions}
\label{sec:actfn}
This section should cover the theoretical methodology -- in this case you should present the four activation functions: ReLU, Leaky ReLU, ELU, and SELU. I didn't do it in this document, but the first time you use an acronym you should say what it stands for, for example Rectified Linear Unit (ReLU). You should use equations to concisely describe each activation function. For example, ReLU:
\begin{equation}
\relu(x) = \max(0, x) ,
\end{equation}
which has the gradient:
\begin{equation}
\frac{d}{dx} \relu(x) =
\begin{cases}
0 & \quad \text{if } x \leq 0 \\
1 & \quad \text{if } x > 0 .
\end{cases}
\end{equation}
The \LaTeX\ for the derivatives is slightly more complicated. We provided definitions near the top of the file (the part before \verb+\begin{document}+) for \verb+\relu+, \verb+\lrelu+, \verb+\elu+, and \verb+\selu+. There is no need to discuss the unit tests for these activation functions in this report.
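As a further example of this style, Leaky ReLU with slope parameter $\alpha$ (a small positive constant -- $\alpha = 0.01$ is a common default, though the value used in your code may differ) could be typeset as:
\begin{equation}
\lrelu(x) =
\begin{cases}
\alpha x & \quad \text{if } x \leq 0 \\
x & \quad \text{if } x > 0 .
\end{cases}
\end{equation}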

It is probably not needed in this report, but if you would like to include an algorithm in your report, please use the \verb+algorithm+ and \verb+algorithmic+ environments to format pseudocode (for instance, Algorithm~\ref{alg:example}). These require the corresponding style files, \verb+algorithm.sty+ and \verb+algorithmic.sty+ which are supplied with this package.

\begin{algorithm}[ht]
\begin{algorithmic}
\STATE {\bfseries Input:} data $x_i$, size $m$
\REPEAT
\STATE Initialize $noChange = true$.
\FOR{$i=1$ {\bfseries to} $m-1$}
\IF{$x_i > x_{i+1}$}
\STATE Swap $x_i$ and $x_{i+1}$
\STATE $noChange = false$
\ENDIF
\ENDFOR
\UNTIL{$noChange$ is $true$}
\end{algorithmic}
\caption{Bubble Sort}
\label{alg:example}
\end{algorithm}

\section{Experimental comparison of activation functions}
\label{sec:actexpts}
In this section you should present the results and discussion of your experiments comparing networks using the different activation functions on the MNIST task. As explained in the coursework document, you should use 2 hidden layers with 100 hidden units per layer for these experiments. You can compare the learning curves (error vs epoch) for training and/or validation, and the validation set accuracies.

Your experimental sections should include graphs (for instance, figure~\ref{fig:sample-graph}) and/or tables (for instance, table~\ref{tab:sample-table})\footnote{These examples were taken from the ICML template paper.}, using the \verb+figure+ and \verb+table+ environments, in which you use \verb+\includegraphics+ to include an image (pdf, png, or jpg formats). Please export graphs as
\href{https://en.wikipedia.org/wiki/Vector_graphics}{vector graphics}
rather than \href{https://en.wikipedia.org/wiki/Raster_graphics}{raster
files} as this will make sure all detail in the plot is visible.
Matplotlib supports saving high quality figures in a wide range of
common image formats using the
\href{http://matplotlib.org/api/pyplot_api.html\#matplotlib.pyplot.savefig}{\texttt{savefig}}
function. \textbf{You should use \texttt{savefig} rather than copying
the screen-resolution raster images outputted in the notebook.} An
example of using \texttt{savefig} to save a figure as a PDF file (which
can be included as graphics in a \LaTeX\ document) is given in the coursework document.
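For instance, a minimal sketch (using the \texttt{fig\_1} figure handle returned by the \texttt{train\_model\_and\_plot\_stats} helper in the accompanying notebook -- adapt the names to your own code):
\begin{verbatim}
# save the error plot as a vector PDF for \includegraphics
fig_1.savefig('error-curves.pdf', bbox_inches='tight')
\end{verbatim}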

If you need a figure or table to stretch across two columns use the \verb+figure*+ or \verb+table*+ environment instead of the \verb+figure+ or \verb+table+ environment. Use the \verb+subfigure+ environment if you want to include multiple graphics in a single figure.

\begin{figure}[tb]
\vskip 5mm
\begin{center}
\centerline{\includegraphics[width=\columnwidth]{icml_numpapers}}
\caption{Historical locations and number of accepted papers for International
Machine Learning Conferences (ICML 1993 -- ICML 2008) and
International Workshops on Machine Learning (ML 1988 -- ML
1992). At the time this figure was produced, the number of
accepted papers for ICML 2008 was unknown and instead estimated.}
\label{fig:sample-graph}
\end{center}
\vskip -5mm
\end{figure}

\begin{table}[tb]
\vskip 3mm
\begin{center}
\begin{small}
\begin{sc}
\begin{tabular}{lcccr}
\hline
\abovespace\belowspace
Data set & Naive & Flexible & Better? \\
\hline
\abovespace
Breast    & 95.9$\pm$ 0.2& 96.7$\pm$ 0.2& $\surd$ \\
Cleveland & 83.3$\pm$ 0.6& 80.0$\pm$ 0.6& $\times$\\
Glass2    & 61.9$\pm$ 1.4& 83.8$\pm$ 0.7& $\surd$ \\
Credit    & 74.8$\pm$ 0.5& 78.3$\pm$ 0.6&         \\
Horse     & 73.3$\pm$ 0.9& 69.7$\pm$ 1.0& $\times$\\
Meta      & 67.1$\pm$ 0.6& 76.5$\pm$ 0.5& $\surd$ \\
Pima      & 75.1$\pm$ 0.6& 73.9$\pm$ 0.5&         \\
\belowspace
Vehicle   & 44.9$\pm$ 0.6& 61.5$\pm$ 0.4& $\surd$ \\
\hline
\end{tabular}
\end{sc}
\end{small}
\caption{Classification accuracies for naive Bayes and flexible
Bayes on various data sets.}
\label{tab:sample-table}
\end{center}
\vskip -3mm
\end{table}

\section{Deep neural network experiments}
\label{sec:dnnexpts}
This section should report on your experiments on deeper networks for MNIST. The two sets of experiments are to explore the impact of the depth of the network (number of hidden layers), and a comparison of different approaches to weight initialisation.

In this section, and in the previous section, you should present your experimental results clearly and concisely, followed by an interpretation and discussion of results. You need to present your results in a way that makes it easy for a reader to understand what they mean. You should facilitate comparisons either using graphs with multiple curves or (if appropriate, e.g. for accuracies) a results table. You need to avoid having too many figures, poorly labelled graphs, and graphs which should be comparable but which use different axis scales. A good presentation will enable the reader to compare trends in the same graph -- each graph should summarise the results relating to a particular research (sub)question.

Your discussion should interpret the results, both in terms of summarising the outcomes of a particular experiment, and attempting to relate to the underlying models. A good report would have some analysis, resulting in an understanding of why particular results are observed, perhaps with reference to the literature. Use bibtex to organise your references -- in this case the references are in the file \verb+example-refs.bib+. Here is an example reference \citep{langley00}.


\section{Conclusions}
\label{sec:concl}
You should draw conclusions from the experiments, related to the research questions outlined in the introduction (section~\ref{sec:intro}). You should state the conclusions clearly and concisely. It is good if the conclusion from one experiment influenced what you did in later experiments -- your aim is to learn from your experiments. Extra credit if you relate your findings to what has been reported in the literature.

A good conclusions section would also include a further work discussion, building on work done so far, and referencing the literature where appropriate.

\bibliography{example-refs}

\end{document}

% This document was modified from the file originally made available by
% Pat Langley and Andrea Danyluk for ICML-2K. This version was
% created by Lise Getoor and Tobias Scheffer, it was slightly modified
% from the 2010 version by Thorsten Joachims & Johannes Fuernkranz,
% slightly modified from the 2009 version by Kiri Wagstaff and
% Sam Roweis's 2008 version, which is slightly modified from
% Prasad Tadepalli's 2007 version which is a lightly
% changed version of the previous year's version by Andrew Moore,
% which was in turn edited from those of Kristian Kersting and
% Codrina Lauth. Alex Smola contributed to the algorithmic style files.
BIN
report/mlp-cw2-template.pdf
Normal file
Binary file not shown.
195
report/mlp-cw2-template.tex
Normal file
195
report/mlp-cw2-template.tex
Normal file
@ -0,0 +1,195 @@
|
||||
%% Template for MLP Coursework 2 / 6 November 2017
|
||||
|
||||
%% Based on LaTeX template for ICML 2017 - example_paper.tex at
|
||||
%% https://2017.icml.cc/Conferences/2017/StyleAuthorInstructions
|
||||
|
||||
\documentclass{article}
|
||||
|
||||
\usepackage[T1]{fontenc}
|
||||
\usepackage{amssymb,amsmath}
|
||||
\usepackage{txfonts}
|
||||
\usepackage{microtype}
|
||||
|
||||
% For figures
|
||||
\usepackage{graphicx}
|
||||
\usepackage{subfigure}
|
||||
|
||||
% For citations
|
||||
\usepackage{natbib}
|
||||
|
||||
% For algorithms
|
||||
\usepackage{algorithm}
|
||||
\usepackage{algorithmic}
|
||||
|
||||
% the hyperref package is used to produce hyperlinks in the
|
||||
% resulting PDF. If this breaks your system, please commend out the
|
||||
% following usepackage line and replace \usepackage{mlp2017} with
|
||||
% \usepackage[nohyperref]{mlp2017} below.
|
||||
\usepackage{hyperref}
|
||||
\usepackage{url}
|
||||
\urlstyle{same}
|
||||
|
||||
% Packages hyperref and algorithmic misbehave sometimes. We can fix
|
||||
% this with the following command.
|
||||
\newcommand{\theHalgorithm}{\arabic{algorithm}}
|
||||
|
||||
|
||||
% Set up MLP coursework style (based on ICML style)
|
||||
\usepackage{mlp2017}
|
||||
\mlptitlerunning{MLP Coursework 2 (\studentNumber)}
|
||||
\bibliographystyle{icml2017}
|
||||
|
||||
|
||||
\DeclareMathOperator{\softmax}{softmax}
|
||||
\DeclareMathOperator{\sigmoid}{sigmoid}
|
||||
\DeclareMathOperator{\sgn}{sgn}
|
||||
\DeclareMathOperator{\relu}{relu}
|
||||
\DeclareMathOperator{\lrelu}{lrelu}
|
||||
\DeclareMathOperator{\elu}{elu}
|
||||
\DeclareMathOperator{\selu}{selu}
|
||||
\DeclareMathOperator{\maxout}{maxout}
|
||||
|
||||
%% You probably do not need to change anything above this comment
|
||||
|
||||
%% REPLACE this with your student number
|
||||
\def\studentNumber{sXXXXXXX}
|
||||
|
||||
\begin{document}
|
||||
|
||||
\twocolumn[
|
||||
\mlptitle{MLP Coursework 2: Learning rules, BatchNorm, and ConvNets}
|
||||
|
||||
\centerline{\studentNumber}
|
||||
|
||||
\vskip 7mm
|
||||
]
|
||||
|
||||
\begin{abstract}
|
||||
The abstract should be 100--200 words long, providing a concise summary of the contents of your report.
|
||||
\end{abstract}
|
||||
|
||||
\section{Introduction}
|
||||
\label{sec:intro}
|
||||
This document provides a template for the MLP coursework 2 report. This template structures the report into sections, which you are recommended to use, but can change if you wish. If you want to use subsections within a section that is fine, but please do not use any deeper structuring. In this template the text in each section will include an outline of what you should include in each section, along with some practical LaTeX examples (for example figures, tables, algorithms). Your document should be no longer than \textbf{seven pages}, with an additional page allowed for references.
|
||||
|
||||
The introduction should place your work in context, giving the overall motivation for the work, and clearly outlining the research questions you have explored. This section should also include a concise description of the Balanced EMNIST task and data -- be precise: for example state the size of the training, validation, and test sets.
|
||||
|
||||
\section{Baseline systems}
|
||||
In this section you should report your baseline experiments for EMNIST. No need for theoretical explanations of things covered in the course, but should you go beyond what was covered please explain what you did with references to relevant paper(s) if appropriate. In this section you should aim to cover the both the ``what'' and the ``why'': \emph{what} you did, giving sufficient information (hyperparameter settings, etc.) so that someone else (e.g. another student on the course) could reproduce your results; and \emph{why} you performed the experiments you are reporting - what you are aiming to discover what is the motivation for the particular experiments you undertook. You should also provide some discussion and interpretation of your results.
|
||||
|
||||
As before, your experimental sections should include graphs (for instance, figure~\ref{fig:sample-graph}) and/or tables (for instance, table~\ref{tab:sample-table})\footnote{These examples were taken from the ICML template paper.}, using the \verb+figure+ and \verb+table+ environments, in which you use \verb+\includegraphics+ to include an image (pdf, png, or jpg formats). Please export graphs as
|
||||
\href{https://en.wikipedia.org/wiki/Vector_graphics}{vector graphics}
|
||||
rather than \href{https://en.wikipedia.org/wiki/Raster_graphics}{raster
|
||||
files} as this will make sure all detail in the plot is visible.
|
||||
Matplotlib supports saving high quality figures in a wide range of
|
||||
common image formats using the
|
||||
\href{http://matplotlib.org/api/pyplot_api.html\#matplotlib.pyplot.savefig}{\texttt{savefig}}
|
||||
function. \textbf{You should use \texttt{savefig} rather than copying the screen-resolution raster images output in the notebook.} An example of using \texttt{savefig} to save a figure as a PDF file (which can be included as graphics in a \LaTeX{} document) is given in the coursework 1 document.
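As a minimal sketch (the data arrays here are placeholders for your own logged statistics):
\begin{verbatim}
import numpy as np
import matplotlib.pyplot as plt

# Placeholder data: replace with the statistics you have logged.
train_epochs = np.arange(1, 101)
train_errors = np.exp(-train_epochs / 30.)

fig, ax = plt.subplots()
ax.plot(train_epochs, train_errors, label='train')
ax.set_xlabel('Epoch')
ax.set_ylabel('Error')
ax.legend()
# Saving as PDF keeps the figure as vector graphics.
fig.savefig('learning-curve.pdf')
\end{verbatim}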
|
||||
|
||||
If you need a figure or table to stretch across two columns use the \verb+figure*+ or \verb+table*+ environment instead of the \verb+figure+ or \verb+table+ environment. Use the \verb+subfigure+ environment if you want to include multiple graphics in a single figure.
|
||||
|
||||
\begin{figure}[tb]
|
||||
\vskip 5mm
|
||||
\begin{center}
|
||||
\centerline{\includegraphics[width=\columnwidth]{icml_numpapers}}
|
||||
\caption{Historical locations and number of accepted papers for International
|
||||
Machine Learning Conferences (ICML 1993 -- ICML 2008) and
|
||||
International Workshops on Machine Learning (ML 1988 -- ML
|
||||
1992). At the time this figure was produced, the number of
|
||||
accepted papers for ICML 2008 was unknown and instead estimated.}
|
||||
\label{fig:sample-graph}
|
||||
\end{center}
|
||||
\vskip -5mm
|
||||
\end{figure}
|
||||
|
||||
\begin{table}[tb]
|
||||
\vskip 3mm
|
||||
\begin{center}
|
||||
\begin{small}
|
||||
\begin{sc}
|
||||
\begin{tabular}{lcccr}
|
||||
\hline
|
||||
\abovespace\belowspace
|
||||
Data set & Naive & Flexible & Better? \\
|
||||
\hline
|
||||
\abovespace
|
||||
Breast & 95.9$\pm$ 0.2& 96.7$\pm$ 0.2& $\surd$ \\
|
||||
Cleveland & 83.3$\pm$ 0.6& 80.0$\pm$ 0.6& $\times$\\
|
||||
Glass2 & 61.9$\pm$ 1.4& 83.8$\pm$ 0.7& $\surd$ \\
|
||||
Credit & 74.8$\pm$ 0.5& 78.3$\pm$ 0.6& \\
|
||||
Horse & 73.3$\pm$ 0.9& 69.7$\pm$ 1.0& $\times$\\
|
||||
Meta & 67.1$\pm$ 0.6& 76.5$\pm$ 0.5& $\surd$ \\
|
||||
Pima & 75.1$\pm$ 0.6& 73.9$\pm$ 0.5& \\
|
||||
\belowspace
|
||||
Vehicle & 44.9$\pm$ 0.6& 61.5$\pm$ 0.4& $\surd$ \\
|
||||
\hline
|
||||
\end{tabular}
|
||||
\end{sc}
|
||||
\end{small}
|
||||
\caption{Classification accuracies for naive Bayes and flexible
|
||||
Bayes on various data sets.}
|
||||
\label{tab:sample-table}
|
||||
\end{center}
|
||||
\vskip -3mm
|
||||
\end{table}
|
||||
|
||||
\section{Learning rules}
|
||||
In this section you should compare RMSProp and Adam with gradient descent, introducing these learning rules either as equations or as algorithmic pseudocode. If you present the different approaches as algorithms, you can use the \verb+algorithm+ and \verb+algorithmic+ environments to format pseudocode (for instance, Algorithm~\ref{alg:example}). These require the corresponding style files, \verb+algorithm.sty+ and \verb+algorithmic.sty+ which are supplied with this package.
|
||||
|
||||
\begin{algorithm}[ht]
|
||||
\begin{algorithmic}
|
||||
\STATE {\bfseries Input:} data $x_i$, size $m$
|
||||
\REPEAT
|
||||
\STATE Initialize $noChange = true$.
|
||||
\FOR{$i=1$ {\bfseries to} $m-1$}
|
||||
\IF{$x_i > x_{i+1}$}
|
||||
\STATE Swap $x_i$ and $x_{i+1}$
|
||||
\STATE $noChange = false$
|
||||
\ENDIF
|
||||
\ENDFOR
|
||||
\UNTIL{$noChange$ is $true$}
|
||||
\end{algorithmic}
|
||||
\caption{Bubble Sort}
|
||||
\label{alg:example}
|
||||
\end{algorithm}
|
||||
|
||||
You should, in your own words, explain what the different learning rules do, and how they differ. You should then present your experiments and results, comparing and contrasting stochastic gradient descent, RMSProp, and Adam. As before, concentrate on the ``what'' (remember to give enough information so someone can reproduce your experiments), the ``why'' (why did you choose the experiments that you performed -- you may have been motivated by your earlier results, by the literature, or by a specific research question), and the interpretation of your results.
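For example, one common formulation of the RMSProp update for a parameter $w$ with gradient $g_t$, decay rate $\beta$, learning rate $\eta$, and stabilising constant $\epsilon$ is
\begin{equation}
S_t = \beta S_{t-1} + (1-\beta)\, g_t^2, \qquad
w_t = w_{t-1} - \frac{\eta}{\sqrt{S_t} + \epsilon}\, g_t .
\end{equation}
Adam additionally maintains a first-moment estimate of the gradient and applies bias correction to both moment estimates before the update; state whichever variant you actually implemented.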
|
||||
|
||||
In every section, you should present your results in a way that makes it easy for a reader to understand what they mean. You should facilitate comparisons either using graphs with multiple curves or (if appropriate, e.g. for accuracies) a results table. You need to avoid having too many figures, poorly labelled graphs, and graphs which should be comparable but which use different axis scales. A good presentation will enable the reader to compare trends in the same graph -- each graph should summarise the results relating to a particular research (sub)question.
|
||||
|
||||
Your discussion should interpret the results, both in terms of summarising the outcomes of a particular experiment, and attempting to relate them to the underlying models. A good report would have some analysis, resulting in an understanding of why particular results are observed, perhaps with reference to the literature. Use bibtex to organise your references -- in this case the references are in the file \verb+example-refs.bib+. Here is an example reference \citep{langley00}.
|
||||
|
||||
\section{Batch normalisation}
|
||||
In this section you should present batch normalisation, supported by equations or algorithmic pseudocode. Following this, present your experiments, again remembering to include the ``what'', the ``why'', and the interpretation of results.
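For example, for a mini-batch $\mathcal{B}$ of activations $x_i$ with mean $\mu_{\mathcal{B}}$ and variance $\sigma^2_{\mathcal{B}}$, the standard batch normalisation transform (a sketch; use whatever notation fits your presentation) is
\begin{equation}
\hat{x}_i = \frac{x_i - \mu_{\mathcal{B}}}{\sqrt{\sigma^2_{\mathcal{B}} + \epsilon}}, \qquad
y_i = \gamma \hat{x}_i + \beta ,
\end{equation}
where $\gamma$ and $\beta$ are learned scale and shift parameters.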
|
||||
|
||||
\section{Convolutional networks}
|
||||
In this section you should present your experiments with convolutional networks. Explain the idea of convolutional layers and pooling layers, and briefly explain how you did the implementation. There is no need to include chunks of code. You should report the experiments you have undertaken, again remembering to include \emph{what} experiments you performed (include details of hyperparameters, etc.), \emph{why} you performed them (what was the motivation for the experiments, what research questions are you exploring), and the interpretation and discussion of your results.
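For example, the forward propagation of a convolutional layer can be summarised in a single equation (a sketch, written as cross-correlation as in most implementations, and ignoring padding and stride): output feature map $f$ at position $(i,j)$ is
\begin{equation}
y_{f,i,j} = b_f + \sum_{c} \sum_{m} \sum_{n} w_{f,c,m,n} \, x_{c,\,i+m,\,j+n} ,
\end{equation}
where $c$ indexes input channels and $(m,n)$ the kernel positions.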
|
||||
|
||||
\section{Test results}
|
||||
The results reported in the previous sections should be on the validation set. You should finally report results on the EMNIST test set using what you judge to be the best deep neural network (without convolutional layers) and the best convolutional network. Again focus on what the experiments were (be precise), why you chose to do them (in particular, how did you choose the architectures/settings to use with the test set), and a discussion/interpretation of the results.
|
||||
|
||||
|
||||
\section{Conclusions}
|
||||
\label{sec:concl}
|
||||
You should draw conclusions from the experiments, related to the research questions outlined in the introduction (section~\ref{sec:intro}). You should state the conclusions clearly and concisely. It is good if the conclusion from one experiment influenced what you did in later experiments -- your aim is to learn from your experiments. Extra credit if you relate your findings to what has been reported in the literature.
|
||||
|
||||
A good conclusions section would also include a further work discussion, building on work done so far, and referencing the literature where appropriate.
|
||||
|
||||
\bibliography{example-refs}
|
||||
|
||||
\end{document}
|
||||
|
||||
|
||||
% This document was modified from the file originally made available by
|
||||
% Pat Langley and Andrea Danyluk for ICML-2K. This version was
|
||||
% created by Lise Getoor and Tobias Scheffer, it was slightly modified
|
||||
% from the 2010 version by Thorsten Joachims & Johannes Fuernkranz,
|
||||
% slightly modified from the 2009 version by Kiri Wagstaff and
|
||||
% Sam Roweis's 2008 version, which is slightly modified from
|
||||
% Prasad Tadepalli's 2007 version which is a lightly
|
||||
% changed version of the previous year's version by Andrew Moore,
|
||||
% which was in turn edited from those of Kristian Kersting and
|
||||
% Codrina Lauth. Alex Smola contributed to the algorithmic style files.
|
720
report/mlp2017.sty
Normal file
@ -0,0 +1,720 @@
|
||||
% File: mlp2017.sty (LaTeX style file for ICML-2017, version of 2017-05-31)
|
||||
|
||||
% Modified by Daniel Roy 2017: changed byline to use footnotes for affiliations, and removed emails
|
||||
|
||||
% This file contains the LaTeX formatting parameters for a two-column
|
||||
% conference proceedings that is 8.5 inches wide by 11 inches high.
|
||||
%
|
||||
% Modified by Percy Liang 12/2/2013: changed the year, location from the previous template for ICML 2014
|
||||
|
||||
% Modified by Fei Sha 9/2/2013: changed the year, location from the previous template for ICML 2013
|
||||
%
|
||||
% Modified by Fei Sha 4/24/2013: (1) remove the extra whitespace after the first author's email address (in %the camera-ready version) (2) change the Proceeding ... of ICML 2010 to 2014 so PDF's metadata will show up % correctly
|
||||
%
|
||||
% Modified by Sanjoy Dasgupta, 2013: changed years, location
|
||||
%
|
||||
% Modified by Francesco Figari, 2012: changed years, location
|
||||
%
|
||||
% Modified by Christoph Sawade and Tobias Scheffer, 2011: added line
|
||||
% numbers, changed years
|
||||
%
|
||||
% Modified by Hal Daume III, 2010: changed years, added hyperlinks
|
||||
%
|
||||
% Modified by Kiri Wagstaff, 2009: changed years
|
||||
%
|
||||
% Modified by Sam Roweis, 2008: changed years
|
||||
%
|
||||
% Modified by Ricardo Silva, 2007: update of the ifpdf verification
|
||||
%
|
||||
% Modified by Prasad Tadepalli and Andrew Moore, merely changing years.
|
||||
%
|
||||
% Modified by Kristian Kersting, 2005, based on Jennifer Dy's 2004 version
|
||||
% - running title. If the original title is too long or is breaking a line,
|
||||
% use \mlptitlerunning{...} in the preamble to supply a shorter form.
|
||||
% Added fancyhdr package to get a running head.
|
||||
% - Updated to store the page size because pdflatex does compile the
|
||||
% page size into the pdf.
|
||||
%
|
||||
% Hacked by Terran Lane, 2003:
|
||||
% - Updated to use LaTeX2e style file conventions (ProvidesPackage,
|
||||
% etc.)
|
||||
% - Added an ``appearing in'' block at the base of the first column
|
||||
% (thus keeping the ``appearing in'' note out of the bottom margin
|
||||
% where the printer should strip in the page numbers).
|
||||
% - Added a package option [accepted] that selects between the ``Under
|
||||
% review'' notice (default, when no option is specified) and the
|
||||
% ``Appearing in'' notice (for use when the paper has been accepted
|
||||
% and will appear).
|
||||
%
|
||||
% Originally created as: ml2k.sty (LaTeX style file for ICML-2000)
|
||||
% by P. Langley (12/23/99)
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%
|
||||
%% This version of the style file supports both a ``review'' version
|
||||
%% and a ``final/accepted'' version. The difference is only in the
|
||||
%% text that appears in the note at the bottom of the first column of
|
||||
%% the first page. The default behavior is to print a note to the
|
||||
%% effect that the paper is under review and don't distribute it. The
|
||||
%% final/accepted version prints an ``Appearing in'' note. To get the
|
||||
%% latter behavior, in the calling file change the ``usepackage'' line
|
||||
%% from:
|
||||
%% \usepackage{icml2017}
|
||||
%% to
|
||||
%% \usepackage[accepted]{icml2017}
|
||||
%%%%%%%%%%%%%%%%%%%%
|
||||
|
||||
\NeedsTeXFormat{LaTeX2e}
|
||||
\ProvidesPackage{mlp2017}[2017/01/01 MLP Coursework Style File]
|
||||
|
||||
% Use fancyhdr package
|
||||
\RequirePackage{fancyhdr}
|
||||
\RequirePackage{color}
|
||||
\RequirePackage{algorithm}
|
||||
\RequirePackage{algorithmic}
|
||||
\RequirePackage{natbib}
|
||||
\RequirePackage{eso-pic} % used by \AddToShipoutPicture
|
||||
\RequirePackage{forloop}
|
||||
|
||||
%%%%%%%% Options
|
||||
%\DeclareOption{accepted}{%
|
||||
% \renewcommand{\Notice@String}{\ICML@appearing}
|
||||
\gdef\isaccepted{1}
|
||||
%}
|
||||
\DeclareOption{nohyperref}{%
|
||||
\gdef\nohyperref{1}
|
||||
}
|
||||
|
||||
\ifdefined\nohyperref\else\ifdefined\hypersetup
|
||||
\definecolor{mydarkblue}{rgb}{0,0.08,0.45}
|
||||
\hypersetup{ %
|
||||
pdftitle={},
|
||||
pdfauthor={},
|
||||
pdfsubject={MLP Coursework 2017-18},
|
||||
pdfkeywords={},
|
||||
pdfborder=0 0 0,
|
||||
pdfpagemode=UseNone,
|
||||
colorlinks=true,
|
||||
linkcolor=mydarkblue,
|
||||
citecolor=mydarkblue,
|
||||
filecolor=mydarkblue,
|
||||
urlcolor=mydarkblue,
|
||||
pdfview=FitH}
|
||||
|
||||
\ifdefined\isaccepted \else
|
||||
\hypersetup{pdfauthor={Anonymous Submission}}
|
||||
\fi
|
||||
\fi\fi
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%
|
||||
% This string is printed at the bottom of the page for the
|
||||
% final/accepted version of the ``appearing in'' note. Modify it to
|
||||
% change that text.
|
||||
%%%%%%%%%%%%%%%%%%%%
|
||||
\newcommand{\ICML@appearing}{\textit{MLP Coursework 2 2017-18}}
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%
|
||||
% This string is printed at the bottom of the page for the draft/under
|
||||
% review version of the ``appearing in'' note. Modify it to change
|
||||
% that text.
|
||||
%%%%%%%%%%%%%%%%%%%%
|
||||
\newcommand{\Notice@String}{MLP Coursework 2 2017-18}
|
||||
|
||||
% Cause the declared options to actually be parsed and activated
|
||||
\ProcessOptions\relax
|
||||
|
||||
% Uncomment the following for debugging. It will cause LaTeX to dump
|
||||
% the version of the ``appearing in'' string that will actually appear
|
||||
% in the document.
|
||||
%\typeout{>> Notice string='\Notice@String'}
|
||||
|
||||
% Change citation commands to be more like old ICML styles
|
||||
\newcommand{\yrcite}[1]{\citeyearpar{#1}}
|
||||
\renewcommand{\cite}[1]{\citep{#1}}
|
||||
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
% to ensure the letter format is used. pdflatex does compile the
|
||||
% page size into the pdf. This is done using \pdfpagewidth and
|
||||
% \pdfpageheight. As LaTeX does not know these directives, we first
|
||||
% check whether pdflatex or latex is used.
|
||||
%
|
||||
% Kristian Kersting 2005
|
||||
%
|
||||
% in order to account for the more recent use of pdfetex as the default
|
||||
% compiler, I have changed the pdf verification.
|
||||
%
|
||||
% Ricardo Silva 2007
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
|
||||
\paperwidth=210mm
|
||||
\paperheight=297mm
|
||||
|
||||
% old PDFLaTex verification, circa 2005
|
||||
%
|
||||
%\newif\ifpdf\ifx\pdfoutput\undefined
|
||||
% \pdffalse % we are not running PDFLaTeX
|
||||
%\else
|
||||
% \pdfoutput=1 % we are running PDFLaTeX
|
||||
% \pdftrue
|
||||
%\fi
|
||||
|
||||
\newif\ifpdf %adapted from ifpdf.sty
|
||||
\ifx\pdfoutput\undefined
|
||||
\else
|
||||
\ifx\pdfoutput\relax
|
||||
\else
|
||||
\ifcase\pdfoutput
|
||||
\else
|
||||
\pdftrue
|
||||
\fi
|
||||
\fi
|
||||
\fi
|
||||
|
||||
\ifpdf
|
||||
% \pdfpagewidth=\paperwidth
|
||||
% \pdfpageheight=\paperheight
|
||||
\setlength{\pdfpagewidth}{210mm}
|
||||
\setlength{\pdfpageheight}{297mm}
|
||||
\fi
|
||||
|
||||
% Physical page layout
|
||||
|
||||
\evensidemargin -5.5mm
|
||||
\oddsidemargin -5.5mm
|
||||
\setlength\textheight{248mm}
|
||||
\setlength\textwidth{170mm}
|
||||
\setlength\columnsep{6.5mm}
|
||||
\setlength\headheight{10pt}
|
||||
\setlength\headsep{10pt}
|
||||
\addtolength{\topmargin}{-20pt}
|
||||
|
||||
%\setlength\headheight{1em}
|
||||
%\setlength\headsep{1em}
|
||||
\addtolength{\topmargin}{-6mm}
|
||||
|
||||
%\addtolength{\topmargin}{-2em}
|
||||
|
||||
%% The following is adapted from code in the acmconf.sty conference
|
||||
%% style file. The constants in it are somewhat magical, and appear
|
||||
%% to work well with the two-column format on US letter paper that
|
||||
%% ICML uses, but will break if you change that layout, or if you use
|
||||
%% a longer block of text for the copyright notice string. Fiddle with
|
||||
%% them if necessary to get the block to fit/look right.
|
||||
%%
|
||||
%% -- Terran Lane, 2003
|
||||
%%
|
||||
%% The following comments are included verbatim from acmconf.sty:
|
||||
%%
|
||||
%%% This section (written by KBT) handles the 1" box in the lower left
|
||||
%%% corner of the left column of the first page by creating a picture,
|
||||
%%% and inserting the predefined string at the bottom (with a negative
|
||||
%%% displacement to offset the space allocated for a non-existent
|
||||
%%% caption).
|
||||
%%%
|
||||
\def\ftype@copyrightbox{8}
|
||||
\def\@copyrightspace{
|
||||
% Create a float object positioned at the bottom of the column. Note
|
||||
% that because of the mystical nature of floats, this has to be called
|
||||
% before the first column is populated with text (e.g., from the title
|
||||
% or abstract blocks). Otherwise, the text will force the float to
|
||||
% the next column. -- TDRL.
|
||||
\@float{copyrightbox}[b]
|
||||
\begin{center}
|
||||
\setlength{\unitlength}{1pc}
|
||||
\begin{picture}(20,1.5)
|
||||
% Create a line separating the main text from the note block.
|
||||
% 4.818pc==0.8in.
|
||||
\put(0,2.5){\line(1,0){4.818}}
|
||||
% Insert the text string itself. Note that the string has to be
|
||||
% enclosed in a parbox -- the \put call needs a box object to
|
||||
% position. Without the parbox, the text gets splattered across the
|
||||
% bottom of the page semi-randomly. The 19.75pc distance seems to be
|
||||
% the width of the column, though I can't find an appropriate distance
|
||||
% variable to substitute here. -- TDRL.
|
||||
\put(0,0){\parbox[b]{19.75pc}{\small \Notice@String}}
|
||||
\end{picture}
|
||||
\end{center}
|
||||
\end@float}
|
||||
|
||||
% Note: A few Latex versions need the next line instead of the former.
|
||||
% \addtolength{\topmargin}{0.3in}
|
||||
% \setlength\footheight{0pt}
|
||||
\setlength\footskip{0pt}
|
||||
%\pagestyle{empty}
|
||||
\flushbottom \twocolumn
|
||||
\sloppy
|
||||
|
||||
% Clear out the addcontentsline command
|
||||
\def\addcontentsline#1#2#3{}
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
%%% commands for formatting paper title, author names, and addresses.
|
||||
|
||||
%%start%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
%%%%%% title as running head -- Kristian Kersting 2005 %%%%%%%%%%%%%
|
||||
|
||||
|
||||
%\makeatletter
|
||||
%\newtoks\mytoksa
|
||||
%\newtoks\mytoksb
|
||||
%\newcommand\addtomylist[2]{%
|
||||
% \mytoksa\expandafter{#1}%
|
||||
% \mytoksb{#2}%
|
||||
% \edef#1{\the\mytoksa\the\mytoksb}%
|
||||
%}
|
||||
%\makeatother
|
||||
|
||||
% box to check the size of the running head
|
||||
\newbox\titrun
|
||||
|
||||
% general page style
|
||||
\pagestyle{fancy}
|
||||
\fancyhf{}
|
||||
\fancyhead{}
|
||||
\fancyfoot{}
|
||||
% set the width of the head rule to 1 point
|
||||
\renewcommand{\headrulewidth}{1pt}
|
||||
|
||||
% definition to set the head as running head in the preamble
|
||||
\def\mlptitlerunning#1{\gdef\@mlptitlerunning{#1}}
|
||||
|
||||
% main definition adapting \mlptitle from 2004
|
||||
\long\def\mlptitle#1{%
|
||||
|
||||
%check whether @mlptitlerunning exists
|
||||
% if not \mlptitle is used as running head
|
||||
\ifx\undefined\@mlptitlerunning%
|
||||
\gdef\@mlptitlerunning{#1}
|
||||
\fi
|
||||
|
||||
%add it to pdf information
|
||||
\ifdefined\nohyperref\else\ifdefined\hypersetup
|
||||
\hypersetup{pdftitle={#1}}
|
||||
\fi\fi
|
||||
|
||||
%get the dimension of the running title
|
||||
\global\setbox\titrun=\vbox{\small\bf\@mlptitlerunning}
|
||||
|
||||
% error flag
|
||||
\gdef\@runningtitleerror{0}
|
||||
|
||||
% running title too long
|
||||
\ifdim\wd\titrun>\textwidth%
|
||||
{\gdef\@runningtitleerror{1}}%
|
||||
% running title breaks a line
|
||||
\else\ifdim\ht\titrun>6.25pt
|
||||
{\gdef\@runningtitleerror{2}}%
|
||||
\fi
|
||||
\fi
|
||||
|
||||
% if there is something wrong with the running title
|
||||
\ifnum\@runningtitleerror>0
|
||||
\typeout{}%
|
||||
\typeout{}%
|
||||
\typeout{*******************************************************}%
|
||||
\typeout{Title exceeds size limitations for running head.}%
|
||||
\typeout{Please supply a shorter form for the running head}
|
||||
\typeout{with \string\mlptitlerunning{...}\space prior to \string\begin{document}}%
|
||||
\typeout{*******************************************************}%
|
||||
\typeout{}%
|
||||
\typeout{}%
|
||||
% set default running title
|
||||
\chead{\small\bf Title Suppressed Due to Excessive Size}%
|
||||
\else
|
||||
% 'everything' fine, set provided running title
|
||||
\chead{\small\bf\@mlptitlerunning}%
|
||||
\fi
|
||||
|
||||
% no running title on the first page of the paper
|
||||
\thispagestyle{empty}
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%% Kristian Kersting %%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
%end%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
|
||||
{\center\baselineskip 18pt
|
||||
\toptitlebar{\Large\bf #1}\bottomtitlebar}
|
||||
}
|
||||
|
||||
|
||||
\gdef\icmlfullauthorlist{}
|
||||
\newcommand\addstringtofullauthorlist{\g@addto@macro\icmlfullauthorlist}
|
||||
\newcommand\addtofullauthorlist[1]{%
|
||||
\ifdefined\icmlanyauthors%
|
||||
\addstringtofullauthorlist{, #1}%
|
||||
\else%
|
||||
\addstringtofullauthorlist{#1}%
|
||||
\gdef\icmlanyauthors{1}%
|
||||
\fi%
|
||||
\ifdefined\nohyperref\else\ifdefined\hypersetup%
|
||||
\hypersetup{pdfauthor=\icmlfullauthorlist}%
|
||||
\fi\fi}
|
||||
|
||||
|
||||
\def\toptitlebar{\hrule height1pt \vskip .25in}
|
||||
\def\bottomtitlebar{\vskip .22in \hrule height1pt \vskip .3in}
|
||||
|
||||
\newenvironment{icmlauthorlist}{%
|
||||
\setlength\topsep{0pt}
|
||||
\setlength\parskip{0pt}
|
||||
\begin{center}
|
||||
}{%
|
||||
\end{center}
|
||||
}
|
||||
|
||||
\newcounter{@affiliationcounter}
|
||||
\newcommand{\@pa}[1]{%
|
||||
% ``#1''
|
||||
\ifcsname the@affil#1\endcsname
|
||||
% do nothing
|
||||
\else
|
||||
\ifcsname @icmlsymbol#1\endcsname
|
||||
% nothing
|
||||
\else
|
||||
\stepcounter{@affiliationcounter}%
|
||||
\newcounter{@affil#1}%
|
||||
\setcounter{@affil#1}{\value{@affiliationcounter}}%
|
||||
\fi
|
||||
\fi%
|
||||
\ifcsname @icmlsymbol#1\endcsname
|
||||
\textsuperscript{\csname @icmlsymbol#1\endcsname\,}%
|
||||
\else
|
||||
%\expandafter\footnotemark[\arabic{@affil#1}\,]%
|
||||
\textsuperscript{\arabic{@affil#1}\,}%
|
||||
\fi
|
||||
}
|
||||
|
||||
%\newcommand{\icmlauthor}[2]{%
|
||||
%\addtofullauthorlist{#1}%
|
||||
%#1\@for\theaffil:=#2\do{\pa{\theaffil}}%
|
||||
%}
|
||||
\newcommand{\icmlauthor}[2]{%
|
||||
\ifdefined\isaccepted
|
||||
\mbox{\bf #1}\,\@for\theaffil:=#2\do{\@pa{\theaffil}} \addtofullauthorlist{#1}%
|
||||
\else
|
||||
\ifdefined\@icmlfirsttime
|
||||
\else
|
||||
\gdef\@icmlfirsttime{1}
|
||||
\mbox{\bf Anonymous Authors}\@pa{@anon} \addtofullauthorlist{Anonymous Authors}
|
||||
\fi
|
||||
\fi
|
||||
}
|
||||
|
||||
\newcommand{\icmlsetsymbol}[2]{%
|
||||
\expandafter\gdef\csname @icmlsymbol#1\endcsname{#2}
|
||||
}
|
||||
|
||||
|
||||
\newcommand{\icmlaffiliation}[2]{%
|
||||
\ifdefined\isaccepted
|
||||
\ifcsname the@affil#1\endcsname
|
||||
\expandafter\gdef\csname @affilname\csname the@affil#1\endcsname\endcsname{#2}%
|
||||
\else
|
||||
{\bf AUTHORERR: Error in use of \textbackslash{}icmlaffiliation command. Label ``#1'' not mentioned in some \textbackslash{}icmlauthor\{author name\}\{labels here\} command beforehand. }
|
||||
\typeout{}%
|
||||
\typeout{}%
|
||||
\typeout{*******************************************************}%
|
||||
\typeout{Affiliation label undefined. }%
|
||||
\typeout{Make sure \string\icmlaffiliation\space follows }
|
||||
\typeout{all of \string\icmlauthor\space commands}%
|
||||
\typeout{*******************************************************}%
|
||||
\typeout{}%
|
||||
\typeout{}%
|
||||
\fi
|
||||
\else % \isaccepted
|
||||
% can be called multiple times... it's idempotent
|
||||
\expandafter\gdef\csname @affilname1\endcsname{Anonymous Institution, Anonymous City, Anonymous Region, Anonymous Country}
|
||||
\fi
|
||||
}
|
||||
|
||||
\newcommand{\icmlcorrespondingauthor}[2]{
|
||||
\ifdefined\isaccepted
|
||||
\ifdefined\icmlcorrespondingauthor@text
|
||||
\g@addto@macro\icmlcorrespondingauthor@text{, #1 \textless{}#2\textgreater{}}
|
||||
\else
|
||||
\gdef\icmlcorrespondingauthor@text{#1 \textless{}#2\textgreater{}}
|
||||
\fi
|
||||
\else
|
||||
\gdef\icmlcorrespondingauthor@text{Anonymous Author \textless{}anon.email@domain.com\textgreater{}}
|
||||
\fi
|
||||
}
|
||||
|
||||
\newcommand{\icmlEqualContribution}{\textsuperscript{*}Equal contribution }
|
||||
|
||||
\newcounter{@affilnum}
|
||||
\newcommand{\printAffiliationsAndNotice}[1]{%
|
||||
\stepcounter{@affiliationcounter}%
|
||||
{\let\thefootnote\relax\footnotetext{\hspace*{-\footnotesep}#1%
|
||||
\forloop{@affilnum}{1}{\value{@affilnum} < \value{@affiliationcounter}}{
|
||||
\textsuperscript{\arabic{@affilnum}}\ifcsname @affilname\the@affilnum\endcsname%
|
||||
\csname @affilname\the@affilnum\endcsname%
|
||||
\else
|
||||
{\bf AUTHORERR: Missing \textbackslash{}icmlaffiliation.}
|
||||
\fi
|
||||
}.
|
||||
\ifdefined\icmlcorrespondingauthor@text
|
||||
Correspondence to: \icmlcorrespondingauthor@text.
|
||||
\else
|
||||
{\bf AUTHORERR: Missing \textbackslash{}icmlcorrespondingauthor.}
|
||||
\fi
|
||||
|
||||
\ \\
|
||||
\Notice@String
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
%\makeatother
|
||||
|
||||
\long\def\icmladdress#1{%
|
||||
{\bf The \textbackslash{}icmladdress command is no longer used. See the example\_paper .tex file for usage of \textbackslash{}icmlauthor and \textbackslash{}icmlaffiliation.}
|
||||
}
|
||||
|
||||
%% keywords as first class citizens
|
||||
\def\icmlkeywords#1{%
|
||||
% \ifdefined\isaccepted \else
|
||||
% \par {\bf Keywords:} #1%
|
||||
% \fi
|
||||
% \ifdefined\nohyperref\else\ifdefined\hypersetup
|
||||
% \hypersetup{pdfkeywords={#1}}
|
||||
% \fi\fi
|
||||
% \ifdefined\isaccepted \else
|
||||
% \par {\bf Keywords:} #1%
|
||||
% \fi
|
||||
\ifdefined\nohyperref\else\ifdefined\hypersetup
|
||||
\hypersetup{pdfkeywords={#1}}
|
||||
\fi\fi
|
||||
}
|
||||
|
||||
% modification to natbib citations
|
||||
\setcitestyle{authoryear,round,citesep={;},aysep={,},yysep={;}}
|
||||
|
||||
% Redefinition of the abstract environment.
|
||||
\renewenvironment{abstract}
|
||||
{%
|
||||
% Insert the ``appearing in'' copyright notice.
|
||||
%\@copyrightspace
|
||||
\centerline{\large\bf Abstract}
|
||||
\vspace{-0.12in}\begin{quote}}
|
||||
{\par\end{quote}\vskip 0.12in}
|
||||
|
||||
% numbered section headings with different treatment of numbers
|
||||
|
||||
\def\@startsection#1#2#3#4#5#6{\if@noskipsec \leavevmode \fi
|
||||
\par \@tempskipa #4\relax
|
||||
\@afterindenttrue
|
||||
% Altered the following line to indent a section's first paragraph.
|
||||
% \ifdim \@tempskipa <\z@ \@tempskipa -\@tempskipa \@afterindentfalse\fi
|
||||
\ifdim \@tempskipa <\z@ \@tempskipa -\@tempskipa \fi
|
||||
\if@nobreak \everypar{}\else
|
||||
\addpenalty{\@secpenalty}\addvspace{\@tempskipa}\fi \@ifstar
|
||||
{\@ssect{#3}{#4}{#5}{#6}}{\@dblarg{\@sict{#1}{#2}{#3}{#4}{#5}{#6}}}}
|
||||
|
||||
\def\@sict#1#2#3#4#5#6[#7]#8{\ifnum #2>\c@secnumdepth
|
||||
\def\@svsec{}\else
|
||||
\refstepcounter{#1}\edef\@svsec{\csname the#1\endcsname}\fi
|
||||
\@tempskipa #5\relax
|
||||
\ifdim \@tempskipa>\z@
|
||||
\begingroup #6\relax
|
||||
\@hangfrom{\hskip #3\relax\@svsec.~}{\interlinepenalty \@M #8\par}
|
||||
\endgroup
|
||||
\csname #1mark\endcsname{#7}\addcontentsline
|
||||
{toc}{#1}{\ifnum #2>\c@secnumdepth \else
|
||||
\protect\numberline{\csname the#1\endcsname}\fi
|
||||
#7}\else
|
||||
\def\@svsechd{#6\hskip #3\@svsec #8\csname #1mark\endcsname
|
||||
{#7}\addcontentsline
|
||||
{toc}{#1}{\ifnum #2>\c@secnumdepth \else
|
||||
\protect\numberline{\csname the#1\endcsname}\fi
|
||||
#7}}\fi
|
||||
\@xsect{#5}}
|
||||
|
||||
\def\@sect#1#2#3#4#5#6[#7]#8{\ifnum #2>\c@secnumdepth
|
||||
\def\@svsec{}\else
|
||||
\refstepcounter{#1}\edef\@svsec{\csname the#1\endcsname\hskip 0.4em }\fi
|
||||
\@tempskipa #5\relax
|
||||
\ifdim \@tempskipa>\z@
|
||||
\begingroup #6\relax
|
||||
\@hangfrom{\hskip #3\relax\@svsec}{\interlinepenalty \@M #8\par}
|
||||
\endgroup
|
||||
\csname #1mark\endcsname{#7}\addcontentsline
|
||||
{toc}{#1}{\ifnum #2>\c@secnumdepth \else
|
||||
\protect\numberline{\csname the#1\endcsname}\fi
|
||||
#7}\else
|
||||
\def\@svsechd{#6\hskip #3\@svsec #8\csname #1mark\endcsname
|
||||
{#7}\addcontentsline
|
||||
{toc}{#1}{\ifnum #2>\c@secnumdepth \else
|
||||
\protect\numberline{\csname the#1\endcsname}\fi
|
||||
#7}}\fi
|
||||
\@xsect{#5}}
|
||||
|
||||
% section headings with less space above and below them
|
||||
\def\thesection {\arabic{section}}
|
||||
\def\thesubsection {\thesection.\arabic{subsection}}
|
||||
\def\section{\@startsection{section}{1}{\z@}{-0.12in}{0.02in}
|
||||
{\large\bf\raggedright}}
|
||||
\def\subsection{\@startsection{subsection}{2}{\z@}{-0.10in}{0.01in}
|
||||
{\normalsize\bf\raggedright}}
|
||||
\def\subsubsection{\@startsection{subsubsection}{3}{\z@}{-0.08in}{0.01in}
|
||||
{\normalsize\sc\raggedright}}
|
||||
\def\paragraph{\@startsection{paragraph}{4}{\z@}{1.5ex plus
|
||||
0.5ex minus .2ex}{-1em}{\normalsize\bf}}
|
||||
\def\subparagraph{\@startsection{subparagraph}{5}{\z@}{1.5ex plus
|
||||
0.5ex minus .2ex}{-1em}{\normalsize\bf}}
|
||||
|
||||
% Footnotes
|
||||
\footnotesep 6.65pt %
|
||||
\skip\footins 9pt
|
||||
\def\footnoterule{\kern-3pt \hrule width 0.8in \kern 2.6pt }
|
||||
\setcounter{footnote}{0}
|
||||
|
||||
% Lists and paragraphs
|
||||
\parindent 0pt
|
||||
\topsep 4pt plus 1pt minus 2pt
|
||||
\partopsep 1pt plus 0.5pt minus 0.5pt
|
||||
\itemsep 2pt plus 1pt minus 0.5pt
|
||||
\parsep 2pt plus 1pt minus 0.5pt
|
||||
\parskip 6pt
|
||||
|
||||
\leftmargin 2em \leftmargini\leftmargin \leftmarginii 2em
|
||||
\leftmarginiii 1.5em \leftmarginiv 1.0em \leftmarginv .5em
|
||||
\leftmarginvi .5em
|
||||
\labelwidth\leftmargini\advance\labelwidth-\labelsep \labelsep 5pt
|
||||
|
||||
\def\@listi{\leftmargin\leftmargini}
|
||||
\def\@listii{\leftmargin\leftmarginii
|
||||
\labelwidth\leftmarginii\advance\labelwidth-\labelsep
|
||||
\topsep 2pt plus 1pt minus 0.5pt
|
||||
\parsep 1pt plus 0.5pt minus 0.5pt
|
||||
\itemsep \parsep}
|
||||
\def\@listiii{\leftmargin\leftmarginiii
|
||||
\labelwidth\leftmarginiii\advance\labelwidth-\labelsep
|
||||
\topsep 1pt plus 0.5pt minus 0.5pt
|
||||
\parsep \z@ \partopsep 0.5pt plus 0pt minus 0.5pt
|
||||
\itemsep \topsep}
|
||||
\def\@listiv{\leftmargin\leftmarginiv
|
||||
\labelwidth\leftmarginiv\advance\labelwidth-\labelsep}
|
||||
\def\@listv{\leftmargin\leftmarginv
|
||||
\labelwidth\leftmarginv\advance\labelwidth-\labelsep}
|
||||
\def\@listvi{\leftmargin\leftmarginvi
|
||||
\labelwidth\leftmarginvi\advance\labelwidth-\labelsep}
|
||||
|
||||
\abovedisplayskip 7pt plus2pt minus5pt%
|
||||
\belowdisplayskip \abovedisplayskip
|
||||
\abovedisplayshortskip 0pt plus3pt%
|
||||
\belowdisplayshortskip 4pt plus3pt minus3pt%
|
||||
|
||||
% Less leading in most fonts (due to the narrow columns)
|
||||
% The choices were between 1-pt and 1.5-pt leading
|
||||
\def\@normalsize{\@setsize\normalsize{11pt}\xpt\@xpt}
|
||||
\def\small{\@setsize\small{10pt}\ixpt\@ixpt}
|
||||
\def\footnotesize{\@setsize\footnotesize{10pt}\ixpt\@ixpt}
|
||||
\def\scriptsize{\@setsize\scriptsize{8pt}\viipt\@viipt}
|
||||
\def\tiny{\@setsize\tiny{7pt}\vipt\@vipt}
|
||||
\def\large{\@setsize\large{14pt}\xiipt\@xiipt}
|
||||
\def\Large{\@setsize\Large{16pt}\xivpt\@xivpt}
|
||||
\def\LARGE{\@setsize\LARGE{20pt}\xviipt\@xviipt}
|
||||
\def\huge{\@setsize\huge{23pt}\xxpt\@xxpt}
|
||||
\def\Huge{\@setsize\Huge{28pt}\xxvpt\@xxvpt}
|
||||
|
||||
% Revised formatting for figure captions and table titles.
|
||||
\newsavebox\newcaptionbox\newdimen\newcaptionboxwid
|
||||
|
||||
\long\def\@makecaption#1#2{
|
||||
\vskip 10pt
|
||||
\baselineskip 11pt
|
||||
\setbox\@tempboxa\hbox{#1. #2}
|
||||
\ifdim \wd\@tempboxa >\hsize
|
||||
\sbox{\newcaptionbox}{\small\sl #1.~}
|
||||
\newcaptionboxwid=\wd\newcaptionbox
|
||||
\usebox\newcaptionbox {\footnotesize #2}
|
||||
% \usebox\newcaptionbox {\small #2}
|
||||
\else
|
||||
\centerline{{\small\sl #1.} {\small #2}}
|
||||
\fi}
|
||||
|
||||
\def\fnum@figure{Figure \thefigure}
|
||||
\def\fnum@table{Table \thetable}
|
||||
|
||||
% Strut macros for skipping spaces above and below text in tables.
|
||||
\def\abovestrut#1{\rule[0in]{0in}{#1}\ignorespaces}
|
||||
\def\belowstrut#1{\rule[-#1]{0in}{#1}\ignorespaces}
|
||||
|
||||
\def\abovespace{\abovestrut{0.20in}}
|
||||
\def\aroundspace{\abovestrut{0.20in}\belowstrut{0.10in}}
|
||||
\def\belowspace{\belowstrut{0.10in}}
|
||||
|
||||
% Various personal itemization commands.
|
||||
\def\texitem#1{\par\noindent\hangindent 12pt
|
||||
\hbox to 12pt {\hss #1 ~}\ignorespaces}
|
||||
\def\icmlitem{\texitem{$\bullet$}}
|
||||
|
||||
% To comment out multiple lines of text.
|
||||
\long\def\comment#1{}
|
||||
|
||||
|
||||
|
||||
|
||||
%% Line counter (not in final version). Adapted from NIPS style file by Christoph Sawade
|
||||
|
||||
% Vertical Ruler
|
||||
% This code is, largely, from the CVPR 2010 conference style file
|
||||
% ----- define vruler
|
||||
\makeatletter
|
||||
\newbox\icmlrulerbox
|
||||
\newcount\icmlrulercount
|
||||
\newdimen\icmlruleroffset
|
||||
\newdimen\cv@lineheight
|
||||
\newdimen\cv@boxheight
|
||||
\newbox\cv@tmpbox
|
||||
\newcount\cv@refno
|
||||
\newcount\cv@tot
|
||||
% NUMBER with left flushed zeros \fillzeros[<WIDTH>]<NUMBER>
|
||||
\newcount\cv@tmpc@ \newcount\cv@tmpc
|
||||
\def\fillzeros[#1]#2{\cv@tmpc@=#2\relax\ifnum\cv@tmpc@<0\cv@tmpc@=-\cv@tmpc@\fi
|
||||
\cv@tmpc=1 %
|
||||
\loop\ifnum\cv@tmpc@<10 \else \divide\cv@tmpc@ by 10 \advance\cv@tmpc by 1 \fi
|
||||
\ifnum\cv@tmpc@=10\relax\cv@tmpc@=11\relax\fi \ifnum\cv@tmpc@>10 \repeat
|
||||
\ifnum#2<0\advance\cv@tmpc1\relax-\fi
|
||||
\loop\ifnum\cv@tmpc<#1\relax0\advance\cv@tmpc1\relax\fi \ifnum\cv@tmpc<#1 \repeat
|
||||
\cv@tmpc@=#2\relax\ifnum\cv@tmpc@<0\cv@tmpc@=-\cv@tmpc@\fi \relax\the\cv@tmpc@}%
|
||||
% \makevruler[<SCALE>][<INITIAL_COUNT>][<STEP>][<DIGITS>][<HEIGHT>]
|
||||
\def\makevruler[#1][#2][#3][#4][#5]{
|
||||
\begingroup\offinterlineskip
|
||||
\textheight=#5\vbadness=10000\vfuzz=120ex\overfullrule=0pt%
|
||||
\global\setbox\icmlrulerbox=\vbox to \textheight{%
|
||||
{
|
||||
\parskip=0pt\hfuzz=150em\cv@boxheight=\textheight
|
||||
\cv@lineheight=#1\global\icmlrulercount=#2%
|
||||
\cv@tot\cv@boxheight\divide\cv@tot\cv@lineheight\advance\cv@tot2%
|
||||
\cv@refno1\vskip-\cv@lineheight\vskip1ex%
|
||||
\loop\setbox\cv@tmpbox=\hbox to0cm{ % side margin
|
||||
\hfil {\hfil\fillzeros[#4]\icmlrulercount}
|
||||
}%
|
||||
\ht\cv@tmpbox\cv@lineheight\dp\cv@tmpbox0pt\box\cv@tmpbox\break
|
||||
\advance\cv@refno1\global\advance\icmlrulercount#3\relax
|
||||
\ifnum\cv@refno<\cv@tot\repeat
|
||||
}
|
||||
}
|
||||
\endgroup
|
||||
}%
|
||||
\makeatother
|
||||
% ----- end of vruler
|
||||
|
||||
|
||||
% \makevruler[<SCALE>][<INITIAL_COUNT>][<STEP>][<DIGITS>][<HEIGHT>]
|
||||
\def\icmlruler#1{\makevruler[12pt][#1][1][3][\textheight]\usebox{\icmlrulerbox}}
|
||||
\AddToShipoutPicture{%
|
||||
\icmlruleroffset=\textheight
|
||||
\advance\icmlruleroffset by 5.2pt % top margin
|
||||
\color[rgb]{.7,.7,.7}
|
||||
\ifdefined\isaccepted \else
|
||||
\AtTextUpperLeft{%
|
||||
\put(\LenToUnit{-35pt},\LenToUnit{-\icmlruleroffset}){%left ruler
|
||||
\icmlruler{\icmlrulercount}}
|
||||
% \put(\LenToUnit{1.04\textwidth},\LenToUnit{-\icmlruleroffset}){%right ruler
|
||||
% \icmlruler{\icmlrulercount}}
|
||||
}
|
||||
\fi
|
||||
}
|
||||
\endinput
|
1246
report/natbib.sty
Normal file
File diff suppressed because it is too large
63
scripts/generate_inputs.py
Normal file
@ -0,0 +1,63 @@
|
||||
import numpy as np
|
||||
from mlp.layers import ConvolutionalLayer, BatchNormalizationLayer
|
||||
import argparse
|
||||
|
||||
# Note: the original description string was a leftover from an unrelated script.
parser = argparse.ArgumentParser(description='Generate per-student test outputs for the coursework layer implementations')
|
||||
|
||||
parser.add_argument('--student_id', nargs="?", type=str, help='Your student id in the format "sxxxxxxx"')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
student_id = args.student_id
|
||||
|
||||
def generate_inputs(student_id):
|
||||
student_number = student_id
|
||||
tests = np.arange(96).reshape((2, 3, 4, 4))
|
||||
tests[:, 0, :, :] = float(student_number[1:3]) / 10 - 5
|
||||
tests[:, :, 1, :] = float(student_number[3:5]) / 10 - 5
|
||||
tests[:, 2, :, :] = float(student_number[5:7]) / 10 - 5
|
||||
tests[0, 1, :, :] = float(student_number[7]) / 10 - 5
|
||||
return tests
|
||||
|
||||
|
||||
|
||||
test_inputs = generate_inputs(student_id)
|
||||
test_grads_wrt_outputs = np.arange(-20, 16).reshape((2, 2, 3, 3))
|
||||
inputs = np.arange(96).reshape((2, 3, 4, 4))
|
||||
kernels = np.arange(-12, 12).reshape((2, 3, 2, 2))
|
||||
biases = np.arange(2)
|
||||
|
||||
# produce ConvolutionalLayer fprop, bprop and grads_wrt_params
|
||||
activation_layer = ConvolutionalLayer(num_input_channels=3, num_output_channels=2, input_dim_1=4, input_dim_2=4,
|
||||
kernel_dim_1=2, kernel_dim_2=2)
|
||||
activation_layer.params = [kernels, biases]
|
||||
conv_fprop = activation_layer.fprop(test_inputs)
|
||||
conv_bprop = activation_layer.bprop(
|
||||
test_inputs, conv_fprop, test_grads_wrt_outputs)
|
||||
conv_grads_wrt_params = activation_layer.grads_wrt_params(test_inputs, test_grads_wrt_outputs)
|
||||
|
||||
test_inputs = np.reshape(test_inputs, newshape=(2, -1))
|
||||
test_grads_wrt_outputs = np.arange(-48, 48).reshape((2, -1))
|
||||
|
||||
# produce BatchNormalizationLayer fprop, bprop and grads_wrt_params
|
||||
activation_layer = BatchNormalizationLayer(input_dim=48)
|
||||
|
||||
beta = np.array(48*[0.3])
|
||||
gamma = np.array(48*[0.8])
|
||||
|
||||
activation_layer.params = [gamma, beta]
|
||||
BN_fprop = activation_layer.fprop(test_inputs)
|
||||
BN_bprop = activation_layer.bprop(
|
||||
test_inputs, BN_fprop, test_grads_wrt_outputs)
|
||||
BN_grads_wrt_params = activation_layer.grads_wrt_params(
|
||||
test_inputs, test_grads_wrt_outputs)
|
||||
|
||||
test_output = "ConvolutionalLayer:\nFprop: {}\nBprop: {}\nGrads_wrt_params: {}\n" \
|
||||
"BatchNormalization:\nFprop: {}\nBprop: {}\nGrads_wrt_params: {}\n"\
|
||||
.format(conv_fprop,
|
||||
conv_bprop,
|
||||
conv_grads_wrt_params,
|
||||
BN_fprop, BN_bprop, BN_grads_wrt_params)
|
||||
|
||||
with open("{}_test_file.txt".format(student_id), "w+") as out_file:
|
||||
out_file.write(test_output)
|
119
scripts/s1473470_test_file.txt
Normal file
@ -0,0 +1,119 @@
|
||||
ConvolutionalLayer:
|
||||
Fprop: [[[[ 141 141 141]
|
||||
[ 161 161 161]
|
||||
[ 256 256 256]]
|
||||
|
||||
[[ -122 -122 -122]
|
||||
[ -102 -102 -102]
|
||||
[ -127 -127 -127]]]
|
||||
|
||||
|
||||
[[[ -729 -740 -751]
|
||||
[-1079 -1094 -1109]
|
||||
[-1820 -1846 -1872]]
|
||||
|
||||
[[ 844 857 870]
|
||||
[ 686 695 704]
|
||||
[ 1613 1635 1657]]]]
|
||||
Bprop: [[[[ 147 319 305 162]
|
||||
[ 338 716 680 354]
|
||||
[ 290 608 572 294]
|
||||
[ 149 307 285 144]]
|
||||
|
||||
[[ 23 79 81 54]
|
||||
[ 114 284 280 162]
|
||||
[ 114 272 268 150]
|
||||
[ 73 163 157 84]]
|
||||
|
||||
[[-101 -161 -143 -54]
|
||||
[-110 -148 -120 -30]
|
||||
[ -62 -64 -36 6]
|
||||
[ -3 19 29 24]]]
|
||||
|
||||
|
||||
[[[ 39 67 53 18]
|
||||
[ 50 68 32 -6]
|
||||
[ 2 -40 -76 -66]
|
||||
[ -31 -89 -111 -72]]
|
||||
|
||||
[[ 59 115 117 54]
|
||||
[ 114 212 208 90]
|
||||
[ 114 200 196 78]
|
||||
[ 37 55 49 12]]
|
||||
|
||||
[[ 79 163 181 90]
|
||||
[ 178 356 384 186]
|
||||
[ 226 440 468 222]
|
||||
[ 105 199 209 96]]]]
|
||||
Grads_wrt_params: (array([[[[ 78, 78],
|
||||
[ 168, 168]],
|
||||
|
||||
[[2332, 2311],
|
||||
[1648, 1636]],
|
||||
|
||||
[[ 0, 0],
|
||||
[ 0, 0]]],
|
||||
|
||||
|
||||
[[[-138, -138],
|
||||
[ -48, -48]],
|
||||
|
||||
[[6085, 6010],
|
||||
[5077, 5011]],
|
||||
|
||||
[[ 0, 0],
|
||||
[ 0, 0]]]]), array([-126, 36]))
|
||||
BatchNormalization:
|
||||
Fprop: [[ 0.2 0.2 0.2 0.2 0.2 0.2 0.2
|
||||
0.2 0.2 0.2 0.2 0.2 0.2 0.2
|
||||
0.2 0.2 -0.7 -0.7 -0.7 -0.7
|
||||
-0.69999963 -0.69999963 -0.69999963 -0.69999963 -0.7 -0.7 -0.7
|
||||
-0.7 -0.7 -0.7 -0.7 -0.7 0.2 0.2
|
||||
0.2 0.2 0.2 0.2 0.2 0.2 0.2
|
||||
0.2 0.2 0.2 0.2 0.2 0.2 0.2 ]
|
||||
[ 0.2 0.2 0.2 0.2 0.2 0.2 0.2
|
||||
0.2 0.2 0.2 0.2 0.2 0.2 0.2
|
||||
0.2 0.2 1.1 1.1 1.1 1.1
|
||||
1.09999963 1.09999963 1.09999963 1.09999963 1.1 1.1 1.1
|
||||
1.1 1.1 1.1 1.1 1.1 0.2 0.2
|
||||
0.2 0.2 0.2 0.2 0.2 0.2 0.2
|
||||
0.2 0.2 0.2 0.2 0.2 0.2 0.2 ]]
|
||||
Bprop: [[ -6.83051975e+03 -6.83051975e+03 -6.83051975e+03 -6.83051975e+03
|
||||
-6.83051975e+03 -6.83051975e+03 -6.83051975e+03 -6.83051975e+03
|
||||
-6.83051975e+03 -6.83051975e+03 -6.83051975e+03 -6.83051975e+03
|
||||
-6.83051975e+03 -6.83051975e+03 -6.83051975e+03 -6.83051975e+03
|
||||
-5.26012975e-09 -5.03790086e-09 -4.82801592e-09 -4.62962964e-09
|
||||
-5.03789471e-06 -5.03789471e-06 -5.03789471e-06 -5.03789471e-06
|
||||
-3.78504939e-09 -3.64132903e-09 -3.50479365e-09 -3.37499995e-09
|
||||
-3.25153687e-09 -3.13402298e-09 -3.02210435e-09 -2.91545187e-09
|
||||
-6.83051975e+03 -6.83051975e+03 -6.83051975e+03 -6.83051975e+03
|
||||
-6.83051975e+03 -6.83051975e+03 -6.83051975e+03 -6.83051975e+03
|
||||
-6.83051975e+03 -6.83051975e+03 -6.83051975e+03 -6.83051975e+03
|
||||
-6.83051975e+03 -6.83051975e+03 -6.83051975e+03 -6.83051975e+03]
|
||||
[ 6.83051975e+03 6.83051975e+03 6.83051975e+03 6.83051975e+03
|
||||
6.83051975e+03 6.83051975e+03 6.83051975e+03 6.83051975e+03
|
||||
6.83051975e+03 6.83051975e+03 6.83051975e+03 6.83051975e+03
|
||||
6.83051975e+03 6.83051975e+03 6.83051975e+03 6.83051975e+03
|
||||
5.26012975e-09 5.03790086e-09 4.82801592e-09 4.62962964e-09
|
||||
5.03789471e-06 5.03789471e-06 5.03789471e-06 5.03789471e-06
|
||||
3.78504939e-09 3.64132903e-09 3.50479365e-09 3.37499995e-09
|
||||
3.25153687e-09 3.13402298e-09 3.02210435e-09 2.91545187e-09
|
||||
6.83051975e+03 6.83051975e+03 6.83051975e+03 6.83051975e+03
|
||||
6.83051975e+03 6.83051975e+03 6.83051975e+03 6.83051975e+03
|
||||
6.83051975e+03 6.83051975e+03 6.83051975e+03 6.83051975e+03
|
||||
6.83051975e+03 6.83051975e+03 6.83051975e+03 6.83051975e+03]]
|
||||
Grads_wrt_params: [array([ 0. , 0. , 0. , 0. ,
|
||||
0. , 0. , 0. , 0. ,
|
||||
0. , 0. , 0. , 0. ,
|
||||
0. , 0. , 0. , 0. ,
|
||||
47.9999998 , 47.9999998 , 47.99999981, 47.99999981,
|
||||
47.99998041, 47.99998041, 47.99998041, 47.99998041,
|
||||
47.99999984, 47.99999984, 47.99999985, 47.99999985,
|
||||
47.99999985, 47.99999986, 47.99999986, 47.99999986,
|
||||
0. , 0. , 0. , 0. ,
|
||||
0. , 0. , 0. , 0. ,
|
||||
0. , 0. , 0. , 0. ,
|
||||
0. , 0. , 0. , 0. ]), array([-48, -46, -44, -42, -40, -38, -36, -34, -32, -30, -28, -26, -24,
|
||||
-22, -20, -18, -16, -14, -12, -10, -8, -6, -4, -2, 0, 2,
|
||||
4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28,
|
||||
30, 32, 34, 36, 38, 40, 42, 44, 46])]
|
BIN
spec/coursework1.pdf
Normal file
Binary file not shown.
493
spec/coursework1.tex
Normal file
@ -0,0 +1,493 @@
|
||||
\documentclass[11pt,]{article}
|
||||
\usepackage[T1]{fontenc}
|
||||
\usepackage{amssymb,amsmath}
|
||||
\usepackage{txfonts}
|
||||
\usepackage{microtype}
|
||||
\usepackage{amssymb,amsmath}
|
||||
\usepackage{graphicx}
|
||||
\usepackage{subfigure}
|
||||
\usepackage{natbib}
|
||||
\usepackage{paralist}
|
||||
\usepackage{hyperref}
|
||||
\usepackage{url}
|
||||
\urlstyle{same}
|
||||
\usepackage{color}
|
||||
\usepackage{fancyvrb}
|
||||
\newcommand{\VerbBar}{|}
|
||||
\newcommand{\VERB}{\Verb[commandchars=\\\{\}]}
|
||||
\DefineVerbatimEnvironment{Highlighting}{Verbatim}{commandchars=\\\{\}}
|
||||
% Add ',fontsize=\small' for more characters per line
|
||||
\newenvironment{Shaded}{}{}
|
||||
\newcommand{\KeywordTok}[1]{\textcolor[rgb]{0.00,0.44,0.13}{\textbf{{#1}}}}
|
||||
\newcommand{\DataTypeTok}[1]{\textcolor[rgb]{0.56,0.13,0.00}{{#1}}}
|
||||
\newcommand{\DecValTok}[1]{\textcolor[rgb]{0.25,0.63,0.44}{{#1}}}
|
||||
\newcommand{\BaseNTok}[1]{\textcolor[rgb]{0.25,0.63,0.44}{{#1}}}
|
||||
\newcommand{\FloatTok}[1]{\textcolor[rgb]{0.25,0.63,0.44}{{#1}}}
|
||||
\newcommand{\CharTok}[1]{\textcolor[rgb]{0.25,0.44,0.63}{{#1}}}
|
||||
\newcommand{\StringTok}[1]{\textcolor[rgb]{0.25,0.44,0.63}{{#1}}}
|
||||
\newcommand{\CommentTok}[1]{\textcolor[rgb]{0.38,0.63,0.69}{\textit{{#1}}}}
|
||||
\newcommand{\OtherTok}[1]{\textcolor[rgb]{0.00,0.44,0.13}{{#1}}}
|
||||
\newcommand{\AlertTok}[1]{\textcolor[rgb]{1.00,0.00,0.00}{\textbf{{#1}}}}
|
||||
\newcommand{\FunctionTok}[1]{\textcolor[rgb]{0.02,0.16,0.49}{{#1}}}
|
||||
\newcommand{\RegionMarkerTok}[1]{{#1}}
|
||||
\newcommand{\ErrorTok}[1]{\textcolor[rgb]{1.00,0.00,0.00}{\textbf{{#1}}}}
|
||||
\newcommand{\NormalTok}[1]{{#1}}
|
||||
|
||||
\hypersetup{breaklinks=true,
|
||||
pdfauthor={},
|
||||
pdftitle={},
|
||||
colorlinks=true,
|
||||
citecolor=blue,
|
||||
urlcolor=blue,
|
||||
linkcolor=magenta,
|
||||
pdfborder={0 0 0}}
|
||||
|
||||
\setlength{\parindent}{0pt}
|
||||
\setlength{\parskip}{6pt plus 2pt minus 1pt}
|
||||
\setlength{\emergencystretch}{3em} % prevent overfull lines
|
||||
\setcounter{secnumdepth}{0}
|
||||
|
||||
\usepackage[a4paper,body={170mm,250mm},top=25mm,left=25mm]{geometry}
|
||||
\usepackage[sf,bf,small]{titlesec}
|
||||
\usepackage{fancyhdr}
|
||||
|
||||
\pagestyle{fancy}
|
||||
\lhead{\sffamily MLP Coursework 1}
|
||||
\rhead{\sffamily Due: 30 October 2017}
|
||||
\cfoot{\sffamily \thepage}
|
||||
|
||||
\author{}
|
||||
\date{}
|
||||
|
||||
\DeclareMathOperator{\softmax}{softmax}
|
||||
\DeclareMathOperator{\sigmoid}{sigmoid}
|
||||
\DeclareMathOperator{\sgn}{sgn}
|
||||
\DeclareMathOperator{\relu}{relu}
|
||||
\DeclareMathOperator{\lrelu}{lrelu}
|
||||
\DeclareMathOperator{\elu}{elu}
|
||||
\DeclareMathOperator{\selu}{selu}
|
||||
\DeclareMathOperator{\maxout}{maxout}
|
||||
|
||||
\begin{document}
|
||||
|
||||
\section{Machine Learning Practical: Coursework
|
||||
1}
|
||||
\label{sec:machine-learning-practical-coursework-1}
|
||||
|
||||
\textbf{Release date: Monday 16th October 2017}\\
|
||||
\textbf{Due date: 16:00 Monday 30th October 2017}
|
||||
|
||||
\subsection{Introduction}
|
||||
\label{sec:introduction}
|
||||
This coursework is concerned with training multi-layer networks to
|
||||
address the MNIST digit classification problem. It builds on the
|
||||
material covered in the first three lab notebooks and the first four
|
||||
lectures. \textbf{You should complete the first three lab
|
||||
notebooks before starting the coursework.} The aim of the coursework is
|
||||
to investigate variants of the ReLU activation function for hidden units
|
||||
in multi-layer networks, with respect to the validation set accuracies
|
||||
achieved by the trained models.
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
\subsection{Code}
|
||||
\label{sec:code}
|
||||
|
||||
You should run all of the experiments for the coursework inside the
|
||||
Conda environment you set up in the first labs. The code for the coursework is available on the course
|
||||
\href{https://github.com/CSTR-Edinburgh/mlpractical/}{Github repository}
|
||||
on a branch \texttt{mlp2017-8/coursework1}. To create a local working
|
||||
copy of this branch in your local repository you need to do the
|
||||
following.
|
||||
|
||||
\begin{enumerate}
|
||||
\def\labelenumi{\arabic{enumi}.}
|
||||
\itemsep1pt\parskip0pt\parsep0pt
|
||||
\item
|
||||
Make sure all modified files on the branch you are currently on have been
|
||||
committed
|
||||
(\href{https://github.com/CSTR-Edinburgh/mlpractical/blob/mlp2017-8/master/notes/getting-started-in-a-lab.md}{see
|
||||
details here} if you are unsure how to do this).
|
||||
\item
|
||||
Fetch changes to the upstream \texttt{origin} repository by running\\
|
||||
\texttt{git fetch origin}
|
||||
\item
|
||||
Checkout a new local branch from the fetched branch using\\
|
||||
\texttt{git checkout -b coursework1 origin/mlp2017-8/coursework1}
|
||||
\end{enumerate}
|
||||
|
||||
You will now have a new branch in your local repository with all the
|
||||
code necessary for the coursework in it. In the \texttt{notebooks}
|
||||
directory there is a notebook \texttt{Coursework\_1.ipynb} which is
|
||||
intended as a starting point for structuring the code for your
|
||||
experiments. You will probably want to add additional code cells to this
|
||||
as you go along and run new experiments (e.g.~doing each new training
|
||||
run in a new cell). You may also wish to use Markdown cells to keep
|
||||
notes on the results of experiments.
|
||||
|
||||
There will also be a \verb+report+ directory which contains the LaTeX template and style files for the report. You should copy all these files into the directory which will contain your report.
|
||||
|
||||
|
||||
\subsection{Standard network
|
||||
architecture}
|
||||
\label{sec:standard-network-architecture}
|
||||
|
||||
To make the results of your experiments more easily comparable, you
|
||||
should try to keep as many of the free choices in the specification of
|
||||
the model and learning problem the same across different experiments. If
|
||||
you vary only a small number of aspects of the problem at a time this
|
||||
will make it easier to interpret the effect of those changes.
|
||||
|
||||
In these experiments you should use a multi-layer network with two hidden layers
|
||||
(corresponding to three affine transformations) and a softmax output layer. The initial baseline
|
||||
should use a sigmoid activation function for the hidden layers; other experiments will explore
|
||||
different nonlinear activation functions. The hidden layers should each contain 100 hidden units.
|
||||
The baseline network can then be defined with the following code (which should be familiar to you from Lab 3):
|
||||
|
||||
\begin{Shaded}
|
||||
\begin{Highlighting}[]
|
||||
\CharTok{import} \NormalTok{numpy }\CharTok{as} \NormalTok{np}
|
||||
\CharTok{from} \NormalTok{mlp.layers }\CharTok{import} \NormalTok{AffineLayer, SoftmaxLayer, SigmoidLayer}
|
||||
\CharTok{from} \NormalTok{mlp.errors }\CharTok{import} \NormalTok{CrossEntropySoftmaxError}
|
||||
\CharTok{from} \NormalTok{mlp.models }\CharTok{import} \NormalTok{MultipleLayerModel}
|
||||
\CharTok{from} \NormalTok{mlp.initialisers }\CharTok{import} \NormalTok{ConstantInit, GlorotUniformInit}
|
||||
|
||||
\NormalTok{seed = }\DecValTok{10102016}
|
||||
\NormalTok{rng = np.random.RandomState(seed)}
|
||||
|
||||
\NormalTok{input_dim, output_dim, hidden_dim = }\DecValTok{784}\NormalTok{, }\DecValTok{10}\NormalTok{, }\DecValTok{100}
|
||||
|
||||
\NormalTok{weights_init = GlorotUniformInit(rng=rng)}
|
||||
\NormalTok{biases_init = ConstantInit(}\DecValTok{0}\NormalTok{.)}
|
||||
|
||||
\NormalTok{model = MultipleLayerModel([}
|
||||
\NormalTok{AffineLayer(input_dim, hidden_dim, weights_init, biases_init),}
|
||||
\NormalTok{SigmoidLayer(),}
|
||||
\NormalTok{AffineLayer(hidden_dim, hidden_dim, weights_init, biases_init),}
|
||||
\NormalTok{SigmoidLayer(),}
|
||||
\NormalTok{AffineLayer(hidden_dim, output_dim, weights_init, biases_init)}
|
||||
\NormalTok{])}
|
||||
|
||||
\NormalTok{error = CrossEntropySoftmaxError()}
|
||||
\end{Highlighting}
|
||||
\end{Shaded}
|
||||
|
||||
Here we are using the Glorot initialisation scheme, discussed in lecture 4. In part 2B of this coursework you will explore the effect of different initialisation schemes.
|
||||
|
||||
The above code creates a network using sigmoid hidden layers; you should modify it to also create a network using ReLU activation functions (see Lab 3). These two networks will form your baseline systems.
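For instance, assuming the \verb+ReluLayer+ class from \verb+mlp.layers+ (introduced in Lab 3), the ReLU baseline simply replaces each \verb+SigmoidLayer()+ in the code above:

\begin{verbatim}
from mlp.layers import ReluLayer

# Same architecture as the sigmoid baseline, with ReLU hidden units.
model = MultipleLayerModel([
    AffineLayer(input_dim, hidden_dim, weights_init, biases_init),
    ReluLayer(),
    AffineLayer(hidden_dim, hidden_dim, weights_init, biases_init),
    ReluLayer(),
    AffineLayer(hidden_dim, output_dim, weights_init, biases_init)
])
\end{verbatim}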
|
||||
|
||||
As well as standardising the network architecture, you should also fix
|
||||
the hyperparameters of the training procedure not being investigated to
|
||||
be the same across different runs. In particular for all experiments you
|
||||
should use a \textbf{batch size of 50 and train for a total of 100
|
||||
epochs} for all reported runs. You may of course use a smaller number of
|
||||
epochs for initial pilot runs.
|
||||
|
||||
\subsection{Part 1: Implementing Activation Functions}
|
||||
\label{sec:actfns}
|
||||
|
||||
In the first part of the assignment you will implement three further
|
||||
activation functions, each of which is related to ReLU \citep{nair2010rectified}: Leaky ReLU, ELU (Exponential Linear Unit), and SELU (Scaled Exponential Linear Unit). Each of these units defines an activation function for which $f(x) = x$ when $x>0$, as for ReLU, but avoids having a zero gradient when $x<0$.
|
||||
|
||||
\textbf{Leaky ReLU} ($\lrelu(x)$) \citep{maas2013rectifier} has the following form:
|
||||
\begin{equation}
|
||||
\lrelu(x) =
|
||||
\begin{cases}
|
||||
\alpha x & \quad \text{if } x \leq 0 \\
|
||||
x & \quad \text{if } x > 0 \\
|
||||
\end{cases}
|
||||
\end{equation}
|
||||
where $\alpha$ is a constant; typically $\alpha=0.01$, and you can use this value in this coursework. Note that $\alpha$ can instead be treated as a parameter learned by back-propagation along with the weights and biases -- this is called Parametric ReLU (PReLU).
|
||||
|
||||
\textbf{ELU} ($\elu(x)$) \citep{clevert2015fast} has the following form:
|
||||
\begin{equation}
|
||||
\elu(x) =
|
||||
\begin{cases}
|
||||
\alpha (\exp(x) - 1) & \quad \text{if } x \leq 0 \\
|
||||
x & \quad \text{if } x > 0 \\
|
||||
\end{cases}
|
||||
\end{equation}
|
||||
Again $\alpha$ can be taken as a constant or a tunable parameter. Typically $\alpha=1$, which results in a smooth function, and you can use this value in this coursework.
|
||||
|
||||
\textbf{SELU} ($\selu(x)$) \citep{klambauer2017self} has the following form:
|
||||
\begin{equation}
|
||||
\selu(x) =
|
||||
\lambda \begin{cases}
|
||||
\alpha (\exp(x) - 1) & \quad \text{if } x \leq 0 \\
|
||||
x & \quad \text{if } x > 0 \\
|
||||
\end{cases}
|
||||
\end{equation}
|
||||
In the case of SELU, there is a theoretical argument for optimal values of the two parameters: $\alpha \approx 1.6733$ and $\lambda \approx 1.0507$, and you can use these values in this coursework.
|
||||
|
||||
\begin{enumerate}
|
||||
\item Implement each of these activation functions as classes \verb+LeakyReluLayer+, \verb+EluLayer+ and \verb+SeluLayer+. You need to implement \verb+fprop+ and \verb+bprop+ methods for each class (a minimal sketch of the Leaky ReLU case is given after this list).
|
||||
\item Verify the correctness of your implementation using the supplied unit tests in \verb+Activation_Tests.ipynb+
|
||||
\item Automatically create a test file \verb+sXXXXXXX_test_file.txt+ by running the provided program \verb+generate_inputs.py+, which uses your code for \verb+LeakyReluLayer+, \verb+EluLayer+ and \verb+SeluLayer+ to run your \verb+fprop+ and \verb+bprop+ methods for each layer on a unique test vector generated using your student ID number.
|
||||
\end{enumerate}
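As a minimal sketch of the Leaky ReLU case (assuming the course's layer interface, in which \verb+bprop+ receives the layer inputs, the corresponding outputs, and the gradients with respect to the outputs):

\begin{verbatim}
import numpy as np

class LeakyReluLayer(object):
    """Leaky ReLU: f(x) = x if x > 0, alpha * x otherwise."""

    def __init__(self, alpha=0.01):
        self.alpha = alpha

    def fprop(self, inputs):
        # Pass positive inputs through; scale negative inputs by alpha.
        return np.where(inputs > 0, inputs, self.alpha * inputs)

    def bprop(self, inputs, outputs, grads_wrt_outputs):
        # Local gradient is 1 for positive inputs and alpha otherwise.
        return np.where(inputs > 0, 1., self.alpha) * grads_wrt_outputs
\end{verbatim}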
|
||||
|
||||
For Part 1 of the coursework you need to submit the test file \verb+sXXXXXXX_test_file.txt+ (where sXXXXXXX is replaced with your student number) created in step 3 above.
|
||||
|
||||
\subsection{Part 2: MNIST Experiments}
|
||||
\label{sec:expts}
|
||||
In Part 2 of the coursework you will experiment with \verb+LeakyReluLayer+, \verb+EluLayer+ and \verb+SeluLayer+ in multi-layer networks trained on MNIST.
|
||||
|
||||
\subsubsection{2A: Comparing activation functions}
|
||||
In this sub-part you should compare the behaviour of Leaky ReLU, ELU, and SELU activation functions on the MNIST task. Carry out all experiments using 2 hidden layers, with 100 units per hidden layer. You should compare the results with baseline systems of the same architecture using sigmoid units and using ReLU units.
|
||||
|
||||
\subsubsection{2B: Deep neural network experiments}
|
||||
In this subpart you will explore the behaviour of deeper networks. Based on the results of Part 2A, choose one activation function, and compare networks with 2--8 hidden layers, using 100 hidden units per hidden layer.
|
||||
|
||||
Also compare the effect of different initialisation strategies, as discussed in lecture 4. First look at the effect of weight initialisation based on
|
||||
\begin{compactitem}
|
||||
\item Fan-in: $w_i \sim U\left(-\sqrt{3/n_{in}}, \sqrt{3/n_{in}}\right)$
\item Fan-out: $w_i \sim U\left(-\sqrt{3/n_{out}}, \sqrt{3/n_{out}}\right)$
|
||||
\item Fan-in and Fan-out: $w_i \sim U \left(-\sqrt{6/(n_{in}+n_{out})}, \sqrt{6/(n_{in}+n_{out})}\right)$
|
||||
\end{compactitem}
|
||||
where $U$ is the uniform distribution. The first of these corresponds to constraining the estimated variance of a unit to be independent of the number of incoming connections ($n_{in}$); the second to constraining the estimated variance of a unit's gradient to be independent of the number of outgoing connections ($n_{out}$); the third corresponds to Glorot and Bengio's combined initialisation.
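As a minimal sketch of the first scheme (assuming weights stored as an $(n_{in}, n_{out})$ array; transpose as needed for your framework):

\begin{verbatim}
import numpy as np

def fan_in_uniform_init(n_in, n_out, rng):
    # Uniform draw with variance 1/n_in: U(-sqrt(3/n_in), sqrt(3/n_in)).
    limit = np.sqrt(3. / n_in)
    return rng.uniform(-limit, limit, size=(n_in, n_out))

rng = np.random.RandomState(10102016)
weights = fan_in_uniform_init(784, 100, rng)
\end{verbatim}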
|
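
In NumPy terms the three schemes amount to the following sketch, where \texttt{n\_in} and \texttt{n\_out} are a layer's input and output dimensions (the weight-matrix shape convention may differ in your framework):
\begin{verbatim}
import numpy as np

rng = np.random.RandomState(92017)
n_in, n_out = 784, 100

# Fan-in: U(-sqrt(3/n_in), sqrt(3/n_in))
w = rng.uniform(-np.sqrt(3. / n_in), np.sqrt(3. / n_in),
                size=(n_in, n_out))
# Fan-out: U(-sqrt(3/n_out), sqrt(3/n_out))
w = rng.uniform(-np.sqrt(3. / n_out), np.sqrt(3. / n_out),
                size=(n_in, n_out))
# Combined (Glorot and Bengio): U(-sqrt(6/(n_in+n_out)), ...)
w = rng.uniform(-np.sqrt(6. / (n_in + n_out)),
                np.sqrt(6. / (n_in + n_out)),
                size=(n_in, n_out))
\end{verbatim}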

Additionally you could also explore the effect of drawing from a Gaussian distribution compared with a uniform distribution. In particular you might like to explore initialising a SELU layer by drawing from a Gaussian with mean 0 and variance $1/n_{out}$, as recommended by \cite{klambauer2017self}.
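
Continuing the sketch above, this Gaussian initialisation would be:
\begin{verbatim}
# Gaussian, mean 0, variance 1/n_out (std = sqrt(1/n_out))
w = rng.normal(0., np.sqrt(1. / n_out), size=(n_in, n_out))
\end{verbatim}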

For Part 2 of the coursework you need to write and submit a report, using the template provided, in the directory \verb+report+. Please read the template document \verb+mlp-cw1-template.pdf+ very carefully, as it provides advice and instructions on writing your report. You can use the LaTeX source file \verb+mlp-cw1-template.tex+ as a template for your report (see below, in the section `Report').

It is highly recommended that you use LaTeX for your report. If you have not used LaTeX previously, now is a good time to learn how to use it!

\subsection{Backing up your work}
\label{sec:backing-up-your-work}

It is \textbf{strongly recommended} you use some method for backing up your work. Those working in their AFS homespace on DICE will have their work automatically backed up as part of the \href{http://computing.help.inf.ed.ac.uk/backups-and-mirrors}{routine backup} of all user homespaces. If you are working on a personal computer you should have your own backup method in place (e.g.~saving additional copies to an external drive, syncing to a cloud service or pushing commits from your local Git repository to a private repository on GitHub). \textbf{Loss of work through failure to back up \href{http://tinyurl.com/edinflate}{does not constitute a good reason for late submission}}.

You may \emph{additionally} wish to keep your coursework under version control in your local Git repository on the \texttt{coursework1} branch.
% This does not need to be limited to the coursework notebook and
% \texttt{mlp} Python modules - you can also add your report document to
% the repository.

If you make regular commits of your work on the coursework, this will allow you to better keep track of the changes you have made and, if necessary, revert to previous versions of files and/or restore accidentally deleted work. This is not, however, required, and you should note that keeping your work under version control is a distinct issue from backing up to guard against hard drive failure. If you are working on a personal computer you should still keep an additional backup of your work as described above.
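
For example, a typical cycle of committing coursework progress might look like this (assuming you have already created the \texttt{coursework1} branch, and a remote only if you have set one up):
\begin{verbatim}
git checkout coursework1
git add notebooks/Coursework_1.ipynb mlp/*.py
git commit -m "Coursework 1: activation layer progress"
git push origin coursework1   # only if a remote is configured
\end{verbatim}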

\subsection{Report}
\label{sec:report}

Part two of your coursework submission, worth 70 marks, will be a report. The directory \verb+coursework1/report+ contains a template for your report (\verb+mlp-cw1-template.tex+); the generated pdf file (\verb+mlp-cw1-template.pdf+) is also provided, and you should read this file carefully as it contains information about the required structure and experimentation. The template is written in LaTeX, and we strongly recommend that you write your own report using LaTeX, using the supplied document style \verb+mlp2017+ (as in the template).

You should copy the files in the \verb+report+ directory to the directory containing the LaTeX file of your report, as \verb+pdflatex+ will need to access these files when building the pdf document from the LaTeX source file.

Your report should be in a 2-column format, based on the document format used for the ICML conference. The report should be a \textbf{maximum of 6 pages long}, with a further page for references. We will not read or assess any parts of the report beyond the allowed 6+1 pages.

Ideally, all figures should be included in your report file as \href{https://en.wikipedia.org/wiki/Vector_graphics}{vector graphics} rather than \href{https://en.wikipedia.org/wiki/Raster_graphics}{raster files}, as this will make sure all detail in the plot is visible. Matplotlib supports saving high quality figures in a wide range of common image formats using the \href{http://matplotlib.org/api/pyplot_api.html\#matplotlib.pyplot.savefig}{\texttt{savefig}} function. \textbf{You should use \texttt{savefig} rather than copying the screen-resolution raster images outputted in the notebook.} An example of using \texttt{savefig} to save a figure as a PDF file (which can be included as graphics in \href{https://en.wikibooks.org/wiki/LaTeX/Importing_Graphics}{LaTeX} compiled with \texttt{pdflatex} and in Apple Pages and \href{https://support.office.com/en-us/article/Add-a-PDF-to-your-Office-file-74819342-8f00-4ab4-bcbe-0f3df15ab0dc}{Microsoft Word} documents) is given below.

\begin{Shaded}
\begin{Highlighting}[]
\CharTok{import} \NormalTok{matplotlib.pyplot }\CharTok{as} \NormalTok{plt}
\CharTok{import} \NormalTok{numpy }\CharTok{as} \NormalTok{np}
\CommentTok{# Generate some example data to plot}
\NormalTok{x = np.linspace(}\DecValTok{0}\NormalTok{., }\DecValTok{1}\NormalTok{., }\DecValTok{100}\NormalTok{)}
\NormalTok{y1 = np.sin(}\DecValTok{2}\NormalTok{. * np.pi * x)}
\NormalTok{y2 = np.cos(}\DecValTok{2}\NormalTok{. * np.pi * x)}
\NormalTok{fig\_size = (}\DecValTok{6}\NormalTok{, }\DecValTok{3}\NormalTok{) }\CommentTok{# Set figure size in inches (width, height)}
\NormalTok{fig = plt.figure(figsize=fig\_size) }\CommentTok{# Create a new figure object}
\NormalTok{ax = fig.add\_subplot(}\DecValTok{1}\NormalTok{, }\DecValTok{1}\NormalTok{, }\DecValTok{1}\NormalTok{) }\CommentTok{# Add a single axes to the figure}
\CommentTok{# Plot lines giving each a label for the legend and setting line width to 2}
\NormalTok{ax.plot(x, y1, linewidth=}\DecValTok{2}\NormalTok{, label=}\StringTok{'$y = \textbackslash{}sin(2\textbackslash{}pi x)$'}\NormalTok{)}
\NormalTok{ax.plot(x, y2, linewidth=}\DecValTok{2}\NormalTok{, label=}\StringTok{'$y = \textbackslash{}cos(2\textbackslash{}pi x)$'}\NormalTok{)}
\CommentTok{# Set the axes labels. Can use LaTeX in labels within $...$ delimiters.}
\NormalTok{ax.set\_xlabel(}\StringTok{'$x$'}\NormalTok{, fontsize=}\DecValTok{12}\NormalTok{)}
\NormalTok{ax.set\_ylabel(}\StringTok{'$y$'}\NormalTok{, fontsize=}\DecValTok{12}\NormalTok{)}
\NormalTok{ax.grid(}\StringTok{'on'}\NormalTok{) }\CommentTok{# Turn axes grid on}
\NormalTok{ax.legend(loc=}\StringTok{'best'}\NormalTok{, fontsize=}\DecValTok{11}\NormalTok{) }\CommentTok{# Add a legend}
\NormalTok{fig.tight\_layout() }\CommentTok{# This minimises whitespace around the axes.}
\NormalTok{fig.savefig(}\StringTok{'file-name.pdf'}\NormalTok{) }\CommentTok{# Save figure to current directory in PDF format}
\end{Highlighting}
\end{Shaded}

(If you are using Libre/OpenOffice you should instead use Scalable Vector Graphics plots, via \\
\texttt{fig.savefig('file-name.svg')}. If the document editor you are using for the report does not support including either PDF or SVG graphics, you can instead output high-resolution raster images using \texttt{fig.savefig('file-name.png', dpi=200)}; note however that these files will generally be larger than either SVG or PDF formatted graphics.)

However, to emphasise again: \textbf{it is highly recommended that you use LaTeX.}

If you make use of any books, articles, web pages or other resources you should appropriately cite these in your report. You do not need to cite material from the course lecture slides or lab notebooks.
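
For example, with the supplied \verb+cw1-references.bib+ and the \verb+plainnat+ bibliography style used here, citations can be written as:
\begin{verbatim}
SELUs \citep{klambauer2017self} ...      % parenthetical
\citet{clevert2015fast} proposed ELUs    % textual
\end{verbatim}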

To create a pdf file \verb+mlp-cw1-template.pdf+ from a LaTeX source file (\verb+mlp-cw1-template.tex+), you can run the following in a terminal:
\begin{verbatim}
pdflatex mlp-cw1-template
bibtex mlp-cw1-template
pdflatex mlp-cw1-template
pdflatex mlp-cw1-template
\end{verbatim}
(Yes, you have to run pdflatex multiple times, in order for LaTeX to resolve the internal document references.)

An alternative, simpler approach uses the \verb+latexmk+ program:
\begin{verbatim}
latexmk -pdf mlp-cw1-template
\end{verbatim}

It is worth learning how to use LaTeX effectively, as it is particularly powerful for mathematical and academic writing. There are many tutorials on the web.

\subsection{Mechanics}
\label{sec:mechanics}

\textbf{Marks:} This assignment will be assessed out of 100 marks and forms 10\% of your final grade for the course.

\textbf{Academic conduct:} Assessed work is subject to University regulations on academic conduct:\\
\url{http://web.inf.ed.ac.uk/infweb/admin/policies/academic-misconduct}

\textbf{Submission:} You can submit more than once up until the submission deadline. All submissions are timestamped automatically. Identically named files will overwrite earlier submitted versions, so we will mark the latest submission that comes in before the deadline.

If you submit anything before the deadline, you may not resubmit afterward. (This policy allows us to begin marking submissions immediately after the deadline, without having to worry that some may need to be re-marked.)

If you do not submit anything before the deadline, you may submit {\em exactly once} after the deadline, and a late penalty will be applied to this submission unless you have received an approved extension. Please be aware that late submissions may receive lower priority for marking, and marks may not be returned within the same timeframe as for on-time submissions.

{\em Warning:} Unfortunately the \verb+submit+ command will technically allow you to submit late even if you submitted before the deadline (i.e.\ it does not enforce the above policy). Don't do this! We will mark the version that we retrieve just after the deadline, and (even worse) you may still be penalised for submitting late because the timestamp will update.

For additional information about late penalties and extension requests, see the School web page below. Do {\bf not} email any course staff directly about extension requests; you must follow the instructions on the web page.

\url{http://web.inf.ed.ac.uk/infweb/student-services/ito/admin/coursework-projects/late-coursework-extension-requests}

\textbf{Late submission penalty:} Following the University guidelines, late coursework submitted without an authorised extension will be recorded as late and the following penalties will apply: 5 percentage points will be deducted for every calendar day or part thereof it is late, up to a maximum of 7 calendar days. After this time a mark of zero will be recorded.

\subsection{Submission}
\label{sec:submission}

Your coursework submission should be done electronically using the \href{http://computing.help.inf.ed.ac.uk/submit}{\texttt{submit}} command available on DICE machines.

Your submission should include:
\begin{itemize}
\itemsep1pt\parskip0pt\parsep0pt
\item
  the test file generated in Part 1, \verb+sXXXXXXX_test_file.txt+, where your student number replaces \verb+sXXXXXXX+;
\item
  your completed report as a PDF file, using the provided template;
\item
  the notebook (\verb+.ipynb+) file you used to run the experiments;
\item
  your local version of the \texttt{mlp} code, including any changes you made to the modules (\texttt{.py} files).
\end{itemize}

You should copy all of the files to a single directory, \verb+coursework1+, e.g.
\begin{verbatim}
mkdir coursework1
cp notebooks/Coursework_1.ipynb mlp/*.py coursework1
cp reports/coursework1.pdf reports/sXXXXXXX_test_file.txt coursework1
\end{verbatim}

and then submit this directory using

\begin{verbatim}
submit mlp cw1 coursework1
\end{verbatim}

The \texttt{submit} command will prompt you with the details of the submission, including the names of the files / directories you are submitting and the name of the course and exercise you are submitting for, and will ask you to check whether these details are correct. You should check these carefully and reply \texttt{y} to submit if you are sure the files are correct, and \texttt{n} otherwise.

You can amend an existing submission by rerunning the \texttt{submit} command any time up to the deadline. It is therefore a good idea (particularly if this is your first time using the DICE submit mechanism) to do an initial run of the \texttt{submit} command early on, and then rerun the command if you make any further updates to your submission, rather than leaving submission to the last minute.

\subsection{Marking Scheme}
\label{sec:marking-scheme}

\begin{itemize}
\item
  Part 1, Activation function implementation (30 marks). Based on your submitted test file.
\item
  Part 2, Report (70 marks). The following aspects will contribute to the mark for your report:
  \begin{itemize}
  \item Abstract -- how clear is it? Does it cover what is reported in the document?
  \item Introduction -- do you clearly outline and motivate the paper, and describe the research questions investigated?
  \item Description of activation functions -- is it clear and correct?
  \item Experiments -- did you carry out the experiments correctly? Are the results clearly presented and described?
  \item Interpretation and discussion of results
  \item Conclusions
  \item Presentation and clarity of report
  \end{itemize}
\end{itemize}

\bibliographystyle{plainnat}
\bibliography{cw1-references}

\end{document}
spec/cw1-references.bib
Normal file
@ -0,0 +1,29 @@
@inproceedings{maas2013rectifier,
  title={Rectifier nonlinearities improve neural network acoustic models},
  author={Maas, Andrew L and Hannun, Awni Y and Ng, Andrew Y},
  booktitle={Proc. ICML},
  volume={30},
  number={1},
  year={2013}
}

@inproceedings{nair2010rectified,
  title={Rectified linear units improve restricted {Boltzmann} machines},
  author={Nair, Vinod and Hinton, Geoffrey E},
  booktitle={Proc. ICML},
  pages={807--814},
  year={2010}
}

@article{clevert2015fast,
  title={Fast and accurate deep network learning by exponential linear units ({ELU}s)},
  author={Clevert, Djork-Arn{\'e} and Unterthiner, Thomas and Hochreiter, Sepp},
  journal={arXiv preprint arXiv:1511.07289},
  year={2015}
}

@article{klambauer2017self,
  title={Self-Normalizing Neural Networks},
  author={Klambauer, G{\"u}nter and Unterthiner, Thomas and Mayr, Andreas and Hochreiter, Sepp},
  journal={arXiv preprint arXiv:1706.02515},
  year={2017}
}