Add missing files
This commit is contained in:
parent
4657cca862
commit
5d52a22448
@ -16,7 +16,7 @@ class DataProvider(object):
|
|||||||
"""Generic data provider."""
|
"""Generic data provider."""
|
||||||
|
|
||||||
def __init__(self, inputs, targets, batch_size, max_num_batches=-1,
|
def __init__(self, inputs, targets, batch_size, max_num_batches=-1,
|
||||||
shuffle_order=True, rng=None):
|
shuffle_order=True, rng=None, smooth_labels=False):
|
||||||
"""Create a new data provider object.
|
"""Create a new data provider object.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@ -32,26 +32,60 @@ class DataProvider(object):
|
|||||||
shuffle_order (bool): Whether to randomly permute the order of
|
shuffle_order (bool): Whether to randomly permute the order of
|
||||||
the data before each epoch.
|
the data before each epoch.
|
||||||
rng (RandomState): A seeded random number generator.
|
rng (RandomState): A seeded random number generator.
|
||||||
|
smooth_labels (bool): turn on label smoothing
|
||||||
"""
|
"""
|
||||||
self.inputs = inputs
|
self.inputs = inputs
|
||||||
self.targets = targets
|
self.targets = targets
|
||||||
self.batch_size = batch_size
|
if batch_size < 1:
|
||||||
assert max_num_batches != 0 and not max_num_batches < -1, (
|
raise ValueError('batch_size must be >= 1')
|
||||||
'max_num_batches should be -1 or > 0')
|
self._batch_size = batch_size
|
||||||
self.max_num_batches = max_num_batches
|
if max_num_batches == 0 or max_num_batches < -1:
|
||||||
|
raise ValueError('max_num_batches must be -1 or > 0')
|
||||||
|
self._max_num_batches = max_num_batches
|
||||||
|
self._update_num_batches()
|
||||||
|
self.shuffle_order = shuffle_order
|
||||||
|
|
||||||
|
self._current_order = np.arange(inputs.shape[0])
|
||||||
|
if rng is None:
|
||||||
|
rng = np.random.RandomState(DEFAULT_SEED)
|
||||||
|
self.rng = rng
|
||||||
|
self.smooth_labels = smooth_labels
|
||||||
|
self.new_epoch()
|
||||||
|
|
||||||
|
@property
|
||||||
|
def batch_size(self):
|
||||||
|
"""Number of data points to include in each batch."""
|
||||||
|
return self._batch_size
|
||||||
|
|
||||||
|
@batch_size.setter
|
||||||
|
def batch_size(self, value):
|
||||||
|
if value < 1:
|
||||||
|
raise ValueError('batch_size must be >= 1')
|
||||||
|
self._batch_size = value
|
||||||
|
self._update_num_batches()
|
||||||
|
|
||||||
|
@property
|
||||||
|
def max_num_batches(self):
|
||||||
|
"""Maximum number of batches to iterate over in an epoch."""
|
||||||
|
return self._max_num_batches
|
||||||
|
|
||||||
|
@max_num_batches.setter
|
||||||
|
def max_num_batches(self, value):
|
||||||
|
if value == 0 or value < -1:
|
||||||
|
raise ValueError('max_num_batches must be -1 or > 0')
|
||||||
|
self._max_num_batches = value
|
||||||
|
self._update_num_batches()
|
||||||
|
|
||||||
|
def _update_num_batches(self):
|
||||||
|
"""Updates number of batches to iterate over."""
|
||||||
# maximum possible number of batches is equal to number of whole times
|
# maximum possible number of batches is equal to number of whole times
|
||||||
# batch_size divides in to the number of data points which can be
|
# batch_size divides in to the number of data points which can be
|
||||||
# found using integer division
|
# found using integer division
|
||||||
possible_num_batches = self.inputs.shape[0] // batch_size
|
possible_num_batches = self.inputs.shape[0] // self.batch_size
|
||||||
if self.max_num_batches == -1:
|
if self.max_num_batches == -1:
|
||||||
self.num_batches = possible_num_batches
|
self.num_batches = possible_num_batches
|
||||||
else:
|
else:
|
||||||
self.num_batches = min(self.max_num_batches, possible_num_batches)
|
self.num_batches = min(self.max_num_batches, possible_num_batches)
|
||||||
self.shuffle_order = shuffle_order
|
|
||||||
if rng is None:
|
|
||||||
rng = np.random.RandomState(DEFAULT_SEED)
|
|
||||||
self.rng = rng
|
|
||||||
self.reset()
|
|
||||||
|
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
"""Implements Python iterator interface.
|
"""Implements Python iterator interface.
|
||||||
@ -63,27 +97,36 @@ class DataProvider(object):
|
|||||||
"""
|
"""
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def reset(self):
|
def new_epoch(self):
|
||||||
"""Resets the provider to the initial state to use in a new epoch."""
|
"""Starts a new epoch (pass through data), possibly shuffling first."""
|
||||||
self._curr_batch = 0
|
self._curr_batch = 0
|
||||||
if self.shuffle_order:
|
if self.shuffle_order:
|
||||||
self.shuffle()
|
self.shuffle()
|
||||||
|
|
||||||
def shuffle(self):
|
|
||||||
"""Randomly shuffles order of data."""
|
|
||||||
new_order = self.rng.permutation(self.inputs.shape[0])
|
|
||||||
self.inputs = self.inputs[new_order]
|
|
||||||
self.targets = self.targets[new_order]
|
|
||||||
|
|
||||||
def __next__(self):
|
def __next__(self):
|
||||||
return self.next()
|
return self.next()
|
||||||
|
|
||||||
|
def reset(self):
|
||||||
|
"""Resets the provider to the initial state."""
|
||||||
|
inv_perm = np.argsort(self._current_order)
|
||||||
|
self._current_order = self._current_order[inv_perm]
|
||||||
|
self.inputs = self.inputs[inv_perm]
|
||||||
|
self.targets = self.targets[inv_perm]
|
||||||
|
self.new_epoch()
|
||||||
|
|
||||||
|
def shuffle(self):
|
||||||
|
"""Randomly shuffles order of data."""
|
||||||
|
perm = self.rng.permutation(self.inputs.shape[0])
|
||||||
|
self._current_order = self._current_order[perm]
|
||||||
|
self.inputs = self.inputs[perm]
|
||||||
|
self.targets = self.targets[perm]
|
||||||
|
|
||||||
def next(self):
|
def next(self):
|
||||||
"""Returns next data batch or raises `StopIteration` if at end."""
|
"""Returns next data batch or raises `StopIteration` if at end."""
|
||||||
if self._curr_batch + 1 > self.num_batches:
|
if self._curr_batch + 1 > self.num_batches:
|
||||||
# no more batches in current iteration through data set so reset
|
# no more batches in current iteration through data set so start
|
||||||
# the dataset for another pass and indicate iteration is at end
|
# new epoch ready for another pass and indicate iteration is at end
|
||||||
self.reset()
|
self.new_epoch()
|
||||||
raise StopIteration()
|
raise StopIteration()
|
||||||
# create an index slice corresponding to current batch number
|
# create an index slice corresponding to current batch number
|
||||||
batch_slice = slice(self._curr_batch * self.batch_size,
|
batch_slice = slice(self._curr_batch * self.batch_size,
|
||||||
@ -93,12 +136,11 @@ class DataProvider(object):
|
|||||||
self._curr_batch += 1
|
self._curr_batch += 1
|
||||||
return inputs_batch, targets_batch
|
return inputs_batch, targets_batch
|
||||||
|
|
||||||
|
|
||||||
class MNISTDataProvider(DataProvider):
|
class MNISTDataProvider(DataProvider):
|
||||||
"""Data provider for MNIST handwritten digit images."""
|
"""Data provider for MNIST handwritten digit images."""
|
||||||
|
|
||||||
def __init__(self, which_set='train', batch_size=100, max_num_batches=-1,
|
def __init__(self, which_set='train', batch_size=100, max_num_batches=-1,
|
||||||
shuffle_order=True, rng=None):
|
shuffle_order=True, rng=None, smooth_labels=False):
|
||||||
"""Create a new MNIST data provider object.
|
"""Create a new MNIST data provider object.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@ -112,9 +154,10 @@ class MNISTDataProvider(DataProvider):
|
|||||||
shuffle_order (bool): Whether to randomly permute the order of
|
shuffle_order (bool): Whether to randomly permute the order of
|
||||||
the data before each epoch.
|
the data before each epoch.
|
||||||
rng (RandomState): A seeded random number generator.
|
rng (RandomState): A seeded random number generator.
|
||||||
|
smooth_labels (bool): enable/disable label smoothing
|
||||||
"""
|
"""
|
||||||
# check a valid which_set was provided
|
# check a valid which_set was provided
|
||||||
assert which_set in ['train', 'valid', 'eval'], (
|
assert which_set in ['train', 'valid', 'test'], (
|
||||||
'Expected which_set to be either train, valid or eval. '
|
'Expected which_set to be either train, valid or eval. '
|
||||||
'Got {0}'.format(which_set)
|
'Got {0}'.format(which_set)
|
||||||
)
|
)
|
||||||
@ -134,7 +177,7 @@ class MNISTDataProvider(DataProvider):
|
|||||||
inputs = inputs.astype(np.float32)
|
inputs = inputs.astype(np.float32)
|
||||||
# pass the loaded data to the parent class __init__
|
# pass the loaded data to the parent class __init__
|
||||||
super(MNISTDataProvider, self).__init__(
|
super(MNISTDataProvider, self).__init__(
|
||||||
inputs, targets, batch_size, max_num_batches, shuffle_order, rng)
|
inputs, targets, batch_size, max_num_batches, shuffle_order, rng, smooth_labels)
|
||||||
|
|
||||||
def next(self):
|
def next(self):
|
||||||
"""Returns next data batch or raises `StopIteration` if at end."""
|
"""Returns next data batch or raises `StopIteration` if at end."""
|
||||||
@ -160,6 +203,102 @@ class MNISTDataProvider(DataProvider):
|
|||||||
one_of_k_targets[range(int_targets.shape[0]), int_targets] = 1
|
one_of_k_targets[range(int_targets.shape[0]), int_targets] = 1
|
||||||
return one_of_k_targets
|
return one_of_k_targets
|
||||||
|
|
||||||
|
class EMNISTDataProvider(DataProvider):
|
||||||
|
"""Data provider for EMNIST handwritten digit images."""
|
||||||
|
|
||||||
|
def __init__(self, which_set='train', batch_size=100, max_num_batches=-1,
|
||||||
|
shuffle_order=True, rng=None, smooth_labels=False):
|
||||||
|
"""Create a new EMNIST data provider object.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
which_set: One of 'train', 'valid' or 'eval'. Determines which
|
||||||
|
portion of the EMNIST data this object should provide.
|
||||||
|
batch_size (int): Number of data points to include in each batch.
|
||||||
|
max_num_batches (int): Maximum number of batches to iterate over
|
||||||
|
in an epoch. If `max_num_batches * batch_size > num_data` then
|
||||||
|
only as many batches as the data can be split into will be
|
||||||
|
used. If set to -1 all of the data will be used.
|
||||||
|
shuffle_order (bool): Whether to randomly permute the order of
|
||||||
|
the data before each epoch.
|
||||||
|
rng (RandomState): A seeded random number generator.
|
||||||
|
smooth_labels (bool): enable/disable label smoothing
|
||||||
|
"""
|
||||||
|
# check a valid which_set was provided
|
||||||
|
assert which_set in ['train', 'valid', 'test'], (
|
||||||
|
'Expected which_set to be either train, valid or eval. '
|
||||||
|
'Got {0}'.format(which_set)
|
||||||
|
)
|
||||||
|
self.which_set = which_set
|
||||||
|
self.num_classes = 47
|
||||||
|
# construct path to data using os.path.join to ensure the correct path
|
||||||
|
# separator for the current platform / OS is used
|
||||||
|
# MLP_DATA_DIR environment variable should point to the data directory
|
||||||
|
data_path = os.path.join(
|
||||||
|
os.environ['MLP_DATA_DIR'], 'emnist-{0}.npz'.format(which_set))
|
||||||
|
assert os.path.isfile(data_path), (
|
||||||
|
'Data file does not exist at expected path: ' + data_path
|
||||||
|
)
|
||||||
|
# load data from compressed numpy file
|
||||||
|
loaded = np.load(data_path)
|
||||||
|
print(loaded.keys())
|
||||||
|
inputs, targets = loaded['inputs'], loaded['targets']
|
||||||
|
inputs = inputs.astype(np.float32)
|
||||||
|
inputs = np.reshape(inputs, newshape=(-1, 28*28))
|
||||||
|
inputs = inputs / 255.0
|
||||||
|
# pass the loaded data to the parent class __init__
|
||||||
|
super(EMNISTDataProvider, self).__init__(
|
||||||
|
inputs, targets, batch_size, max_num_batches, shuffle_order, rng, smooth_labels)
|
||||||
|
|
||||||
|
def next(self):
|
||||||
|
"""Returns next data batch or raises `StopIteration` if at end."""
|
||||||
|
inputs_batch, targets_batch = super(EMNISTDataProvider, self).next()
|
||||||
|
|
||||||
|
if self.smooth_labels:
|
||||||
|
targets_batch_mat = self.label_smoothing(targets_batch)
|
||||||
|
else:
|
||||||
|
targets_batch_mat = self.to_one_of_k(targets_batch)
|
||||||
|
return inputs_batch, targets_batch_mat
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def to_one_of_k(self, int_targets):
|
||||||
|
"""Converts integer coded class target to 1 of K coded targets.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
int_targets (ndarray): Array of integer coded class targets (i.e.
|
||||||
|
where an integer from 0 to `num_classes` - 1 is used to
|
||||||
|
indicate which is the correct class). This should be of shape
|
||||||
|
(num_data,).
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Array of 1 of K coded targets i.e. an array of shape
|
||||||
|
(num_data, num_classes) where for each row all elements are equal
|
||||||
|
to zero except for the column corresponding to the correct class
|
||||||
|
which is equal to one.
|
||||||
|
"""
|
||||||
|
one_of_k_targets = np.zeros((int_targets.shape[0], self.num_classes))
|
||||||
|
one_of_k_targets[range(int_targets.shape[0]), int_targets] = 1
|
||||||
|
return one_of_k_targets
|
||||||
|
|
||||||
|
def label_smoothing(self, int_targets, alpha=0.1):
|
||||||
|
"""Converts integer coded class target to 1 of K coded targets with label smoothing.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
int_targets (ndarray): Array of integer coded class targets (i.e.
|
||||||
|
where an integer from 0 to `num_classes` - 1 is used to
|
||||||
|
indicate which is the correct class). This should be of shape
|
||||||
|
(num_data,).
|
||||||
|
alpha (float): Smoothing factor.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Array of 1 of K coded targets with label smoothing i.e. an array of shape
|
||||||
|
(num_data, num_classes)
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class MetOfficeDataProvider(DataProvider):
|
class MetOfficeDataProvider(DataProvider):
|
||||||
"""South Scotland Met Office weather data provider."""
|
"""South Scotland Met Office weather data provider."""
|
||||||
@ -253,3 +392,41 @@ class CCPPDataProvider(DataProvider):
|
|||||||
targets = loaded[which_set + '_targets']
|
targets = loaded[which_set + '_targets']
|
||||||
super(CCPPDataProvider, self).__init__(
|
super(CCPPDataProvider, self).__init__(
|
||||||
inputs, targets, batch_size, max_num_batches, shuffle_order, rng)
|
inputs, targets, batch_size, max_num_batches, shuffle_order, rng)
|
||||||
|
|
||||||
|
|
||||||
|
class AugmentedMNISTDataProvider(MNISTDataProvider):
|
||||||
|
"""Data provider for MNIST dataset which randomly transforms images."""
|
||||||
|
|
||||||
|
def __init__(self, which_set='train', batch_size=100, max_num_batches=-1,
|
||||||
|
shuffle_order=True, rng=None, transformer=None):
|
||||||
|
"""Create a new augmented MNIST data provider object.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
which_set: One of 'train', 'valid' or 'test'. Determines which
|
||||||
|
portion of the MNIST data this object should provide.
|
||||||
|
batch_size (int): Number of data points to include in each batch.
|
||||||
|
max_num_batches (int): Maximum number of batches to iterate over
|
||||||
|
in an epoch. If `max_num_batches * batch_size > num_data` then
|
||||||
|
only as many batches as the data can be split into will be
|
||||||
|
used. If set to -1 all of the data will be used.
|
||||||
|
shuffle_order (bool): Whether to randomly permute the order of
|
||||||
|
the data before each epoch.
|
||||||
|
rng (RandomState): A seeded random number generator.
|
||||||
|
transformer: Function which takes an `inputs` array of shape
|
||||||
|
(batch_size, input_dim) corresponding to a batch of input
|
||||||
|
images and a `rng` random number generator object (i.e. a
|
||||||
|
call signature `transformer(inputs, rng)`) and applies a
|
||||||
|
potentiall random set of transformations to some / all of the
|
||||||
|
input images as each new batch is returned when iterating over
|
||||||
|
the data provider.
|
||||||
|
"""
|
||||||
|
super(AugmentedMNISTDataProvider, self).__init__(
|
||||||
|
which_set, batch_size, max_num_batches, shuffle_order, rng)
|
||||||
|
self.transformer = transformer
|
||||||
|
|
||||||
|
def next(self):
|
||||||
|
"""Returns next data batch or raises `StopIteration` if at end."""
|
||||||
|
inputs_batch, targets_batch = super(
|
||||||
|
AugmentedMNISTDataProvider, self).next()
|
||||||
|
transformed_inputs_batch = self.transformer(inputs_batch, self.rng)
|
||||||
|
return transformed_inputs_batch, targets_batch
|
||||||
|
@ -154,9 +154,9 @@ class CrossEntropySoftmaxError(object):
|
|||||||
Returns:
|
Returns:
|
||||||
Scalar error function value.
|
Scalar error function value.
|
||||||
"""
|
"""
|
||||||
probs = np.exp(outputs)
|
normOutputs = outputs - outputs.max(-1)[:, None]
|
||||||
probs /= probs.sum(-1)[:, None]
|
logProb = normOutputs - np.log(np.sum(np.exp(normOutputs), axis=-1)[:, None])
|
||||||
return -np.mean(np.sum(targets * np.log(probs), axis=1))
|
return -np.mean(np.sum(targets * logProb, axis=1))
|
||||||
|
|
||||||
def grad(self, outputs, targets):
|
def grad(self, outputs, targets):
|
||||||
"""Calculates gradient of error function with respect to outputs.
|
"""Calculates gradient of error function with respect to outputs.
|
||||||
@ -168,7 +168,7 @@ class CrossEntropySoftmaxError(object):
|
|||||||
Returns:
|
Returns:
|
||||||
Gradient of error function with respect to outputs.
|
Gradient of error function with respect to outputs.
|
||||||
"""
|
"""
|
||||||
probs = np.exp(outputs)
|
probs = np.exp(outputs - outputs.max(-1)[:, None])
|
||||||
probs /= probs.sum(-1)[:, None]
|
probs /= probs.sum(-1)[:, None]
|
||||||
return (probs - targets) / outputs.shape[0]
|
return (probs - targets) / outputs.shape[0]
|
||||||
|
|
||||||
|
@ -63,3 +63,81 @@ class NormalInit(object):
|
|||||||
|
|
||||||
def __call__(self, shape):
|
def __call__(self, shape):
|
||||||
return self.rng.normal(loc=self.mean, scale=self.std, size=shape)
|
return self.rng.normal(loc=self.mean, scale=self.std, size=shape)
|
||||||
|
|
||||||
|
class GlorotUniformInit(object):
|
||||||
|
"""Glorot and Bengio (2010) random uniform weights initialiser.
|
||||||
|
|
||||||
|
Initialises an two-dimensional parameter array using the 'normalized
|
||||||
|
initialisation' scheme suggested in [1] which attempts to maintain a
|
||||||
|
roughly constant variance in the activations and backpropagated gradients
|
||||||
|
of a multi-layer model consisting of interleaved affine and logistic
|
||||||
|
sigmoidal transformation layers.
|
||||||
|
|
||||||
|
Weights are sampled from a zero-mean uniform distribution with standard
|
||||||
|
deviation `sqrt(2 / (input_dim * output_dim))` where `input_dim` and
|
||||||
|
`output_dim` are the input and output dimensions of the weight matrix
|
||||||
|
respectively.
|
||||||
|
|
||||||
|
References:
|
||||||
|
[1]: Understanding the difficulty of training deep feedforward neural
|
||||||
|
networks, Glorot and Bengio (2010)
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, gain=1., rng=None):
|
||||||
|
"""Construct a normalised initilisation random initialiser object.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
gain: Multiplicative factor to scale initialised weights by.
|
||||||
|
Recommended values is 1 for affine layers followed by
|
||||||
|
logistic sigmoid layers (or another affine layer).
|
||||||
|
rng (RandomState): Seeded random number generator.
|
||||||
|
"""
|
||||||
|
self.gain = gain
|
||||||
|
if rng is None:
|
||||||
|
rng = np.random.RandomState(DEFAULT_SEED)
|
||||||
|
self.rng = rng
|
||||||
|
|
||||||
|
def __call__(self, shape):
|
||||||
|
assert len(shape) == 2, (
|
||||||
|
'Initialiser should only be used for two dimensional arrays.')
|
||||||
|
std = self.gain * (2. / (shape[0] + shape[1]))**0.5
|
||||||
|
half_width = 3.**0.5 * std
|
||||||
|
return self.rng.uniform(low=-half_width, high=half_width, size=shape)
|
||||||
|
|
||||||
|
|
||||||
|
class GlorotNormalInit(object):
|
||||||
|
"""Glorot and Bengio (2010) random normal weights initialiser.
|
||||||
|
|
||||||
|
Initialises an two-dimensional parameter array using the 'normalized
|
||||||
|
initialisation' scheme suggested in [1] which attempts to maintain a
|
||||||
|
roughly constant variance in the activations and backpropagated gradients
|
||||||
|
of a multi-layer model consisting of interleaved affine and logistic
|
||||||
|
sigmoidal transformation layers.
|
||||||
|
|
||||||
|
Weights are sampled from a zero-mean normal distribution with standard
|
||||||
|
deviation `sqrt(2 / (input_dim * output_dim))` where `input_dim` and
|
||||||
|
`output_dim` are the input and output dimensions of the weight matrix
|
||||||
|
respectively.
|
||||||
|
|
||||||
|
References:
|
||||||
|
[1]: Understanding the difficulty of training deep feedforward neural
|
||||||
|
networks, Glorot and Bengio (2010)
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, gain=1., rng=None):
|
||||||
|
"""Construct a normalised initilisation random initialiser object.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
gain: Multiplicative factor to scale initialised weights by.
|
||||||
|
Recommended values is 1 for affine layers followed by
|
||||||
|
logistic sigmoid layers (or another affine layer).
|
||||||
|
rng (RandomState): Seeded random number generator.
|
||||||
|
"""
|
||||||
|
self.gain = gain
|
||||||
|
if rng is None:
|
||||||
|
rng = np.random.RandomState(DEFAULT_SEED)
|
||||||
|
self.rng = rng
|
||||||
|
|
||||||
|
def __call__(self, shape):
|
||||||
|
std = self.gain * (2. / (shape[0] + shape[1]))**0.5
|
||||||
|
return self.rng.normal(loc=0., scale=std, size=shape)
|
||||||
|
493
mlp/layers.py
493
mlp/layers.py
@ -14,7 +14,7 @@ respect to the layer parameters.
|
|||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import mlp.initialisers as init
|
import mlp.initialisers as init
|
||||||
|
from mlp import DEFAULT_SEED
|
||||||
|
|
||||||
class Layer(object):
|
class Layer(object):
|
||||||
"""Abstract class defining the interface for a layer."""
|
"""Abstract class defining the interface for a layer."""
|
||||||
@ -68,6 +68,13 @@ class LayerWithParameters(Layer):
|
|||||||
"""
|
"""
|
||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
def params_penalty(self):
|
||||||
|
"""Returns the parameter dependent penalty term for this layer.
|
||||||
|
|
||||||
|
If no parameter-dependent penalty terms are set this returns zero.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def params(self):
|
def params(self):
|
||||||
"""Returns a list of parameters of layer.
|
"""Returns a list of parameters of layer.
|
||||||
@ -88,6 +95,127 @@ class LayerWithParameters(Layer):
|
|||||||
"""
|
"""
|
||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
class StochasticLayerWithParameters(Layer):
|
||||||
|
"""Specialised layer which uses a stochastic forward propagation."""
|
||||||
|
|
||||||
|
def __init__(self, rng=None):
|
||||||
|
"""Constructs a new StochasticLayer object.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
rng (RandomState): Seeded random number generator object.
|
||||||
|
"""
|
||||||
|
if rng is None:
|
||||||
|
rng = np.random.RandomState(DEFAULT_SEED)
|
||||||
|
self.rng = rng
|
||||||
|
|
||||||
|
def fprop(self, inputs, stochastic=True):
|
||||||
|
"""Forward propagates activations through the layer transformation.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
inputs: Array of layer inputs of shape (batch_size, input_dim).
|
||||||
|
stochastic: Flag allowing different deterministic
|
||||||
|
forward-propagation mode in addition to default stochastic
|
||||||
|
forward-propagation e.g. for use at test time. If False
|
||||||
|
a deterministic forward-propagation transformation
|
||||||
|
corresponding to the expected output of the stochastic
|
||||||
|
forward-propagation is applied.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
outputs: Array of layer outputs of shape (batch_size, output_dim).
|
||||||
|
"""
|
||||||
|
raise NotImplementedError()
|
||||||
|
def grads_wrt_params(self, inputs, grads_wrt_outputs):
|
||||||
|
"""Calculates gradients with respect to layer parameters.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
inputs: Array of inputs to layer of shape (batch_size, input_dim).
|
||||||
|
grads_wrt_to_outputs: Array of gradients with respect to the layer
|
||||||
|
outputs of shape (batch_size, output_dim).
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of arrays of gradients with respect to the layer parameters
|
||||||
|
with parameter gradients appearing in same order in tuple as
|
||||||
|
returned from `get_params` method.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
def params_penalty(self):
|
||||||
|
"""Returns the parameter dependent penalty term for this layer.
|
||||||
|
|
||||||
|
If no parameter-dependent penalty terms are set this returns zero.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
@property
|
||||||
|
def params(self):
|
||||||
|
"""Returns a list of parameters of layer.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of current parameter values. This list should be in the
|
||||||
|
corresponding order to the `values` argument to `set_params`.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
@params.setter
|
||||||
|
def params(self, values):
|
||||||
|
"""Sets layer parameters from a list of values.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
values: List of values to set parameters to. This list should be
|
||||||
|
in the corresponding order to what is returned by `get_params`.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
class StochasticLayer(Layer):
|
||||||
|
"""Specialised layer which uses a stochastic forward propagation."""
|
||||||
|
|
||||||
|
def __init__(self, rng=None):
|
||||||
|
"""Constructs a new StochasticLayer object.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
rng (RandomState): Seeded random number generator object.
|
||||||
|
"""
|
||||||
|
if rng is None:
|
||||||
|
rng = np.random.RandomState(DEFAULT_SEED)
|
||||||
|
self.rng = rng
|
||||||
|
|
||||||
|
def fprop(self, inputs, stochastic=True):
|
||||||
|
"""Forward propagates activations through the layer transformation.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
inputs: Array of layer inputs of shape (batch_size, input_dim).
|
||||||
|
stochastic: Flag allowing different deterministic
|
||||||
|
forward-propagation mode in addition to default stochastic
|
||||||
|
forward-propagation e.g. for use at test time. If False
|
||||||
|
a deterministic forward-propagation transformation
|
||||||
|
corresponding to the expected output of the stochastic
|
||||||
|
forward-propagation is applied.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
outputs: Array of layer outputs of shape (batch_size, output_dim).
|
||||||
|
"""
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
def bprop(self, inputs, outputs, grads_wrt_outputs):
|
||||||
|
"""Back propagates gradients through a layer.
|
||||||
|
|
||||||
|
Given gradients with respect to the outputs of the layer calculates the
|
||||||
|
gradients with respect to the layer inputs. This should correspond to
|
||||||
|
default stochastic forward-propagation.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
inputs: Array of layer inputs of shape (batch_size, input_dim).
|
||||||
|
outputs: Array of layer outputs calculated in forward pass of
|
||||||
|
shape (batch_size, output_dim).
|
||||||
|
grads_wrt_outputs: Array of gradients with respect to the layer
|
||||||
|
outputs of shape (batch_size, output_dim).
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Array of gradients with respect to the layer inputs of shape
|
||||||
|
(batch_size, input_dim).
|
||||||
|
"""
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
|
||||||
class AffineLayer(LayerWithParameters):
|
class AffineLayer(LayerWithParameters):
|
||||||
"""Layer implementing an affine tranformation of its inputs.
|
"""Layer implementing an affine tranformation of its inputs.
|
||||||
@ -97,7 +225,8 @@ class AffineLayer(LayerWithParameters):
|
|||||||
|
|
||||||
def __init__(self, input_dim, output_dim,
|
def __init__(self, input_dim, output_dim,
|
||||||
weights_initialiser=init.UniformInit(-0.1, 0.1),
|
weights_initialiser=init.UniformInit(-0.1, 0.1),
|
||||||
biases_initialiser=init.ConstantInit(0.)):
|
biases_initialiser=init.ConstantInit(0.),
|
||||||
|
weights_penalty=None, biases_penalty=None):
|
||||||
"""Initialises a parameterised affine layer.
|
"""Initialises a parameterised affine layer.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@ -105,11 +234,17 @@ class AffineLayer(LayerWithParameters):
|
|||||||
output_dim (int): Dimension of the layer outputs.
|
output_dim (int): Dimension of the layer outputs.
|
||||||
weights_initialiser: Initialiser for the weight parameters.
|
weights_initialiser: Initialiser for the weight parameters.
|
||||||
biases_initialiser: Initialiser for the bias parameters.
|
biases_initialiser: Initialiser for the bias parameters.
|
||||||
|
weights_penalty: Weights-dependent penalty term (regulariser) or
|
||||||
|
None if no regularisation is to be applied to the weights.
|
||||||
|
biases_penalty: Biases-dependent penalty term (regulariser) or
|
||||||
|
None if no regularisation is to be applied to the biases.
|
||||||
"""
|
"""
|
||||||
self.input_dim = input_dim
|
self.input_dim = input_dim
|
||||||
self.output_dim = output_dim
|
self.output_dim = output_dim
|
||||||
self.weights = weights_initialiser((self.output_dim, self.input_dim))
|
self.weights = weights_initialiser((self.output_dim, self.input_dim))
|
||||||
self.biases = biases_initialiser(self.output_dim)
|
self.biases = biases_initialiser(self.output_dim)
|
||||||
|
self.weights_penalty = weights_penalty
|
||||||
|
self.biases_penalty = biases_penalty
|
||||||
|
|
||||||
def fprop(self, inputs):
|
def fprop(self, inputs):
|
||||||
"""Forward propagates activations through the layer transformation.
|
"""Forward propagates activations through the layer transformation.
|
||||||
@ -123,7 +258,7 @@ class AffineLayer(LayerWithParameters):
|
|||||||
Returns:
|
Returns:
|
||||||
outputs: Array of layer outputs of shape (batch_size, output_dim).
|
outputs: Array of layer outputs of shape (batch_size, output_dim).
|
||||||
"""
|
"""
|
||||||
return inputs.dot(self.weights.T) + self.biases
|
return self.weights.dot(inputs.T).T + self.biases
|
||||||
|
|
||||||
def bprop(self, inputs, outputs, grads_wrt_outputs):
|
def bprop(self, inputs, outputs, grads_wrt_outputs):
|
||||||
"""Back propagates gradients through a layer.
|
"""Back propagates gradients through a layer.
|
||||||
@ -159,8 +294,27 @@ class AffineLayer(LayerWithParameters):
|
|||||||
|
|
||||||
grads_wrt_weights = np.dot(grads_wrt_outputs.T, inputs)
|
grads_wrt_weights = np.dot(grads_wrt_outputs.T, inputs)
|
||||||
grads_wrt_biases = np.sum(grads_wrt_outputs, axis=0)
|
grads_wrt_biases = np.sum(grads_wrt_outputs, axis=0)
|
||||||
|
|
||||||
|
if self.weights_penalty is not None:
|
||||||
|
grads_wrt_weights += self.weights_penalty.grad(parameter=self.weights)
|
||||||
|
|
||||||
|
if self.biases_penalty is not None:
|
||||||
|
grads_wrt_biases += self.biases_penalty.grad(parameter=self.biases)
|
||||||
|
|
||||||
return [grads_wrt_weights, grads_wrt_biases]
|
return [grads_wrt_weights, grads_wrt_biases]
|
||||||
|
|
||||||
|
def params_penalty(self):
|
||||||
|
"""Returns the parameter dependent penalty term for this layer.
|
||||||
|
|
||||||
|
If no parameter-dependent penalty terms are set this returns zero.
|
||||||
|
"""
|
||||||
|
params_penalty = 0
|
||||||
|
if self.weights_penalty is not None:
|
||||||
|
params_penalty += self.weights_penalty(self.weights)
|
||||||
|
if self.biases_penalty is not None:
|
||||||
|
params_penalty += self.biases_penalty(self.biases)
|
||||||
|
return params_penalty
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def params(self):
|
def params(self):
|
||||||
"""A list of layer parameter values: `[weights, biases]`."""
|
"""A list of layer parameter values: `[weights, biases]`."""
|
||||||
@ -175,7 +329,6 @@ class AffineLayer(LayerWithParameters):
|
|||||||
return 'AffineLayer(input_dim={0}, output_dim={1})'.format(
|
return 'AffineLayer(input_dim={0}, output_dim={1})'.format(
|
||||||
self.input_dim, self.output_dim)
|
self.input_dim, self.output_dim)
|
||||||
|
|
||||||
|
|
||||||
class SigmoidLayer(Layer):
|
class SigmoidLayer(Layer):
|
||||||
"""Layer implementing an element-wise logistic sigmoid transformation."""
|
"""Layer implementing an element-wise logistic sigmoid transformation."""
|
||||||
|
|
||||||
@ -215,6 +368,160 @@ class SigmoidLayer(Layer):
|
|||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return 'SigmoidLayer'
|
return 'SigmoidLayer'
|
||||||
|
|
||||||
|
class ReluLayer(Layer):
|
||||||
|
"""Layer implementing an element-wise rectified linear transformation."""
|
||||||
|
|
||||||
|
def fprop(self, inputs):
|
||||||
|
"""Forward propagates activations through the layer transformation.
|
||||||
|
|
||||||
|
For inputs `x` and outputs `y` this corresponds to `y = max(0, x)`.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
inputs: Array of layer inputs of shape (batch_size, input_dim).
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
outputs: Array of layer outputs of shape (batch_size, output_dim).
|
||||||
|
"""
|
||||||
|
return np.maximum(inputs, 0.)
|
||||||
|
|
||||||
|
def bprop(self, inputs, outputs, grads_wrt_outputs):
|
||||||
|
"""Back propagates gradients through a layer.
|
||||||
|
|
||||||
|
Given gradients with respect to the outputs of the layer calculates the
|
||||||
|
gradients with respect to the layer inputs.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
inputs: Array of layer inputs of shape (batch_size, input_dim).
|
||||||
|
outputs: Array of layer outputs calculated in forward pass of
|
||||||
|
shape (batch_size, output_dim).
|
||||||
|
grads_wrt_outputs: Array of gradients with respect to the layer
|
||||||
|
outputs of shape (batch_size, output_dim).
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Array of gradients with respect to the layer inputs of shape
|
||||||
|
(batch_size, input_dim).
|
||||||
|
"""
|
||||||
|
return (outputs > 0) * grads_wrt_outputs
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return 'ReluLayer'
|
||||||
|
|
||||||
|
class LeakyReluLayer(Layer):
|
||||||
|
"""Layer implementing an element-wise leaky rectified linear transformation."""
|
||||||
|
def __init__(self, alpha=0.01):
|
||||||
|
self.alpha = alpha
|
||||||
|
|
||||||
|
def fprop(self, inputs):
|
||||||
|
"""Forward propagates activations through the layer transformation.
|
||||||
|
|
||||||
|
For inputs `x` and outputs `y` this corresponds to `y = ..., else`.
|
||||||
|
"""
|
||||||
|
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def bprop(self, inputs, outputs, grads_wrt_outputs):
|
||||||
|
"""Back propagates gradients through a layer.
|
||||||
|
|
||||||
|
Given gradients with respect to the outputs of the layer calculates the
|
||||||
|
gradients with respect to the layer inputs.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return 'LeakyReluLayer'
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class ParametricReluLayer(LayerWithParameters):
|
||||||
|
"""Layer implementing an element-wise parametric rectified linear transformation."""
|
||||||
|
|
||||||
|
def __init__(self, alpha=0.25):
|
||||||
|
self.alpha = np.array([alpha])
|
||||||
|
|
||||||
|
@property
|
||||||
|
def params(self):
|
||||||
|
"""A list of layer parameter values: `[weights, biases]`."""
|
||||||
|
return [self.alpha]
|
||||||
|
|
||||||
|
def fprop(self, inputs):
|
||||||
|
"""Forward propagates activations through the layer transformation.
|
||||||
|
|
||||||
|
For inputs `x` and outputs `y` this corresponds to `y = ..., else`.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def bprop(self, inputs, outputs, grads_wrt_outputs):
|
||||||
|
"""Back propagates gradients through a layer.
|
||||||
|
|
||||||
|
Given gradients with respect to the outputs of the layer calculates the
|
||||||
|
gradients with respect to the layer inputs.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def grads_wrt_params(self, inputs, grads_wrt_outputs):
|
||||||
|
"""Calculates gradients with respect to layer parameters.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
inputs: array of inputs to layer of shape (batch_size, input_dim)
|
||||||
|
grads_wrt_to_outputs: array of gradients with respect to the layer
|
||||||
|
outputs of shape (batch_size, output_dim)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
list of arrays of gradients with respect to the layer parameters
|
||||||
|
`[grads_wrt_params]`. Where params is the alpha parameter.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
@property
|
||||||
|
def params(self):
|
||||||
|
"""A list of layer parameter values: `[weights, biases]`."""
|
||||||
|
return [self.alpha]
|
||||||
|
|
||||||
|
@params.setter
|
||||||
|
def params(self, values):
|
||||||
|
self.alpha = values[0]
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return 'ParametricReluLayer'
|
||||||
|
|
||||||
|
|
||||||
|
class TanhLayer(Layer):
|
||||||
|
"""Layer implementing an element-wise hyperbolic tangent transformation."""
|
||||||
|
|
||||||
|
def fprop(self, inputs):
|
||||||
|
"""Forward propagates activations through the layer transformation.
|
||||||
|
|
||||||
|
For inputs `x` and outputs `y` this corresponds to `y = tanh(x)`.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
inputs: Array of layer inputs of shape (batch_size, input_dim).
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
outputs: Array of layer outputs of shape (batch_size, output_dim).
|
||||||
|
"""
|
||||||
|
return np.tanh(inputs)
|
||||||
|
|
||||||
|
def bprop(self, inputs, outputs, grads_wrt_outputs):
|
||||||
|
"""Back propagates gradients through a layer.
|
||||||
|
|
||||||
|
Given gradients with respect to the outputs of the layer calculates the
|
||||||
|
gradients with respect to the layer inputs.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
inputs: Array of layer inputs of shape (batch_size, input_dim).
|
||||||
|
outputs: Array of layer outputs calculated in forward pass of
|
||||||
|
shape (batch_size, output_dim).
|
||||||
|
grads_wrt_outputs: Array of gradients with respect to the layer
|
||||||
|
outputs of shape (batch_size, output_dim).
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Array of gradients with respect to the layer inputs of shape
|
||||||
|
(batch_size, input_dim).
|
||||||
|
"""
|
||||||
|
return (1. - outputs**2) * grads_wrt_outputs
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return 'TanhLayer'
|
||||||
|
|
||||||
class SoftmaxLayer(Layer):
|
class SoftmaxLayer(Layer):
|
||||||
"""Layer implementing a softmax transformation."""
|
"""Layer implementing a softmax transformation."""
|
||||||
@ -232,7 +539,9 @@ class SoftmaxLayer(Layer):
|
|||||||
Returns:
|
Returns:
|
||||||
outputs: Array of layer outputs of shape (batch_size, output_dim).
|
outputs: Array of layer outputs of shape (batch_size, output_dim).
|
||||||
"""
|
"""
|
||||||
exp_inputs = np.exp(inputs)
|
# subtract max inside exponential to improve numerical stability -
|
||||||
|
# when we divide through by sum this term cancels
|
||||||
|
exp_inputs = np.exp(inputs - inputs.max(-1)[:, None])
|
||||||
return exp_inputs / exp_inputs.sum(-1)[:, None]
|
return exp_inputs / exp_inputs.sum(-1)[:, None]
|
||||||
|
|
||||||
def bprop(self, inputs, outputs, grads_wrt_outputs):
|
def bprop(self, inputs, outputs, grads_wrt_outputs):
|
||||||
@ -257,3 +566,177 @@ class SoftmaxLayer(Layer):
|
|||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return 'SoftmaxLayer'
|
return 'SoftmaxLayer'
|
||||||
|
|
||||||
|
class RadialBasisFunctionLayer(Layer):
|
||||||
|
"""Layer implementing projection to a grid of radial basis functions."""
|
||||||
|
|
||||||
|
def __init__(self, grid_dim, intervals=[[0., 1.]]):
|
||||||
|
"""Creates a radial basis function layer object.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
grid_dim: Integer specifying how many basis function to use in
|
||||||
|
grid across input space per dimension (so total number of
|
||||||
|
basis functions will be grid_dim**input_dim)
|
||||||
|
intervals: List of intervals (two element lists or tuples)
|
||||||
|
specifying extents of axis-aligned region in input-space to
|
||||||
|
tile basis functions in grid across. For example for a 2D input
|
||||||
|
space spanning [0, 1] x [0, 1] use intervals=[[0, 1], [0, 1]].
|
||||||
|
"""
|
||||||
|
num_basis = grid_dim**len(intervals)
|
||||||
|
self.centres = np.array(np.meshgrid(*[
|
||||||
|
np.linspace(low, high, grid_dim) for (low, high) in intervals])
|
||||||
|
).reshape((len(intervals), -1))
|
||||||
|
self.scales = np.array([
|
||||||
|
[(high - low) * 1. / grid_dim] for (low, high) in intervals])
|
||||||
|
|
||||||
|
def fprop(self, inputs):
|
||||||
|
"""Forward propagates activations through the layer transformation.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
inputs: Array of layer inputs of shape (batch_size, input_dim).
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
outputs: Array of layer outputs of shape (batch_size, output_dim).
|
||||||
|
"""
|
||||||
|
return np.exp(-(inputs[..., None] - self.centres[None, ...])**2 /
|
||||||
|
self.scales**2).reshape((inputs.shape[0], -1))
|
||||||
|
|
||||||
|
def bprop(self, inputs, outputs, grads_wrt_outputs):
|
||||||
|
"""Back propagates gradients through a layer.
|
||||||
|
|
||||||
|
Given gradients with respect to the outputs of the layer calculates the
|
||||||
|
gradients with respect to the layer inputs.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
inputs: Array of layer inputs of shape (batch_size, input_dim).
|
||||||
|
outputs: Array of layer outputs calculated in forward pass of
|
||||||
|
shape (batch_size, output_dim).
|
||||||
|
grads_wrt_outputs: Array of gradients with respect to the layer
|
||||||
|
outputs of shape (batch_size, output_dim).
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Array of gradients with respect to the layer inputs of shape
|
||||||
|
(batch_size, input_dim).
|
||||||
|
"""
|
||||||
|
num_basis = self.centres.shape[1]
|
||||||
|
return -2 * (
|
||||||
|
((inputs[..., None] - self.centres[None, ...]) / self.scales**2) *
|
||||||
|
grads_wrt_outputs.reshape((inputs.shape[0], -1, num_basis))
|
||||||
|
).sum(-1)
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return 'RadialBasisFunctionLayer(grid_dim={0})'.format(self.grid_dim)
|
||||||
|
|
||||||
|
class DropoutLayer(StochasticLayer):
|
||||||
|
"""Layer which stochastically drops input dimensions in its output."""
|
||||||
|
|
||||||
|
def __init__(self, rng=None, incl_prob=0.5, share_across_batch=True):
|
||||||
|
"""Construct a new dropout layer.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
rng (RandomState): Seeded random number generator.
|
||||||
|
incl_prob: Scalar value in (0, 1] specifying the probability of
|
||||||
|
each input dimension being included in the output.
|
||||||
|
share_across_batch: Whether to use same dropout mask across
|
||||||
|
all inputs in a batch or use per input masks.
|
||||||
|
"""
|
||||||
|
super(DropoutLayer, self).__init__(rng)
|
||||||
|
assert incl_prob > 0. and incl_prob <= 1.
|
||||||
|
self.incl_prob = incl_prob
|
||||||
|
self.share_across_batch = share_across_batch
|
||||||
|
self.rng = rng
|
||||||
|
|
||||||
|
def fprop(self, inputs, stochastic=True):
|
||||||
|
"""Forward propagates activations through the layer transformation.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
inputs: Array of layer inputs of shape (batch_size, input_dim).
|
||||||
|
stochastic: Flag allowing different deterministic
|
||||||
|
forward-propagation mode in addition to default stochastic
|
||||||
|
forward-propagation e.g. for use at test time. If False
|
||||||
|
a deterministic forward-propagation transformation
|
||||||
|
corresponding to the expected output of the stochastic
|
||||||
|
forward-propagation is applied.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
outputs: Array of layer outputs of shape (batch_size, output_dim).
|
||||||
|
"""
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def bprop(self, inputs, outputs, grads_wrt_outputs):
|
||||||
|
"""Back propagates gradients through a layer.
|
||||||
|
|
||||||
|
Given gradients with respect to the outputs of the layer calculates the
|
||||||
|
gradients with respect to the layer inputs. This should correspond to
|
||||||
|
default stochastic forward-propagation.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
inputs: Array of layer inputs of shape (batch_size, input_dim).
|
||||||
|
outputs: Array of layer outputs calculated in forward pass of
|
||||||
|
shape (batch_size, output_dim).
|
||||||
|
grads_wrt_outputs: Array of gradients with respect to the layer
|
||||||
|
outputs of shape (batch_size, output_dim).
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Array of gradients with respect to the layer inputs of shape
|
||||||
|
(batch_size, input_dim).
|
||||||
|
"""
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return 'DropoutLayer(incl_prob={0:.1f})'.format(self.incl_prob)
|
||||||
|
|
||||||
|
class ReshapeLayer(Layer):
|
||||||
|
"""Layer which reshapes dimensions of inputs."""
|
||||||
|
|
||||||
|
def __init__(self, output_shape=None):
|
||||||
|
"""Create a new reshape layer object.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
output_shape: Tuple specifying shape each input in batch should
|
||||||
|
be reshaped to in outputs. This **excludes** the batch size
|
||||||
|
so the shape of the final output array will be
|
||||||
|
(batch_size, ) + output_shape
|
||||||
|
Similarly to numpy.reshape, one shape dimension can be -1. In
|
||||||
|
this case, the value is inferred from the size of the input
|
||||||
|
array and remaining dimensions. The shape specified must be
|
||||||
|
compatible with the input array shape - i.e. the total number
|
||||||
|
of values in the array cannot be changed. If set to `None` the
|
||||||
|
output shape will be set to
|
||||||
|
(batch_size, -1)
|
||||||
|
which will flatten all the inputs to vectors.
|
||||||
|
"""
|
||||||
|
self.output_shape = (-1,) if output_shape is None else output_shape
|
||||||
|
|
||||||
|
def fprop(self, inputs):
|
||||||
|
"""Forward propagates activations through the layer transformation.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
inputs: Array of layer inputs of shape (batch_size, input_dim).
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
outputs: Array of layer outputs of shape (batch_size, output_dim).
|
||||||
|
"""
|
||||||
|
return inputs.reshape((inputs.shape[0],) + self.output_shape)
|
||||||
|
|
||||||
|
def bprop(self, inputs, outputs, grads_wrt_outputs):
|
||||||
|
"""Back propagates gradients through a layer.
|
||||||
|
|
||||||
|
Given gradients with respect to the outputs of the layer calculates the
|
||||||
|
gradients with respect to the layer inputs.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
inputs: Array of layer inputs of shape (batch_size, input_dim).
|
||||||
|
outputs: Array of layer outputs calculated in forward pass of
|
||||||
|
shape (batch_size, output_dim).
|
||||||
|
grads_wrt_outputs: Array of gradients with respect to the layer
|
||||||
|
outputs of shape (batch_size, output_dim).
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Array of gradients with respect to the layer inputs of shape
|
||||||
|
(batch_size, input_dim).
|
||||||
|
"""
|
||||||
|
return grads_wrt_outputs.reshape(inputs.shape)
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return 'ReshapeLayer(output_shape={0})'.format(self.output_shape)
|
||||||
|
@ -160,3 +160,158 @@ class MomentumLearningRule(GradientDescentLearningRule):
|
|||||||
mom *= self.mom_coeff
|
mom *= self.mom_coeff
|
||||||
mom -= self.learning_rate * grad
|
mom -= self.learning_rate * grad
|
||||||
param += mom
|
param += mom
|
||||||
|
|
||||||
|
|
||||||
|
class AdamLearningRule(GradientDescentLearningRule):
|
||||||
|
"""Adaptive moments (Adam) learning rule.
|
||||||
|
First-order gradient-descent based learning rule which uses adaptive
|
||||||
|
estimates of first and second moments of the parameter gradients to
|
||||||
|
calculate the parameter updates.
|
||||||
|
References:
|
||||||
|
[1]: Adam: a method for stochastic optimisation
|
||||||
|
Kingma and Ba, 2015
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, learning_rate=1e-3, beta_1=0.9, beta_2=0.999,
|
||||||
|
epsilon=1e-8):
|
||||||
|
"""Creates a new learning rule object.
|
||||||
|
Args:
|
||||||
|
learning_rate: A postive scalar to scale gradient updates to the
|
||||||
|
parameters by. This needs to be carefully set - if too large
|
||||||
|
the learning dynamic will be unstable and may diverge, while
|
||||||
|
if set too small learning will proceed very slowly.
|
||||||
|
beta_1: Exponential decay rate for gradient first moment estimates.
|
||||||
|
This should be a scalar value in [0, 1]. The running gradient
|
||||||
|
first moment estimate is calculated using
|
||||||
|
`m_1 = beta_1 * m_1_prev + (1 - beta_1) * g`
|
||||||
|
where `m_1_prev` is the previous estimate and `g` the current
|
||||||
|
parameter gradients.
|
||||||
|
beta_2: Exponential decay rate for gradient second moment
|
||||||
|
estimates. This should be a scalar value in [0, 1]. The run
|
||||||
|
gradient second moment estimate is calculated using
|
||||||
|
`m_2 = beta_2 * m_2_prev + (1 - beta_2) * g**2`
|
||||||
|
where `m_2_prev` is the previous estimate and `g` the current
|
||||||
|
parameter gradients.
|
||||||
|
epsilon: 'Softening' parameter to stop updates diverging when
|
||||||
|
second moment estimates are close to zero. Should be set to
|
||||||
|
a small positive value.
|
||||||
|
"""
|
||||||
|
super(AdamLearningRule, self).__init__(learning_rate)
|
||||||
|
assert beta_1 >= 0. and beta_1 <= 1., 'beta_1 should be in [0, 1].'
|
||||||
|
assert beta_2 >= 0. and beta_2 <= 1., 'beta_2 should be in [0, 2].'
|
||||||
|
assert epsilon > 0., 'epsilon should be > 0.'
|
||||||
|
self.beta_1 = beta_1
|
||||||
|
self.beta_2 = beta_2
|
||||||
|
self.epsilon = epsilon
|
||||||
|
|
||||||
|
def initialise(self, params):
|
||||||
|
"""Initialises the state of the learning rule for a set or parameters.
|
||||||
|
This must be called before `update_params` is first called.
|
||||||
|
Args:
|
||||||
|
params: A list of the parameters to be optimised. Note these will
|
||||||
|
be updated *in-place* to avoid reallocating arrays on each
|
||||||
|
update.
|
||||||
|
"""
|
||||||
|
super(AdamLearningRule, self).initialise(params)
|
||||||
|
self.moms_1 = []
|
||||||
|
for param in self.params:
|
||||||
|
self.moms_1.append(np.zeros_like(param))
|
||||||
|
self.moms_2 = []
|
||||||
|
for param in self.params:
|
||||||
|
self.moms_2.append(np.zeros_like(param))
|
||||||
|
self.step_count = 0
|
||||||
|
|
||||||
|
def reset(self):
|
||||||
|
"""Resets any additional state variables to their initial values.
|
||||||
|
For this learning rule this corresponds to zeroing the estimates of
|
||||||
|
the first and second moments of the gradients.
|
||||||
|
"""
|
||||||
|
for mom_1, mom_2 in zip(self.moms_1, self.moms_2):
|
||||||
|
mom_1 *= 0.
|
||||||
|
mom_2 *= 0.
|
||||||
|
self.step_count = 0
|
||||||
|
|
||||||
|
def update_params(self, grads_wrt_params):
|
||||||
|
"""Applies a single update to all parameters.
|
||||||
|
All parameter updates are performed using in-place operations and so
|
||||||
|
nothing is returned.
|
||||||
|
Args:
|
||||||
|
grads_wrt_params: A list of gradients of the scalar loss function
|
||||||
|
with respect to each of the parameters passed to `initialise`
|
||||||
|
previously, with this list expected to be in the same order.
|
||||||
|
"""
|
||||||
|
for param, mom_1, mom_2, grad in zip(
|
||||||
|
self.params, self.moms_1, self.moms_2, grads_wrt_params):
|
||||||
|
mom_1 *= self.beta_1
|
||||||
|
mom_1 += (1. - self.beta_1) * grad
|
||||||
|
mom_2 *= self.beta_2
|
||||||
|
mom_2 += (1. - self.beta_2) * grad ** 2
|
||||||
|
alpha_t = (
|
||||||
|
self.learning_rate *
|
||||||
|
(1. - self.beta_2 ** (self.step_count + 1)) ** 0.5 /
|
||||||
|
(1. - self.beta_1 ** (self.step_count + 1))
|
||||||
|
)
|
||||||
|
param -= alpha_t * mom_1 / (mom_2 ** 0.5 + self.epsilon)
|
||||||
|
self.step_count += 1
|
||||||
|
|
||||||
|
|
||||||
|
class AdaGradLearningRule(GradientDescentLearningRule):
|
||||||
|
"""Adaptive gradients (AdaGrad) learning rule.
|
||||||
|
First-order gradient-descent based learning rule which normalises gradient
|
||||||
|
updates by a running sum of the past squared gradients.
|
||||||
|
References:
|
||||||
|
[1]: Adaptive Subgradient Methods for Online Learning and Stochastic
|
||||||
|
Optimization. Duchi, Haxan and Singer, 2011
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, learning_rate=1e-2, epsilon=1e-8):
|
||||||
|
"""Creates a new learning rule object.
|
||||||
|
Args:
|
||||||
|
learning_rate: A postive scalar to scale gradient updates to the
|
||||||
|
parameters by. This needs to be carefully set - if too large
|
||||||
|
the learning dynamic will be unstable and may diverge, while
|
||||||
|
if set too small learning will proceed very slowly.
|
||||||
|
epsilon: 'Softening' parameter to stop updates diverging when
|
||||||
|
sums of squared gradients are close to zero. Should be set to
|
||||||
|
a small positive value.
|
||||||
|
"""
|
||||||
|
super(AdaGradLearningRule, self).__init__(learning_rate)
|
||||||
|
assert epsilon > 0., 'epsilon should be > 0.'
|
||||||
|
self.epsilon = epsilon
|
||||||
|
|
||||||
|
def initialise(self, params):
|
||||||
|
"""Initialises the state of the learning rule for a set or parameters.
|
||||||
|
This must be called before `update_params` is first called.
|
||||||
|
Args:
|
||||||
|
params: A list of the parameters to be optimised. Note these will
|
||||||
|
be updated *in-place* to avoid reallocating arrays on each
|
||||||
|
update.
|
||||||
|
"""
|
||||||
|
super(AdaGradLearningRule, self).initialise(params)
|
||||||
|
self.sum_sq_grads = []
|
||||||
|
for param in self.params:
|
||||||
|
self.sum_sq_grads.append(np.zeros_like(param))
|
||||||
|
|
||||||
|
def reset(self):
|
||||||
|
"""Resets any additional state variables to their initial values.
|
||||||
|
For this learning rule this corresponds to zeroing all the sum of
|
||||||
|
squared gradient states.
|
||||||
|
"""
|
||||||
|
for sum_sq_grad in self.sum_sq_grads:
|
||||||
|
sum_sq_grad *= 0.
|
||||||
|
|
||||||
|
def update_params(self, grads_wrt_params):
|
||||||
|
"""Applies a single update to all parameters.
|
||||||
|
All parameter updates are performed using in-place operations and so
|
||||||
|
nothing is returned.
|
||||||
|
Args:
|
||||||
|
grads_wrt_params: A list of gradients of the scalar loss function
|
||||||
|
with respect to each of the parameters passed to `initialise`
|
||||||
|
previously, with this list expected to be in the same order.
|
||||||
|
"""
|
||||||
|
for param, sum_sq_grad, grad in zip(
|
||||||
|
self.params, self.sum_sq_grads, grads_wrt_params):
|
||||||
|
sum_sq_grad += grad ** 2
|
||||||
|
param -= (self.learning_rate * grad /
|
||||||
|
(sum_sq_grad + self.epsilon) ** 0.5)
|
||||||
|
|
||||||
|
@@ -8,7 +8,7 @@ outputs (and intermediate states) and for calculating gradients of scalar
 functions of the outputs with respect to the model parameters.
 """
 
-from mlp.layers import LayerWithParameters
+from mlp.layers import LayerWithParameters, StochasticLayer, StochasticLayerWithParameters
 
 
 class SingleLayerModel(object):
@@ -80,11 +80,11 @@ class MultipleLayerModel(object):
         """A list of all of the parameters of the model."""
         params = []
         for layer in self.layers:
-            if isinstance(layer, LayerWithParameters):
+            if isinstance(layer, LayerWithParameters) or isinstance(layer, StochasticLayerWithParameters):
                 params += layer.params
         return params
 
-    def fprop(self, inputs):
+    def fprop(self, inputs, evaluation=False):
         """Forward propagates a batch of inputs through the model.
 
         Args:
@@ -97,7 +97,19 @@ class MultipleLayerModel(object):
         """
         activations = [inputs]
         for i, layer in enumerate(self.layers):
-            activations.append(self.layers[i].fprop(activations[i]))
+            if evaluation:
+                if issubclass(type(self.layers[i]), StochasticLayer) or issubclass(type(self.layers[i]),
+                                                                                   StochasticLayerWithParameters):
+                    current_activations = self.layers[i].fprop(activations[i], stochastic=False)
+                else:
+                    current_activations = self.layers[i].fprop(activations[i])
+            else:
+                if issubclass(type(self.layers[i]), StochasticLayer) or issubclass(type(self.layers[i]),
+                                                                                   StochasticLayerWithParameters):
+                    current_activations = self.layers[i].fprop(activations[i], stochastic=True)
+                else:
+                    current_activations = self.layers[i].fprop(activations[i])
+            activations.append(current_activations)
         return activations
 
     def grads_wrt_params(self, activations, grads_wrt_outputs):
@@ -119,7 +131,7 @@ class MultipleLayerModel(object):
             inputs = activations[-i - 2]
             outputs = activations[-i - 1]
             grads_wrt_inputs = layer.bprop(inputs, outputs, grads_wrt_outputs)
-            if isinstance(layer, LayerWithParameters):
+            if isinstance(layer, LayerWithParameters) or isinstance(layer, StochasticLayerWithParameters):
                 grads_wrt_params += layer.grads_wrt_params(
                     inputs, grads_wrt_outputs)[::-1]
             grads_wrt_outputs = grads_wrt_inputs
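The new `evaluation` flag only changes behaviour for layers that accept a `stochastic` keyword in `fprop`. As a rough illustration of that contract (the class below is a made-up stand-in for the example, not the framework's own dropout layer), such a layer samples a random mask when `stochastic=True` and falls back to a deterministic expectation when `stochastic=False`:

import numpy as np

class ToyDropout(object):
    """Illustrative stochastic layer: random mask in training, expectation at evaluation."""

    def __init__(self, keep_prob=0.5, rng=None):
        self.keep_prob = keep_prob
        self.rng = np.random.RandomState(0) if rng is None else rng

    def fprop(self, inputs, stochastic=True):
        if stochastic:
            # training path: drop each unit independently with probability 1 - keep_prob
            mask = self.rng.binomial(1, self.keep_prob, size=inputs.shape)
            return inputs * mask
        # evaluation path: deterministic, scale by the expected value of the mask
        return inputs * self.keep_prob

layer = ToyDropout()
x = np.ones((2, 4))
print(layer.fprop(x, stochastic=True))   # what fprop(..., evaluation=False) would trigger
print(layer.fprop(x, stochastic=False))  # what fprop(..., evaluation=True) would trigger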
@@ -9,7 +9,7 @@ import time
 import logging
 from collections import OrderedDict
 import numpy as np
+import tqdm
 
 logger = logging.getLogger(__name__)
 
@@ -18,7 +18,7 @@ class Optimiser(object):
     """Basic model optimiser."""
 
     def __init__(self, model, error, learning_rule, train_dataset,
-                 valid_dataset=None, data_monitors=None):
+                 valid_dataset=None, data_monitors=None, notebook=False):
         """Create a new optimiser instance.
 
         Args:
@@ -43,6 +43,11 @@ class Optimiser(object):
         self.data_monitors = OrderedDict([('error', error)])
         if data_monitors is not None:
             self.data_monitors.update(data_monitors)
+        self.notebook = notebook
+        if notebook:
+            self.tqdm_progress = tqdm.tqdm_notebook
+        else:
+            self.tqdm_progress = tqdm.tqdm
 
     def do_training_epoch(self):
         """Do a single training epoch.
@@ -52,12 +57,15 @@ class Optimiser(object):
        respect to all the model parameters and then updates the model
        parameters according to the learning rule.
        """
-        for inputs_batch, targets_batch in self.train_dataset:
-            activations = self.model.fprop(inputs_batch)
-            grads_wrt_outputs = self.error.grad(activations[-1], targets_batch)
-            grads_wrt_params = self.model.grads_wrt_params(
-                activations, grads_wrt_outputs)
-            self.learning_rule.update_params(grads_wrt_params)
+        with self.tqdm_progress(total=self.train_dataset.num_batches) as train_progress_bar:
+            train_progress_bar.set_description("Ep Prog")
+            for inputs_batch, targets_batch in self.train_dataset:
+                activations = self.model.fprop(inputs_batch)
+                grads_wrt_outputs = self.error.grad(activations[-1], targets_batch)
+                grads_wrt_params = self.model.grads_wrt_params(
+                    activations, grads_wrt_outputs)
+                self.learning_rule.update_params(grads_wrt_params)
+                train_progress_bar.update(1)
 
     def eval_monitors(self, dataset, label):
         """Evaluates the monitors for the given dataset.
@@ -72,7 +80,7 @@ class Optimiser(object):
         data_mon_vals = OrderedDict([(key + label, 0.) for key
                                      in self.data_monitors.keys()])
         for inputs_batch, targets_batch in dataset:
-            activations = self.model.fprop(inputs_batch)
+            activations = self.model.fprop(inputs_batch, evaluation=True)
             for key, data_monitor in self.data_monitors.items():
                 data_mon_vals[key + label] += data_monitor(
                     activations[-1], targets_batch)
@@ -104,7 +112,7 @@ class Optimiser(object):
        """
        logger.info('Epoch {0}: {1:.1f}s to complete\n {2}'.format(
            epoch, epoch_time,
-            ', '.join(['{0}={1:.2e}'.format(k, v) for (k, v) in stats.items()])
+            ', '.join(['{}={:.2e}'.format(k, v) for (k, v) in stats.items()])
        ))
 
    def train(self, num_epochs, stats_interval=5):
@@ -121,17 +129,20 @@ class Optimiser(object):
            and the second being a dict mapping the labels for the statistics
            recorded to their column index in the array.
        """
-        start_train_time = time.process_time()
+        start_train_time = time.time()
        run_stats = [list(self.get_epoch_stats().values())]
-        for epoch in range(1, num_epochs + 1):
-            start_time = time.process_time()
-            self.do_training_epoch()
-            epoch_time = time.process_time() - start_time
-            if epoch % stats_interval == 0:
-                stats = self.get_epoch_stats()
-                self.log_stats(epoch, epoch_time, stats)
-                run_stats.append(list(stats.values()))
-        finish_train_time = time.process_time()
+        with self.tqdm_progress(total=num_epochs) as progress_bar:
+            progress_bar.set_description("Exp Prog")
+            for epoch in range(1, num_epochs + 1):
+                start_time = time.time()
+                self.do_training_epoch()
+                epoch_time = time.time() - start_time
+                if epoch % stats_interval == 0:
+                    stats = self.get_epoch_stats()
+                    self.log_stats(epoch, epoch_time, stats)
+                    run_stats.append(list(stats.values()))
+                progress_bar.update(1)
+        finish_train_time = time.time()
        total_train_time = finish_train_time - start_train_time
        return np.array(run_stats), {k: i for i, k in enumerate(stats.keys())}, total_train_time
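Two small notes on the optimiser changes. Switching from `time.process_time()` to `time.time()` means epochs are now timed in wall-clock seconds rather than process CPU seconds, presumably to better reflect the elapsed time a user actually waits for. The progress-bar pattern itself reduces to the following minimal sketch (the `time.sleep` call is just a placeholder for the real per-epoch work):

import time
import tqdm

num_epochs = 3
with tqdm.tqdm(total=num_epochs) as progress_bar:
    progress_bar.set_description("Exp Prog")
    for epoch in range(1, num_epochs + 1):
        time.sleep(0.1)          # placeholder for do_training_epoch()
        progress_bar.update(1)   # advance the bar by one completed epoch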
@@ -32,3 +32,42 @@ class ConstantLearningRateScheduler(object):
            epoch_number: Integer index of training epoch about to be run.
        """
        learning_rule.learning_rate = self.learning_rate
+
+
+class CosineAnnealingWithWarmRestarts(object):
+    """Cosine annealing scheduler, implemented as in https://arxiv.org/pdf/1608.03983.pdf"""
+
+    def __init__(self, min_learning_rate, max_learning_rate, total_iters_per_period, max_learning_rate_discount_factor,
+                 period_iteration_expansion_factor):
+        """
+        Instantiates a new cosine annealing with warm restarts learning rate scheduler.
+        :param min_learning_rate: The minimum learning rate the scheduler can assign
+        :param max_learning_rate: The maximum learning rate the scheduler can assign
+        :param total_iters_per_period: The number of epochs in a period
+        :param max_learning_rate_discount_factor: The rate of discount for the maximum learning rate after each restart, i.e. how many times smaller the max learning rate will be after a restart compared to the previous one
+        :param period_iteration_expansion_factor: The rate of expansion of the period epochs, e.g. if it is set to 1 then all periods have the same number of epochs, if it is larger than 1 then each subsequent period will have more epochs, and vice versa
+        """
+        self.min_learning_rate = min_learning_rate
+        self.max_learning_rate = max_learning_rate
+        self.total_epochs_per_period = total_iters_per_period
+
+        self.max_learning_rate_discount_factor = max_learning_rate_discount_factor
+        self.period_iteration_expansion_factor = period_iteration_expansion_factor
+
+    def update_learning_rule(self, learning_rule, epoch_number):
+        """Update the hyperparameters of the learning rule.
+
+        Run at the beginning of each epoch.
+
+        Args:
+            learning_rule: Learning rule object being used in training run,
+                any scheduled hyperparameters to be altered should be
+                attributes of this object.
+            epoch_number: Integer index of training epoch about to be run.
+        Returns:
+            effective_learning_rate at step 'epoch_number'
+        """
+        raise NotImplementedError
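`update_learning_rule` is left raising `NotImplementedError` in this commit. Purely as an illustrative sketch of the schedule described in the referenced paper (not the implementation intended here; the helper name and the restart handling are invented for the example), the effective learning rate at a given epoch could be computed along these lines:

import math

def cosine_annealed_lr(epoch, min_lr, max_lr, period,
                       max_lr_discount=1.0, period_expansion=1.0):
    # Walk through completed periods, applying the restart rules: each restart
    # scales the period by `period_expansion` and the peak rate by `max_lr_discount`.
    t_cur, t_period = float(epoch), float(period)
    while t_cur >= t_period:
        t_cur -= t_period
        t_period *= period_expansion
        max_lr *= max_lr_discount
    # Within a period: lr = min + 0.5 * (max - min) * (1 + cos(pi * t_cur / t_period))
    return min_lr + 0.5 * (max_lr - min_lr) * (1 + math.cos(math.pi * t_cur / t_period))

# Example: 10-epoch periods that double in length and halve the peak rate after each restart.
for epoch in (0, 5, 10, 15, 20):
    print(epoch, round(cosine_annealed_lr(epoch, 0.001, 0.1, 10,
                                          max_lr_discount=0.5,
                                          period_expansion=2.0), 4))

A concrete scheduler would then assign the computed value to `learning_rule.learning_rate` inside `update_learning_rule`, mirroring what `ConstantLearningRateScheduler` does above.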