diff --git a/mlp/data_providers.py b/mlp/data_providers.py index ebea079..0bf3840 100644 --- a/mlp/data_providers.py +++ b/mlp/data_providers.py @@ -16,7 +16,7 @@ class DataProvider(object): """Generic data provider.""" def __init__(self, inputs, targets, batch_size, max_num_batches=-1, - shuffle_order=True, rng=None): + shuffle_order=True, rng=None, smooth_labels=False): """Create a new data provider object. Args: @@ -32,26 +32,60 @@ class DataProvider(object): shuffle_order (bool): Whether to randomly permute the order of the data before each epoch. rng (RandomState): A seeded random number generator. + smooth_labels (bool): turn on label smoothing """ self.inputs = inputs self.targets = targets - self.batch_size = batch_size - assert max_num_batches != 0 and not max_num_batches < -1, ( - 'max_num_batches should be -1 or > 0') - self.max_num_batches = max_num_batches + if batch_size < 1: + raise ValueError('batch_size must be >= 1') + self._batch_size = batch_size + if max_num_batches == 0 or max_num_batches < -1: + raise ValueError('max_num_batches must be -1 or > 0') + self._max_num_batches = max_num_batches + self._update_num_batches() + self.shuffle_order = shuffle_order + + self._current_order = np.arange(inputs.shape[0]) + if rng is None: + rng = np.random.RandomState(DEFAULT_SEED) + self.rng = rng + self.smooth_labels = smooth_labels + self.new_epoch() + + @property + def batch_size(self): + """Number of data points to include in each batch.""" + return self._batch_size + + @batch_size.setter + def batch_size(self, value): + if value < 1: + raise ValueError('batch_size must be >= 1') + self._batch_size = value + self._update_num_batches() + + @property + def max_num_batches(self): + """Maximum number of batches to iterate over in an epoch.""" + return self._max_num_batches + + @max_num_batches.setter + def max_num_batches(self, value): + if value == 0 or value < -1: + raise ValueError('max_num_batches must be -1 or > 0') + self._max_num_batches = value + self._update_num_batches() + + def _update_num_batches(self): + """Updates number of batches to iterate over.""" # maximum possible number of batches is equal to number of whole times # batch_size divides in to the number of data points which can be # found using integer division - possible_num_batches = self.inputs.shape[0] // batch_size + possible_num_batches = self.inputs.shape[0] // self.batch_size if self.max_num_batches == -1: self.num_batches = possible_num_batches else: self.num_batches = min(self.max_num_batches, possible_num_batches) - self.shuffle_order = shuffle_order - if rng is None: - rng = np.random.RandomState(DEFAULT_SEED) - self.rng = rng - self.reset() def __iter__(self): """Implements Python iterator interface. 
@@ -63,27 +97,36 @@ class DataProvider(object): """ return self - def reset(self): - """Resets the provider to the initial state to use in a new epoch.""" + def new_epoch(self): + """Starts a new epoch (pass through data), possibly shuffling first.""" self._curr_batch = 0 if self.shuffle_order: self.shuffle() - def shuffle(self): - """Randomly shuffles order of data.""" - new_order = self.rng.permutation(self.inputs.shape[0]) - self.inputs = self.inputs[new_order] - self.targets = self.targets[new_order] - def __next__(self): return self.next() + def reset(self): + """Resets the provider to the initial state.""" + inv_perm = np.argsort(self._current_order) + self._current_order = self._current_order[inv_perm] + self.inputs = self.inputs[inv_perm] + self.targets = self.targets[inv_perm] + self.new_epoch() + + def shuffle(self): + """Randomly shuffles order of data.""" + perm = self.rng.permutation(self.inputs.shape[0]) + self._current_order = self._current_order[perm] + self.inputs = self.inputs[perm] + self.targets = self.targets[perm] + def next(self): """Returns next data batch or raises `StopIteration` if at end.""" if self._curr_batch + 1 > self.num_batches: - # no more batches in current iteration through data set so reset - # the dataset for another pass and indicate iteration is at end - self.reset() + # no more batches in current iteration through data set so start + # new epoch ready for another pass and indicate iteration is at end + self.new_epoch() raise StopIteration() # create an index slice corresponding to current batch number batch_slice = slice(self._curr_batch * self.batch_size, @@ -93,12 +136,11 @@ class DataProvider(object): self._curr_batch += 1 return inputs_batch, targets_batch - class MNISTDataProvider(DataProvider): """Data provider for MNIST handwritten digit images.""" def __init__(self, which_set='train', batch_size=100, max_num_batches=-1, - shuffle_order=True, rng=None): + shuffle_order=True, rng=None, smooth_labels=False): """Create a new MNIST data provider object. Args: @@ -112,9 +154,10 @@ class MNISTDataProvider(DataProvider): shuffle_order (bool): Whether to randomly permute the order of the data before each epoch. rng (RandomState): A seeded random number generator. + smooth_labels (bool): enable/disable label smoothing """ # check a valid which_set was provided - assert which_set in ['train', 'valid', 'eval'], ( + assert which_set in ['train', 'valid', 'test'], ( 'Expected which_set to be either train, valid or eval. ' 'Got {0}'.format(which_set) ) @@ -134,7 +177,7 @@ class MNISTDataProvider(DataProvider): inputs = inputs.astype(np.float32) # pass the loaded data to the parent class __init__ super(MNISTDataProvider, self).__init__( - inputs, targets, batch_size, max_num_batches, shuffle_order, rng) + inputs, targets, batch_size, max_num_batches, shuffle_order, rng, smooth_labels) def next(self): """Returns next data batch or raises `StopIteration` if at end.""" @@ -160,6 +203,102 @@ class MNISTDataProvider(DataProvider): one_of_k_targets[range(int_targets.shape[0]), int_targets] = 1 return one_of_k_targets +class EMNISTDataProvider(DataProvider): + """Data provider for EMNIST handwritten digit images.""" + + def __init__(self, which_set='train', batch_size=100, max_num_batches=-1, + shuffle_order=True, rng=None, smooth_labels=False): + """Create a new EMNIST data provider object. + + Args: + which_set: One of 'train', 'valid' or 'eval'. Determines which + portion of the EMNIST data this object should provide. 
+            batch_size (int): Number of data points to include in each batch.
+            max_num_batches (int): Maximum number of batches to iterate over
+                in an epoch. If `max_num_batches * batch_size > num_data` then
+                only as many batches as the data can be split into will be
+                used. If set to -1 all of the data will be used.
+            shuffle_order (bool): Whether to randomly permute the order of
+                the data before each epoch.
+            rng (RandomState): A seeded random number generator.
+            smooth_labels (bool): Whether to apply label smoothing to the
+                targets.
+        """
+        # check a valid which_set was provided
+        assert which_set in ['train', 'valid', 'test'], (
+            'Expected which_set to be either train, valid or test. '
+            'Got {0}'.format(which_set)
+        )
+        self.which_set = which_set
+        self.num_classes = 47
+        # construct path to data using os.path.join to ensure the correct path
+        # separator for the current platform / OS is used
+        # MLP_DATA_DIR environment variable should point to the data directory
+        data_path = os.path.join(
+            os.environ['MLP_DATA_DIR'], 'emnist-{0}.npz'.format(which_set))
+        assert os.path.isfile(data_path), (
+            'Data file does not exist at expected path: ' + data_path
+        )
+        # load data from compressed numpy file
+        loaded = np.load(data_path)
+        inputs, targets = loaded['inputs'], loaded['targets']
+        inputs = inputs.astype(np.float32)
+        inputs = np.reshape(inputs, newshape=(-1, 28*28))
+        inputs = inputs / 255.0
+        # pass the loaded data to the parent class __init__
+        super(EMNISTDataProvider, self).__init__(
+            inputs, targets, batch_size, max_num_batches, shuffle_order, rng, smooth_labels)
+
+    def next(self):
+        """Returns next data batch or raises `StopIteration` if at end."""
+        inputs_batch, targets_batch = super(EMNISTDataProvider, self).next()
+
+        if self.smooth_labels:
+            targets_batch_mat = self.label_smoothing(targets_batch)
+        else:
+            targets_batch_mat = self.to_one_of_k(targets_batch)
+        return inputs_batch, targets_batch_mat
+
+    def to_one_of_k(self, int_targets):
+        """Converts integer coded class target to 1 of K coded targets.
+
+        Args:
+            int_targets (ndarray): Array of integer coded class targets (i.e.
+                where an integer from 0 to `num_classes` - 1 is used to
+                indicate which is the correct class). This should be of shape
+                (num_data,).
+
+        Returns:
+            Array of 1 of K coded targets i.e. an array of shape
+            (num_data, num_classes) where for each row all elements are equal
+            to zero except for the column corresponding to the correct class
+            which is equal to one.
+        """
+        one_of_k_targets = np.zeros((int_targets.shape[0], self.num_classes))
+        one_of_k_targets[range(int_targets.shape[0]), int_targets] = 1
+        return one_of_k_targets
+
+    def label_smoothing(self, int_targets, alpha=0.1):
+        """Converts integer coded class targets to 1 of K coded targets with label smoothing.
+
+        Args:
+            int_targets (ndarray): Array of integer coded class targets (i.e.
+                where an integer from 0 to `num_classes` - 1 is used to
+                indicate which is the correct class). This should be of shape
+                (num_data,).
+            alpha (float): Smoothing factor.
+
+        Returns:
+            Array of 1 of K coded targets with label smoothing i.e.
an array of shape + (num_data, num_classes) + + """ + + raise NotImplementedError + + class MetOfficeDataProvider(DataProvider): """South Scotland Met Office weather data provider.""" @@ -253,3 +392,41 @@ class CCPPDataProvider(DataProvider): targets = loaded[which_set + '_targets'] super(CCPPDataProvider, self).__init__( inputs, targets, batch_size, max_num_batches, shuffle_order, rng) + + +class AugmentedMNISTDataProvider(MNISTDataProvider): + """Data provider for MNIST dataset which randomly transforms images.""" + + def __init__(self, which_set='train', batch_size=100, max_num_batches=-1, + shuffle_order=True, rng=None, transformer=None): + """Create a new augmented MNIST data provider object. + + Args: + which_set: One of 'train', 'valid' or 'test'. Determines which + portion of the MNIST data this object should provide. + batch_size (int): Number of data points to include in each batch. + max_num_batches (int): Maximum number of batches to iterate over + in an epoch. If `max_num_batches * batch_size > num_data` then + only as many batches as the data can be split into will be + used. If set to -1 all of the data will be used. + shuffle_order (bool): Whether to randomly permute the order of + the data before each epoch. + rng (RandomState): A seeded random number generator. + transformer: Function which takes an `inputs` array of shape + (batch_size, input_dim) corresponding to a batch of input + images and a `rng` random number generator object (i.e. a + call signature `transformer(inputs, rng)`) and applies a + potentiall random set of transformations to some / all of the + input images as each new batch is returned when iterating over + the data provider. + """ + super(AugmentedMNISTDataProvider, self).__init__( + which_set, batch_size, max_num_batches, shuffle_order, rng) + self.transformer = transformer + + def next(self): + """Returns next data batch or raises `StopIteration` if at end.""" + inputs_batch, targets_batch = super( + AugmentedMNISTDataProvider, self).next() + transformed_inputs_batch = self.transformer(inputs_batch, self.rng) + return transformed_inputs_batch, targets_batch diff --git a/mlp/errors.py b/mlp/errors.py index 2b49917..c05b752 100644 --- a/mlp/errors.py +++ b/mlp/errors.py @@ -154,9 +154,9 @@ class CrossEntropySoftmaxError(object): Returns: Scalar error function value. """ - probs = np.exp(outputs) - probs /= probs.sum(-1)[:, None] - return -np.mean(np.sum(targets * np.log(probs), axis=1)) + normOutputs = outputs - outputs.max(-1)[:, None] + logProb = normOutputs - np.log(np.sum(np.exp(normOutputs), axis=-1)[:, None]) + return -np.mean(np.sum(targets * logProb, axis=1)) def grad(self, outputs, targets): """Calculates gradient of error function with respect to outputs. @@ -168,7 +168,7 @@ class CrossEntropySoftmaxError(object): Returns: Gradient of error function with respect to outputs. """ - probs = np.exp(outputs) + probs = np.exp(outputs - outputs.max(-1)[:, None]) probs /= probs.sum(-1)[:, None] return (probs - targets) / outputs.shape[0] diff --git a/mlp/initialisers.py b/mlp/initialisers.py index 243adc2..8c8e252 100644 --- a/mlp/initialisers.py +++ b/mlp/initialisers.py @@ -63,3 +63,81 @@ class NormalInit(object): def __call__(self, shape): return self.rng.normal(loc=self.mean, scale=self.std, size=shape) + +class GlorotUniformInit(object): + """Glorot and Bengio (2010) random uniform weights initialiser. 
+
+    Initialises a two-dimensional parameter array using the 'normalized
+    initialisation' scheme suggested in [1] which attempts to maintain a
+    roughly constant variance in the activations and backpropagated gradients
+    of a multi-layer model consisting of interleaved affine and logistic
+    sigmoidal transformation layers.
+
+    Weights are sampled from a zero-mean uniform distribution with standard
+    deviation `sqrt(2 / (input_dim + output_dim))` where `input_dim` and
+    `output_dim` are the input and output dimensions of the weight matrix
+    respectively.
+
+    References:
+        [1]: Understanding the difficulty of training deep feedforward neural
+             networks, Glorot and Bengio (2010)
+    """
+
+    def __init__(self, gain=1., rng=None):
+        """Construct a normalised initialisation random initialiser object.
+
+        Args:
+            gain: Multiplicative factor to scale initialised weights by.
+                Recommended value is 1 for affine layers followed by
+                logistic sigmoid layers (or another affine layer).
+            rng (RandomState): Seeded random number generator.
+        """
+        self.gain = gain
+        if rng is None:
+            rng = np.random.RandomState(DEFAULT_SEED)
+        self.rng = rng
+
+    def __call__(self, shape):
+        assert len(shape) == 2, (
+            'Initialiser should only be used for two dimensional arrays.')
+        std = self.gain * (2. / (shape[0] + shape[1]))**0.5
+        half_width = 3.**0.5 * std
+        return self.rng.uniform(low=-half_width, high=half_width, size=shape)
+
+
+class GlorotNormalInit(object):
+    """Glorot and Bengio (2010) random normal weights initialiser.
+
+    Initialises a two-dimensional parameter array using the 'normalized
+    initialisation' scheme suggested in [1] which attempts to maintain a
+    roughly constant variance in the activations and backpropagated gradients
+    of a multi-layer model consisting of interleaved affine and logistic
+    sigmoidal transformation layers.
+
+    Weights are sampled from a zero-mean normal distribution with standard
+    deviation `sqrt(2 / (input_dim + output_dim))` where `input_dim` and
+    `output_dim` are the input and output dimensions of the weight matrix
+    respectively.
+
+    References:
+        [1]: Understanding the difficulty of training deep feedforward neural
+             networks, Glorot and Bengio (2010)
+    """
+
+    def __init__(self, gain=1., rng=None):
+        """Construct a normalised initialisation random initialiser object.
+
+        Args:
+            gain: Multiplicative factor to scale initialised weights by.
+                Recommended value is 1 for affine layers followed by
+                logistic sigmoid layers (or another affine layer).
+            rng (RandomState): Seeded random number generator.
+        """
+        self.gain = gain
+        if rng is None:
+            rng = np.random.RandomState(DEFAULT_SEED)
+        self.rng = rng
+
+    def __call__(self, shape):
+        std = self.gain * (2. / (shape[0] + shape[1]))**0.5
+        return self.rng.normal(loc=0., scale=std, size=shape)
diff --git a/mlp/layers.py b/mlp/layers.py
index cc4cdda..d3cb38e 100644
--- a/mlp/layers.py
+++ b/mlp/layers.py
@@ -14,7 +14,7 @@ respect to the layer parameters.
 
 import numpy as np
 import mlp.initialisers as init
-
+from mlp import DEFAULT_SEED
 
 class Layer(object):
     """Abstract class defining the interface for a layer."""
@@ -68,6 +68,13 @@ class LayerWithParameters(Layer):
         """
         raise NotImplementedError()
 
+    def params_penalty(self):
+        """Returns the parameter dependent penalty term for this layer.
+
+        If no parameter-dependent penalty terms are set this returns zero.
+        """
+        raise NotImplementedError()
+
     @property
     def params(self):
         """Returns a list of parameters of layer.
@@ -88,6 +95,127 @@ class LayerWithParameters(Layer): """ raise NotImplementedError() +class StochasticLayerWithParameters(Layer): + """Specialised layer which uses a stochastic forward propagation.""" + + def __init__(self, rng=None): + """Constructs a new StochasticLayer object. + + Args: + rng (RandomState): Seeded random number generator object. + """ + if rng is None: + rng = np.random.RandomState(DEFAULT_SEED) + self.rng = rng + + def fprop(self, inputs, stochastic=True): + """Forward propagates activations through the layer transformation. + + Args: + inputs: Array of layer inputs of shape (batch_size, input_dim). + stochastic: Flag allowing different deterministic + forward-propagation mode in addition to default stochastic + forward-propagation e.g. for use at test time. If False + a deterministic forward-propagation transformation + corresponding to the expected output of the stochastic + forward-propagation is applied. + + Returns: + outputs: Array of layer outputs of shape (batch_size, output_dim). + """ + raise NotImplementedError() + def grads_wrt_params(self, inputs, grads_wrt_outputs): + """Calculates gradients with respect to layer parameters. + + Args: + inputs: Array of inputs to layer of shape (batch_size, input_dim). + grads_wrt_to_outputs: Array of gradients with respect to the layer + outputs of shape (batch_size, output_dim). + + Returns: + List of arrays of gradients with respect to the layer parameters + with parameter gradients appearing in same order in tuple as + returned from `get_params` method. + """ + raise NotImplementedError() + + def params_penalty(self): + """Returns the parameter dependent penalty term for this layer. + + If no parameter-dependent penalty terms are set this returns zero. + """ + raise NotImplementedError() + + @property + def params(self): + """Returns a list of parameters of layer. + + Returns: + List of current parameter values. This list should be in the + corresponding order to the `values` argument to `set_params`. + """ + raise NotImplementedError() + + @params.setter + def params(self, values): + """Sets layer parameters from a list of values. + + Args: + values: List of values to set parameters to. This list should be + in the corresponding order to what is returned by `get_params`. + """ + raise NotImplementedError() + +class StochasticLayer(Layer): + """Specialised layer which uses a stochastic forward propagation.""" + + def __init__(self, rng=None): + """Constructs a new StochasticLayer object. + + Args: + rng (RandomState): Seeded random number generator object. + """ + if rng is None: + rng = np.random.RandomState(DEFAULT_SEED) + self.rng = rng + + def fprop(self, inputs, stochastic=True): + """Forward propagates activations through the layer transformation. + + Args: + inputs: Array of layer inputs of shape (batch_size, input_dim). + stochastic: Flag allowing different deterministic + forward-propagation mode in addition to default stochastic + forward-propagation e.g. for use at test time. If False + a deterministic forward-propagation transformation + corresponding to the expected output of the stochastic + forward-propagation is applied. + + Returns: + outputs: Array of layer outputs of shape (batch_size, output_dim). + """ + raise NotImplementedError() + + def bprop(self, inputs, outputs, grads_wrt_outputs): + """Back propagates gradients through a layer. + + Given gradients with respect to the outputs of the layer calculates the + gradients with respect to the layer inputs. 
This should correspond to + default stochastic forward-propagation. + + Args: + inputs: Array of layer inputs of shape (batch_size, input_dim). + outputs: Array of layer outputs calculated in forward pass of + shape (batch_size, output_dim). + grads_wrt_outputs: Array of gradients with respect to the layer + outputs of shape (batch_size, output_dim). + + Returns: + Array of gradients with respect to the layer inputs of shape + (batch_size, input_dim). + """ + raise NotImplementedError() + class AffineLayer(LayerWithParameters): """Layer implementing an affine tranformation of its inputs. @@ -97,7 +225,8 @@ class AffineLayer(LayerWithParameters): def __init__(self, input_dim, output_dim, weights_initialiser=init.UniformInit(-0.1, 0.1), - biases_initialiser=init.ConstantInit(0.)): + biases_initialiser=init.ConstantInit(0.), + weights_penalty=None, biases_penalty=None): """Initialises a parameterised affine layer. Args: @@ -105,11 +234,17 @@ class AffineLayer(LayerWithParameters): output_dim (int): Dimension of the layer outputs. weights_initialiser: Initialiser for the weight parameters. biases_initialiser: Initialiser for the bias parameters. + weights_penalty: Weights-dependent penalty term (regulariser) or + None if no regularisation is to be applied to the weights. + biases_penalty: Biases-dependent penalty term (regulariser) or + None if no regularisation is to be applied to the biases. """ self.input_dim = input_dim self.output_dim = output_dim self.weights = weights_initialiser((self.output_dim, self.input_dim)) self.biases = biases_initialiser(self.output_dim) + self.weights_penalty = weights_penalty + self.biases_penalty = biases_penalty def fprop(self, inputs): """Forward propagates activations through the layer transformation. @@ -123,7 +258,7 @@ class AffineLayer(LayerWithParameters): Returns: outputs: Array of layer outputs of shape (batch_size, output_dim). """ - return inputs.dot(self.weights.T) + self.biases + return self.weights.dot(inputs.T).T + self.biases def bprop(self, inputs, outputs, grads_wrt_outputs): """Back propagates gradients through a layer. @@ -159,8 +294,27 @@ class AffineLayer(LayerWithParameters): grads_wrt_weights = np.dot(grads_wrt_outputs.T, inputs) grads_wrt_biases = np.sum(grads_wrt_outputs, axis=0) + + if self.weights_penalty is not None: + grads_wrt_weights += self.weights_penalty.grad(parameter=self.weights) + + if self.biases_penalty is not None: + grads_wrt_biases += self.biases_penalty.grad(parameter=self.biases) + return [grads_wrt_weights, grads_wrt_biases] + def params_penalty(self): + """Returns the parameter dependent penalty term for this layer. + + If no parameter-dependent penalty terms are set this returns zero. 
+ """ + params_penalty = 0 + if self.weights_penalty is not None: + params_penalty += self.weights_penalty(self.weights) + if self.biases_penalty is not None: + params_penalty += self.biases_penalty(self.biases) + return params_penalty + @property def params(self): """A list of layer parameter values: `[weights, biases]`.""" @@ -175,7 +329,6 @@ class AffineLayer(LayerWithParameters): return 'AffineLayer(input_dim={0}, output_dim={1})'.format( self.input_dim, self.output_dim) - class SigmoidLayer(Layer): """Layer implementing an element-wise logistic sigmoid transformation.""" @@ -215,6 +368,160 @@ class SigmoidLayer(Layer): def __repr__(self): return 'SigmoidLayer' +class ReluLayer(Layer): + """Layer implementing an element-wise rectified linear transformation.""" + + def fprop(self, inputs): + """Forward propagates activations through the layer transformation. + + For inputs `x` and outputs `y` this corresponds to `y = max(0, x)`. + + Args: + inputs: Array of layer inputs of shape (batch_size, input_dim). + + Returns: + outputs: Array of layer outputs of shape (batch_size, output_dim). + """ + return np.maximum(inputs, 0.) + + def bprop(self, inputs, outputs, grads_wrt_outputs): + """Back propagates gradients through a layer. + + Given gradients with respect to the outputs of the layer calculates the + gradients with respect to the layer inputs. + + Args: + inputs: Array of layer inputs of shape (batch_size, input_dim). + outputs: Array of layer outputs calculated in forward pass of + shape (batch_size, output_dim). + grads_wrt_outputs: Array of gradients with respect to the layer + outputs of shape (batch_size, output_dim). + + Returns: + Array of gradients with respect to the layer inputs of shape + (batch_size, input_dim). + """ + return (outputs > 0) * grads_wrt_outputs + + def __repr__(self): + return 'ReluLayer' + +class LeakyReluLayer(Layer): + """Layer implementing an element-wise leaky rectified linear transformation.""" + def __init__(self, alpha=0.01): + self.alpha = alpha + + def fprop(self, inputs): + """Forward propagates activations through the layer transformation. + + For inputs `x` and outputs `y` this corresponds to `y = ..., else`. + """ + + raise NotImplementedError + + def bprop(self, inputs, outputs, grads_wrt_outputs): + """Back propagates gradients through a layer. + + Given gradients with respect to the outputs of the layer calculates the + gradients with respect to the layer inputs. + """ + raise NotImplementedError + + def __repr__(self): + return 'LeakyReluLayer' + + + +class ParametricReluLayer(LayerWithParameters): + """Layer implementing an element-wise parametric rectified linear transformation.""" + + def __init__(self, alpha=0.25): + self.alpha = np.array([alpha]) + + @property + def params(self): + """A list of layer parameter values: `[weights, biases]`.""" + return [self.alpha] + + def fprop(self, inputs): + """Forward propagates activations through the layer transformation. + + For inputs `x` and outputs `y` this corresponds to `y = ..., else`. + """ + raise NotImplementedError + + def bprop(self, inputs, outputs, grads_wrt_outputs): + """Back propagates gradients through a layer. + + Given gradients with respect to the outputs of the layer calculates the + gradients with respect to the layer inputs. + """ + raise NotImplementedError + + def grads_wrt_params(self, inputs, grads_wrt_outputs): + """Calculates gradients with respect to layer parameters. 
+ + Args: + inputs: array of inputs to layer of shape (batch_size, input_dim) + grads_wrt_to_outputs: array of gradients with respect to the layer + outputs of shape (batch_size, output_dim) + + Returns: + list of arrays of gradients with respect to the layer parameters + `[grads_wrt_params]`. Where params is the alpha parameter. + """ + raise NotImplementedError + + @property + def params(self): + """A list of layer parameter values: `[weights, biases]`.""" + return [self.alpha] + + @params.setter + def params(self, values): + self.alpha = values[0] + + def __repr__(self): + return 'ParametricReluLayer' + + +class TanhLayer(Layer): + """Layer implementing an element-wise hyperbolic tangent transformation.""" + + def fprop(self, inputs): + """Forward propagates activations through the layer transformation. + + For inputs `x` and outputs `y` this corresponds to `y = tanh(x)`. + + Args: + inputs: Array of layer inputs of shape (batch_size, input_dim). + + Returns: + outputs: Array of layer outputs of shape (batch_size, output_dim). + """ + return np.tanh(inputs) + + def bprop(self, inputs, outputs, grads_wrt_outputs): + """Back propagates gradients through a layer. + + Given gradients with respect to the outputs of the layer calculates the + gradients with respect to the layer inputs. + + Args: + inputs: Array of layer inputs of shape (batch_size, input_dim). + outputs: Array of layer outputs calculated in forward pass of + shape (batch_size, output_dim). + grads_wrt_outputs: Array of gradients with respect to the layer + outputs of shape (batch_size, output_dim). + + Returns: + Array of gradients with respect to the layer inputs of shape + (batch_size, input_dim). + """ + return (1. - outputs**2) * grads_wrt_outputs + + def __repr__(self): + return 'TanhLayer' class SoftmaxLayer(Layer): """Layer implementing a softmax transformation.""" @@ -232,7 +539,9 @@ class SoftmaxLayer(Layer): Returns: outputs: Array of layer outputs of shape (batch_size, output_dim). """ - exp_inputs = np.exp(inputs) + # subtract max inside exponential to improve numerical stability - + # when we divide through by sum this term cancels + exp_inputs = np.exp(inputs - inputs.max(-1)[:, None]) return exp_inputs / exp_inputs.sum(-1)[:, None] def bprop(self, inputs, outputs, grads_wrt_outputs): @@ -257,3 +566,177 @@ class SoftmaxLayer(Layer): def __repr__(self): return 'SoftmaxLayer' + +class RadialBasisFunctionLayer(Layer): + """Layer implementing projection to a grid of radial basis functions.""" + + def __init__(self, grid_dim, intervals=[[0., 1.]]): + """Creates a radial basis function layer object. + + Args: + grid_dim: Integer specifying how many basis function to use in + grid across input space per dimension (so total number of + basis functions will be grid_dim**input_dim) + intervals: List of intervals (two element lists or tuples) + specifying extents of axis-aligned region in input-space to + tile basis functions in grid across. For example for a 2D input + space spanning [0, 1] x [0, 1] use intervals=[[0, 1], [0, 1]]. + """ + num_basis = grid_dim**len(intervals) + self.centres = np.array(np.meshgrid(*[ + np.linspace(low, high, grid_dim) for (low, high) in intervals]) + ).reshape((len(intervals), -1)) + self.scales = np.array([ + [(high - low) * 1. / grid_dim] for (low, high) in intervals]) + + def fprop(self, inputs): + """Forward propagates activations through the layer transformation. + + Args: + inputs: Array of layer inputs of shape (batch_size, input_dim). 
+
+        Returns:
+            outputs: Array of layer outputs of shape (batch_size, output_dim).
+        """
+        return np.exp(-(inputs[..., None] - self.centres[None, ...])**2 /
+                      self.scales**2).reshape((inputs.shape[0], -1))
+
+    def bprop(self, inputs, outputs, grads_wrt_outputs):
+        """Back propagates gradients through a layer.
+
+        Given gradients with respect to the outputs of the layer calculates the
+        gradients with respect to the layer inputs.
+
+        Args:
+            inputs: Array of layer inputs of shape (batch_size, input_dim).
+            outputs: Array of layer outputs calculated in forward pass of
+                shape (batch_size, output_dim).
+            grads_wrt_outputs: Array of gradients with respect to the layer
+                outputs of shape (batch_size, output_dim).
+
+        Returns:
+            Array of gradients with respect to the layer inputs of shape
+            (batch_size, input_dim).
+        """
+        num_basis = self.centres.shape[1]
+        return -2 * (
+            ((inputs[..., None] - self.centres[None, ...]) / self.scales**2) *
+            grads_wrt_outputs.reshape((inputs.shape[0], -1, num_basis))
+        ).sum(-1)
+
+    def __repr__(self):
+        return 'RadialBasisFunctionLayer(grid_dim={0})'.format(self.grid_dim)
+
+class DropoutLayer(StochasticLayer):
+    """Layer which stochastically drops input dimensions in its output."""
+
+    def __init__(self, rng=None, incl_prob=0.5, share_across_batch=True):
+        """Construct a new dropout layer.
+
+        Args:
+            rng (RandomState): Seeded random number generator.
+            incl_prob: Scalar value in (0, 1] specifying the probability of
+                each input dimension being included in the output.
+            share_across_batch: Whether to use same dropout mask across
+                all inputs in a batch or use per input masks.
+        """
+        super(DropoutLayer, self).__init__(rng)
+        assert incl_prob > 0. and incl_prob <= 1.
+        self.incl_prob = incl_prob
+        self.share_across_batch = share_across_batch
+
+    def fprop(self, inputs, stochastic=True):
+        """Forward propagates activations through the layer transformation.
+
+        Args:
+            inputs: Array of layer inputs of shape (batch_size, input_dim).
+            stochastic: Flag allowing different deterministic
+                forward-propagation mode in addition to default stochastic
+                forward-propagation e.g. for use at test time. If False
+                a deterministic forward-propagation transformation
+                corresponding to the expected output of the stochastic
+                forward-propagation is applied.
+
+        Returns:
+            outputs: Array of layer outputs of shape (batch_size, output_dim).
+        """
+        raise NotImplementedError
+
+    def bprop(self, inputs, outputs, grads_wrt_outputs):
+        """Back propagates gradients through a layer.
+
+        Given gradients with respect to the outputs of the layer calculates the
+        gradients with respect to the layer inputs. This should correspond to
+        default stochastic forward-propagation.
+
+        Args:
+            inputs: Array of layer inputs of shape (batch_size, input_dim).
+            outputs: Array of layer outputs calculated in forward pass of
+                shape (batch_size, output_dim).
+            grads_wrt_outputs: Array of gradients with respect to the layer
+                outputs of shape (batch_size, output_dim).
+
+        Returns:
+            Array of gradients with respect to the layer inputs of shape
+            (batch_size, input_dim).
+        """
+        raise NotImplementedError
+
+    def __repr__(self):
+        return 'DropoutLayer(incl_prob={0:.1f})'.format(self.incl_prob)
+
+class ReshapeLayer(Layer):
+    """Layer which reshapes dimensions of inputs."""
+
+    def __init__(self, output_shape=None):
+        """Create a new reshape layer object.
+
+        Args:
+            output_shape: Tuple specifying shape each input in batch should
+                be reshaped to in outputs.
This **excludes** the batch size + so the shape of the final output array will be + (batch_size, ) + output_shape + Similarly to numpy.reshape, one shape dimension can be -1. In + this case, the value is inferred from the size of the input + array and remaining dimensions. The shape specified must be + compatible with the input array shape - i.e. the total number + of values in the array cannot be changed. If set to `None` the + output shape will be set to + (batch_size, -1) + which will flatten all the inputs to vectors. + """ + self.output_shape = (-1,) if output_shape is None else output_shape + + def fprop(self, inputs): + """Forward propagates activations through the layer transformation. + + Args: + inputs: Array of layer inputs of shape (batch_size, input_dim). + + Returns: + outputs: Array of layer outputs of shape (batch_size, output_dim). + """ + return inputs.reshape((inputs.shape[0],) + self.output_shape) + + def bprop(self, inputs, outputs, grads_wrt_outputs): + """Back propagates gradients through a layer. + + Given gradients with respect to the outputs of the layer calculates the + gradients with respect to the layer inputs. + + Args: + inputs: Array of layer inputs of shape (batch_size, input_dim). + outputs: Array of layer outputs calculated in forward pass of + shape (batch_size, output_dim). + grads_wrt_outputs: Array of gradients with respect to the layer + outputs of shape (batch_size, output_dim). + + Returns: + Array of gradients with respect to the layer inputs of shape + (batch_size, input_dim). + """ + return grads_wrt_outputs.reshape(inputs.shape) + + def __repr__(self): + return 'ReshapeLayer(output_shape={0})'.format(self.output_shape) diff --git a/mlp/learning_rules.py b/mlp/learning_rules.py index 22f2bcb..56fcb63 100644 --- a/mlp/learning_rules.py +++ b/mlp/learning_rules.py @@ -160,3 +160,158 @@ class MomentumLearningRule(GradientDescentLearningRule): mom *= self.mom_coeff mom -= self.learning_rate * grad param += mom + + +class AdamLearningRule(GradientDescentLearningRule): + """Adaptive moments (Adam) learning rule. + First-order gradient-descent based learning rule which uses adaptive + estimates of first and second moments of the parameter gradients to + calculate the parameter updates. + References: + [1]: Adam: a method for stochastic optimisation + Kingma and Ba, 2015 + """ + + def __init__(self, learning_rate=1e-3, beta_1=0.9, beta_2=0.999, + epsilon=1e-8): + """Creates a new learning rule object. + Args: + learning_rate: A postive scalar to scale gradient updates to the + parameters by. This needs to be carefully set - if too large + the learning dynamic will be unstable and may diverge, while + if set too small learning will proceed very slowly. + beta_1: Exponential decay rate for gradient first moment estimates. + This should be a scalar value in [0, 1]. The running gradient + first moment estimate is calculated using + `m_1 = beta_1 * m_1_prev + (1 - beta_1) * g` + where `m_1_prev` is the previous estimate and `g` the current + parameter gradients. + beta_2: Exponential decay rate for gradient second moment + estimates. This should be a scalar value in [0, 1]. The run + gradient second moment estimate is calculated using + `m_2 = beta_2 * m_2_prev + (1 - beta_2) * g**2` + where `m_2_prev` is the previous estimate and `g` the current + parameter gradients. + epsilon: 'Softening' parameter to stop updates diverging when + second moment estimates are close to zero. Should be set to + a small positive value. 
+        """
+        super(AdamLearningRule, self).__init__(learning_rate)
+        assert beta_1 >= 0. and beta_1 <= 1., 'beta_1 should be in [0, 1].'
+        assert beta_2 >= 0. and beta_2 <= 1., 'beta_2 should be in [0, 1].'
+        assert epsilon > 0., 'epsilon should be > 0.'
+        self.beta_1 = beta_1
+        self.beta_2 = beta_2
+        self.epsilon = epsilon
+
+    def initialise(self, params):
+        """Initialises the state of the learning rule for a set of parameters.
+        This must be called before `update_params` is first called.
+        Args:
+            params: A list of the parameters to be optimised. Note these will
+                be updated *in-place* to avoid reallocating arrays on each
+                update.
+        """
+        super(AdamLearningRule, self).initialise(params)
+        self.moms_1 = []
+        for param in self.params:
+            self.moms_1.append(np.zeros_like(param))
+        self.moms_2 = []
+        for param in self.params:
+            self.moms_2.append(np.zeros_like(param))
+        self.step_count = 0
+
+    def reset(self):
+        """Resets any additional state variables to their initial values.
+        For this learning rule this corresponds to zeroing the estimates of
+        the first and second moments of the gradients.
+        """
+        for mom_1, mom_2 in zip(self.moms_1, self.moms_2):
+            mom_1 *= 0.
+            mom_2 *= 0.
+        self.step_count = 0
+
+    def update_params(self, grads_wrt_params):
+        """Applies a single update to all parameters.
+        All parameter updates are performed using in-place operations and so
+        nothing is returned.
+        Args:
+            grads_wrt_params: A list of gradients of the scalar loss function
+                with respect to each of the parameters passed to `initialise`
+                previously, with this list expected to be in the same order.
+        """
+        for param, mom_1, mom_2, grad in zip(
+                self.params, self.moms_1, self.moms_2, grads_wrt_params):
+            mom_1 *= self.beta_1
+            mom_1 += (1. - self.beta_1) * grad
+            mom_2 *= self.beta_2
+            mom_2 += (1. - self.beta_2) * grad ** 2
+            alpha_t = (
+                self.learning_rate *
+                (1. - self.beta_2 ** (self.step_count + 1)) ** 0.5 /
+                (1. - self.beta_1 ** (self.step_count + 1))
+            )
+            param -= alpha_t * mom_1 / (mom_2 ** 0.5 + self.epsilon)
+        self.step_count += 1
+
+
+class AdaGradLearningRule(GradientDescentLearningRule):
+    """Adaptive gradients (AdaGrad) learning rule.
+    First-order gradient-descent based learning rule which normalises gradient
+    updates by a running sum of the past squared gradients.
+    References:
+        [1]: Adaptive Subgradient Methods for Online Learning and Stochastic
+             Optimization. Duchi, Hazan and Singer, 2011
+    """
+
+    def __init__(self, learning_rate=1e-2, epsilon=1e-8):
+        """Creates a new learning rule object.
+        Args:
+            learning_rate: A positive scalar to scale gradient updates to the
+                parameters by. This needs to be carefully set - if too large
+                the learning dynamic will be unstable and may diverge, while
+                if set too small learning will proceed very slowly.
+            epsilon: 'Softening' parameter to stop updates diverging when
+                sums of squared gradients are close to zero. Should be set to
+                a small positive value.
+        """
+        super(AdaGradLearningRule, self).__init__(learning_rate)
+        assert epsilon > 0., 'epsilon should be > 0.'
+        self.epsilon = epsilon
+
+    def initialise(self, params):
+        """Initialises the state of the learning rule for a set of parameters.
+        This must be called before `update_params` is first called.
+        Args:
+            params: A list of the parameters to be optimised. Note these will
+                be updated *in-place* to avoid reallocating arrays on each
+                update.
+ """ + super(AdaGradLearningRule, self).initialise(params) + self.sum_sq_grads = [] + for param in self.params: + self.sum_sq_grads.append(np.zeros_like(param)) + + def reset(self): + """Resets any additional state variables to their initial values. + For this learning rule this corresponds to zeroing all the sum of + squared gradient states. + """ + for sum_sq_grad in self.sum_sq_grads: + sum_sq_grad *= 0. + + def update_params(self, grads_wrt_params): + """Applies a single update to all parameters. + All parameter updates are performed using in-place operations and so + nothing is returned. + Args: + grads_wrt_params: A list of gradients of the scalar loss function + with respect to each of the parameters passed to `initialise` + previously, with this list expected to be in the same order. + """ + for param, sum_sq_grad, grad in zip( + self.params, self.sum_sq_grads, grads_wrt_params): + sum_sq_grad += grad ** 2 + param -= (self.learning_rate * grad / + (sum_sq_grad + self.epsilon) ** 0.5) + diff --git a/mlp/models.py b/mlp/models.py index cccd62d..7f1273e 100644 --- a/mlp/models.py +++ b/mlp/models.py @@ -8,7 +8,7 @@ outputs (and intermediate states) and for calculating gradients of scalar functions of the outputs with respect to the model parameters. """ -from mlp.layers import LayerWithParameters +from mlp.layers import LayerWithParameters, StochasticLayer, StochasticLayerWithParameters class SingleLayerModel(object): @@ -80,11 +80,11 @@ class MultipleLayerModel(object): """A list of all of the parameters of the model.""" params = [] for layer in self.layers: - if isinstance(layer, LayerWithParameters): + if isinstance(layer, LayerWithParameters) or isinstance(layer, StochasticLayerWithParameters): params += layer.params return params - def fprop(self, inputs): + def fprop(self, inputs, evaluation=False): """Forward propagates a batch of inputs through the model. 
Args: @@ -97,7 +97,19 @@ class MultipleLayerModel(object): """ activations = [inputs] for i, layer in enumerate(self.layers): - activations.append(self.layers[i].fprop(activations[i])) + if evaluation: + if issubclass(type(self.layers[i]), StochasticLayer) or issubclass(type(self.layers[i]), + StochasticLayerWithParameters): + current_activations = self.layers[i].fprop(activations[i], stochastic=False) + else: + current_activations = self.layers[i].fprop(activations[i]) + else: + if issubclass(type(self.layers[i]), StochasticLayer) or issubclass(type(self.layers[i]), + StochasticLayerWithParameters): + current_activations = self.layers[i].fprop(activations[i], stochastic=True) + else: + current_activations = self.layers[i].fprop(activations[i]) + activations.append(current_activations) return activations def grads_wrt_params(self, activations, grads_wrt_outputs): @@ -119,7 +131,7 @@ class MultipleLayerModel(object): inputs = activations[-i - 2] outputs = activations[-i - 1] grads_wrt_inputs = layer.bprop(inputs, outputs, grads_wrt_outputs) - if isinstance(layer, LayerWithParameters): + if isinstance(layer, LayerWithParameters) or isinstance(layer, StochasticLayerWithParameters): grads_wrt_params += layer.grads_wrt_params( inputs, grads_wrt_outputs)[::-1] grads_wrt_outputs = grads_wrt_inputs diff --git a/mlp/optimisers.py b/mlp/optimisers.py index e07e225..61509ae 100644 --- a/mlp/optimisers.py +++ b/mlp/optimisers.py @@ -9,7 +9,7 @@ import time import logging from collections import OrderedDict import numpy as np - +import tqdm logger = logging.getLogger(__name__) @@ -18,7 +18,7 @@ class Optimiser(object): """Basic model optimiser.""" def __init__(self, model, error, learning_rule, train_dataset, - valid_dataset=None, data_monitors=None): + valid_dataset=None, data_monitors=None, notebook=False): """Create a new optimiser instance. Args: @@ -43,6 +43,11 @@ class Optimiser(object): self.data_monitors = OrderedDict([('error', error)]) if data_monitors is not None: self.data_monitors.update(data_monitors) + self.notebook = notebook + if notebook: + self.tqdm_progress = tqdm.tqdm_notebook + else: + self.tqdm_progress = tqdm.tqdm def do_training_epoch(self): """Do a single training epoch. @@ -52,12 +57,15 @@ class Optimiser(object): respect to all the model parameters and then updates the model parameters according to the learning rule. """ - for inputs_batch, targets_batch in self.train_dataset: - activations = self.model.fprop(inputs_batch) - grads_wrt_outputs = self.error.grad(activations[-1], targets_batch) - grads_wrt_params = self.model.grads_wrt_params( - activations, grads_wrt_outputs) - self.learning_rule.update_params(grads_wrt_params) + with self.tqdm_progress(total=self.train_dataset.num_batches) as train_progress_bar: + train_progress_bar.set_description("Ep Prog") + for inputs_batch, targets_batch in self.train_dataset: + activations = self.model.fprop(inputs_batch) + grads_wrt_outputs = self.error.grad(activations[-1], targets_batch) + grads_wrt_params = self.model.grads_wrt_params( + activations, grads_wrt_outputs) + self.learning_rule.update_params(grads_wrt_params) + train_progress_bar.update(1) def eval_monitors(self, dataset, label): """Evaluates the monitors for the given dataset. @@ -72,7 +80,7 @@ class Optimiser(object): data_mon_vals = OrderedDict([(key + label, 0.) 
for key in self.data_monitors.keys()]) for inputs_batch, targets_batch in dataset: - activations = self.model.fprop(inputs_batch) + activations = self.model.fprop(inputs_batch, evaluation=True) for key, data_monitor in self.data_monitors.items(): data_mon_vals[key + label] += data_monitor( activations[-1], targets_batch) @@ -104,7 +112,7 @@ class Optimiser(object): """ logger.info('Epoch {0}: {1:.1f}s to complete\n {2}'.format( epoch, epoch_time, - ', '.join(['{0}={1:.2e}'.format(k, v) for (k, v) in stats.items()]) + ', '.join(['{}={:.2e}'.format(k, v) for (k, v) in stats.items()]) )) def train(self, num_epochs, stats_interval=5): @@ -121,17 +129,20 @@ class Optimiser(object): and the second being a dict mapping the labels for the statistics recorded to their column index in the array. """ - start_train_time = time.process_time() + start_train_time = time.time() run_stats = [list(self.get_epoch_stats().values())] - for epoch in range(1, num_epochs + 1): - start_time = time.process_time() - self.do_training_epoch() - epoch_time = time.process_time() - start_time - if epoch % stats_interval == 0: - stats = self.get_epoch_stats() - self.log_stats(epoch, epoch_time, stats) - run_stats.append(list(stats.values())) - finish_train_time = time.process_time() + with self.tqdm_progress(total=num_epochs) as progress_bar: + progress_bar.set_description("Exp Prog") + for epoch in range(1, num_epochs + 1): + start_time = time.time() + self.do_training_epoch() + epoch_time = time.time()- start_time + if epoch % stats_interval == 0: + stats = self.get_epoch_stats() + self.log_stats(epoch, epoch_time, stats) + run_stats.append(list(stats.values())) + progress_bar.update(1) + finish_train_time = time.time() total_train_time = finish_train_time - start_train_time return np.array(run_stats), {k: i for i, k in enumerate(stats.keys())}, total_train_time diff --git a/mlp/schedulers.py b/mlp/schedulers.py index 4f53e7e..7abab2f 100644 --- a/mlp/schedulers.py +++ b/mlp/schedulers.py @@ -32,3 +32,42 @@ class ConstantLearningRateScheduler(object): epoch_number: Integer index of training epoch about to be run. """ learning_rule.learning_rate = self.learning_rate + +class CosineAnnealingWithWarmRestarts(object): + """Cosine annealing scheduler, implemented as in https://arxiv.org/pdf/1608.03983.pdf""" + + def __init__(self, min_learning_rate, max_learning_rate, total_iters_per_period, max_learning_rate_discount_factor, + period_iteration_expansion_factor): + """ + Instantiates a new cosine annealing with warm restarts learning rate scheduler + :param min_learning_rate: The minimum learning rate the scheduler can assign + :param max_learning_rate: The maximum learning rate the scheduler can assign + :param total_epochs_per_period: The number of epochs in a period + :param max_learning_rate_discount_factor: The rate of discount for the maximum learning rate after each restart i.e. how many times smaller the max learning rate will be after a restart compared to the previous one + :param period_iteration_expansion_factor: The rate of expansion of the period epochs. e.g. if it's set to 1 then all periods have the same number of epochs, if it's larger than 1 then each subsequent period will have more epochs and vice versa. 
+ """ + self.min_learning_rate = min_learning_rate + self.max_learning_rate = max_learning_rate + self.total_epochs_per_period = total_iters_per_period + + self.max_learning_rate_discount_factor = max_learning_rate_discount_factor + self.period_iteration_expansion_factor = period_iteration_expansion_factor + + + def update_learning_rule(self, learning_rule, epoch_number): + """Update the hyperparameters of the learning rule. + + Run at the beginning of each epoch. + + Args: + learning_rule: Learning rule object being used in training run, + any scheduled hyperparameters to be altered should be + attributes of this object. + epoch_number: Integer index of training epoch about to be run. + Returns: + effective_learning_rate at step 'epoch_number' + """ + raise NotImplementedError + + +
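
Notes on the stubbed-out methods: the sketches below are illustrative only, not part of the patch and not reference solutions; they assume `import numpy as np` as in the modules above, and any helper names or conventions they introduce are assumptions.

A possible body for EMNISTDataProvider.label_smoothing, assuming the common convention of mixing the one-hot targets with a uniform distribution over all classes (other conventions, such as spreading alpha over only the K - 1 incorrect classes, are equally valid):

    def label_smoothing(self, int_targets, alpha=0.1):
        # Smooth the 1-of-K targets: the true class keeps 1 - alpha plus its
        # share of the uniform mass, every class receives alpha / num_classes.
        one_of_k = self.to_one_of_k(int_targets)
        return (1. - alpha) * one_of_k + alpha / self.num_classes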
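
Possible bodies for the LeakyReluLayer and ParametricReluLayer stubs, assuming the standard definition y = x for x > 0 and y = alpha * x otherwise, with the parametric variant treating the single shared alpha stored in self.alpha as a learnable parameter:

    def fprop(self, inputs):
        # Leaky / parametric ReLU: pass positives through, scale negatives by alpha.
        return np.where(inputs > 0., inputs, self.alpha * inputs)

    def bprop(self, inputs, outputs, grads_wrt_outputs):
        # Derivative is 1 for positive inputs and alpha otherwise.
        return np.where(inputs > 0., 1., self.alpha) * grads_wrt_outputs

    def grads_wrt_params(self, inputs, grads_wrt_outputs):
        # ParametricReluLayer only: dy/dalpha is x where x <= 0 and 0 elsewhere,
        # summed over the batch and units because alpha is shared.
        return [np.array([np.sum(np.where(inputs <= 0., inputs, 0.) * grads_wrt_outputs)])]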
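
A sketch of DropoutLayer.fprop and bprop; caching the sampled mask on the instance (here as self._mask, a name not in the patch) is one simple way to reuse it in the backward pass:

    def fprop(self, inputs, stochastic=True):
        # Sample a binary inclusion mask, shared across the batch if requested.
        # In deterministic mode scale by incl_prob, the expected value of the mask.
        if stochastic:
            mask_shape = (1,) + inputs.shape[1:] if self.share_across_batch else inputs.shape
            self._mask = self.rng.uniform(size=mask_shape) < self.incl_prob
            return inputs * self._mask
        return inputs * self.incl_prob

    def bprop(self, inputs, outputs, grads_wrt_outputs):
        # Gradients only flow through the units kept in the stochastic forward pass.
        return grads_wrt_outputs * self._mask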
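
One way to fill in CosineAnnealingWithWarmRestarts.update_learning_rule, assuming the discount and expansion factors multiply the maximum learning rate and period length after each restart (whether the discount is specified as a value below or above 1 is a convention the patch does not pin down):

    def update_learning_rule(self, learning_rule, epoch_number):
        # Locate the restart period containing epoch_number, growing the period
        # and discounting the maximum learning rate after each completed restart.
        period = self.total_epochs_per_period
        max_lr = self.max_learning_rate
        t = epoch_number
        while t >= period:
            t -= period
            period *= self.period_iteration_expansion_factor
            max_lr *= self.max_learning_rate_discount_factor
        # Cosine schedule between max_lr and min_learning_rate within the period.
        effective_learning_rate = self.min_learning_rate + 0.5 * (
            max_lr - self.min_learning_rate) * (1. + np.cos(np.pi * t / period))
        learning_rule.learning_rate = effective_learning_rate
        return effective_learning_rate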