Add missing files
This commit is contained in:
parent
4657cca862
commit
5d52a22448
@ -16,7 +16,7 @@ class DataProvider(object):
|
||||
"""Generic data provider."""
|
||||
|
||||
def __init__(self, inputs, targets, batch_size, max_num_batches=-1,
|
||||
shuffle_order=True, rng=None):
|
||||
shuffle_order=True, rng=None, smooth_labels=False):
|
||||
"""Create a new data provider object.
|
||||
|
||||
Args:
|
||||
@ -32,26 +32,60 @@ class DataProvider(object):
|
||||
shuffle_order (bool): Whether to randomly permute the order of
|
||||
the data before each epoch.
|
||||
rng (RandomState): A seeded random number generator.
|
||||
smooth_labels (bool): turn on label smoothing
|
||||
"""
|
||||
self.inputs = inputs
|
||||
self.targets = targets
|
||||
self.batch_size = batch_size
|
||||
assert max_num_batches != 0 and not max_num_batches < -1, (
|
||||
'max_num_batches should be -1 or > 0')
|
||||
self.max_num_batches = max_num_batches
|
||||
if batch_size < 1:
|
||||
raise ValueError('batch_size must be >= 1')
|
||||
self._batch_size = batch_size
|
||||
if max_num_batches == 0 or max_num_batches < -1:
|
||||
raise ValueError('max_num_batches must be -1 or > 0')
|
||||
self._max_num_batches = max_num_batches
|
||||
self._update_num_batches()
|
||||
self.shuffle_order = shuffle_order
|
||||
|
||||
self._current_order = np.arange(inputs.shape[0])
|
||||
if rng is None:
|
||||
rng = np.random.RandomState(DEFAULT_SEED)
|
||||
self.rng = rng
|
||||
self.smooth_labels = smooth_labels
|
||||
self.new_epoch()
|
||||
|
||||
@property
|
||||
def batch_size(self):
|
||||
"""Number of data points to include in each batch."""
|
||||
return self._batch_size
|
||||
|
||||
@batch_size.setter
|
||||
def batch_size(self, value):
|
||||
if value < 1:
|
||||
raise ValueError('batch_size must be >= 1')
|
||||
self._batch_size = value
|
||||
self._update_num_batches()
|
||||
|
||||
@property
|
||||
def max_num_batches(self):
|
||||
"""Maximum number of batches to iterate over in an epoch."""
|
||||
return self._max_num_batches
|
||||
|
||||
@max_num_batches.setter
|
||||
def max_num_batches(self, value):
|
||||
if value == 0 or value < -1:
|
||||
raise ValueError('max_num_batches must be -1 or > 0')
|
||||
self._max_num_batches = value
|
||||
self._update_num_batches()
|
||||
|
||||
def _update_num_batches(self):
|
||||
"""Updates number of batches to iterate over."""
|
||||
# maximum possible number of batches is equal to number of whole times
|
||||
# batch_size divides in to the number of data points which can be
|
||||
# found using integer division
|
||||
possible_num_batches = self.inputs.shape[0] // batch_size
|
||||
possible_num_batches = self.inputs.shape[0] // self.batch_size
|
||||
if self.max_num_batches == -1:
|
||||
self.num_batches = possible_num_batches
|
||||
else:
|
||||
self.num_batches = min(self.max_num_batches, possible_num_batches)
|
||||
self.shuffle_order = shuffle_order
|
||||
if rng is None:
|
||||
rng = np.random.RandomState(DEFAULT_SEED)
|
||||
self.rng = rng
|
||||
self.reset()
|
||||
|
||||
def __iter__(self):
|
||||
"""Implements Python iterator interface.
|
||||
@ -63,27 +97,36 @@ class DataProvider(object):
|
||||
"""
|
||||
return self
|
||||
|
||||
def reset(self):
|
||||
"""Resets the provider to the initial state to use in a new epoch."""
|
||||
def new_epoch(self):
|
||||
"""Starts a new epoch (pass through data), possibly shuffling first."""
|
||||
self._curr_batch = 0
|
||||
if self.shuffle_order:
|
||||
self.shuffle()
|
||||
|
||||
def shuffle(self):
|
||||
"""Randomly shuffles order of data."""
|
||||
new_order = self.rng.permutation(self.inputs.shape[0])
|
||||
self.inputs = self.inputs[new_order]
|
||||
self.targets = self.targets[new_order]
|
||||
|
||||
def __next__(self):
|
||||
return self.next()
|
||||
|
||||
def reset(self):
|
||||
"""Resets the provider to the initial state."""
|
||||
inv_perm = np.argsort(self._current_order)
|
||||
self._current_order = self._current_order[inv_perm]
|
||||
self.inputs = self.inputs[inv_perm]
|
||||
self.targets = self.targets[inv_perm]
|
||||
self.new_epoch()
|
||||
|
||||
def shuffle(self):
|
||||
"""Randomly shuffles order of data."""
|
||||
perm = self.rng.permutation(self.inputs.shape[0])
|
||||
self._current_order = self._current_order[perm]
|
||||
self.inputs = self.inputs[perm]
|
||||
self.targets = self.targets[perm]
|
||||
|
||||
def next(self):
|
||||
"""Returns next data batch or raises `StopIteration` if at end."""
|
||||
if self._curr_batch + 1 > self.num_batches:
|
||||
# no more batches in current iteration through data set so reset
|
||||
# the dataset for another pass and indicate iteration is at end
|
||||
self.reset()
|
||||
# no more batches in current iteration through data set so start
|
||||
# new epoch ready for another pass and indicate iteration is at end
|
||||
self.new_epoch()
|
||||
raise StopIteration()
|
||||
# create an index slice corresponding to current batch number
|
||||
batch_slice = slice(self._curr_batch * self.batch_size,
|
||||
@ -93,12 +136,11 @@ class DataProvider(object):
|
||||
self._curr_batch += 1
|
||||
return inputs_batch, targets_batch
|
||||
|
||||
|
||||
class MNISTDataProvider(DataProvider):
|
||||
"""Data provider for MNIST handwritten digit images."""
|
||||
|
||||
def __init__(self, which_set='train', batch_size=100, max_num_batches=-1,
|
||||
shuffle_order=True, rng=None):
|
||||
shuffle_order=True, rng=None, smooth_labels=False):
|
||||
"""Create a new MNIST data provider object.
|
||||
|
||||
Args:
|
||||
@ -112,9 +154,10 @@ class MNISTDataProvider(DataProvider):
|
||||
shuffle_order (bool): Whether to randomly permute the order of
|
||||
the data before each epoch.
|
||||
rng (RandomState): A seeded random number generator.
|
||||
smooth_labels (bool): enable/disable label smoothing
|
||||
"""
|
||||
# check a valid which_set was provided
|
||||
assert which_set in ['train', 'valid', 'eval'], (
|
||||
assert which_set in ['train', 'valid', 'test'], (
|
||||
'Expected which_set to be either train, valid or test. '
|
||||
'Got {0}'.format(which_set)
|
||||
)
|
||||
@ -134,7 +177,7 @@ class MNISTDataProvider(DataProvider):
|
||||
inputs = inputs.astype(np.float32)
|
||||
# pass the loaded data to the parent class __init__
|
||||
super(MNISTDataProvider, self).__init__(
|
||||
inputs, targets, batch_size, max_num_batches, shuffle_order, rng)
|
||||
inputs, targets, batch_size, max_num_batches, shuffle_order, rng, smooth_labels)
|
||||
|
||||
def next(self):
|
||||
"""Returns next data batch or raises `StopIteration` if at end."""
|
||||
@ -160,6 +203,102 @@ class MNISTDataProvider(DataProvider):
|
||||
one_of_k_targets[range(int_targets.shape[0]), int_targets] = 1
|
||||
return one_of_k_targets
|
||||
|
||||
class EMNISTDataProvider(DataProvider):
|
||||
"""Data provider for EMNIST handwritten digit images."""
|
||||
|
||||
def __init__(self, which_set='train', batch_size=100, max_num_batches=-1,
|
||||
shuffle_order=True, rng=None, smooth_labels=False):
|
||||
"""Create a new EMNIST data provider object.
|
||||
|
||||
Args:
|
||||
which_set: One of 'train', 'valid' or 'test'. Determines which
|
||||
portion of the EMNIST data this object should provide.
|
||||
batch_size (int): Number of data points to include in each batch.
|
||||
max_num_batches (int): Maximum number of batches to iterate over
|
||||
in an epoch. If `max_num_batches * batch_size > num_data` then
|
||||
only as many batches as the data can be split into will be
|
||||
used. If set to -1 all of the data will be used.
|
||||
shuffle_order (bool): Whether to randomly permute the order of
|
||||
the data before each epoch.
|
||||
rng (RandomState): A seeded random number generator.
|
||||
smooth_labels (bool): enable/disable label smoothing
|
||||
"""
|
||||
# check a valid which_set was provided
|
||||
assert which_set in ['train', 'valid', 'test'], (
|
||||
'Expected which_set to be either train, valid or test. '
|
||||
'Got {0}'.format(which_set)
|
||||
)
|
||||
self.which_set = which_set
|
||||
self.num_classes = 47
|
||||
# construct path to data using os.path.join to ensure the correct path
|
||||
# separator for the current platform / OS is used
|
||||
# MLP_DATA_DIR environment variable should point to the data directory
|
||||
data_path = os.path.join(
|
||||
os.environ['MLP_DATA_DIR'], 'emnist-{0}.npz'.format(which_set))
|
||||
assert os.path.isfile(data_path), (
|
||||
'Data file does not exist at expected path: ' + data_path
|
||||
)
|
||||
# load data from compressed numpy file
|
||||
loaded = np.load(data_path)
|
||||
print(loaded.keys())
|
||||
inputs, targets = loaded['inputs'], loaded['targets']
|
||||
inputs = inputs.astype(np.float32)
|
||||
inputs = np.reshape(inputs, newshape=(-1, 28*28))
|
||||
inputs = inputs / 255.0
|
||||
# pass the loaded data to the parent class __init__
|
||||
super(EMNISTDataProvider, self).__init__(
|
||||
inputs, targets, batch_size, max_num_batches, shuffle_order, rng, smooth_labels)
|
||||
|
||||
def next(self):
|
||||
"""Returns next data batch or raises `StopIteration` if at end."""
|
||||
inputs_batch, targets_batch = super(EMNISTDataProvider, self).next()
|
||||
|
||||
if self.smooth_labels:
|
||||
targets_batch_mat = self.label_smoothing(targets_batch)
|
||||
else:
|
||||
targets_batch_mat = self.to_one_of_k(targets_batch)
|
||||
return inputs_batch, targets_batch_mat
|
||||
|
||||
|
||||
|
||||
def to_one_of_k(self, int_targets):
|
||||
"""Converts integer coded class target to 1 of K coded targets.
|
||||
|
||||
Args:
|
||||
int_targets (ndarray): Array of integer coded class targets (i.e.
|
||||
where an integer from 0 to `num_classes` - 1 is used to
|
||||
indicate which is the correct class). This should be of shape
|
||||
(num_data,).
|
||||
|
||||
Returns:
|
||||
Array of 1 of K coded targets i.e. an array of shape
|
||||
(num_data, num_classes) where for each row all elements are equal
|
||||
to zero except for the column corresponding to the correct class
|
||||
which is equal to one.
|
||||
"""
|
||||
one_of_k_targets = np.zeros((int_targets.shape[0], self.num_classes))
|
||||
one_of_k_targets[range(int_targets.shape[0]), int_targets] = 1
|
||||
return one_of_k_targets
|
||||
|
||||
def label_smoothing(self, int_targets, alpha=0.1):
|
||||
"""Converts integer coded class target to 1 of K coded targets with label smoothing.
|
||||
|
||||
Args:
|
||||
int_targets (ndarray): Array of integer coded class targets (i.e.
|
||||
where an integer from 0 to `num_classes` - 1 is used to
|
||||
indicate which is the correct class). This should be of shape
|
||||
(num_data,).
|
||||
alpha (float): Smoothing factor.
|
||||
|
||||
Returns:
|
||||
Array of 1 of K coded targets with label smoothing i.e. an array of shape
|
||||
(num_data, num_classes)
|
||||
|
||||
"""
|
||||
|
||||
raise NotImplementedError
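
A hedged sketch of one common label-smoothing scheme (my own illustration, not the released solution): assign 1 - alpha to the correct class and spread the remaining alpha uniformly over the other num_classes - 1 classes, so each row still sums to one.

import numpy as np

def label_smoothing_sketch(int_targets, num_classes, alpha=0.1):
    # start every entry at alpha / (K - 1), then place 1 - alpha on the true class
    smoothed = np.full((int_targets.shape[0], num_classes), alpha / (num_classes - 1))
    smoothed[np.arange(int_targets.shape[0]), int_targets] = 1. - alpha
    return smoothed

Some formulations instead use alpha / num_classes everywhere plus 1 - alpha on the true class; which variant is expected here is an assumption.
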
class MetOfficeDataProvider(DataProvider):
|
||||
"""South Scotland Met Office weather data provider."""
|
||||
@ -253,3 +392,41 @@ class CCPPDataProvider(DataProvider):
|
||||
targets = loaded[which_set + '_targets']
|
||||
super(CCPPDataProvider, self).__init__(
|
||||
inputs, targets, batch_size, max_num_batches, shuffle_order, rng)
|
||||
|
||||
|
||||
class AugmentedMNISTDataProvider(MNISTDataProvider):
|
||||
"""Data provider for MNIST dataset which randomly transforms images."""
|
||||
|
||||
def __init__(self, which_set='train', batch_size=100, max_num_batches=-1,
|
||||
shuffle_order=True, rng=None, transformer=None):
|
||||
"""Create a new augmented MNIST data provider object.
|
||||
|
||||
Args:
|
||||
which_set: One of 'train', 'valid' or 'test'. Determines which
|
||||
portion of the MNIST data this object should provide.
|
||||
batch_size (int): Number of data points to include in each batch.
|
||||
max_num_batches (int): Maximum number of batches to iterate over
|
||||
in an epoch. If `max_num_batches * batch_size > num_data` then
|
||||
only as many batches as the data can be split into will be
|
||||
used. If set to -1 all of the data will be used.
|
||||
shuffle_order (bool): Whether to randomly permute the order of
|
||||
the data before each epoch.
|
||||
rng (RandomState): A seeded random number generator.
|
||||
transformer: Function which takes an `inputs` array of shape
|
||||
(batch_size, input_dim) corresponding to a batch of input
|
||||
images and a `rng` random number generator object (i.e. a
|
||||
call signature `transformer(inputs, rng)`) and applies a
|
||||
potentially random set of transformations to some / all of the
|
||||
input images as each new batch is returned when iterating over
|
||||
the data provider.
|
||||
"""
|
||||
super(AugmentedMNISTDataProvider, self).__init__(
|
||||
which_set, batch_size, max_num_batches, shuffle_order, rng)
|
||||
self.transformer = transformer
|
||||
|
||||
def next(self):
|
||||
"""Returns next data batch or raises `StopIteration` if at end."""
|
||||
inputs_batch, targets_batch = super(
|
||||
AugmentedMNISTDataProvider, self).next()
|
||||
transformed_inputs_batch = self.transformer(inputs_batch, self.rng)
|
||||
return transformed_inputs_batch, targets_batch
|
||||
|
@ -154,9 +154,9 @@ class CrossEntropySoftmaxError(object):
|
||||
Returns:
|
||||
Scalar error function value.
|
||||
"""
|
||||
probs = np.exp(outputs)
|
||||
probs /= probs.sum(-1)[:, None]
|
||||
return -np.mean(np.sum(targets * np.log(probs), axis=1))
|
||||
normOutputs = outputs - outputs.max(-1)[:, None]
|
||||
logProb = normOutputs - np.log(np.sum(np.exp(normOutputs), axis=-1)[:, None])
|
||||
return -np.mean(np.sum(targets * logProb, axis=1))
|
||||
|
||||
def grad(self, outputs, targets):
|
||||
"""Calculates gradient of error function with respect to outputs.
|
||||
@ -168,7 +168,7 @@ class CrossEntropySoftmaxError(object):
|
||||
Returns:
|
||||
Gradient of error function with respect to outputs.
|
||||
"""
|
||||
probs = np.exp(outputs)
|
||||
probs = np.exp(outputs - outputs.max(-1)[:, None])
|
||||
probs /= probs.sum(-1)[:, None]
|
||||
return (probs - targets) / outputs.shape[0]
|
||||
|
||||
|
@ -63,3 +63,81 @@ class NormalInit(object):
|
||||
|
||||
def __call__(self, shape):
|
||||
return self.rng.normal(loc=self.mean, scale=self.std, size=shape)
|
||||
|
||||
class GlorotUniformInit(object):
|
||||
"""Glorot and Bengio (2010) random uniform weights initialiser.
|
||||
|
||||
Initialises a two-dimensional parameter array using the 'normalized
|
||||
initialisation' scheme suggested in [1] which attempts to maintain a
|
||||
roughly constant variance in the activations and backpropagated gradients
|
||||
of a multi-layer model consisting of interleaved affine and logistic
|
||||
sigmoidal transformation layers.
|
||||
|
||||
Weights are sampled from a zero-mean uniform distribution with standard
|
||||
deviation `sqrt(2 / (input_dim + output_dim))` where `input_dim` and
|
||||
`output_dim` are the input and output dimensions of the weight matrix
|
||||
respectively.
|
||||
|
||||
References:
|
||||
[1]: Understanding the difficulty of training deep feedforward neural
|
||||
networks, Glorot and Bengio (2010)
|
||||
"""
|
||||
|
||||
def __init__(self, gain=1., rng=None):
|
||||
"""Construct a normalised initilisation random initialiser object.
|
||||
|
||||
Args:
|
||||
gain: Multiplicative factor to scale initialised weights by.
|
||||
Recommended value is 1 for affine layers followed by
|
||||
logistic sigmoid layers (or another affine layer).
|
||||
rng (RandomState): Seeded random number generator.
|
||||
"""
|
||||
self.gain = gain
|
||||
if rng is None:
|
||||
rng = np.random.RandomState(DEFAULT_SEED)
|
||||
self.rng = rng
|
||||
|
||||
def __call__(self, shape):
|
||||
assert len(shape) == 2, (
|
||||
'Initialiser should only be used for two dimensional arrays.')
|
||||
std = self.gain * (2. / (shape[0] + shape[1]))**0.5
|
||||
half_width = 3.**0.5 * std
|
||||
return self.rng.uniform(low=-half_width, high=half_width, size=shape)
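
Quick usage sanity check (my own example; it assumes the class lives in mlp.initialisers alongside NormalInit): the empirical standard deviation of the sampled weights should be close to the Glorot target sqrt(2 / (input_dim + output_dim)).

import numpy as np
from mlp.initialisers import GlorotUniformInit

init = GlorotUniformInit(gain=1., rng=np.random.RandomState(123))
weights = init((784, 100))
print(weights.std(), (2. / (784 + 100)) ** 0.5)  # the two values should roughly agree
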
class GlorotNormalInit(object):
|
||||
"""Glorot and Bengio (2010) random normal weights initialiser.
|
||||
|
||||
Initialises a two-dimensional parameter array using the 'normalized
|
||||
initialisation' scheme suggested in [1] which attempts to maintain a
|
||||
roughly constant variance in the activations and backpropagated gradients
|
||||
of a multi-layer model consisting of interleaved affine and logistic
|
||||
sigmoidal transformation layers.
|
||||
|
||||
Weights are sampled from a zero-mean normal distribution with standard
|
||||
deviation `sqrt(2 / (input_dim + output_dim))` where `input_dim` and
|
||||
`output_dim` are the input and output dimensions of the weight matrix
|
||||
respectively.
|
||||
|
||||
References:
|
||||
[1]: Understanding the difficulty of training deep feedforward neural
|
||||
networks, Glorot and Bengio (2010)
|
||||
"""
|
||||
|
||||
def __init__(self, gain=1., rng=None):
|
||||
"""Construct a normalised initilisation random initialiser object.
|
||||
|
||||
Args:
|
||||
gain: Multiplicative factor to scale initialised weights by.
|
||||
Recommended value is 1 for affine layers followed by
|
||||
logistic sigmoid layers (or another affine layer).
|
||||
rng (RandomState): Seeded random number generator.
|
||||
"""
|
||||
self.gain = gain
|
||||
if rng is None:
|
||||
rng = np.random.RandomState(DEFAULT_SEED)
|
||||
self.rng = rng
|
||||
|
||||
def __call__(self, shape):
|
||||
std = self.gain * (2. / (shape[0] + shape[1]))**0.5
|
||||
return self.rng.normal(loc=0., scale=std, size=shape)
|
||||
|
mlp/layers.py
@ -14,7 +14,7 @@ respect to the layer parameters.
|
||||
|
||||
import numpy as np
|
||||
import mlp.initialisers as init
|
||||
|
||||
from mlp import DEFAULT_SEED
|
||||
|
||||
class Layer(object):
|
||||
"""Abstract class defining the interface for a layer."""
|
||||
@ -68,6 +68,13 @@ class LayerWithParameters(Layer):
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def params_penalty(self):
|
||||
"""Returns the parameter dependent penalty term for this layer.
|
||||
|
||||
If no parameter-dependent penalty terms are set this returns zero.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
@property
|
||||
def params(self):
|
||||
"""Returns a list of parameters of layer.
|
||||
@ -88,6 +95,127 @@ class LayerWithParameters(Layer):
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
class StochasticLayerWithParameters(Layer):
|
||||
"""Specialised layer which uses a stochastic forward propagation."""
|
||||
|
||||
def __init__(self, rng=None):
|
||||
"""Constructs a new StochasticLayer object.
|
||||
|
||||
Args:
|
||||
rng (RandomState): Seeded random number generator object.
|
||||
"""
|
||||
if rng is None:
|
||||
rng = np.random.RandomState(DEFAULT_SEED)
|
||||
self.rng = rng
|
||||
|
||||
def fprop(self, inputs, stochastic=True):
|
||||
"""Forward propagates activations through the layer transformation.
|
||||
|
||||
Args:
|
||||
inputs: Array of layer inputs of shape (batch_size, input_dim).
|
||||
stochastic: Flag allowing different deterministic
|
||||
forward-propagation mode in addition to default stochastic
|
||||
forward-propagation e.g. for use at test time. If False
|
||||
a deterministic forward-propagation transformation
|
||||
corresponding to the expected output of the stochastic
|
||||
forward-propagation is applied.
|
||||
|
||||
Returns:
|
||||
outputs: Array of layer outputs of shape (batch_size, output_dim).
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
def grads_wrt_params(self, inputs, grads_wrt_outputs):
|
||||
"""Calculates gradients with respect to layer parameters.
|
||||
|
||||
Args:
|
||||
inputs: Array of inputs to layer of shape (batch_size, input_dim).
|
||||
grads_wrt_outputs: Array of gradients with respect to the layer
|
||||
outputs of shape (batch_size, output_dim).
|
||||
|
||||
Returns:
|
||||
List of arrays of gradients with respect to the layer parameters
|
||||
with parameter gradients appearing in same order in tuple as
|
||||
returned from `get_params` method.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def params_penalty(self):
|
||||
"""Returns the parameter dependent penalty term for this layer.
|
||||
|
||||
If no parameter-dependent penalty terms are set this returns zero.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
@property
|
||||
def params(self):
|
||||
"""Returns a list of parameters of layer.
|
||||
|
||||
Returns:
|
||||
List of current parameter values. This list should be in the
|
||||
corresponding order to the `values` argument to `set_params`.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
@params.setter
|
||||
def params(self, values):
|
||||
"""Sets layer parameters from a list of values.
|
||||
|
||||
Args:
|
||||
values: List of values to set parameters to. This list should be
|
||||
in the corresponding order to what is returned by `get_params`.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
class StochasticLayer(Layer):
|
||||
"""Specialised layer which uses a stochastic forward propagation."""
|
||||
|
||||
def __init__(self, rng=None):
|
||||
"""Constructs a new StochasticLayer object.
|
||||
|
||||
Args:
|
||||
rng (RandomState): Seeded random number generator object.
|
||||
"""
|
||||
if rng is None:
|
||||
rng = np.random.RandomState(DEFAULT_SEED)
|
||||
self.rng = rng
|
||||
|
||||
def fprop(self, inputs, stochastic=True):
|
||||
"""Forward propagates activations through the layer transformation.
|
||||
|
||||
Args:
|
||||
inputs: Array of layer inputs of shape (batch_size, input_dim).
|
||||
stochastic: Flag allowing different deterministic
|
||||
forward-propagation mode in addition to default stochastic
|
||||
forward-propagation e.g. for use at test time. If False
|
||||
a deterministic forward-propagation transformation
|
||||
corresponding to the expected output of the stochastic
|
||||
forward-propagation is applied.
|
||||
|
||||
Returns:
|
||||
outputs: Array of layer outputs of shape (batch_size, output_dim).
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def bprop(self, inputs, outputs, grads_wrt_outputs):
|
||||
"""Back propagates gradients through a layer.
|
||||
|
||||
Given gradients with respect to the outputs of the layer calculates the
|
||||
gradients with respect to the layer inputs. This should correspond to
|
||||
default stochastic forward-propagation.
|
||||
|
||||
Args:
|
||||
inputs: Array of layer inputs of shape (batch_size, input_dim).
|
||||
outputs: Array of layer outputs calculated in forward pass of
|
||||
shape (batch_size, output_dim).
|
||||
grads_wrt_outputs: Array of gradients with respect to the layer
|
||||
outputs of shape (batch_size, output_dim).
|
||||
|
||||
Returns:
|
||||
Array of gradients with respect to the layer inputs of shape
|
||||
(batch_size, input_dim).
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
|
||||
class AffineLayer(LayerWithParameters):
|
||||
"""Layer implementing an affine tranformation of its inputs.
|
||||
@ -97,7 +225,8 @@ class AffineLayer(LayerWithParameters):
|
||||
|
||||
def __init__(self, input_dim, output_dim,
|
||||
weights_initialiser=init.UniformInit(-0.1, 0.1),
|
||||
biases_initialiser=init.ConstantInit(0.)):
|
||||
biases_initialiser=init.ConstantInit(0.),
|
||||
weights_penalty=None, biases_penalty=None):
|
||||
"""Initialises a parameterised affine layer.
|
||||
|
||||
Args:
|
||||
@ -105,11 +234,17 @@ class AffineLayer(LayerWithParameters):
|
||||
output_dim (int): Dimension of the layer outputs.
|
||||
weights_initialiser: Initialiser for the weight parameters.
|
||||
biases_initialiser: Initialiser for the bias parameters.
|
||||
weights_penalty: Weights-dependent penalty term (regulariser) or
|
||||
None if no regularisation is to be applied to the weights.
|
||||
biases_penalty: Biases-dependent penalty term (regulariser) or
|
||||
None if no regularisation is to be applied to the biases.
|
||||
"""
|
||||
self.input_dim = input_dim
|
||||
self.output_dim = output_dim
|
||||
self.weights = weights_initialiser((self.output_dim, self.input_dim))
|
||||
self.biases = biases_initialiser(self.output_dim)
|
||||
self.weights_penalty = weights_penalty
|
||||
self.biases_penalty = biases_penalty
|
||||
|
||||
def fprop(self, inputs):
|
||||
"""Forward propagates activations through the layer transformation.
|
||||
@ -123,7 +258,7 @@ class AffineLayer(LayerWithParameters):
|
||||
Returns:
|
||||
outputs: Array of layer outputs of shape (batch_size, output_dim).
|
||||
"""
|
||||
return inputs.dot(self.weights.T) + self.biases
|
||||
return self.weights.dot(inputs.T).T + self.biases
|
||||
|
||||
def bprop(self, inputs, outputs, grads_wrt_outputs):
|
||||
"""Back propagates gradients through a layer.
|
||||
@ -159,8 +294,27 @@ class AffineLayer(LayerWithParameters):
|
||||
|
||||
grads_wrt_weights = np.dot(grads_wrt_outputs.T, inputs)
|
||||
grads_wrt_biases = np.sum(grads_wrt_outputs, axis=0)
|
||||
|
||||
if self.weights_penalty is not None:
|
||||
grads_wrt_weights += self.weights_penalty.grad(parameter=self.weights)
|
||||
|
||||
if self.biases_penalty is not None:
|
||||
grads_wrt_biases += self.biases_penalty.grad(parameter=self.biases)
|
||||
|
||||
return [grads_wrt_weights, grads_wrt_biases]
|
||||
|
||||
def params_penalty(self):
|
||||
"""Returns the parameter dependent penalty term for this layer.
|
||||
|
||||
If no parameter-dependent penalty terms are set this returns zero.
|
||||
"""
|
||||
params_penalty = 0
|
||||
if self.weights_penalty is not None:
|
||||
params_penalty += self.weights_penalty(self.weights)
|
||||
if self.biases_penalty is not None:
|
||||
params_penalty += self.biases_penalty(self.biases)
|
||||
return params_penalty
|
||||
|
||||
@property
|
||||
def params(self):
|
||||
"""A list of layer parameter values: `[weights, biases]`."""
|
||||
@ -175,7 +329,6 @@ class AffineLayer(LayerWithParameters):
|
||||
return 'AffineLayer(input_dim={0}, output_dim={1})'.format(
|
||||
self.input_dim, self.output_dim)
|
||||
|
||||
|
||||
class SigmoidLayer(Layer):
|
||||
"""Layer implementing an element-wise logistic sigmoid transformation."""
|
||||
|
||||
@ -215,6 +368,160 @@ class SigmoidLayer(Layer):
|
||||
def __repr__(self):
|
||||
return 'SigmoidLayer'
|
||||
|
||||
class ReluLayer(Layer):
|
||||
"""Layer implementing an element-wise rectified linear transformation."""
|
||||
|
||||
def fprop(self, inputs):
|
||||
"""Forward propagates activations through the layer transformation.
|
||||
|
||||
For inputs `x` and outputs `y` this corresponds to `y = max(0, x)`.
|
||||
|
||||
Args:
|
||||
inputs: Array of layer inputs of shape (batch_size, input_dim).
|
||||
|
||||
Returns:
|
||||
outputs: Array of layer outputs of shape (batch_size, output_dim).
|
||||
"""
|
||||
return np.maximum(inputs, 0.)
|
||||
|
||||
def bprop(self, inputs, outputs, grads_wrt_outputs):
|
||||
"""Back propagates gradients through a layer.
|
||||
|
||||
Given gradients with respect to the outputs of the layer calculates the
|
||||
gradients with respect to the layer inputs.
|
||||
|
||||
Args:
|
||||
inputs: Array of layer inputs of shape (batch_size, input_dim).
|
||||
outputs: Array of layer outputs calculated in forward pass of
|
||||
shape (batch_size, output_dim).
|
||||
grads_wrt_outputs: Array of gradients with respect to the layer
|
||||
outputs of shape (batch_size, output_dim).
|
||||
|
||||
Returns:
|
||||
Array of gradients with respect to the layer inputs of shape
|
||||
(batch_size, input_dim).
|
||||
"""
|
||||
return (outputs > 0) * grads_wrt_outputs
|
||||
|
||||
def __repr__(self):
|
||||
return 'ReluLayer'
|
||||
|
||||
class LeakyReluLayer(Layer):
|
||||
"""Layer implementing an element-wise leaky rectified linear transformation."""
|
||||
def __init__(self, alpha=0.01):
|
||||
self.alpha = alpha
|
||||
|
||||
def fprop(self, inputs):
|
||||
"""Forward propagates activations through the layer transformation.
|
||||
|
||||
For inputs `x` and outputs `y` this corresponds to `y = x` if `x > 0` and `y = alpha * x` otherwise.
|
||||
"""
|
||||
|
||||
raise NotImplementedError
|
||||
|
||||
def bprop(self, inputs, outputs, grads_wrt_outputs):
|
||||
"""Back propagates gradients through a layer.
|
||||
|
||||
Given gradients with respect to the outputs of the layer calculates the
|
||||
gradients with respect to the layer inputs.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def __repr__(self):
|
||||
return 'LeakyReluLayer'
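
A minimal sketch of the two missing methods under the standard leaky ReLU definition (my own sketch, not the course solution): y = x where x > 0 and y = alpha * x elsewhere, so the gradient is 1 or alpha respectively.

import numpy as np

def leaky_relu_fprop(inputs, alpha=0.01):
    # pass positive inputs through unchanged, scale negative ones by alpha
    return np.where(inputs > 0, inputs, alpha * inputs)

def leaky_relu_bprop(inputs, outputs, grads_wrt_outputs, alpha=0.01):
    # gradient of the transformation is 1 for positive inputs, alpha otherwise
    return np.where(inputs > 0, 1., alpha) * grads_wrt_outputs
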
class ParametricReluLayer(LayerWithParameters):
|
||||
"""Layer implementing an element-wise parametric rectified linear transformation."""
|
||||
|
||||
def __init__(self, alpha=0.25):
|
||||
self.alpha = np.array([alpha])
|
||||
|
||||
@property
|
||||
def params(self):
|
||||
"""A list of layer parameter values: `[weights, biases]`."""
|
||||
return [self.alpha]
|
||||
|
||||
def fprop(self, inputs):
|
||||
"""Forward propagates activations through the layer transformation.
|
||||
|
||||
For inputs `x` and outputs `y` this corresponds to `y = x` if `x > 0` and `y = alpha * x` otherwise, with `alpha` a learnable parameter.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def bprop(self, inputs, outputs, grads_wrt_outputs):
|
||||
"""Back propagates gradients through a layer.
|
||||
|
||||
Given gradients with respect to the outputs of the layer calculates the
|
||||
gradients with respect to the layer inputs.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def grads_wrt_params(self, inputs, grads_wrt_outputs):
|
||||
"""Calculates gradients with respect to layer parameters.
|
||||
|
||||
Args:
|
||||
inputs: array of inputs to layer of shape (batch_size, input_dim)
|
||||
grads_wrt_outputs: array of gradients with respect to the layer
|
||||
outputs of shape (batch_size, output_dim)
|
||||
|
||||
Returns:
|
||||
list of arrays of gradients with respect to the layer parameters
|
||||
`[grads_wrt_params]`. Where params is the alpha parameter.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@property
|
||||
def params(self):
|
||||
"""A list of layer parameter values: `[weights, biases]`."""
|
||||
return [self.alpha]
|
||||
|
||||
@params.setter
|
||||
def params(self, values):
|
||||
self.alpha = values[0]
|
||||
|
||||
def __repr__(self):
|
||||
return 'ParametricReluLayer'
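
The parametric variant only differs in that alpha is learned; a hedged sketch assuming a single shared alpha, as the np.array([alpha]) default suggests:

import numpy as np

def prelu_fprop(inputs, alpha):
    # alpha is a length-1 array broadcast across all units
    return np.where(inputs > 0, inputs, alpha * inputs)

def prelu_bprop(inputs, outputs, grads_wrt_outputs, alpha):
    return np.where(inputs > 0, 1., alpha) * grads_wrt_outputs

def prelu_grads_wrt_params(inputs, grads_wrt_outputs):
    # d y / d alpha is the input where it is negative and zero otherwise,
    # so the gradient w.r.t. the shared alpha sums over those positions
    return [np.array([np.sum(np.where(inputs > 0, 0., inputs) * grads_wrt_outputs)])]
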
|
class TanhLayer(Layer):
|
||||
"""Layer implementing an element-wise hyperbolic tangent transformation."""
|
||||
|
||||
def fprop(self, inputs):
|
||||
"""Forward propagates activations through the layer transformation.
|
||||
|
||||
For inputs `x` and outputs `y` this corresponds to `y = tanh(x)`.
|
||||
|
||||
Args:
|
||||
inputs: Array of layer inputs of shape (batch_size, input_dim).
|
||||
|
||||
Returns:
|
||||
outputs: Array of layer outputs of shape (batch_size, output_dim).
|
||||
"""
|
||||
return np.tanh(inputs)
|
||||
|
||||
def bprop(self, inputs, outputs, grads_wrt_outputs):
|
||||
"""Back propagates gradients through a layer.
|
||||
|
||||
Given gradients with respect to the outputs of the layer calculates the
|
||||
gradients with respect to the layer inputs.
|
||||
|
||||
Args:
|
||||
inputs: Array of layer inputs of shape (batch_size, input_dim).
|
||||
outputs: Array of layer outputs calculated in forward pass of
|
||||
shape (batch_size, output_dim).
|
||||
grads_wrt_outputs: Array of gradients with respect to the layer
|
||||
outputs of shape (batch_size, output_dim).
|
||||
|
||||
Returns:
|
||||
Array of gradients with respect to the layer inputs of shape
|
||||
(batch_size, input_dim).
|
||||
"""
|
||||
return (1. - outputs**2) * grads_wrt_outputs
|
||||
|
||||
def __repr__(self):
|
||||
return 'TanhLayer'
|
||||
|
||||
class SoftmaxLayer(Layer):
|
||||
"""Layer implementing a softmax transformation."""
|
||||
@ -232,7 +539,9 @@ class SoftmaxLayer(Layer):
|
||||
Returns:
|
||||
outputs: Array of layer outputs of shape (batch_size, output_dim).
|
||||
"""
|
||||
exp_inputs = np.exp(inputs)
|
||||
# subtract max inside exponential to improve numerical stability -
|
||||
# when we divide through by sum this term cancels
|
||||
exp_inputs = np.exp(inputs - inputs.max(-1)[:, None])
|
||||
return exp_inputs / exp_inputs.sum(-1)[:, None]
|
||||
|
||||
def bprop(self, inputs, outputs, grads_wrt_outputs):
|
||||
@ -257,3 +566,177 @@ class SoftmaxLayer(Layer):
|
||||
|
||||
def __repr__(self):
|
||||
return 'SoftmaxLayer'
|
||||
|
||||
class RadialBasisFunctionLayer(Layer):
|
||||
"""Layer implementing projection to a grid of radial basis functions."""
|
||||
|
||||
def __init__(self, grid_dim, intervals=[[0., 1.]]):
|
||||
"""Creates a radial basis function layer object.
|
||||
|
||||
Args:
|
||||
grid_dim: Integer specifying how many basis functions to use in
|
||||
grid across input space per dimension (so total number of
|
||||
basis functions will be grid_dim**input_dim)
|
||||
intervals: List of intervals (two element lists or tuples)
|
||||
specifying extents of axis-aligned region in input-space to
|
||||
tile basis functions in grid across. For example for a 2D input
|
||||
space spanning [0, 1] x [0, 1] use intervals=[[0, 1], [0, 1]].
|
||||
"""
|
||||
num_basis = grid_dim**len(intervals)
|
||||
self.centres = np.array(np.meshgrid(*[
|
||||
np.linspace(low, high, grid_dim) for (low, high) in intervals])
|
||||
).reshape((len(intervals), -1))
|
||||
self.scales = np.array([
|
||||
[(high - low) * 1. / grid_dim] for (low, high) in intervals])
|
||||
|
||||
def fprop(self, inputs):
|
||||
"""Forward propagates activations through the layer transformation.
|
||||
|
||||
Args:
|
||||
inputs: Array of layer inputs of shape (batch_size, input_dim).
|
||||
|
||||
Returns:
|
||||
outputs: Array of layer outputs of shape (batch_size, output_dim).
|
||||
"""
|
||||
return np.exp(-(inputs[..., None] - self.centres[None, ...])**2 /
|
||||
self.scales**2).reshape((inputs.shape[0], -1))
|
||||
|
||||
def bprop(self, inputs, outputs, grads_wrt_outputs):
|
||||
"""Back propagates gradients through a layer.
|
||||
|
||||
Given gradients with respect to the outputs of the layer calculates the
|
||||
gradients with respect to the layer inputs.
|
||||
|
||||
Args:
|
||||
inputs: Array of layer inputs of shape (batch_size, input_dim).
|
||||
outputs: Array of layer outputs calculated in forward pass of
|
||||
shape (batch_size, output_dim).
|
||||
grads_wrt_outputs: Array of gradients with respect to the layer
|
||||
outputs of shape (batch_size, output_dim).
|
||||
|
||||
Returns:
|
||||
Array of gradients with respect to the layer inputs of shape
|
||||
(batch_size, input_dim).
|
||||
"""
|
||||
num_basis = self.centres.shape[1]
|
||||
return -2 * (
|
||||
((inputs[..., None] - self.centres[None, ...]) / self.scales**2) *
|
||||
grads_wrt_outputs.reshape((inputs.shape[0], -1, num_basis))
|
||||
).sum(-1)
|
||||
|
||||
def __repr__(self):
|
||||
return 'RadialBasisFunctionLayer(grid_dim={0})'.format(self.grid_dim)
|
||||
|
||||
class DropoutLayer(StochasticLayer):
|
||||
"""Layer which stochastically drops input dimensions in its output."""
|
||||
|
||||
def __init__(self, rng=None, incl_prob=0.5, share_across_batch=True):
|
||||
"""Construct a new dropout layer.
|
||||
|
||||
Args:
|
||||
rng (RandomState): Seeded random number generator.
|
||||
incl_prob: Scalar value in (0, 1] specifying the probability of
|
||||
each input dimension being included in the output.
|
||||
share_across_batch: Whether to use same dropout mask across
|
||||
all inputs in a batch or use per input masks.
|
||||
"""
|
||||
super(DropoutLayer, self).__init__(rng)
|
||||
assert incl_prob > 0. and incl_prob <= 1.
|
||||
self.incl_prob = incl_prob
|
||||
self.share_across_batch = share_across_batch
|
||||
self.rng = rng
|
||||
|
||||
def fprop(self, inputs, stochastic=True):
|
||||
"""Forward propagates activations through the layer transformation.
|
||||
|
||||
Args:
|
||||
inputs: Array of layer inputs of shape (batch_size, input_dim).
|
||||
stochastic: Flag allowing different deterministic
|
||||
forward-propagation mode in addition to default stochastic
|
||||
forward-propagation e.g. for use at test time. If False
|
||||
a deterministic forward-propagation transformation
|
||||
corresponding to the expected output of the stochastic
|
||||
forward-propagation is applied.
|
||||
|
||||
Returns:
|
||||
outputs: Array of layer outputs of shape (batch_size, output_dim).
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def bprop(self, inputs, outputs, grads_wrt_outputs):
|
||||
"""Back propagates gradients through a layer.
|
||||
|
||||
Given gradients with respect to the outputs of the layer calculates the
|
||||
gradients with respect to the layer inputs. This should correspond to
|
||||
default stochastic forward-propagation.
|
||||
|
||||
Args:
|
||||
inputs: Array of layer inputs of shape (batch_size, input_dim).
|
||||
outputs: Array of layer outputs calculated in forward pass of
|
||||
shape (batch_size, output_dim).
|
||||
grads_wrt_outputs: Array of gradients with respect to the layer
|
||||
outputs of shape (batch_size, output_dim).
|
||||
|
||||
Returns:
|
||||
Array of gradients with respect to the layer inputs of shape
|
||||
(batch_size, input_dim).
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def __repr__(self):
|
||||
return 'DropoutLayer(incl_prob={0:.1f})'.format(self.incl_prob)
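
A sketch of the intended dropout behaviour (my own, assuming the non-scaled variant where the deterministic pass multiplies by incl_prob; an inverted-dropout variant that rescales at training time would also fit the docstring):

import numpy as np

def dropout_fprop(inputs, rng, incl_prob, stochastic=True, share_across_batch=True):
    if stochastic:
        mask_shape = (1,) + inputs.shape[1:] if share_across_batch else inputs.shape
        mask = rng.uniform(size=mask_shape) < incl_prob
        return inputs * mask  # the mask would need to be cached on the layer for bprop
    # deterministic pass: expected value of the stochastic transformation
    return inputs * incl_prob

def dropout_bprop(mask, grads_wrt_outputs):
    # gradients only flow through the dimensions that were kept
    return grads_wrt_outputs * mask
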
class ReshapeLayer(Layer):
|
||||
"""Layer which reshapes dimensions of inputs."""
|
||||
|
||||
def __init__(self, output_shape=None):
|
||||
"""Create a new reshape layer object.
|
||||
|
||||
Args:
|
||||
output_shape: Tuple specifying shape each input in batch should
|
||||
be reshaped to in outputs. This **excludes** the batch size
|
||||
so the shape of the final output array will be
|
||||
(batch_size, ) + output_shape
|
||||
Similarly to numpy.reshape, one shape dimension can be -1. In
|
||||
this case, the value is inferred from the size of the input
|
||||
array and remaining dimensions. The shape specified must be
|
||||
compatible with the input array shape - i.e. the total number
|
||||
of values in the array cannot be changed. If set to `None` the
|
||||
output shape will be set to
|
||||
(batch_size, -1)
|
||||
which will flatten all the inputs to vectors.
|
||||
"""
|
||||
self.output_shape = (-1,) if output_shape is None else output_shape
|
||||
|
||||
def fprop(self, inputs):
|
||||
"""Forward propagates activations through the layer transformation.
|
||||
|
||||
Args:
|
||||
inputs: Array of layer inputs of shape (batch_size, input_dim).
|
||||
|
||||
Returns:
|
||||
outputs: Array of layer outputs of shape (batch_size, output_dim).
|
||||
"""
|
||||
return inputs.reshape((inputs.shape[0],) + self.output_shape)
|
||||
|
||||
def bprop(self, inputs, outputs, grads_wrt_outputs):
|
||||
"""Back propagates gradients through a layer.
|
||||
|
||||
Given gradients with respect to the outputs of the layer calculates the
|
||||
gradients with respect to the layer inputs.
|
||||
|
||||
Args:
|
||||
inputs: Array of layer inputs of shape (batch_size, input_dim).
|
||||
outputs: Array of layer outputs calculated in forward pass of
|
||||
shape (batch_size, output_dim).
|
||||
grads_wrt_outputs: Array of gradients with respect to the layer
|
||||
outputs of shape (batch_size, output_dim).
|
||||
|
||||
Returns:
|
||||
Array of gradients with respect to the layer inputs of shape
|
||||
(batch_size, input_dim).
|
||||
"""
|
||||
return grads_wrt_outputs.reshape(inputs.shape)
|
||||
|
||||
def __repr__(self):
|
||||
return 'ReshapeLayer(output_shape={0})'.format(self.output_shape)
|
||||
|
@ -160,3 +160,158 @@ class MomentumLearningRule(GradientDescentLearningRule):
|
||||
mom *= self.mom_coeff
|
||||
mom -= self.learning_rate * grad
|
||||
param += mom
|
||||
|
||||
|
||||
class AdamLearningRule(GradientDescentLearningRule):
|
||||
"""Adaptive moments (Adam) learning rule.
|
||||
First-order gradient-descent based learning rule which uses adaptive
|
||||
estimates of first and second moments of the parameter gradients to
|
||||
calculate the parameter updates.
|
||||
References:
|
||||
[1]: Adam: a method for stochastic optimisation
|
||||
Kingma and Ba, 2015
|
||||
"""
|
||||
|
||||
def __init__(self, learning_rate=1e-3, beta_1=0.9, beta_2=0.999,
|
||||
epsilon=1e-8):
|
||||
"""Creates a new learning rule object.
|
||||
Args:
|
||||
learning_rate: A positive scalar to scale gradient updates to the
|
||||
parameters by. This needs to be carefully set - if too large
|
||||
the learning dynamic will be unstable and may diverge, while
|
||||
if set too small learning will proceed very slowly.
|
||||
beta_1: Exponential decay rate for gradient first moment estimates.
|
||||
This should be a scalar value in [0, 1]. The running gradient
|
||||
first moment estimate is calculated using
|
||||
`m_1 = beta_1 * m_1_prev + (1 - beta_1) * g`
|
||||
where `m_1_prev` is the previous estimate and `g` the current
|
||||
parameter gradients.
|
||||
beta_2: Exponential decay rate for gradient second moment
|
||||
estimates. This should be a scalar value in [0, 1]. The running
|
||||
gradient second moment estimate is calculated using
|
||||
`m_2 = beta_2 * m_2_prev + (1 - beta_2) * g**2`
|
||||
where `m_2_prev` is the previous estimate and `g` the current
|
||||
parameter gradients.
|
||||
epsilon: 'Softening' parameter to stop updates diverging when
|
||||
second moment estimates are close to zero. Should be set to
|
||||
a small positive value.
|
||||
"""
|
||||
super(AdamLearningRule, self).__init__(learning_rate)
|
||||
assert beta_1 >= 0. and beta_1 <= 1., 'beta_1 should be in [0, 1].'
|
||||
assert beta_2 >= 0. and beta_2 <= 1., 'beta_2 should be in [0, 1].'
|
||||
assert epsilon > 0., 'epsilon should be > 0.'
|
||||
self.beta_1 = beta_1
|
||||
self.beta_2 = beta_2
|
||||
self.epsilon = epsilon
|
||||
|
||||
def initialise(self, params):
|
||||
"""Initialises the state of the learning rule for a set or parameters.
|
||||
This must be called before `update_params` is first called.
|
||||
Args:
|
||||
params: A list of the parameters to be optimised. Note these will
|
||||
be updated *in-place* to avoid reallocating arrays on each
|
||||
update.
|
||||
"""
|
||||
super(AdamLearningRule, self).initialise(params)
|
||||
self.moms_1 = []
|
||||
for param in self.params:
|
||||
self.moms_1.append(np.zeros_like(param))
|
||||
self.moms_2 = []
|
||||
for param in self.params:
|
||||
self.moms_2.append(np.zeros_like(param))
|
||||
self.step_count = 0
|
||||
|
||||
def reset(self):
|
||||
"""Resets any additional state variables to their initial values.
|
||||
For this learning rule this corresponds to zeroing the estimates of
|
||||
the first and second moments of the gradients.
|
||||
"""
|
||||
for mom_1, mom_2 in zip(self.moms_1, self.moms_2):
|
||||
mom_1 *= 0.
|
||||
mom_2 *= 0.
|
||||
self.step_count = 0
|
||||
|
||||
def update_params(self, grads_wrt_params):
|
||||
"""Applies a single update to all parameters.
|
||||
All parameter updates are performed using in-place operations and so
|
||||
nothing is returned.
|
||||
Args:
|
||||
grads_wrt_params: A list of gradients of the scalar loss function
|
||||
with respect to each of the parameters passed to `initialise`
|
||||
previously, with this list expected to be in the same order.
|
||||
"""
|
||||
for param, mom_1, mom_2, grad in zip(
|
||||
self.params, self.moms_1, self.moms_2, grads_wrt_params):
|
||||
mom_1 *= self.beta_1
|
||||
mom_1 += (1. - self.beta_1) * grad
|
||||
mom_2 *= self.beta_2
|
||||
mom_2 += (1. - self.beta_2) * grad ** 2
|
||||
alpha_t = (
|
||||
self.learning_rate *
|
||||
(1. - self.beta_2 ** (self.step_count + 1)) ** 0.5 /
|
||||
(1. - self.beta_1 ** (self.step_count + 1))
|
||||
)
|
||||
param -= alpha_t * mom_1 / (mom_2 ** 0.5 + self.epsilon)
|
||||
self.step_count += 1
|
||||
|
||||
|
||||
class AdaGradLearningRule(GradientDescentLearningRule):
|
||||
"""Adaptive gradients (AdaGrad) learning rule.
|
||||
First-order gradient-descent based learning rule which normalises gradient
|
||||
updates by a running sum of the past squared gradients.
|
||||
References:
|
||||
[1]: Adaptive Subgradient Methods for Online Learning and Stochastic
|
||||
Optimization. Duchi, Hazan and Singer, 2011
|
||||
"""
|
||||
|
||||
def __init__(self, learning_rate=1e-2, epsilon=1e-8):
|
||||
"""Creates a new learning rule object.
|
||||
Args:
|
||||
learning_rate: A positive scalar to scale gradient updates to the
|
||||
parameters by. This needs to be carefully set - if too large
|
||||
the learning dynamic will be unstable and may diverge, while
|
||||
if set too small learning will proceed very slowly.
|
||||
epsilon: 'Softening' parameter to stop updates diverging when
|
||||
sums of squared gradients are close to zero. Should be set to
|
||||
a small positive value.
|
||||
"""
|
||||
super(AdaGradLearningRule, self).__init__(learning_rate)
|
||||
assert epsilon > 0., 'epsilon should be > 0.'
|
||||
self.epsilon = epsilon
|
||||
|
||||
def initialise(self, params):
|
||||
"""Initialises the state of the learning rule for a set or parameters.
|
||||
This must be called before `update_params` is first called.
|
||||
Args:
|
||||
params: A list of the parameters to be optimised. Note these will
|
||||
be updated *in-place* to avoid reallocating arrays on each
|
||||
update.
|
||||
"""
|
||||
super(AdaGradLearningRule, self).initialise(params)
|
||||
self.sum_sq_grads = []
|
||||
for param in self.params:
|
||||
self.sum_sq_grads.append(np.zeros_like(param))
|
||||
|
||||
def reset(self):
|
||||
"""Resets any additional state variables to their initial values.
|
||||
For this learning rule this corresponds to zeroing all the sum of
|
||||
squared gradient states.
|
||||
"""
|
||||
for sum_sq_grad in self.sum_sq_grads:
|
||||
sum_sq_grad *= 0.
|
||||
|
||||
def update_params(self, grads_wrt_params):
|
||||
"""Applies a single update to all parameters.
|
||||
All parameter updates are performed using in-place operations and so
|
||||
nothing is returned.
|
||||
Args:
|
||||
grads_wrt_params: A list of gradients of the scalar loss function
|
||||
with respect to each of the parameters passed to `initialise`
|
||||
previously, with this list expected to be in the same order.
|
||||
"""
|
||||
for param, sum_sq_grad, grad in zip(
|
||||
self.params, self.sum_sq_grads, grads_wrt_params):
|
||||
sum_sq_grad += grad ** 2
|
||||
param -= (self.learning_rate * grad /
|
||||
(sum_sq_grad + self.epsilon) ** 0.5)
|
||||
|
||||
|
@ -8,7 +8,7 @@ outputs (and intermediate states) and for calculating gradients of scalar
|
||||
functions of the outputs with respect to the model parameters.
|
||||
"""
|
||||
|
||||
from mlp.layers import LayerWithParameters
|
||||
from mlp.layers import LayerWithParameters, StochasticLayer, StochasticLayerWithParameters
|
||||
|
||||
|
||||
class SingleLayerModel(object):
|
||||
@ -80,11 +80,11 @@ class MultipleLayerModel(object):
|
||||
"""A list of all of the parameters of the model."""
|
||||
params = []
|
||||
for layer in self.layers:
|
||||
if isinstance(layer, LayerWithParameters):
|
||||
if isinstance(layer, LayerWithParameters) or isinstance(layer, StochasticLayerWithParameters):
|
||||
params += layer.params
|
||||
return params
|
||||
|
||||
def fprop(self, inputs):
|
||||
def fprop(self, inputs, evaluation=False):
|
||||
"""Forward propagates a batch of inputs through the model.
|
||||
|
||||
Args:
|
||||
@ -97,7 +97,19 @@ class MultipleLayerModel(object):
|
||||
"""
|
||||
activations = [inputs]
|
||||
for i, layer in enumerate(self.layers):
|
||||
activations.append(self.layers[i].fprop(activations[i]))
|
||||
if evaluation:
|
||||
if issubclass(type(self.layers[i]), StochasticLayer) or issubclass(type(self.layers[i]),
|
||||
StochasticLayerWithParameters):
|
||||
current_activations = self.layers[i].fprop(activations[i], stochastic=False)
|
||||
else:
|
||||
current_activations = self.layers[i].fprop(activations[i])
|
||||
else:
|
||||
if issubclass(type(self.layers[i]), StochasticLayer) or issubclass(type(self.layers[i]),
|
||||
StochasticLayerWithParameters):
|
||||
current_activations = self.layers[i].fprop(activations[i], stochastic=True)
|
||||
else:
|
||||
current_activations = self.layers[i].fprop(activations[i])
|
||||
activations.append(current_activations)
|
||||
return activations
|
||||
|
||||
def grads_wrt_params(self, activations, grads_wrt_outputs):
|
||||
@ -119,7 +131,7 @@ class MultipleLayerModel(object):
|
||||
inputs = activations[-i - 2]
|
||||
outputs = activations[-i - 1]
|
||||
grads_wrt_inputs = layer.bprop(inputs, outputs, grads_wrt_outputs)
|
||||
if isinstance(layer, LayerWithParameters):
|
||||
if isinstance(layer, LayerWithParameters) or isinstance(layer, StochasticLayerWithParameters):
|
||||
grads_wrt_params += layer.grads_wrt_params(
|
||||
inputs, grads_wrt_outputs)[::-1]
|
||||
grads_wrt_outputs = grads_wrt_inputs
|
||||
|
@ -9,7 +9,7 @@ import time
|
||||
import logging
|
||||
from collections import OrderedDict
|
||||
import numpy as np
|
||||
|
||||
import tqdm
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@ -18,7 +18,7 @@ class Optimiser(object):
|
||||
"""Basic model optimiser."""
|
||||
|
||||
def __init__(self, model, error, learning_rule, train_dataset,
|
||||
valid_dataset=None, data_monitors=None):
|
||||
valid_dataset=None, data_monitors=None, notebook=False):
|
||||
"""Create a new optimiser instance.
|
||||
|
||||
Args:
|
||||
@ -43,6 +43,11 @@ class Optimiser(object):
|
||||
self.data_monitors = OrderedDict([('error', error)])
|
||||
if data_monitors is not None:
|
||||
self.data_monitors.update(data_monitors)
|
||||
self.notebook = notebook
|
||||
if notebook:
|
||||
self.tqdm_progress = tqdm.tqdm_notebook
|
||||
else:
|
||||
self.tqdm_progress = tqdm.tqdm
|
||||
|
||||
def do_training_epoch(self):
|
||||
"""Do a single training epoch.
|
||||
@ -52,12 +57,15 @@ class Optimiser(object):
|
||||
respect to all the model parameters and then updates the model
|
||||
parameters according to the learning rule.
|
||||
"""
|
||||
for inputs_batch, targets_batch in self.train_dataset:
|
||||
activations = self.model.fprop(inputs_batch)
|
||||
grads_wrt_outputs = self.error.grad(activations[-1], targets_batch)
|
||||
grads_wrt_params = self.model.grads_wrt_params(
|
||||
activations, grads_wrt_outputs)
|
||||
self.learning_rule.update_params(grads_wrt_params)
|
||||
with self.tqdm_progress(total=self.train_dataset.num_batches) as train_progress_bar:
|
||||
train_progress_bar.set_description("Ep Prog")
|
||||
for inputs_batch, targets_batch in self.train_dataset:
|
||||
activations = self.model.fprop(inputs_batch)
|
||||
grads_wrt_outputs = self.error.grad(activations[-1], targets_batch)
|
||||
grads_wrt_params = self.model.grads_wrt_params(
|
||||
activations, grads_wrt_outputs)
|
||||
self.learning_rule.update_params(grads_wrt_params)
|
||||
train_progress_bar.update(1)
|
||||
|
||||
def eval_monitors(self, dataset, label):
|
||||
"""Evaluates the monitors for the given dataset.
|
||||
@ -72,7 +80,7 @@ class Optimiser(object):
|
||||
data_mon_vals = OrderedDict([(key + label, 0.) for key
|
||||
in self.data_monitors.keys()])
|
||||
for inputs_batch, targets_batch in dataset:
|
||||
activations = self.model.fprop(inputs_batch)
|
||||
activations = self.model.fprop(inputs_batch, evaluation=True)
|
||||
for key, data_monitor in self.data_monitors.items():
|
||||
data_mon_vals[key + label] += data_monitor(
|
||||
activations[-1], targets_batch)
|
||||
@ -104,7 +112,7 @@ class Optimiser(object):
|
||||
"""
|
||||
logger.info('Epoch {0}: {1:.1f}s to complete\n {2}'.format(
|
||||
epoch, epoch_time,
|
||||
', '.join(['{0}={1:.2e}'.format(k, v) for (k, v) in stats.items()])
|
||||
', '.join(['{}={:.2e}'.format(k, v) for (k, v) in stats.items()])
|
||||
))
|
||||
|
||||
def train(self, num_epochs, stats_interval=5):
|
||||
@ -121,17 +129,20 @@ class Optimiser(object):
|
||||
and the second being a dict mapping the labels for the statistics
|
||||
recorded to their column index in the array.
|
||||
"""
|
||||
start_train_time = time.process_time()
|
||||
start_train_time = time.time()
|
||||
run_stats = [list(self.get_epoch_stats().values())]
|
||||
for epoch in range(1, num_epochs + 1):
|
||||
start_time = time.process_time()
|
||||
self.do_training_epoch()
|
||||
epoch_time = time.process_time() - start_time
|
||||
if epoch % stats_interval == 0:
|
||||
stats = self.get_epoch_stats()
|
||||
self.log_stats(epoch, epoch_time, stats)
|
||||
run_stats.append(list(stats.values()))
|
||||
finish_train_time = time.process_time()
|
||||
with self.tqdm_progress(total=num_epochs) as progress_bar:
|
||||
progress_bar.set_description("Exp Prog")
|
||||
for epoch in range(1, num_epochs + 1):
|
||||
start_time = time.time()
|
||||
self.do_training_epoch()
|
||||
epoch_time = time.time() - start_time
|
||||
if epoch % stats_interval == 0:
|
||||
stats = self.get_epoch_stats()
|
||||
self.log_stats(epoch, epoch_time, stats)
|
||||
run_stats.append(list(stats.values()))
|
||||
progress_bar.update(1)
|
||||
finish_train_time = time.time()
|
||||
total_train_time = finish_train_time - start_train_time
|
||||
return np.array(run_stats), {k: i for i, k in enumerate(stats.keys())}, total_train_time
|
||||
|
||||
|
@ -32,3 +32,42 @@ class ConstantLearningRateScheduler(object):
|
||||
epoch_number: Integer index of training epoch about to be run.
|
||||
"""
|
||||
learning_rule.learning_rate = self.learning_rate
|
||||
|
||||
class CosineAnnealingWithWarmRestarts(object):
|
||||
"""Cosine annealing scheduler, implemented as in https://arxiv.org/pdf/1608.03983.pdf"""
|
||||
|
||||
def __init__(self, min_learning_rate, max_learning_rate, total_iters_per_period, max_learning_rate_discount_factor,
|
||||
period_iteration_expansion_factor):
|
||||
"""
|
||||
Instantiates a new cosine annealing with warm restarts learning rate scheduler
|
||||
:param min_learning_rate: The minimum learning rate the scheduler can assign
|
||||
:param max_learning_rate: The maximum learning rate the scheduler can assign
|
||||
:param total_iters_per_period: The number of epochs in a period
|
||||
:param max_learning_rate_discount_factor: Factor by which the maximum learning rate is discounted after each restart, i.e. how many times smaller the max learning rate is after a restart compared to the previous period
:param period_iteration_expansion_factor: The rate of expansion of the period length: 1 keeps every period the same number of epochs, values larger than 1 make each subsequent period longer, and vice versa
|
||||
"""
|
||||
self.min_learning_rate = min_learning_rate
|
||||
self.max_learning_rate = max_learning_rate
|
||||
self.total_epochs_per_period = total_iters_per_period
|
||||
|
||||
self.max_learning_rate_discount_factor = max_learning_rate_discount_factor
|
||||
self.period_iteration_expansion_factor = period_iteration_expansion_factor
|
||||
|
||||
|
||||
def update_learning_rule(self, learning_rule, epoch_number):
|
||||
"""Update the hyperparameters of the learning rule.
|
||||
|
||||
Run at the beginning of each epoch.
|
||||
|
||||
Args:
|
||||
learning_rule: Learning rule object being used in training run,
|
||||
any scheduled hyperparameters to be altered should be
|
||||
attributes of this object.
|
||||
epoch_number: Integer index of training epoch about to be run.
|
||||
Returns:
|
||||
effective_learning_rate at step 'epoch_number'
|
||||
"""
|
||||
raise NotImplementedError
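
For orientation, the SGDR paper cited in the docstring sets the learning rate within a period with a half-cosine between the maximum and minimum values. A hedged sketch of that single-period formula, ignoring the restart bookkeeping, max-rate discounting and period expansion handled by this class:

import numpy as np

def cosine_annealed_rate(epoch_in_period, epochs_per_period,
                         min_learning_rate, max_learning_rate):
    # eta_t = eta_min + 0.5 * (eta_max - eta_min) * (1 + cos(pi * T_cur / T_i))
    return min_learning_rate + 0.5 * (max_learning_rate - min_learning_rate) * (
        1. + np.cos(np.pi * epoch_in_period / epochs_per_period))

Inside update_learning_rule the computed value would be assigned to learning_rule.learning_rate, mirroring ConstantLearningRateScheduler above.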