Add missing files

This commit is contained in:
Visual Computing (VICO) Group 2024-10-14 10:51:43 +01:00
parent 4657cca862
commit 5d52a22448
8 changed files with 1015 additions and 60 deletions

View File

@ -16,7 +16,7 @@ class DataProvider(object):
"""Generic data provider.""" """Generic data provider."""
def __init__(self, inputs, targets, batch_size, max_num_batches=-1, def __init__(self, inputs, targets, batch_size, max_num_batches=-1,
shuffle_order=True, rng=None): shuffle_order=True, rng=None, smooth_labels=False):
"""Create a new data provider object. """Create a new data provider object.
Args: Args:
@ -32,26 +32,60 @@ class DataProvider(object):
shuffle_order (bool): Whether to randomly permute the order of shuffle_order (bool): Whether to randomly permute the order of
the data before each epoch. the data before each epoch.
rng (RandomState): A seeded random number generator. rng (RandomState): A seeded random number generator.
smooth_labels (bool): turn on label smoothing
""" """
self.inputs = inputs self.inputs = inputs
self.targets = targets self.targets = targets
self.batch_size = batch_size if batch_size < 1:
assert max_num_batches != 0 and not max_num_batches < -1, ( raise ValueError('batch_size must be >= 1')
'max_num_batches should be -1 or > 0') self._batch_size = batch_size
self.max_num_batches = max_num_batches if max_num_batches == 0 or max_num_batches < -1:
raise ValueError('max_num_batches must be -1 or > 0')
self._max_num_batches = max_num_batches
self._update_num_batches()
self.shuffle_order = shuffle_order
self._current_order = np.arange(inputs.shape[0])
if rng is None:
rng = np.random.RandomState(DEFAULT_SEED)
self.rng = rng
self.smooth_labels = smooth_labels
self.new_epoch()
@property
def batch_size(self):
"""Number of data points to include in each batch."""
return self._batch_size
@batch_size.setter
def batch_size(self, value):
if value < 1:
raise ValueError('batch_size must be >= 1')
self._batch_size = value
self._update_num_batches()
@property
def max_num_batches(self):
"""Maximum number of batches to iterate over in an epoch."""
return self._max_num_batches
@max_num_batches.setter
def max_num_batches(self, value):
if value == 0 or value < -1:
raise ValueError('max_num_batches must be -1 or > 0')
self._max_num_batches = value
self._update_num_batches()
def _update_num_batches(self):
"""Updates number of batches to iterate over."""
# maximum possible number of batches is equal to number of whole times # maximum possible number of batches is equal to number of whole times
# batch_size divides in to the number of data points which can be # batch_size divides in to the number of data points which can be
# found using integer division # found using integer division
possible_num_batches = self.inputs.shape[0] // batch_size possible_num_batches = self.inputs.shape[0] // self.batch_size
if self.max_num_batches == -1: if self.max_num_batches == -1:
self.num_batches = possible_num_batches self.num_batches = possible_num_batches
else: else:
self.num_batches = min(self.max_num_batches, possible_num_batches) self.num_batches = min(self.max_num_batches, possible_num_batches)
self.shuffle_order = shuffle_order
if rng is None:
rng = np.random.RandomState(DEFAULT_SEED)
self.rng = rng
self.reset()
def __iter__(self): def __iter__(self):
"""Implements Python iterator interface. """Implements Python iterator interface.
@ -63,27 +97,36 @@ class DataProvider(object):
""" """
return self return self
def reset(self): def new_epoch(self):
"""Resets the provider to the initial state to use in a new epoch.""" """Starts a new epoch (pass through data), possibly shuffling first."""
self._curr_batch = 0 self._curr_batch = 0
if self.shuffle_order: if self.shuffle_order:
self.shuffle() self.shuffle()
def shuffle(self):
"""Randomly shuffles order of data."""
new_order = self.rng.permutation(self.inputs.shape[0])
self.inputs = self.inputs[new_order]
self.targets = self.targets[new_order]
def __next__(self): def __next__(self):
return self.next() return self.next()
def reset(self):
"""Resets the provider to the initial state."""
inv_perm = np.argsort(self._current_order)
self._current_order = self._current_order[inv_perm]
self.inputs = self.inputs[inv_perm]
self.targets = self.targets[inv_perm]
self.new_epoch()
def shuffle(self):
"""Randomly shuffles order of data."""
perm = self.rng.permutation(self.inputs.shape[0])
self._current_order = self._current_order[perm]
self.inputs = self.inputs[perm]
self.targets = self.targets[perm]
def next(self): def next(self):
"""Returns next data batch or raises `StopIteration` if at end.""" """Returns next data batch or raises `StopIteration` if at end."""
if self._curr_batch + 1 > self.num_batches: if self._curr_batch + 1 > self.num_batches:
# no more batches in current iteration through data set so reset # no more batches in current iteration through data set so start
# the dataset for another pass and indicate iteration is at end # new epoch ready for another pass and indicate iteration is at end
self.reset() self.new_epoch()
raise StopIteration() raise StopIteration()
# create an index slice corresponding to current batch number # create an index slice corresponding to current batch number
batch_slice = slice(self._curr_batch * self.batch_size, batch_slice = slice(self._curr_batch * self.batch_size,
@ -93,12 +136,11 @@ class DataProvider(object):
self._curr_batch += 1 self._curr_batch += 1
return inputs_batch, targets_batch return inputs_batch, targets_batch
class MNISTDataProvider(DataProvider): class MNISTDataProvider(DataProvider):
"""Data provider for MNIST handwritten digit images.""" """Data provider for MNIST handwritten digit images."""
def __init__(self, which_set='train', batch_size=100, max_num_batches=-1, def __init__(self, which_set='train', batch_size=100, max_num_batches=-1,
shuffle_order=True, rng=None): shuffle_order=True, rng=None, smooth_labels=False):
"""Create a new MNIST data provider object. """Create a new MNIST data provider object.
Args: Args:
@ -112,9 +154,10 @@ class MNISTDataProvider(DataProvider):
shuffle_order (bool): Whether to randomly permute the order of shuffle_order (bool): Whether to randomly permute the order of
the data before each epoch. the data before each epoch.
rng (RandomState): A seeded random number generator. rng (RandomState): A seeded random number generator.
smooth_labels (bool): enable/disable label smoothing
""" """
# check a valid which_set was provided # check a valid which_set was provided
assert which_set in ['train', 'valid', 'eval'], ( assert which_set in ['train', 'valid', 'test'], (
'Expected which_set to be either train, valid or eval. ' 'Expected which_set to be either train, valid or eval. '
'Got {0}'.format(which_set) 'Got {0}'.format(which_set)
) )
@ -134,7 +177,7 @@ class MNISTDataProvider(DataProvider):
inputs = inputs.astype(np.float32) inputs = inputs.astype(np.float32)
# pass the loaded data to the parent class __init__ # pass the loaded data to the parent class __init__
super(MNISTDataProvider, self).__init__( super(MNISTDataProvider, self).__init__(
inputs, targets, batch_size, max_num_batches, shuffle_order, rng) inputs, targets, batch_size, max_num_batches, shuffle_order, rng, smooth_labels)
def next(self): def next(self):
"""Returns next data batch or raises `StopIteration` if at end.""" """Returns next data batch or raises `StopIteration` if at end."""
@ -160,6 +203,102 @@ class MNISTDataProvider(DataProvider):
one_of_k_targets[range(int_targets.shape[0]), int_targets] = 1 one_of_k_targets[range(int_targets.shape[0]), int_targets] = 1
return one_of_k_targets return one_of_k_targets
class EMNISTDataProvider(DataProvider):
    """Data provider for EMNIST handwritten digit images."""

    def __init__(self, which_set='train', batch_size=100, max_num_batches=-1,
                 shuffle_order=True, rng=None, smooth_labels=False):
        """Create a new EMNIST data provider object.

        Args:
            which_set: One of 'train', 'valid' or 'test'. Determines which
                portion of the EMNIST data this object should provide.
            batch_size (int): Number of data points to include in each batch.
            max_num_batches (int): Maximum number of batches to iterate over
                in an epoch. If `max_num_batches * batch_size > num_data` then
                only as many batches as the data can be split into will be
                used. If set to -1 all of the data will be used.
            shuffle_order (bool): Whether to randomly permute the order of
                the data before each epoch.
            rng (RandomState): A seeded random number generator.
            smooth_labels (bool): If True, `next` encodes targets with
                `label_smoothing` instead of `to_one_of_k`.

        Raises:
            AssertionError: If `which_set` is not a recognised name or the
                data file is missing.
            KeyError: If the `MLP_DATA_DIR` environment variable is not set.
        """
        # check a valid which_set was provided; the message now names the
        # same options the check actually accepts
        assert which_set in ['train', 'valid', 'test'], (
            'Expected which_set to be either train, valid or test. '
            'Got {0}'.format(which_set)
        )
        self.which_set = which_set
        self.num_classes = 47
        # construct path to data using os.path.join to ensure the correct path
        # separator for the current platform / OS is used
        # MLP_DATA_DIR environment variable should point to the data directory
        data_path = os.path.join(
            os.environ['MLP_DATA_DIR'], 'emnist-{0}.npz'.format(which_set))
        assert os.path.isfile(data_path), (
            'Data file does not exist at expected path: ' + data_path
        )
        # load data from compressed numpy file; the context manager closes
        # the underlying archive handle once both arrays have been read
        with np.load(data_path) as loaded:
            inputs, targets = loaded['inputs'], loaded['targets']
        inputs = inputs.astype(np.float32)
        # flatten each image to a vector of 28 * 28 values (shape passed
        # positionally so it works regardless of the keyword's name)
        inputs = np.reshape(inputs, (-1, 28 * 28))
        inputs = inputs / 255.0
        # pass the loaded data to the parent class __init__
        super(EMNISTDataProvider, self).__init__(
            inputs, targets, batch_size, max_num_batches, shuffle_order, rng,
            smooth_labels)

    def next(self):
        """Returns next data batch or raises `StopIteration` if at end.

        The targets of the batch are converted from integer codes to a
        (batch_size, num_classes) array, using `label_smoothing` when
        `self.smooth_labels` is set and `to_one_of_k` otherwise.
        """
        inputs_batch, targets_batch = super(EMNISTDataProvider, self).next()
        if self.smooth_labels:
            targets_batch_mat = self.label_smoothing(targets_batch)
        else:
            targets_batch_mat = self.to_one_of_k(targets_batch)
        return inputs_batch, targets_batch_mat

    def to_one_of_k(self, int_targets):
        """Converts integer coded class target to 1 of K coded targets.

        Args:
            int_targets (ndarray): Array of integer coded class targets (i.e.
                where an integer from 0 to `num_classes` - 1 is used to
                indicate which is the correct class). This should be of shape
                (num_data,).

        Returns:
            Array of 1 of K coded targets i.e. an array of shape
            (num_data, num_classes) where for each row all elements are equal
            to zero except for the column corresponding to the correct class
            which is equal to one.
        """
        one_of_k_targets = np.zeros((int_targets.shape[0], self.num_classes))
        one_of_k_targets[range(int_targets.shape[0]), int_targets] = 1
        return one_of_k_targets

    def label_smoothing(self, int_targets, alpha=0.1):
        """Converts integer coded class target to 1 of K coded targets with label smoothing.

        This method is not implemented here: it always raises
        `NotImplementedError`, so iterating a provider constructed with
        `smooth_labels=True` fails on the first batch.

        Args:
            int_targets (ndarray): Array of integer coded class targets (i.e.
                where an integer from 0 to `num_classes` - 1 is used to
                indicate which is the correct class). This should be of shape
                (num_data,).
            alpha (float): Smoothing factor.

        Returns:
            Array of 1 of K coded targets with label smoothing i.e. an array of shape
            (num_data, num_classes)

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError
class MetOfficeDataProvider(DataProvider): class MetOfficeDataProvider(DataProvider):
"""South Scotland Met Office weather data provider.""" """South Scotland Met Office weather data provider."""
@ -253,3 +392,41 @@ class CCPPDataProvider(DataProvider):
targets = loaded[which_set + '_targets'] targets = loaded[which_set + '_targets']
super(CCPPDataProvider, self).__init__( super(CCPPDataProvider, self).__init__(
inputs, targets, batch_size, max_num_batches, shuffle_order, rng) inputs, targets, batch_size, max_num_batches, shuffle_order, rng)
class AugmentedMNISTDataProvider(MNISTDataProvider):
    """Data provider for MNIST dataset which randomly transforms images."""

    def __init__(self, which_set='train', batch_size=100, max_num_batches=-1,
                 shuffle_order=True, rng=None, transformer=None):
        """Create a new augmented MNIST data provider object.

        Args:
            which_set: One of 'train', 'valid' or 'test'. Determines which
                portion of the MNIST data this object should provide.
            batch_size (int): Number of data points to include in each batch.
            max_num_batches (int): Maximum number of batches to iterate over
                in an epoch. If `max_num_batches * batch_size > num_data` then
                only as many batches as the data can be split into will be
                used. If set to -1 all of the data will be used.
            shuffle_order (bool): Whether to randomly permute the order of
                the data before each epoch.
            rng (RandomState): A seeded random number generator.
            transformer: Function which takes an `inputs` array of shape
                (batch_size, input_dim) corresponding to a batch of input
                images and a `rng` random number generator object (i.e. a
                call signature `transformer(inputs, rng)`) and applies a
                potentially random set of transformations to some / all of
                the input images as each new batch is returned when iterating
                over the data provider. If None (the default) batches are
                returned untransformed.
        """
        super(AugmentedMNISTDataProvider, self).__init__(
            which_set, batch_size, max_num_batches, shuffle_order, rng)
        self.transformer = transformer

    def next(self):
        """Returns next data batch or raises `StopIteration` if at end.

        Returns:
            Tuple `(inputs_batch, targets_batch)` where the inputs have been
            passed through `self.transformer` together with `self.rng`, or
            are returned unchanged when no transformer was supplied.
        """
        inputs_batch, targets_batch = super(
            AugmentedMNISTDataProvider, self).next()
        # the constructor default is None, which is not callable
        if self.transformer is None:
            return inputs_batch, targets_batch
        transformed_inputs_batch = self.transformer(inputs_batch, self.rng)
        return transformed_inputs_batch, targets_batch

View File

@ -154,9 +154,9 @@ class CrossEntropySoftmaxError(object):
Returns: Returns:
Scalar error function value. Scalar error function value.
""" """
probs = np.exp(outputs) normOutputs = outputs - outputs.max(-1)[:, None]
probs /= probs.sum(-1)[:, None] logProb = normOutputs - np.log(np.sum(np.exp(normOutputs), axis=-1)[:, None])
return -np.mean(np.sum(targets * np.log(probs), axis=1)) return -np.mean(np.sum(targets * logProb, axis=1))
def grad(self, outputs, targets): def grad(self, outputs, targets):
"""Calculates gradient of error function with respect to outputs. """Calculates gradient of error function with respect to outputs.
@ -168,7 +168,7 @@ class CrossEntropySoftmaxError(object):
Returns: Returns:
Gradient of error function with respect to outputs. Gradient of error function with respect to outputs.
""" """
probs = np.exp(outputs) probs = np.exp(outputs - outputs.max(-1)[:, None])
probs /= probs.sum(-1)[:, None] probs /= probs.sum(-1)[:, None]
return (probs - targets) / outputs.shape[0] return (probs - targets) / outputs.shape[0]

View File

@ -63,3 +63,81 @@ class NormalInit(object):
def __call__(self, shape): def __call__(self, shape):
return self.rng.normal(loc=self.mean, scale=self.std, size=shape) return self.rng.normal(loc=self.mean, scale=self.std, size=shape)
class GlorotUniformInit(object):
    """Glorot and Bengio (2010) random uniform weights initialiser.

    Initialises a two-dimensional parameter array using the 'normalized
    initialisation' scheme suggested in [1] which attempts to maintain a
    roughly constant variance in the activations and backpropagated gradients
    of a multi-layer model consisting of interleaved affine and logistic
    sigmoidal transformation layers.

    Weights are sampled from a zero-mean uniform distribution with standard
    deviation `gain * sqrt(2 / (shape[0] + shape[1]))` where `shape` is the
    two-element shape of the weight matrix, i.e. from the interval
    `[-sqrt(3) * std, sqrt(3) * std]`.

    References:
      [1]: Understanding the difficulty of training deep feedforward neural
           networks, Glorot and Bengio (2010)
    """

    def __init__(self, gain=1., rng=None):
        """Construct a normalised initialisation random initialiser object.

        Args:
            gain: Multiplicative factor to scale initialised weights by.
                Recommended values is 1 for affine layers followed by
                logistic sigmoid layers (or another affine layer).
            rng (RandomState): Seeded random number generator. If None a
                new `RandomState` seeded with `DEFAULT_SEED` is created.
        """
        self.gain = gain
        if rng is None:
            rng = np.random.RandomState(DEFAULT_SEED)
        self.rng = rng

    def __call__(self, shape):
        """Sample an array of initial weights.

        Args:
            shape: Two-element sequence giving the shape of the array.

        Returns:
            Array of the requested shape with uniformly distributed entries.

        Raises:
            AssertionError: If `shape` does not have exactly two elements.
        """
        assert len(shape) == 2, (
            'Initialiser should only be used for two dimensional arrays.')
        std = self.gain * (2. / (shape[0] + shape[1]))**0.5
        # a uniform distribution on [-w, w] has standard deviation w / sqrt(3)
        half_width = 3.**0.5 * std
        return self.rng.uniform(low=-half_width, high=half_width, size=shape)
class GlorotNormalInit(object):
    """Glorot and Bengio (2010) random normal weights initialiser.

    Initialises a two-dimensional parameter array using the 'normalized
    initialisation' scheme suggested in [1] which attempts to maintain a
    roughly constant variance in the activations and backpropagated gradients
    of a multi-layer model consisting of interleaved affine and logistic
    sigmoidal transformation layers.

    Weights are sampled from a zero-mean normal distribution with standard
    deviation `gain * sqrt(2 / (shape[0] + shape[1]))` where `shape` is the
    two-element shape of the weight matrix.

    References:
      [1]: Understanding the difficulty of training deep feedforward neural
           networks, Glorot and Bengio (2010)
    """

    def __init__(self, gain=1., rng=None):
        """Construct a normalised initialisation random initialiser object.

        Args:
            gain: Multiplicative factor to scale initialised weights by.
                Recommended values is 1 for affine layers followed by
                logistic sigmoid layers (or another affine layer).
            rng (RandomState): Seeded random number generator. If None a
                new `RandomState` seeded with `DEFAULT_SEED` is created.
        """
        self.gain = gain
        if rng is None:
            rng = np.random.RandomState(DEFAULT_SEED)
        self.rng = rng

    def __call__(self, shape):
        """Sample an array of initial weights.

        Args:
            shape: Two-element sequence giving the shape of the array.

        Returns:
            Array of the requested shape with normally distributed entries.

        Raises:
            AssertionError: If `shape` does not have exactly two elements.
        """
        # same guard as GlorotUniformInit: without it a longer shape would
        # silently be scaled using only its first two dimensions
        assert len(shape) == 2, (
            'Initialiser should only be used for two dimensional arrays.')
        std = self.gain * (2. / (shape[0] + shape[1]))**0.5
        return self.rng.normal(loc=0., scale=std, size=shape)

View File

@ -14,7 +14,7 @@ respect to the layer parameters.
import numpy as np import numpy as np
import mlp.initialisers as init import mlp.initialisers as init
from mlp import DEFAULT_SEED
class Layer(object): class Layer(object):
"""Abstract class defining the interface for a layer.""" """Abstract class defining the interface for a layer."""
@ -68,6 +68,13 @@ class LayerWithParameters(Layer):
""" """
raise NotImplementedError() raise NotImplementedError()
def params_penalty(self):
"""Returns the parameter dependent penalty term for this layer.
If no parameter-dependent penalty terms are set this returns zero.
"""
raise NotImplementedError()
@property @property
def params(self): def params(self):
"""Returns a list of parameters of layer. """Returns a list of parameters of layer.
@ -88,6 +95,127 @@ class LayerWithParameters(Layer):
""" """
raise NotImplementedError() raise NotImplementedError()
class StochasticLayerWithParameters(Layer):
    """Interface for a parameterised layer with stochastic forward propagation.

    Only the constructor does any work; every other member raises
    `NotImplementedError` and is expected to be overridden.
    """

    def __init__(self, rng=None):
        """Stores the random number generator used by the layer.

        Args:
            rng (RandomState): Seeded random number generator object. When
                omitted a `RandomState` seeded with `DEFAULT_SEED` is used.
        """
        self.rng = np.random.RandomState(DEFAULT_SEED) if rng is None else rng

    def fprop(self, inputs, stochastic=True):
        """Forward propagates activations through the layer transformation.

        Args:
            inputs: Array of layer inputs of shape (batch_size, input_dim).
            stochastic: Selects between the default stochastic
                forward-propagation and a deterministic mode (e.g. for use
                at test time). If False the deterministic transformation
                corresponding to the expected output of the stochastic
                forward-propagation is applied.

        Returns:
            outputs: Array of layer outputs of shape (batch_size, output_dim).
        """
        raise NotImplementedError()

    def grads_wrt_params(self, inputs, grads_wrt_outputs):
        """Calculates gradients with respect to layer parameters.

        Args:
            inputs: Array of inputs to layer of shape (batch_size, input_dim).
            grads_wrt_outputs: Array of gradients with respect to the layer
                outputs of shape (batch_size, output_dim).

        Returns:
            List of arrays of gradients with respect to the layer parameters,
            in the same order as the values returned by `params`.
        """
        raise NotImplementedError()

    def params_penalty(self):
        """Returns the parameter dependent penalty term for this layer.

        If no parameter-dependent penalty terms are set this returns zero.
        """
        raise NotImplementedError()

    @property
    def params(self):
        """Returns a list of parameters of layer.

        Returns:
            List of current parameter values, ordered as expected when
            assigning to `params`.
        """
        raise NotImplementedError()

    @params.setter
    def params(self, values):
        """Sets layer parameters from a list of values.

        Args:
            values: List of values to set parameters to, in the same order
                as the list obtained by reading `params`.
        """
        raise NotImplementedError()
class StochasticLayer(Layer):
    """Interface for a layer whose forward propagation is stochastic.

    Only the constructor does any work; `fprop` and `bprop` raise
    `NotImplementedError` and are expected to be overridden.
    """

    def __init__(self, rng=None):
        """Stores the random number generator used by the layer.

        Args:
            rng (RandomState): Seeded random number generator object. When
                omitted a `RandomState` seeded with `DEFAULT_SEED` is used.
        """
        self.rng = np.random.RandomState(DEFAULT_SEED) if rng is None else rng

    def fprop(self, inputs, stochastic=True):
        """Forward propagates activations through the layer transformation.

        Args:
            inputs: Array of layer inputs of shape (batch_size, input_dim).
            stochastic: Selects between the default stochastic
                forward-propagation and a deterministic mode (e.g. for use
                at test time). If False the deterministic transformation
                corresponding to the expected output of the stochastic
                forward-propagation is applied.

        Returns:
            outputs: Array of layer outputs of shape (batch_size, output_dim).
        """
        raise NotImplementedError()

    def bprop(self, inputs, outputs, grads_wrt_outputs):
        """Back propagates gradients through a layer.

        Given gradients with respect to the outputs of the layer calculates
        the gradients with respect to the layer inputs. This should
        correspond to the default stochastic forward-propagation.

        Args:
            inputs: Array of layer inputs of shape (batch_size, input_dim).
            outputs: Array of layer outputs calculated in forward pass of
                shape (batch_size, output_dim).
            grads_wrt_outputs: Array of gradients with respect to the layer
                outputs of shape (batch_size, output_dim).

        Returns:
            Array of gradients with respect to the layer inputs of shape
            (batch_size, input_dim).
        """
        raise NotImplementedError()
class AffineLayer(LayerWithParameters): class AffineLayer(LayerWithParameters):
"""Layer implementing an affine tranformation of its inputs. """Layer implementing an affine tranformation of its inputs.
@ -97,7 +225,8 @@ class AffineLayer(LayerWithParameters):
def __init__(self, input_dim, output_dim, def __init__(self, input_dim, output_dim,
weights_initialiser=init.UniformInit(-0.1, 0.1), weights_initialiser=init.UniformInit(-0.1, 0.1),
biases_initialiser=init.ConstantInit(0.)): biases_initialiser=init.ConstantInit(0.),
weights_penalty=None, biases_penalty=None):
"""Initialises a parameterised affine layer. """Initialises a parameterised affine layer.
Args: Args:
@ -105,11 +234,17 @@ class AffineLayer(LayerWithParameters):
output_dim (int): Dimension of the layer outputs. output_dim (int): Dimension of the layer outputs.
weights_initialiser: Initialiser for the weight parameters. weights_initialiser: Initialiser for the weight parameters.
biases_initialiser: Initialiser for the bias parameters. biases_initialiser: Initialiser for the bias parameters.
weights_penalty: Weights-dependent penalty term (regulariser) or
None if no regularisation is to be applied to the weights.
biases_penalty: Biases-dependent penalty term (regulariser) or
None if no regularisation is to be applied to the biases.
""" """
self.input_dim = input_dim self.input_dim = input_dim
self.output_dim = output_dim self.output_dim = output_dim
self.weights = weights_initialiser((self.output_dim, self.input_dim)) self.weights = weights_initialiser((self.output_dim, self.input_dim))
self.biases = biases_initialiser(self.output_dim) self.biases = biases_initialiser(self.output_dim)
self.weights_penalty = weights_penalty
self.biases_penalty = biases_penalty
def fprop(self, inputs): def fprop(self, inputs):
"""Forward propagates activations through the layer transformation. """Forward propagates activations through the layer transformation.
@ -123,7 +258,7 @@ class AffineLayer(LayerWithParameters):
Returns: Returns:
outputs: Array of layer outputs of shape (batch_size, output_dim). outputs: Array of layer outputs of shape (batch_size, output_dim).
""" """
return inputs.dot(self.weights.T) + self.biases return self.weights.dot(inputs.T).T + self.biases
def bprop(self, inputs, outputs, grads_wrt_outputs): def bprop(self, inputs, outputs, grads_wrt_outputs):
"""Back propagates gradients through a layer. """Back propagates gradients through a layer.
@ -159,8 +294,27 @@ class AffineLayer(LayerWithParameters):
grads_wrt_weights = np.dot(grads_wrt_outputs.T, inputs) grads_wrt_weights = np.dot(grads_wrt_outputs.T, inputs)
grads_wrt_biases = np.sum(grads_wrt_outputs, axis=0) grads_wrt_biases = np.sum(grads_wrt_outputs, axis=0)
if self.weights_penalty is not None:
grads_wrt_weights += self.weights_penalty.grad(parameter=self.weights)
if self.biases_penalty is not None:
grads_wrt_biases += self.biases_penalty.grad(parameter=self.biases)
return [grads_wrt_weights, grads_wrt_biases] return [grads_wrt_weights, grads_wrt_biases]
def params_penalty(self):
"""Returns the parameter dependent penalty term for this layer.
If no parameter-dependent penalty terms are set this returns zero.
"""
params_penalty = 0
if self.weights_penalty is not None:
params_penalty += self.weights_penalty(self.weights)
if self.biases_penalty is not None:
params_penalty += self.biases_penalty(self.biases)
return params_penalty
@property @property
def params(self): def params(self):
"""A list of layer parameter values: `[weights, biases]`.""" """A list of layer parameter values: `[weights, biases]`."""
@ -175,7 +329,6 @@ class AffineLayer(LayerWithParameters):
return 'AffineLayer(input_dim={0}, output_dim={1})'.format( return 'AffineLayer(input_dim={0}, output_dim={1})'.format(
self.input_dim, self.output_dim) self.input_dim, self.output_dim)
class SigmoidLayer(Layer): class SigmoidLayer(Layer):
"""Layer implementing an element-wise logistic sigmoid transformation.""" """Layer implementing an element-wise logistic sigmoid transformation."""
@ -215,6 +368,160 @@ class SigmoidLayer(Layer):
def __repr__(self): def __repr__(self):
return 'SigmoidLayer' return 'SigmoidLayer'
class ReluLayer(Layer):
    """Element-wise rectified linear unit layer."""

    def fprop(self, inputs):
        """Applies `y = max(0, x)` element-wise to the inputs.

        Args:
            inputs: Array of layer inputs of shape (batch_size, input_dim).

        Returns:
            outputs: Array of layer outputs of shape (batch_size, output_dim).
        """
        rectified = np.maximum(inputs, 0.)
        return rectified

    def bprop(self, inputs, outputs, grads_wrt_outputs):
        """Propagates gradients from the layer outputs back to its inputs.

        The gradient is passed through wherever the forward-pass output was
        strictly positive and is zeroed elsewhere.

        Args:
            inputs: Array of layer inputs of shape (batch_size, input_dim).
            outputs: Array of layer outputs calculated in forward pass of
                shape (batch_size, output_dim).
            grads_wrt_outputs: Array of gradients with respect to the layer
                outputs of shape (batch_size, output_dim).

        Returns:
            Array of gradients with respect to the layer inputs of shape
            (batch_size, input_dim).
        """
        active_units = outputs > 0
        return active_units * grads_wrt_outputs

    def __repr__(self):
        return 'ReluLayer'
class LeakyReluLayer(Layer):
    """Layer implementing an element-wise leaky rectified linear transformation.

    Only the constructor and `__repr__` are implemented; `fprop` and `bprop`
    raise `NotImplementedError`.
    """

    def __init__(self, alpha=0.01):
        """Stores the `alpha` value of the layer.

        Args:
            alpha: Value stored on the layer as `self.alpha`.
                # NOTE(review): presumably the slope applied to negative
                # inputs - confirm once `fprop` is implemented.
        """
        self.alpha = alpha

    def fprop(self, inputs):
        """Forward propagates activations through the layer transformation.

        Not implemented.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError()

    def bprop(self, inputs, outputs, grads_wrt_outputs):
        """Back propagates gradients through a layer.

        Given gradients with respect to the outputs of the layer this should
        calculate the gradients with respect to the layer inputs. Not
        implemented.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError()

    def __repr__(self):
        return 'LeakyReluLayer'
class ParametricReluLayer(LayerWithParameters):
    """Layer implementing an element-wise parametric rectified linear transformation.

    The single parameter `alpha` is stored as a one-element array. Only
    parameter storage and `__repr__` are implemented; `fprop`, `bprop` and
    `grads_wrt_params` raise `NotImplementedError`.
    """

    def __init__(self, alpha=0.25):
        """Stores the initial `alpha` value as a one-element array.

        Args:
            alpha: Initial value of the `alpha` parameter.
        """
        self.alpha = np.array([alpha])

    def fprop(self, inputs):
        """Forward propagates activations through the layer transformation.

        Not implemented.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError

    def bprop(self, inputs, outputs, grads_wrt_outputs):
        """Back propagates gradients through a layer.

        Given gradients with respect to the outputs of the layer this should
        calculate the gradients with respect to the layer inputs. Not
        implemented.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError

    def grads_wrt_params(self, inputs, grads_wrt_outputs):
        """Calculates gradients with respect to layer parameters.

        Not implemented.

        Args:
            inputs: array of inputs to layer of shape (batch_size, input_dim)
            grads_wrt_outputs: array of gradients with respect to the layer
                outputs of shape (batch_size, output_dim)

        Returns:
            list of arrays of gradients with respect to the layer parameters
            `[grads_wrt_params]`. Where params is the alpha parameter.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError

    # `params` was previously defined twice with identical bodies; the first
    # definition was overwritten by the second, so a single one is kept.
    @property
    def params(self):
        """A list of layer parameter values: `[alpha]`."""
        return [self.alpha]

    @params.setter
    def params(self, values):
        self.alpha = values[0]

    def __repr__(self):
        return 'ParametricReluLayer'
class TanhLayer(Layer):
    """Element-wise hyperbolic tangent layer."""

    def fprop(self, inputs):
        """Applies `y = tanh(x)` element-wise to the inputs.

        Args:
            inputs: Array of layer inputs of shape (batch_size, input_dim).

        Returns:
            outputs: Array of layer outputs of shape (batch_size, output_dim).
        """
        activations = np.tanh(inputs)
        return activations

    def bprop(self, inputs, outputs, grads_wrt_outputs):
        """Propagates gradients from the layer outputs back to its inputs.

        Uses the forward-pass outputs to form the local derivative
        `1 - y**2` and scales the incoming gradients by it.

        Args:
            inputs: Array of layer inputs of shape (batch_size, input_dim).
            outputs: Array of layer outputs calculated in forward pass of
                shape (batch_size, output_dim).
            grads_wrt_outputs: Array of gradients with respect to the layer
                outputs of shape (batch_size, output_dim).

        Returns:
            Array of gradients with respect to the layer inputs of shape
            (batch_size, input_dim).
        """
        local_derivs = 1. - outputs**2
        return local_derivs * grads_wrt_outputs

    def __repr__(self):
        return 'TanhLayer'
class SoftmaxLayer(Layer): class SoftmaxLayer(Layer):
"""Layer implementing a softmax transformation.""" """Layer implementing a softmax transformation."""
@ -232,7 +539,9 @@ class SoftmaxLayer(Layer):
Returns: Returns:
outputs: Array of layer outputs of shape (batch_size, output_dim). outputs: Array of layer outputs of shape (batch_size, output_dim).
""" """
exp_inputs = np.exp(inputs) # subtract max inside exponential to improve numerical stability -
# when we divide through by sum this term cancels
exp_inputs = np.exp(inputs - inputs.max(-1)[:, None])
return exp_inputs / exp_inputs.sum(-1)[:, None] return exp_inputs / exp_inputs.sum(-1)[:, None]
def bprop(self, inputs, outputs, grads_wrt_outputs): def bprop(self, inputs, outputs, grads_wrt_outputs):
@ -257,3 +566,177 @@ class SoftmaxLayer(Layer):
def __repr__(self): def __repr__(self):
return 'SoftmaxLayer' return 'SoftmaxLayer'
class RadialBasisFunctionLayer(Layer):
    """Layer implementing projection to a grid of radial basis functions."""

    def __init__(self, grid_dim, intervals=[[0., 1.]]):
        """Creates a radial basis function layer object.

        Args:
            grid_dim: Integer specifying how many basis function to use in
                grid across input space per dimension (so total number of
                basis functions will be grid_dim**input_dim)
            intervals: List of intervals (two element lists or tuples)
                specifying extents of axis-aligned region in input-space to
                tile basis functions in grid across. For example for a 2D input
                space spanning [0, 1] x [0, 1] use intervals=[[0, 1], [0, 1]].
                The default list is only read, never modified.
        """
        # kept so that __repr__ can report it (it was previously never set,
        # making repr() raise AttributeError)
        self.grid_dim = grid_dim
        # centres has one row per input dimension and one column per basis
        # function: shape (len(intervals), grid_dim**len(intervals))
        self.centres = np.array(np.meshgrid(*[
            np.linspace(low, high, grid_dim) for (low, high) in intervals])
        ).reshape((len(intervals), -1))
        # one width per input dimension, shape (len(intervals), 1)
        self.scales = np.array([
            [(high - low) * 1. / grid_dim] for (low, high) in intervals])

    def fprop(self, inputs):
        """Forward propagates activations through the layer transformation.

        Each output is `exp(-(x - c)**2 / s**2)` for one input dimension `x`,
        one basis centre `c` and the scale `s` of that dimension.

        Args:
            inputs: Array of layer inputs of shape (batch_size, input_dim).

        Returns:
            outputs: Array of layer outputs of shape (batch_size, output_dim).
        """
        return np.exp(-(inputs[..., None] - self.centres[None, ...])**2 /
                      self.scales**2).reshape((inputs.shape[0], -1))

    def bprop(self, inputs, outputs, grads_wrt_outputs):
        """Back propagates gradients through a layer.

        Given gradients with respect to the outputs of the layer calculates the
        gradients with respect to the layer inputs.

        Args:
            inputs: Array of layer inputs of shape (batch_size, input_dim).
            outputs: Array of layer outputs calculated in forward pass of
                shape (batch_size, output_dim).
            grads_wrt_outputs: Array of gradients with respect to the layer
                outputs of shape (batch_size, output_dim).

        Returns:
            Array of gradients with respect to the layer inputs of shape
            (batch_size, input_dim).
        """
        num_basis = self.centres.shape[1]
        # undo the flattening applied at the end of fprop
        per_basis_shape = (inputs.shape[0], -1, num_basis)
        scaled_diffs = (
            (inputs[..., None] - self.centres[None, ...]) / self.scales**2)
        # d/dx exp(-(x - c)**2 / s**2) = -2 * (x - c) / s**2 * exp(...), and
        # the exp(...) factor is exactly the forward-pass output
        return -2 * (
            scaled_diffs *
            outputs.reshape(per_basis_shape) *
            grads_wrt_outputs.reshape(per_basis_shape)
        ).sum(-1)

    def __repr__(self):
        return 'RadialBasisFunctionLayer(grid_dim={0})'.format(self.grid_dim)
class DropoutLayer(StochasticLayer):
    """Stochastic layer that randomly drops input dimensions in its output."""

    def __init__(self, rng=None, incl_prob=0.5, share_across_batch=True):
        """Set up the dropout layer configuration.

        Args:
            rng (RandomState): Seeded random number generator.
            incl_prob: Scalar in (0, 1] giving the probability that each
                input dimension is kept in the output.
            share_across_batch: If True one dropout mask is intended to be
                shared by every input in a batch, otherwise a separate mask
                is intended per input.
        """
        super(DropoutLayer, self).__init__(rng)
        assert 0. < incl_prob <= 1.
        self.share_across_batch = share_across_batch
        self.incl_prob = incl_prob
        # NOTE(review): this re-assigns `rng` after the base initialiser has
        # run; if the base class substitutes a default generator when `rng`
        # is None, this line replaces it with None again - confirm intent.
        self.rng = rng

    def fprop(self, inputs, stochastic=True):
        """Forward propagates activations through the layer transformation.

        Not implemented: calling this always raises NotImplementedError.

        Args:
            inputs: Array of layer inputs of shape (batch_size, input_dim).
            stochastic: Selects between the default stochastic forward pass
                and a deterministic one (e.g. for test time) that should
                correspond to the expected output of the stochastic pass.

        Returns:
            outputs: Array of layer outputs of shape (batch_size, output_dim).

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError

    def bprop(self, inputs, outputs, grads_wrt_outputs):
        """Back propagates gradients through a layer.

        Intended to map gradients with respect to the layer outputs to
        gradients with respect to the layer inputs, matching the default
        stochastic forward pass. Not implemented: calling this always raises
        NotImplementedError.

        Args:
            inputs: Array of layer inputs of shape (batch_size, input_dim).
            outputs: Array of layer outputs calculated in forward pass of
                shape (batch_size, output_dim).
            grads_wrt_outputs: Array of gradients with respect to the layer
                outputs of shape (batch_size, output_dim).

        Returns:
            Array of gradients with respect to the layer inputs of shape
            (batch_size, input_dim).

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError

    def __repr__(self):
        template = 'DropoutLayer(incl_prob={0:.1f})'
        return template.format(self.incl_prob)
class ReshapeLayer(Layer):
    """Layer which reshapes dimensions of inputs."""

    def __init__(self, output_shape=None):
        """Create a new reshape layer object.

        Args:
            output_shape: Tuple (or other sequence of ints) specifying shape
                each input in batch should be reshaped to in outputs. This
                **excludes** the batch size so the shape of the final output
                array will be
                    (batch_size, ) + output_shape
                Similarly to numpy.reshape, one shape dimension can be -1. In
                this case, the value is inferred from the size of the input
                array and remaining dimensions. The shape specified must be
                compatible with the input array shape - i.e. the total number
                of values in the array cannot be changed. If set to `None` the
                output shape will be set to
                    (batch_size, -1)
                which will flatten all the inputs to vectors.
        """
        self.output_shape = (-1,) if output_shape is None else output_shape

    def fprop(self, inputs):
        """Forward propagates activations through the layer transformation.

        Args:
            inputs: Array of layer inputs of shape (batch_size, input_dim).

        Returns:
            outputs: Array of layer outputs of shape
                (batch_size, ) + output_shape.
        """
        # Coerce to a tuple here so a list-valued output_shape does not break
        # the concatenation with the batch dimension.
        return inputs.reshape((inputs.shape[0],) + tuple(self.output_shape))

    def bprop(self, inputs, outputs, grads_wrt_outputs):
        """Back propagates gradients through a layer.

        Reshaping does not change any values, so the gradients are simply
        reshaped back to the shape of the layer inputs.

        Args:
            inputs: Array of layer inputs of shape (batch_size, input_dim).
            outputs: Array of layer outputs calculated in forward pass of
                shape (batch_size, output_dim).
            grads_wrt_outputs: Array of gradients with respect to the layer
                outputs of shape (batch_size, output_dim).

        Returns:
            Array of gradients with respect to the layer inputs, with the
            same shape as `inputs`.
        """
        return grads_wrt_outputs.reshape(inputs.shape)

    def __repr__(self):
        return 'ReshapeLayer(output_shape={0})'.format(self.output_shape)

View File

@ -160,3 +160,158 @@ class MomentumLearningRule(GradientDescentLearningRule):
mom *= self.mom_coeff mom *= self.mom_coeff
mom -= self.learning_rate * grad mom -= self.learning_rate * grad
param += mom param += mom
class AdamLearningRule(GradientDescentLearningRule):
    """Adaptive moments (Adam) learning rule.

    First-order gradient-descent based learning rule which uses adaptive
    estimates of first and second moments of the parameter gradients to
    calculate the parameter updates.

    References:
      [1]: Adam: a method for stochastic optimisation
           Kingma and Ba, 2015
    """

    def __init__(self, learning_rate=1e-3, beta_1=0.9, beta_2=0.999,
                 epsilon=1e-8):
        """Creates a new learning rule object.

        Args:
            learning_rate: A positive scalar to scale gradient updates to the
                parameters by. This needs to be carefully set - if too large
                the learning dynamic will be unstable and may diverge, while
                if set too small learning will proceed very slowly.
            beta_1: Exponential decay rate for gradient first moment estimates.
                This should be a scalar value in [0, 1]. The running gradient
                first moment estimate is calculated using
                `m_1 = beta_1 * m_1_prev + (1 - beta_1) * g`
                where `m_1_prev` is the previous estimate and `g` the current
                parameter gradients.
            beta_2: Exponential decay rate for gradient second moment
                estimates. This should be a scalar value in [0, 1]. The running
                gradient second moment estimate is calculated using
                `m_2 = beta_2 * m_2_prev + (1 - beta_2) * g**2`
                where `m_2_prev` is the previous estimate and `g` the current
                parameter gradients.
            epsilon: 'Softening' parameter to stop updates diverging when
                second moment estimates are close to zero. Should be set to
                a small positive value.
        """
        super(AdamLearningRule, self).__init__(learning_rate)
        assert beta_1 >= 0. and beta_1 <= 1., 'beta_1 should be in [0, 1].'
        assert beta_2 >= 0. and beta_2 <= 1., 'beta_2 should be in [0, 1].'
        assert epsilon > 0., 'epsilon should be > 0.'
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.epsilon = epsilon

    def initialise(self, params):
        """Initialises the state of the learning rule for a set of parameters.

        This must be called before `update_params` is first called.

        Args:
            params: A list of the parameters to be optimised. Note these will
                be updated *in-place* to avoid reallocating arrays on each
                update.
        """
        super(AdamLearningRule, self).initialise(params)
        # One zero-initialised moment estimate per parameter array.
        self.moms_1 = [np.zeros_like(param) for param in self.params]
        self.moms_2 = [np.zeros_like(param) for param in self.params]
        self.step_count = 0

    def reset(self):
        """Resets any additional state variables to their initial values.

        For this learning rule this corresponds to zeroing the estimates of
        the first and second moments of the gradients.
        """
        for mom_1, mom_2 in zip(self.moms_1, self.moms_2):
            # fill() rather than `*= 0.` so that estimates which have become
            # inf or NaN are genuinely returned to zero.
            mom_1.fill(0.)
            mom_2.fill(0.)
        self.step_count = 0

    def update_params(self, grads_wrt_params):
        """Applies a single update to all parameters.

        All parameter updates are performed using in-place operations and so
        nothing is returned.

        Args:
            grads_wrt_params: A list of gradients of the scalar loss function
                with respect to each of the parameters passed to `initialise`
                previously, with this list expected to be in the same order.
        """
        # The bias correction of both moment estimates is folded into a
        # single effective step size; it depends only on the step index so is
        # computed once for all parameters.
        step = self.step_count + 1
        alpha_t = (
            self.learning_rate *
            (1. - self.beta_2 ** step) ** 0.5 /
            (1. - self.beta_1 ** step)
        )
        for param, mom_1, mom_2, grad in zip(
                self.params, self.moms_1, self.moms_2, grads_wrt_params):
            mom_1 *= self.beta_1
            mom_1 += (1. - self.beta_1) * grad
            mom_2 *= self.beta_2
            mom_2 += (1. - self.beta_2) * grad ** 2
            param -= alpha_t * mom_1 / (mom_2 ** 0.5 + self.epsilon)
        self.step_count += 1
class AdaGradLearningRule(GradientDescentLearningRule):
    """Adaptive gradients (AdaGrad) learning rule.

    First-order gradient-descent rule in which each gradient update is
    normalised by the running sum of the squared gradients seen so far.

    References:
      [1]: Adaptive Subgradient Methods for Online Learning and Stochastic
           Optimization. Duchi, Hazan and Singer, 2011
    """

    def __init__(self, learning_rate=1e-2, epsilon=1e-8):
        """Creates a new learning rule object.

        Args:
            learning_rate: A positive scalar the gradient updates are scaled
                by. Too large a value can make learning unstable or
                divergent, too small a value makes learning very slow.
            epsilon: 'Softening' parameter that stops updates diverging when
                the sums of squared gradients are close to zero. Should be a
                small positive value.
        """
        super(AdaGradLearningRule, self).__init__(learning_rate)
        assert epsilon > 0., 'epsilon should be > 0.'
        self.epsilon = epsilon

    def initialise(self, params):
        """Initialises the state of the learning rule for a set of parameters.

        Must be called before the first call to `update_params`.

        Args:
            params: A list of the parameters to be optimised. These are
                updated *in-place* to avoid reallocating arrays on each
                update.
        """
        super(AdaGradLearningRule, self).initialise(params)
        # One zero-initialised accumulator per parameter array.
        self.sum_sq_grads = [np.zeros_like(p) for p in self.params]

    def reset(self):
        """Resets any additional state variables to their initial values.

        For this learning rule that means zeroing, in place, every
        accumulated sum of squared gradients.
        """
        for accumulator in self.sum_sq_grads:
            accumulator *= 0.

    def update_params(self, grads_wrt_params):
        """Applies a single update to all parameters.

        Updates are done with in-place operations, so nothing is returned.

        Args:
            grads_wrt_params: A list of gradients of the scalar loss function
                with respect to each of the parameters passed to `initialise`
                previously, expected to be in the same order.
        """
        triples = zip(self.params, self.sum_sq_grads, grads_wrt_params)
        for param, sum_sq_grad, grad in triples:
            sum_sq_grad += grad ** 2
            # Root of the softened accumulated squared gradient.
            normaliser = (sum_sq_grad + self.epsilon) ** 0.5
            param -= self.learning_rate * grad / normaliser

View File

@ -8,7 +8,7 @@ outputs (and intermediate states) and for calculating gradients of scalar
functions of the outputs with respect to the model parameters. functions of the outputs with respect to the model parameters.
""" """
from mlp.layers import LayerWithParameters from mlp.layers import LayerWithParameters, StochasticLayer, StochasticLayerWithParameters
class SingleLayerModel(object): class SingleLayerModel(object):
@ -80,11 +80,11 @@ class MultipleLayerModel(object):
"""A list of all of the parameters of the model.""" """A list of all of the parameters of the model."""
params = [] params = []
for layer in self.layers: for layer in self.layers:
if isinstance(layer, LayerWithParameters): if isinstance(layer, LayerWithParameters) or isinstance(layer, StochasticLayerWithParameters):
params += layer.params params += layer.params
return params return params
def fprop(self, inputs): def fprop(self, inputs, evaluation=False):
"""Forward propagates a batch of inputs through the model. """Forward propagates a batch of inputs through the model.
Args: Args:
@ -97,7 +97,19 @@ class MultipleLayerModel(object):
""" """
activations = [inputs] activations = [inputs]
for i, layer in enumerate(self.layers): for i, layer in enumerate(self.layers):
activations.append(self.layers[i].fprop(activations[i])) if evaluation:
if issubclass(type(self.layers[i]), StochasticLayer) or issubclass(type(self.layers[i]),
StochasticLayerWithParameters):
current_activations = self.layers[i].fprop(activations[i], stochastic=False)
else:
current_activations = self.layers[i].fprop(activations[i])
else:
if issubclass(type(self.layers[i]), StochasticLayer) or issubclass(type(self.layers[i]),
StochasticLayerWithParameters):
current_activations = self.layers[i].fprop(activations[i], stochastic=True)
else:
current_activations = self.layers[i].fprop(activations[i])
activations.append(current_activations)
return activations return activations
def grads_wrt_params(self, activations, grads_wrt_outputs): def grads_wrt_params(self, activations, grads_wrt_outputs):
@ -119,7 +131,7 @@ class MultipleLayerModel(object):
inputs = activations[-i - 2] inputs = activations[-i - 2]
outputs = activations[-i - 1] outputs = activations[-i - 1]
grads_wrt_inputs = layer.bprop(inputs, outputs, grads_wrt_outputs) grads_wrt_inputs = layer.bprop(inputs, outputs, grads_wrt_outputs)
if isinstance(layer, LayerWithParameters): if isinstance(layer, LayerWithParameters) or isinstance(layer, StochasticLayerWithParameters):
grads_wrt_params += layer.grads_wrt_params( grads_wrt_params += layer.grads_wrt_params(
inputs, grads_wrt_outputs)[::-1] inputs, grads_wrt_outputs)[::-1]
grads_wrt_outputs = grads_wrt_inputs grads_wrt_outputs = grads_wrt_inputs

View File

@ -9,7 +9,7 @@ import time
import logging import logging
from collections import OrderedDict from collections import OrderedDict
import numpy as np import numpy as np
import tqdm
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -18,7 +18,7 @@ class Optimiser(object):
"""Basic model optimiser.""" """Basic model optimiser."""
def __init__(self, model, error, learning_rule, train_dataset, def __init__(self, model, error, learning_rule, train_dataset,
valid_dataset=None, data_monitors=None): valid_dataset=None, data_monitors=None, notebook=False):
"""Create a new optimiser instance. """Create a new optimiser instance.
Args: Args:
@ -43,6 +43,11 @@ class Optimiser(object):
self.data_monitors = OrderedDict([('error', error)]) self.data_monitors = OrderedDict([('error', error)])
if data_monitors is not None: if data_monitors is not None:
self.data_monitors.update(data_monitors) self.data_monitors.update(data_monitors)
self.notebook = notebook
if notebook:
self.tqdm_progress = tqdm.tqdm_notebook
else:
self.tqdm_progress = tqdm.tqdm
def do_training_epoch(self): def do_training_epoch(self):
"""Do a single training epoch. """Do a single training epoch.
@ -52,12 +57,15 @@ class Optimiser(object):
respect to all the model parameters and then updates the model respect to all the model parameters and then updates the model
parameters according to the learning rule. parameters according to the learning rule.
""" """
for inputs_batch, targets_batch in self.train_dataset: with self.tqdm_progress(total=self.train_dataset.num_batches) as train_progress_bar:
activations = self.model.fprop(inputs_batch) train_progress_bar.set_description("Ep Prog")
grads_wrt_outputs = self.error.grad(activations[-1], targets_batch) for inputs_batch, targets_batch in self.train_dataset:
grads_wrt_params = self.model.grads_wrt_params( activations = self.model.fprop(inputs_batch)
activations, grads_wrt_outputs) grads_wrt_outputs = self.error.grad(activations[-1], targets_batch)
self.learning_rule.update_params(grads_wrt_params) grads_wrt_params = self.model.grads_wrt_params(
activations, grads_wrt_outputs)
self.learning_rule.update_params(grads_wrt_params)
train_progress_bar.update(1)
def eval_monitors(self, dataset, label): def eval_monitors(self, dataset, label):
"""Evaluates the monitors for the given dataset. """Evaluates the monitors for the given dataset.
@ -72,7 +80,7 @@ class Optimiser(object):
data_mon_vals = OrderedDict([(key + label, 0.) for key data_mon_vals = OrderedDict([(key + label, 0.) for key
in self.data_monitors.keys()]) in self.data_monitors.keys()])
for inputs_batch, targets_batch in dataset: for inputs_batch, targets_batch in dataset:
activations = self.model.fprop(inputs_batch) activations = self.model.fprop(inputs_batch, evaluation=True)
for key, data_monitor in self.data_monitors.items(): for key, data_monitor in self.data_monitors.items():
data_mon_vals[key + label] += data_monitor( data_mon_vals[key + label] += data_monitor(
activations[-1], targets_batch) activations[-1], targets_batch)
@ -104,7 +112,7 @@ class Optimiser(object):
""" """
logger.info('Epoch {0}: {1:.1f}s to complete\n {2}'.format( logger.info('Epoch {0}: {1:.1f}s to complete\n {2}'.format(
epoch, epoch_time, epoch, epoch_time,
', '.join(['{0}={1:.2e}'.format(k, v) for (k, v) in stats.items()]) ', '.join(['{}={:.2e}'.format(k, v) for (k, v) in stats.items()])
)) ))
def train(self, num_epochs, stats_interval=5): def train(self, num_epochs, stats_interval=5):
@ -121,17 +129,20 @@ class Optimiser(object):
and the second being a dict mapping the labels for the statistics and the second being a dict mapping the labels for the statistics
recorded to their column index in the array. recorded to their column index in the array.
""" """
start_train_time = time.process_time() start_train_time = time.time()
run_stats = [list(self.get_epoch_stats().values())] run_stats = [list(self.get_epoch_stats().values())]
for epoch in range(1, num_epochs + 1): with self.tqdm_progress(total=num_epochs) as progress_bar:
start_time = time.process_time() progress_bar.set_description("Exp Prog")
self.do_training_epoch() for epoch in range(1, num_epochs + 1):
epoch_time = time.process_time() - start_time start_time = time.time()
if epoch % stats_interval == 0: self.do_training_epoch()
stats = self.get_epoch_stats() epoch_time = time.time()- start_time
self.log_stats(epoch, epoch_time, stats) if epoch % stats_interval == 0:
run_stats.append(list(stats.values())) stats = self.get_epoch_stats()
finish_train_time = time.process_time() self.log_stats(epoch, epoch_time, stats)
run_stats.append(list(stats.values()))
progress_bar.update(1)
finish_train_time = time.time()
total_train_time = finish_train_time - start_train_time total_train_time = finish_train_time - start_train_time
return np.array(run_stats), {k: i for i, k in enumerate(stats.keys())}, total_train_time return np.array(run_stats), {k: i for i, k in enumerate(stats.keys())}, total_train_time

View File

@ -32,3 +32,42 @@ class ConstantLearningRateScheduler(object):
epoch_number: Integer index of training epoch about to be run. epoch_number: Integer index of training epoch about to be run.
""" """
learning_rule.learning_rate = self.learning_rate learning_rule.learning_rate = self.learning_rate
class CosineAnnealingWithWarmRestarts(object):
    """Cosine annealing scheduler, implemented as in https://arxiv.org/pdf/1608.03983.pdf"""

    def __init__(self, min_learning_rate, max_learning_rate, total_iters_per_period, max_learning_rate_discount_factor,
                 period_iteration_expansion_factor):
        """Store the configuration of the scheduler.

        Args:
            min_learning_rate: The minimum learning rate the scheduler can
                assign.
            max_learning_rate: The maximum learning rate the scheduler can
                assign.
            total_iters_per_period: The number of epochs in a period. Stored
                on the instance as `total_epochs_per_period`.
            max_learning_rate_discount_factor: The rate of discount for the
                maximum learning rate after each restart, i.e. how many times
                smaller the max learning rate will be after a restart
                compared to the previous one.
            period_iteration_expansion_factor: The rate of expansion of the
                period epochs, e.g. if set to 1 all periods have the same
                number of epochs, if larger than 1 each subsequent period
                has more epochs and vice versa.
        """
        # Learning-rate bounds and how the upper bound shrinks per restart.
        self.min_learning_rate = min_learning_rate
        self.max_learning_rate = max_learning_rate
        self.max_learning_rate_discount_factor = max_learning_rate_discount_factor
        # Period length and how it grows per restart.
        self.total_epochs_per_period = total_iters_per_period
        self.period_iteration_expansion_factor = period_iteration_expansion_factor

    def update_learning_rule(self, learning_rule, epoch_number):
        """Update the hyperparameters of the learning rule.

        Meant to be run at the beginning of each epoch. Not implemented:
        calling this always raises NotImplementedError.

        Args:
            learning_rule: Learning rule object being used in training run,
                any scheduled hyperparameters to be altered should be
                attributes of this object.
            epoch_number: Integer index of training epoch about to be run.

        Returns:
            effective_learning_rate at step 'epoch_number'

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError