Add missing files
This commit is contained in:
parent
4657cca862
commit
5d52a22448
@ -16,7 +16,7 @@ class DataProvider(object):
|
|||||||
"""Generic data provider."""
|
"""Generic data provider."""
|
||||||
|
|
||||||
def __init__(self, inputs, targets, batch_size, max_num_batches=-1,
|
def __init__(self, inputs, targets, batch_size, max_num_batches=-1,
|
||||||
shuffle_order=True, rng=None):
|
shuffle_order=True, rng=None, smooth_labels=False):
|
||||||
"""Create a new data provider object.
|
"""Create a new data provider object.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@ -32,26 +32,60 @@ class DataProvider(object):
|
|||||||
shuffle_order (bool): Whether to randomly permute the order of
|
shuffle_order (bool): Whether to randomly permute the order of
|
||||||
the data before each epoch.
|
the data before each epoch.
|
||||||
rng (RandomState): A seeded random number generator.
|
rng (RandomState): A seeded random number generator.
|
||||||
|
smooth_labels (bool): turn on label smoothing
|
||||||
"""
|
"""
|
||||||
self.inputs = inputs
|
self.inputs = inputs
|
||||||
self.targets = targets
|
self.targets = targets
|
||||||
self.batch_size = batch_size
|
if batch_size < 1:
|
||||||
assert max_num_batches != 0 and not max_num_batches < -1, (
|
raise ValueError('batch_size must be >= 1')
|
||||||
'max_num_batches should be -1 or > 0')
|
self._batch_size = batch_size
|
||||||
self.max_num_batches = max_num_batches
|
if max_num_batches == 0 or max_num_batches < -1:
|
||||||
|
raise ValueError('max_num_batches must be -1 or > 0')
|
||||||
|
self._max_num_batches = max_num_batches
|
||||||
|
self._update_num_batches()
|
||||||
|
self.shuffle_order = shuffle_order
|
||||||
|
|
||||||
|
self._current_order = np.arange(inputs.shape[0])
|
||||||
|
if rng is None:
|
||||||
|
rng = np.random.RandomState(DEFAULT_SEED)
|
||||||
|
self.rng = rng
|
||||||
|
self.smooth_labels = smooth_labels
|
||||||
|
self.new_epoch()
|
||||||
|
|
||||||
|
@property
|
||||||
|
def batch_size(self):
|
||||||
|
"""Number of data points to include in each batch."""
|
||||||
|
return self._batch_size
|
||||||
|
|
||||||
|
@batch_size.setter
|
||||||
|
def batch_size(self, value):
|
||||||
|
if value < 1:
|
||||||
|
raise ValueError('batch_size must be >= 1')
|
||||||
|
self._batch_size = value
|
||||||
|
self._update_num_batches()
|
||||||
|
|
||||||
|
@property
|
||||||
|
def max_num_batches(self):
|
||||||
|
"""Maximum number of batches to iterate over in an epoch."""
|
||||||
|
return self._max_num_batches
|
||||||
|
|
||||||
|
@max_num_batches.setter
|
||||||
|
def max_num_batches(self, value):
|
||||||
|
if value == 0 or value < -1:
|
||||||
|
raise ValueError('max_num_batches must be -1 or > 0')
|
||||||
|
self._max_num_batches = value
|
||||||
|
self._update_num_batches()
|
||||||
|
|
||||||
|
def _update_num_batches(self):
|
||||||
|
"""Updates number of batches to iterate over."""
|
||||||
# maximum possible number of batches is equal to number of whole times
|
# maximum possible number of batches is equal to number of whole times
|
||||||
# batch_size divides in to the number of data points which can be
|
# batch_size divides in to the number of data points which can be
|
||||||
# found using integer division
|
# found using integer division
|
||||||
possible_num_batches = self.inputs.shape[0] // batch_size
|
possible_num_batches = self.inputs.shape[0] // self.batch_size
|
||||||
if self.max_num_batches == -1:
|
if self.max_num_batches == -1:
|
||||||
self.num_batches = possible_num_batches
|
self.num_batches = possible_num_batches
|
||||||
else:
|
else:
|
||||||
self.num_batches = min(self.max_num_batches, possible_num_batches)
|
self.num_batches = min(self.max_num_batches, possible_num_batches)
|
||||||
self.shuffle_order = shuffle_order
|
|
||||||
if rng is None:
|
|
||||||
rng = np.random.RandomState(DEFAULT_SEED)
|
|
||||||
self.rng = rng
|
|
||||||
self.reset()
|
|
||||||
|
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
"""Implements Python iterator interface.
|
"""Implements Python iterator interface.
|
||||||
@ -63,27 +97,36 @@ class DataProvider(object):
|
|||||||
"""
|
"""
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def reset(self):
|
def new_epoch(self):
|
||||||
"""Resets the provider to the initial state to use in a new epoch."""
|
"""Starts a new epoch (pass through data), possibly shuffling first."""
|
||||||
self._curr_batch = 0
|
self._curr_batch = 0
|
||||||
if self.shuffle_order:
|
if self.shuffle_order:
|
||||||
self.shuffle()
|
self.shuffle()
|
||||||
|
|
||||||
def shuffle(self):
|
|
||||||
"""Randomly shuffles order of data."""
|
|
||||||
new_order = self.rng.permutation(self.inputs.shape[0])
|
|
||||||
self.inputs = self.inputs[new_order]
|
|
||||||
self.targets = self.targets[new_order]
|
|
||||||
|
|
||||||
def __next__(self):
|
def __next__(self):
|
||||||
return self.next()
|
return self.next()
|
||||||
|
|
||||||
|
def reset(self):
|
||||||
|
"""Resets the provider to the initial state."""
|
||||||
|
inv_perm = np.argsort(self._current_order)
|
||||||
|
self._current_order = self._current_order[inv_perm]
|
||||||
|
self.inputs = self.inputs[inv_perm]
|
||||||
|
self.targets = self.targets[inv_perm]
|
||||||
|
self.new_epoch()
|
||||||
|
|
||||||
|
def shuffle(self):
|
||||||
|
"""Randomly shuffles order of data."""
|
||||||
|
perm = self.rng.permutation(self.inputs.shape[0])
|
||||||
|
self._current_order = self._current_order[perm]
|
||||||
|
self.inputs = self.inputs[perm]
|
||||||
|
self.targets = self.targets[perm]
|
||||||
|
|
||||||
def next(self):
|
def next(self):
|
||||||
"""Returns next data batch or raises `StopIteration` if at end."""
|
"""Returns next data batch or raises `StopIteration` if at end."""
|
||||||
if self._curr_batch + 1 > self.num_batches:
|
if self._curr_batch + 1 > self.num_batches:
|
||||||
# no more batches in current iteration through data set so reset
|
# no more batches in current iteration through data set so start
|
||||||
# the dataset for another pass and indicate iteration is at end
|
# new epoch ready for another pass and indicate iteration is at end
|
||||||
self.reset()
|
self.new_epoch()
|
||||||
raise StopIteration()
|
raise StopIteration()
|
||||||
# create an index slice corresponding to current batch number
|
# create an index slice corresponding to current batch number
|
||||||
batch_slice = slice(self._curr_batch * self.batch_size,
|
batch_slice = slice(self._curr_batch * self.batch_size,
|
||||||
@ -93,12 +136,11 @@ class DataProvider(object):
|
|||||||
self._curr_batch += 1
|
self._curr_batch += 1
|
||||||
return inputs_batch, targets_batch
|
return inputs_batch, targets_batch
|
||||||
|
|
||||||
|
|
||||||
class MNISTDataProvider(DataProvider):
|
class MNISTDataProvider(DataProvider):
|
||||||
"""Data provider for MNIST handwritten digit images."""
|
"""Data provider for MNIST handwritten digit images."""
|
||||||
|
|
||||||
def __init__(self, which_set='train', batch_size=100, max_num_batches=-1,
|
def __init__(self, which_set='train', batch_size=100, max_num_batches=-1,
|
||||||
shuffle_order=True, rng=None):
|
shuffle_order=True, rng=None, smooth_labels=False):
|
||||||
"""Create a new MNIST data provider object.
|
"""Create a new MNIST data provider object.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@ -112,9 +154,10 @@ class MNISTDataProvider(DataProvider):
|
|||||||
shuffle_order (bool): Whether to randomly permute the order of
|
shuffle_order (bool): Whether to randomly permute the order of
|
||||||
the data before each epoch.
|
the data before each epoch.
|
||||||
rng (RandomState): A seeded random number generator.
|
rng (RandomState): A seeded random number generator.
|
||||||
|
smooth_labels (bool): enable/disable label smoothing
|
||||||
"""
|
"""
|
||||||
# check a valid which_set was provided
|
# check a valid which_set was provided
|
||||||
assert which_set in ['train', 'valid', 'eval'], (
|
assert which_set in ['train', 'valid', 'test'], (
|
||||||
'Expected which_set to be either train, valid or eval. '
|
'Expected which_set to be either train, valid or eval. '
|
||||||
'Got {0}'.format(which_set)
|
'Got {0}'.format(which_set)
|
||||||
)
|
)
|
||||||
@ -134,7 +177,7 @@ class MNISTDataProvider(DataProvider):
|
|||||||
inputs = inputs.astype(np.float32)
|
inputs = inputs.astype(np.float32)
|
||||||
# pass the loaded data to the parent class __init__
|
# pass the loaded data to the parent class __init__
|
||||||
super(MNISTDataProvider, self).__init__(
|
super(MNISTDataProvider, self).__init__(
|
||||||
inputs, targets, batch_size, max_num_batches, shuffle_order, rng)
|
inputs, targets, batch_size, max_num_batches, shuffle_order, rng, smooth_labels)
|
||||||
|
|
||||||
def next(self):
|
def next(self):
|
||||||
"""Returns next data batch or raises `StopIteration` if at end."""
|
"""Returns next data batch or raises `StopIteration` if at end."""
|
||||||
@ -160,6 +203,102 @@ class MNISTDataProvider(DataProvider):
|
|||||||
one_of_k_targets[range(int_targets.shape[0]), int_targets] = 1
|
one_of_k_targets[range(int_targets.shape[0]), int_targets] = 1
|
||||||
return one_of_k_targets
|
return one_of_k_targets
|
||||||
|
|
||||||
|
class EMNISTDataProvider(DataProvider):
|
||||||
|
"""Data provider for EMNIST handwritten digit images."""
|
||||||
|
|
||||||
|
def __init__(self, which_set='train', batch_size=100, max_num_batches=-1,
|
||||||
|
shuffle_order=True, rng=None, smooth_labels=False):
|
||||||
|
"""Create a new EMNIST data provider object.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
which_set: One of 'train', 'valid' or 'eval'. Determines which
|
||||||
|
portion of the EMNIST data this object should provide.
|
||||||
|
batch_size (int): Number of data points to include in each batch.
|
||||||
|
max_num_batches (int): Maximum number of batches to iterate over
|
||||||
|
in an epoch. If `max_num_batches * batch_size > num_data` then
|
||||||
|
only as many batches as the data can be split into will be
|
||||||
|
used. If set to -1 all of the data will be used.
|
||||||
|
shuffle_order (bool): Whether to randomly permute the order of
|
||||||
|
the data before each epoch.
|
||||||
|
rng (RandomState): A seeded random number generator.
|
||||||
|
smooth_labels (bool): enable/disable label smoothing
|
||||||
|
"""
|
||||||
|
# check a valid which_set was provided
|
||||||
|
assert which_set in ['train', 'valid', 'test'], (
|
||||||
|
'Expected which_set to be either train, valid or eval. '
|
||||||
|
'Got {0}'.format(which_set)
|
||||||
|
)
|
||||||
|
self.which_set = which_set
|
||||||
|
self.num_classes = 47
|
||||||
|
# construct path to data using os.path.join to ensure the correct path
|
||||||
|
# separator for the current platform / OS is used
|
||||||
|
# MLP_DATA_DIR environment variable should point to the data directory
|
||||||
|
data_path = os.path.join(
|
||||||
|
os.environ['MLP_DATA_DIR'], 'emnist-{0}.npz'.format(which_set))
|
||||||
|
assert os.path.isfile(data_path), (
|
||||||
|
'Data file does not exist at expected path: ' + data_path
|
||||||
|
)
|
||||||
|
# load data from compressed numpy file
|
||||||
|
loaded = np.load(data_path)
|
||||||
|
print(loaded.keys())
|
||||||
|
inputs, targets = loaded['inputs'], loaded['targets']
|
||||||
|
inputs = inputs.astype(np.float32)
|
||||||
|
inputs = np.reshape(inputs, newshape=(-1, 28*28))
|
||||||
|
inputs = inputs / 255.0
|
||||||
|
# pass the loaded data to the parent class __init__
|
||||||
|
super(EMNISTDataProvider, self).__init__(
|
||||||
|
inputs, targets, batch_size, max_num_batches, shuffle_order, rng, smooth_labels)
|
||||||
|
|
||||||
|
def next(self):
|
||||||
|
"""Returns next data batch or raises `StopIteration` if at end."""
|
||||||
|
inputs_batch, targets_batch = super(EMNISTDataProvider, self).next()
|
||||||
|
|
||||||
|
if self.smooth_labels:
|
||||||
|
targets_batch_mat = self.label_smoothing(targets_batch)
|
||||||
|
else:
|
||||||
|
targets_batch_mat = self.to_one_of_k(targets_batch)
|
||||||
|
return inputs_batch, targets_batch_mat
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def to_one_of_k(self, int_targets):
|
||||||
|
"""Converts integer coded class target to 1 of K coded targets.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
int_targets (ndarray): Array of integer coded class targets (i.e.
|
||||||
|
where an integer from 0 to `num_classes` - 1 is used to
|
||||||
|
indicate which is the correct class). This should be of shape
|
||||||
|
(num_data,).
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Array of 1 of K coded targets i.e. an array of shape
|
||||||
|
(num_data, num_classes) where for each row all elements are equal
|
||||||
|
to zero except for the column corresponding to the correct class
|
||||||
|
which is equal to one.
|
||||||
|
"""
|
||||||
|
one_of_k_targets = np.zeros((int_targets.shape[0], self.num_classes))
|
||||||
|
one_of_k_targets[range(int_targets.shape[0]), int_targets] = 1
|
||||||
|
return one_of_k_targets
|
||||||
|
|
||||||
|
def label_smoothing(self, int_targets, alpha=0.1):
|
||||||
|
"""Converts integer coded class target to 1 of K coded targets with label smoothing.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
int_targets (ndarray): Array of integer coded class targets (i.e.
|
||||||
|
where an integer from 0 to `num_classes` - 1 is used to
|
||||||
|
indicate which is the correct class). This should be of shape
|
||||||
|
(num_data,).
|
||||||
|
alpha (float): Smoothing factor.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Array of 1 of K coded targets with label smoothing i.e. an array of shape
|
||||||
|
(num_data, num_classes)
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class MetOfficeDataProvider(DataProvider):
|
class MetOfficeDataProvider(DataProvider):
|
||||||
"""South Scotland Met Office weather data provider."""
|
"""South Scotland Met Office weather data provider."""
|
||||||
@ -253,3 +392,41 @@ class CCPPDataProvider(DataProvider):
|
|||||||
targets = loaded[which_set + '_targets']
|
targets = loaded[which_set + '_targets']
|
||||||
super(CCPPDataProvider, self).__init__(
|
super(CCPPDataProvider, self).__init__(
|
||||||
inputs, targets, batch_size, max_num_batches, shuffle_order, rng)
|
inputs, targets, batch_size, max_num_batches, shuffle_order, rng)
|
||||||
|
|
||||||
|
|
||||||
|
class AugmentedMNISTDataProvider(MNISTDataProvider):
|
||||||
|
"""Data provider for MNIST dataset which randomly transforms images."""
|
||||||
|
|
||||||
|
def __init__(self, which_set='train', batch_size=100, max_num_batches=-1,
|
||||||
|
shuffle_order=True, rng=None, transformer=None):
|
||||||
|
"""Create a new augmented MNIST data provider object.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
which_set: One of 'train', 'valid' or 'test'. Determines which
|
||||||
|
portion of the MNIST data this object should provide.
|
||||||
|
batch_size (int): Number of data points to include in each batch.
|
||||||
|
max_num_batches (int): Maximum number of batches to iterate over
|
||||||
|
in an epoch. If `max_num_batches * batch_size > num_data` then
|
||||||
|
only as many batches as the data can be split into will be
|
||||||
|
used. If set to -1 all of the data will be used.
|
||||||
|
shuffle_order (bool): Whether to randomly permute the order of
|
||||||
|
the data before each epoch.
|
||||||
|
rng (RandomState): A seeded random number generator.
|
||||||
|
transformer: Function which takes an `inputs` array of shape
|
||||||
|
(batch_size, input_dim) corresponding to a batch of input
|
||||||
|
images and a `rng` random number generator object (i.e. a
|
||||||
|
call signature `transformer(inputs, rng)`) and applies a
|
||||||
|
potentiall random set of transformations to some / all of the
|
||||||
|
input images as each new batch is returned when iterating over
|
||||||
|
the data provider.
|
||||||
|
"""
|
||||||
|
super(AugmentedMNISTDataProvider, self).__init__(
|
||||||
|
which_set, batch_size, max_num_batches, shuffle_order, rng)
|
||||||
|
self.transformer = transformer
|
||||||
|
|
||||||
|
def next(self):
|
||||||
|
"""Returns next data batch or raises `StopIteration` if at end."""
|
||||||
|
inputs_batch, targets_batch = super(
|
||||||
|
AugmentedMNISTDataProvider, self).next()
|
||||||
|
transformed_inputs_batch = self.transformer(inputs_batch, self.rng)
|
||||||
|
return transformed_inputs_batch, targets_batch
|
||||||
|
@ -154,9 +154,9 @@ class CrossEntropySoftmaxError(object):
|
|||||||
Returns:
|
Returns:
|
||||||
Scalar error function value.
|
Scalar error function value.
|
||||||
"""
|
"""
|
||||||
probs = np.exp(outputs)
|
normOutputs = outputs - outputs.max(-1)[:, None]
|
||||||
probs /= probs.sum(-1)[:, None]
|
logProb = normOutputs - np.log(np.sum(np.exp(normOutputs), axis=-1)[:, None])
|
||||||
return -np.mean(np.sum(targets * np.log(probs), axis=1))
|
return -np.mean(np.sum(targets * logProb, axis=1))
|
||||||
|
|
||||||
def grad(self, outputs, targets):
|
def grad(self, outputs, targets):
|
||||||
"""Calculates gradient of error function with respect to outputs.
|
"""Calculates gradient of error function with respect to outputs.
|
||||||
@ -168,7 +168,7 @@ class CrossEntropySoftmaxError(object):
|
|||||||
Returns:
|
Returns:
|
||||||
Gradient of error function with respect to outputs.
|
Gradient of error function with respect to outputs.
|
||||||
"""
|
"""
|
||||||
probs = np.exp(outputs)
|
probs = np.exp(outputs - outputs.max(-1)[:, None])
|
||||||
probs /= probs.sum(-1)[:, None]
|
probs /= probs.sum(-1)[:, None]
|
||||||
return (probs - targets) / outputs.shape[0]
|
return (probs - targets) / outputs.shape[0]
|
||||||
|
|
||||||
|
@ -63,3 +63,81 @@ class NormalInit(object):
|
|||||||
|
|
||||||
def __call__(self, shape):
|
def __call__(self, shape):
|
||||||
return self.rng.normal(loc=self.mean, scale=self.std, size=shape)
|
return self.rng.normal(loc=self.mean, scale=self.std, size=shape)
|
||||||
|
|
||||||
|
class GlorotUniformInit(object):
|
||||||
|
"""Glorot and Bengio (2010) random uniform weights initialiser.
|
||||||
|
|
||||||
|
Initialises an two-dimensional parameter array using the 'normalized
|
||||||
|
initialisation' scheme suggested in [1] which attempts to maintain a
|
||||||
|
roughly constant variance in the activations and backpropagated gradients
|
||||||
|
of a multi-layer model consisting of interleaved affine and logistic
|
||||||
|
sigmoidal transformation layers.
|
||||||
|
|
||||||
|
Weights are sampled from a zero-mean uniform distribution with standard
|
||||||
|
deviation `sqrt(2 / (input_dim * output_dim))` where `input_dim` and
|
||||||
|
`output_dim` are the input and output dimensions of the weight matrix
|
||||||
|
respectively.
|
||||||
|
|
||||||
|
References:
|
||||||
|
[1]: Understanding the difficulty of training deep feedforward neural
|
||||||
|
networks, Glorot and Bengio (2010)
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, gain=1., rng=None):
|
||||||
|
"""Construct a normalised initilisation random initialiser object.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
gain: Multiplicative factor to scale initialised weights by.
|
||||||
|
Recommended values is 1 for affine layers followed by
|
||||||
|
logistic sigmoid layers (or another affine layer).
|
||||||
|
rng (RandomState): Seeded random number generator.
|
||||||
|
"""
|
||||||
|
self.gain = gain
|
||||||
|
if rng is None:
|
||||||
|
rng = np.random.RandomState(DEFAULT_SEED)
|
||||||
|
self.rng = rng
|
||||||
|
|
||||||
|
def __call__(self, shape):
|
||||||
|
assert len(shape) == 2, (
|
||||||
|
'Initialiser should only be used for two dimensional arrays.')
|
||||||
|
std = self.gain * (2. / (shape[0] + shape[1]))**0.5
|
||||||
|
half_width = 3.**0.5 * std
|
||||||
|
return self.rng.uniform(low=-half_width, high=half_width, size=shape)
|
||||||
|
|
||||||
|
|
||||||
|
class GlorotNormalInit(object):
|
||||||
|
"""Glorot and Bengio (2010) random normal weights initialiser.
|
||||||
|
|
||||||
|
Initialises an two-dimensional parameter array using the 'normalized
|
||||||
|
initialisation' scheme suggested in [1] which attempts to maintain a
|
||||||
|
roughly constant variance in the activations and backpropagated gradients
|
||||||
|
of a multi-layer model consisting of interleaved affine and logistic
|
||||||
|
sigmoidal transformation layers.
|
||||||
|
|
||||||
|
Weights are sampled from a zero-mean normal distribution with standard
|
||||||
|
deviation `sqrt(2 / (input_dim * output_dim))` where `input_dim` and
|
||||||
|
`output_dim` are the input and output dimensions of the weight matrix
|
||||||
|
respectively.
|
||||||
|
|
||||||
|
References:
|
||||||
|
[1]: Understanding the difficulty of training deep feedforward neural
|
||||||
|
networks, Glorot and Bengio (2010)
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, gain=1., rng=None):
|
||||||
|
"""Construct a normalised initilisation random initialiser object.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
gain: Multiplicative factor to scale initialised weights by.
|
||||||
|
Recommended values is 1 for affine layers followed by
|
||||||
|
logistic sigmoid layers (or another affine layer).
|
||||||
|
rng (RandomState): Seeded random number generator.
|
||||||
|
"""
|
||||||
|
self.gain = gain
|
||||||
|
if rng is None:
|
||||||
|
rng = np.random.RandomState(DEFAULT_SEED)
|
||||||
|
self.rng = rng
|
||||||
|
|
||||||
|
def __call__(self, shape):
|
||||||
|
std = self.gain * (2. / (shape[0] + shape[1]))**0.5
|
||||||
|
return self.rng.normal(loc=0., scale=std, size=shape)
|
||||||
|
493
mlp/layers.py
493
mlp/layers.py
@ -14,7 +14,7 @@ respect to the layer parameters.
|
|||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import mlp.initialisers as init
|
import mlp.initialisers as init
|
||||||
|
from mlp import DEFAULT_SEED
|
||||||
|
|
||||||
class Layer(object):
|
class Layer(object):
|
||||||
"""Abstract class defining the interface for a layer."""
|
"""Abstract class defining the interface for a layer."""
|
||||||
@ -68,6 +68,13 @@ class LayerWithParameters(Layer):
|
|||||||
"""
|
"""
|
||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
def params_penalty(self):
|
||||||
|
"""Returns the parameter dependent penalty term for this layer.
|
||||||
|
|
||||||
|
If no parameter-dependent penalty terms are set this returns zero.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def params(self):
|
def params(self):
|
||||||
"""Returns a list of parameters of layer.
|
"""Returns a list of parameters of layer.
|
||||||
@ -88,6 +95,127 @@ class LayerWithParameters(Layer):
|
|||||||
"""
|
"""
|
||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
class StochasticLayerWithParameters(Layer):
|
||||||
|
"""Specialised layer which uses a stochastic forward propagation."""
|
||||||
|
|
||||||
|
def __init__(self, rng=None):
|
||||||
|
"""Constructs a new StochasticLayer object.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
rng (RandomState): Seeded random number generator object.
|
||||||
|
"""
|
||||||
|
if rng is None:
|
||||||
|
rng = np.random.RandomState(DEFAULT_SEED)
|
||||||
|
self.rng = rng
|
||||||
|
|
||||||
|
def fprop(self, inputs, stochastic=True):
|
||||||
|
"""Forward propagates activations through the layer transformation.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
inputs: Array of layer inputs of shape (batch_size, input_dim).
|
||||||
|
stochastic: Flag allowing different deterministic
|
||||||
|
forward-propagation mode in addition to default stochastic
|
||||||
|
forward-propagation e.g. for use at test time. If False
|
||||||
|
a deterministic forward-propagation transformation
|
||||||
|
corresponding to the expected output of the stochastic
|
||||||
|
forward-propagation is applied.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
outputs: Array of layer outputs of shape (batch_size, output_dim).
|
||||||
|
"""
|
||||||
|
raise NotImplementedError()
|
||||||
|
def grads_wrt_params(self, inputs, grads_wrt_outputs):
|
||||||
|
"""Calculates gradients with respect to layer parameters.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
inputs: Array of inputs to layer of shape (batch_size, input_dim).
|
||||||
|
grads_wrt_to_outputs: Array of gradients with respect to the layer
|
||||||
|
outputs of shape (batch_size, output_dim).
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of arrays of gradients with respect to the layer parameters
|
||||||
|
with parameter gradients appearing in same order in tuple as
|
||||||
|
returned from `get_params` method.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
def params_penalty(self):
|
||||||
|
"""Returns the parameter dependent penalty term for this layer.
|
||||||
|
|
||||||
|
If no parameter-dependent penalty terms are set this returns zero.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
@property
|
||||||
|
def params(self):
|
||||||
|
"""Returns a list of parameters of layer.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of current parameter values. This list should be in the
|
||||||
|
corresponding order to the `values` argument to `set_params`.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
@params.setter
|
||||||
|
def params(self, values):
|
||||||
|
"""Sets layer parameters from a list of values.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
values: List of values to set parameters to. This list should be
|
||||||
|
in the corresponding order to what is returned by `get_params`.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
class StochasticLayer(Layer):
|
||||||
|
"""Specialised layer which uses a stochastic forward propagation."""
|
||||||
|
|
||||||
|
def __init__(self, rng=None):
|
||||||
|
"""Constructs a new StochasticLayer object.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
rng (RandomState): Seeded random number generator object.
|
||||||
|
"""
|
||||||
|
if rng is None:
|
||||||
|
rng = np.random.RandomState(DEFAULT_SEED)
|
||||||
|
self.rng = rng
|
||||||
|
|
||||||
|
def fprop(self, inputs, stochastic=True):
|
||||||
|
"""Forward propagates activations through the layer transformation.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
inputs: Array of layer inputs of shape (batch_size, input_dim).
|
||||||
|
stochastic: Flag allowing different deterministic
|
||||||
|
forward-propagation mode in addition to default stochastic
|
||||||
|
forward-propagation e.g. for use at test time. If False
|
||||||
|
a deterministic forward-propagation transformation
|
||||||
|
corresponding to the expected output of the stochastic
|
||||||
|
forward-propagation is applied.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
outputs: Array of layer outputs of shape (batch_size, output_dim).
|
||||||
|
"""
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
def bprop(self, inputs, outputs, grads_wrt_outputs):
|
||||||
|
"""Back propagates gradients through a layer.
|
||||||
|
|
||||||
|
Given gradients with respect to the outputs of the layer calculates the
|
||||||
|
gradients with respect to the layer inputs. This should correspond to
|
||||||
|
default stochastic forward-propagation.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
inputs: Array of layer inputs of shape (batch_size, input_dim).
|
||||||
|
outputs: Array of layer outputs calculated in forward pass of
|
||||||
|
shape (batch_size, output_dim).
|
||||||
|
grads_wrt_outputs: Array of gradients with respect to the layer
|
||||||
|
outputs of shape (batch_size, output_dim).
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Array of gradients with respect to the layer inputs of shape
|
||||||
|
(batch_size, input_dim).
|
||||||
|
"""
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
|
||||||
class AffineLayer(LayerWithParameters):
|
class AffineLayer(LayerWithParameters):
|
||||||
"""Layer implementing an affine tranformation of its inputs.
|
"""Layer implementing an affine tranformation of its inputs.
|
||||||
@ -97,7 +225,8 @@ class AffineLayer(LayerWithParameters):
|
|||||||
|
|
||||||
def __init__(self, input_dim, output_dim,
|
def __init__(self, input_dim, output_dim,
|
||||||
weights_initialiser=init.UniformInit(-0.1, 0.1),
|
weights_initialiser=init.UniformInit(-0.1, 0.1),
|
||||||
biases_initialiser=init.ConstantInit(0.)):
|
biases_initialiser=init.ConstantInit(0.),
|
||||||
|
weights_penalty=None, biases_penalty=None):
|
||||||
"""Initialises a parameterised affine layer.
|
"""Initialises a parameterised affine layer.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@ -105,11 +234,17 @@ class AffineLayer(LayerWithParameters):
|
|||||||
output_dim (int): Dimension of the layer outputs.
|
output_dim (int): Dimension of the layer outputs.
|
||||||
weights_initialiser: Initialiser for the weight parameters.
|
weights_initialiser: Initialiser for the weight parameters.
|
||||||
biases_initialiser: Initialiser for the bias parameters.
|
biases_initialiser: Initialiser for the bias parameters.
|
||||||
|
weights_penalty: Weights-dependent penalty term (regulariser) or
|
||||||
|
None if no regularisation is to be applied to the weights.
|
||||||
|
biases_penalty: Biases-dependent penalty term (regulariser) or
|
||||||
|
None if no regularisation is to be applied to the biases.
|
||||||
"""
|
"""
|
||||||
self.input_dim = input_dim
|
self.input_dim = input_dim
|
||||||
self.output_dim = output_dim
|
self.output_dim = output_dim
|
||||||
self.weights = weights_initialiser((self.output_dim, self.input_dim))
|
self.weights = weights_initialiser((self.output_dim, self.input_dim))
|
||||||
self.biases = biases_initialiser(self.output_dim)
|
self.biases = biases_initialiser(self.output_dim)
|
||||||
|
self.weights_penalty = weights_penalty
|
||||||
|
self.biases_penalty = biases_penalty
|
||||||
|
|
||||||
def fprop(self, inputs):
|
def fprop(self, inputs):
|
||||||
"""Forward propagates activations through the layer transformation.
|
"""Forward propagates activations through the layer transformation.
|
||||||
@ -123,7 +258,7 @@ class AffineLayer(LayerWithParameters):
|
|||||||
Returns:
|
Returns:
|
||||||
outputs: Array of layer outputs of shape (batch_size, output_dim).
|
outputs: Array of layer outputs of shape (batch_size, output_dim).
|
||||||
"""
|
"""
|
||||||
return inputs.dot(self.weights.T) + self.biases
|
return self.weights.dot(inputs.T).T + self.biases
|
||||||
|
|
||||||
def bprop(self, inputs, outputs, grads_wrt_outputs):
|
def bprop(self, inputs, outputs, grads_wrt_outputs):
|
||||||
"""Back propagates gradients through a layer.
|
"""Back propagates gradients through a layer.
|
||||||
@ -159,8 +294,27 @@ class AffineLayer(LayerWithParameters):
|
|||||||
|
|
||||||
grads_wrt_weights = np.dot(grads_wrt_outputs.T, inputs)
|
grads_wrt_weights = np.dot(grads_wrt_outputs.T, inputs)
|
||||||
grads_wrt_biases = np.sum(grads_wrt_outputs, axis=0)
|
grads_wrt_biases = np.sum(grads_wrt_outputs, axis=0)
|
||||||
|
|
||||||
|
if self.weights_penalty is not None:
|
||||||
|
grads_wrt_weights += self.weights_penalty.grad(parameter=self.weights)
|
||||||
|
|
||||||
|
if self.biases_penalty is not None:
|
||||||
|
grads_wrt_biases += self.biases_penalty.grad(parameter=self.biases)
|
||||||
|
|
||||||
return [grads_wrt_weights, grads_wrt_biases]
|
return [grads_wrt_weights, grads_wrt_biases]
|
||||||
|
|
||||||
|
def params_penalty(self):
|
||||||
|
"""Returns the parameter dependent penalty term for this layer.
|
||||||
|
|
||||||
|
If no parameter-dependent penalty terms are set this returns zero.
|
||||||
|
"""
|
||||||
|
params_penalty = 0
|
||||||
|
if self.weights_penalty is not None:
|
||||||
|
params_penalty += self.weights_penalty(self.weights)
|
||||||
|
if self.biases_penalty is not None:
|
||||||
|
params_penalty += self.biases_penalty(self.biases)
|
||||||
|
return params_penalty
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def params(self):
|
def params(self):
|
||||||
"""A list of layer parameter values: `[weights, biases]`."""
|
"""A list of layer parameter values: `[weights, biases]`."""
|
||||||
@ -175,7 +329,6 @@ class AffineLayer(LayerWithParameters):
|
|||||||
return 'AffineLayer(input_dim={0}, output_dim={1})'.format(
|
return 'AffineLayer(input_dim={0}, output_dim={1})'.format(
|
||||||
self.input_dim, self.output_dim)
|
self.input_dim, self.output_dim)
|
||||||
|
|
||||||
|
|
||||||
class SigmoidLayer(Layer):
|
class SigmoidLayer(Layer):
|
||||||
"""Layer implementing an element-wise logistic sigmoid transformation."""
|
"""Layer implementing an element-wise logistic sigmoid transformation."""
|
||||||
|
|
||||||
@ -215,6 +368,160 @@ class SigmoidLayer(Layer):
|
|||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return 'SigmoidLayer'
|
return 'SigmoidLayer'
|
||||||
|
|
||||||
|
class ReluLayer(Layer):
|
||||||
|
"""Layer implementing an element-wise rectified linear transformation."""
|
||||||
|
|
||||||
|
def fprop(self, inputs):
|
||||||
|
"""Forward propagates activations through the layer transformation.
|
||||||
|
|
||||||
|
For inputs `x` and outputs `y` this corresponds to `y = max(0, x)`.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
inputs: Array of layer inputs of shape (batch_size, input_dim).
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
outputs: Array of layer outputs of shape (batch_size, output_dim).
|
||||||
|
"""
|
||||||
|
return np.maximum(inputs, 0.)
|
||||||
|
|
||||||
|
def bprop(self, inputs, outputs, grads_wrt_outputs):
|
||||||
|
"""Back propagates gradients through a layer.
|
||||||
|
|
||||||
|
Given gradients with respect to the outputs of the layer calculates the
|
||||||
|
gradients with respect to the layer inputs.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
inputs: Array of layer inputs of shape (batch_size, input_dim).
|
||||||
|
outputs: Array of layer outputs calculated in forward pass of
|
||||||
|
shape (batch_size, output_dim).
|
||||||
|
grads_wrt_outputs: Array of gradients with respect to the layer
|
||||||
|
outputs of shape (batch_size, output_dim).
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Array of gradients with respect to the layer inputs of shape
|
||||||
|
(batch_size, input_dim).
|
||||||
|
"""
|
||||||
|
return (outputs > 0) * grads_wrt_outputs
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return 'ReluLayer'
|
||||||
|
|
||||||
|
class LeakyReluLayer(Layer):
|
||||||
|
"""Layer implementing an element-wise leaky rectified linear transformation."""
|
||||||
|
def __init__(self, alpha=0.01):
|
||||||
|
self.alpha = alpha
|
||||||
|
|
||||||
|
def fprop(self, inputs):
|
||||||
|
"""Forward propagates activations through the layer transformation.
|
||||||
|
|
||||||
|
For inputs `x` and outputs `y` this corresponds to `y = ..., else`.
|
||||||
|
"""
|
||||||
|
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def bprop(self, inputs, outputs, grads_wrt_outputs):
|
||||||
|
"""Back propagates gradients through a layer.
|
||||||
|
|
||||||
|
Given gradients with respect to the outputs of the layer calculates the
|
||||||
|
gradients with respect to the layer inputs.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return 'LeakyReluLayer'
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class ParametricReluLayer(LayerWithParameters):
|
||||||
|
"""Layer implementing an element-wise parametric rectified linear transformation."""
|
||||||
|
|
||||||
|
def __init__(self, alpha=0.25):
|
||||||
|
self.alpha = np.array([alpha])
|
||||||
|
|
||||||
|
@property
|
||||||
|
def params(self):
|
||||||
|
"""A list of layer parameter values: `[weights, biases]`."""
|
||||||
|
return [self.alpha]
|
||||||
|
|
||||||
|
def fprop(self, inputs):
|
||||||
|
"""Forward propagates activations through the layer transformation.
|
||||||
|
|
||||||
|
For inputs `x` and outputs `y` this corresponds to `y = ..., else`.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def bprop(self, inputs, outputs, grads_wrt_outputs):
|
||||||
|
"""Back propagates gradients through a layer.
|
||||||
|
|
||||||
|
Given gradients with respect to the outputs of the layer calculates the
|
||||||
|
gradients with respect to the layer inputs.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def grads_wrt_params(self, inputs, grads_wrt_outputs):
|
||||||
|
"""Calculates gradients with respect to layer parameters.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
inputs: array of inputs to layer of shape (batch_size, input_dim)
|
||||||
|
grads_wrt_to_outputs: array of gradients with respect to the layer
|
||||||
|
outputs of shape (batch_size, output_dim)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
list of arrays of gradients with respect to the layer parameters
|
||||||
|
`[grads_wrt_params]`. Where params is the alpha parameter.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
@property
|
||||||
|
def params(self):
|
||||||
|
"""A list of layer parameter values: `[weights, biases]`."""
|
||||||
|
return [self.alpha]
|
||||||
|
|
||||||
|
@params.setter
|
||||||
|
def params(self, values):
|
||||||
|
self.alpha = values[0]
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return 'ParametricReluLayer'
|
||||||
|
|
||||||
|
|
||||||
|
class TanhLayer(Layer):
|
||||||
|
"""Layer implementing an element-wise hyperbolic tangent transformation."""
|
||||||
|
|
||||||
|
def fprop(self, inputs):
|
||||||
|
"""Forward propagates activations through the layer transformation.
|
||||||
|
|
||||||
|
For inputs `x` and outputs `y` this corresponds to `y = tanh(x)`.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
inputs: Array of layer inputs of shape (batch_size, input_dim).
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
outputs: Array of layer outputs of shape (batch_size, output_dim).
|
||||||
|
"""
|
||||||
|
return np.tanh(inputs)
|
||||||
|
|
||||||
|
def bprop(self, inputs, outputs, grads_wrt_outputs):
|
||||||
|
"""Back propagates gradients through a layer.
|
||||||
|
|
||||||
|
Given gradients with respect to the outputs of the layer calculates the
|
||||||
|
gradients with respect to the layer inputs.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
inputs: Array of layer inputs of shape (batch_size, input_dim).
|
||||||
|
outputs: Array of layer outputs calculated in forward pass of
|
||||||
|
shape (batch_size, output_dim).
|
||||||
|
grads_wrt_outputs: Array of gradients with respect to the layer
|
||||||
|
outputs of shape (batch_size, output_dim).
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Array of gradients with respect to the layer inputs of shape
|
||||||
|
(batch_size, input_dim).
|
||||||
|
"""
|
||||||
|
return (1. - outputs**2) * grads_wrt_outputs
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return 'TanhLayer'
|
||||||
|
|
||||||
class SoftmaxLayer(Layer):
|
class SoftmaxLayer(Layer):
|
||||||
"""Layer implementing a softmax transformation."""
|
"""Layer implementing a softmax transformation."""
|
||||||
@ -232,7 +539,9 @@ class SoftmaxLayer(Layer):
|
|||||||
Returns:
|
Returns:
|
||||||
outputs: Array of layer outputs of shape (batch_size, output_dim).
|
outputs: Array of layer outputs of shape (batch_size, output_dim).
|
||||||
"""
|
"""
|
||||||
exp_inputs = np.exp(inputs)
|
# subtract max inside exponential to improve numerical stability -
|
||||||
|
# when we divide through by sum this term cancels
|
||||||
|
exp_inputs = np.exp(inputs - inputs.max(-1)[:, None])
|
||||||
return exp_inputs / exp_inputs.sum(-1)[:, None]
|
return exp_inputs / exp_inputs.sum(-1)[:, None]
|
||||||
|
|
||||||
def bprop(self, inputs, outputs, grads_wrt_outputs):
|
def bprop(self, inputs, outputs, grads_wrt_outputs):
|
||||||
@ -257,3 +566,177 @@ class SoftmaxLayer(Layer):
|
|||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return 'SoftmaxLayer'
|
return 'SoftmaxLayer'
|
||||||
|
|
||||||
|
class RadialBasisFunctionLayer(Layer):
|
||||||
|
"""Layer implementing projection to a grid of radial basis functions."""
|
||||||
|
|
||||||
|
def __init__(self, grid_dim, intervals=[[0., 1.]]):
|
||||||
|
"""Creates a radial basis function layer object.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
grid_dim: Integer specifying how many basis function to use in
|
||||||
|
grid across input space per dimension (so total number of
|
||||||
|
basis functions will be grid_dim**input_dim)
|
||||||
|
intervals: List of intervals (two element lists or tuples)
|
||||||
|
specifying extents of axis-aligned region in input-space to
|
||||||
|
tile basis functions in grid across. For example for a 2D input
|
||||||
|
space spanning [0, 1] x [0, 1] use intervals=[[0, 1], [0, 1]].
|
||||||
|
"""
|
||||||
|
num_basis = grid_dim**len(intervals)
|
||||||
|
self.centres = np.array(np.meshgrid(*[
|
||||||
|
np.linspace(low, high, grid_dim) for (low, high) in intervals])
|
||||||
|
).reshape((len(intervals), -1))
|
||||||
|
self.scales = np.array([
|
||||||
|
[(high - low) * 1. / grid_dim] for (low, high) in intervals])
|
||||||
|
|
||||||
|
def fprop(self, inputs):
|
||||||
|
"""Forward propagates activations through the layer transformation.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
inputs: Array of layer inputs of shape (batch_size, input_dim).
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
outputs: Array of layer outputs of shape (batch_size, output_dim).
|
||||||
|
"""
|
||||||
|
return np.exp(-(inputs[..., None] - self.centres[None, ...])**2 /
|
||||||
|
self.scales**2).reshape((inputs.shape[0], -1))
|
||||||
|
|
||||||
|
def bprop(self, inputs, outputs, grads_wrt_outputs):
|
||||||
|
"""Back propagates gradients through a layer.
|
||||||
|
|
||||||
|
Given gradients with respect to the outputs of the layer calculates the
|
||||||
|
gradients with respect to the layer inputs.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
inputs: Array of layer inputs of shape (batch_size, input_dim).
|
||||||
|
outputs: Array of layer outputs calculated in forward pass of
|
||||||
|
shape (batch_size, output_dim).
|
||||||
|
grads_wrt_outputs: Array of gradients with respect to the layer
|
||||||
|
outputs of shape (batch_size, output_dim).
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Array of gradients with respect to the layer inputs of shape
|
||||||
|
(batch_size, input_dim).
|
||||||
|
"""
|
||||||
|
num_basis = self.centres.shape[1]
|
||||||
|
return -2 * (
|
||||||
|
((inputs[..., None] - self.centres[None, ...]) / self.scales**2) *
|
||||||
|
grads_wrt_outputs.reshape((inputs.shape[0], -1, num_basis))
|
||||||
|
).sum(-1)
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return 'RadialBasisFunctionLayer(grid_dim={0})'.format(self.grid_dim)
|
||||||
|
|
||||||
|
class DropoutLayer(StochasticLayer):
|
||||||
|
"""Layer which stochastically drops input dimensions in its output."""
|
||||||
|
|
||||||
|
def __init__(self, rng=None, incl_prob=0.5, share_across_batch=True):
|
||||||
|
"""Construct a new dropout layer.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
rng (RandomState): Seeded random number generator.
|
||||||
|
incl_prob: Scalar value in (0, 1] specifying the probability of
|
||||||
|
each input dimension being included in the output.
|
||||||
|
share_across_batch: Whether to use same dropout mask across
|
||||||
|
all inputs in a batch or use per input masks.
|
||||||
|
"""
|
||||||
|
super(DropoutLayer, self).__init__(rng)
|
||||||
|
assert incl_prob > 0. and incl_prob <= 1.
|
||||||
|
self.incl_prob = incl_prob
|
||||||
|
self.share_across_batch = share_across_batch
|
||||||
|
self.rng = rng
|
||||||
|
|
||||||
|
def fprop(self, inputs, stochastic=True):
|
||||||
|
"""Forward propagates activations through the layer transformation.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
inputs: Array of layer inputs of shape (batch_size, input_dim).
|
||||||
|
stochastic: Flag allowing different deterministic
|
||||||
|
forward-propagation mode in addition to default stochastic
|
||||||
|
forward-propagation e.g. for use at test time. If False
|
||||||
|
a deterministic forward-propagation transformation
|
||||||
|
corresponding to the expected output of the stochastic
|
||||||
|
forward-propagation is applied.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
outputs: Array of layer outputs of shape (batch_size, output_dim).
|
||||||
|
"""
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def bprop(self, inputs, outputs, grads_wrt_outputs):
|
||||||
|
"""Back propagates gradients through a layer.
|
||||||
|
|
||||||
|
Given gradients with respect to the outputs of the layer calculates the
|
||||||
|
gradients with respect to the layer inputs. This should correspond to
|
||||||
|
default stochastic forward-propagation.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
inputs: Array of layer inputs of shape (batch_size, input_dim).
|
||||||
|
outputs: Array of layer outputs calculated in forward pass of
|
||||||
|
shape (batch_size, output_dim).
|
||||||
|
grads_wrt_outputs: Array of gradients with respect to the layer
|
||||||
|
outputs of shape (batch_size, output_dim).
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Array of gradients with respect to the layer inputs of shape
|
||||||
|
(batch_size, input_dim).
|
||||||
|
"""
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return 'DropoutLayer(incl_prob={0:.1f})'.format(self.incl_prob)
|
||||||
|
|
||||||
|
class ReshapeLayer(Layer):
|
||||||
|
"""Layer which reshapes dimensions of inputs."""
|
||||||
|
|
||||||
|
def __init__(self, output_shape=None):
|
||||||
|
"""Create a new reshape layer object.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
output_shape: Tuple specifying shape each input in batch should
|
||||||
|
be reshaped to in outputs. This **excludes** the batch size
|
||||||
|
so the shape of the final output array will be
|
||||||
|
(batch_size, ) + output_shape
|
||||||
|
Similarly to numpy.reshape, one shape dimension can be -1. In
|
||||||
|
this case, the value is inferred from the size of the input
|
||||||
|
array and remaining dimensions. The shape specified must be
|
||||||
|
compatible with the input array shape - i.e. the total number
|
||||||
|
of values in the array cannot be changed. If set to `None` the
|
||||||
|
output shape will be set to
|
||||||
|
(batch_size, -1)
|
||||||
|
which will flatten all the inputs to vectors.
|
||||||
|
"""
|
||||||
|
self.output_shape = (-1,) if output_shape is None else output_shape
|
||||||
|
|
||||||
|
def fprop(self, inputs):
|
||||||
|
"""Forward propagates activations through the layer transformation.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
inputs: Array of layer inputs of shape (batch_size, input_dim).
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
outputs: Array of layer outputs of shape (batch_size, output_dim).
|
||||||
|
"""
|
||||||
|
return inputs.reshape((inputs.shape[0],) + self.output_shape)
|
||||||
|
|
||||||
|
def bprop(self, inputs, outputs, grads_wrt_outputs):
|
||||||
|
"""Back propagates gradients through a layer.
|
||||||
|
|
||||||
|
Given gradients with respect to the outputs of the layer calculates the
|
||||||
|
gradients with respect to the layer inputs.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
inputs: Array of layer inputs of shape (batch_size, input_dim).
|
||||||
|
outputs: Array of layer outputs calculated in forward pass of
|
||||||
|
shape (batch_size, output_dim).
|
||||||
|
grads_wrt_outputs: Array of gradients with respect to the layer
|
||||||
|
outputs of shape (batch_size, output_dim).
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Array of gradients with respect to the layer inputs of shape
|
||||||
|
(batch_size, input_dim).
|
||||||
|
"""
|
||||||
|
return grads_wrt_outputs.reshape(inputs.shape)
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return 'ReshapeLayer(output_shape={0})'.format(self.output_shape)
|
||||||
|
@ -160,3 +160,158 @@ class MomentumLearningRule(GradientDescentLearningRule):
|
|||||||
mom *= self.mom_coeff
|
mom *= self.mom_coeff
|
||||||
mom -= self.learning_rate * grad
|
mom -= self.learning_rate * grad
|
||||||
param += mom
|
param += mom
|
||||||
|
|
||||||
|
|
||||||
|
class AdamLearningRule(GradientDescentLearningRule):
|
||||||
|
"""Adaptive moments (Adam) learning rule.
|
||||||
|
First-order gradient-descent based learning rule which uses adaptive
|
||||||
|
estimates of first and second moments of the parameter gradients to
|
||||||
|
calculate the parameter updates.
|
||||||
|
References:
|
||||||
|
[1]: Adam: a method for stochastic optimisation
|
||||||
|
Kingma and Ba, 2015
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, learning_rate=1e-3, beta_1=0.9, beta_2=0.999,
|
||||||
|
epsilon=1e-8):
|
||||||
|
"""Creates a new learning rule object.
|
||||||
|
Args:
|
||||||
|
learning_rate: A postive scalar to scale gradient updates to the
|
||||||
|
parameters by. This needs to be carefully set - if too large
|
||||||
|
the learning dynamic will be unstable and may diverge, while
|
||||||
|
if set too small learning will proceed very slowly.
|
||||||
|
beta_1: Exponential decay rate for gradient first moment estimates.
|
||||||
|
This should be a scalar value in [0, 1]. The running gradient
|
||||||
|
first moment estimate is calculated using
|
||||||
|
`m_1 = beta_1 * m_1_prev + (1 - beta_1) * g`
|
||||||
|
where `m_1_prev` is the previous estimate and `g` the current
|
||||||
|
parameter gradients.
|
||||||
|
beta_2: Exponential decay rate for gradient second moment
|
||||||
|
estimates. This should be a scalar value in [0, 1]. The run
|
||||||
|
gradient second moment estimate is calculated using
|
||||||
|
`m_2 = beta_2 * m_2_prev + (1 - beta_2) * g**2`
|
||||||
|
where `m_2_prev` is the previous estimate and `g` the current
|
||||||
|
parameter gradients.
|
||||||
|
epsilon: 'Softening' parameter to stop updates diverging when
|
||||||
|
second moment estimates are close to zero. Should be set to
|
||||||
|
a small positive value.
|
||||||
|
"""
|
||||||
|
super(AdamLearningRule, self).__init__(learning_rate)
|
||||||
|
assert beta_1 >= 0. and beta_1 <= 1., 'beta_1 should be in [0, 1].'
|
||||||
|
assert beta_2 >= 0. and beta_2 <= 1., 'beta_2 should be in [0, 2].'
|
||||||
|
assert epsilon > 0., 'epsilon should be > 0.'
|
||||||
|
self.beta_1 = beta_1
|
||||||
|
self.beta_2 = beta_2
|
||||||
|
self.epsilon = epsilon
|
||||||
|
|
||||||
|
def initialise(self, params):
|
||||||
|
"""Initialises the state of the learning rule for a set or parameters.
|
||||||
|
This must be called before `update_params` is first called.
|
||||||
|
Args:
|
||||||
|
params: A list of the parameters to be optimised. Note these will
|
||||||
|
be updated *in-place* to avoid reallocating arrays on each
|
||||||
|
update.
|
||||||
|
"""
|
||||||
|
super(AdamLearningRule, self).initialise(params)
|
||||||
|
self.moms_1 = []
|
||||||
|
for param in self.params:
|
||||||
|
self.moms_1.append(np.zeros_like(param))
|
||||||
|
self.moms_2 = []
|
||||||
|
for param in self.params:
|
||||||
|
self.moms_2.append(np.zeros_like(param))
|
||||||
|
self.step_count = 0
|
||||||
|
|
||||||
|
def reset(self):
|
||||||
|
"""Resets any additional state variables to their initial values.
|
||||||
|
For this learning rule this corresponds to zeroing the estimates of
|
||||||
|
the first and second moments of the gradients.
|
||||||
|
"""
|
||||||
|
for mom_1, mom_2 in zip(self.moms_1, self.moms_2):
|
||||||
|
mom_1 *= 0.
|
||||||
|
mom_2 *= 0.
|
||||||
|
self.step_count = 0
|
||||||
|
|
||||||
|
def update_params(self, grads_wrt_params):
|
||||||
|
"""Applies a single update to all parameters.
|
||||||
|
All parameter updates are performed using in-place operations and so
|
||||||
|
nothing is returned.
|
||||||
|
Args:
|
||||||
|
grads_wrt_params: A list of gradients of the scalar loss function
|
||||||
|
with respect to each of the parameters passed to `initialise`
|
||||||
|
previously, with this list expected to be in the same order.
|
||||||
|
"""
|
||||||
|
for param, mom_1, mom_2, grad in zip(
|
||||||
|
self.params, self.moms_1, self.moms_2, grads_wrt_params):
|
||||||
|
mom_1 *= self.beta_1
|
||||||
|
mom_1 += (1. - self.beta_1) * grad
|
||||||
|
mom_2 *= self.beta_2
|
||||||
|
mom_2 += (1. - self.beta_2) * grad ** 2
|
||||||
|
alpha_t = (
|
||||||
|
self.learning_rate *
|
||||||
|
(1. - self.beta_2 ** (self.step_count + 1)) ** 0.5 /
|
||||||
|
(1. - self.beta_1 ** (self.step_count + 1))
|
||||||
|
)
|
||||||
|
param -= alpha_t * mom_1 / (mom_2 ** 0.5 + self.epsilon)
|
||||||
|
self.step_count += 1
|
||||||
|
|
||||||
|
|
||||||
|
class AdaGradLearningRule(GradientDescentLearningRule):
|
||||||
|
"""Adaptive gradients (AdaGrad) learning rule.
|
||||||
|
First-order gradient-descent based learning rule which normalises gradient
|
||||||
|
updates by a running sum of the past squared gradients.
|
||||||
|
References:
|
||||||
|
[1]: Adaptive Subgradient Methods for Online Learning and Stochastic
|
||||||
|
Optimization. Duchi, Haxan and Singer, 2011
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, learning_rate=1e-2, epsilon=1e-8):
|
||||||
|
"""Creates a new learning rule object.
|
||||||
|
Args:
|
||||||
|
learning_rate: A postive scalar to scale gradient updates to the
|
||||||
|
parameters by. This needs to be carefully set - if too large
|
||||||
|
the learning dynamic will be unstable and may diverge, while
|
||||||
|
if set too small learning will proceed very slowly.
|
||||||
|
epsilon: 'Softening' parameter to stop updates diverging when
|
||||||
|
sums of squared gradients are close to zero. Should be set to
|
||||||
|
a small positive value.
|
||||||
|
"""
|
||||||
|
super(AdaGradLearningRule, self).__init__(learning_rate)
|
||||||
|
assert epsilon > 0., 'epsilon should be > 0.'
|
||||||
|
self.epsilon = epsilon
|
||||||
|
|
||||||
|
def initialise(self, params):
|
||||||
|
"""Initialises the state of the learning rule for a set or parameters.
|
||||||
|
This must be called before `update_params` is first called.
|
||||||
|
Args:
|
||||||
|
params: A list of the parameters to be optimised. Note these will
|
||||||
|
be updated *in-place* to avoid reallocating arrays on each
|
||||||
|
update.
|
||||||
|
"""
|
||||||
|
super(AdaGradLearningRule, self).initialise(params)
|
||||||
|
self.sum_sq_grads = []
|
||||||
|
for param in self.params:
|
||||||
|
self.sum_sq_grads.append(np.zeros_like(param))
|
||||||
|
|
||||||
|
def reset(self):
|
||||||
|
"""Resets any additional state variables to their initial values.
|
||||||
|
For this learning rule this corresponds to zeroing all the sum of
|
||||||
|
squared gradient states.
|
||||||
|
"""
|
||||||
|
for sum_sq_grad in self.sum_sq_grads:
|
||||||
|
sum_sq_grad *= 0.
|
||||||
|
|
||||||
|
def update_params(self, grads_wrt_params):
|
||||||
|
"""Applies a single update to all parameters.
|
||||||
|
All parameter updates are performed using in-place operations and so
|
||||||
|
nothing is returned.
|
||||||
|
Args:
|
||||||
|
grads_wrt_params: A list of gradients of the scalar loss function
|
||||||
|
with respect to each of the parameters passed to `initialise`
|
||||||
|
previously, with this list expected to be in the same order.
|
||||||
|
"""
|
||||||
|
for param, sum_sq_grad, grad in zip(
|
||||||
|
self.params, self.sum_sq_grads, grads_wrt_params):
|
||||||
|
sum_sq_grad += grad ** 2
|
||||||
|
param -= (self.learning_rate * grad /
|
||||||
|
(sum_sq_grad + self.epsilon) ** 0.5)
|
||||||
|
|
||||||
|
@@ -8,7 +8,7 @@ outputs (and intermediate states) and for calculating gradients of scalar
 functions of the outputs with respect to the model parameters.
 """
 
-from mlp.layers import LayerWithParameters
+from mlp.layers import LayerWithParameters, StochasticLayer, StochasticLayerWithParameters
 
 
 class SingleLayerModel(object):
@@ -80,11 +80,11 @@ class MultipleLayerModel(object):
         """A list of all of the parameters of the model."""
         params = []
         for layer in self.layers:
-            if isinstance(layer, LayerWithParameters):
+            if isinstance(layer, LayerWithParameters) or isinstance(layer, StochasticLayerWithParameters):
                 params += layer.params
         return params
 
-    def fprop(self, inputs):
+    def fprop(self, inputs, evaluation=False):
         """Forward propagates a batch of inputs through the model.
 
         Args:
@@ -97,7 +97,19 @@ class MultipleLayerModel(object):
         """
         activations = [inputs]
         for i, layer in enumerate(self.layers):
-            activations.append(self.layers[i].fprop(activations[i]))
+            if evaluation:
+                if issubclass(type(self.layers[i]), StochasticLayer) or issubclass(type(self.layers[i]),
+                                                                                   StochasticLayerWithParameters):
+                    current_activations = self.layers[i].fprop(activations[i], stochastic=False)
+                else:
+                    current_activations = self.layers[i].fprop(activations[i])
+            else:
+                if issubclass(type(self.layers[i]), StochasticLayer) or issubclass(type(self.layers[i]),
+                                                                                   StochasticLayerWithParameters):
+                    current_activations = self.layers[i].fprop(activations[i], stochastic=True)
+                else:
+                    current_activations = self.layers[i].fprop(activations[i])
+            activations.append(current_activations)
         return activations
 
     def grads_wrt_params(self, activations, grads_wrt_outputs):
@@ -119,7 +131,7 @@ class MultipleLayerModel(object):
             inputs = activations[-i - 2]
             outputs = activations[-i - 1]
             grads_wrt_inputs = layer.bprop(inputs, outputs, grads_wrt_outputs)
-            if isinstance(layer, LayerWithParameters):
+            if isinstance(layer, LayerWithParameters) or isinstance(layer, StochasticLayerWithParameters):
                 grads_wrt_params += layer.grads_wrt_params(
                     inputs, grads_wrt_outputs)[::-1]
             grads_wrt_outputs = grads_wrt_inputs
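The new `evaluation` flag only changes behaviour for layers that accept a `stochastic` keyword in `fprop`. As a rough illustration of that contract (the class below is a made-up stand-in for the example, not the framework's own dropout layer), such a layer samples a random mask when `stochastic=True` and falls back to a deterministic expectation when `stochastic=False`:

import numpy as np

class ToyDropout(object):
    """Illustrative stochastic layer: random mask in training, expectation at evaluation."""

    def __init__(self, keep_prob=0.5, rng=None):
        self.keep_prob = keep_prob
        self.rng = np.random.RandomState(0) if rng is None else rng

    def fprop(self, inputs, stochastic=True):
        if stochastic:
            # training path: drop each unit independently with probability 1 - keep_prob
            mask = self.rng.binomial(1, self.keep_prob, size=inputs.shape)
            return inputs * mask
        # evaluation path: deterministic, scale by the expected value of the mask
        return inputs * self.keep_prob

layer = ToyDropout()
x = np.ones((2, 4))
print(layer.fprop(x, stochastic=True))   # what fprop(..., evaluation=False) would trigger
print(layer.fprop(x, stochastic=False))  # what fprop(..., evaluation=True) would trigger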
@@ -9,7 +9,7 @@ import time
 import logging
 from collections import OrderedDict
 import numpy as np
+import tqdm
 
 logger = logging.getLogger(__name__)
 
@@ -18,7 +18,7 @@ class Optimiser(object):
     """Basic model optimiser."""
 
     def __init__(self, model, error, learning_rule, train_dataset,
-                 valid_dataset=None, data_monitors=None):
+                 valid_dataset=None, data_monitors=None, notebook=False):
         """Create a new optimiser instance.
 
         Args:
@@ -43,6 +43,11 @@ class Optimiser(object):
         self.data_monitors = OrderedDict([('error', error)])
         if data_monitors is not None:
             self.data_monitors.update(data_monitors)
+        self.notebook = notebook
+        if notebook:
+            self.tqdm_progress = tqdm.tqdm_notebook
+        else:
+            self.tqdm_progress = tqdm.tqdm
 
     def do_training_epoch(self):
         """Do a single training epoch.
@@ -52,12 +57,15 @@ class Optimiser(object):
        respect to all the model parameters and then updates the model
        parameters according to the learning rule.
        """
-        for inputs_batch, targets_batch in self.train_dataset:
-            activations = self.model.fprop(inputs_batch)
-            grads_wrt_outputs = self.error.grad(activations[-1], targets_batch)
-            grads_wrt_params = self.model.grads_wrt_params(
-                activations, grads_wrt_outputs)
-            self.learning_rule.update_params(grads_wrt_params)
+        with self.tqdm_progress(total=self.train_dataset.num_batches) as train_progress_bar:
+            train_progress_bar.set_description("Ep Prog")
+            for inputs_batch, targets_batch in self.train_dataset:
+                activations = self.model.fprop(inputs_batch)
+                grads_wrt_outputs = self.error.grad(activations[-1], targets_batch)
+                grads_wrt_params = self.model.grads_wrt_params(
+                    activations, grads_wrt_outputs)
+                self.learning_rule.update_params(grads_wrt_params)
+                train_progress_bar.update(1)
 
     def eval_monitors(self, dataset, label):
         """Evaluates the monitors for the given dataset.
@@ -72,7 +80,7 @@ class Optimiser(object):
         data_mon_vals = OrderedDict([(key + label, 0.) for key
                                      in self.data_monitors.keys()])
         for inputs_batch, targets_batch in dataset:
-            activations = self.model.fprop(inputs_batch)
+            activations = self.model.fprop(inputs_batch, evaluation=True)
             for key, data_monitor in self.data_monitors.items():
                 data_mon_vals[key + label] += data_monitor(
                     activations[-1], targets_batch)
@@ -104,7 +112,7 @@ class Optimiser(object):
        """
        logger.info('Epoch {0}: {1:.1f}s to complete\n {2}'.format(
            epoch, epoch_time,
-            ', '.join(['{0}={1:.2e}'.format(k, v) for (k, v) in stats.items()])
+            ', '.join(['{}={:.2e}'.format(k, v) for (k, v) in stats.items()])
        ))
 
    def train(self, num_epochs, stats_interval=5):
@@ -121,17 +129,20 @@ class Optimiser(object):
            and the second being a dict mapping the labels for the statistics
            recorded to their column index in the array.
        """
-        start_train_time = time.process_time()
+        start_train_time = time.time()
        run_stats = [list(self.get_epoch_stats().values())]
-        for epoch in range(1, num_epochs + 1):
-            start_time = time.process_time()
-            self.do_training_epoch()
-            epoch_time = time.process_time() - start_time
-            if epoch % stats_interval == 0:
-                stats = self.get_epoch_stats()
-                self.log_stats(epoch, epoch_time, stats)
-                run_stats.append(list(stats.values()))
-        finish_train_time = time.process_time()
+        with self.tqdm_progress(total=num_epochs) as progress_bar:
+            progress_bar.set_description("Exp Prog")
+            for epoch in range(1, num_epochs + 1):
+                start_time = time.time()
+                self.do_training_epoch()
+                epoch_time = time.time() - start_time
+                if epoch % stats_interval == 0:
+                    stats = self.get_epoch_stats()
+                    self.log_stats(epoch, epoch_time, stats)
+                    run_stats.append(list(stats.values()))
+                progress_bar.update(1)
+        finish_train_time = time.time()
        total_train_time = finish_train_time - start_train_time
        return np.array(run_stats), {k: i for i, k in enumerate(stats.keys())}, total_train_time
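Two small notes on the optimiser changes. Switching from `time.process_time()` to `time.time()` means epochs are now timed in wall-clock seconds rather than process CPU seconds, presumably to better reflect the elapsed time a user actually waits for. The progress-bar pattern itself reduces to the following minimal sketch (the `time.sleep` call is just a placeholder for the real per-epoch work):

import time
import tqdm

num_epochs = 3
with tqdm.tqdm(total=num_epochs) as progress_bar:
    progress_bar.set_description("Exp Prog")
    for epoch in range(1, num_epochs + 1):
        time.sleep(0.1)          # placeholder for do_training_epoch()
        progress_bar.update(1)   # advance the bar by one completed epoch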
@@ -32,3 +32,42 @@ class ConstantLearningRateScheduler(object):
            epoch_number: Integer index of training epoch about to be run.
        """
        learning_rule.learning_rate = self.learning_rate
+
+
+class CosineAnnealingWithWarmRestarts(object):
+    """Cosine annealing scheduler, implemented as in https://arxiv.org/pdf/1608.03983.pdf"""
+
+    def __init__(self, min_learning_rate, max_learning_rate, total_iters_per_period, max_learning_rate_discount_factor,
+                 period_iteration_expansion_factor):
+        """
+        Instantiates a new cosine annealing with warm restarts learning rate scheduler.
+        :param min_learning_rate: The minimum learning rate the scheduler can assign
+        :param max_learning_rate: The maximum learning rate the scheduler can assign
+        :param total_iters_per_period: The number of epochs in a period
+        :param max_learning_rate_discount_factor: The rate of discount for the maximum learning rate after each restart, i.e. how many times smaller the max learning rate will be after a restart compared to the previous one
+        :param period_iteration_expansion_factor: The rate of expansion of the period epochs, e.g. if it is set to 1 then all periods have the same number of epochs, if it is larger than 1 then each subsequent period will have more epochs, and vice versa
+        """
+        self.min_learning_rate = min_learning_rate
+        self.max_learning_rate = max_learning_rate
+        self.total_epochs_per_period = total_iters_per_period
+
+        self.max_learning_rate_discount_factor = max_learning_rate_discount_factor
+        self.period_iteration_expansion_factor = period_iteration_expansion_factor
+
+    def update_learning_rule(self, learning_rule, epoch_number):
+        """Update the hyperparameters of the learning rule.
+
+        Run at the beginning of each epoch.
+
+        Args:
+            learning_rule: Learning rule object being used in training run,
+                any scheduled hyperparameters to be altered should be
+                attributes of this object.
+            epoch_number: Integer index of training epoch about to be run.
+        Returns:
+            effective_learning_rate at step 'epoch_number'
+        """
+        raise NotImplementedError
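`update_learning_rule` is left raising `NotImplementedError` in this commit. Purely as an illustrative sketch of the schedule described in the referenced paper (not the implementation intended here; the helper name and the restart handling are invented for the example), the effective learning rate at a given epoch could be computed along these lines:

import math

def cosine_annealed_lr(epoch, min_lr, max_lr, period,
                       max_lr_discount=1.0, period_expansion=1.0):
    # Walk through completed periods, applying the restart rules: each restart
    # scales the period by `period_expansion` and the peak rate by `max_lr_discount`.
    t_cur, t_period = float(epoch), float(period)
    while t_cur >= t_period:
        t_cur -= t_period
        t_period *= period_expansion
        max_lr *= max_lr_discount
    # Within a period: lr = min + 0.5 * (max - min) * (1 + cos(pi * t_cur / t_period))
    return min_lr + 0.5 * (max_lr - min_lr) * (1 + math.cos(math.pi * t_cur / t_period))

# Example: 10-epoch periods that double in length and halve the peak rate after each restart.
for epoch in (0, 5, 10, 15, 20):
    print(epoch, round(cosine_annealed_lr(epoch, 0.001, 0.1, 10,
                                          max_lr_discount=0.5,
                                          period_expansion=2.0), 4))

A concrete scheduler would then assign the computed value to `learning_rule.learning_rate` inside `update_learning_rule`, mirroring what `ConstantLearningRateScheduler` does above.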