Commit 2702ee6f7b ("update lab 2")
Parent: f5579c980d
@@ -75,6 +75,9 @@ class DataProvider(object):
         self.inputs = self.inputs[new_order]
         self.targets = self.targets[new_order]
 
+    def __next__(self):
+        return self.next()
+
     def next(self):
         """Returns next data batch or raises `StopIteration` if at end."""
         if self._curr_batch + 1 > self.num_batches:
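Note: adding `__next__` as a thin wrapper around `next()` makes the providers work with Python 3's iteration protocol as well as Python 2's. A minimal usage sketch follows; the constructor arguments and the presence of an `__iter__` method are assumptions based on the rest of the module, which is not shown in this hunk:

    mnist_dp = MNISTDataProvider('valid', batch_size=100)  # hypothetical construction
    for inputs_batch, targets_batch in mnist_dp:
        # each iteration yields one batch, e.g. shapes (100, 784) and (100, 10)
        # once the targets are one-hot encoded
        print(inputs_batch.shape, targets_batch.shape)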
@@ -133,13 +136,10 @@ class MNISTDataProvider(DataProvider):
         super(MNISTDataProvider, self).__init__(
             inputs, targets, batch_size, max_num_batches, shuffle_order, rng)
 
-    # def next(self):
-    #     """Returns next data batch or raises `StopIteration` if at end."""
-    #     inputs_batch, targets_batch = super(MNISTDataProvider, self).next()
-    #     return inputs_batch, self.to_one_of_k(targets_batch)
+    def next(self):
+        """Returns next data batch or raises `StopIteration` if at end."""
+        inputs_batch, targets_batch = super(MNISTDataProvider, self).next()
+        return inputs_batch, self.to_one_of_k(targets_batch)
 
-    def __next__(self):
-        return self.next()
-
     def to_one_of_k(self, int_targets):
         """Converts integer coded class target to 1 of K coded targets.
@@ -156,21 +156,23 @@ class MNISTDataProvider(DataProvider):
         to zero except for the column corresponding to the correct class
         which is equal to one.
         """
-        raise NotImplementedError()
+        one_of_k_targets = np.zeros((int_targets.shape[0], self.num_classes))
+        one_of_k_targets[range(int_targets.shape[0]), int_targets] = 1
+        return one_of_k_targets
 
 
 class MetOfficeDataProvider(DataProvider):
     """South Scotland Met Office weather data provider."""
 
     def __init__(self, window_size, batch_size=10, max_num_batches=-1,
                  shuffle_order=True, rng=None):
-        """Create a new Met Offfice data provider object.
+        """Create a new Met Office data provider object.
 
         Args:
             window_size (int): Size of windows to split weather time series
                 data into. The constructed input features will be the first
                 `window_size - 1` entries in each window and the target outputs
                 the last entry in each window.
             batch_size (int): Number of data points to include in each batch.
             max_num_batches (int): Maximum number of batches to iterate over
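A small worked illustration of the newly filled-in `to_one_of_k` conversion; the class count of 4 is just an assumed value of `self.num_classes` for the example:

    import numpy as np
    int_targets = np.array([0, 2, 1])    # three integer class labels
    num_classes = 4                      # stands in for self.num_classes
    one_of_k = np.zeros((int_targets.shape[0], num_classes))
    one_of_k[range(int_targets.shape[0]), int_targets] = 1
    # one_of_k == [[1, 0, 0, 0],
    #              [0, 0, 1, 0],
    #              [0, 1, 0, 0]]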
@@ -180,29 +182,74 @@ class MetOfficeDataProvider(DataProvider):
                 the data before each epoch.
             rng (RandomState): A seeded random number generator.
         """
-        self.window_size = window_size
-        assert window_size > 1, 'window_size must be at least 2.'
         data_path = os.path.join(
             os.environ['MLP_DATA_DIR'], 'HadSSP_daily_qc.txt')
         assert os.path.isfile(data_path), (
             'Data file does not exist at expected path: ' + data_path
         )
-        #TODO: load raw data from text file
-        #TODO: filter out all missing datapoints and flatten to a vector
-        #TODO: normalise data to zero mean, unit standard deviation
-        #TODO: convert from flat sequence to windowed data
-        #TODO: separate into inputs and targets
-        # inputs are the first (window_size - 1) entries in windows
-        # inputs = ...
-        # targets are the last entries in windows
-        # targets = ...
-        # initialise base class with inputs and targets arrays (uncomment below)
-        # super(MetOfficeDataProvider, self).__init__(
-        #     inputs, targets, batch_size, max_num_batches, shuffle_order, rng)
-
-    def __next__(self):
-        return self.next()
+        raw = np.loadtxt(data_path, skiprows=3, usecols=range(2, 32))
+        assert window_size > 1, 'window_size must be at least 2.'
+        self.window_size = window_size
+        # filter out all missing datapoints and flatten to a vector
+        filtered = raw[raw >= 0].flatten()
+        # normalise data to zero mean, unit standard deviation
+        mean = np.mean(filtered)
+        std = np.std(filtered)
+        normalised = (filtered - mean) / std
+        # create a view on to array corresponding to a rolling window
+        shape = (normalised.shape[-1] - self.window_size + 1, self.window_size)
+        strides = normalised.strides + (normalised.strides[-1],)
+        windowed = np.lib.stride_tricks.as_strided(
+            normalised, shape=shape, strides=strides)
+        # inputs are first (window_size - 1) entries in windows
+        inputs = windowed[:, :-1]
+        # targets are last entry in windows
+        targets = windowed[:, -1]
+        super(MetOfficeDataProvider, self).__init__(
+            inputs, targets, batch_size, max_num_batches, shuffle_order, rng)
+
+
+class CCPPDataProvider(DataProvider):
+
+    def __init__(self, which_set='train', input_dims=None, batch_size=10,
+                 max_num_batches=-1, shuffle_order=True, rng=None):
+        """Create a new Combined Cycle Power Plant data provider object.
+
+        Args:
+            which_set: One of 'train' or 'valid'. Determines which portion of
+                data this object should provide.
+            input_dims: Which of the four input dimensions to use. If `None`
+                all are used. If an iterable of integers is provided
+                (consisting of a subset of {0, 1, 2, 3}) then only the
+                corresponding input dimensions are included.
+            batch_size (int): Number of data points to include in each batch.
+            max_num_batches (int): Maximum number of batches to iterate over
+                in an epoch. If `max_num_batches * batch_size > num_data` then
+                only as many batches as the data can be split into will be
+                used. If set to -1 all of the data will be used.
+            shuffle_order (bool): Whether to randomly permute the order of
+                the data before each epoch.
+            rng (RandomState): A seeded random number generator.
+        """
+        data_path = os.path.join(
+            os.environ['MLP_DATA_DIR'], 'ccpp_data.npz')
+        assert os.path.isfile(data_path), (
+            'Data file does not exist at expected path: ' + data_path
+        )
+        # check a valid which_set was provided
+        assert which_set in ['train', 'valid'], (
+            'Expected which_set to be either train or valid '
+            'Got {0}'.format(which_set)
+        )
+        # check input_dims are valid
+        if input_dims is not None:
+            input_dims = set(input_dims)
+            assert input_dims.issubset({0, 1, 2, 3}), (
+                'input_dims should be a subset of {0, 1, 2, 3}'
+            )
+        loaded = np.load(data_path)
+        inputs = loaded[which_set + '_inputs']
+        if input_dims is not None:
+            inputs = inputs[:, input_dims]
+        targets = loaded[which_set + '_targets']
+        super(CCPPDataProvider, self).__init__(
+            inputs, targets, batch_size, max_num_batches, shuffle_order, rng)
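The least obvious step in the new `MetOfficeDataProvider.__init__` is the rolling-window view built with `np.lib.stride_tricks.as_strided`. A hedged illustration on a toy series; the real code applies the same shape/strides construction to the normalised rainfall data:

    import numpy as np
    series = np.arange(6.)        # stand-in for `normalised`
    window_size = 3
    shape = (series.shape[-1] - window_size + 1, window_size)
    strides = series.strides + (series.strides[-1],)
    windowed = np.lib.stride_tricks.as_strided(series, shape=shape, strides=strides)
    # windowed == [[0., 1., 2.],
    #              [1., 2., 3.],
    #              [2., 3., 4.],
    #              [3., 4., 5.]]
    # inputs = windowed[:, :-1] and targets = windowed[:, -1], as in the diff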
mlp/errors.py (new file, 46 lines)
# -*- coding: utf-8 -*-
"""Error functions.

This module defines error functions, with the aim of model training being to
minimise the error function given a set of inputs and target outputs.

The error functions will typically measure some concept of distance between the
model outputs and target outputs, averaged over all data points in the data set
or batch.
"""

import numpy as np


class SumOfSquaredDiffsError(object):
    """Sum of squared differences (squared Euclidean distance) error."""

    def __call__(self, outputs, targets):
        """Calculates error function given a batch of outputs and targets.

        Args:
            outputs: Array of model outputs of shape (batch_size, output_dim).
            targets: Array of target outputs of shape (batch_size, output_dim).

        Returns:
            Scalar error function value.
        """
        #TODO write your code here
        raise NotImplementedError()

    def grad(self, outputs, targets):
        """Calculates gradient of error function with respect to outputs.

        Args:
            outputs: Array of model outputs of shape (batch_size, output_dim).
            targets: Array of target outputs of shape (batch_size, output_dim).

        Returns:
            Gradient of error function with respect to outputs. This should be
            an array of shape (batch_size, output_dim).
        """
        #TODO write your code here
        raise NotImplementedError()

    def __repr__(self):
        return 'SumOfSquaredDiffsError'
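The two TODOs above are deliberately left for students. One possible completion, consistent with the docstrings and the module docstring's "averaged over all data points" convention (a sketch, not the official solution):

    class SumOfSquaredDiffsError(object):

        def __call__(self, outputs, targets):
            # half the squared Euclidean distance, averaged over the batch
            return 0.5 * np.mean(np.sum((outputs - targets) ** 2, axis=1))

        def grad(self, outputs, targets):
            # gradient of the above with respect to each output entry
            return (outputs - targets) / outputs.shape[0]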
mlp/initialisers.py (new file, 65 lines)
# -*- coding: utf-8 -*-
"""Parameter initialisers.

This module defines classes to initialise the parameters in a layer.
"""

import numpy as np
from mlp import DEFAULT_SEED


class ConstantInit(object):
    """Constant parameter initialiser."""

    def __init__(self, value):
        """Construct a constant parameter initialiser.

        Args:
            value: Value to initialise parameter to.
        """
        self.value = value

    def __call__(self, shape):
        return np.ones(shape=shape) * self.value


class UniformInit(object):
    """Random uniform parameter initialiser."""

    def __init__(self, low, high, rng=None):
        """Construct a random uniform parameter initialiser.

        Args:
            low: Lower bound of interval to sample from.
            high: Upper bound of interval to sample from.
            rng (RandomState): Seeded random number generator.
        """
        self.low = low
        self.high = high
        if rng is None:
            rng = np.random.RandomState(DEFAULT_SEED)
        self.rng = rng

    def __call__(self, shape):
        return self.rng.uniform(low=self.low, high=self.high, size=shape)


class NormalInit(object):
    """Random normal parameter initialiser."""

    def __init__(self, mean, std, rng=None):
        """Construct a random normal parameter initialiser.

        Args:
            mean: Mean of distribution to sample from.
            std: Standard deviation of distribution to sample from.
            rng (RandomState): Seeded random number generator.
        """
        self.mean = mean
        self.std = std
        if rng is None:
            rng = np.random.RandomState(DEFAULT_SEED)
        self.rng = rng

    def __call__(self, shape):
        return self.rng.normal(loc=self.mean, scale=self.std, size=shape)
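Usage of the initialisers is uniform: each is a callable mapping a shape to an array. A brief hedged sketch (shapes and values illustrative):

    weights_init = UniformInit(-0.1, 0.1, rng=np.random.RandomState(123))
    weights = weights_init((5, 10))   # shape (5, 10), entries drawn from [-0.1, 0.1)
    biases = ConstantInit(0.)((5,))   # five zeros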
mlp/layers.py (new file, 141 lines)
# -*- coding: utf-8 -*-
"""Layer definitions.

This module defines classes which encapsulate a single layer.

These layers map input activations to output activations with the `fprop`
method and map gradients with respect to outputs to gradients with respect to
their inputs with the `bprop` method.

Some layers will have learnable parameters and so will additionally define
methods for getting and setting parameters and calculating gradients with
respect to the layer parameters.
"""

import numpy as np
import mlp.initialisers as init


class Layer(object):
    """Abstract class defining the interface for a layer."""

    def fprop(self, inputs):
        """Forward propagates activations through the layer transformation.

        Args:
            inputs: Array of layer inputs of shape (batch_size, input_dim).

        Returns:
            outputs: Array of layer outputs of shape (batch_size, output_dim).
        """
        raise NotImplementedError()

    def bprop(self, inputs, outputs, grads_wrt_outputs):
        """Back propagates gradients through a layer.

        Given gradients with respect to the outputs of the layer calculates the
        gradients with respect to the layer inputs.

        Args:
            inputs: Array of layer inputs of shape (batch_size, input_dim).
            outputs: Array of layer outputs calculated in forward pass of
                shape (batch_size, output_dim).
            grads_wrt_outputs: Array of gradients with respect to the layer
                outputs of shape (batch_size, output_dim).

        Returns:
            Array of gradients with respect to the layer inputs of shape
            (batch_size, input_dim).
        """
        raise NotImplementedError()


class LayerWithParameters(Layer):
    """Abstract class defining the interface for a layer with parameters."""

    def grads_wrt_params(self, inputs, grads_wrt_outputs):
        """Calculates gradients with respect to layer parameters.

        Args:
            inputs: Array of inputs to layer of shape (batch_size, input_dim).
            grads_wrt_outputs: Array of gradients with respect to the layer
                outputs of shape (batch_size, output_dim).

        Returns:
            List of arrays of gradients with respect to the layer parameters,
            with the parameter gradients appearing in the same order as the
            corresponding parameters returned by the `params` property.
        """
        raise NotImplementedError()

    @property
    def params(self):
        """Returns a list of parameters of layer.

        Returns:
            List of current parameter values.
        """
        raise NotImplementedError()


class AffineLayer(LayerWithParameters):
    """Layer implementing an affine transformation of its inputs.

    This layer is parameterised by a weight matrix and bias vector.
    """

    def __init__(self, input_dim, output_dim,
                 weights_initialiser=init.UniformInit(-0.1, 0.1),
                 biases_initialiser=init.ConstantInit(0.),
                 weights_cost=None, biases_cost=None):
        """Initialises a parameterised affine layer.

        Args:
            input_dim (int): Dimension of inputs to the layer.
            output_dim (int): Dimension of the layer outputs.
            weights_initialiser: Initialiser for the weight parameters.
            biases_initialiser: Initialiser for the bias parameters.
        """
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.weights = weights_initialiser((self.output_dim, self.input_dim))
        self.biases = biases_initialiser(self.output_dim)

    def fprop(self, inputs):
        """Forward propagates activations through the layer transformation.

        For inputs `x`, outputs `y`, weights `W` and biases `b` the layer
        corresponds to `y = W.dot(x) + b`.

        Args:
            inputs: Array of layer inputs of shape (batch_size, input_dim).

        Returns:
            outputs: Array of layer outputs of shape (batch_size, output_dim).
        """
        #TODO write your code here
        raise NotImplementedError()

    def grads_wrt_params(self, inputs, grads_wrt_outputs):
        """Calculates gradients with respect to layer parameters.

        Args:
            inputs: Array of inputs to layer of shape (batch_size, input_dim).
            grads_wrt_outputs: Array of gradients with respect to the layer
                outputs of shape (batch_size, output_dim).

        Returns:
            List of arrays of gradients with respect to the layer parameters
            `[grads_wrt_weights, grads_wrt_biases]`.
        """
        #TODO write your code here
        raise NotImplementedError()

    @property
    def params(self):
        """A list of layer parameter values: `[weights, biases]`."""
        return [self.weights, self.biases]

    def __repr__(self):
        return 'AffineLayer(input_dim={0}, output_dim={1})'.format(
            self.input_dim, self.output_dim)
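The two TODOs in `AffineLayer` are the main exercise of the lab. One hedged completion of just those method bodies, following the shape conventions in the docstrings (weights stored as (output_dim, input_dim), batched inputs as (batch_size, input_dim)):

    def fprop(self, inputs):
        # batched affine map: each row x of `inputs` maps to W.dot(x) + b
        return inputs.dot(self.weights.T) + self.biases

    def grads_wrt_params(self, inputs, grads_wrt_outputs):
        # dE/dW has shape (output_dim, input_dim); dE/db has shape (output_dim,)
        grads_wrt_weights = grads_wrt_outputs.T.dot(inputs)
        grads_wrt_biases = grads_wrt_outputs.sum(axis=0)
        return [grads_wrt_weights, grads_wrt_biases]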
mlp/learning_rules.py (new file, 162 lines)
# -*- coding: utf-8 -*-
"""Learning rules.

This module contains classes implementing gradient based learning rules.
"""

import numpy as np


class GradientDescentLearningRule(object):
    """Simple (stochastic) gradient descent learning rule.

    For a scalar error function `E(p[0], p[1], ...)` of some set of
    potentially multidimensional parameters this attempts to find a local
    minimum of the loss function by applying updates to each parameter of the
    form

        p[i] := p[i] - learning_rate * dE/dp[i]

    with `learning_rate` a positive scaling parameter.

    The error function used in successive applications of these updates may be
    a stochastic estimator of the true error function (e.g. when the error with
    respect to only a subset of data-points is calculated) in which case this
    will correspond to a stochastic gradient descent learning rule.
    """

    def __init__(self, learning_rate=1e-3):
        """Creates a new learning rule object.

        Args:
            learning_rate: A positive scalar to scale gradient updates to the
                parameters by. This needs to be carefully set - if too large
                the learning dynamic will be unstable and may diverge, while
                if set too small learning will proceed very slowly.
        """
        assert learning_rate > 0., 'learning_rate should be positive.'
        self.learning_rate = learning_rate

    def initialise(self, params):
        """Initialises the state of the learning rule for a set of parameters.

        This must be called before `update_params` is first called.

        Args:
            params: A list of the parameters to be optimised. Note these will
                be updated *in-place* to avoid reallocating arrays on each
                update.
        """
        self.params = params

    def reset(self):
        """Resets any additional state variables to their initial values.

        For this learning rule there are no additional state variables so we
        do nothing here.
        """
        pass

    def update_params(self, grads_wrt_params):
        """Applies a single gradient descent update to all parameters.

        All parameter updates are performed using in-place operations and so
        nothing is returned.

        Args:
            grads_wrt_params: A list of gradients of the scalar loss function
                with respect to each of the parameters passed to `initialise`
                previously, with this list expected to be in the same order.
        """
        for param, grad in zip(self.params, grads_wrt_params):
            param -= self.learning_rate * grad


class MomentumLearningRule(GradientDescentLearningRule):
    """Gradient descent with momentum learning rule.

    This extends the basic gradient learning rule by introducing extra
    momentum state variables for each parameter. These can help the learning
    dynamic overcome shallow local minima and speed convergence when
    making multiple successive steps in a similar direction in parameter space.

    For parameter p[i] and corresponding momentum m[i] the updates for a
    scalar loss function `L` are of the form

        m[i] := mom_coeff * m[i] - learning_rate * dL/dp[i]
        p[i] := p[i] + m[i]

    with `learning_rate` a positive scaling parameter for the gradient updates
    and `mom_coeff` a value in [0, 1] that determines how much 'friction' there
    is in the system and so how quickly previous momentum contributions decay.
    """

    def __init__(self, learning_rate=1e-3, mom_coeff=0.9):
        """Creates a new learning rule object.

        Args:
            learning_rate: A positive scalar to scale gradient updates to the
                parameters by. This needs to be carefully set - if too large
                the learning dynamic will be unstable and may diverge, while
                if set too small learning will proceed very slowly.
            mom_coeff: A scalar in the range [0, 1] inclusive. This determines
                the contribution of the previous momentum value to the value
                after each update. If equal to 0 the momentum is set to exactly
                the negative scaled gradient each update and so this rule
                collapses to standard gradient descent. If equal to 1 the
                momentum will just be decremented by the scaled gradient at
                each update. This is equivalent to simulating the dynamic in
                a frictionless system. Due to energy conservation the loss
                of 'potential energy' as the dynamics moves down the loss
                function surface will lead to an increasingly large 'kinetic
                energy' and so speed, meaning the updates will become
                increasingly large, potentially unstably so. Typically a value
                less than but close to 1 will avoid these issues and cause the
                dynamic to converge to a local minimum where the gradients are
                by definition zero.
        """
        super(MomentumLearningRule, self).__init__(learning_rate)
        assert mom_coeff >= 0. and mom_coeff <= 1., (
            'mom_coeff should be in the range [0, 1].'
        )
        self.mom_coeff = mom_coeff

    def initialise(self, params):
        """Initialises the state of the learning rule for a set of parameters.

        This must be called before `update_params` is first called.

        Args:
            params: A list of the parameters to be optimised. Note these will
                be updated *in-place* to avoid reallocating arrays on each
                update.
        """
        super(MomentumLearningRule, self).initialise(params)
        self.moms = []
        for param in self.params:
            self.moms.append(np.zeros_like(param))

    def reset(self):
        """Resets any additional state variables to their initial values.

        For this learning rule this corresponds to zeroing all the momenta.
        """
        # iterate over the momentum arrays directly so the in-place scaling
        # zeroes each array (zip over a single list would yield tuples)
        for mom in self.moms:
            mom *= 0.

    def update_params(self, grads_wrt_params):
        """Applies a single update to all parameters.

        All parameter updates are performed using in-place operations and so
        nothing is returned.

        Args:
            grads_wrt_params: A list of gradients of the scalar loss function
                with respect to each of the parameters passed to `initialise`
                previously, with this list expected to be in the same order.
        """
        for param, mom, grad in zip(self.params, self.moms, grads_wrt_params):
            mom *= self.mom_coeff
            mom -= self.learning_rate * grad
            param += mom
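A short, hedged usage sketch of the learning rules; `model` and `grads_wrt_params` are placeholders for objects produced elsewhere in the framework:

    rule = MomentumLearningRule(learning_rate=1e-2, mom_coeff=0.9)
    rule.initialise(model.params)          # bind once to the parameter arrays
    rule.update_params(grads_wrt_params)   # gradients in the same order as params
    rule.reset()                           # zero the momenta, e.g. between training runs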
mlp/models.py (new file, 67 lines)
# -*- coding: utf-8 -*-
"""Model definitions.

This module implements objects encapsulating learnable models of input-output
relationships. The model objects implement methods for forward propagating
the inputs through the transformation(s) defined by the model to produce
outputs (and intermediate states) and for calculating gradients of scalar
functions of the outputs with respect to the model parameters.
"""

from mlp.layers import LayerWithParameters


class SingleLayerModel(object):
    """A model consisting of a single transformation layer."""

    def __init__(self, layer):
        """Create a new single layer model instance.

        Args:
            layer: The layer object defining the model architecture.
        """
        self.layer = layer

    @property
    def params(self):
        """A list of all of the parameters of the model."""
        return self.layer.params

    def fprop(self, inputs):
        """Calculate the model outputs corresponding to a batch of inputs.

        Args:
            inputs: Batch of inputs to the model.

        Returns:
            List which is a concatenation of the model inputs and model
            outputs, this being done for consistency of the interface with
            multi-layer models for which `fprop` returns a list of
            activations through all intermediate layers of the model,
            including the inputs and outputs.
        """
        activations = [inputs, self.layer.fprop(inputs)]
        return activations

    def grads_wrt_params(self, activations, grads_wrt_outputs):
        """Calculates gradients with respect to the model parameters.

        Args:
            activations: List of all activations from forward pass through
                model using `fprop`.
            grads_wrt_outputs: Gradient with respect to the model outputs of
                the scalar function parameter gradients are being calculated
                for.

        Returns:
            List of gradients of the scalar function with respect to all model
            parameters.
        """
        return self.layer.grads_wrt_params(activations[0], grads_wrt_outputs)

    def params_cost(self):
        """Calculates the parameter dependent cost term of the model."""
        return self.layer.params_cost()

    def __repr__(self):
        return 'SingleLayerModel(' + str(self.layer) + ')'
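A hedged end-to-end sketch tying the model, layer and error objects together. It only runs once the `AffineLayer` and error TODOs are completed, and `inputs_batch`/`targets_batch` are placeholders for arrays from a data provider:

    from mlp.layers import AffineLayer
    from mlp.errors import SumOfSquaredDiffsError
    from mlp.models import SingleLayerModel

    model = SingleLayerModel(AffineLayer(input_dim=4, output_dim=1))
    error = SumOfSquaredDiffsError()
    activations = model.fprop(inputs_batch)                  # [inputs, outputs]
    grads_wrt_outputs = error.grad(activations[-1], targets_batch)
    grads_wrt_params = model.grads_wrt_params(activations, grads_wrt_outputs)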
mlp/optimisers.py (new file, 134 lines)
# -*- coding: utf-8 -*-
"""Model optimisers.

This module contains objects implementing (batched) stochastic gradient descent
based optimisation of models.
"""

import time
import logging
from collections import OrderedDict
import numpy as np


logger = logging.getLogger(__name__)


class Optimiser(object):
    """Basic model optimiser."""

    def __init__(self, model, error, learning_rule, train_dataset,
                 valid_dataset=None, data_monitors=None):
        """Create a new optimiser instance.

        Args:
            model: The model to optimise.
            error: The scalar error function to minimise.
            learning_rule: Gradient based learning rule to use to minimise
                error.
            train_dataset: Data provider for training set data batches.
            valid_dataset: Data provider for validation set data batches.
            data_monitors: Dictionary of functions evaluated on targets and
                model outputs (averaged across both full training and
                validation data sets) to monitor during training in addition
                to the error. Keys should correspond to a string label for
                the statistic being evaluated.
        """
        self.model = model
        self.error = error
        self.learning_rule = learning_rule
        self.learning_rule.initialise(self.model.params)
        self.train_dataset = train_dataset
        self.valid_dataset = valid_dataset
        self.data_monitors = OrderedDict([('error', error)])
        if data_monitors is not None:
            self.data_monitors.update(data_monitors)

    def do_training_epoch(self):
        """Do a single training epoch.

        This iterates through all batches in training dataset, for each
        calculating the gradient of the estimated error given the batch with
        respect to all the model parameters and then updates the model
        parameters according to the learning rule.
        """
        for inputs_batch, targets_batch in self.train_dataset:
            activations = self.model.fprop(inputs_batch)
            grads_wrt_outputs = self.error.grad(activations[-1], targets_batch)
            grads_wrt_params = self.model.grads_wrt_params(
                activations, grads_wrt_outputs)
            self.learning_rule.update_params(grads_wrt_params)

    def eval_monitors(self, dataset, label):
        """Evaluates the monitors for the given dataset.

        Args:
            dataset: Dataset to perform evaluation with.
            label: Tag to add to end of monitor keys to identify dataset.

        Returns:
            OrderedDict of monitor values evaluated on dataset.
        """
        data_mon_vals = OrderedDict([(key + label, 0.) for key
                                     in self.data_monitors.keys()])
        for inputs_batch, targets_batch in dataset:
            activations = self.model.fprop(inputs_batch)
            for key, data_monitor in self.data_monitors.items():
                data_mon_vals[key + label] += data_monitor(
                    activations[-1], targets_batch)
        for key, data_monitor in self.data_monitors.items():
            data_mon_vals[key + label] /= dataset.num_batches
        return data_mon_vals

    def get_epoch_stats(self):
        """Computes training statistics for an epoch.

        Returns:
            An OrderedDict with keys corresponding to the statistic labels and
            values corresponding to the value of the statistic.
        """
        epoch_stats = OrderedDict()
        epoch_stats.update(self.eval_monitors(self.train_dataset, '(train)'))
        if self.valid_dataset is not None:
            epoch_stats.update(self.eval_monitors(
                self.valid_dataset, '(valid)'))
        return epoch_stats

    def log_stats(self, epoch, epoch_time, stats):
        """Outputs stats for a training epoch to a logger.

        Args:
            epoch (int): Epoch counter.
            epoch_time: Time taken in seconds for the epoch to complete.
            stats: Monitored stats for the epoch.
        """
        logger.info('Epoch {0}: {1:.1f}s to complete\n {2}'.format(
            epoch, epoch_time,
            ', '.join(['{0}={1:.2e}'.format(k, v) for (k, v) in stats.items()])
        ))

    def train(self, num_epochs, stats_interval=5):
        """Trains a model for a set number of epochs.

        Args:
            num_epochs: Number of epochs (complete passes through training
                dataset) to train for.
            stats_interval: Training statistics will be recorded and logged
                every `stats_interval` epochs.

        Returns:
            Tuple with first value being an array of training run statistics
            and the second being a dict mapping the labels for the statistics
            recorded to their column index in the array.
        """
        run_stats = [list(self.get_epoch_stats().values())]
        for epoch in range(1, num_epochs + 1):
            start_time = time.process_time()
            self.do_training_epoch()
            epoch_time = time.process_time() - start_time
            if epoch % stats_interval == 0:
                stats = self.get_epoch_stats()
                self.log_stats(epoch, epoch_time, stats)
                run_stats.append(list(stats.values()))
        return np.array(run_stats), {k: i for i, k in enumerate(stats.keys())}
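Finally, a hedged sketch of how the optimiser is intended to be driven; the data providers and the `model`/`error` objects are assumed to be constructed as above, and the names are placeholders:

    from mlp.learning_rules import GradientDescentLearningRule
    from mlp.optimisers import Optimiser

    optimiser = Optimiser(model, error, GradientDescentLearningRule(learning_rate=1e-2),
                          train_data, valid_dataset=valid_data)
    stats, keys = optimiser.train(num_epochs=20, stats_interval=5)
    print(stats[-1, keys['error(valid)']])   # last recorded validation error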
notebooks/02_Single_layer_models.ipynb (new file, 1124 lines; diff suppressed because it is too large)
New binary files (diff not shown):
notebooks/res/._fprop-bprop-block-diagram.png
notebooks/res/._jupyter-dashboard.png
notebooks/res/._jupyter-notebook-interface.png
notebooks/res/._singleLayerNetBP-1.png
notebooks/res/._singleLayerNetPredict.png
notebooks/res/._singleLayerNetWts-1.png
notebooks/res/._singleLayerNetWtsEqns-1.png
notebooks/res/fprop-bprop-block-diagram.pdf
notebooks/res/fprop-bprop-block-diagram.png (6.9 KiB)
notebooks/res/fprop-bprop-block-diagram.tex (new file, 65 lines)
\documentclass[tikz]{standalone}

\usepackage{amsmath}
\usepackage{tikz}
\usetikzlibrary{arrows}
\usetikzlibrary{calc}
\usepackage{ifthen}

\newcommand{\vct}[1]{\boldsymbol{#1}}
\newcommand{\pd}[2]{\frac{\partial #1}{\partial #2}}

\tikzstyle{fprop} = [draw,fill=blue!20,minimum size=2em,align=center]
\tikzstyle{bprop} = [draw,fill=red!20,minimum size=2em,align=center]

\begin{document}

\begin{tikzpicture}[xscale=1.75] %
  % define number of layers
  \def\nl{2};
  % model input
  \node at (0, 0) (input) {$\vct{x}$};
  % draw fprop through model layers
  \foreach \l in {0,...,\nl} {
    \node[fprop] at (2 * \l + 1, 0) (fprop\l) {\texttt{layers[\l]} \\ \texttt{.fprop}};
    \ifthenelse{\l > 0}{
      \node at (2 * \l, 0) (hidden\l) {$\vct{h}_\l$};
      \draw[->] (hidden\l) -- (fprop\l);
      \draw[->] let \n1={\l - 1} in (fprop\n1) -- (hidden\l);
    }{
      \draw[->] (input) -- (fprop\l);
    }
  }
  % model output
  \node at (2 * \nl + 2, 0) (output) {$\mathbf{y}$};
  % error function
  \node[fprop] at (2 * \nl + 3, 0) (errorfunc) {\texttt{error}};
  % error value
  \node at (2 * \nl + 3, -1) (error) {$\bar{E}$};
  % targets
  \node at (2 * \nl + 4, -1) (tgt) {$\vct{t}$};
  % error gradient
  \node[bprop] at (2 * \nl + 3, -2) (errorgrad) {\texttt{error} \\ \texttt{.grad}};
  % gradient wrt outputs
  \node at (2 * \nl + 2, -2) (gradoutput) {$\pd{\bar{E}}{\vct{y}}$};
  \draw[->] (fprop\nl) -- (output);
  \draw[->] (output) -- (errorfunc);
  \draw[->] (errorfunc) -- (error);
  \draw[->] (error) -- (errorgrad);
  \draw[->] (errorgrad) -- (gradoutput);
  \draw[->] (tgt) |- (errorfunc);
  \draw[->] (tgt) |- (errorgrad);
  \foreach \l in {0,...,\nl} {
    \node[bprop] at (2 * \l + 1, -2) (bprop\l) {\texttt{layers[\l]} \\ \texttt{.bprop}};
    \ifthenelse{\l > 0}{
      \node at (2 * \l, -2) (grad\l) {$\pd{\bar{E}}{\vct{h}_\l}$};
      \draw[<-] (grad\l) -- (bprop\l);
      \draw[<-] let \n1={\l - 1} in (bprop\n1) -- (grad\l);
    }{}
  }
  \node at (0, -2) (gradinput) {$\pd{\bar{E}}{\vct{x}}$};
  \draw[->] (bprop0) -- (gradinput);
  \draw[->] (gradoutput) -- (bprop\nl);
\end{tikzpicture}

\end{document}