update lab 2
parent f5579c980d
commit 2702ee6f7b

@@ -75,6 +75,9 @@ class DataProvider(object):
        self.inputs = self.inputs[new_order]
        self.targets = self.targets[new_order]

    def __next__(self):
        return self.next()

    def next(self):
        """Returns next data batch or raises `StopIteration` if at end."""
        if self._curr_batch + 1 > self.num_batches:

@@ -133,13 +136,10 @@ class MNISTDataProvider(DataProvider):
        super(MNISTDataProvider, self).__init__(
            inputs, targets, batch_size, max_num_batches, shuffle_order, rng)

        # def next(self):
        #     """Returns next data batch or raises `StopIteration` if at end."""
        #     inputs_batch, targets_batch = super(MNISTDataProvider, self).next()
        #     return inputs_batch, self.to_one_of_k(targets_batch)

    def __next__(self):
        return self.next()

    def next(self):
        """Returns next data batch or raises `StopIteration` if at end."""
        inputs_batch, targets_batch = super(MNISTDataProvider, self).next()
        return inputs_batch, self.to_one_of_k(targets_batch)

    def to_one_of_k(self, int_targets):
        """Converts integer coded class target to 1 of K coded targets.

@@ -156,21 +156,23 @@ class MNISTDataProvider(DataProvider):
        to zero except for the column corresponding to the correct class
        which is equal to one.
        """
        one_of_k_targets = np.zeros((int_targets.shape[0], self.num_classes))
        one_of_k_targets[range(int_targets.shape[0]), int_targets] = 1
        return one_of_k_targets
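For illustration (an editorial sketch, not part of the committed file), the conversion maps integer class labels to one-hot rows:

    import numpy as np

    int_targets = np.array([0, 2, 1])
    num_classes = 3
    one_of_k = np.zeros((int_targets.shape[0], num_classes))
    one_of_k[range(int_targets.shape[0]), int_targets] = 1
    # one_of_k is now [[1, 0, 0], [0, 0, 1], [0, 1, 0]]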


class MetOfficeDataProvider(DataProvider):
    """South Scotland Met Office weather data provider."""

    def __init__(self, window_size, batch_size=10, max_num_batches=-1,
                 shuffle_order=True, rng=None):
        """Create a new Met Office data provider object.

        Args:
            window_size (int): Size of windows to split weather time series
                data into. The constructed input features will be the first
                `window_size - 1` entries in each window and the target outputs
                the last entry in each window.
            batch_size (int): Number of data points to include in each batch.
            max_num_batches (int): Maximum number of batches to iterate over
                in an epoch. If `max_num_batches * batch_size > num_data` then
@@ -180,29 +182,74 @@ class MetOfficeDataProvider(DataProvider):
            the data before each epoch.
            rng (RandomState): A seeded random number generator.
        """
        data_path = os.path.join(
            os.environ['MLP_DATA_DIR'], 'HadSSP_daily_qc.txt')
        assert os.path.isfile(data_path), (
            'Data file does not exist at expected path: ' + data_path
        )
        # load raw data from text file
        raw = np.loadtxt(data_path, skiprows=3, usecols=range(2, 32))
        assert window_size > 1, 'window_size must be at least 2.'
        self.window_size = window_size
        # filter out all missing datapoints and flatten to a vector
        filtered = raw[raw >= 0].flatten()
        # normalise data to zero mean, unit standard deviation
        mean = np.mean(filtered)
        std = np.std(filtered)
        normalised = (filtered - mean) / std
        # create a view onto the array corresponding to a rolling window
        shape = (normalised.shape[-1] - self.window_size + 1, self.window_size)
        strides = normalised.strides + (normalised.strides[-1],)
        windowed = np.lib.stride_tricks.as_strided(
            normalised, shape=shape, strides=strides)
        # inputs are first (window_size - 1) entries in windows
        inputs = windowed[:, :-1]
        # targets are last entry in windows
        targets = windowed[:, -1]
        super(MetOfficeDataProvider, self).__init__(
            inputs, targets, batch_size, max_num_batches, shuffle_order, rng)
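As a sanity check of the `as_strided` rolling-window view above, here is an editorial toy example (not part of the diff) showing the windows it produces:

    import numpy as np

    series = np.arange(6.)  # array([0., 1., 2., 3., 4., 5.])
    window_size = 3
    shape = (series.shape[-1] - window_size + 1, window_size)
    strides = series.strides + (series.strides[-1],)
    windowed = np.lib.stride_tricks.as_strided(series, shape=shape, strides=strides)
    # windowed -> [[0. 1. 2.], [1. 2. 3.], [2. 3. 4.], [3. 4. 5.]]
    inputs, targets = windowed[:, :-1], windowed[:, -1]
    # inputs are the first window_size - 1 entries of each window,
    # targets the final entry: targets -> [2. 3. 4. 5.]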

    def __next__(self):
        return self.next()


class CCPPDataProvider(DataProvider):

    def __init__(self, which_set='train', input_dims=None, batch_size=10,
                 max_num_batches=-1, shuffle_order=True, rng=None):
        """Create a new Combined Cycle Power Plant data provider object.

        Args:
            which_set: One of 'train' or 'valid'. Determines which portion of
                data this object should provide.
            input_dims: Which of the four input dimensions to use. If `None`
                all are used. If an iterable of integers is provided
                (consisting of a subset of {0, 1, 2, 3}) then only the
                corresponding input dimensions are included.
            batch_size (int): Number of data points to include in each batch.
            max_num_batches (int): Maximum number of batches to iterate over
                in an epoch. If `max_num_batches * batch_size > num_data` then
                only as many batches as the data can be split into will be
                used. If set to -1 all of the data will be used.
            shuffle_order (bool): Whether to randomly permute the order of
                the data before each epoch.
            rng (RandomState): A seeded random number generator.
        """
        data_path = os.path.join(
            os.environ['MLP_DATA_DIR'], 'ccpp_data.npz')
        assert os.path.isfile(data_path), (
            'Data file does not exist at expected path: ' + data_path
        )
        # check a valid which_set was provided
        assert which_set in ['train', 'valid'], (
            'Expected which_set to be either train or valid. '
            'Got {0}'.format(which_set)
        )
        # check input_dims are valid
        if input_dims is not None:
            input_dims = set(input_dims)
            assert input_dims.issubset({0, 1, 2, 3}), (
                'input_dims should be a subset of {0, 1, 2, 3}'
            )
        loaded = np.load(data_path)
        inputs = loaded[which_set + '_inputs']
        if input_dims is not None:
            inputs = inputs[:, input_dims]
        targets = loaded[which_set + '_targets']
        super(CCPPDataProvider, self).__init__(
            inputs, targets, batch_size, max_num_batches, shuffle_order, rng)
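A minimal usage sketch for the updated providers (an editorial example; it assumes the `MLP_DATA_DIR` environment variable points at the course data directory and that the provider constructor takes `which_set` and `batch_size` as in the other providers):

    from mlp.data_providers import MNISTDataProvider

    train_data = MNISTDataProvider('train', batch_size=50)
    inputs_batch, targets_batch = train_data.next()
    # inputs_batch has shape (50, 784); targets_batch is one-hot with shape
    # (50, train_data.num_classes), i.e. (50, 10) for MNIST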
mlp/errors.py (new file, 46 lines)
@@ -0,0 +1,46 @@
# -*- coding: utf-8 -*-
"""Error functions.

This module defines error functions, with the aim of model training being to
minimise the error function given a set of inputs and target outputs.

The error functions will typically measure some concept of distance between the
model outputs and target outputs, averaged over all data points in the data set
or batch.
"""

import numpy as np


class SumOfSquaredDiffsError(object):
    """Sum of squared differences (squared Euclidean distance) error."""

    def __call__(self, outputs, targets):
        """Calculates error function given a batch of outputs and targets.

        Args:
            outputs: Array of model outputs of shape (batch_size, output_dim).
            targets: Array of target outputs of shape (batch_size, output_dim).

        Returns:
            Scalar error function value.
        """
        # TODO write your code here
        raise NotImplementedError()
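The stub is left for the lab exercise; one implementation consistent with the docstring (an editorial sketch, not the official solution) is the mean over the batch of half the squared Euclidean distance:

        # E = (1 / 2N) * sum_n ||y_n - t_n||^2
        return 0.5 * np.mean(np.sum((outputs - targets) ** 2, axis=1))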

    def grad(self, outputs, targets):
        """Calculates gradient of error function with respect to outputs.

        Args:
            outputs: Array of model outputs of shape (batch_size, output_dim).
            targets: Array of target outputs of shape (batch_size, output_dim).

        Returns:
            Gradient of error function with respect to outputs. This should be
            an array of shape (batch_size, output_dim).
        """
        # TODO write your code here
        raise NotImplementedError()
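A matching gradient sketch (again an editorial suggestion, consistent with the error above):

        # dE/dy_n = (y_n - t_n) / N
        return (outputs - targets) / outputs.shape[0]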

    def __repr__(self):
        return 'SumOfSquaredDiffsError'
mlp/initialisers.py (new file, 65 lines)
@@ -0,0 +1,65 @@
# -*- coding: utf-8 -*-
"""Parameter initialisers.

This module defines classes to initialise the parameters in a layer.
"""

import numpy as np
from mlp import DEFAULT_SEED


class ConstantInit(object):
    """Constant parameter initialiser."""

    def __init__(self, value):
        """Construct a constant parameter initialiser.

        Args:
            value: Value to initialise parameter to.
        """
        self.value = value

    def __call__(self, shape):
        return np.ones(shape=shape) * self.value


class UniformInit(object):
    """Random uniform parameter initialiser."""

    def __init__(self, low, high, rng=None):
        """Construct a random uniform parameter initialiser.

        Args:
            low: Lower bound of interval to sample from.
            high: Upper bound of interval to sample from.
            rng (RandomState): Seeded random number generator.
        """
        self.low = low
        self.high = high
        if rng is None:
            rng = np.random.RandomState(DEFAULT_SEED)
        self.rng = rng

    def __call__(self, shape):
        return self.rng.uniform(low=self.low, high=self.high, size=shape)


class NormalInit(object):
    """Random normal parameter initialiser."""

    def __init__(self, mean, std, rng=None):
        """Construct a random normal parameter initialiser.

        Args:
            mean: Mean of distribution to sample from.
            std: Standard deviation of distribution to sample from.
            rng (RandomState): Seeded random number generator.
        """
        self.mean = mean
        self.std = std
        if rng is None:
            rng = np.random.RandomState(DEFAULT_SEED)
        self.rng = rng

    def __call__(self, shape):
        return self.rng.normal(loc=self.mean, scale=self.std, size=shape)
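Usage example (editorial illustration only):

    import numpy as np
    from mlp.initialisers import ConstantInit, UniformInit

    weights_init = UniformInit(-0.1, 0.1, rng=np.random.RandomState(123))
    weights = weights_init((5, 3))  # 5 x 3 array with entries drawn from U(-0.1, 0.1)
    biases_init = ConstantInit(0.)
    biases = biases_init(3)         # array([0., 0., 0.])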

mlp/layers.py (new file, 141 lines)
@@ -0,0 +1,141 @@
# -*- coding: utf-8 -*-
"""Layer definitions.

This module defines classes which encapsulate a single layer.

These layers map input activations to output activations with the `fprop`
method and map gradients with respect to outputs to gradients with respect to
their inputs with the `bprop` method.

Some layers will have learnable parameters and so will additionally define
methods for getting and setting parameters and calculating gradients with
respect to the layer parameters.
"""

import numpy as np
import mlp.initialisers as init


class Layer(object):
    """Abstract class defining the interface for a layer."""

    def fprop(self, inputs):
        """Forward propagates activations through the layer transformation.

        Args:
            inputs: Array of layer inputs of shape (batch_size, input_dim).

        Returns:
            outputs: Array of layer outputs of shape (batch_size, output_dim).
        """
        raise NotImplementedError()

    def bprop(self, inputs, outputs, grads_wrt_outputs):
        """Back propagates gradients through a layer.

        Given gradients with respect to the outputs of the layer calculates the
        gradients with respect to the layer inputs.

        Args:
            inputs: Array of layer inputs of shape (batch_size, input_dim).
            outputs: Array of layer outputs calculated in forward pass of
                shape (batch_size, output_dim).
            grads_wrt_outputs: Array of gradients with respect to the layer
                outputs of shape (batch_size, output_dim).

        Returns:
            Array of gradients with respect to the layer inputs of shape
            (batch_size, input_dim).
        """
        raise NotImplementedError()


class LayerWithParameters(Layer):
    """Abstract class defining the interface for a layer with parameters."""

    def grads_wrt_params(self, inputs, grads_wrt_outputs):
        """Calculates gradients with respect to layer parameters.

        Args:
            inputs: Array of inputs to layer of shape (batch_size, input_dim).
            grads_wrt_outputs: Array of gradients with respect to the layer
                outputs of shape (batch_size, output_dim).

        Returns:
            List of arrays of gradients with respect to the layer parameters
            with parameter gradients appearing in the same order in the tuple
            as returned from the `get_params` method.
        """
        raise NotImplementedError()

    @property
    def params(self):
        """Returns a list of parameters of layer.

        Returns:
            List of current parameter values.
        """
        raise NotImplementedError()


class AffineLayer(LayerWithParameters):
    """Layer implementing an affine transformation of its inputs.

    This layer is parameterised by a weight matrix and bias vector.
    """

    def __init__(self, input_dim, output_dim,
                 weights_initialiser=init.UniformInit(-0.1, 0.1),
                 biases_initialiser=init.ConstantInit(0.),
                 weights_cost=None, biases_cost=None):
        """Initialises a parameterised affine layer.

        Args:
            input_dim (int): Dimension of inputs to the layer.
            output_dim (int): Dimension of the layer outputs.
            weights_initialiser: Initialiser for the weight parameters.
            biases_initialiser: Initialiser for the bias parameters.
        """
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.weights = weights_initialiser((self.output_dim, self.input_dim))
        self.biases = biases_initialiser(self.output_dim)

    def fprop(self, inputs):
        """Forward propagates activations through the layer transformation.

        For inputs `x`, outputs `y`, weights `W` and biases `b` the layer
        corresponds to `y = W.dot(x) + b`.

        Args:
            inputs: Array of layer inputs of shape (batch_size, input_dim).

        Returns:
            outputs: Array of layer outputs of shape (batch_size, output_dim).
        """
        # TODO write your code here
        raise NotImplementedError()
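With `weights` of shape (output_dim, input_dim) and a batch of row-vector inputs, one implementation sketch consistent with `y = W.dot(x) + b` (an editorial suggestion, not the provided solution) is:

        return inputs.dot(self.weights.T) + self.biases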

    def grads_wrt_params(self, inputs, grads_wrt_outputs):
        """Calculates gradients with respect to layer parameters.

        Args:
            inputs: array of inputs to layer of shape (batch_size, input_dim)
            grads_wrt_outputs: array of gradients with respect to the layer
                outputs of shape (batch_size, output_dim)

        Returns:
            list of arrays of gradients with respect to the layer parameters
            `[grads_wrt_weights, grads_wrt_biases]`.
        """
        # TODO write your code here
        raise NotImplementedError()
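A corresponding sketch for the parameter gradients (editorial suggestion):

        grads_wrt_weights = grads_wrt_outputs.T.dot(inputs)  # shape (output_dim, input_dim)
        grads_wrt_biases = grads_wrt_outputs.sum(axis=0)     # shape (output_dim,)
        return [grads_wrt_weights, grads_wrt_biases]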

    @property
    def params(self):
        """A list of layer parameter values: `[weights, biases]`."""
        return [self.weights, self.biases]

    def __repr__(self):
        return 'AffineLayer(input_dim={0}, output_dim={1})'.format(
            self.input_dim, self.output_dim)
mlp/learning_rules.py (new file, 162 lines)
@@ -0,0 +1,162 @@
# -*- coding: utf-8 -*-
"""Learning rules.

This module contains classes implementing gradient based learning rules.
"""

import numpy as np


class GradientDescentLearningRule(object):
    """Simple (stochastic) gradient descent learning rule.

    For a scalar error function `E(p[0], p[1] ... )` of some set of
    potentially multidimensional parameters this attempts to find a local
    minimum of the loss function by applying updates to each parameter of the
    form

        p[i] := p[i] - learning_rate * dE/dp[i]

    With `learning_rate` a positive scaling parameter.

    The error function used in successive applications of these updates may be
    a stochastic estimator of the true error function (e.g. when the error with
    respect to only a subset of data-points is calculated) in which case this
    will correspond to a stochastic gradient descent learning rule.
    """

    def __init__(self, learning_rate=1e-3):
        """Creates a new learning rule object.

        Args:
            learning_rate: A positive scalar to scale gradient updates to the
                parameters by. This needs to be carefully set - if too large
                the learning dynamic will be unstable and may diverge, while
                if set too small learning will proceed very slowly.

        """
        assert learning_rate > 0., 'learning_rate should be positive.'
        self.learning_rate = learning_rate

    def initialise(self, params):
        """Initialises the state of the learning rule for a set of parameters.

        This must be called before `update_params` is first called.

        Args:
            params: A list of the parameters to be optimised. Note these will
                be updated *in-place* to avoid reallocating arrays on each
                update.
        """
        self.params = params

    def reset(self):
        """Resets any additional state variables to their initial values.

        For this learning rule there are no additional state variables so we
        do nothing here.
        """
        pass

    def update_params(self, grads_wrt_params):
        """Applies a single gradient descent update to all parameters.

        All parameter updates are performed using in-place operations and so
        nothing is returned.

        Args:
            grads_wrt_params: A list of gradients of the scalar loss function
                with respect to each of the parameters passed to `initialise`
                previously, with this list expected to be in the same order.
        """
        for param, grad in zip(self.params, grads_wrt_params):
            param -= self.learning_rate * grad
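A toy usage example of the rule (editorial illustration; the arrays stand in for real model parameters):

    import numpy as np
    from mlp.learning_rules import GradientDescentLearningRule

    params = [np.zeros((2, 3)), np.zeros(3)]  # stand-in parameter list
    grads = [np.ones((2, 3)), np.ones(3)]     # stand-in gradients
    rule = GradientDescentLearningRule(learning_rate=0.1)
    rule.initialise(params)
    rule.update_params(grads)                 # updates params in place
    # every entry of params is now -0.1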


class MomentumLearningRule(GradientDescentLearningRule):
    """Gradient descent with momentum learning rule.

    This extends the basic gradient learning rule by introducing extra
    momentum state variables for each parameter. These can help the learning
    dynamic overcome shallow local minima and speed convergence when
    making multiple successive steps in a similar direction in parameter space.

    For parameter p[i] and corresponding momentum m[i] the updates for a
    scalar loss function `L` are of the form

        m[i] := mom_coeff * m[i] - learning_rate * dL/dp[i]
        p[i] := p[i] + m[i]

    with `learning_rate` a positive scaling parameter for the gradient updates
    and `mom_coeff` a value in [0, 1] that determines how much 'friction' there
    is in the system and so how quickly previous momentum contributions decay.
    """

    def __init__(self, learning_rate=1e-3, mom_coeff=0.9):
        """Creates a new learning rule object.

        Args:
            learning_rate: A positive scalar to scale gradient updates to the
                parameters by. This needs to be carefully set - if too large
                the learning dynamic will be unstable and may diverge, while
                if set too small learning will proceed very slowly.
            mom_coeff: A scalar in the range [0, 1] inclusive. This determines
                the contribution of the previous momentum value to the value
                after each update. If equal to 0 the momentum is set to exactly
                the negative scaled gradient each update and so this rule
                collapses to standard gradient descent. If equal to 1 the
                momentum will just be decremented by the scaled gradient at
                each update. This is equivalent to simulating the dynamic in
                a frictionless system. Due to energy conservation the loss
                of 'potential energy' as the dynamic moves down the loss
                function surface will lead to an increasingly large 'kinetic
                energy' and so speed, meaning the updates will become
                increasingly large, potentially unstably so. Typically a value
                less than but close to 1 will avoid these issues and cause the
                dynamic to converge to a local minimum where the gradients are
                by definition zero.
        """
        super(MomentumLearningRule, self).__init__(learning_rate)
        assert mom_coeff >= 0. and mom_coeff <= 1., (
            'mom_coeff should be in the range [0, 1].'
        )
        self.mom_coeff = mom_coeff

    def initialise(self, params):
        """Initialises the state of the learning rule for a set of parameters.

        This must be called before `update_params` is first called.

        Args:
            params: A list of the parameters to be optimised. Note these will
                be updated *in-place* to avoid reallocating arrays on each
                update.
        """
        super(MomentumLearningRule, self).initialise(params)
        self.moms = []
        for param in self.params:
            self.moms.append(np.zeros_like(param))

    def reset(self):
        """Resets any additional state variables to their initial values.

        For this learning rule this corresponds to zeroing all the momenta.
        """
        for mom in self.moms:
            mom *= 0.

    def update_params(self, grads_wrt_params):
        """Applies a single update to all parameters.

        All parameter updates are performed using in-place operations and so
        nothing is returned.

        Args:
            grads_wrt_params: A list of gradients of the scalar loss function
                with respect to each of the parameters passed to `initialise`
                previously, with this list expected to be in the same order.
        """
        for param, mom, grad in zip(self.params, self.moms, grads_wrt_params):
            mom *= self.mom_coeff
            mom -= self.learning_rate * grad
            param += mom

mlp/models.py (new file, 67 lines)
@@ -0,0 +1,67 @@
# -*- coding: utf-8 -*-
"""Model definitions.

This module implements objects encapsulating learnable models of input-output
relationships. The model objects implement methods for forward propagating
the inputs through the transformation(s) defined by the model to produce
outputs (and intermediate states) and for calculating gradients of scalar
functions of the outputs with respect to the model parameters.
"""

from mlp.layers import LayerWithParameters


class SingleLayerModel(object):
    """A model consisting of a single transformation layer."""

    def __init__(self, layer):
        """Create a new single layer model instance.

        Args:
            layer: The layer object defining the model architecture.
        """
        self.layer = layer

    @property
    def params(self):
        """A list of all of the parameters of the model."""
        return self.layer.params

    def fprop(self, inputs):
        """Calculate the model outputs corresponding to a batch of inputs.

        Args:
            inputs: Batch of inputs to the model.

        Returns:
            List which is a concatenation of the model inputs and model
            outputs, this being done for consistency of the interface with
            multi-layer models for which `fprop` returns a list of
            activations through all intermediate layers of the model,
            including the inputs and outputs.
        """
        activations = [inputs, self.layer.fprop(inputs)]
        return activations

    def grads_wrt_params(self, activations, grads_wrt_outputs):
        """Calculates gradients with respect to the model parameters.

        Args:
            activations: List of all activations from forward pass through
                model using `fprop`.
            grads_wrt_outputs: Gradient with respect to the model outputs of
                the scalar function parameter gradients are being calculated
                for.

        Returns:
            List of gradients of the scalar function with respect to all model
            parameters.
        """
        return self.layer.grads_wrt_params(activations[0], grads_wrt_outputs)

    def params_cost(self):
        """Calculates the parameter dependent cost term of the model."""
        return self.layer.params_cost()

    def __repr__(self):
        return 'SingleLayerModel(' + str(self.layer) + ')'
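Example construction (editorial illustration; `fprop` will only work once the `AffineLayer` TODO stubs above are completed):

    from mlp.layers import AffineLayer
    from mlp.models import SingleLayerModel

    model = SingleLayerModel(AffineLayer(input_dim=4, output_dim=1))
    print(model)         # SingleLayerModel(AffineLayer(input_dim=4, output_dim=1))
    print(model.params)  # [weights array of shape (1, 4), biases array of shape (1,)]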

mlp/optimisers.py (new file, 134 lines)
@@ -0,0 +1,134 @@
# -*- coding: utf-8 -*-
"""Model optimisers.

This module contains objects implementing (batched) stochastic gradient descent
based optimisation of models.
"""

import time
import logging
from collections import OrderedDict
import numpy as np


logger = logging.getLogger(__name__)


class Optimiser(object):
    """Basic model optimiser."""

    def __init__(self, model, error, learning_rule, train_dataset,
                 valid_dataset=None, data_monitors=None):
        """Create a new optimiser instance.

        Args:
            model: The model to optimise.
            error: The scalar error function to minimise.
            learning_rule: Gradient based learning rule to use to minimise
                error.
            train_dataset: Data provider for training set data batches.
            valid_dataset: Data provider for validation set data batches.
            data_monitors: Dictionary of functions evaluated on targets and
                model outputs (averaged across both full training and
                validation data sets) to monitor during training in addition
                to the error. Keys should correspond to a string label for
                the statistic being evaluated.
        """
        self.model = model
        self.error = error
        self.learning_rule = learning_rule
        self.learning_rule.initialise(self.model.params)
        self.train_dataset = train_dataset
        self.valid_dataset = valid_dataset
        self.data_monitors = OrderedDict([('error', error)])
        if data_monitors is not None:
            self.data_monitors.update(data_monitors)

    def do_training_epoch(self):
        """Do a single training epoch.

        This iterates through all batches in training dataset, for each
        calculating the gradient of the estimated error given the batch with
        respect to all the model parameters and then updates the model
        parameters according to the learning rule.
        """
        for inputs_batch, targets_batch in self.train_dataset:
            activations = self.model.fprop(inputs_batch)
            grads_wrt_outputs = self.error.grad(activations[-1], targets_batch)
            grads_wrt_params = self.model.grads_wrt_params(
                activations, grads_wrt_outputs)
            self.learning_rule.update_params(grads_wrt_params)

    def eval_monitors(self, dataset, label):
        """Evaluates the monitors for the given dataset.

        Args:
            dataset: Dataset to perform evaluation with.
            label: Tag to add to end of monitor keys to identify dataset.

        Returns:
            OrderedDict of monitor values evaluated on dataset.
        """
        data_mon_vals = OrderedDict([(key + label, 0.) for key
                                     in self.data_monitors.keys()])
        for inputs_batch, targets_batch in dataset:
            activations = self.model.fprop(inputs_batch)
            for key, data_monitor in self.data_monitors.items():
                data_mon_vals[key + label] += data_monitor(
                    activations[-1], targets_batch)
        for key, data_monitor in self.data_monitors.items():
            data_mon_vals[key + label] /= dataset.num_batches
        return data_mon_vals

    def get_epoch_stats(self):
        """Computes training statistics for an epoch.

        Returns:
            An OrderedDict with keys corresponding to the statistic labels and
            values corresponding to the value of the statistic.
        """
        epoch_stats = OrderedDict()
        epoch_stats.update(self.eval_monitors(self.train_dataset, '(train)'))
        if self.valid_dataset is not None:
            epoch_stats.update(self.eval_monitors(
                self.valid_dataset, '(valid)'))
        return epoch_stats

    def log_stats(self, epoch, epoch_time, stats):
        """Outputs stats for a training epoch to a logger.

        Args:
            epoch (int): Epoch counter.
            epoch_time: Time taken in seconds for the epoch to complete.
            stats: Monitored stats for the epoch.
        """
        logger.info('Epoch {0}: {1:.1f}s to complete\n    {2}'.format(
            epoch, epoch_time,
            ', '.join(['{0}={1:.2e}'.format(k, v) for (k, v) in stats.items()])
        ))

    def train(self, num_epochs, stats_interval=5):
        """Trains a model for a set number of epochs.

        Args:
            num_epochs: Number of epochs (complete passes through training
                dataset) to train for.
            stats_interval: Training statistics will be recorded and logged
                every `stats_interval` epochs.

        Returns:
            Tuple with first value being an array of training run statistics
            and the second being a dict mapping the labels for the statistics
            recorded to their column index in the array.
        """
        run_stats = [list(self.get_epoch_stats().values())]
        for epoch in range(1, num_epochs + 1):
            start_time = time.process_time()
            self.do_training_epoch()
            epoch_time = time.process_time() - start_time
            if epoch % stats_interval == 0:
                stats = self.get_epoch_stats()
                self.log_stats(epoch, epoch_time, stats)
                run_stats.append(list(stats.values()))
        return np.array(run_stats), {k: i for i, k in enumerate(stats.keys())}
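Putting the pieces together, a training run might look like the following sketch (an editorial example of how these modules are intended to fit together; it assumes the TODO stubs in mlp/errors.py and mlp/layers.py have been completed, that MLP_DATA_DIR is set, and that the target array shapes line up with the model output shape):

    import numpy as np
    from mlp.data_providers import CCPPDataProvider
    from mlp.errors import SumOfSquaredDiffsError
    from mlp.layers import AffineLayer
    from mlp.models import SingleLayerModel
    from mlp.learning_rules import GradientDescentLearningRule
    from mlp.optimisers import Optimiser

    rng = np.random.RandomState(123)
    train_data = CCPPDataProvider('train', batch_size=100, rng=rng)
    valid_data = CCPPDataProvider('valid', batch_size=100, rng=rng)
    model = SingleLayerModel(AffineLayer(input_dim=4, output_dim=1))
    error = SumOfSquaredDiffsError()
    learning_rule = GradientDescentLearningRule(learning_rate=0.01)
    optimiser = Optimiser(model, error, learning_rule, train_data, valid_data)
    run_stats, stat_keys = optimiser.train(num_epochs=20, stats_interval=5)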

@@ -239,4 +239,4 @@
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
(File diff suppressed because one or more lines are too long)

notebooks/02_Single_layer_models.ipynb (new file, 1124 lines)
(File diff suppressed because it is too large)
BIN  notebooks/res/._fprop-bprop-block-diagram.png (new file, binary file not shown)
BIN  notebooks/res/._jupyter-dashboard.png (new file, binary file not shown)
BIN  notebooks/res/._jupyter-notebook-interface.png (new file, binary file not shown)
BIN  notebooks/res/._singleLayerNetBP-1.png (new file, binary file not shown)
BIN  notebooks/res/._singleLayerNetPredict.png (new file, binary file not shown)
BIN  notebooks/res/._singleLayerNetWts-1.png (new file, binary file not shown)
BIN  notebooks/res/._singleLayerNetWtsEqns-1.png (new file, binary file not shown)
BIN  notebooks/res/fprop-bprop-block-diagram.pdf (new file, binary file not shown)
BIN  notebooks/res/fprop-bprop-block-diagram.png (new file, 6.9 KiB image)
notebooks/res/fprop-bprop-block-diagram.tex (new file, 65 lines)
@@ -0,0 +1,65 @@
\documentclass[tikz]{standalone}

\usepackage{amsmath}
\usepackage{tikz}
\usetikzlibrary{arrows}
\usetikzlibrary{calc}
\usepackage{ifthen}

\newcommand{\vct}[1]{\boldsymbol{#1}}
\newcommand{\pd}[2]{\frac{\partial #1}{\partial #2}}

\tikzstyle{fprop} = [draw,fill=blue!20,minimum size=2em,align=center]
\tikzstyle{bprop} = [draw,fill=red!20,minimum size=2em,align=center]

\begin{document}

\begin{tikzpicture}[xscale=1.75] %
  % define number of layers
  \def\nl{2};
  % model input
  \node at (0, 0) (input) {$\vct{x}$};
  % draw fprop through model layers
  \foreach \l in {0,...,\nl} {
    \node[fprop] at (2 * \l + 1, 0) (fprop\l) {\texttt{layers[\l]} \\ \texttt{.fprop}};
    \ifthenelse{\l > 0}{
      \node at (2 * \l, 0) (hidden\l) {$\vct{h}_\l$};
      \draw[->] (hidden\l) -- (fprop\l);
      \draw[->] let \n1={\l - 1} in (fprop\n1) -- (hidden\l);
    }{
      \draw[->] (input) -- (fprop\l);
    }
  }
  % model output
  \node at (2 * \nl + 2, 0) (output) {$\mathbf{y}$};
  % error function
  \node[fprop] at (2 * \nl + 3, 0) (errorfunc) {\texttt{error}};
  % error value
  \node at (2 * \nl + 3, -1) (error) {$\bar{E}$};
  % targets
  \node at (2 * \nl + 4, -1) (tgt) {$\vct{t}$};
  % error gradient
  \node[bprop] at (2 * \nl + 3, -2) (errorgrad) {\texttt{error} \\ \texttt{.grad}};
  % gradient wrt outputs
  \node at (2 * \nl + 2, -2) (gradoutput) {$\pd{\bar{E}}{\vct{y}}$};
  \draw[->] (fprop\nl) -- (output);
  \draw[->] (output) -- (errorfunc);
  \draw[->] (errorfunc) -- (error);
  \draw[->] (error) -- (errorgrad);
  \draw[->] (errorgrad) -- (gradoutput);
  \draw[->] (tgt) |- (errorfunc);
  \draw[->] (tgt) |- (errorgrad);
  \foreach \l in {0,...,\nl} {
    \node[bprop] at (2 * \l + 1, -2) (bprop\l) {\texttt{layers[\l]} \\ \texttt{.bprop}};
    \ifthenelse{\l > 0}{
      \node at (2 * \l, -2) (grad\l) {$\pd{\bar{E}}{\vct{h}_\l}$};
      \draw[<-] (grad\l) -- (bprop\l);
      \draw[<-] let \n1={\l - 1} in (bprop\n1) -- (grad\l);
    }{}
  }
  \node at (0, -2) (gradinput) {$\pd{\bar{E}}{\vct{x}}$};
  \draw[->] (bprop0) -- (gradinput);
  \draw[->] (gradoutput) -- (bprop\nl);
\end{tikzpicture}

\end{document}