Merge pull request #57 from CSTR-Edinburgh/mlp2017-8/lab4

Init Lab_4
This commit is contained in:
AntreasAntoniou 2017-10-15 15:46:44 +01:00 committed by GitHub
commit 018bb05de2
5 changed files with 2445 additions and 94 deletions

View File

@ -63,3 +63,81 @@ class NormalInit(object):
def __call__(self, shape): def __call__(self, shape):
return self.rng.normal(loc=self.mean, scale=self.std, size=shape) return self.rng.normal(loc=self.mean, scale=self.std, size=shape)
class GlorotUniformInit(object):
    """Glorot and Bengio (2010) random uniform weights initialiser.

    Initialises a two-dimensional parameter array using the 'normalized
    initialisation' scheme suggested in [1] which attempts to maintain a
    roughly constant variance in the activations and backpropagated gradients
    of a multi-layer model consisting of interleaved affine and logistic
    sigmoidal transformation layers.

    Weights are sampled from a zero-mean uniform distribution with standard
    deviation `gain * sqrt(2 / (input_dim + output_dim))` where `input_dim`
    and `output_dim` are the two entries of the weight matrix shape.

    References:
      [1]: Understanding the difficulty of training deep feedforward neural
           networks, Glorot and Bengio (2010)
    """

    def __init__(self, gain=1., rng=None):
        """Construct a normalised initialisation random initialiser object.

        Args:
            gain: Multiplicative factor to scale initialised weights by.
                Recommended values is 1 for affine layers followed by
                logistic sigmoid layers (or another affine layer).
            rng (RandomState): Seeded random number generator. If `None` a
                new `RandomState` seeded with `DEFAULT_SEED` is created.
        """
        self.gain = gain
        if rng is None:
            rng = np.random.RandomState(DEFAULT_SEED)
        self.rng = rng

    def __call__(self, shape):
        """Sample an array of weights with the requested shape.

        Args:
            shape: Length-two sequence giving the dimensions of the weight
                matrix to initialise.

        Returns:
            Array of shape `shape` with entries drawn uniformly from
            `[-half_width, half_width]`.

        Raises:
            AssertionError: If `shape` does not have exactly two entries.
        """
        assert len(shape) == 2, (
            'Initialiser should only be used for two dimensional arrays.')
        std = self.gain * (2. / (shape[0] + shape[1]))**0.5
        # A uniform distribution on [-a, a] has standard deviation
        # a / sqrt(3), so a = sqrt(3) * std gives the target deviation.
        half_width = 3.**0.5 * std
        return self.rng.uniform(low=-half_width, high=half_width, size=shape)
class GlorotNormalInit(object):
    """Glorot and Bengio (2010) random normal weights initialiser.

    Initialises a two-dimensional parameter array using the 'normalized
    initialisation' scheme suggested in [1] which attempts to maintain a
    roughly constant variance in the activations and backpropagated gradients
    of a multi-layer model consisting of interleaved affine and logistic
    sigmoidal transformation layers.

    Weights are sampled from a zero-mean normal distribution with standard
    deviation `gain * sqrt(2 / (input_dim + output_dim))` where `input_dim`
    and `output_dim` are the two entries of the weight matrix shape.

    References:
      [1]: Understanding the difficulty of training deep feedforward neural
           networks, Glorot and Bengio (2010)
    """

    def __init__(self, gain=1., rng=None):
        """Construct a normalised initialisation random initialiser object.

        Args:
            gain: Multiplicative factor to scale initialised weights by.
                Recommended values is 1 for affine layers followed by
                logistic sigmoid layers (or another affine layer).
            rng (RandomState): Seeded random number generator. If `None` a
                new `RandomState` seeded with `DEFAULT_SEED` is created.
        """
        self.gain = gain
        if rng is None:
            rng = np.random.RandomState(DEFAULT_SEED)
        self.rng = rng

    def __call__(self, shape):
        """Sample an array of weights with the requested shape.

        Args:
            shape: Length-two sequence giving the dimensions of the weight
                matrix to initialise.

        Returns:
            Array of shape `shape` with entries drawn from a zero-mean
            normal distribution with the standard deviation given above.

        Raises:
            AssertionError: If `shape` does not have exactly two entries.
        """
        # Only the first two entries of `shape` enter the scale below, so a
        # higher-dimensional shape would silently get a scale that ignores
        # the remaining dimensions; reject it explicitly instead.
        assert len(shape) == 2, (
            'Initialiser should only be used for two dimensional arrays.')
        std = self.gain * (2. / (shape[0] + shape[1]))**0.5
        return self.rng.normal(loc=0., scale=std, size=shape)

View File

@ -257,3 +257,63 @@ class SoftmaxLayer(Layer):
def __repr__(self): def __repr__(self):
return 'SoftmaxLayer' return 'SoftmaxLayer'
class RadialBasisFunctionLayer(Layer):
    """Layer implementing projection to a grid of radial basis functions.

    Each input dimension is passed through a set of Gaussian-shaped basis
    functions `exp(-(x - centre)**2 / scale**2)` and the resulting values for
    all input dimensions are concatenated along the last axis.
    """

    def __init__(self, grid_dim, intervals=((0., 1.),)):
        """Creates a radial basis function layer object.

        Args:
            grid_dim: Integer specifying how many basis function to use in
                grid across input space per dimension. The centres array
                built here has shape
                `(len(intervals), grid_dim**len(intervals))`.
            intervals: Sequence of intervals (two element lists or tuples)
                specifying extents of axis-aligned region in input-space to
                tile basis functions in grid across. For example for a 2D
                input space spanning [0, 1] x [0, 1] use
                intervals=[[0, 1], [0, 1]].
        """
        # Stored so that `__repr__` can report it.
        self.grid_dim = grid_dim
        # One row of centre coordinates per input dimension.
        self.centres = np.array(np.meshgrid(*[
            np.linspace(low, high, grid_dim) for (low, high) in intervals])
        ).reshape((len(intervals), -1))
        # Column vector of per-dimension widths, shape (len(intervals), 1),
        # so it broadcasts across the centres of each dimension.
        self.scales = np.array([
            [(high - low) * 1. / grid_dim] for (low, high) in intervals])

    def fprop(self, inputs):
        """Forward propagates activations through the layer transformation.

        Args:
            inputs: Array of layer inputs of shape (batch_size, input_dim).

        Returns:
            outputs: Array of layer outputs of shape (batch_size, output_dim)
                where output_dim is input_dim times the number of centres
                per input dimension.
        """
        return np.exp(-(inputs[..., None] - self.centres[None, ...])**2 /
                      self.scales**2).reshape((inputs.shape[0], -1))

    def bprop(self, inputs, outputs, grads_wrt_outputs):
        """Back propagates gradients through a layer.

        Given gradients with respect to the outputs of the layer calculates the
        gradients with respect to the layer inputs.

        Args:
            inputs: Array of layer inputs of shape (batch_size, input_dim).
            outputs: Array of layer outputs calculated in forward pass of
                shape (batch_size, output_dim).
            grads_wrt_outputs: Array of gradients with respect to the layer
                outputs of shape (batch_size, output_dim).

        Returns:
            Array of gradients with respect to the layer inputs of shape
            (batch_size, input_dim).
        """
        num_basis = self.centres.shape[1]
        # Undo the flattening applied at the end of `fprop`.
        per_dim_shape = (inputs.shape[0], -1, num_basis)
        scaled_diffs = (
            (inputs[..., None] - self.centres[None, ...]) / self.scales**2)
        # d/dx exp(-(x - c)**2 / s**2) = -2 * (x - c) / s**2 * exp(...), and
        # the exponential term is exactly the forward-pass output, so it
        # must be included in the chain-rule product.
        return -2 * (
            scaled_diffs *
            outputs.reshape(per_dim_shape) *
            grads_wrt_outputs.reshape(per_dim_shape)
        ).sum(-1)

    def __repr__(self):
        return 'RadialBasisFunctionLayer(grid_dim={0})'.format(self.grid_dim)

34
mlp/schedulers.py Normal file
View File

@ -0,0 +1,34 @@
# -*- coding: utf-8 -*-
"""Training schedulers.
This module contains classes implementing schedulers which control the
evolution of learning rule hyperparameters (such as learning rate) over a
training run.
"""
import numpy as np
class ConstantLearningRateScheduler(object):
    """Scheduler which keeps the learning rate fixed at a single value.

    Acts as a minimal example of the scheduler interface.
    """

    def __init__(self, learning_rate):
        """Store the learning rate that this scheduler will apply.

        Args:
            learning_rate: Value written to the learning rule's
                `learning_rate` attribute on every update.
        """
        self.learning_rate = learning_rate

    def update_learning_rule(self, learning_rule, epoch_number):
        """Write the stored learning rate on to the learning rule.

        Intended to be run at the beginning of each epoch.

        Args:
            learning_rule: Learning rule object being used in training run;
                its `learning_rate` attribute is overwritten.
            epoch_number: Integer index of training epoch about to be run.
                Not used by this scheduler.
        """
        setattr(learning_rule, 'learning_rate', self.learning_rate)

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,382 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Generalisation and overfitting\n",
"\n",
"In this notebook we will explore the issue of overfitting and how we can measure how well the models we train generalise their predictions to unseen data. This will build upon the introduction to generalisation given in the [fourth lecture](http://www.inf.ed.ac.uk/teaching/courses/mlp/2016/mlp04-learn.pdf)."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Exercise: overfitting and model complexity in a 1D regression problem\n",
"\n",
"As an exercise we will consider a regression problem. In particular we will attempt to use a multiple layer network model to learn to predict output values from inputs, given a fixed set of (noisy) observations of the underlying functional relationship between inputs and outputs. The aim of the exercise will be to visualise how increasing the complexity of the model we fit to the training data affects the ability of the model to make predictions across the input space.\n",
"\n",
"### Function\n",
"\n",
"To keep things simple we will consider a single input-output function defined by a fourth degree polynomial (quartic)\n",
"\n",
"$$ f(x) = 10 x^4 - 17 x^3 + 8 x^2 - x $$\n",
"\n",
"with the observed values being the function values plus zero-mean Gaussian noise\n",
"\n",
"$$ y = f(x) + 0.01 \\epsilon \\qquad \\epsilon \\sim \\mathcal{N}\\left(\\cdot;\\,0,\\,1\\right) $$\n",
"\n",
"The inputs will be drawn from the uniform distribution on $[0, 1]$.\n",
"\n",
"First import the necessary modules and seed the random number generator by running the cell below."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline\n",
"plt.style.use('ggplot')\n",
"seed = 17102016 \n",
"rng = np.random.RandomState(seed)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Write code in the cell below to calculate a polynomial function of one dimensional inputs. \n",
"\n",
"If $\\boldsymbol{c}$ is a length $P$ vector of coefficients corresponding to increasing powers in the polynomial (starting from the constant zero power term up to the $(P-1)^{\\textrm{th}}$ power) the function should correspond to the following\n",
"\n",
"\\begin{equation}\n",
" f_{\\textrm{polynomial}}(x,\\ \\boldsymbol{c}) = \\sum_{p=0}^{P-1} \\left( c_p x^p \\right)\n",
"\\end{equation}"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def polynomial_function(inputs, coefficients):\n",
" \"\"\"Calculates polynomial with given coefficients of an array of inputs.\n",
" \n",
" Args:\n",
" inputs: One-dimensional array of input values of shape (num_inputs,)\n",
" coefficients: One-dimensional array of polynomial coefficient terms\n",
" with `coefficients[0]` corresponding to the coefficient for the\n",
" zero order term in the polynomial (constant) and `coefficients[-1]`\n",
" corresponding to the highest order term.\n",
" \n",
" Returns:\n",
" One dimensional array of output values of shape (num_inputs,)\n",
" \n",
" \"\"\"\n",
" raise NotImplementedError()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Run the cell below to test your implementation."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"ename": "NotImplementedError",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNotImplementedError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-3-432f12de57d8>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mtest_inputs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0.\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0.5\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1.\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m2.\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mtest_outputs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1.\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1.5\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m6.\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m21.\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m assert polynomial_function(test_inputs, test_coefficients).shape == (4,), (\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0;34m'Function gives wrong shape output.'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m )\n",
"\u001b[0;32m<ipython-input-2-2d034824fdbe>\u001b[0m in \u001b[0;36mpolynomial_function\u001b[0;34m(inputs, coefficients)\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m \"\"\"\n\u001b[0;32m---> 15\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mNotImplementedError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mNotImplementedError\u001b[0m: "
]
}
],
"source": [
"test_coefficients = np.array([-1., 3., 4.])\n",
"test_inputs = np.array([0., 0.5, 1., 2.])\n",
"test_outputs = np.array([-1., 1.5, 6., 21.])\n",
"assert polynomial_function(test_inputs, test_coefficients).shape == (4,), (\n",
" 'Function gives wrong shape output.'\n",
")\n",
"assert np.allclose(polynomial_function(test_inputs, test_coefficients), test_outputs), (\n",
" 'Function gives incorrect output values.'\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We now need to use the random number generator to sample input values and calculate the corresponding target outputs using your polynomial implementation with the relevant coefficients for our function. Do this by running the cell below."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"coefficients = np.array([0, -1., 8., -17., 10.])\n",
"input_dim, output_dim = 1, 1\n",
"noise_std = 0.01\n",
"num_data = 80\n",
"inputs = rng.uniform(size=(num_data, input_dim))\n",
"epsilons = rng.normal(size=num_data)\n",
"targets = (polynomial_function(inputs[:, 0], coefficients) + \n",
" epsilons * noise_std)[:, None]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We will split the generated data points into equal-sized training and validation data sets and use these to create data provider objects which we can use to train models in our framework. As the dataset is small here we will use a batch size equal to the size of the data set. Run the cell below to split the data and set up the data provider objects."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from mlp.data_providers import DataProvider\n",
"num_train = num_data // 2\n",
"batch_size = num_train\n",
"inputs_train, targets_train = inputs[:num_train], targets[:num_train]\n",
"inputs_valid, targets_valid = inputs[num_train:], targets[num_train:]\n",
"train_data = DataProvider(inputs_train, targets_train, batch_size=batch_size, rng=rng)\n",
"valid_data = DataProvider(inputs_valid, targets_valid, batch_size=batch_size, rng=rng)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can now visualise the data we will be modelling. Run the cell below to plot the target outputs against inputs for both the training and validation sets. Note the clear underlying smooth functional relationship evident in the noisy data."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fig = plt.figure(figsize=(8, 4))\n",
"ax = fig.add_subplot(111)\n",
"ax.plot(inputs_train[:, 0], targets_train[:, 0], '.', label='training data')\n",
"ax.plot(inputs_valid[:, 0], targets_valid[:, 0], '.', label='validation data')\n",
"ax.set_xlabel('Inputs $x$', fontsize=14)\n",
"ax.set_ylabel('Ouputs $y$', fontsize=14)\n",
"ax.legend(loc='best')\n",
"fig.tight_layout()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Model\n",
"\n",
"We will fit models with a varying number of parameters to the training data. As multi-layer logistic sigmoid models do not tend to perform well in regression tasks like this we will instead use a [radial basis function (RBF) network](https://en.wikipedia.org/wiki/Radial_basis_function_network).\n",
"\n",
"This model predicts the output as the weighted sum of basis functions (here Gaussian-like bumps) tiled across the input space. The cell below generates a random set of weights and bias for an RBF network and plots the modelled input-output function across inputs $[0, 1]$. Run the cell below for several different numbers of weight parameters (specified with the `num_weights` variable) to get a feel for the sort of predictions the RBF network models produce."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"num_weights = 15\n",
"weights_scale = 1.\n",
"bias_scale = 1.\n",
"\n",
"def basis_function(x, centre, scale):\n",
" return np.exp(-(x - centre)**2 / scale**2)\n",
"\n",
"weights = rng.normal(size=num_weights) * weights_scale\n",
"bias = rng.normal() * bias_scale\n",
"\n",
"centres = np.linspace(0, 1, weights.shape[0])\n",
"scale = 1. / weights.shape[0]\n",
"\n",
"xs = np.linspace(0, 1, 200)\n",
"ys = np.zeros(xs.shape[0])\n",
"\n",
"fig = plt.figure(figsize=(12, 4))\n",
"ax = fig.add_subplot(1, 1, 1)\n",
"for weight, centre in zip(weights, centres):\n",
" ys += weight * basis_function(xs, centre, scale)\n",
"ys += bias\n",
"ax.plot(xs, ys)\n",
"ax.set_xlabel('Input', fontsize=14)\n",
"ax.set_ylabel('Output', fontsize=14)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You do not need to read in to the details of how to implement this model. All of the additional code you need to fit RBF networks is provided in the `RadialBasisFunctionLayer` in the `mlp.layers` module. The `RadialBasisFunctionLayer` class has the same interface as the layer classes we encountered in the previous lab, defining both `fprop` and `bprop` methods, and we can therefore include it as a layer in a `MultipleLayerModel` as with any other layer. \n",
"\n",
"Here we will use the `RadialBasisFunctionLayer` as the first layer in a two layer model. This first layer calculates the basis function terms which are then weighted and summed together in an `AffineLayer`, the second and final layer. This illustrates the advantage of using a modular modelling framework - we can reuse the code we previously implemented to train a quite different model architecture just by defining a new layer class. \n",
"\n",
"Run the cell below to run some necessary setup code."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from mlp.models import MultipleLayerModel\n",
"from mlp.layers import AffineLayer, RadialBasisFunctionLayer\n",
"from mlp.errors import SumOfSquaredDiffsError\n",
"from mlp.initialisers import ConstantInit, UniformInit\n",
"from mlp.learning_rules import GradientDescentLearningRule\n",
"from mlp.optimisers import Optimiser\n",
"\n",
"# Regression problem therefore use sum of squared differences error\n",
"error = SumOfSquaredDiffsError()\n",
"# Use basic gradient descent learning rule with fixed learning rate\n",
"learning_rule = GradientDescentLearningRule(0.1)\n",
"# Initialise weights from uniform distribution and zero bias\n",
"weights_init = UniformInit(-0.1, 0.1)\n",
"biases_init = ConstantInit(0.)\n",
"# Train all models for 2000 epochs\n",
"num_epoch = 2000"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The cell below defines RBF network models with varying number of weight parameters (equal to the number of basis functions) and fits each to the training set, recording the final training and validation set errors for the fitted models. Run it now to fit the models and calculate the error values."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"num_weight_list = [2, 5, 10, 25, 50, 100]\n",
"models = []\n",
"train_errors = []\n",
"valid_errors = []\n",
"for num_weight in num_weight_list:\n",
" model = MultipleLayerModel([\n",
" RadialBasisFunctionLayer(num_weight),\n",
" AffineLayer(input_dim * num_weight, output_dim, \n",
" weights_init, biases_init)\n",
" ])\n",
" optimiser = Optimiser(model, error, learning_rule, \n",
" train_data, valid_data)\n",
" print('-' * 80)\n",
" print('Training model with {0} weights'.format(num_weight))\n",
" print('-' * 80)\n",
" _ = optimiser.train(num_epoch, -1)\n",
" outputs_train = model.fprop(inputs_train)[-1]\n",
" outputs_valid = model.fprop(inputs_valid)[-1]\n",
" models.append(model)\n",
" train_errors.append(error(outputs_train, targets_train))\n",
" valid_errors.append(error(outputs_valid, targets_valid))\n",
" print(' Final training set error: {0:.1e}'.format(train_errors[-1]))\n",
" print(' Final validation set error: {0:.1e}'.format(valid_errors[-1]))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In the cell below write code to [plot bar charts](http://matplotlib.org/examples/api/barchart_demo.html) of the training and validation set errors for the different fitted models.\n",
"\n",
"Some questions to think about from the plots:\n",
"\n",
" * Do the models with more free parameters fit the training data better or worse?\n",
" * What does the validation set error value tell us about the models?\n",
" * Of the models fitted here which would you say seems like it is most likely to generalise well to unseen data? \n",
" * Do any of the models seem to be overfitting?"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now let's visualise what the fitted model's predictions look like across the whole input space compared to the 'true' function we were trying to fit. \n",
"\n",
"In the cell below, for each of the fitted models stored in the `models` list above:\n",
" * Compute output predictions for the model across a linearly spaced series of 500 input points between 0 and 1 in the input space.\n",
" * Plot the computed predicted outputs and true function values at the corresponding inputs as line plots on the same axis (use a new axis for each model).\n",
" * On the same axis plot the training data set's input-target pairs as points."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You should be able to relate your answers to the questions above to what you see in these plots - ask a demonstrator if you are unsure what is going on. In particular for the models which appeared to be overfitting and generalising poorly you should now have an idea how this looks in terms of the model's predictions and how these relate to the training data points and true function values."
]
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2"
}
},
"nbformat": 4,
"nbformat_minor": 1
}