diff --git a/mlp/data_providers.py b/mlp/data_providers.py index 867a2f9..ebea079 100644 --- a/mlp/data_providers.py +++ b/mlp/data_providers.py @@ -75,6 +75,9 @@ class DataProvider(object): self.inputs = self.inputs[new_order] self.targets = self.targets[new_order] + def __next__(self): + return self.next() + def next(self): """Returns next data batch or raises `StopIteration` if at end.""" if self._curr_batch + 1 > self.num_batches: @@ -133,13 +136,10 @@ class MNISTDataProvider(DataProvider): super(MNISTDataProvider, self).__init__( inputs, targets, batch_size, max_num_batches, shuffle_order, rng) - # def next(self): - # """Returns next data batch or raises `StopIteration` if at end.""" - # inputs_batch, targets_batch = super(MNISTDataProvider, self).next() - # return inputs_batch, self.to_one_of_k(targets_batch) - - def __next__(self): - return self.next() + def next(self): + """Returns next data batch or raises `StopIteration` if at end.""" + inputs_batch, targets_batch = super(MNISTDataProvider, self).next() + return inputs_batch, self.to_one_of_k(targets_batch) def to_one_of_k(self, int_targets): """Converts integer coded class target to 1 of K coded targets. @@ -156,21 +156,23 @@ class MNISTDataProvider(DataProvider): to zero except for the column corresponding to the correct class which is equal to one. """ - raise NotImplementedError() + one_of_k_targets = np.zeros((int_targets.shape[0], self.num_classes)) + one_of_k_targets[range(int_targets.shape[0]), int_targets] = 1 + return one_of_k_targets class MetOfficeDataProvider(DataProvider): """South Scotland Met Office weather data provider.""" def __init__(self, window_size, batch_size=10, max_num_batches=-1, - shuffle_order=True, rng=None): - """Create a new Met Offfice data provider object. + shuffle_order=True, rng=None): + """Create a new Met Office data provider object. Args: window_size (int): Size of windows to split weather time series - data into. 
The constructed input features will be the first - `window_size - 1` entries in each window and the target outputs - the last entry in each window. + data into. The constructed input features will be the first + `window_size - 1` entries in each window and the target outputs + the last entry in each window. batch_size (int): Number of data points to include in each batch. max_num_batches (int): Maximum number of batches to iterate over in an epoch. If `max_num_batches * batch_size > num_data` then @@ -180,29 +182,74 @@ class MetOfficeDataProvider(DataProvider): the data before each epoch. rng (RandomState): A seeded random number generator. """ - self.window_size = window_size - assert window_size > 1, 'window_size must be at least 2.' data_path = os.path.join( os.environ['MLP_DATA_DIR'], 'HadSSP_daily_qc.txt') assert os.path.isfile(data_path), ( 'Data file does not exist at expected path: ' + data_path ) - #TODO: load raw data from text file - - #TODO: filter out all missing datapoints and flatten to a vector - - #TODO: normalise data to zero mean, unit standard deviation + raw = np.loadtxt(data_path, skiprows=3, usecols=range(2, 32)) + assert window_size > 1, 'window_size must be at least 2.' 
+ self.window_size = window_size + # filter out all missing datapoints and flatten to a vector + filtered = raw[raw >= 0].flatten() + # normalise data to zero mean, unit standard deviation + mean = np.mean(filtered) + std = np.std(filtered) + normalised = (filtered - mean) / std + # create a view on to array corresponding to a rolling window + shape = (normalised.shape[-1] - self.window_size + 1, self.window_size) + strides = normalised.strides + (normalised.strides[-1],) + windowed = np.lib.stride_tricks.as_strided( + normalised, shape=shape, strides=strides) + # inputs are first (window_size - 1) entries in windows + inputs = windowed[:, :-1] + # targets are last entry in windows + targets = windowed[:, -1] + super(MetOfficeDataProvider, self).__init__( + inputs, targets, batch_size, max_num_batches, shuffle_order, rng) - #TODO: convert from flat sequence to windowed data +class CCPPDataProvider(DataProvider): - #TODO: separate into inputs and targets - # inputs are the first (window_size - 1) entries in windows - # inputs = ... - # targets are the last entries in windows - # targets = ... - - # initialise base class with inputs and targets arrays (uncomment below) - # super(MetOfficeDataProvider, self).__init__( - # inputs, targets, batch_size, max_num_batches, shuffle_order, rng) - def __next__(self): - return self.next() \ No newline at end of file + def __init__(self, which_set='train', input_dims=None, batch_size=10, + max_num_batches=-1, shuffle_order=True, rng=None): + """Create a new Combined Cycle Power Plant data provider object. + + Args: + which_set: One of 'train' or 'valid'. Determines which portion of + data this object should provide. + input_dims: Which of the four input dimension to use. If `None` all + are used. If an iterable of integers are provided (consisting + of a subset of {0, 1, 2, 3}) then only the corresponding + input dimensions are included. + batch_size (int): Number of data points to include in each batch. 
+ max_num_batches (int): Maximum number of batches to iterate over + in an epoch. If `max_num_batches * batch_size > num_data` then + only as many batches as the data can be split into will be + used. If set to -1 all of the data will be used. + shuffle_order (bool): Whether to randomly permute the order of + the data before each epoch. + rng (RandomState): A seeded random number generator. + """ + data_path = os.path.join( + os.environ['MLP_DATA_DIR'], 'ccpp_data.npz') + assert os.path.isfile(data_path), ( + 'Data file does not exist at expected path: ' + data_path + ) + # check a valid which_set was provided + assert which_set in ['train', 'valid'], ( + 'Expected which_set to be either train or valid ' + 'Got {0}'.format(which_set) + ) + # check input_dims are valid (only when a subset was requested) + if input_dims is not None: + input_dims = sorted(set(input_dims)) # list: NumPy cannot index with a set + assert set(input_dims).issubset({0, 1, 2, 3}), ( + 'input_dims should be a subset of {0, 1, 2, 3}' + ) + loaded = np.load(data_path) + inputs = loaded[which_set + '_inputs'] + if input_dims is not None: + inputs = inputs[:, input_dims] + targets = loaded[which_set + '_targets'] + super(CCPPDataProvider, self).__init__( + inputs, targets, batch_size, max_num_batches, shuffle_order, rng) diff --git a/mlp/errors.py b/mlp/errors.py new file mode 100644 index 0000000..e517e85 --- /dev/null +++ b/mlp/errors.py @@ -0,0 +1,46 @@ +# -*- coding: utf-8 -*- +"""Error functions. + +This module defines error functions, with the aim of model training being to +minimise the error function given a set of inputs and target outputs. + +The error functions will typically measure some concept of distance between the +model outputs and target outputs, averaged over all data points in the data set +or batch. +""" + +import numpy as np + + +class SumOfSquaredDiffsError(object): + """Sum of squared differences (squared Euclidean distance) error.""" + + def __call__(self, outputs, targets): + """Calculates error function given a batch of outputs and targets. 
+ + Args: + outputs: Array of model outputs of shape (batch_size, output_dim). + targets: Array of target outputs of shape (batch_size, output_dim). + + Returns: + Scalar error function value. + """ + #TODO write your code here + raise NotImplementedError() + + def grad(self, outputs, targets): + """Calculates gradient of error function with respect to outputs. + + Args: + outputs: Array of model outputs of shape (batch_size, output_dim). + targets: Array of target outputs of shape (batch_size, output_dim). + + Returns: + Gradient of error function with respect to outputs. This should be + an array of shape (batch_size, output_dim). + """ + #TODO write your code here + raise NotImplementedError() + + def __repr__(self): + return 'SumOfSquaredDiffsError' diff --git a/mlp/initialisers.py b/mlp/initialisers.py new file mode 100644 index 0000000..243adc2 --- /dev/null +++ b/mlp/initialisers.py @@ -0,0 +1,65 @@ +# -*- coding: utf-8 -*- +"""Parameter initialisers. + +This module defines classes to initialise the parameters in a layer. +""" + +import numpy as np +from mlp import DEFAULT_SEED + + +class ConstantInit(object): + """Constant parameter initialiser.""" + + def __init__(self, value): + """Construct a constant parameter initialiser. + + Args: + value: Value to initialise parameter to. + """ + self.value = value + + def __call__(self, shape): + return np.ones(shape=shape) * self.value + + +class UniformInit(object): + """Random uniform parameter initialiser.""" + + def __init__(self, low, high, rng=None): + """Construct a random uniform parameter initialiser. + + Args: + low: Lower bound of interval to sample from. + high: Upper bound of interval to sample from. + rng (RandomState): Seeded random number generator. 
+ """ + self.low = low + self.high = high + if rng is None: + rng = np.random.RandomState(DEFAULT_SEED) + self.rng = rng + + def __call__(self, shape): + return self.rng.uniform(low=self.low, high=self.high, size=shape) + + +class NormalInit(object): + """Random normal parameter initialiser.""" + + def __init__(self, mean, std, rng=None): + """Construct a random normal parameter initialiser. + + Args: + mean: Mean of distribution to sample from. + std: Standard deviation of distribution to sample from. + rng (RandomState): Seeded random number generator. + """ + self.mean = mean + self.std = std + if rng is None: + rng = np.random.RandomState(DEFAULT_SEED) + self.rng = rng + + def __call__(self, shape): + return self.rng.normal(loc=self.mean, scale=self.std, size=shape) diff --git a/mlp/layers.py b/mlp/layers.py new file mode 100644 index 0000000..baa9345 --- /dev/null +++ b/mlp/layers.py @@ -0,0 +1,141 @@ +# -*- coding: utf-8 -*- +"""Layer definitions. + +This module defines classes which encapsulate a single layer. + +These layers map input activations to output activations with the `fprop` +method and map gradients with respect to outputs to gradients with respect to +their inputs with the `bprop` method. + +Some layers will have learnable parameters and so will additionally define +methods for getting and setting parameters and calculating gradients with +respect to the layer parameters. +""" + +import numpy as np +import mlp.initialisers as init + + +class Layer(object): + """Abstract class defining the interface for a layer.""" + + def fprop(self, inputs): + """Forward propagates activations through the layer transformation. + + Args: + inputs: Array of layer inputs of shape (batch_size, input_dim). + + Returns: + outputs: Array of layer outputs of shape (batch_size, output_dim). + """ + raise NotImplementedError() + + def bprop(self, inputs, outputs, grads_wrt_outputs): + """Back propagates gradients through a layer. 
+ + Given gradients with respect to the outputs of the layer calculates the + gradients with respect to the layer inputs. + + Args: + inputs: Array of layer inputs of shape (batch_size, input_dim). + outputs: Array of layer outputs calculated in forward pass of + shape (batch_size, output_dim). + grads_wrt_outputs: Array of gradients with respect to the layer + outputs of shape (batch_size, output_dim). + + Returns: + Array of gradients with respect to the layer inputs of shape + (batch_size, input_dim). + """ + raise NotImplementedError() + + +class LayerWithParameters(Layer): + """Abstract class defining the interface for a layer with parameters.""" + + def grads_wrt_params(self, inputs, grads_wrt_outputs): + """Calculates gradients with respect to layer parameters. + + Args: + inputs: Array of inputs to layer of shape (batch_size, input_dim). + grads_wrt_to_outputs: Array of gradients with respect to the layer + outputs of shape (batch_size, output_dim). + + Returns: + List of arrays of gradients with respect to the layer parameters + with parameter gradients appearing in same order in tuple as + returned from `get_params` method. + """ + raise NotImplementedError() + + @property + def params(self): + """Returns a list of parameters of layer. + + Returns: + List of current parameter values. + """ + raise NotImplementedError() + + +class AffineLayer(LayerWithParameters): + """Layer implementing an affine tranformation of its inputs. + + This layer is parameterised by a weight matrix and bias vector. + """ + + def __init__(self, input_dim, output_dim, + weights_initialiser=init.UniformInit(-0.1, 0.1), + biases_initialiser=init.ConstantInit(0.), + weights_cost=None, biases_cost=None): + """Initialises a parameterised affine layer. + + Args: + input_dim (int): Dimension of inputs to the layer. + output_dim (int): Dimension of the layer outputs. + weights_initialiser: Initialiser for the weight parameters. + biases_initialiser: Initialiser for the bias parameters. 
+ """ + self.input_dim = input_dim + self.output_dim = output_dim + self.weights = weights_initialiser((self.output_dim, self.input_dim)) + self.biases = biases_initialiser(self.output_dim) + + def fprop(self, inputs): + """Forward propagates activations through the layer transformation. + + For inputs `x`, outputs `y`, weights `W` and biases `b` the layer + corresponds to `y = W.dot(x) + b`. + + Args: + inputs: Array of layer inputs of shape (batch_size, input_dim). + + Returns: + outputs: Array of layer outputs of shape (batch_size, output_dim). + """ + #TODO write your code here + raise NotImplementedError() + + def grads_wrt_params(self, inputs, grads_wrt_outputs): + """Calculates gradients with respect to layer parameters. + + Args: + inputs: array of inputs to layer of shape (batch_size, input_dim) + grads_wrt_to_outputs: array of gradients with respect to the layer + outputs of shape (batch_size, output_dim) + + Returns: + list of arrays of gradients with respect to the layer parameters + `[grads_wrt_weights, grads_wrt_biases]`. + """ + #TODO write your code here + raise NotImplementedError() + + @property + def params(self): + """A list of layer parameter values: `[weights, biases]`.""" + return [self.weights, self.biases] + + def __repr__(self): + return 'AffineLayer(input_dim={0}, output_dim={1})'.format( + self.input_dim, self.output_dim) diff --git a/mlp/learning_rules.py b/mlp/learning_rules.py new file mode 100644 index 0000000..22f2bcb --- /dev/null +++ b/mlp/learning_rules.py @@ -0,0 +1,162 @@ +# -*- coding: utf-8 -*- +"""Learning rules. + +This module contains classes implementing gradient based learning rules. +""" + +import numpy as np + + +class GradientDescentLearningRule(object): + """Simple (stochastic) gradient descent learning rule. + + For a scalar error function `E(p[0], p_[1] ... 
)` of some set of + potentially multidimensional parameters this attempts to find a local + minimum of the loss function by applying updates to each parameter of the + form + + p[i] := p[i] - learning_rate * dE/dp[i] + + With `learning_rate` a positive scaling parameter. + + The error function used in successive applications of these updates may be + a stochastic estimator of the true error function (e.g. when the error with + respect to only a subset of data-points is calculated) in which case this + will correspond to a stochastic gradient descent learning rule. + """ + + def __init__(self, learning_rate=1e-3): + """Creates a new learning rule object. + + Args: + learning_rate: A positive scalar to scale gradient updates to the + parameters by. This needs to be carefully set - if too large + the learning dynamic will be unstable and may diverge, while + if set too small learning will proceed very slowly. + + """ + assert learning_rate > 0., 'learning_rate should be positive.' + self.learning_rate = learning_rate + + def initialise(self, params): + """Initialises the state of the learning rule for a set of parameters. + + This must be called before `update_params` is first called. + + Args: + params: A list of the parameters to be optimised. Note these will + be updated *in-place* to avoid reallocating arrays on each + update. + """ + self.params = params + + def reset(self): + """Resets any additional state variables to their initial values. + + For this learning rule there are no additional state variables so we + do nothing here. + """ + pass + + def update_params(self, grads_wrt_params): + """Applies a single gradient descent update to all parameters. + + All parameter updates are performed using in-place operations and so + nothing is returned. + + Args: + grads_wrt_params: A list of gradients of the scalar loss function + with respect to each of the parameters passed to `initialise` + previously, with this list expected to be in the same order. 
+ """ + for param, grad in zip(self.params, grads_wrt_params): + param -= self.learning_rate * grad + + +class MomentumLearningRule(GradientDescentLearningRule): + """Gradient descent with momentum learning rule. + + This extends the basic gradient learning rule by introducing extra + momentum state variables for each parameter. These can help the learning + dynamic help overcome shallow local minima and speed convergence when + making multiple successive steps in a similar direction in parameter space. + + For parameter p[i] and corresponding momentum m[i] the updates for a + scalar loss function `L` are of the form + + m[i] := mom_coeff * m[i] - learning_rate * dL/dp[i] + p[i] := p[i] + m[i] + + with `learning_rate` a positive scaling parameter for the gradient updates + and `mom_coeff` a value in [0, 1] that determines how much 'friction' there + is the system and so how quickly previous momentum contributions decay. + """ + + def __init__(self, learning_rate=1e-3, mom_coeff=0.9): + """Creates a new learning rule object. + + Args: + learning_rate: A postive scalar to scale gradient updates to the + parameters by. This needs to be carefully set - if too large + the learning dynamic will be unstable and may diverge, while + if set too small learning will proceed very slowly. + mom_coeff: A scalar in the range [0, 1] inclusive. This determines + the contribution of the previous momentum value to the value + after each update. If equal to 0 the momentum is set to exactly + the negative scaled gradient each update and so this rule + collapses to standard gradient descent. If equal to 1 the + momentum will just be decremented by the scaled gradient at + each update. This is equivalent to simulating the dynamic in + a frictionless system. 
Due to energy conservation the loss + of 'potential energy' as the dynamics moves down the loss + function surface will lead to an increasingly large 'kinetic + energy' and so speed, meaning the updates will become + increasingly large, potentially unstably so. Typically a value + less than but close to 1 will avoid these issues and cause the + dynamic to converge to a local minimum where the gradients are + by definition zero. + """ + super(MomentumLearningRule, self).__init__(learning_rate) + assert mom_coeff >= 0. and mom_coeff <= 1., ( + 'mom_coeff should be in the range [0, 1].' + ) + self.mom_coeff = mom_coeff + + def initialise(self, params): + """Initialises the state of the learning rule for a set of parameters. + + This must be called before `update_params` is first called. + + Args: + params: A list of the parameters to be optimised. Note these will + be updated *in-place* to avoid reallocating arrays on each + update. + """ + super(MomentumLearningRule, self).initialise(params) + self.moms = [] + for param in self.params: + self.moms.append(np.zeros_like(param)) + + def reset(self): + """Resets any additional state variables to their initial values. + + For this learning rule this corresponds to zeroing all the momenta. + """ + for mom in self.moms: + mom *= 0. # in-place, so the arrays held in self.moms are zeroed + + def update_params(self, grads_wrt_params): + """Applies a single update to all parameters. + + All parameter updates are performed using in-place operations and so + nothing is returned. + + Args: + grads_wrt_params: A list of gradients of the scalar loss function + with respect to each of the parameters passed to `initialise` + previously, with this list expected to be in the same order. 
+ """ + for param, mom, grad in zip(self.params, self.moms, grads_wrt_params): + mom *= self.mom_coeff + mom -= self.learning_rate * grad + param += mom diff --git a/mlp/models.py b/mlp/models.py new file mode 100644 index 0000000..86a0472 --- /dev/null +++ b/mlp/models.py @@ -0,0 +1,67 @@ +# -*- coding: utf-8 -*- +"""Model definitions. + +This module implements objects encapsulating learnable models of input-output +relationships. The model objects implement methods for forward propagating +the inputs through the transformation(s) defined by the model to produce +outputs (and intermediate states) and for calculating gradients of scalar +functions of the outputs with respect to the model parameters. +""" + +from mlp.layers import LayerWithParameters + + +class SingleLayerModel(object): + """A model consisting of a single transformation layer.""" + + def __init__(self, layer): + """Create a new single layer model instance. + + Args: + layer: The layer object defining the model architecture. + """ + self.layer = layer + + @property + def params(self): + """A list of all of the parameters of the model.""" + return self.layer.params + + def fprop(self, inputs): + """Calculate the model outputs corresponding to a batch of inputs. + + Args: + inputs: Batch of inputs to the model. + + Returns: + List which is a concatenation of the model inputs and model + outputs, this being done for consistency of the interface with + multi-layer models for which `fprop` returns a list of + activations through all immediate layers of the model and including + the inputs and outputs. + """ + activations = [inputs, self.layer.fprop(inputs)] + return activations + + def grads_wrt_params(self, activations, grads_wrt_outputs): + """Calculates gradients with respect to the model parameters. + + Args: + activations: List of all activations from forward pass through + model using `fprop`. 
+ grads_wrt_outputs: Gradient with respect to the model outputs of + the scalar function parameter gradients are being calculated + for. + + Returns: + List of gradients of the scalar function with respect to all model + parameters. + """ + return self.layer.grads_wrt_params(activations[0], grads_wrt_outputs) + + def params_cost(self): + """Calculates the parameter dependent cost term of the model.""" + return self.layer.params_cost() + + def __repr__(self): + return 'SingleLayerModel(' + str(self.layer) + ')' diff --git a/mlp/optimisers.py b/mlp/optimisers.py new file mode 100644 index 0000000..91d1f28 --- /dev/null +++ b/mlp/optimisers.py @@ -0,0 +1,134 @@ +# -*- coding: utf-8 -*- +"""Model optimisers. + +This module contains objects implementing (batched) stochastic gradient descent +based optimisation of models. +""" + +import time +import logging +from collections import OrderedDict +import numpy as np + + +logger = logging.getLogger(__name__) + + +class Optimiser(object): + """Basic model optimiser.""" + + def __init__(self, model, error, learning_rule, train_dataset, + valid_dataset=None, data_monitors=None): + """Create a new optimiser instance. + + Args: + model: The model to optimise. + error: The scalar error function to minimise. + learning_rule: Gradient based learning rule to use to minimise + error. + train_dataset: Data provider for training set data batches. + valid_dataset: Data provider for validation set data batches. + data_monitors: Dictionary of functions evaluated on targets and + model outputs (averaged across both full training and + validation data sets) to monitor during training in addition + to the error. Keys should correspond to a string label for + the statistic being evaluated. 
+ """ + self.model = model + self.error = error + self.learning_rule = learning_rule + self.learning_rule.initialise(self.model.params) + self.train_dataset = train_dataset + self.valid_dataset = valid_dataset + self.data_monitors = OrderedDict([('error', error)]) + if data_monitors is not None: + self.data_monitors.update(data_monitors) + + def do_training_epoch(self): + """Do a single training epoch. + + This iterates through all batches in training dataset, for each + calculating the gradient of the estimated error given the batch with + respect to all the model parameters and then updates the model + parameters according to the learning rule. + """ + for inputs_batch, targets_batch in self.train_dataset: + activations = self.model.fprop(inputs_batch) + grads_wrt_outputs = self.error.grad(activations[-1], targets_batch) + grads_wrt_params = self.model.grads_wrt_params( + activations, grads_wrt_outputs) + self.learning_rule.update_params(grads_wrt_params) + + def eval_monitors(self, dataset, label): + """Evaluates the monitors for the given dataset. + + Args: + dataset: Dataset to perform evaluation with. + label: Tag to add to end of monitor keys to identify dataset. + + Returns: + OrderedDict of monitor values evaluated on dataset. + """ + data_mon_vals = OrderedDict([(key + label, 0.) for key + in self.data_monitors.keys()]) + for inputs_batch, targets_batch in dataset: + activations = self.model.fprop(inputs_batch) + for key, data_monitor in self.data_monitors.items(): + data_mon_vals[key + label] += data_monitor( + activations[-1], targets_batch) + for key, data_monitor in self.data_monitors.items(): + data_mon_vals[key + label] /= dataset.num_batches + return data_mon_vals + + def get_epoch_stats(self): + """Computes training statistics for an epoch. + + Returns: + An OrderedDict with keys corresponding to the statistic labels and + values corresponding to the value of the statistic. 
+ """ + epoch_stats = OrderedDict() + epoch_stats.update(self.eval_monitors(self.train_dataset, '(train)')) + if self.valid_dataset is not None: + epoch_stats.update(self.eval_monitors( + self.valid_dataset, '(valid)')) + return epoch_stats + + def log_stats(self, epoch, epoch_time, stats): + """Outputs stats for a training epoch to a logger. + + Args: + epoch (int): Epoch counter. + epoch_time: Time taken in seconds for the epoch to complete. + stats: Monitored stats for the epoch. + """ + logger.info('Epoch {0}: {1:.1f}s to complete\n {2}'.format( + epoch, epoch_time, + ', '.join(['{0}={1:.2e}'.format(k, v) for (k, v) in stats.items()]) + )) + + def train(self, num_epochs, stats_interval=5): + """Trains a model for a set number of epochs. + + Args: + num_epochs: Number of epochs (complete passes through trainin + dataset) to train for. + stats_interval: Training statistics will be recorded and logged + every `stats_interval` epochs. + + Returns: + Tuple with first value being an array of training run statistics + and the second being a dict mapping the labels for the statistics + recorded to their column index in the array. 
+ """ + stats = self.get_epoch_stats() # bound up front so the return below never sees an unset name + run_stats = [list(stats.values())] + for epoch in range(1, num_epochs + 1): + start_time = time.process_time() + self.do_training_epoch() + epoch_time = time.process_time() - start_time + if epoch % stats_interval == 0: + stats = self.get_epoch_stats() + self.log_stats(epoch, epoch_time, stats) + run_stats.append(list(stats.values())) + return np.array(run_stats), {k: i for i, k in enumerate(stats.keys())} diff --git a/notebooks/00_notebook.ipynb b/notebooks/00_notebook.ipynb index 384dd8a..59fae18 100644 --- a/notebooks/00_notebook.ipynb +++ b/notebooks/00_notebook.ipynb @@ -239,4 +239,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} +} \ No newline at end of file diff --git a/notebooks/01_Introduction.ipynb b/notebooks/01_Introduction.ipynb index f9ceab4..f3dc403 100644 --- a/notebooks/01_Introduction.ipynb +++ b/notebooks/01_Introduction.ipynb @@ -27,13 +27,48 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": { "nbpresent": { "id": "978c1095-a9ce-4626-a113-e0be5fe51ecb" } }, - "outputs": [], + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAK4AAACuCAYAAACvDDbuAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAFGUlEQVR4nO3dvUtjTRiG8cm6VSysBNsUFoqVBD+aiFiLlSiClQYEQdQ/wEbQ1g8ExRgrY6dgYWMhVmKnNhaSRtGInRYWNtl2YZ6BnD3Jmju5fuXDsDv7vhcDJxOPiXK5XHaAmF8/vQHgXxAuJBEuJBEuJBEuJBEuJBEuJBEuJBEuJBEuJBEuJBEuJBEuJBEuJBEuJBEuJBEuJBEuJBEuJBEuJBEuJBEuJBEuJBEuJBEuJBEuJBEuJBEuJBEuJBEuJBEuJBEuJBEuJBEuJBEuJBEuJBEuJBEuJBEuJBEuJBEuJBEuJBEuJBEuJBEuJBEuJBEuJP3+6Q00m+/vb3N+fX3tzS4uLsy1+Xzem729vZlre3t7vdnx8bG5trOz05zXI05cSCJcSCJcSCJcSCJcSEqUy+XyT2+iUeVyOW+2u7trrr29vfVmof81iUQi1tqJiQlz7dHRkTmvR5y4kES4kES4kES4kMSVb0CUq9mlpSVz7d3dnTdLJpPm2kwm483W19fNtf39/d5sa2vLXGvtrVQqmWuVcOJCEuFCEuFCEuFCEuFCEle+AaGn9OXlZW8W+k9ofYl7Z2fHXNvX1xdhd76WlhZzbl35hj7ZKBaL3qy9vT3WvmqFExeSCBeSCBeSCBeSmurKN3SNOz8/780ODg7MtdbDzuTkpLl2f3/fm4UejCzPz8/m3LryjfKMnU6nzXlra2vFf8ZP48SFJMKFJMKFJMKFJMKFpKa68s1ms+b88PDQm42Ojpprp6amvNn4+HjFe/j6+jLnm5ub3mx7e9tc+/7+7s2i/JTvw8ODuZZ3hwE1RriQRLiQRLiQ1LAPZ4uLi94s9B3bjo4Ob/b6+lrx3xXlJ4KHh4fNtXFfq5RKpcy1hULBm8X97m894MSFJMKFJMKFJMKFJMKFpIb9Ivn9/b03s57GnXNuZmbGm52fn1f8d62srJhz62XNoT2E5pWutf69zkX74roSTlxIIlxIIlxIIlxIkn84C123fn5+erPQFera2lrFa6NczVpXyaHfuWsZGhoy53t7e96sUR/CQjhxIYlwIYlwIYlwIYlwIUn+i+QfHx/mvLu725uFnuijfFJgvaz55OTEXLuwsODNzs7OKt5DI/w0bq1w4kIS4UIS4UIS4UKS/JVvW1ubOX95eflvewg9IN7c3Hiz0EPfxsaGN+MhLIwTF5IIF5IIF5IIF5IIF5LkP1WoByMjI+bcegHz7OysuXZubq6qe2p0nLiQRLiQRLiQRLiQJP993FoJXeN2dXV5s9D3fMfGxrzZ6elpvI3BOceJC1GEC0mEC0mEC0mEC0lc+QZYL2V2zr7GDb2UeXV1tZpbwl84cSGJcCGJcCGJcCGJhzPn3OPjozeL8jt3rRctO+dcT09PvI0hiBMXkggXkggXkggXkggXkprqUwXrXV7OOTc9Pe3NQte4l5eX3iyTycTbGCLjxIUkwoUkwoUkwoWkpno4y+Vy5rxYLHqzVCplrk2n01XdE/4NJy4kES4kES4kES4kES4kNeynCtYnCPl83lxrXe8WCgVzbTKZjLcxVAUnLiQRLiQRLiQRLiTJv9j56enJnA8MDHizUqlkrr26uvJmfMe2vnHiQhLhQhLhQhLhQhLhQpL8lW/o1zpZL2DOZrPm2sHBwaruCbXHiQtJhAtJhAtJhAtJ8le+aE6cuJBEuJBEuJBEuJBEuJBEuJBEuJBEuJBEuJBEuJBEuJBEuJBEuJBEuJBEuJD0B9JqFu5lRSB0AAAAAElFTkSuQmCC", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Image target: [9]\n" + ] + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAK4AAACuCAYAAACvDDbuAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAFZElEQVR4nO3dyyu0fRzH8Wt0OxTlsBAicthKDslKUaRI2VhYYCMWLPgP/BOShaLY2EiKkhKZENk4bCxsHDdO5ZTmWTzL33ee+2Ka55rPeL+W375mru7e969mrhlCkUgk4gFiUoK+AOAnCBeSCBeSCBeSCBeSCBeSCBeSCBeSCBeSCBeSCBeSCBeSCBeSCBeSCBeSCBeSCBeSCBeSCBeSCBeSCBeSCBeSCBeSCBeSCBeSCBeSCBeSCBeSCBeSCBeSCBeSCBeSCBeSCBeSCBeSCBeSCBeSCBeSCBeSCBeSCBeSCBeS/gR9Acns7u7Oma2trfn++c7OTnOel5f342tKFpy4kES4kES4kES4kBT4i7Py8nJndnR0ZO7m5OTE+Wp+5uTkxJw3NDQ4s/f3d9+Pm5uba85nZ2edWVdXl+/HTQacuJBEuJBEuJBEuJBEuJAUikQikSAvICXF/b9zfHxs7lZXV8f5av7OelegtrbW3D07O3NmBQUF5u7t7a3va7D+zQ4ODszdmpoa34+rhBMXkggXkggXkggXkgK/5WvZ3t4254nw4uz6+tqZnZ+fm7sDAwPObGpqytxdWlpyZkNDQ+bu29ubM7u/vzd3kxUnLiQRLiQRLiQRLiQRLiQl5LsKmZmZQV9CVBcXFzH9fFpamjnv6+tzZk9PT+buxMSEM4t2K9kS7S7/4eGhM/v8/DR3m5qafD9fPHDiQhLhQhLhQhLhQlLgn8dNT093Zqenp+ZuRUVFvC/nr15eXpxZWVmZuVtVVeXMdnd3zd1QKOT7Gq6urpxZUVGR759/fX0159a/b7RvXH/nxWA8cOJCEuFCEuFCEuFCEuFCUuC3fPv7+51ZIrx7EE1GRoYzi3Zrdm9vz5lZH0T3PPtdga+vL3M3NTXVmYXDYXN3fX3dmS0uLpq7WVlZzizodw+i4cSFJMKFJMKFJMKFpMBv+Vq/0si6DZzIov1S5dXVVWc2PDxs7hYWFjqz+fl5czfWzwSXlJSY862tLWdWWloa03PFCycuJBEuJBEuJBEuJBEuJAV+y1ftHQRLZWWl793p6emYn6++vt6ZNTY2mrt1dXXOrKOjw9zNz8+P7cL+R5y4kES4kES4kES4kBT4i7NEdXl5ac7Hxsac2crKirn7nbvpvb29zmxyctLctb49/Ntw4kIS4UIS4UIS4UIS4UJS4B8kTwQ7OzvOrLu729x9eHhwZtnZ2b6f6/Hx0Zzv7+87M+t2Lf7FiQtJhAtJhAtJhAtJv+qWb7Rvx1rf0rV+gbPned7y8rIza29vN3dnZmac2ejo6H9dInzixIUkwoUkwoUkwoUkwoWkX/WuwsjIiDm3/l7txsaGudvc3Oz7+T4+PnzvKn3DNhFw4kIS4UIS4UIS4UJS0r44s/7erfWZV8/zvPHxcWf2nRdh0czNzTmztrY2c7e4uDjm5/tNOHEhiXAhiXAhiXAhiXAhKWnfVTg5OXFmz8/P5u7m5qYza21tNXfv7++dmfX3cj3P846Pj53ZwsKCuRsKhcw5bJy4kES4kES4kES4kJS0L86+IxwOO7OWlpaYH7e/v9+Z9fT0xPy44MSFKMKFJMKFJMKFJMKFpKT9xc7W7d3BwUFz1/p9YCkp9v/psrIy349rfUA9LS
3N3MX3cOJCEuFCEuFCEuFCUtK+OPuOm5sbZxbtRVReXl68Lwc+cOJCEuFCEuFCEuFCEuFCEu8qQBInLiQRLiQRLiQRLiQRLiQRLiQRLiQRLiQRLiQRLiQRLiQRLiQRLiQRLiQRLiQRLiQRLiQRLiQRLiQRLiQRLiQRLiQRLiQRLiQRLiQRLiQRLiQRLiQRLiQRLiQRLiT9A39h/RwPbFTqAAAAAElFTkSuQmCC", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Image target: [8]\n" + ] + } + ], "source": [ "%matplotlib inline\n", "import numpy as np\n", @@ -81,22 +116,96 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "# write your code here for iterating over five batches of \n", "# 100 data points each and displaying as 10x10 grids\n", "\n", "def show_batch_of_images(img_batch, fig_size=(3, 3)):\n", - " #Expected shape of img_batch: (batch_size, im_height, im_width)\n", - " raise NotImplementedError('Write me!')\n", + " #Shape of img_batch: (batch_size, im_height, im_width)\n", + " fig = plt.figure(figsize=fig_size)\n", + " batch_size, im_height, im_width = img_batch.shape\n", + " # calculate no. of columns per grid row to give square grid\n", + " grid_size = int(batch_size**0.5)\n", + " # intialise empty array to tile image grid into\n", + " tiled = np.empty((im_height * grid_size, \n", + " im_width * batch_size // grid_size))\n", + " # iterate over images in batch + indexes within batch\n", + " for i, img in enumerate(img_batch):\n", + " # calculate grid row and column indices\n", + " r, c = i % grid_size, i // grid_size\n", + " tiled[r * im_height:(r + 1) * im_height, \n", + " c * im_height:(c + 1) * im_height] = img\n", + " ax = fig.add_subplot(111)\n", + " ax.imshow(tiled, cmap='Greys')\n", + " ax.axis('off')\n", + " fig.tight_layout()\n", + " plt.show()\n", + " return fig, ax\n", "\n", "batch_size = 100\n", "num_batches = 5\n", "\n", - "#TODO: initialize the MNISTDataProvider class and iterate over batches\n", - "# with the show_batch_of_images function" + "mnist_dp = data_providers.MNISTDataProvider(\n", + " which_set='valid', batch_size=batch_size, \n", + " max_num_batches=num_batches, shuffle_order=True)\n", + "\n", + "for inputs, target in mnist_dp:\n", + " # reshape inputs from batch of vectors to batch of 2D arrays (images)\n", + " show_batch_of_images(inputs.reshape((batch_size, 28, 28)))" ] }, { @@ -130,9 +239,41 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": 
"stream", + "text": [ + "[[0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]\n", + " [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]\n", + " [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]\n", + " [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]\n", + " [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]]\n", + "[[0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]\n", + " [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]\n", + " [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]\n", + " [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]\n", + " [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]]\n", + "[[0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]\n", + " [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]\n", + " [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]\n", + " [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]\n", + " [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]]\n", + "[[0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]\n", + " [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]\n", + " [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]\n", + " [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", + " [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]]\n", + "[[0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]\n", + " [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]\n", + " [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]\n", + " [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]\n", + " [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]\n" + ] + } + ], "source": [ "import mlp.data_providers as data_providers\n", "import numpy as np\n", @@ -164,10 +305,9 @@ "\n", "**Your tasks**:\n", "\n", - " * Implement the `MetOfficeDataProvider` class in `mlp/data_providers.py`. You only need to implement the `__init__()` function, following the instructions below:\n", - " * You should read all of the data from the file ([`np.loadtxt`](http://docs.scipy.org/doc/numpy/reference/generated/numpy.loadtxt.html) may be useful for this) and then filter out the `-99.9` values and collapse the table to a one-dimensional array corresponding to a sequence of daily measurements for the whole period data is available for. [NumPy's boolean indexing feature](http://docs.scipy.org/doc/numpy/user/basics.indexing.html#boolean-or-mask-index-arrays) could be helpful here.\n", - " * A common initial preprocessing step in machine learning tasks is to normalise data so that it has zero mean and a standard deviation of one. 
Normalise the data sequence so that its overall mean is zero and standard deviation one.\n", - " * Each data point in the data provider should correspond to a window of length specified in the `__init__` method as `window_size` of this contiguous data sequence, with the model inputs being the first `window_size - 1` elements of the window and the target output being the last element of the window. For example if the original data sequence was `[1, 2, 3, 4, 5, 6]` and `window_size=3` then `input, target` pairs iterated over by the data provider should be\n", + " * You should read all of the data from the file ([`np.loadtxt`](http://docs.scipy.org/doc/numpy/reference/generated/numpy.loadtxt.html) may be useful for this) and then filter out the `-99.9` values and collapse the table to a one-dimensional array corresponding to a sequence of daily measurements for the whole period data is available for. [NumPy's boolean indexing feature](http://docs.scipy.org/doc/numpy/user/basics.indexing.html#boolean-or-mask-index-arrays) could be helpful here.\n", + " * A common initial preprocessing step in machine learning tasks is to normalise data so that it has zero mean and a standard deviation of one. Normalise the data sequence so that its overall mean is zero and standard deviation one.\n", + " * Each data point in the data provider should correspond to a window of length specified in the `__init__` method as `window_size` of this contiguous data sequence, with the model inputs being the first `window_size - 1` elements of the window and the target output being the last element of the window. 
For example if the original data sequence was `[1, 2, 3, 4, 5, 6]` and `window_size=3` then `input, target` pairs iterated over by the data provider should be\n", " ```\n", " [1, 2], 3\n", " [4, 5], 6\n", @@ -185,13 +325,44 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": { "nbpresent": { "id": "c8553a56-9f25-4198-8a1a-d7e9572b4382" } }, - "outputs": [], + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "import matplotlib.pyplot as plt\n", "import mlp.data_providers as data_providers\n", diff --git a/notebooks/02_Single_layer_models.ipynb b/notebooks/02_Single_layer_models.ipynb new file mode 100644 index 0000000..093ec41 --- /dev/null +++ b/notebooks/02_Single_layer_models.ipynb @@ -0,0 +1,1124 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Single layer models\n", + "\n", + "The objective of this lab is to implement a single-layer network model consisting of solely of an affine transformation of the inputs. The relevant material for this is covered in slides 12-23 of the first lecture.\n", + "\n", + "We will first implement the forward propagation of inputs to the network to produce predicted outputs. We will then move on to considering how to use gradients of an error function evaluated on the outputs to compute the gradients with respect to the model parameters to allow us to perform an iterative gradient-descent training procedure. In the final exercise you will use an interactive visualisation to explore the role of some of the different hyperparameters of gradient-descent based training methods.\n", + "\n", + "#### A note on random number generators\n", + "\n", + "It is generally a good practice (for machine learning applications **not** for cryptography!) to seed a pseudo-random number generator once at the beginning of each experiment. This makes it easier to reproduce results as the same random draws will produced each time the experiment is run (e.g. the same random initialisations used for parameters). Therefore generally when we need to generate random values during this course, we will create a seeded random number generator object as we do in the cell below." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "seed = 27092016 \n", + "rng = np.random.RandomState(seed)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise 1: linear and affine transforms\n", + "\n", + "Any *linear transform* (also called a linear map) on a finite-dimensional vector space can be parametrised by a matrix. For example if we consider $\\boldsymbol{x} \\in \\mathbb{R}^{D}$ as the input space of a model with $D$ dimensional real-valued inputs, then a matrix $\\mathbf{W} \\in \\mathbb{R}^{K\\times D}$ can be used to define a prediction model consisting solely of a linear transform of the inputs\n", + "\n", + "\\begin{equation}\n", + " \\boldsymbol{y} = \\mathbf{W} \\boldsymbol{x}\n", + " \\qquad\n", + " \\Leftrightarrow\n", + " \\qquad\n", + " y_k = \\sum_{d=1}^D \\left( W_{kd} x_d \\right) \\quad \\forall k \\in \\left\\lbrace 1 \\dots K\\right\\rbrace\n", + "\\end{equation}\n", + "\n", + "with $\\boldsymbol{y} \\in \\mathbb{R}^K$ the $K$-dimensional real-valued output of the model. Geometrically we can think of a linear transform as some combination of rotation, scaling, reflection and shearing of the input.\n", + "\n", + "An *affine transform* consists of a linear transform plus an additional translation on the output space parameterised by a vector $\\boldsymbol{b} \\in \\mathbb{R}^K$. 
A model consisting of an affine transformation of the inputs can then be defined as\n", + "\n", + "\\begin{equation}\n", + " \\boldsymbol{y} = \\mathbf{W}\\boldsymbol{x} + \\boldsymbol{b}\n", + " \\qquad\n", + " \\Leftrightarrow\n", + " \\qquad\n", + " y_k = \\sum_{d=1}^D \\left( W_{kd} x_d \\right) + b_k \\quad \\forall k \\in \\left\\lbrace 1 \\dots K\\right\\rbrace\n", + "\\end{equation}\n", + "\n", + "In machine learning we will usually refer to the matrix $\\mathbf{W}$ as a *weight matrix* and the vector $\\boldsymbol{b}$ as a *bias vector*.\n", + "\n", + "Generally, rather than working with a single data vector $\\boldsymbol{x}$ we will work with batches of datapoints $\\left\\lbrace \\boldsymbol{x}^{(b)}\\right\\rbrace_{b=1}^B$. We could calculate the outputs for each input in the batch sequentially\n", + "\n", + "\\begin{align}\n", + " \\boldsymbol{y}^{(1)} &= \\mathbf{W}\\boldsymbol{x}^{(1)} + \\boldsymbol{b}\\\\\n", + " \\boldsymbol{y}^{(2)} &= \\mathbf{W}\\boldsymbol{x}^{(2)} + \\boldsymbol{b}\\\\\n", + " \\dots &\\\\\n", + " \\boldsymbol{y}^{(B)} &= \\mathbf{W}\\boldsymbol{x}^{(B)} + \\boldsymbol{b}\\\\\n", + "\\end{align}\n", + "\n", + "by looping over each input in the batch and calculating the output. However, loops in Python are slow (particularly compared to compiled and typed languages such as C). This is due at least in part to the large overhead in dynamically inferring variable types. In consequence, we want to avoid having loops in which this overhead would be the dominant computational cost.\n", + "\n", + "For array-based numerical operations, one way of overcoming this bottleneck is to *vectorise* operations, that is, computing all of them at once. NumPy `ndarrays` are typed arrays for which operations, like basic elementwise arithmetic and linear algebra operations (*e.g.* computing matrix-matrix or matrix-vector products) are implemented by calls to highly-optimised compiled libraries. 
Therefore, implementing code directly using NumPy operations on arrays rather than by looping over array elements usually leads to very substantial performance gains.\n", + "\n", + "As a simple example, we can consider adding up two arrays `a` and `b` and writing the result to a third array `c`. Let us start by initialising `a` and `b` with arbitrary values by running the cell below." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "size = 1000\n", + "a = np.random.randn(size)\n", + "b = np.random.randn(size)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, we are going to measure how long it takes to add up each pair of values in the two arrays and write the results to a third array using a loop-based implementation. We will use the `%%timeit` magic briefly mentioned in the previous lab notebook, specifying the number of times to loop the code as 100 and repeating it 3 times for better consistency. Run the cell below to get a print out of the average time taken." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%timeit -n 100 -r 3\n", + "c = np.empty(size)\n", + "for i in range(size):\n", + " c[i] = a[i] + b[i]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And now we will perform the corresponding summation with the overloaded addition operator of NumPy arrays. Again run the cell below to get a print out of the average time taken." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%timeit -n 100 -r 3\n", + "c = a + b" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The first loop-based implementation should have taken on the order of milliseconds ( $10^{-3}$ s) while the vectorised implementation should have taken on the order of microseconds ( $10^{-6}$ s), i.e.
a $\\sim1000\\times$ speedup. Hopefully this simple example should make it clear why we want to vectorise operations whenever possible!\n", + "\n", + "Getting back to our affine model, ideally rather than individually computing the output corresponding to each input we should compute the outputs for all inputs in a batch using a vectorised implementation. As you saw last week, data providers return batches of inputs as arrays of shape `(batch_size, input_dim)`. In the mathematical notation used earlier we can consider the input as a matrix $\\mathbf{X}$ of dimensionality $B \\times D$:\n", + "\n", + "\\begin{equation}\n", + " \\mathbf{X} = \\left[ \\boldsymbol{x}^{(1)} ~ \\boldsymbol{x}^{(2)} ~ \\dots ~ \\boldsymbol{x}^{(B)} \\right]^\\mathrm{T}\n", + "\\end{equation}\n", + "\n", + "i.e. the $b^{\\textrm{th}}$ input vector $\\boldsymbol{x}^{(b)}$ corresponds to the $b^{\\textrm{th}}$ row of $\\mathbf{X}$. Similarly, we can define the $B \\times K$ matrix of outputs $\\mathbf{Y}$ as\n", + "\n", + "\\begin{equation}\n", + " \\mathbf{Y} = \\left[ \\boldsymbol{y}^{(1)} ~ \\boldsymbol{y}^{(2)} ~ \\dots ~ \\boldsymbol{y}^{(B)} \\right]^\\mathrm{T}\n", + "\\end{equation}\n", + "\n", + "We can then express the relationship between $\\mathbf{X}$ and $\\mathbf{Y}$ using [matrix multiplication](https://en.wikipedia.org/wiki/Matrix_multiplication) and addition as\n", + "\n", + "\\begin{equation}\n", + " \\mathbf{Y} = \\mathbf{X} \\mathbf{W}^\\mathrm{T} + \\mathbf{B}\n", + "\\end{equation}\n", + "\n", + "where $\\mathbf{B} = \\left[ \\boldsymbol{b} ~ \\boldsymbol{b} ~ \\dots ~ \\boldsymbol{b} \\right]^\\mathrm{T}$ i.e. a $B \\times K$ matrix with each row corresponding to the same bias vector. The weight matrix needs to be transposed here as the inner dimensions of a matrix multiplication must match i.e. 
for $\\mathbf{C} = \\mathbf{A} \\mathbf{B}$ then if $\\mathbf{A}$ is of dimensionality $K \\times L$ and $\\mathbf{B}$ is of dimensionality $M \\times N$ then it must be the case that $L = M$ and $\\mathbf{C}$ will be of dimensionality $K \\times N$.\n", + "\n", + "**Your Tasks:**\n", + "\n", + "The first exercise for this lab is to implement *forward propagation* for a single-layer model consisting of an affine transformation of the inputs in the `fprop` function given as skeleton code in the cell below. This should work for a batch of inputs of shape `(batch_size, input_dim)` producing a batch of outputs of shape `(batch_size, output_dim)`.\n", + " \n", + "You will probably want to use the NumPy `dot` function and [broadcasting features](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) to implement this efficiently. If you are not familiar with either of these, you may wish to read the [hints](#Hints:-Using-the-dot-function-and-broadcasting) section below which provides some tips before attempting the exercise." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "def fprop(inputs, weights, biases):\n", + " \"\"\"Forward propagates activations through the layer transformation.\n", + "\n", + " For inputs `x`, outputs `y`, weights `W` and biases `b` the layer\n", + " corresponds to `y = W x + b`.\n", + "\n", + " Args:\n", + " inputs: Array of layer inputs of shape (batch_size, input_dim).\n", + " weights: Array of weight parameters of shape \n", + " (output_dim, input_dim).\n", + " biases: Array of bias parameters of shape (output_dim, ).\n", + "\n", + " Returns:\n", + " outputs: Array of layer outputs of shape (batch_size, output_dim).\n", + " \"\"\"\n", + " raise NotImplementedError(\"TODO: Implement this function.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once you have implemented `fprop` in the cell above you can test your implementation by running the cell below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "inputs = np.array([[0., -1., 2.], [-6., 3., 1.]])\n", + "weights = np.array([[2., -3., -1.], [-5., 7., 2.]])\n", + "biases = np.array([5., -3.])\n", + "true_outputs = np.array([[6., -6.], [-17., 50.]])\n", + "\n", + "if not np.allclose(fprop(inputs, weights, biases), true_outputs):\n", + " print('Wrong outputs computed.')\n", + "else:\n", + " print('All outputs correct!')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Hints: Using the `dot` function and broadcasting\n", + "\n", + "For those new to NumPy below are some details on the `dot` function and broadcasting feature of NumPy that you may want to use for implementing the first exercise. 
If you are already familiar with these and have already completed the first exercise you can move straight on to the [second exercise](#Exercise-2:-visualising-random-models).\n", + "\n", + "#### `numpy.dot` function\n", + "\n", + "Matrix-matrix, matrix-vector and vector-vector (dot) products can all be computed in NumPy using the [`dot`](http://docs.scipy.org/doc/numpy/reference/generated/numpy.dot.html) operator which generalizes all of these operations. For example if `A` and `B` are both two dimensional arrays, then `C = np.dot(A, B)` or equivalently `C = A.dot(B)` will compute the matrix product of `A` and `B` assuming `A` and `B` have compatible dimensions. Similarly if `a` and `b` are one dimensional arrays then `c = np.dot(a, b)` (which is equivalent to `c = a.dot(b)`) will compute the [scalar / dot product](https://en.wikipedia.org/wiki/Dot_product) of the two arrays. If `A` is a two-dimensional array and `b` a one-dimensional array `np.dot(A, b)` (which is equivalent to `A.dot(b)`) will compute the matrix-vector product of `A` and `b`. Examples of all three of these product types are shown in the cell below:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Initialise arrays with arbitrary values\n", + "A = np.arange(9).reshape((3, 3))\n", + "B = np.ones((3, 3)) * 2\n", + "a = np.array([-1., 0., 1.])\n", + "b = np.array([0.1, 0.2, 0.3])\n", + "print(A.dot(B)) # Matrix-matrix product\n", + "print(B.dot(A)) # Reversed product of above. A.dot(B) != B.dot(A) in general\n", + "print(A.dot(b)) # Matrix-vector product\n", + "print(b.dot(A)) # Again A.dot(b) != b.dot(A) unless A is symmetric i.e.
A == A.T\n", + "print(a.dot(b)) # Vector-vector scalar product" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Broadcasting\n", + "\n", + "Another NumPy feature it will be helpful to get familiar with is [broadcasting](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html). Broadcasting allows you to apply operations to arrays of different shapes by letting NumPy infer the missing parts, for example to add a one-dimensional array to a two-dimensional array or multiply a multidimensional array by a scalar. The complete set of rules for broadcasting as explained in the official documentation page just linked to can sound a bit complex: you might find the [visual explanation on this page](http://www.scipy-lectures.org/intro/numpy/operations.html#broadcasting) more intuitive.\n", + "Keep in mind that the shapes must be compatible with one another, and that it may lead to erroneous results if the shapes are not as intended, so you are advised to make sure your arrays have the proper shapes.\n", + "The cell below gives a few examples:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Initialise arrays with arbitrary values\n", + "A = np.arange(6).reshape((3, 2))\n", + "b = np.array([0.1, 0.2])\n", + "c = np.array([-1., 0., 1.])\n", + "print(A + b) # Add b elementwise to all rows of A\n", + "print((A.T + c).T) # Add c elementwise to all columns of A\n", + "print(A * b) # Multiply each row of A elementwise by b " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise 2: visualising random models" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this exercise you will use your `fprop` implementation to visualise the outputs of a single-layer affine transform model with two-dimensional inputs and a one-dimensional output.
In this simple case, we can visualise the joint input-output space on a 3D axis.\n", + "\n", + "For this task and the learning experiments later in the notebook we will use a regression dataset from the UCI machine learning repository. In particular we will use a version of the [Combined Cycle Power Plant dataset](http://archive.ics.uci.edu/ml/datasets/Combined+Cycle+Power+Plant), where the task is to predict the energy output of a power plant given observations of the local ambient conditions (e.g. temperature, pressure and humidity).\n", + "\n", + "The original dataset has four input dimensions and a single target output dimension. We have preprocessed the dataset by [whitening](https://en.wikipedia.org/wiki/Whitening_transformation) it. Geometrically, this process rotates the data so that its [principal components](https://en.wikipedia.org/wiki/Principal_component_analysis) are aligned with the basis vectors, and then scales the data so that variance along each dimension is one (see [here](https://www.quora.com/What-is-the-use-of-Whitening-images-as-a-preprocessing-step-for-a-Convolutional-Neural-Network)).\n", + "\n", + "If the original dataset has a covariance $\\mathbf{C}$, a whitening transformation $\\mathbf{D}$ is one which satisfies:\n", + "\\begin{equation}\n", + " \\mathbf{D}^{\\mathrm{T}} \\mathbf{C} \\mathbf{D} = \\mathbf{I},\n", + "\\end{equation}\n", + "where $\\mathbf{I}$ is the identity matrix.\n", + "\n", + "This can be considered a change of basis, where newly formed input features are decorrelated and have equivalent scale, which can lead to reduced learning times (see [here](https://proceedings.neurips.cc/paper/1990/file/758874998f5bd0c393da094e1967a72b-Paper.pdf)).
We will only use the first two dimensions of the whitened inputs (corresponding to the first two principal components of the original inputs) so we can easily visualise the joint input-output space.\n", + "\n", + "The dataset has been wrapped in the `CCPPDataProvider` class in the `mlp.data_providers` module and the data included as a compressed file in the data directory as `ccpp_data.npz`. Running the cell below will initialise an instance of this class, get a single batch of inputs and targets, and import the necessary `matplotlib` objects." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "# import sys\n", + "# sys.path.append('/path/to/mlpractical')\n", + "from mpl_toolkits.mplot3d import Axes3D\n", + "from mlp.data_providers import CCPPDataProvider\n", + "\n", + "data_provider = CCPPDataProvider(\n", + " which_set='train',\n", + " input_dims=[0, 1],\n", + " batch_size=5000, \n", + " max_num_batches=1, \n", + " shuffle_order=False\n", + ")\n", + "\n", + "input_dim, output_dim = 2, 1\n", + "\n", + "inputs, targets = data_provider.next()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now run the cell below to plot the predicted outputs of a randomly initialised model across the two dimensional input space as well as the true target outputs. This sort of visualisation can be a useful method (in low dimensions) to assess how well the model is likely to be able to fit the data and to judge appropriate initialisation scales for the parameters. Each time you re-run the cell a new set of random parameters will be sampled\n", + "\n", + "**Your Tasks:**\n", + "\n", + "Here you don't need to implement anything. 
Just run the cell for several times and try to answer the following questions:\n", + "\n", + " * How do the weights and bias initialisation scale affect the sort of predicted input-output relationships?\n", + " * Do you think a linear model is a good choice for this data?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "weights_init_range = 0.5\n", + "biases_init_range = 0.1\n", + "\n", + "# Randomly initialise weights matrix\n", + "weights = rng.uniform(\n", + " low=-weights_init_range, \n", + " high=weights_init_range, \n", + " size=(output_dim, input_dim)\n", + ")\n", + "\n", + "# Randomly initialise biases vector\n", + "biases = rng.uniform(\n", + " low=-biases_init_range, \n", + " high=biases_init_range, \n", + " size=output_dim\n", + ")\n", + "# Calculate predicted model outputs\n", + "outputs = fprop(inputs, weights, biases)\n", + "\n", + "# Plot target and predicted outputs against inputs on same axis\n", + "fig = plt.figure(figsize=(8, 8))\n", + "ax = fig.add_subplot(111, projection='3d')\n", + "ax.plot(inputs[:, 0], inputs[:, 1], targets[:, 0], 'r.', ms=2)\n", + "ax.plot(inputs[:, 0], inputs[:, 1], outputs[:, 0], 'b.', ms=2)\n", + "ax.set_xlabel('Input dim 1')\n", + "ax.set_ylabel('Input dim 2')\n", + "ax.set_zlabel('Output')\n", + "ax.legend(['Targets', 'Predictions'], frameon=False)\n", + "fig.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise 3: computing the error function and its gradient\n", + "\n", + "We will now consider the task of regression as covered in the first lecture. Given a set of inputs $\\left\\lbrace \\boldsymbol{x}^{(n)}\\right\\rbrace_{n=1}^N$, the aim in a regression problem is to produce outputs $\\left\\lbrace \\boldsymbol{y}^{(n)}\\right\\rbrace_{n=1}^N$ that are as 'close' as possible to a set of targets $\\left\\lbrace \\boldsymbol{t}^{(n)}\\right\\rbrace_{n=1}^N$. 
The measure of 'closeness' or distance between target and predicted outputs can vary and is usually a design choice. \n", + "\n", + "A very common choice is the squared Euclidean distance between the predicted and target outputs. This can be computed as the sum of the squared differences between each element in the target and predicted outputs. A widespread convention is to multiply this value by $\\frac{1}{2}$ as this gives a slightly nicer expression for the error gradient. The error for the $n^{\\textrm{th}}$ training example is then expressed by\n", + "\n", + "\\begin{equation}\n", + " E^{(n)} = \\frac{1}{2} \\sum_{k=1}^K \\left\\lbrace \\left( y^{(n)}_k - t^{(n)}_k \\right)^2 \\right\\rbrace.\n", + "\\end{equation}\n", + "\n", + "The overall error is defined as the *average* of this value across all training examples\n", + "\n", + "\\begin{equation}\n", + " \\bar{E} = \\frac{1}{N} \\sum_{n=1}^N \\left\\lbrace E^{(n)} \\right\\rbrace. \n", + "\\end{equation}\n", + "\n", + "*Note here we are using a slightly different convention from the lectures. There the overall error was considered to be the sum of the individual error terms rather than the mean. To differentiate between the two we will use $\\bar{E}$ to represent the average error here as opposed to the sum of errors $E$ as used in the slides with $\\bar{E} = \\frac{E}{N}$. Normalising by the number of training examples is helpful to do in practice as this means we can more easily compare errors across data sets / batches of different sizes, and more importantly it means the size of our gradient updates will be independent of the number of training examples summed over.*\n", + "\n", + "Solving the regression problem means finding parameters of the model which minimise $\\bar{E}$. For our simple single-layer affine model here, that corresponds to finding weights $\\mathbf{W}$ and biases $\\boldsymbol{b}$ which minimise $\\bar{E}$.
\n", + "\n", + "As mentioned in the lecture, in this case there is actually a closed form solution for the optimal weights and bias parameters. This is the linear least-squares solution those doing MLPR will have come across.\n", + "\n", + "However in general we will be interested in models where closed form solutions do not exist. Therefore, we will generally use iterative gradient descent based optimization methods to find parameters which (locally) minimise the error function. A basic requirement of being able to do gradient-descent based training is (unsurprisingly) the ability to evaluate gradients of the error function.\n", + "\n", + "Our end goal is to calculate gradients of the error function with respect to the model parameters $\\mathbf{W}$ and $\\boldsymbol{b}$. As a first step here we will consider the gradient of the error function with respect to the model outputs $\\left\\lbrace \\boldsymbol{y}^{(n)}\\right\\rbrace_{n=1}^N$. This can be written\n", + "\n", + "\\begin{equation}\n", + " \\frac{\\partial \\bar{E}}{\\partial \\boldsymbol{y}^{(n)}} = \\frac{1}{N} \\left( \\boldsymbol{y}^{(n)} - \\boldsymbol{t}^{(n)} \\right)\n", + " \\qquad \\Leftrightarrow \\qquad\n", + " \\frac{\\partial \\bar{E}}{\\partial y^{(n)}_k} = \\frac{1}{N} \\left( y^{(n)}_k - t^{(n)}_k \\right) \\quad \\forall k \\in \\left\\lbrace 1 \\dots K\\right\\rbrace\n", + "\\end{equation}\n", + "\n", + "*i.e.* the gradient of the error function with respect to the $n^{\\textrm{th}}$ model output is the difference between the $n^{\\textrm{th}}$ model and target outputs, corresponding to the $\\boldsymbol{\\delta}^{(n)}$ terms mentioned in the lecture slides.\n", + "\n", + "**Your Tasks:** \n", + "\n", + "Using the equations given above, implement functions computing the mean sum of squared differences error and its gradient with respect to the model outputs. You should implement the functions using the provided skeleton definitions in the cell below."
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "def error(outputs, targets):\n", + " \"\"\"Calculates error function given a batch of outputs and targets.\n", + "\n", + " Args:\n", + " outputs: Array of model outputs of shape (batch_size, output_dim).\n", + " targets: Array of target outputs of shape (batch_size, output_dim).\n", + "\n", + " Returns:\n", + " Scalar error function value.\n", + " \"\"\"\n", + " raise NotImplementedError(\"TODO implement this function\")\n", + " \n", + "def error_grad(outputs, targets):\n", + " \"\"\"Calculates gradient of error function with respect to model outputs.\n", + "\n", + " Args:\n", + " outputs: Array of model outputs of shape (batch_size, output_dim).\n", + " targets: Array of target outputs of shape (batch_size, output_dim).\n", + "\n", + " Returns:\n", + " Gradient of error function with respect to outputs.\n", + " This will be an array of shape (batch_size, output_dim).\n", + " \"\"\"\n", + " raise NotImplementedError(\"TODO implement this function\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check your implementation by running the test cell below." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "outputs = np.array([[1., 2.], [-1., 0.], [6., -5.], [-1., 1.]])\n", + "targets = np.array([[0., 1.], [3., -2.], [7., -3.], [1., -2.]])\n", + "true_error = 5.\n", + "true_error_grad = np.array([[0.25, 0.25], [-1., 0.5], [-0.25, -0.5], [-0.5, 0.75]])\n", + "\n", + "if not error(outputs, targets) == true_error:\n", + " print('Error calculated incorrectly.')\n", + "elif not np.allclose(error_grad(outputs, targets), true_error_grad):\n", + " print('Error gradient calculated incorrectly.')\n", + "else:\n", + " print('Error function and gradient computed correctly!')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise 4: computing gradients with respect to the parameters\n", + "\n", + "In the previous exercise you implemented a function computing the gradient of the error function with respect to the model outputs. For gradient-descent based training, we need to be able to evaluate the gradient of the error function with respect to the model parameters.\n", + "\n", + "Using the [chain rule for derivatives](https://en.wikipedia.org/wiki/Chain_rule#Higher_dimensions), we can write the partial derivative of the error function with respect to single elements of the weight matrix and bias vector as\n", + "\n", + "\\begin{equation}\n", + " \\frac{\\partial E}{\\partial W_{kj}} = \\sum_{n=1}^N \\left\\lbrace \\frac{\\partial E}{\\partial y^{(n)}_k} \\frac{\\partial y^{(n)}_k}{\\partial W_{kj}} \\right\\rbrace\n", + " \\quad \\textrm{and} \\quad\n", + " \\frac{\\partial E}{\\partial b_k} = \\sum_{n=1}^N \\left\\lbrace \\frac{\\partial E}{\\partial y^{(n)}_k} \\frac{\\partial y^{(n)}_k}{\\partial b_k} \\right\\rbrace.\n", + "\\end{equation}\n", + "\n", + "From the definition of our model at the beginning we have \n", + "\n", + "\\begin{equation}\n", + " y^{(n)}_k = \\sum_{d=1}^D \\left\\lbrace W_{kd} x^{(n)}_d \\right\\rbrace + 
b_k\n", + " \\quad \\Rightarrow \\quad\n", + " \\frac{\\partial y^{(n)}_k}{\\partial W_{kj}} = x^{(n)}_j\n", + " \\quad \\textrm{and} \\quad\n", + " \\frac{\\partial y^{(n)}_k}{\\partial b_k} = 1.\n", + "\\end{equation}\n", + "\n", + "Putting this together we get that\n", + "\n", + "\\begin{equation}\n", + " \\frac{\\partial E}{\\partial W_{kj}} = \n", + " \\sum_{n=1}^N \\left\\lbrace \\frac{\\partial E}{\\partial y^{(n)}_k} x^{(n)}_j \\right\\rbrace\n", + " \\quad \\textrm{and} \\quad\n", + " \\frac{\\partial E}{\\partial b_{k}} = \n", + " \\sum_{n=1}^N \\left\\lbrace \\frac{\\partial E}{\\partial y^{(n)}_k} \\right\\rbrace.\n", + "\\end{equation}\n", + "\n", + "Although this may seem a bit of a roundabout way to get to these results, this method of decomposing the error gradient with respect to the parameters in terms of the gradient of the error function with respect to the model outputs and the derivatives of the model outputs with respect to the model parameters is the key element that allows calculating the parameter gradients of more complex models we will study later in the course.\n", + "\n", + "**Your Tasks:** \n", + "\n", + "Implement a function calculating the gradient of the error function with respect to the weight and bias parameters of the model given the already computed gradient of the error function with respect to the model outputs. You should implement this in the `grads_wrt_params` function in the cell below." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "def grads_wrt_params(inputs, grads_wrt_outputs):\n", + " \"\"\"Calculates gradients with respect to model parameters.\n", + "\n", + " Args:\n", + " inputs: array of inputs to model of shape (batch_size, input_dim)\n", + " grads_wrt_to_outputs: array of gradients of with respect to the model\n", + " outputs of shape (batch_size, output_dim).\n", + "\n", + " Returns:\n", + " list of arrays of gradients with respect to the model parameters\n", + " `[grads_wrt_weights, grads_wrt_biases]`.\n", + " \"\"\"\n", + " raise NotImplementedError(\"TODO implement this function\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check your implementation by running the test cell below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "inputs = np.array([[1., 2., 3.], [-1., 4., -9.]])\n", + "grads_wrt_outputs = np.array([[-1., 1.], [2., -3.]])\n", + "true_grads_wrt_weights = np.array([[-3., 6., -21.], [4., -10., 30.]])\n", + "true_grads_wrt_biases = np.array([1., -2.])\n", + "\n", + "grads_wrt_weights, grads_wrt_biases = grads_wrt_params(\n", + " inputs, grads_wrt_outputs)\n", + "\n", + "if not np.allclose(true_grads_wrt_weights, grads_wrt_weights):\n", + " print('Gradients with respect to weights incorrect.')\n", + "elif not np.allclose(true_grads_wrt_biases, grads_wrt_biases):\n", + " print('Gradients with respect to biases incorrect.')\n", + "else:\n", + " print('All parameter gradients calculated correctly!')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise 5: wrapping the functions into reusable components\n", + "\n", + "In exercises 1, 3 and 4 you implemented methods to compute the predicted outputs of our model, evaluate the error function and its gradient on the outputs and finally to calculate the gradients of the error with 
respect to the model parameters. Together they constitute all the basic ingredients we need to implement a gradient-descent based iterative learning procedure.\n", + "\n", + "Although you could implement training code which directly uses the functions you defined, this would only be usable for this particular model architecture. In subsequent labs we will want to use the affine transform functions as the basis for more interesting multi-layer models. We will therefore wrap the implementations you just wrote into reusable components that we can combine to build more complex models later in the course.\n", + "\n", + "**Your Tasks:**\n", + "\n", + " * In the [`mlp.layers`](../mlp/layers.py) module, use your implementations of `fprop` and `grads_wrt_params` above to implement the corresponding methods in the skeleton `AffineLayer` class provided.\n", + " * In the [`mlp.errors`](../mlp/errors.py) module use your implementation of `error` and `error_grad` to implement the `__call__` and `grad` methods respectively of the skeleton `SumOfSquaredDiffsError` class provided. Note `__call__` is a special Python method that allows an object to be used with a function call syntax.\n", + " * All functions you need to implement have been marked with a `#TODO` comment. You don't need to implement other functions right now.\n", + "\n", + "Run the cell below to use your completed `AffineLayer` and `SumOfSquaredDiffsError` implementations to train a single-layer model using batch gradient descent on the CCPP dataset. Remember to reload the notebook if you made changes to the `mlp` module."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from mlp.layers import AffineLayer\n", + "from mlp.errors import SumOfSquaredDiffsError\n", + "from mlp.models import SingleLayerModel\n", + "from mlp.initialisers import UniformInit, ConstantInit\n", + "from mlp.learning_rules import GradientDescentLearningRule\n", + "from mlp.optimisers import Optimiser\n", + "import logging\n", + "\n", + "# Seed a random number generator\n", + "seed = 27092016 \n", + "rng = np.random.RandomState(seed)\n", + "\n", + "# Set up a logger object to print info about the training run to stdout\n", + "logger = logging.getLogger()\n", + "logger.setLevel(logging.INFO)\n", + "logger.handlers = [logging.StreamHandler()]\n", + "\n", + "# Create data provider objects for the CCPP training set\n", + "train_data = CCPPDataProvider('train', [0, 1], batch_size=100, rng=rng)\n", + "input_dim, output_dim = 2, 1\n", + "\n", + "# Create a parameter initialiser which will sample random uniform values\n", + "# from [-0.1, 0.1]\n", + "param_init = UniformInit(-0.1, 0.1, rng=rng)\n", + "\n", + "# Create our single layer model\n", + "layer = AffineLayer(input_dim, output_dim, param_init, param_init)\n", + "model = SingleLayerModel(layer)\n", + "\n", + "# Initialise the error object\n", + "error = SumOfSquaredDiffsError()\n", + "\n", + "# Use a basic gradient descent learning rule with a small learning rate\n", + "learning_rule = GradientDescentLearningRule(learning_rate=1e-2)\n", + "\n", + "# Use the created objects to initialise a new Optimiser instance.\n", + "optimiser = Optimiser(model, error, learning_rule, train_data)\n", + "\n", + "# Run the optimiser for 5 epochs (full passes through the training set)\n", + "# printing statistics every epoch.\n", + "stats, keys, _ = optimiser.train(num_epochs=10, stats_interval=1)\n", + "\n", + "# Plot the change in the error over training.\n", + "fig = plt.figure(figsize=(8, 4))\n", + "ax = 
fig.add_subplot(111)\n", + "ax.plot(np.arange(1, stats.shape[0] + 1), stats[:, keys['error(train)']])\n", + "ax.set_xlabel('Epoch number')\n", + "ax.set_ylabel('Error')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Using similar code to exercise 2, we can visualise the joint input-output space for the trained model. If you implemented the required methods correctly you should now see a much improved fit between predicted and target outputs when running the cell below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data_provider = CCPPDataProvider(\n", + " which_set='train',\n", + " input_dims=[0, 1],\n", + " batch_size=5000, \n", + " max_num_batches=1, \n", + " shuffle_order=False\n", + ")\n", + "\n", + "inputs, targets = data_provider.next()\n", + "\n", + "# Calculate predicted model outputs\n", + "outputs = model.fprop(inputs)[-1]\n", + "\n", + "# Plot target and predicted outputs against inputs on same axis\n", + "fig = plt.figure(figsize=(8, 8))\n", + "ax = fig.add_subplot(111, projection='3d')\n", + "ax.plot(inputs[:, 0], inputs[:, 1], targets[:, 0], 'r.', ms=2)\n", + "ax.plot(inputs[:, 0], inputs[:, 1], outputs[:, 0], 'b.', ms=2)\n", + "ax.set_xlabel('Input dim 1')\n", + "ax.set_ylabel('Input dim 2')\n", + "ax.set_zlabel('Output')\n", + "ax.legend(['Targets', 'Predictions'], frameon=False)\n", + "fig.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise 6: visualising training trajectories in parameter space" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Running the cell below will display an interactive widget which plots the trajectories of gradient-based training of the single-layer affine model on the CCPP dataset in the three dimensional parameter space (two weights plus bias) from random initialisations. 
Also shown on the right is a plot of the evolution of the error function (evaluated on the current batch) over training. By moving the sliders you can alter the training hyperparameters to investigate the effect they have on how training proceeds. The hyperparameters are as follows:\n", + "\n", + "- `n_epochs` : number of training epochs,\n", + "- `batch_size` : number of data points per batch,\n", + "- `log_lr` : base-10 logarithm of the learning rate,\n", + "- `n_inits` : number of different parameter initializations,\n", + "- `w_scale` : min/max initial weight value,\n", + "- `b_scale` : min/max initial bias value,\n", + "- `elev`/`azim` : spherical coordinates for camera position.\n", + "\n", + "When adjusting these hyperparameters, keep in mind that the magnitude of each (per batch) update is independent of the batch size. Increasing the batch size may therefore necessitate a larger number of epochs to ensure convergence, or a larger learning rate.\n", + "\n", + "**Your Tasks:**\n", + "\n", + "No need to implement anything. Run the cell and explore the following questions:\n", + "\n", + " * Are there multiple local minima in parameter space here? Why?\n", + " * What are the effects of using very small learning rates? And very large ones?\n", + " * How does the batch size affect learning?\n", + " \n", + "**Note:** You don't need to understand how the code below works. The idea of this exercise is to help you understand the role of the various hyperparameters involved in gradient-descent based training methods."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "from ipywidgets import interact\n", + "%matplotlib inline\n", + "\n", + "def setup_figure():\n", + " # create figure and axes\n", + " fig = plt.figure(figsize=(12, 6))\n", + " ax1 = fig.add_axes([0., 0., 0.5, 1.], projection='3d')\n", + " ax2 = fig.add_axes([0.6, 0.1, 0.4, 0.8])\n", + " # set axes properties\n", + " ax2.spines['right'].set_visible(False)\n", + " ax2.spines['top'].set_visible(False)\n", + " ax2.yaxis.set_ticks_position('left')\n", + " ax2.xaxis.set_ticks_position('bottom')\n", + " #ax2.set_yscale('log')\n", + " ax1.set_xlim((-2, 2))\n", + " ax1.set_ylim((-2, 2))\n", + " ax1.set_zlim((-2, 2))\n", + " #set axes labels and title\n", + " ax1.set_title('Parameter trajectories over training')\n", + " ax1.set_xlabel('Weight 1')\n", + " ax1.set_ylabel('Weight 2')\n", + " ax1.set_zlabel('Bias')\n", + " ax2.set_title('Batch errors over training')\n", + " ax2.set_xlabel('Batch update number')\n", + " ax2.set_ylabel('Batch error')\n", + " return fig, ax1, ax2\n", + "\n", + "def visualise_training(n_epochs=1, batch_size=200, log_lr=-1., n_inits=1,\n", + " w_scale=1., b_scale=1., elev=30., azim=0.):\n", + " fig, ax1, ax2 = setup_figure()\n", + " # create seeded random number generator\n", + " rng = np.random.RandomState(1234)\n", + " # create data provider\n", + " data_provider = CCPPDataProvider(\n", + " input_dims=[0, 1],\n", + " batch_size=batch_size, \n", + " shuffle_order=False,\n", + " )\n", + " learning_rate = 10 ** log_lr\n", + " n_batches = data_provider.num_batches\n", + " weights_traj = np.empty((n_inits, n_epochs * n_batches + 1, 1, 2))\n", + " biases_traj = np.empty((n_inits, n_epochs * n_batches + 1, 1))\n", + " errors_traj = np.empty((n_inits, n_epochs * n_batches))\n", + " # randomly initialise parameters\n", + " weights = rng.uniform(-w_scale, w_scale, (n_inits, 1, 2))\n", + " biases = rng.uniform(-b_scale, 
b_scale, (n_inits, 1))\n", + " # store initial parameters\n", + " weights_traj[:, 0] = weights\n", + " biases_traj[:, 0] = biases\n", + " # iterate across different initialisations\n", + " for i in range(n_inits):\n", + " # iterate across epochs\n", + " for e in range(n_epochs):\n", + " # iterate across batches\n", + " for b, (inputs, targets) in enumerate(data_provider):\n", + " outputs = fprop(inputs, weights[i], biases[i])\n", + " errors_traj[i, e * n_batches + b] = error(outputs, targets)\n", + " grad_wrt_outputs = error_grad(outputs, targets)\n", + " weights_grad, biases_grad = grads_wrt_params(inputs, grad_wrt_outputs)\n", + " weights[i] -= learning_rate * weights_grad\n", + " biases[i] -= learning_rate * biases_grad\n", + " weights_traj[i, e * n_batches + b + 1] = weights[i]\n", + " biases_traj[i, e * n_batches + b + 1] = biases[i]\n", + " # choose a different color for each trajectory\n", + " colors = plt.cm.jet(np.linspace(0, 1, n_inits))\n", + " # plot all trajectories\n", + " for i in range(n_inits):\n", + " lines_1 = ax1.plot(\n", + " weights_traj[i, :, 0, 0], \n", + " weights_traj[i, :, 0, 1], \n", + " biases_traj[i, :, 0], \n", + " '-', c=colors[i], lw=2)\n", + " lines_2 = ax2.plot(\n", + " np.arange(n_batches * n_epochs),\n", + " errors_traj[i],\n", + " c=colors[i]\n", + " )\n", + " ax1.view_init(elev, azim)\n", + " plt.show()\n", + "\n", + "w = interact(\n", + " visualise_training,\n", + " elev=(-90, 90, 2),\n", + " azim=(-180, 180, 2), \n", + " n_epochs=(1, 50), \n", + " batch_size=(10, 1000, 100),\n", + " log_lr=(-5., 1.),\n", + " w_scale=(0., 4.),\n", + " b_scale=(0., 4.),\n", + " n_inits=(1, 10)\n", + ")\n", + "\n", + "for child in w.widget.children:\n", + " child.layout.width = '100%'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Hints:\n", + "- Remember an affine single layer model is linear with respect to its parameters, such that any given output $y$ can be expressed as $y = w_1 x_1 + w_2 x_2 + b$. 
Substituting this into the loss function we have: \n", + "\\begin{equation}\n", + " E = \\sum_{n=1}^N \\frac{1}{2} \\left( y^{(n)} - t^{(n)} \\right)^2 = \\sum_{n=1}^N \\frac{1}{2} \\left( w_1 x_1^{(n)} + w_2 x_2^{(n)} + b - t^{(n)} \\right)^2.\n", + "\\end{equation}\n", + "The loss surface is therefore *quadratic* with respect to parameters $w_1, w_2, b$. What effect does this have on the number of minima?\n", + "\n", + "- Note that by using batch-wise updates, we are computing gradients of the loss surface described by a subset of $B < N$ training examples:\n", + "\\begin{equation}\n", + " E = \\sum_{n=1}^B \\frac{1}{2} \\left( y^{(n)} - t^{(n)} \\right)^2.\n", + "\\end{equation}\n", + "Hence, this gradient direction is only an approximation of the optimal update direction dictated by the full dataset. With very small batch sizes, what convergence behaviour would we therefore expect?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# PyTorch\n", + "\n", + "PyTorch is a deep-learning framework that allows us to easily build and train neural networks. It is based on the concept of [tensors](https://pytorch.org/docs/stable/tensors.html), which are multidimensional arrays. In this section, we will use PyTorch to build a simple neural network and train it on the Combined Cycle Power Plant dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import torch\n", + "import torch.nn as nn\n", + "import torch.optim as optim\n", + "from torch.utils.data import Dataset, DataLoader" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To ensure [reproducibility](https://pytorch.org/docs/stable/notes/randomness.html), we will set the seed of the random number generator to a fixed value.\n", + "\n", + "We will also use the same hyperparameters as in the previous section."
+ ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "torch.manual_seed(seed)\n", + "\n", + "learning_rate = 1e-2\n", + "num_epochs = 10\n", + "batch_size = 100\n", + "input_dim = 2\n", + "output_dim = 1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To work with data in PyTorch, we need to create a [Dataset](https://pytorch.org/docs/stable/data.html#torch.utils.data.Dataset) object. This object will be used by a [DataLoader](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader) object to load the data in batches. The DataLoader object can also shuffle the data at each epoch (when `shuffle=True`). \n", + "\n", + "*Here, we do not shuffle the data by setting `shuffle=False` as we want to compare the results with the previous section. However, it is strongly advised to shuffle the data in the training set, but not in the validation or the test set. Can you think about why?*\n", + "\n", + "For a dataset to be used with PyTorch, it needs to have the following methods:\n", + "- `__len__` : returns the size of the dataset,\n", + "- `__getitem__` : returns the $i^{\\textrm{th}}$ sample of the dataset.\n", + "\n", + "Also, the data needs to be converted to PyTorch tensors. This can be done by using the [TensorDataset](https://pytorch.org/docs/stable/data.html#torch.utils.data.TensorDataset) class or the `torch.from_numpy()` function."
+ ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "class CCPPDataProvider(Dataset):\n", + " \"\"\"Combined Cycle Power Plant dataset.\"\"\"\n", + " \n", + " def __init__(self, data_path, which_set='train', x_dims=None):\n", + " super().__init__()\n", + " self.data = np.load(data_path)\n", + "\n", + " assert which_set in ['train', 'valid'], (\n", + " 'Expected which_set to be either train or valid '\n", + " 'Got {0}'.format(which_set)\n", + " )\n", + " self.x = self.data[which_set + '_inputs']\n", + " if x_dims is not None:\n", + " self.x = self.x[:, x_dims]\n", + " self.x = torch.from_numpy(self.x).to(torch.float32)\n", + " self.t = self.data[which_set + '_targets']\n", + " self.t = torch.from_numpy(self.t).to(torch.float32)\n", + "\n", + " def __len__(self):\n", + " return len(self.x)\n", + " \n", + " def __getitem__(self, idx):\n", + " return self.x[idx], self.t[idx]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `Linear` layer, also called a fully-connected layer, performs the affine operation described above by combining the input data with the weights and biases." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "class SingleLayerModel(nn.Module):\n", + " \"\"\"Single layer model.\"\"\"\n", + " def __init__(self, input_dim, output_dim):\n", + " super().__init__()\n", + " self.layer = nn.Linear(input_dim, output_dim) \n", + " \n", + " def forward(self, x):\n", + " return self.layer(x)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The weights and biases for each layer (neural network parameters) are initialized randomly but we can decide what distribution to sample from using the [`torch.nn.init`](https://pytorch.org/docs/stable/nn.init.html) module. Here, we will use a uniform distribution for the weights and set the biases to 0."
+ ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "def weights_init(m):\n", + " \"\"\"Reinitialize model weights\"\"\"\n", + " classname = m.__class__.__name__\n", + " if classname.find('Linear') != -1:\n", + " nn.init.uniform_(m.weight.data, -0.1, 0.1)\n", + " nn.init.constant_(m.bias.data, 0)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "data_path = os.path.join(os.environ['MLP_DATA_DIR'], 'ccpp_data.npz')\n", + "assert os.path.isfile(data_path), ('Data file does not exist at expected path: ' + data_path)\n", + "\n", + "dataset = CCPPDataProvider(data_path, which_set='train', x_dims=[0, 1])\n", + "\n", + "dataloader = DataLoader(dataset, batch_size=100, shuffle=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The error between predictions and ground truth values is calculated using the mean squared error loss function. This function is implemented in PyTorch as [`torch.nn.MSELoss`](https://pytorch.org/docs/stable/generated/torch.nn.MSELoss.html).\n", + "\n", + "There are many [optimisers](https://pytorch.org/docs/stable/optim.html#module-torch.optim) available in PyTorch. Here, we will use the [Adam](https://pytorch.org/docs/stable/optim.html#torch.optim.Adam) optimiser. This optimiser takes as input the parameters to optimise and the learning rate." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = SingleLayerModel(input_dim, output_dim)\n", + "model.apply(weights_init)\n", + "\n", + "print(f\"Model structure: {model}\\n\\n\")\n", + "\n", + "loss = nn.MSELoss() # Mean Squared Error loss\n", + "optimizer = optim.Adam(model.parameters(), lr=learning_rate) # Adam optimiser" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The training loop will be similar to the one in the previous section. 
For every epoch, we will iterate through the dataset by batches. For each batch, we will compute the predictions, the loss and the gradients. We will then update the parameters using the gradients and the optimiser.\n", + "\n", + "However, we will use the [`torch.optim.Optimizer.zero_grad()`](https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html) method to set the gradients to zero before computing the gradients of the loss function with respect to the parameters. *Think about why we need to do this and what would happen if we do not?* \n", + "\n", + "We will also use the [`torch.optim.Optimizer.step()`](https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.step.html) method to update the parameters." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Keep track of the loss values over training\n", + "train_loss = [] \n", + "\n", + "for epoch in range(num_epochs):\n", + " model.train()\n", + " epoch_loss = 0\n", + "\n", + " for x, t in dataloader:\n", + " y = model(x)\n", + " E_value = loss(y, t)\n", + " optimizer.zero_grad()\n", + " E_value.backward()\n", + " optimizer.step()\n", + " epoch_loss += E_value.item()\n", + " # Calculate average loss for this epoch\n", + " avg_epoch_loss = epoch_loss / len(dataloader)\n", + " print(f\"Epoch [{epoch+1}/{num_epochs}]\\tError(train): {avg_epoch_loss:.4f}\")\n", + " train_loss.append(avg_epoch_loss)\n", + "\n", + "# Plot the change in the error over training.\n", + "fig = plt.figure(figsize=(8, 4))\n", + "ax = fig.add_subplot(111)\n", + "ax.plot(train_loss)\n", + "ax.set_xlabel('Epoch number')\n", + "ax.set_ylabel('Error')\n", + "fig.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When the training is finished, we can compute the predictions and plot them along with the ground truth values. 
\n", + "\n", + "Here, we will use the [`torch.no_grad()`](https://pytorch.org/docs/stable/generated/torch.no_grad.html) context manager to disable gradient calculation as we do not need it for the predictions." + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "predictions = []\n", + "inputs = []\n", + "targets = []\n", + "\n", + "with torch.no_grad():\n", + " model.eval()\n", + " for x, t in dataloader:\n", + " inputs.append(x.numpy())\n", + " targets.append(t.numpy())\n", + " y = model(x)\n", + " predictions.append(y.numpy())\n", + " \n", + "predictions = np.concatenate(predictions, axis=0)\n", + "inputs = np.concatenate(inputs, axis=0)\n", + "targets = np.concatenate(targets, axis=0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Plot target and predicted outputs against inputs on same axis\n", + "fig = plt.figure(figsize=(8, 8))\n", + "ax = fig.add_subplot(111, projection='3d')\n", + "ax.plot(inputs[:, 0], inputs[:, 1], targets[:, 0], 'r.', ms=2)\n", + "ax.plot(inputs[:, 0], inputs[:, 1], predictions[:, 0], 'b.', ms=2)\n", + "ax.set_xlabel('Input dim 1')\n", + "ax.set_ylabel('Input dim 2')\n", + "ax.set_zlabel('Output')\n", + "ax.legend(['Targets', 'Predictions'], frameon=False)\n", + "fig.tight_layout()\n", + "plt.show()" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/res/._fprop-bprop-block-diagram.png b/notebooks/res/._fprop-bprop-block-diagram.png new file mode 100644 index 0000000..939914a 
Binary files /dev/null and b/notebooks/res/._fprop-bprop-block-diagram.png differ diff --git a/notebooks/res/._jupyter-dashboard.png b/notebooks/res/._jupyter-dashboard.png new file mode 100644 index 0000000..f0a4dc3 Binary files /dev/null and b/notebooks/res/._jupyter-dashboard.png differ diff --git a/notebooks/res/._jupyter-notebook-interface.png b/notebooks/res/._jupyter-notebook-interface.png new file mode 100644 index 0000000..dbda828 Binary files /dev/null and b/notebooks/res/._jupyter-notebook-interface.png differ diff --git a/notebooks/res/._singleLayerNetBP-1.png b/notebooks/res/._singleLayerNetBP-1.png new file mode 100644 index 0000000..ec10481 Binary files /dev/null and b/notebooks/res/._singleLayerNetBP-1.png differ diff --git a/notebooks/res/._singleLayerNetPredict.png b/notebooks/res/._singleLayerNetPredict.png new file mode 100644 index 0000000..750c3c3 Binary files /dev/null and b/notebooks/res/._singleLayerNetPredict.png differ diff --git a/notebooks/res/._singleLayerNetWts-1.png b/notebooks/res/._singleLayerNetWts-1.png new file mode 100644 index 0000000..1d943eb Binary files /dev/null and b/notebooks/res/._singleLayerNetWts-1.png differ diff --git a/notebooks/res/._singleLayerNetWtsEqns-1.png b/notebooks/res/._singleLayerNetWtsEqns-1.png new file mode 100644 index 0000000..f714429 Binary files /dev/null and b/notebooks/res/._singleLayerNetWtsEqns-1.png differ diff --git a/notebooks/res/fprop-bprop-block-diagram.pdf b/notebooks/res/fprop-bprop-block-diagram.pdf new file mode 100644 index 0000000..6c5f0e0 Binary files /dev/null and b/notebooks/res/fprop-bprop-block-diagram.pdf differ diff --git a/notebooks/res/fprop-bprop-block-diagram.png b/notebooks/res/fprop-bprop-block-diagram.png new file mode 100644 index 0000000..17f6a8b Binary files /dev/null and b/notebooks/res/fprop-bprop-block-diagram.png differ diff --git a/notebooks/res/fprop-bprop-block-diagram.tex b/notebooks/res/fprop-bprop-block-diagram.tex new file mode 100644 index 
0000000..d2c2c7b --- /dev/null +++ b/notebooks/res/fprop-bprop-block-diagram.tex @@ -0,0 +1,65 @@ +\documentclass[tikz]{standalone} + +\usepackage{amsmath} +\usepackage{tikz} +\usetikzlibrary{arrows} +\usetikzlibrary{calc} +\usepackage{ifthen} + +\newcommand{\vct}[1]{\boldsymbol{#1}} +\newcommand{\pd}[2]{\frac{\partial #1}{\partial #2}} + +\tikzstyle{fprop} = [draw,fill=blue!20,minimum size=2em,align=center] +\tikzstyle{bprop} = [draw,fill=red!20,minimum size=2em,align=center] + +\begin{document} + +\begin{tikzpicture}[xscale=1.75] % + % define number of layers + \def\nl{2}; + % model input + \node at (0, 0) (input) {$\vct{x}$}; + % draw fprop through model layers + \foreach \l in {0,...,\nl} { + \node[fprop] at (2 * \l + 1, 0) (fprop\l) {\texttt{layers[\l]} \\ \texttt{.fprop}}; + \ifthenelse{\l > 0}{ + \node at (2 * \l, 0) (hidden\l) {$\vct{h}_\l$}; + \draw[->] (hidden\l) -- (fprop\l); + \draw[->] let \n1={\l - 1} in (fprop\n1) -- (hidden\l); + }{ + \draw[->] (input) -- (fprop\l); + } + } + % model output + \node at (2 * \nl + 2, 0) (output) {$\mathbf{y}$}; + % error function + \node[fprop] at (2 * \nl + 3, 0) (errorfunc) {\texttt{error}}; + % error value + \node at (2 * \nl + 3, -1) (error) {$\bar{E}$}; + % targets + \node at (2 * \nl + 4, -1) (tgt) {$\vct{t}$}; + % error gradient + \node[bprop] at (2 * \nl + 3, -2) (errorgrad) {\texttt{error} \\ \texttt{.grad}}; + % gradient wrt outputs + \node at (2 * \nl + 2, -2) (gradoutput) {$\pd{\bar{E}}{\vct{y}}$}; + \draw[->] (fprop\nl) -- (output); + \draw[->] (output) -- (errorfunc); + \draw[->] (errorfunc) -- (error); + \draw[->] (error) -- (errorgrad); + \draw[->] (errorgrad) -- (gradoutput); + \draw[->] (tgt) |- (errorfunc); + \draw[->] (tgt) |- (errorgrad); + \foreach \l in {0,...,\nl} { + \node[bprop] at (2 * \l + 1, -2) (bprop\l) {\texttt{layers[\l]} \\ \texttt{.bprop}}; + \ifthenelse{\l > 0}{ + \node at (2 * \l, -2) (grad\l) {$\pd{\bar{E}}{\vct{h}_\l}$}; + \draw[<-] (grad\l) -- (bprop\l); + \draw[<-] let \n1={\l 
- 1} in (bprop\n1) -- (grad\l); + }{} + } + \node at (0, -2) (gradinput) {$\pd{\bar{E}}{\vct{x}}$}; + \draw[->] (bprop0) -- (gradinput); + \draw[->] (gradoutput) -- (bprop\nl); +\end{tikzpicture} + +\end{document} \ No newline at end of file