update lab 2
parent f5579c980d
commit 2702ee6f7b

@@ -75,6 +75,9 @@ class DataProvider(object):
        self.inputs = self.inputs[new_order]
        self.targets = self.targets[new_order]

    def __next__(self):
        return self.next()

    def next(self):
        """Returns next data batch or raises `StopIteration` if at end."""
        if self._curr_batch + 1 > self.num_batches:

@@ -133,13 +136,10 @@ class MNISTDataProvider(DataProvider):
        super(MNISTDataProvider, self).__init__(
            inputs, targets, batch_size, max_num_batches, shuffle_order, rng)

        # def next(self):
        #     """Returns next data batch or raises `StopIteration` if at end."""
        #     inputs_batch, targets_batch = super(MNISTDataProvider, self).next()
        #     return inputs_batch, self.to_one_of_k(targets_batch)

    def __next__(self):
        return self.next()

    def next(self):
        """Returns next data batch or raises `StopIteration` if at end."""
        inputs_batch, targets_batch = super(MNISTDataProvider, self).next()
        return inputs_batch, self.to_one_of_k(targets_batch)

    def to_one_of_k(self, int_targets):
        """Converts integer coded class target to 1 of K coded targets.

@@ -156,21 +156,23 @@ class MNISTDataProvider(DataProvider):
        to zero except for the column corresponding to the correct class
        which is equal to one.
        """
        one_of_k_targets = np.zeros((int_targets.shape[0], self.num_classes))
        one_of_k_targets[range(int_targets.shape[0]), int_targets] = 1
        return one_of_k_targets
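For illustration (an editorial sketch, not part of the committed file), the conversion maps integer class labels to one-hot rows:

    import numpy as np

    int_targets = np.array([0, 2, 1])
    num_classes = 3
    one_of_k = np.zeros((int_targets.shape[0], num_classes))
    one_of_k[range(int_targets.shape[0]), int_targets] = 1
    # one_of_k is now [[1, 0, 0], [0, 0, 1], [0, 1, 0]]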


class MetOfficeDataProvider(DataProvider):
    """South Scotland Met Office weather data provider."""

    def __init__(self, window_size, batch_size=10, max_num_batches=-1,
                 shuffle_order=True, rng=None):
        """Create a new Met Office data provider object.

        Args:
            window_size (int): Size of windows to split weather time series
                data into. The constructed input features will be the first
                `window_size - 1` entries in each window and the target outputs
                the last entry in each window.
            batch_size (int): Number of data points to include in each batch.
            max_num_batches (int): Maximum number of batches to iterate over
                in an epoch. If `max_num_batches * batch_size > num_data` then
@@ -180,29 +182,74 @@ class MetOfficeDataProvider(DataProvider):
            the data before each epoch.
            rng (RandomState): A seeded random number generator.
        """
        data_path = os.path.join(
            os.environ['MLP_DATA_DIR'], 'HadSSP_daily_qc.txt')
        assert os.path.isfile(data_path), (
            'Data file does not exist at expected path: ' + data_path
        )
        # load raw data from text file
        raw = np.loadtxt(data_path, skiprows=3, usecols=range(2, 32))
        assert window_size > 1, 'window_size must be at least 2.'
        self.window_size = window_size
        # filter out all missing datapoints and flatten to a vector
        filtered = raw[raw >= 0].flatten()
        # normalise data to zero mean, unit standard deviation
        mean = np.mean(filtered)
        std = np.std(filtered)
        normalised = (filtered - mean) / std
        # create a view onto the array corresponding to a rolling window
        shape = (normalised.shape[-1] - self.window_size + 1, self.window_size)
        strides = normalised.strides + (normalised.strides[-1],)
        windowed = np.lib.stride_tricks.as_strided(
            normalised, shape=shape, strides=strides)
        # inputs are first (window_size - 1) entries in windows
        inputs = windowed[:, :-1]
        # targets are last entry in windows
        targets = windowed[:, -1]
        super(MetOfficeDataProvider, self).__init__(
            inputs, targets, batch_size, max_num_batches, shuffle_order, rng)
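As a sanity check of the `as_strided` rolling-window view above, here is an editorial toy example (not part of the diff) showing the windows it produces:

    import numpy as np

    series = np.arange(6.)  # array([0., 1., 2., 3., 4., 5.])
    window_size = 3
    shape = (series.shape[-1] - window_size + 1, window_size)
    strides = series.strides + (series.strides[-1],)
    windowed = np.lib.stride_tricks.as_strided(series, shape=shape, strides=strides)
    # windowed -> [[0. 1. 2.], [1. 2. 3.], [2. 3. 4.], [3. 4. 5.]]
    inputs, targets = windowed[:, :-1], windowed[:, -1]
    # inputs are the first window_size - 1 entries of each window,
    # targets the final entry: targets -> [2. 3. 4. 5.]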

    def __next__(self):
        return self.next()


class CCPPDataProvider(DataProvider):

    def __init__(self, which_set='train', input_dims=None, batch_size=10,
                 max_num_batches=-1, shuffle_order=True, rng=None):
        """Create a new Combined Cycle Power Plant data provider object.

        Args:
            which_set: One of 'train' or 'valid'. Determines which portion of
                data this object should provide.
            input_dims: Which of the four input dimensions to use. If `None`
                all are used. If an iterable of integers is provided
                (consisting of a subset of {0, 1, 2, 3}) then only the
                corresponding input dimensions are included.
            batch_size (int): Number of data points to include in each batch.
            max_num_batches (int): Maximum number of batches to iterate over
                in an epoch. If `max_num_batches * batch_size > num_data` then
                only as many batches as the data can be split into will be
                used. If set to -1 all of the data will be used.
            shuffle_order (bool): Whether to randomly permute the order of
                the data before each epoch.
            rng (RandomState): A seeded random number generator.
        """
        data_path = os.path.join(
            os.environ['MLP_DATA_DIR'], 'ccpp_data.npz')
        assert os.path.isfile(data_path), (
            'Data file does not exist at expected path: ' + data_path
        )
        # check a valid which_set was provided
        assert which_set in ['train', 'valid'], (
            'Expected which_set to be either train or valid. '
            'Got {0}'.format(which_set)
        )
        # check input_dims are valid
        if input_dims is not None:
            input_dims = set(input_dims)
            assert input_dims.issubset({0, 1, 2, 3}), (
                'input_dims should be a subset of {0, 1, 2, 3}'
            )
        loaded = np.load(data_path)
        inputs = loaded[which_set + '_inputs']
        if input_dims is not None:
            inputs = inputs[:, input_dims]
        targets = loaded[which_set + '_targets']
        super(CCPPDataProvider, self).__init__(
            inputs, targets, batch_size, max_num_batches, shuffle_order, rng)
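A minimal usage sketch for the updated providers (an editorial example; it assumes the `MLP_DATA_DIR` environment variable points at the course data directory and that the provider constructor takes `which_set` and `batch_size` as in the other providers):

    from mlp.data_providers import MNISTDataProvider

    train_data = MNISTDataProvider('train', batch_size=50)
    inputs_batch, targets_batch = train_data.next()
    # inputs_batch has shape (50, 784); targets_batch is one-hot with shape
    # (50, train_data.num_classes), i.e. (50, 10) for MNIST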
mlp/errors.py (new file, 46 lines)
@@ -0,0 +1,46 @@
# -*- coding: utf-8 -*-
"""Error functions.

This module defines error functions, with the aim of model training being to
minimise the error function given a set of inputs and target outputs.

The error functions will typically measure some concept of distance between the
model outputs and target outputs, averaged over all data points in the data set
or batch.
"""

import numpy as np


class SumOfSquaredDiffsError(object):
    """Sum of squared differences (squared Euclidean distance) error."""

    def __call__(self, outputs, targets):
        """Calculates error function given a batch of outputs and targets.

        Args:
            outputs: Array of model outputs of shape (batch_size, output_dim).
            targets: Array of target outputs of shape (batch_size, output_dim).

        Returns:
            Scalar error function value.
        """
        # TODO write your code here
        raise NotImplementedError()
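The stub is left for the lab exercise; one implementation consistent with the docstring (an editorial sketch, not the official solution) is the mean over the batch of half the squared Euclidean distance:

        # E = (1 / 2N) * sum_n ||y_n - t_n||^2
        return 0.5 * np.mean(np.sum((outputs - targets) ** 2, axis=1))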

    def grad(self, outputs, targets):
        """Calculates gradient of error function with respect to outputs.

        Args:
            outputs: Array of model outputs of shape (batch_size, output_dim).
            targets: Array of target outputs of shape (batch_size, output_dim).

        Returns:
            Gradient of error function with respect to outputs. This should be
            an array of shape (batch_size, output_dim).
        """
        # TODO write your code here
        raise NotImplementedError()
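A matching gradient sketch (again an editorial suggestion, consistent with the error above):

        # dE/dy_n = (y_n - t_n) / N
        return (outputs - targets) / outputs.shape[0]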

    def __repr__(self):
        return 'SumOfSquaredDiffsError'
mlp/initialisers.py (new file, 65 lines)
@@ -0,0 +1,65 @@
# -*- coding: utf-8 -*-
"""Parameter initialisers.

This module defines classes to initialise the parameters in a layer.
"""

import numpy as np
from mlp import DEFAULT_SEED


class ConstantInit(object):
    """Constant parameter initialiser."""

    def __init__(self, value):
        """Construct a constant parameter initialiser.

        Args:
            value: Value to initialise parameter to.
        """
        self.value = value

    def __call__(self, shape):
        return np.ones(shape=shape) * self.value


class UniformInit(object):
    """Random uniform parameter initialiser."""

    def __init__(self, low, high, rng=None):
        """Construct a random uniform parameter initialiser.

        Args:
            low: Lower bound of interval to sample from.
            high: Upper bound of interval to sample from.
            rng (RandomState): Seeded random number generator.
        """
        self.low = low
        self.high = high
        if rng is None:
            rng = np.random.RandomState(DEFAULT_SEED)
        self.rng = rng

    def __call__(self, shape):
        return self.rng.uniform(low=self.low, high=self.high, size=shape)


class NormalInit(object):
    """Random normal parameter initialiser."""

    def __init__(self, mean, std, rng=None):
        """Construct a random normal parameter initialiser.

        Args:
            mean: Mean of distribution to sample from.
            std: Standard deviation of distribution to sample from.
            rng (RandomState): Seeded random number generator.
        """
        self.mean = mean
        self.std = std
        if rng is None:
            rng = np.random.RandomState(DEFAULT_SEED)
        self.rng = rng

    def __call__(self, shape):
        return self.rng.normal(loc=self.mean, scale=self.std, size=shape)
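Usage example (editorial illustration only):

    import numpy as np
    from mlp.initialisers import ConstantInit, UniformInit

    weights_init = UniformInit(-0.1, 0.1, rng=np.random.RandomState(123))
    weights = weights_init((5, 3))  # 5 x 3 array with entries drawn from U(-0.1, 0.1)
    biases_init = ConstantInit(0.)
    biases = biases_init(3)         # array([0., 0., 0.])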

mlp/layers.py (new file, 141 lines)
@@ -0,0 +1,141 @@
# -*- coding: utf-8 -*-
"""Layer definitions.

This module defines classes which encapsulate a single layer.

These layers map input activations to output activations with the `fprop`
method and map gradients with respect to outputs to gradients with respect to
their inputs with the `bprop` method.

Some layers will have learnable parameters and so will additionally define
methods for getting and setting parameters and calculating gradients with
respect to the layer parameters.
"""

import numpy as np
import mlp.initialisers as init


class Layer(object):
    """Abstract class defining the interface for a layer."""

    def fprop(self, inputs):
        """Forward propagates activations through the layer transformation.

        Args:
            inputs: Array of layer inputs of shape (batch_size, input_dim).

        Returns:
            outputs: Array of layer outputs of shape (batch_size, output_dim).
        """
        raise NotImplementedError()

    def bprop(self, inputs, outputs, grads_wrt_outputs):
        """Back propagates gradients through a layer.

        Given gradients with respect to the outputs of the layer calculates the
        gradients with respect to the layer inputs.

        Args:
            inputs: Array of layer inputs of shape (batch_size, input_dim).
            outputs: Array of layer outputs calculated in forward pass of
                shape (batch_size, output_dim).
            grads_wrt_outputs: Array of gradients with respect to the layer
                outputs of shape (batch_size, output_dim).

        Returns:
            Array of gradients with respect to the layer inputs of shape
            (batch_size, input_dim).
        """
        raise NotImplementedError()


class LayerWithParameters(Layer):
    """Abstract class defining the interface for a layer with parameters."""

    def grads_wrt_params(self, inputs, grads_wrt_outputs):
        """Calculates gradients with respect to layer parameters.

        Args:
            inputs: Array of inputs to layer of shape (batch_size, input_dim).
            grads_wrt_outputs: Array of gradients with respect to the layer
                outputs of shape (batch_size, output_dim).

        Returns:
            List of arrays of gradients with respect to the layer parameters
            with parameter gradients appearing in the same order in the tuple
            as returned from the `get_params` method.
        """
        raise NotImplementedError()

    @property
    def params(self):
        """Returns a list of parameters of layer.

        Returns:
            List of current parameter values.
        """
        raise NotImplementedError()


class AffineLayer(LayerWithParameters):
    """Layer implementing an affine transformation of its inputs.

    This layer is parameterised by a weight matrix and bias vector.
    """

    def __init__(self, input_dim, output_dim,
                 weights_initialiser=init.UniformInit(-0.1, 0.1),
                 biases_initialiser=init.ConstantInit(0.),
                 weights_cost=None, biases_cost=None):
        """Initialises a parameterised affine layer.

        Args:
            input_dim (int): Dimension of inputs to the layer.
            output_dim (int): Dimension of the layer outputs.
            weights_initialiser: Initialiser for the weight parameters.
            biases_initialiser: Initialiser for the bias parameters.
        """
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.weights = weights_initialiser((self.output_dim, self.input_dim))
        self.biases = biases_initialiser(self.output_dim)

    def fprop(self, inputs):
        """Forward propagates activations through the layer transformation.

        For inputs `x`, outputs `y`, weights `W` and biases `b` the layer
        corresponds to `y = W.dot(x) + b`.

        Args:
            inputs: Array of layer inputs of shape (batch_size, input_dim).

        Returns:
            outputs: Array of layer outputs of shape (batch_size, output_dim).
        """
        # TODO write your code here
        raise NotImplementedError()
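With `weights` of shape (output_dim, input_dim) and a batch of row-vector inputs, one implementation sketch consistent with `y = W.dot(x) + b` (an editorial suggestion, not the provided solution) is:

        return inputs.dot(self.weights.T) + self.biases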

    def grads_wrt_params(self, inputs, grads_wrt_outputs):
        """Calculates gradients with respect to layer parameters.

        Args:
            inputs: array of inputs to layer of shape (batch_size, input_dim)
            grads_wrt_outputs: array of gradients with respect to the layer
                outputs of shape (batch_size, output_dim)

        Returns:
            list of arrays of gradients with respect to the layer parameters
            `[grads_wrt_weights, grads_wrt_biases]`.
        """
        # TODO write your code here
        raise NotImplementedError()
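A corresponding sketch for the parameter gradients (editorial suggestion):

        grads_wrt_weights = grads_wrt_outputs.T.dot(inputs)  # shape (output_dim, input_dim)
        grads_wrt_biases = grads_wrt_outputs.sum(axis=0)     # shape (output_dim,)
        return [grads_wrt_weights, grads_wrt_biases]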

    @property
    def params(self):
        """A list of layer parameter values: `[weights, biases]`."""
        return [self.weights, self.biases]

    def __repr__(self):
        return 'AffineLayer(input_dim={0}, output_dim={1})'.format(
            self.input_dim, self.output_dim)
mlp/learning_rules.py (new file, 162 lines)
@@ -0,0 +1,162 @@
# -*- coding: utf-8 -*-
"""Learning rules.

This module contains classes implementing gradient based learning rules.
"""

import numpy as np


class GradientDescentLearningRule(object):
    """Simple (stochastic) gradient descent learning rule.

    For a scalar error function `E(p[0], p[1] ... )` of some set of
    potentially multidimensional parameters this attempts to find a local
    minimum of the loss function by applying updates to each parameter of the
    form

        p[i] := p[i] - learning_rate * dE/dp[i]

    With `learning_rate` a positive scaling parameter.

    The error function used in successive applications of these updates may be
    a stochastic estimator of the true error function (e.g. when the error with
    respect to only a subset of data-points is calculated) in which case this
    will correspond to a stochastic gradient descent learning rule.
    """

    def __init__(self, learning_rate=1e-3):
        """Creates a new learning rule object.

        Args:
            learning_rate: A positive scalar to scale gradient updates to the
                parameters by. This needs to be carefully set - if too large
                the learning dynamic will be unstable and may diverge, while
                if set too small learning will proceed very slowly.

        """
        assert learning_rate > 0., 'learning_rate should be positive.'
        self.learning_rate = learning_rate

    def initialise(self, params):
        """Initialises the state of the learning rule for a set of parameters.

        This must be called before `update_params` is first called.

        Args:
            params: A list of the parameters to be optimised. Note these will
                be updated *in-place* to avoid reallocating arrays on each
                update.
        """
        self.params = params

    def reset(self):
        """Resets any additional state variables to their initial values.

        For this learning rule there are no additional state variables so we
        do nothing here.
        """
        pass

    def update_params(self, grads_wrt_params):
        """Applies a single gradient descent update to all parameters.

        All parameter updates are performed using in-place operations and so
        nothing is returned.

        Args:
            grads_wrt_params: A list of gradients of the scalar loss function
                with respect to each of the parameters passed to `initialise`
                previously, with this list expected to be in the same order.
        """
        for param, grad in zip(self.params, grads_wrt_params):
            param -= self.learning_rate * grad
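A toy usage example of the rule (editorial illustration; the arrays stand in for real model parameters):

    import numpy as np
    from mlp.learning_rules import GradientDescentLearningRule

    params = [np.zeros((2, 3)), np.zeros(3)]  # stand-in parameter list
    grads = [np.ones((2, 3)), np.ones(3)]     # stand-in gradients
    rule = GradientDescentLearningRule(learning_rate=0.1)
    rule.initialise(params)
    rule.update_params(grads)                 # updates params in place
    # every entry of params is now -0.1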


class MomentumLearningRule(GradientDescentLearningRule):
    """Gradient descent with momentum learning rule.

    This extends the basic gradient learning rule by introducing extra
    momentum state variables for each parameter. These can help the learning
    dynamic overcome shallow local minima and speed convergence when
    making multiple successive steps in a similar direction in parameter space.

    For parameter p[i] and corresponding momentum m[i] the updates for a
    scalar loss function `L` are of the form

        m[i] := mom_coeff * m[i] - learning_rate * dL/dp[i]
        p[i] := p[i] + m[i]

    with `learning_rate` a positive scaling parameter for the gradient updates
    and `mom_coeff` a value in [0, 1] that determines how much 'friction' there
    is in the system and so how quickly previous momentum contributions decay.
    """

    def __init__(self, learning_rate=1e-3, mom_coeff=0.9):
        """Creates a new learning rule object.

        Args:
            learning_rate: A positive scalar to scale gradient updates to the
                parameters by. This needs to be carefully set - if too large
                the learning dynamic will be unstable and may diverge, while
                if set too small learning will proceed very slowly.
            mom_coeff: A scalar in the range [0, 1] inclusive. This determines
                the contribution of the previous momentum value to the value
                after each update. If equal to 0 the momentum is set to exactly
                the negative scaled gradient each update and so this rule
                collapses to standard gradient descent. If equal to 1 the
                momentum will just be decremented by the scaled gradient at
                each update. This is equivalent to simulating the dynamic in
                a frictionless system. Due to energy conservation the loss
                of 'potential energy' as the dynamic moves down the loss
                function surface will lead to an increasingly large 'kinetic
                energy' and so speed, meaning the updates will become
                increasingly large, potentially unstably so. Typically a value
                less than but close to 1 will avoid these issues and cause the
                dynamic to converge to a local minimum where the gradients are
                by definition zero.
        """
        super(MomentumLearningRule, self).__init__(learning_rate)
        assert mom_coeff >= 0. and mom_coeff <= 1., (
            'mom_coeff should be in the range [0, 1].'
        )
        self.mom_coeff = mom_coeff

    def initialise(self, params):
        """Initialises the state of the learning rule for a set of parameters.

        This must be called before `update_params` is first called.

        Args:
            params: A list of the parameters to be optimised. Note these will
                be updated *in-place* to avoid reallocating arrays on each
                update.
        """
        super(MomentumLearningRule, self).initialise(params)
        self.moms = []
        for param in self.params:
            self.moms.append(np.zeros_like(param))

    def reset(self):
        """Resets any additional state variables to their initial values.

        For this learning rule this corresponds to zeroing all the momenta.
        """
        for mom in self.moms:
            mom *= 0.

    def update_params(self, grads_wrt_params):
        """Applies a single update to all parameters.

        All parameter updates are performed using in-place operations and so
        nothing is returned.

        Args:
            grads_wrt_params: A list of gradients of the scalar loss function
                with respect to each of the parameters passed to `initialise`
                previously, with this list expected to be in the same order.
        """
        for param, mom, grad in zip(self.params, self.moms, grads_wrt_params):
            mom *= self.mom_coeff
            mom -= self.learning_rate * grad
            param += mom

mlp/models.py (new file, 67 lines)
@@ -0,0 +1,67 @@
# -*- coding: utf-8 -*-
"""Model definitions.

This module implements objects encapsulating learnable models of input-output
relationships. The model objects implement methods for forward propagating
the inputs through the transformation(s) defined by the model to produce
outputs (and intermediate states) and for calculating gradients of scalar
functions of the outputs with respect to the model parameters.
"""

from mlp.layers import LayerWithParameters


class SingleLayerModel(object):
    """A model consisting of a single transformation layer."""

    def __init__(self, layer):
        """Create a new single layer model instance.

        Args:
            layer: The layer object defining the model architecture.
        """
        self.layer = layer

    @property
    def params(self):
        """A list of all of the parameters of the model."""
        return self.layer.params

    def fprop(self, inputs):
        """Calculate the model outputs corresponding to a batch of inputs.

        Args:
            inputs: Batch of inputs to the model.

        Returns:
            List which is a concatenation of the model inputs and model
            outputs, this being done for consistency of the interface with
            multi-layer models for which `fprop` returns a list of
            activations through all intermediate layers of the model,
            including the inputs and outputs.
        """
        activations = [inputs, self.layer.fprop(inputs)]
        return activations

    def grads_wrt_params(self, activations, grads_wrt_outputs):
        """Calculates gradients with respect to the model parameters.

        Args:
            activations: List of all activations from forward pass through
                model using `fprop`.
            grads_wrt_outputs: Gradient with respect to the model outputs of
                the scalar function parameter gradients are being calculated
                for.

        Returns:
            List of gradients of the scalar function with respect to all model
            parameters.
        """
        return self.layer.grads_wrt_params(activations[0], grads_wrt_outputs)

    def params_cost(self):
        """Calculates the parameter dependent cost term of the model."""
        return self.layer.params_cost()

    def __repr__(self):
        return 'SingleLayerModel(' + str(self.layer) + ')'
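Example construction (editorial illustration; `fprop` will only work once the `AffineLayer` TODO stubs above are completed):

    from mlp.layers import AffineLayer
    from mlp.models import SingleLayerModel

    model = SingleLayerModel(AffineLayer(input_dim=4, output_dim=1))
    print(model)         # SingleLayerModel(AffineLayer(input_dim=4, output_dim=1))
    print(model.params)  # [weights array of shape (1, 4), biases array of shape (1,)]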

mlp/optimisers.py (new file, 134 lines)
@@ -0,0 +1,134 @@
# -*- coding: utf-8 -*-
"""Model optimisers.

This module contains objects implementing (batched) stochastic gradient descent
based optimisation of models.
"""

import time
import logging
from collections import OrderedDict
import numpy as np


logger = logging.getLogger(__name__)


class Optimiser(object):
    """Basic model optimiser."""

    def __init__(self, model, error, learning_rule, train_dataset,
                 valid_dataset=None, data_monitors=None):
        """Create a new optimiser instance.

        Args:
            model: The model to optimise.
            error: The scalar error function to minimise.
            learning_rule: Gradient based learning rule to use to minimise
                error.
            train_dataset: Data provider for training set data batches.
            valid_dataset: Data provider for validation set data batches.
            data_monitors: Dictionary of functions evaluated on targets and
                model outputs (averaged across both full training and
                validation data sets) to monitor during training in addition
                to the error. Keys should correspond to a string label for
                the statistic being evaluated.
        """
        self.model = model
        self.error = error
        self.learning_rule = learning_rule
        self.learning_rule.initialise(self.model.params)
        self.train_dataset = train_dataset
        self.valid_dataset = valid_dataset
        self.data_monitors = OrderedDict([('error', error)])
        if data_monitors is not None:
            self.data_monitors.update(data_monitors)

    def do_training_epoch(self):
        """Do a single training epoch.

        This iterates through all batches in training dataset, for each
        calculating the gradient of the estimated error given the batch with
        respect to all the model parameters and then updates the model
        parameters according to the learning rule.
        """
        for inputs_batch, targets_batch in self.train_dataset:
            activations = self.model.fprop(inputs_batch)
            grads_wrt_outputs = self.error.grad(activations[-1], targets_batch)
            grads_wrt_params = self.model.grads_wrt_params(
                activations, grads_wrt_outputs)
            self.learning_rule.update_params(grads_wrt_params)

    def eval_monitors(self, dataset, label):
        """Evaluates the monitors for the given dataset.

        Args:
            dataset: Dataset to perform evaluation with.
            label: Tag to add to end of monitor keys to identify dataset.

        Returns:
            OrderedDict of monitor values evaluated on dataset.
        """
        data_mon_vals = OrderedDict([(key + label, 0.) for key
                                     in self.data_monitors.keys()])
        for inputs_batch, targets_batch in dataset:
            activations = self.model.fprop(inputs_batch)
            for key, data_monitor in self.data_monitors.items():
                data_mon_vals[key + label] += data_monitor(
                    activations[-1], targets_batch)
        for key, data_monitor in self.data_monitors.items():
            data_mon_vals[key + label] /= dataset.num_batches
        return data_mon_vals

    def get_epoch_stats(self):
        """Computes training statistics for an epoch.

        Returns:
            An OrderedDict with keys corresponding to the statistic labels and
            values corresponding to the value of the statistic.
        """
        epoch_stats = OrderedDict()
        epoch_stats.update(self.eval_monitors(self.train_dataset, '(train)'))
        if self.valid_dataset is not None:
            epoch_stats.update(self.eval_monitors(
                self.valid_dataset, '(valid)'))
        return epoch_stats

    def log_stats(self, epoch, epoch_time, stats):
        """Outputs stats for a training epoch to a logger.

        Args:
            epoch (int): Epoch counter.
            epoch_time: Time taken in seconds for the epoch to complete.
            stats: Monitored stats for the epoch.
        """
        logger.info('Epoch {0}: {1:.1f}s to complete\n    {2}'.format(
            epoch, epoch_time,
            ', '.join(['{0}={1:.2e}'.format(k, v) for (k, v) in stats.items()])
        ))

    def train(self, num_epochs, stats_interval=5):
        """Trains a model for a set number of epochs.

        Args:
            num_epochs: Number of epochs (complete passes through training
                dataset) to train for.
            stats_interval: Training statistics will be recorded and logged
                every `stats_interval` epochs.

        Returns:
            Tuple with first value being an array of training run statistics
            and the second being a dict mapping the labels for the statistics
            recorded to their column index in the array.
        """
        run_stats = [list(self.get_epoch_stats().values())]
        for epoch in range(1, num_epochs + 1):
            start_time = time.process_time()
            self.do_training_epoch()
            epoch_time = time.process_time() - start_time
            if epoch % stats_interval == 0:
                stats = self.get_epoch_stats()
                self.log_stats(epoch, epoch_time, stats)
                run_stats.append(list(stats.values()))
        return np.array(run_stats), {k: i for i, k in enumerate(stats.keys())}
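Putting the pieces together, a training run might look like the following sketch (an editorial example of how these modules are intended to fit together; it assumes the TODO stubs in mlp/errors.py and mlp/layers.py have been completed, that MLP_DATA_DIR is set, and that the target array shapes line up with the model output shape):

    import numpy as np
    from mlp.data_providers import CCPPDataProvider
    from mlp.errors import SumOfSquaredDiffsError
    from mlp.layers import AffineLayer
    from mlp.models import SingleLayerModel
    from mlp.learning_rules import GradientDescentLearningRule
    from mlp.optimisers import Optimiser

    rng = np.random.RandomState(123)
    train_data = CCPPDataProvider('train', batch_size=100, rng=rng)
    valid_data = CCPPDataProvider('valid', batch_size=100, rng=rng)
    model = SingleLayerModel(AffineLayer(input_dim=4, output_dim=1))
    error = SumOfSquaredDiffsError()
    learning_rule = GradientDescentLearningRule(learning_rate=0.01)
    optimiser = Optimiser(model, error, learning_rule, train_data, valid_data)
    run_stats, stat_keys = optimiser.train(num_epochs=20, stats_interval=5)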

@@ -239,4 +239,4 @@
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
(File diff suppressed because one or more lines are too long)

notebooks/02_Single_layer_models.ipynb (new file, 1124 lines)
(File diff suppressed because it is too large)
BIN  notebooks/res/._fprop-bprop-block-diagram.png (new file, binary file not shown)
BIN  notebooks/res/._jupyter-dashboard.png (new file, binary file not shown)
BIN  notebooks/res/._jupyter-notebook-interface.png (new file, binary file not shown)
BIN  notebooks/res/._singleLayerNetBP-1.png (new file, binary file not shown)
BIN  notebooks/res/._singleLayerNetPredict.png (new file, binary file not shown)
BIN  notebooks/res/._singleLayerNetWts-1.png (new file, binary file not shown)
BIN  notebooks/res/._singleLayerNetWtsEqns-1.png (new file, binary file not shown)
BIN  notebooks/res/fprop-bprop-block-diagram.pdf (new file, binary file not shown)
BIN  notebooks/res/fprop-bprop-block-diagram.png (new file, 6.9 KiB image)
notebooks/res/fprop-bprop-block-diagram.tex (new file, 65 lines)
@@ -0,0 +1,65 @@
\documentclass[tikz]{standalone}

\usepackage{amsmath}
\usepackage{tikz}
\usetikzlibrary{arrows}
\usetikzlibrary{calc}
\usepackage{ifthen}

\newcommand{\vct}[1]{\boldsymbol{#1}}
\newcommand{\pd}[2]{\frac{\partial #1}{\partial #2}}

\tikzstyle{fprop} = [draw,fill=blue!20,minimum size=2em,align=center]
\tikzstyle{bprop} = [draw,fill=red!20,minimum size=2em,align=center]

\begin{document}

\begin{tikzpicture}[xscale=1.75] %
  % define number of layers
  \def\nl{2};
  % model input
  \node at (0, 0) (input) {$\vct{x}$};
  % draw fprop through model layers
  \foreach \l in {0,...,\nl} {
    \node[fprop] at (2 * \l + 1, 0) (fprop\l) {\texttt{layers[\l]} \\ \texttt{.fprop}};
    \ifthenelse{\l > 0}{
      \node at (2 * \l, 0) (hidden\l) {$\vct{h}_\l$};
      \draw[->] (hidden\l) -- (fprop\l);
      \draw[->] let \n1={\l - 1} in (fprop\n1) -- (hidden\l);
    }{
      \draw[->] (input) -- (fprop\l);
    }
  }
  % model output
  \node at (2 * \nl + 2, 0) (output) {$\mathbf{y}$};
  % error function
  \node[fprop] at (2 * \nl + 3, 0) (errorfunc) {\texttt{error}};
  % error value
  \node at (2 * \nl + 3, -1) (error) {$\bar{E}$};
  % targets
  \node at (2 * \nl + 4, -1) (tgt) {$\vct{t}$};
  % error gradient
  \node[bprop] at (2 * \nl + 3, -2) (errorgrad) {\texttt{error} \\ \texttt{.grad}};
  % gradient wrt outputs
  \node at (2 * \nl + 2, -2) (gradoutput) {$\pd{\bar{E}}{\vct{y}}$};
  \draw[->] (fprop\nl) -- (output);
  \draw[->] (output) -- (errorfunc);
  \draw[->] (errorfunc) -- (error);
  \draw[->] (error) -- (errorgrad);
  \draw[->] (errorgrad) -- (gradoutput);
  \draw[->] (tgt) |- (errorfunc);
  \draw[->] (tgt) |- (errorgrad);
  \foreach \l in {0,...,\nl} {
    \node[bprop] at (2 * \l + 1, -2) (bprop\l) {\texttt{layers[\l]} \\ \texttt{.bprop}};
    \ifthenelse{\l > 0}{
      \node at (2 * \l, -2) (grad\l) {$\pd{\bar{E}}{\vct{h}_\l}$};
      \draw[<-] (grad\l) -- (bprop\l);
      \draw[<-] let \n1={\l - 1} in (bprop\n1) -- (grad\l);
    }{}
  }
  \node at (0, -2) (gradinput) {$\pd{\bar{E}}{\vct{x}}$};
  \draw[->] (bprop0) -- (gradinput);
  \draw[->] (gradoutput) -- (bprop\nl);
\end{tikzpicture}

\end{document}