Commit 2702ee6f7b ("update lab 2")
Parent: f5579c980d
@@ -75,6 +75,9 @@ class DataProvider(object):
         self.inputs = self.inputs[new_order]
         self.targets = self.targets[new_order]
 
+    def __next__(self):
+        return self.next()
+
     def next(self):
         """Returns next data batch or raises `StopIteration` if at end."""
         if self._curr_batch + 1 > self.num_batches:
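Note: adding `__next__` as a thin wrapper around `next()` makes the providers work with Python 3's iteration protocol as well as Python 2's. A minimal usage sketch follows; the constructor arguments and the presence of an `__iter__` method are assumptions based on the rest of the module, which is not shown in this hunk:

    mnist_dp = MNISTDataProvider('valid', batch_size=100)  # hypothetical construction
    for inputs_batch, targets_batch in mnist_dp:
        # each iteration yields one batch, e.g. shapes (100, 784) and (100, 10)
        # once the targets are one-hot encoded
        print(inputs_batch.shape, targets_batch.shape)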
@@ -133,13 +136,10 @@ class MNISTDataProvider(DataProvider):
         super(MNISTDataProvider, self).__init__(
             inputs, targets, batch_size, max_num_batches, shuffle_order, rng)
 
-    # def next(self):
-    #     """Returns next data batch or raises `StopIteration` if at end."""
-    #     inputs_batch, targets_batch = super(MNISTDataProvider, self).next()
-    #     return inputs_batch, self.to_one_of_k(targets_batch)
+    def next(self):
+        """Returns next data batch or raises `StopIteration` if at end."""
+        inputs_batch, targets_batch = super(MNISTDataProvider, self).next()
+        return inputs_batch, self.to_one_of_k(targets_batch)
 
-    def __next__(self):
-        return self.next()
-
     def to_one_of_k(self, int_targets):
         """Converts integer coded class target to 1 of K coded targets.
@@ -156,21 +156,23 @@ class MNISTDataProvider(DataProvider):
         to zero except for the column corresponding to the correct class
         which is equal to one.
         """
-        raise NotImplementedError()
+        one_of_k_targets = np.zeros((int_targets.shape[0], self.num_classes))
+        one_of_k_targets[range(int_targets.shape[0]), int_targets] = 1
+        return one_of_k_targets
 
 
 class MetOfficeDataProvider(DataProvider):
     """South Scotland Met Office weather data provider."""
 
     def __init__(self, window_size, batch_size=10, max_num_batches=-1,
                  shuffle_order=True, rng=None):
-        """Create a new Met Offfice data provider object.
+        """Create a new Met Office data provider object.
 
         Args:
             window_size (int): Size of windows to split weather time series
                 data into. The constructed input features will be the first
                 `window_size - 1` entries in each window and the target outputs
                 the last entry in each window.
             batch_size (int): Number of data points to include in each batch.
             max_num_batches (int): Maximum number of batches to iterate over
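A small worked illustration of the newly filled-in `to_one_of_k` conversion; the class count of 4 is just an assumed value of `self.num_classes` for the example:

    import numpy as np
    int_targets = np.array([0, 2, 1])    # three integer class labels
    num_classes = 4                      # stands in for self.num_classes
    one_of_k = np.zeros((int_targets.shape[0], num_classes))
    one_of_k[range(int_targets.shape[0]), int_targets] = 1
    # one_of_k == [[1, 0, 0, 0],
    #              [0, 0, 1, 0],
    #              [0, 1, 0, 0]]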
@@ -180,29 +182,74 @@ class MetOfficeDataProvider(DataProvider):
                 the data before each epoch.
             rng (RandomState): A seeded random number generator.
         """
-        self.window_size = window_size
-        assert window_size > 1, 'window_size must be at least 2.'
         data_path = os.path.join(
             os.environ['MLP_DATA_DIR'], 'HadSSP_daily_qc.txt')
         assert os.path.isfile(data_path), (
             'Data file does not exist at expected path: ' + data_path
         )
-        #TODO: load raw data from text file
-        #TODO: filter out all missing datapoints and flatten to a vector
-        #TODO: normalise data to zero mean, unit standard deviation
-        #TODO: convert from flat sequence to windowed data
-        #TODO: separate into inputs and targets
-        # inputs are the first (window_size - 1) entries in windows
-        # inputs = ...
-        # targets are the last entries in windows
-        # targets = ...
-        # initialise base class with inputs and targets arrays (uncomment below)
-        # super(MetOfficeDataProvider, self).__init__(
-        #     inputs, targets, batch_size, max_num_batches, shuffle_order, rng)
-
-    def __next__(self):
-        return self.next()
+        raw = np.loadtxt(data_path, skiprows=3, usecols=range(2, 32))
+        assert window_size > 1, 'window_size must be at least 2.'
+        self.window_size = window_size
+        # filter out all missing datapoints and flatten to a vector
+        filtered = raw[raw >= 0].flatten()
+        # normalise data to zero mean, unit standard deviation
+        mean = np.mean(filtered)
+        std = np.std(filtered)
+        normalised = (filtered - mean) / std
+        # create a view on to array corresponding to a rolling window
+        shape = (normalised.shape[-1] - self.window_size + 1, self.window_size)
+        strides = normalised.strides + (normalised.strides[-1],)
+        windowed = np.lib.stride_tricks.as_strided(
+            normalised, shape=shape, strides=strides)
+        # inputs are first (window_size - 1) entries in windows
+        inputs = windowed[:, :-1]
+        # targets are last entry in windows
+        targets = windowed[:, -1]
+        super(MetOfficeDataProvider, self).__init__(
+            inputs, targets, batch_size, max_num_batches, shuffle_order, rng)
+
+
+class CCPPDataProvider(DataProvider):
+
+    def __init__(self, which_set='train', input_dims=None, batch_size=10,
+                 max_num_batches=-1, shuffle_order=True, rng=None):
+        """Create a new Combined Cycle Power Plant data provider object.
+
+        Args:
+            which_set: One of 'train' or 'valid'. Determines which portion of
+                data this object should provide.
+            input_dims: Which of the four input dimensions to use. If `None`
+                all are used. If an iterable of integers is provided
+                (consisting of a subset of {0, 1, 2, 3}) then only the
+                corresponding input dimensions are included.
+            batch_size (int): Number of data points to include in each batch.
+            max_num_batches (int): Maximum number of batches to iterate over
+                in an epoch. If `max_num_batches * batch_size > num_data` then
+                only as many batches as the data can be split into will be
+                used. If set to -1 all of the data will be used.
+            shuffle_order (bool): Whether to randomly permute the order of
+                the data before each epoch.
+            rng (RandomState): A seeded random number generator.
+        """
+        data_path = os.path.join(
+            os.environ['MLP_DATA_DIR'], 'ccpp_data.npz')
+        assert os.path.isfile(data_path), (
+            'Data file does not exist at expected path: ' + data_path
+        )
+        # check a valid which_set was provided
+        assert which_set in ['train', 'valid'], (
+            'Expected which_set to be either train or valid '
+            'Got {0}'.format(which_set)
+        )
+        # check input_dims are valid
+        if input_dims is not None:
+            input_dims = set(input_dims)
+            assert input_dims.issubset({0, 1, 2, 3}), (
+                'input_dims should be a subset of {0, 1, 2, 3}'
+            )
+        loaded = np.load(data_path)
+        inputs = loaded[which_set + '_inputs']
+        if input_dims is not None:
+            inputs = inputs[:, input_dims]
+        targets = loaded[which_set + '_targets']
+        super(CCPPDataProvider, self).__init__(
+            inputs, targets, batch_size, max_num_batches, shuffle_order, rng)
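The least obvious step in the new `MetOfficeDataProvider.__init__` is the rolling-window view built with `np.lib.stride_tricks.as_strided`. A hedged illustration on a toy series; the real code applies the same shape/strides construction to the normalised rainfall data:

    import numpy as np
    series = np.arange(6.)        # stand-in for `normalised`
    window_size = 3
    shape = (series.shape[-1] - window_size + 1, window_size)
    strides = series.strides + (series.strides[-1],)
    windowed = np.lib.stride_tricks.as_strided(series, shape=shape, strides=strides)
    # windowed == [[0., 1., 2.],
    #              [1., 2., 3.],
    #              [2., 3., 4.],
    #              [3., 4., 5.]]
    # inputs = windowed[:, :-1] and targets = windowed[:, -1], as in the diff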
mlp/errors.py (new file, 46 lines)
# -*- coding: utf-8 -*-
"""Error functions.

This module defines error functions, with the aim of model training being to
minimise the error function given a set of inputs and target outputs.

The error functions will typically measure some concept of distance between the
model outputs and target outputs, averaged over all data points in the data set
or batch.
"""

import numpy as np


class SumOfSquaredDiffsError(object):
    """Sum of squared differences (squared Euclidean distance) error."""

    def __call__(self, outputs, targets):
        """Calculates error function given a batch of outputs and targets.

        Args:
            outputs: Array of model outputs of shape (batch_size, output_dim).
            targets: Array of target outputs of shape (batch_size, output_dim).

        Returns:
            Scalar error function value.
        """
        #TODO write your code here
        raise NotImplementedError()

    def grad(self, outputs, targets):
        """Calculates gradient of error function with respect to outputs.

        Args:
            outputs: Array of model outputs of shape (batch_size, output_dim).
            targets: Array of target outputs of shape (batch_size, output_dim).

        Returns:
            Gradient of error function with respect to outputs. This should be
            an array of shape (batch_size, output_dim).
        """
        #TODO write your code here
        raise NotImplementedError()

    def __repr__(self):
        return 'SumOfSquaredDiffsError'
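The two TODOs above are deliberately left for students. One possible completion, consistent with the docstrings and the module docstring's "averaged over all data points" convention (a sketch, not the official solution):

    class SumOfSquaredDiffsError(object):

        def __call__(self, outputs, targets):
            # half the squared Euclidean distance, averaged over the batch
            return 0.5 * np.mean(np.sum((outputs - targets) ** 2, axis=1))

        def grad(self, outputs, targets):
            # gradient of the above with respect to each output entry
            return (outputs - targets) / outputs.shape[0]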
mlp/initialisers.py (new file, 65 lines)
# -*- coding: utf-8 -*-
"""Parameter initialisers.

This module defines classes to initialise the parameters in a layer.
"""

import numpy as np
from mlp import DEFAULT_SEED


class ConstantInit(object):
    """Constant parameter initialiser."""

    def __init__(self, value):
        """Construct a constant parameter initialiser.

        Args:
            value: Value to initialise parameter to.
        """
        self.value = value

    def __call__(self, shape):
        return np.ones(shape=shape) * self.value


class UniformInit(object):
    """Random uniform parameter initialiser."""

    def __init__(self, low, high, rng=None):
        """Construct a random uniform parameter initialiser.

        Args:
            low: Lower bound of interval to sample from.
            high: Upper bound of interval to sample from.
            rng (RandomState): Seeded random number generator.
        """
        self.low = low
        self.high = high
        if rng is None:
            rng = np.random.RandomState(DEFAULT_SEED)
        self.rng = rng

    def __call__(self, shape):
        return self.rng.uniform(low=self.low, high=self.high, size=shape)


class NormalInit(object):
    """Random normal parameter initialiser."""

    def __init__(self, mean, std, rng=None):
        """Construct a random normal parameter initialiser.

        Args:
            mean: Mean of distribution to sample from.
            std: Standard deviation of distribution to sample from.
            rng (RandomState): Seeded random number generator.
        """
        self.mean = mean
        self.std = std
        if rng is None:
            rng = np.random.RandomState(DEFAULT_SEED)
        self.rng = rng

    def __call__(self, shape):
        return self.rng.normal(loc=self.mean, scale=self.std, size=shape)
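Usage of the initialisers is uniform: each is a callable mapping a shape to an array. A brief hedged sketch (shapes and values illustrative):

    weights_init = UniformInit(-0.1, 0.1, rng=np.random.RandomState(123))
    weights = weights_init((5, 10))   # shape (5, 10), entries drawn from [-0.1, 0.1)
    biases = ConstantInit(0.)((5,))   # five zeros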
mlp/layers.py (new file, 141 lines)
# -*- coding: utf-8 -*-
"""Layer definitions.

This module defines classes which encapsulate a single layer.

These layers map input activations to output activations with the `fprop`
method and map gradients with respect to outputs to gradients with respect to
their inputs with the `bprop` method.

Some layers will have learnable parameters and so will additionally define
methods for getting and setting parameters and calculating gradients with
respect to the layer parameters.
"""

import numpy as np
import mlp.initialisers as init


class Layer(object):
    """Abstract class defining the interface for a layer."""

    def fprop(self, inputs):
        """Forward propagates activations through the layer transformation.

        Args:
            inputs: Array of layer inputs of shape (batch_size, input_dim).

        Returns:
            outputs: Array of layer outputs of shape (batch_size, output_dim).
        """
        raise NotImplementedError()

    def bprop(self, inputs, outputs, grads_wrt_outputs):
        """Back propagates gradients through a layer.

        Given gradients with respect to the outputs of the layer calculates the
        gradients with respect to the layer inputs.

        Args:
            inputs: Array of layer inputs of shape (batch_size, input_dim).
            outputs: Array of layer outputs calculated in forward pass of
                shape (batch_size, output_dim).
            grads_wrt_outputs: Array of gradients with respect to the layer
                outputs of shape (batch_size, output_dim).

        Returns:
            Array of gradients with respect to the layer inputs of shape
            (batch_size, input_dim).
        """
        raise NotImplementedError()


class LayerWithParameters(Layer):
    """Abstract class defining the interface for a layer with parameters."""

    def grads_wrt_params(self, inputs, grads_wrt_outputs):
        """Calculates gradients with respect to layer parameters.

        Args:
            inputs: Array of inputs to layer of shape (batch_size, input_dim).
            grads_wrt_outputs: Array of gradients with respect to the layer
                outputs of shape (batch_size, output_dim).

        Returns:
            List of arrays of gradients with respect to the layer parameters,
            with the parameter gradients appearing in the same order as the
            corresponding parameters returned by the `params` property.
        """
        raise NotImplementedError()

    @property
    def params(self):
        """Returns a list of parameters of layer.

        Returns:
            List of current parameter values.
        """
        raise NotImplementedError()


class AffineLayer(LayerWithParameters):
    """Layer implementing an affine transformation of its inputs.

    This layer is parameterised by a weight matrix and bias vector.
    """

    def __init__(self, input_dim, output_dim,
                 weights_initialiser=init.UniformInit(-0.1, 0.1),
                 biases_initialiser=init.ConstantInit(0.),
                 weights_cost=None, biases_cost=None):
        """Initialises a parameterised affine layer.

        Args:
            input_dim (int): Dimension of inputs to the layer.
            output_dim (int): Dimension of the layer outputs.
            weights_initialiser: Initialiser for the weight parameters.
            biases_initialiser: Initialiser for the bias parameters.
        """
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.weights = weights_initialiser((self.output_dim, self.input_dim))
        self.biases = biases_initialiser(self.output_dim)

    def fprop(self, inputs):
        """Forward propagates activations through the layer transformation.

        For inputs `x`, outputs `y`, weights `W` and biases `b` the layer
        corresponds to `y = W.dot(x) + b`.

        Args:
            inputs: Array of layer inputs of shape (batch_size, input_dim).

        Returns:
            outputs: Array of layer outputs of shape (batch_size, output_dim).
        """
        #TODO write your code here
        raise NotImplementedError()

    def grads_wrt_params(self, inputs, grads_wrt_outputs):
        """Calculates gradients with respect to layer parameters.

        Args:
            inputs: Array of inputs to layer of shape (batch_size, input_dim).
            grads_wrt_outputs: Array of gradients with respect to the layer
                outputs of shape (batch_size, output_dim).

        Returns:
            List of arrays of gradients with respect to the layer parameters
            `[grads_wrt_weights, grads_wrt_biases]`.
        """
        #TODO write your code here
        raise NotImplementedError()

    @property
    def params(self):
        """A list of layer parameter values: `[weights, biases]`."""
        return [self.weights, self.biases]

    def __repr__(self):
        return 'AffineLayer(input_dim={0}, output_dim={1})'.format(
            self.input_dim, self.output_dim)
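The two TODOs in `AffineLayer` are the main exercise of the lab. One hedged completion of just those method bodies, following the shape conventions in the docstrings (weights stored as (output_dim, input_dim), batched inputs as (batch_size, input_dim)):

    def fprop(self, inputs):
        # batched affine map: each row x of `inputs` maps to W.dot(x) + b
        return inputs.dot(self.weights.T) + self.biases

    def grads_wrt_params(self, inputs, grads_wrt_outputs):
        # dE/dW has shape (output_dim, input_dim); dE/db has shape (output_dim,)
        grads_wrt_weights = grads_wrt_outputs.T.dot(inputs)
        grads_wrt_biases = grads_wrt_outputs.sum(axis=0)
        return [grads_wrt_weights, grads_wrt_biases]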
mlp/learning_rules.py (new file, 162 lines)
# -*- coding: utf-8 -*-
"""Learning rules.

This module contains classes implementing gradient based learning rules.
"""

import numpy as np


class GradientDescentLearningRule(object):
    """Simple (stochastic) gradient descent learning rule.

    For a scalar error function `E(p[0], p[1], ...)` of some set of
    potentially multidimensional parameters this attempts to find a local
    minimum of the loss function by applying updates to each parameter of the
    form

        p[i] := p[i] - learning_rate * dE/dp[i]

    with `learning_rate` a positive scaling parameter.

    The error function used in successive applications of these updates may be
    a stochastic estimator of the true error function (e.g. when the error with
    respect to only a subset of data-points is calculated) in which case this
    will correspond to a stochastic gradient descent learning rule.
    """

    def __init__(self, learning_rate=1e-3):
        """Creates a new learning rule object.

        Args:
            learning_rate: A positive scalar to scale gradient updates to the
                parameters by. This needs to be carefully set - if too large
                the learning dynamic will be unstable and may diverge, while
                if set too small learning will proceed very slowly.
        """
        assert learning_rate > 0., 'learning_rate should be positive.'
        self.learning_rate = learning_rate

    def initialise(self, params):
        """Initialises the state of the learning rule for a set of parameters.

        This must be called before `update_params` is first called.

        Args:
            params: A list of the parameters to be optimised. Note these will
                be updated *in-place* to avoid reallocating arrays on each
                update.
        """
        self.params = params

    def reset(self):
        """Resets any additional state variables to their initial values.

        For this learning rule there are no additional state variables so we
        do nothing here.
        """
        pass

    def update_params(self, grads_wrt_params):
        """Applies a single gradient descent update to all parameters.

        All parameter updates are performed using in-place operations and so
        nothing is returned.

        Args:
            grads_wrt_params: A list of gradients of the scalar loss function
                with respect to each of the parameters passed to `initialise`
                previously, with this list expected to be in the same order.
        """
        for param, grad in zip(self.params, grads_wrt_params):
            param -= self.learning_rate * grad


class MomentumLearningRule(GradientDescentLearningRule):
    """Gradient descent with momentum learning rule.

    This extends the basic gradient learning rule by introducing extra
    momentum state variables for each parameter. These can help the learning
    dynamic overcome shallow local minima and speed convergence when
    making multiple successive steps in a similar direction in parameter space.

    For parameter p[i] and corresponding momentum m[i] the updates for a
    scalar loss function `L` are of the form

        m[i] := mom_coeff * m[i] - learning_rate * dL/dp[i]
        p[i] := p[i] + m[i]

    with `learning_rate` a positive scaling parameter for the gradient updates
    and `mom_coeff` a value in [0, 1] that determines how much 'friction' there
    is in the system and so how quickly previous momentum contributions decay.
    """

    def __init__(self, learning_rate=1e-3, mom_coeff=0.9):
        """Creates a new learning rule object.

        Args:
            learning_rate: A positive scalar to scale gradient updates to the
                parameters by. This needs to be carefully set - if too large
                the learning dynamic will be unstable and may diverge, while
                if set too small learning will proceed very slowly.
            mom_coeff: A scalar in the range [0, 1] inclusive. This determines
                the contribution of the previous momentum value to the value
                after each update. If equal to 0 the momentum is set to exactly
                the negative scaled gradient each update and so this rule
                collapses to standard gradient descent. If equal to 1 the
                momentum will just be decremented by the scaled gradient at
                each update. This is equivalent to simulating the dynamic in
                a frictionless system. Due to energy conservation the loss
                of 'potential energy' as the dynamics moves down the loss
                function surface will lead to an increasingly large 'kinetic
                energy' and so speed, meaning the updates will become
                increasingly large, potentially unstably so. Typically a value
                less than but close to 1 will avoid these issues and cause the
                dynamic to converge to a local minimum where the gradients are
                by definition zero.
        """
        super(MomentumLearningRule, self).__init__(learning_rate)
        assert mom_coeff >= 0. and mom_coeff <= 1., (
            'mom_coeff should be in the range [0, 1].'
        )
        self.mom_coeff = mom_coeff

    def initialise(self, params):
        """Initialises the state of the learning rule for a set of parameters.

        This must be called before `update_params` is first called.

        Args:
            params: A list of the parameters to be optimised. Note these will
                be updated *in-place* to avoid reallocating arrays on each
                update.
        """
        super(MomentumLearningRule, self).initialise(params)
        self.moms = []
        for param in self.params:
            self.moms.append(np.zeros_like(param))

    def reset(self):
        """Resets any additional state variables to their initial values.

        For this learning rule this corresponds to zeroing all the momenta.
        """
        # iterate over the momentum arrays directly so the in-place scaling
        # zeroes each array (zip over a single list would yield tuples)
        for mom in self.moms:
            mom *= 0.

    def update_params(self, grads_wrt_params):
        """Applies a single update to all parameters.

        All parameter updates are performed using in-place operations and so
        nothing is returned.

        Args:
            grads_wrt_params: A list of gradients of the scalar loss function
                with respect to each of the parameters passed to `initialise`
                previously, with this list expected to be in the same order.
        """
        for param, mom, grad in zip(self.params, self.moms, grads_wrt_params):
            mom *= self.mom_coeff
            mom -= self.learning_rate * grad
            param += mom
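A short, hedged usage sketch of the learning rules; `model` and `grads_wrt_params` are placeholders for objects produced elsewhere in the framework:

    rule = MomentumLearningRule(learning_rate=1e-2, mom_coeff=0.9)
    rule.initialise(model.params)          # bind once to the parameter arrays
    rule.update_params(grads_wrt_params)   # gradients in the same order as params
    rule.reset()                           # zero the momenta, e.g. between training runs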
mlp/models.py (new file, 67 lines)
# -*- coding: utf-8 -*-
"""Model definitions.

This module implements objects encapsulating learnable models of input-output
relationships. The model objects implement methods for forward propagating
the inputs through the transformation(s) defined by the model to produce
outputs (and intermediate states) and for calculating gradients of scalar
functions of the outputs with respect to the model parameters.
"""

from mlp.layers import LayerWithParameters


class SingleLayerModel(object):
    """A model consisting of a single transformation layer."""

    def __init__(self, layer):
        """Create a new single layer model instance.

        Args:
            layer: The layer object defining the model architecture.
        """
        self.layer = layer

    @property
    def params(self):
        """A list of all of the parameters of the model."""
        return self.layer.params

    def fprop(self, inputs):
        """Calculate the model outputs corresponding to a batch of inputs.

        Args:
            inputs: Batch of inputs to the model.

        Returns:
            List which is a concatenation of the model inputs and model
            outputs, this being done for consistency of the interface with
            multi-layer models for which `fprop` returns a list of
            activations through all intermediate layers of the model,
            including the inputs and outputs.
        """
        activations = [inputs, self.layer.fprop(inputs)]
        return activations

    def grads_wrt_params(self, activations, grads_wrt_outputs):
        """Calculates gradients with respect to the model parameters.

        Args:
            activations: List of all activations from forward pass through
                model using `fprop`.
            grads_wrt_outputs: Gradient with respect to the model outputs of
                the scalar function parameter gradients are being calculated
                for.

        Returns:
            List of gradients of the scalar function with respect to all model
            parameters.
        """
        return self.layer.grads_wrt_params(activations[0], grads_wrt_outputs)

    def params_cost(self):
        """Calculates the parameter dependent cost term of the model."""
        return self.layer.params_cost()

    def __repr__(self):
        return 'SingleLayerModel(' + str(self.layer) + ')'
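A hedged end-to-end sketch tying the model, layer and error objects together. It only runs once the `AffineLayer` and error TODOs are completed, and `inputs_batch`/`targets_batch` are placeholders for arrays from a data provider:

    from mlp.layers import AffineLayer
    from mlp.errors import SumOfSquaredDiffsError
    from mlp.models import SingleLayerModel

    model = SingleLayerModel(AffineLayer(input_dim=4, output_dim=1))
    error = SumOfSquaredDiffsError()
    activations = model.fprop(inputs_batch)                  # [inputs, outputs]
    grads_wrt_outputs = error.grad(activations[-1], targets_batch)
    grads_wrt_params = model.grads_wrt_params(activations, grads_wrt_outputs)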
mlp/optimisers.py (new file, 134 lines)
# -*- coding: utf-8 -*-
"""Model optimisers.

This module contains objects implementing (batched) stochastic gradient descent
based optimisation of models.
"""

import time
import logging
from collections import OrderedDict
import numpy as np


logger = logging.getLogger(__name__)


class Optimiser(object):
    """Basic model optimiser."""

    def __init__(self, model, error, learning_rule, train_dataset,
                 valid_dataset=None, data_monitors=None):
        """Create a new optimiser instance.

        Args:
            model: The model to optimise.
            error: The scalar error function to minimise.
            learning_rule: Gradient based learning rule to use to minimise
                error.
            train_dataset: Data provider for training set data batches.
            valid_dataset: Data provider for validation set data batches.
            data_monitors: Dictionary of functions evaluated on targets and
                model outputs (averaged across both full training and
                validation data sets) to monitor during training in addition
                to the error. Keys should correspond to a string label for
                the statistic being evaluated.
        """
        self.model = model
        self.error = error
        self.learning_rule = learning_rule
        self.learning_rule.initialise(self.model.params)
        self.train_dataset = train_dataset
        self.valid_dataset = valid_dataset
        self.data_monitors = OrderedDict([('error', error)])
        if data_monitors is not None:
            self.data_monitors.update(data_monitors)

    def do_training_epoch(self):
        """Do a single training epoch.

        This iterates through all batches in training dataset, for each
        calculating the gradient of the estimated error given the batch with
        respect to all the model parameters and then updates the model
        parameters according to the learning rule.
        """
        for inputs_batch, targets_batch in self.train_dataset:
            activations = self.model.fprop(inputs_batch)
            grads_wrt_outputs = self.error.grad(activations[-1], targets_batch)
            grads_wrt_params = self.model.grads_wrt_params(
                activations, grads_wrt_outputs)
            self.learning_rule.update_params(grads_wrt_params)

    def eval_monitors(self, dataset, label):
        """Evaluates the monitors for the given dataset.

        Args:
            dataset: Dataset to perform evaluation with.
            label: Tag to add to end of monitor keys to identify dataset.

        Returns:
            OrderedDict of monitor values evaluated on dataset.
        """
        data_mon_vals = OrderedDict([(key + label, 0.) for key
                                     in self.data_monitors.keys()])
        for inputs_batch, targets_batch in dataset:
            activations = self.model.fprop(inputs_batch)
            for key, data_monitor in self.data_monitors.items():
                data_mon_vals[key + label] += data_monitor(
                    activations[-1], targets_batch)
        for key, data_monitor in self.data_monitors.items():
            data_mon_vals[key + label] /= dataset.num_batches
        return data_mon_vals

    def get_epoch_stats(self):
        """Computes training statistics for an epoch.

        Returns:
            An OrderedDict with keys corresponding to the statistic labels and
            values corresponding to the value of the statistic.
        """
        epoch_stats = OrderedDict()
        epoch_stats.update(self.eval_monitors(self.train_dataset, '(train)'))
        if self.valid_dataset is not None:
            epoch_stats.update(self.eval_monitors(
                self.valid_dataset, '(valid)'))
        return epoch_stats

    def log_stats(self, epoch, epoch_time, stats):
        """Outputs stats for a training epoch to a logger.

        Args:
            epoch (int): Epoch counter.
            epoch_time: Time taken in seconds for the epoch to complete.
            stats: Monitored stats for the epoch.
        """
        logger.info('Epoch {0}: {1:.1f}s to complete\n {2}'.format(
            epoch, epoch_time,
            ', '.join(['{0}={1:.2e}'.format(k, v) for (k, v) in stats.items()])
        ))

    def train(self, num_epochs, stats_interval=5):
        """Trains a model for a set number of epochs.

        Args:
            num_epochs: Number of epochs (complete passes through training
                dataset) to train for.
            stats_interval: Training statistics will be recorded and logged
                every `stats_interval` epochs.

        Returns:
            Tuple with first value being an array of training run statistics
            and the second being a dict mapping the labels for the statistics
            recorded to their column index in the array.
        """
        run_stats = [list(self.get_epoch_stats().values())]
        for epoch in range(1, num_epochs + 1):
            start_time = time.process_time()
            self.do_training_epoch()
            epoch_time = time.process_time() - start_time
            if epoch % stats_interval == 0:
                stats = self.get_epoch_stats()
                self.log_stats(epoch, epoch_time, stats)
                run_stats.append(list(stats.values()))
        return np.array(run_stats), {k: i for i, k in enumerate(stats.keys())}
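Finally, a hedged sketch of how the optimiser is intended to be driven; the data providers and the `model`/`error` objects are assumed to be constructed as above, and the names are placeholders:

    from mlp.learning_rules import GradientDescentLearningRule
    from mlp.optimisers import Optimiser

    optimiser = Optimiser(model, error, GradientDescentLearningRule(learning_rate=1e-2),
                          train_data, valid_dataset=valid_data)
    stats, keys = optimiser.train(num_epochs=20, stats_interval=5)
    print(stats[-1, keys['error(valid)']])   # last recorded validation error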
notebooks/02_Single_layer_models.ipynb (new file, 1124 lines; diff suppressed because it is too large)
New binary files (diff not shown):
notebooks/res/._fprop-bprop-block-diagram.png
notebooks/res/._jupyter-dashboard.png
notebooks/res/._jupyter-notebook-interface.png
notebooks/res/._singleLayerNetBP-1.png
notebooks/res/._singleLayerNetPredict.png
notebooks/res/._singleLayerNetWts-1.png
notebooks/res/._singleLayerNetWtsEqns-1.png
notebooks/res/fprop-bprop-block-diagram.pdf
notebooks/res/fprop-bprop-block-diagram.png (6.9 KiB)
notebooks/res/fprop-bprop-block-diagram.tex (new file, 65 lines)
\documentclass[tikz]{standalone}

\usepackage{amsmath}
\usepackage{tikz}
\usetikzlibrary{arrows}
\usetikzlibrary{calc}
\usepackage{ifthen}

\newcommand{\vct}[1]{\boldsymbol{#1}}
\newcommand{\pd}[2]{\frac{\partial #1}{\partial #2}}

\tikzstyle{fprop} = [draw,fill=blue!20,minimum size=2em,align=center]
\tikzstyle{bprop} = [draw,fill=red!20,minimum size=2em,align=center]

\begin{document}

\begin{tikzpicture}[xscale=1.75] %
  % define number of layers
  \def\nl{2};
  % model input
  \node at (0, 0) (input) {$\vct{x}$};
  % draw fprop through model layers
  \foreach \l in {0,...,\nl} {
    \node[fprop] at (2 * \l + 1, 0) (fprop\l) {\texttt{layers[\l]} \\ \texttt{.fprop}};
    \ifthenelse{\l > 0}{
      \node at (2 * \l, 0) (hidden\l) {$\vct{h}_\l$};
      \draw[->] (hidden\l) -- (fprop\l);
      \draw[->] let \n1={\l - 1} in (fprop\n1) -- (hidden\l);
    }{
      \draw[->] (input) -- (fprop\l);
    }
  }
  % model output
  \node at (2 * \nl + 2, 0) (output) {$\mathbf{y}$};
  % error function
  \node[fprop] at (2 * \nl + 3, 0) (errorfunc) {\texttt{error}};
  % error value
  \node at (2 * \nl + 3, -1) (error) {$\bar{E}$};
  % targets
  \node at (2 * \nl + 4, -1) (tgt) {$\vct{t}$};
  % error gradient
  \node[bprop] at (2 * \nl + 3, -2) (errorgrad) {\texttt{error} \\ \texttt{.grad}};
  % gradient wrt outputs
  \node at (2 * \nl + 2, -2) (gradoutput) {$\pd{\bar{E}}{\vct{y}}$};
  \draw[->] (fprop\nl) -- (output);
  \draw[->] (output) -- (errorfunc);
  \draw[->] (errorfunc) -- (error);
  \draw[->] (error) -- (errorgrad);
  \draw[->] (errorgrad) -- (gradoutput);
  \draw[->] (tgt) |- (errorfunc);
  \draw[->] (tgt) |- (errorgrad);
  \foreach \l in {0,...,\nl} {
    \node[bprop] at (2 * \l + 1, -2) (bprop\l) {\texttt{layers[\l]} \\ \texttt{.bprop}};
    \ifthenelse{\l > 0}{
      \node at (2 * \l, -2) (grad\l) {$\pd{\bar{E}}{\vct{h}_\l}$};
      \draw[<-] (grad\l) -- (bprop\l);
      \draw[<-] let \n1={\l - 1} in (bprop\n1) -- (grad\l);
    }{}
  }
  \node at (0, -2) (gradinput) {$\pd{\bar{E}}{\vct{x}}$};
  \draw[->] (bprop0) -- (gradinput);
  \draw[->] (gradoutput) -- (bprop\nl);
\end{tikzpicture}

\end{document}