update lab 2

tpmmthomas 2024-09-21 02:09:17 +08:00
parent f5579c980d
commit 2702ee6f7b
20 changed files with 2071 additions and 49 deletions


@@ -75,6 +75,9 @@ class DataProvider(object):
self.inputs = self.inputs[new_order]
self.targets = self.targets[new_order]
def __next__(self):
return self.next()
def next(self):
"""Returns next data batch or raises `StopIteration` if at end."""
if self._curr_batch + 1 > self.num_batches:
@@ -133,13 +136,10 @@ class MNISTDataProvider(DataProvider):
super(MNISTDataProvider, self).__init__(
inputs, targets, batch_size, max_num_batches, shuffle_order, rng)
def __next__(self):
return self.next()
def next(self):
"""Returns next data batch or raises `StopIteration` if at end."""
inputs_batch, targets_batch = super(MNISTDataProvider, self).next()
return inputs_batch, self.to_one_of_k(targets_batch)
def to_one_of_k(self, int_targets):
"""Converts integer coded class target to 1 of K coded targets.
@@ -156,21 +156,23 @@ class MNISTDataProvider(DataProvider):
to zero except for the column corresponding to the correct class
which is equal to one.
"""
one_of_k_targets = np.zeros((int_targets.shape[0], self.num_classes))
one_of_k_targets[range(int_targets.shape[0]), int_targets] = 1
return one_of_k_targets
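The fancy-indexing trick in `to_one_of_k` is easy to sanity-check in isolation. A minimal standalone sketch (the `num_classes = 10` value matches MNIST; the rest is illustrative):
import numpy as np
int_targets = np.array([3, 0, 9, 1])
num_classes = 10  # MNIST digit classes
one_of_k = np.zeros((int_targets.shape[0], num_classes))
# row i gets a one in the column indexed by int_targets[i]
one_of_k[np.arange(int_targets.shape[0]), int_targets] = 1
assert (one_of_k.argmax(-1) == int_targets).all()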
class MetOfficeDataProvider(DataProvider):
"""South Scotland Met Office weather data provider."""
def __init__(self, window_size, batch_size=10, max_num_batches=-1,
shuffle_order=True, rng=None):
"""Create a new Met Office data provider object.
Args:
window_size (int): Size of windows to split weather time series
data into. The constructed input features will be the first
`window_size - 1` entries in each window and the target outputs
the last entry in each window.
batch_size (int): Number of data points to include in each batch.
max_num_batches (int): Maximum number of batches to iterate over
in an epoch. If `max_num_batches * batch_size > num_data` then
@@ -180,29 +182,74 @@ class MetOfficeDataProvider(DataProvider):
the data before each epoch.
rng (RandomState): A seeded random number generator.
"""
data_path = os.path.join(
os.environ['MLP_DATA_DIR'], 'HadSSP_daily_qc.txt')
assert os.path.isfile(data_path), (
'Data file does not exist at expected path: ' + data_path
)
# load raw data from text file
raw = np.loadtxt(data_path, skiprows=3, usecols=range(2, 32))
assert window_size > 1, 'window_size must be at least 2.'
self.window_size = window_size
# filter out all missing datapoints and flatten to a vector
filtered = raw[raw >= 0].flatten()
# normalise data to zero mean, unit standard deviation
mean = np.mean(filtered)
std = np.std(filtered)
normalised = (filtered - mean) / std
# create a view on to array corresponding to a rolling window
shape = (normalised.shape[-1] - self.window_size + 1, self.window_size)
strides = normalised.strides + (normalised.strides[-1],)
windowed = np.lib.stride_tricks.as_strided(
normalised, shape=shape, strides=strides)
# inputs are first (window_size - 1) entries in windows
inputs = windowed[:, :-1]
# targets are last entry in windows
targets = windowed[:, -1]
super(MetOfficeDataProvider, self).__init__(
inputs, targets, batch_size, max_num_batches, shuffle_order, rng)
class CCPPDataProvider(DataProvider):
def __next__(self):
return self.next()
def __init__(self, which_set='train', input_dims=None, batch_size=10,
max_num_batches=-1, shuffle_order=True, rng=None):
"""Create a new Combined Cycle Power Plant data provider object.
Args:
which_set: One of 'train' or 'valid'. Determines which portion of
data this object should provide.
input_dims: Which of the four input dimension to use. If `None` all
are used. If an iterable of integers are provided (consisting
of a subset of {0, 1, 2, 3}) then only the corresponding
input dimensions are included.
batch_size (int): Number of data points to include in each batch.
max_num_batches (int): Maximum number of batches to iterate over
in an epoch. If `max_num_batches * batch_size > num_data` then
only as many batches as the data can be split into will be
used. If set to -1 all of the data will be used.
shuffle_order (bool): Whether to randomly permute the order of
the data before each epoch.
rng (RandomState): A seeded random number generator.
"""
data_path = os.path.join(
os.environ['MLP_DATA_DIR'], 'ccpp_data.npz')
assert os.path.isfile(data_path), (
'Data file does not exist at expected path: ' + data_path
)
# check a valid which_set was provided
assert which_set in ['train', 'valid'], (
'Expected which_set to be either train or valid '
'Got {0}'.format(which_set)
)
# check input_dims are valid
if input_dims is not None:
input_dims = set(input_dims)
assert input_dims.issubset({0, 1, 2, 3}), (
'input_dims should be a subset of {0, 1, 2, 3}'
)
loaded = np.load(data_path)
inputs = loaded[which_set + '_inputs']
if input_dims is not None:
# index with a sorted list; numpy arrays cannot be indexed with a set
inputs = inputs[:, sorted(input_dims)]
targets = loaded[which_set + '_targets']
super(CCPPDataProvider, self).__init__(
inputs, targets, batch_size, max_num_batches, shuffle_order, rng)
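The `as_strided` call in `MetOfficeDataProvider` builds a zero-copy rolling-window view of the flat series. A minimal sketch of the same construction on a toy array (names illustrative):
import numpy as np
seq = np.arange(6.)  # stand-in for the normalised rainfall series
window_size = 3
shape = (seq.shape[-1] - window_size + 1, window_size)
strides = seq.strides + (seq.strides[-1],)
windowed = np.lib.stride_tricks.as_strided(seq, shape=shape, strides=strides)
# windowed == [[0, 1, 2], [1, 2, 3], [2, 3, 4], [3, 4, 5]]
inputs, targets = windowed[:, :-1], windowed[:, -1]
Because the view shares memory with `seq`, no data is copied: each row is one window, with the first `window_size - 1` columns used as inputs and the last column as the target.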

mlp/errors.py (new file, 46 lines added)

@@ -0,0 +1,46 @@
# -*- coding: utf-8 -*-
"""Error functions.
This module defines error functions, with the aim of model training being to
minimise the error function given a set of inputs and target outputs.
The error functions will typically measure some concept of distance between the
model outputs and target outputs, averaged over all data points in the data set
or batch.
"""
import numpy as np
class SumOfSquaredDiffsError(object):
"""Sum of squared differences (squared Euclidean distance) error."""
def __call__(self, outputs, targets):
"""Calculates error function given a batch of outputs and targets.
Args:
outputs: Array of model outputs of shape (batch_size, output_dim).
targets: Array of target outputs of shape (batch_size, output_dim).
Returns:
Scalar error function value.
"""
#TODO write your code here
raise NotImplementedError()
def grad(self, outputs, targets):
"""Calculates gradient of error function with respect to outputs.
Args:
outputs: Array of model outputs of shape (batch_size, output_dim).
targets: Array of target outputs of shape (batch_size, output_dim).
Returns:
Gradient of error function with respect to outputs. This should be
an array of shape (batch_size, output_dim).
"""
#TODO write your code here
raise NotImplementedError()
def __repr__(self):
return 'SumOfSquaredDiffsError'
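The two `#TODO` bodies are the lab exercise. One possible completion, assuming the intended error is the batch-averaged halved squared Euclidean distance (consistent with the shapes in the docstrings above; the class name here is hypothetical so as not to pass this off as the official solution):
import numpy as np
class SumOfSquaredDiffsErrorSketch(object):
    """Possible completion of the TODOs above."""
    def __call__(self, outputs, targets):
        # E = (1 / 2N) * sum_n ||y_n - t_n||^2
        return 0.5 * np.mean(np.sum((outputs - targets) ** 2, axis=1))
    def grad(self, outputs, targets):
        # dE/dy_n = (y_n - t_n) / N, shape (batch_size, output_dim)
        return (outputs - targets) / outputs.shape[0]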

mlp/initialisers.py (new file, 65 lines added)

@@ -0,0 +1,65 @@
# -*- coding: utf-8 -*-
"""Parameter initialisers.
This module defines classes to initialise the parameters in a layer.
"""
import numpy as np
from mlp import DEFAULT_SEED
class ConstantInit(object):
"""Constant parameter initialiser."""
def __init__(self, value):
"""Construct a constant parameter initialiser.
Args:
value: Value to initialise parameter to.
"""
self.value = value
def __call__(self, shape):
return np.ones(shape=shape) * self.value
class UniformInit(object):
"""Random uniform parameter initialiser."""
def __init__(self, low, high, rng=None):
"""Construct a random uniform parameter initialiser.
Args:
low: Lower bound of interval to sample from.
high: Upper bound of interval to sample from.
rng (RandomState): Seeded random number generator.
"""
self.low = low
self.high = high
if rng is None:
rng = np.random.RandomState(DEFAULT_SEED)
self.rng = rng
def __call__(self, shape):
return self.rng.uniform(low=self.low, high=self.high, size=shape)
class NormalInit(object):
"""Random normal parameter initialiser."""
def __init__(self, mean, std, rng=None):
"""Construct a random uniform parameter initialiser.
Args:
mean: Mean of distribution to sample from.
std: Standard deviation of distribution to sample from.
rng (RandomState): Seeded random number generator.
"""
self.mean = mean
self.std = std
if rng is None:
rng = np.random.RandomState(DEFAULT_SEED)
self.rng = rng
def __call__(self, shape):
return self.rng.normal(loc=self.mean, scale=self.std, size=shape)
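Each initialiser is just a callable mapping a shape to an array; a minimal usage sketch (values illustrative):
import numpy as np
from mlp.initialisers import ConstantInit, UniformInit
weights_init = UniformInit(-0.1, 0.1, rng=np.random.RandomState(123))
weights = weights_init((5, 3))   # 5x3 array with entries drawn from [-0.1, 0.1)
biases = ConstantInit(0.)((5,))  # zero vector of length 5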

mlp/layers.py (new file, 141 lines added)

@@ -0,0 +1,141 @@
# -*- coding: utf-8 -*-
"""Layer definitions.
This module defines classes which encapsulate a single layer.
These layers map input activations to output activation with the `fprop`
method and map gradients with respect to outputs to gradients with respect to
their inputs with the `bprop` method.
Some layers will have learnable parameters and so will additionally define
methods for getting and setting parameter and calculating gradients with
respect to the layer parameters.
"""
import numpy as np
import mlp.initialisers as init
class Layer(object):
"""Abstract class defining the interface for a layer."""
def fprop(self, inputs):
"""Forward propagates activations through the layer transformation.
Args:
inputs: Array of layer inputs of shape (batch_size, input_dim).
Returns:
outputs: Array of layer outputs of shape (batch_size, output_dim).
"""
raise NotImplementedError()
def bprop(self, inputs, outputs, grads_wrt_outputs):
"""Back propagates gradients through a layer.
Given gradients with respect to the outputs of the layer calculates the
gradients with respect to the layer inputs.
Args:
inputs: Array of layer inputs of shape (batch_size, input_dim).
outputs: Array of layer outputs calculated in forward pass of
shape (batch_size, output_dim).
grads_wrt_outputs: Array of gradients with respect to the layer
outputs of shape (batch_size, output_dim).
Returns:
Array of gradients with respect to the layer inputs of shape
(batch_size, input_dim).
"""
raise NotImplementedError()
class LayerWithParameters(Layer):
"""Abstract class defining the interface for a layer with parameters."""
def grads_wrt_params(self, inputs, grads_wrt_outputs):
"""Calculates gradients with respect to layer parameters.
Args:
inputs: Array of inputs to layer of shape (batch_size, input_dim).
grads_wrt_outputs: Array of gradients with respect to the layer
outputs of shape (batch_size, output_dim).
Returns:
List of arrays of gradients with respect to the layer parameters,
with parameter gradients appearing in the same order as the
parameter values returned by the `params` property.
"""
raise NotImplementedError()
@property
def params(self):
"""Returns a list of parameters of layer.
Returns:
List of current parameter values.
"""
raise NotImplementedError()
class AffineLayer(LayerWithParameters):
"""Layer implementing an affine tranformation of its inputs.
This layer is parameterised by a weight matrix and bias vector.
"""
def __init__(self, input_dim, output_dim,
weights_initialiser=init.UniformInit(-0.1, 0.1),
biases_initialiser=init.ConstantInit(0.),
weights_cost=None, biases_cost=None):
"""Initialises a parameterised affine layer.
Args:
input_dim (int): Dimension of inputs to the layer.
output_dim (int): Dimension of the layer outputs.
weights_initialiser: Initialiser for the weight parameters.
biases_initialiser: Initialiser for the bias parameters.
"""
self.input_dim = input_dim
self.output_dim = output_dim
self.weights = weights_initialiser((self.output_dim, self.input_dim))
self.biases = biases_initialiser(self.output_dim)
def fprop(self, inputs):
"""Forward propagates activations through the layer transformation.
For inputs `x`, outputs `y`, weights `W` and biases `b` the layer
corresponds to `y = W.dot(x) + b`.
Args:
inputs: Array of layer inputs of shape (batch_size, input_dim).
Returns:
outputs: Array of layer outputs of shape (batch_size, output_dim).
"""
#TODO write your code here
raise NotImplementedError()
def grads_wrt_params(self, inputs, grads_wrt_outputs):
"""Calculates gradients with respect to layer parameters.
Args:
inputs: array of inputs to layer of shape (batch_size, input_dim)
grads_wrt_outputs: array of gradients with respect to the layer
outputs of shape (batch_size, output_dim)
Returns:
list of arrays of gradients with respect to the layer parameters
`[grads_wrt_weights, grads_wrt_biases]`.
"""
#TODO write your code here
raise NotImplementedError()
@property
def params(self):
"""A list of layer parameter values: `[weights, biases]`."""
return [self.weights, self.biases]
def __repr__(self):
return 'AffineLayer(input_dim={0}, output_dim={1})'.format(
self.input_dim, self.output_dim)
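As in errors.py, the `#TODO` bodies are the exercise. A sketch of one consistent completion, given that `__init__` above creates `weights` with shape `(output_dim, input_dim)`; these are drop-in method bodies under that assumption, not the official solution:
import numpy as np
def fprop(self, inputs):
    # y = x W^T + b, applied to the whole batch at once
    return inputs.dot(self.weights.T) + self.biases
def grads_wrt_params(self, inputs, grads_wrt_outputs):
    # dE/dW = (dE/dy)^T x has shape (output_dim, input_dim);
    # dE/db sums gradients over the batch, shape (output_dim,)
    return [grads_wrt_outputs.T.dot(inputs), grads_wrt_outputs.sum(axis=0)]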

mlp/learning_rules.py (new file, 162 lines added)

@@ -0,0 +1,162 @@
# -*- coding: utf-8 -*-
"""Learning rules.
This module contains classes implementing gradient based learning rules.
"""
import numpy as np
class GradientDescentLearningRule(object):
"""Simple (stochastic) gradient descent learning rule.
For a scalar error function `E(p[0], p[1], ...)` of some set of
potentially multidimensional parameters this attempts to find a local
minimum of the loss function by applying updates to each parameter of the
form
p[i] := p[i] - learning_rate * dE/dp[i]
with `learning_rate` a positive scaling parameter.
The error function used in successive applications of these updates may be
a stochastic estimator of the true error function (e.g. when the error with
respect to only a subset of data-points is calculated) in which case this
will correspond to a stochastic gradient descent learning rule.
"""
def __init__(self, learning_rate=1e-3):
"""Creates a new learning rule object.
Args:
learning_rate: A positive scalar to scale gradient updates to the
parameters by. This needs to be carefully set - if too large
the learning dynamic will be unstable and may diverge, while
if set too small learning will proceed very slowly.
"""
assert learning_rate > 0., 'learning_rate should be positive.'
self.learning_rate = learning_rate
def initialise(self, params):
"""Initialises the state of the learning rule for a set or parameters.
This must be called before `update_params` is first called.
Args:
params: A list of the parameters to be optimised. Note these will
be updated *in-place* to avoid reallocating arrays on each
update.
"""
self.params = params
def reset(self):
"""Resets any additional state variables to their intial values.
For this learning rule there are no additional state variables so we
do nothing here.
"""
pass
def update_params(self, grads_wrt_params):
"""Applies a single gradient descent update to all parameters.
All parameter updates are performed using in-place operations and so
nothing is returned.
Args:
grads_wrt_params: A list of gradients of the scalar loss function
with respect to each of the parameters passed to `initialise`
previously, with this list expected to be in the same order.
"""
for param, grad in zip(self.params, grads_wrt_params):
param -= self.learning_rate * grad
class MomentumLearningRule(GradientDescentLearningRule):
"""Gradient descent with momentum learning rule.
This extends the basic gradient learning rule by introducing extra
momentum state variables for each parameter. These can help the learning
dynamic overcome shallow local minima and speed convergence when
making multiple successive steps in a similar direction in parameter space.
For parameter p[i] and corresponding momentum m[i] the updates for a
scalar loss function `L` are of the form
m[i] := mom_coeff * m[i] - learning_rate * dL/dp[i]
p[i] := p[i] + m[i]
with `learning_rate` a positive scaling parameter for the gradient updates
and `mom_coeff` a value in [0, 1] that determines how much 'friction' there
is in the system and so how quickly previous momentum contributions decay.
"""
def __init__(self, learning_rate=1e-3, mom_coeff=0.9):
"""Creates a new learning rule object.
Args:
learning_rate: A positive scalar to scale gradient updates to the
parameters by. This needs to be carefully set - if too large
the learning dynamic will be unstable and may diverge, while
if set too small learning will proceed very slowly.
mom_coeff: A scalar in the range [0, 1] inclusive. This determines
the contribution of the previous momentum value to the value
after each update. If equal to 0 the momentum is set to exactly
the negative scaled gradient each update and so this rule
collapses to standard gradient descent. If equal to 1 the
momentum will just be decremented by the scaled gradient at
each update. This is equivalent to simulating the dynamic in
a frictionless system. Due to energy conservation the loss
of 'potential energy' as the dynamics moves down the loss
function surface will lead to an increasingly large 'kinetic
energy' and so speed, meaning the updates will become
increasingly large, potentially unstably so. Typically a value
less than but close to 1 will avoid these issues and cause the
dynamic to converge to a local minimum where the gradients are
by definition zero.
"""
super(MomentumLearningRule, self).__init__(learning_rate)
assert mom_coeff >= 0. and mom_coeff <= 1., (
'mom_coeff should be in the range [0, 1].'
)
self.mom_coeff = mom_coeff
def initialise(self, params):
"""Initialises the state of the learning rule for a set or parameters.
This must be called before `update_params` is first called.
Args:
params: A list of the parameters to be optimised. Note these will
be updated *in-place* to avoid reallocating arrays on each
update.
"""
super(MomentumLearningRule, self).initialise(params)
self.moms = []
for param in self.params:
self.moms.append(np.zeros_like(param))
def reset(self):
"""Resets any additional state variables to their intial values.
For this learning rule this corresponds to zeroing all the momenta.
"""
for mom in self.moms:
mom *= 0.
def update_params(self, grads_wrt_params):
"""Applies a single update to all parameters.
All parameter updates are performed using in-place operations and so
nothing is returned.
Args:
grads_wrt_params: A list of gradients of the scalar loss function
with respect to each of the parameters passed to `initialise`
previously, with this list expected to be in the same order.
"""
for param, mom, grad in zip(self.params, self.moms, grads_wrt_params):
mom *= self.mom_coeff
mom -= self.learning_rate * grad
param += mom
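A quick check of the in-place update contract on a toy quadratic loss E(p) = 0.5 * p^2 (so dE/dp = p; all values illustrative):
import numpy as np
from mlp.learning_rules import MomentumLearningRule
params = [np.array([1.0, -2.0])]
rule = MomentumLearningRule(learning_rate=0.1, mom_coeff=0.9)
rule.initialise(params)          # must be called before update_params
for _ in range(100):
    grads = [params[0].copy()]   # gradient of the toy loss at current params
    rule.update_params(grads)    # mutates params[0] in place
print(params[0])                 # both entries should now be near zero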

mlp/models.py (new file, 67 lines added)

@@ -0,0 +1,67 @@
# -*- coding: utf-8 -*-
"""Model definitions.
This module implements objects encapsulating learnable models of input-output
relationships. The model objects implement methods for forward propagating
the inputs through the transformation(s) defined by the model to produce
outputs (and intermediate states) and for calculating gradients of scalar
functions of the outputs with respect to the model parameters.
"""
from mlp.layers import LayerWithParameters
class SingleLayerModel(object):
"""A model consisting of a single transformation layer."""
def __init__(self, layer):
"""Create a new single layer model instance.
Args:
layer: The layer object defining the model architecture.
"""
self.layer = layer
@property
def params(self):
"""A list of all of the parameters of the model."""
return self.layer.params
def fprop(self, inputs):
"""Calculate the model outputs corresponding to a batch of inputs.
Args:
inputs: Batch of inputs to the model.
Returns:
List which is a concatenation of the model inputs and model
outputs, this being done for consistency of the interface with
multi-layer models for which `fprop` returns a list of
activations through all intermediate layers of the model and including
the inputs and outputs.
"""
activations = [inputs, self.layer.fprop(inputs)]
return activations
def grads_wrt_params(self, activations, grads_wrt_outputs):
"""Calculates gradients with respect to the model parameters.
Args:
activations: List of all activations from forward pass through
model using `fprop`.
grads_wrt_outputs: Gradient with respect to the model outputs of
the scalar function parameter gradients are being calculated
for.
Returns:
List of gradients of the scalar function with respect to all model
parameters.
"""
return self.layer.grads_wrt_params(activations[0], grads_wrt_outputs)
def params_cost(self):
"""Calculates the parameter dependent cost term of the model."""
return self.layer.params_cost()
def __repr__(self):
return 'SingleLayerModel(' + str(self.layer) + ')'
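The pieces compose into a single hand-rolled gradient step as follows; a sketch assuming the `AffineLayer` and error-function TODOs above have been completed:
import numpy as np
from mlp.errors import SumOfSquaredDiffsError
from mlp.layers import AffineLayer
from mlp.learning_rules import GradientDescentLearningRule
from mlp.models import SingleLayerModel
model = SingleLayerModel(AffineLayer(input_dim=4, output_dim=1))
error = SumOfSquaredDiffsError()
rule = GradientDescentLearningRule(learning_rate=1e-2)
rule.initialise(model.params)
inputs = np.random.RandomState(0).normal(size=(10, 4))
targets = np.zeros((10, 1))
activations = model.fprop(inputs)          # [inputs, outputs]
grads_wrt_outputs = error.grad(activations[-1], targets)
grads_wrt_params = model.grads_wrt_params(activations, grads_wrt_outputs)
rule.update_params(grads_wrt_params)       # parameters updated in place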

mlp/optimisers.py (new file, 134 lines added)

@@ -0,0 +1,134 @@
# -*- coding: utf-8 -*-
"""Model optimisers.
This module contains objects implementing (batched) stochastic gradient descent
based optimisation of models.
"""
import time
import logging
from collections import OrderedDict
import numpy as np
logger = logging.getLogger(__name__)
class Optimiser(object):
"""Basic model optimiser."""
def __init__(self, model, error, learning_rule, train_dataset,
valid_dataset=None, data_monitors=None):
"""Create a new optimiser instance.
Args:
model: The model to optimise.
error: The scalar error function to minimise.
learning_rule: Gradient based learning rule to use to minimise
error.
train_dataset: Data provider for training set data batches.
valid_dataset: Data provider for validation set data batches.
data_monitors: Dictionary of functions evaluated on targets and
model outputs (averaged across both full training and
validation data sets) to monitor during training in addition
to the error. Keys should correspond to a string label for
the statistic being evaluated.
"""
self.model = model
self.error = error
self.learning_rule = learning_rule
self.learning_rule.initialise(self.model.params)
self.train_dataset = train_dataset
self.valid_dataset = valid_dataset
self.data_monitors = OrderedDict([('error', error)])
if data_monitors is not None:
self.data_monitors.update(data_monitors)
def do_training_epoch(self):
"""Do a single training epoch.
This iterates through all batches in training dataset, for each
calculating the gradient of the estimated error given the batch with
respect to all the model parameters and then updates the model
parameters according to the learning rule.
"""
for inputs_batch, targets_batch in self.train_dataset:
activations = self.model.fprop(inputs_batch)
grads_wrt_outputs = self.error.grad(activations[-1], targets_batch)
grads_wrt_params = self.model.grads_wrt_params(
activations, grads_wrt_outputs)
self.learning_rule.update_params(grads_wrt_params)
def eval_monitors(self, dataset, label):
"""Evaluates the monitors for the given dataset.
Args:
dataset: Dataset to perform evaluation with.
label: Tag to add to end of monitor keys to identify dataset.
Returns:
OrderedDict of monitor values evaluated on dataset.
"""
data_mon_vals = OrderedDict([(key + label, 0.) for key
in self.data_monitors.keys()])
for inputs_batch, targets_batch in dataset:
activations = self.model.fprop(inputs_batch)
for key, data_monitor in self.data_monitors.items():
data_mon_vals[key + label] += data_monitor(
activations[-1], targets_batch)
for key, data_monitor in self.data_monitors.items():
data_mon_vals[key + label] /= dataset.num_batches
return data_mon_vals
def get_epoch_stats(self):
"""Computes training statistics for an epoch.
Returns:
An OrderedDict with keys corresponding to the statistic labels and
values corresponding to the value of the statistic.
"""
epoch_stats = OrderedDict()
epoch_stats.update(self.eval_monitors(self.train_dataset, '(train)'))
if self.valid_dataset is not None:
epoch_stats.update(self.eval_monitors(
self.valid_dataset, '(valid)'))
return epoch_stats
def log_stats(self, epoch, epoch_time, stats):
"""Outputs stats for a training epoch to a logger.
Args:
epoch (int): Epoch counter.
epoch_time: Time taken in seconds for the epoch to complete.
stats: Monitored stats for the epoch.
"""
logger.info('Epoch {0}: {1:.1f}s to complete\n {2}'.format(
epoch, epoch_time,
', '.join(['{0}={1:.2e}'.format(k, v) for (k, v) in stats.items()])
))
def train(self, num_epochs, stats_interval=5):
"""Trains a model for a set number of epochs.
Args:
num_epochs: Number of epochs (complete passes through training
dataset) to train for.
stats_interval: Training statistics will be recorded and logged
every `stats_interval` epochs.
Returns:
Tuple with first value being an array of training run statistics
and the second being a dict mapping the labels for the statistics
recorded to their column index in the array.
"""
run_stats = [list(self.get_epoch_stats().values())]
for epoch in range(1, num_epochs + 1):
start_time = time.process_time()
self.do_training_epoch()
epoch_time = time.process_time() - start_time
if epoch % stats_interval == 0:
stats = self.get_epoch_stats()
self.log_stats(epoch, epoch_time, stats)
run_stats.append(list(stats.values()))
return np.array(run_stats), {k: i for i, k in enumerate(stats.keys())}
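End to end, the intended usage looks roughly like this; a sketch on a synthetic regression task so no data files or environment variables are needed (and again assuming the lab TODOs are completed):
import numpy as np
from mlp.data_providers import DataProvider
from mlp.errors import SumOfSquaredDiffsError
from mlp.layers import AffineLayer
from mlp.learning_rules import GradientDescentLearningRule
from mlp.models import SingleLayerModel
from mlp.optimisers import Optimiser
rng = np.random.RandomState(27)
inputs = rng.normal(size=(100, 4))
targets = inputs.dot(rng.normal(size=(4, 1)))  # linear ground truth
train_data = DataProvider(inputs, targets, batch_size=10)
model = SingleLayerModel(AffineLayer(input_dim=4, output_dim=1))
optimiser = Optimiser(
    model, SumOfSquaredDiffsError(),
    GradientDescentLearningRule(learning_rate=1e-2), train_data)
stats, keys = optimiser.train(num_epochs=20, stats_interval=5)
print(stats[-1, keys['error(train)']])  # final training-set error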


@@ -239,4 +239,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

[9 binary files not shown; one new image added, 6.9 KiB]


@@ -0,0 +1,65 @@
\documentclass[tikz]{standalone}
\usepackage{amsmath}
\usepackage{tikz}
\usetikzlibrary{arrows}
\usetikzlibrary{calc}
\usepackage{ifthen}
\newcommand{\vct}[1]{\boldsymbol{#1}}
\newcommand{\pd}[2]{\frac{\partial #1}{\partial #2}}
\tikzstyle{fprop} = [draw,fill=blue!20,minimum size=2em,align=center]
\tikzstyle{bprop} = [draw,fill=red!20,minimum size=2em,align=center]
\begin{document}
\begin{tikzpicture}[xscale=1.75] %
% define number of layers
\def\nl{2};
% model input
\node at (0, 0) (input) {$\vct{x}$};
% draw fprop through model layers
\foreach \l in {0,...,\nl} {
\node[fprop] at (2 * \l + 1, 0) (fprop\l) {\texttt{layers[\l]} \\ \texttt{.fprop}};
\ifthenelse{\l > 0}{
\node at (2 * \l, 0) (hidden\l) {$\vct{h}_\l$};
\draw[->] (hidden\l) -- (fprop\l);
\draw[->] let \n1={\l - 1} in (fprop\n1) -- (hidden\l);
}{
\draw[->] (input) -- (fprop\l);
}
}
% model output
\node at (2 * \nl + 2, 0) (output) {$\vct{y}$};
% error function
\node[fprop] at (2 * \nl + 3, 0) (errorfunc) {\texttt{error}};
% error value
\node at (2 * \nl + 3, -1) (error) {$\bar{E}$};
% targets
\node at (2 * \nl + 4, -1) (tgt) {$\vct{t}$};
% error gradient
\node[bprop] at (2 * \nl + 3, -2) (errorgrad) {\texttt{error} \\ \texttt{.grad}};
% gradient wrt outputs
\node at (2 * \nl + 2, -2) (gradoutput) {$\pd{\bar{E}}{\vct{y}}$};
\draw[->] (fprop\nl) -- (output);
\draw[->] (output) -- (errorfunc);
\draw[->] (errorfunc) -- (error);
\draw[->] (error) -- (errorgrad);
\draw[->] (errorgrad) -- (gradoutput);
\draw[->] (tgt) |- (errorfunc);
\draw[->] (tgt) |- (errorgrad);
\foreach \l in {0,...,\nl} {
\node[bprop] at (2 * \l + 1, -2) (bprop\l) {\texttt{layers[\l]} \\ \texttt{.bprop}};
\ifthenelse{\l > 0}{
\node at (2 * \l, -2) (grad\l) {$\pd{\bar{E}}{\vct{h}_\l}$};
\draw[<-] (grad\l) -- (bprop\l);
\draw[<-] let \n1={\l - 1} in (bprop\n1) -- (grad\l);
}{}
}
\node at (0, -2) (gradinput) {$\pd{\bar{E}}{\vct{x}}$};
\draw[->] (bprop0) -- (gradinput);
\draw[->] (gradoutput) -- (bprop\nl);
\end{tikzpicture}
\end{document}