From 04fe5a72798eb8a9ddcbd9fbaa55587c2968bb6e Mon Sep 17 00:00:00 2001 From: Matt Graham Date: Thu, 22 Sep 2016 15:03:29 +0100 Subject: [PATCH] Removing files not relevant to first lab. --- mlp/costs.py | 173 ------- mlp/initialisers.py | 65 --- mlp/layers.py | 325 ------------- mlp/learning_rules.py | 161 ------- mlp/models.py | 145 ------ mlp/optimisers.py | 134 ------ mlp/schedulers.py | 172 ------- mlp/utils.py | 361 -------------- notebooks/02_Linear_models.ipynb | 650 -------------------------- notebooks/03_Multi_layer_models.ipynb | 303 ------------ notebooks/04_Regularisation.ipynb | 293 ------------ notebooks/05_Transfer_functions.ipynb | 238 ---------- 12 files changed, 3020 deletions(-) delete mode 100644 mlp/costs.py delete mode 100644 mlp/initialisers.py delete mode 100644 mlp/layers.py delete mode 100644 mlp/learning_rules.py delete mode 100644 mlp/models.py delete mode 100644 mlp/optimisers.py delete mode 100644 mlp/schedulers.py delete mode 100644 mlp/utils.py delete mode 100644 notebooks/02_Linear_models.ipynb delete mode 100644 notebooks/03_Multi_layer_models.ipynb delete mode 100644 notebooks/04_Regularisation.ipynb delete mode 100644 notebooks/05_Transfer_functions.ipynb diff --git a/mlp/costs.py b/mlp/costs.py deleted file mode 100644 index bd103b3..0000000 --- a/mlp/costs.py +++ /dev/null @@ -1,173 +0,0 @@ -# -*- coding: utf-8 -*- -"""Model costs. - -This module defines cost functions, with the aim of model training being to -minimise the cost function given a set of inputs and target outputs. The cost -functions typically measure some concept of distance between the model outputs -and target outputs. -""" - -import numpy as np - - -class MeanSquaredErrorCost(object): - """Mean squared error cost.""" - - def __call__(self, outputs, targets): - """Calculates cost function given a batch of outputs and targets. - - Args: - outputs: Array of model outputs of shape (batch_size, output_dim). - targets: Array of target outputs of shape (batch_size, output_dim). - - Returns: - Scalar cost function value. - """ - return 0.5 * np.mean(np.sum((outputs - targets)**2, axis=1)) - - def grad(self, outputs, targets): - """Calculates gradient of cost function with respect to outputs. - - Args: - outputs: Array of model outputs of shape (batch_size, output_dim). - targets: Array of target outputs of shape (batch_size, output_dim). - - Returns: - Gradient of cost function with respect to outputs. - """ - return outputs - targets - - def __repr__(self): - return 'MeanSquaredErrorCost' - - -class BinaryCrossEntropyCost(object): - """Binary cross entropy cost.""" - - def __call__(self, outputs, targets): - """Calculates cost function given a batch of outputs and targets. - - Args: - outputs: Array of model outputs of shape (batch_size, output_dim). - targets: Array of target outputs of shape (batch_size, output_dim). - - Returns: - Scalar cost function value. - """ - return -np.mean( - targets * np.log(outputs) + (1. - targets) * np.log(1. - ouputs)) - - def grad(self, outputs, targets): - """Calculates gradient of cost function with respect to outputs. - - Args: - outputs: Array of model outputs of shape (batch_size, output_dim). - targets: Array of target outputs of shape (batch_size, output_dim). - - Returns: - Gradient of cost function with respect to outputs. - """ - return (1. - targets) / (1. 
- outputs) - (targets / outputs) - - def __repr__(self): - return 'BinaryCrossEntropyCost' - - -class BinaryCrossEntropySigmoidCost(object): - """Binary cross entropy cost with logistic sigmoid applied to outputs.""" - - def __call__(self, outputs, targets): - """Calculates cost function given a batch of outputs and targets. - - Args: - outputs: Array of model outputs of shape (batch_size, output_dim). - targets: Array of target outputs of shape (batch_size, output_dim). - - Returns: - Scalar cost function value. - """ - probs = 1. / (1. + np.exp(-outputs)) - return -np.mean( - targets * np.log(probs) + (1. - targets) * np.log(1. - probs)) - - def grad(self, outputs, targets): - """Calculates gradient of cost function with respect to outputs. - - Args: - outputs: Array of model outputs of shape (batch_size, output_dim). - targets: Array of target outputs of shape (batch_size, output_dim). - - Returns: - Gradient of cost function with respect to outputs. - """ - probs = 1. / (1. + np.exp(-outputs)) - return probs - targets - - def __repr__(self): - return 'BinaryCrossEntropySigmoidCost' - - -class CrossEntropyCost(object): - """Multi-class cross entropy cost.""" - - def __call__(self, outputs, targets): - """Calculates cost function given a batch of outputs and targets. - - Args: - outputs: Array of model outputs of shape (batch_size, output_dim). - targets: Array of target outputs of shape (batch_size, output_dim). - - Returns: - Scalar cost function value. - """ - return -np.mean(np.sum(targets * np.log(outputs), axis=1)) - - def grad(self, outputs, targets): - """Calculates gradient of cost function with respect to outputs. - - Args: - outputs: Array of model outputs of shape (batch_size, output_dim). - targets: Array of target outputs of shape (batch_size, output_dim). - - Returns: - Gradient of cost function with respect to outputs. - """ - return -targets / outputs - - def __repr__(self): - return 'CrossEntropyCost' - - -class CrossEntropySoftmaxCost(object): - """Multi-class cross entropy cost with Softmax applied to outputs.""" - - def __call__(self, outputs, targets): - """Calculates cost function given a batch of outputs and targets. - - Args: - outputs: Array of model outputs of shape (batch_size, output_dim). - targets: Array of target outputs of shape (batch_size, output_dim). - - Returns: - Scalar cost function value. - """ - probs = np.exp(outputs) - probs /= probs.sum(-1)[:, None] - return -np.mean(np.sum(targets * np.log(probs), axis=1)) - - def grad(self, outputs, targets): - """Calculates gradient of cost function with respect to outputs. - - Args: - outputs: Array of model outputs of shape (batch_size, output_dim). - targets: Array of target outputs of shape (batch_size, output_dim). - - Returns: - Gradient of cost function with respect to outputs. - """ - probs = np.exp(outputs) - probs /= probs.sum(-1)[:, None] - return probs - targets - - def __repr__(self): - return 'CrossEntropySoftmaxCost' diff --git a/mlp/initialisers.py b/mlp/initialisers.py deleted file mode 100644 index 243adc2..0000000 --- a/mlp/initialisers.py +++ /dev/null @@ -1,65 +0,0 @@ -# -*- coding: utf-8 -*- -"""Parameter initialisers. - -This module defines classes to initialise the parameters in a layer. -""" - -import numpy as np -from mlp import DEFAULT_SEED - - -class ConstantInit(object): - """Constant parameter initialiser.""" - - def __init__(self, value): - """Construct a constant parameter initialiser. - - Args: - value: Value to initialise parameter to. 
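A standalone check of the softmax cross-entropy gradient used in mlp/costs.py above: for the summed per-example cross-entropy, the gradient with respect to the pre-softmax outputs is exactly `probs - targets`. The sketch below assumes only numpy; the toy batch size, class count and seed are invented for illustration.

    import numpy as np

    rng = np.random.RandomState(42)
    outputs = rng.normal(size=(4, 3))                 # toy pre-softmax outputs
    targets = np.eye(3)[rng.randint(0, 3, size=4)]    # toy one-hot targets

    def summed_cross_entropy(outs):
        probs = np.exp(outs)
        probs /= probs.sum(-1)[:, None]
        return -np.sum(targets * np.log(probs))

    probs = np.exp(outputs)
    probs /= probs.sum(-1)[:, None]
    analytic_grad = probs - targets       # as returned by CrossEntropySoftmaxCost.grad

    # Central-difference estimate of the same gradient, element by element.
    eps = 1e-6
    numeric_grad = np.zeros_like(outputs)
    for idx in np.ndindex(*outputs.shape):
        delta = np.zeros_like(outputs)
        delta[idx] = eps
        numeric_grad[idx] = (summed_cross_entropy(outputs + delta) -
                             summed_cross_entropy(outputs - delta)) / (2 * eps)

    print(np.allclose(analytic_grad, numeric_grad, atol=1e-5))  # expect True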
- """ - self.value = value - - def __call__(self, shape): - return np.ones(shape=shape) * self.value - - -class UniformInit(object): - """Random uniform parameter initialiser.""" - - def __init__(self, low, high, rng=None): - """Construct a random uniform parameter initialiser. - - Args: - low: Lower bound of interval to sample from. - high: Upper bound of interval to sample from. - rng (RandomState): Seeded random number generator. - """ - self.low = low - self.high = high - if rng is None: - rng = np.random.RandomState(DEFAULT_SEED) - self.rng = rng - - def __call__(self, shape): - return self.rng.uniform(low=self.low, high=self.high, size=shape) - - -class NormalInit(object): - """Random normal parameter initialiser.""" - - def __init__(self, mean, std, rng=None): - """Construct a random uniform parameter initialiser. - - Args: - mean: Mean of distribution to sample from. - std: Standard deviation of distribution to sample from. - rng (RandomState): Seeded random number generator. - """ - self.mean = mean - self.std = std - if rng is None: - rng = np.random.RandomState(DEFAULT_SEED) - self.rng = rng - - def __call__(self, shape): - return self.rng.normal(loc=self.mean, scale=self.std, size=shape) diff --git a/mlp/layers.py b/mlp/layers.py deleted file mode 100644 index 760a01c..0000000 --- a/mlp/layers.py +++ /dev/null @@ -1,325 +0,0 @@ -# -*- coding: utf-8 -*- -"""Layer definitions. - -This module defines classes which encapsulate a single layer. - -These layers map input activations to output activation with the `fprop` -method and map gradients with repsect to outputs to gradients with respect to -their inputs with the `bprop` method. - -Some layers will have learnable parameters and so will additionally define -methods for getting and setting parameter and calculating gradients with -respect to the layer parameters. -""" - -import numpy as np -import mlp.initialisers as init - - -class Layer(object): - """Abstract class defining the interface for a layer.""" - - def fprop(self, inputs): - """Forward propagates activations through the layer transformation. - - Args: - inputs: Array of layer inputs of shape (batch_size, input_dim). - - Returns: - outputs: Array of layer outputs of shape (batch_size, output_dim). - """ - raise NotImplementedError() - - def bprop(self, inputs, outputs, grads_wrt_outputs): - """Back propagates gradients through a layer. - - Given gradients with respect to the outputs of the layer calculates the - gradients with respect to the layer inputs. - - Args: - inputs: Array of layer inputs of shape (batch_size, input_dim). - outputs: Array of layer outputs calculated in forward pass of - shape (batch_size, output_dim). - grads_wrt_outputs: Array of gradients with respect to the layer - outputs of shape (batch_size, output_dim). - - Returns: - Array of gradients with respect to the layer inputs of shape - (batch_size, input_dim). - """ - raise NotImplementedError() - - -class LayerWithParameters(Layer): - """Abstract class defining the interface for a layer with parameters.""" - - def grads_wrt_params(self, inputs, grads_wrt_outputs): - """Calculates gradients with respect to layer parameters. - - Args: - inputs: Array of inputs to layer of shape (batch_size, input_dim). - grads_wrt_to_outputs: Array of gradients with respect to the layer - outputs of shape (batch_size, output_dim). - - Returns: - List of arrays of gradients with respect to the layer parameters - with parameter gradients appearing in same order in tuple as - returned from `get_params` method. 
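For reference, the three initialisers above reduce to one-line numpy calls. A minimal sketch follows; the seed, shape and ranges are arbitrary stand-ins, and `DEFAULT_SEED` here is a hypothetical placeholder for the value imported from the `mlp` package.

    import numpy as np

    DEFAULT_SEED = 123456  # hypothetical stand-in for mlp.DEFAULT_SEED
    rng = np.random.RandomState(DEFAULT_SEED)
    shape = (3, 5)  # e.g. an (output_dim, input_dim) weight matrix

    constant_params = np.ones(shape) * 0.            # ConstantInit(0.)(shape)
    uniform_params = rng.uniform(-0.1, 0.1, shape)   # UniformInit(-0.1, 0.1, rng)(shape)
    normal_params = rng.normal(0., 0.01, shape)      # NormalInit(0., 0.01, rng)(shape)
    print(constant_params.shape, uniform_params.shape, normal_params.shape)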
- """ - raise NotImplementedError() - - def params_cost(self): - """Returns the parameter dependent cost term for this layer. - - If no parameter-dependent cost terms are set this returns zero. - """ - raise NotImplementedError() - - @property - def params(self): - """Returns a list of parameters of layer. - - Returns: - List of current parameter values. This list should be in the - corresponding order to the `values` argument to `set_params`. - """ - raise NotImplementedError() - - @params.setter - def params(self, values): - """Sets layer parameters from a list of values. - - Args: - values: List of values to set parameters to. This list should be - in the corresponding order to what is returned by `get_params`. - """ - raise NotImplementedError() - - -class AffineLayer(LayerWithParameters): - """Layer implementing an affine tranformation of its inputs. - - This layer is parameterised by a weight matrix and bias vector. - """ - - def __init__(self, input_dim, output_dim, - weights_initialiser=init.UniformInit(-0.1, 0.1), - biases_initialiser=init.ConstantInit(0.), - weights_cost=None, biases_cost=None): - """Initialises a parameterised affine layer. - - Args: - input_dim (int): Dimension of inputs to the layer. - output_dim (int): Dimension of the layer outputs. - weights_initialiser: Initialiser for the weight parameters. - biases_initialiser: Initialiser for the bias parameters. - weights_cost: Weights-dependent cost term. - biases_cost: Biases-dependent cost term. - """ - self.input_dim = input_dim - self.output_dim = output_dim - self.weights = weights_initialiser((self.output_dim, self.input_dim)) - self.biases = biases_initialiser(self.output_dim) - self.weights_cost = weights_cost - self.biases_cost = biases_cost - - def fprop(self, inputs): - """Forward propagates activations through the layer transformation. - - For inputs `x`, outputs `y`, weights `W` and biases `b` the layer - corresponds to `y = W.dot(x) + b`. - - Args: - inputs: Array of layer inputs of shape (batch_size, input_dim). - - Returns: - outputs: Array of layer outputs of shape (batch_size, output_dim). - """ - return self.weights.dot(inputs.T).T + self.biases - - def bprop(self, inputs, outputs, grads_wrt_outputs): - """Back propagates gradients through a layer. - - Given gradients with respect to the outputs of the layer calculates the - gradients with respect to the layer inputs. - - Args: - inputs: Array of layer inputs of shape (batch_size, input_dim). - outputs: Array of layer outputs calculated in forward pass of - shape (batch_size, output_dim). - grads_wrt_outputs: Array of gradients with respect to the layer - outputs of shape (batch_size, output_dim). - - Returns: - Array of gradients with respect to the layer inputs of shape - (batch_size, input_dim). - """ - return grads_wrt_outputs.dot(self.weights) - - def grads_wrt_params(self, inputs, grads_wrt_outputs): - """Calculates gradients with respect to layer parameters. - - Args: - inputs: array of inputs to layer of shape (batch_size, input_dim) - grads_wrt_to_outputs: array of gradients with respect to the layer - outputs of shape (batch_size, output_dim) - - Returns: - list of arrays of gradients with respect to the layer parameters - `[grads_wrt_weights, grads_wrt_biases]`. 
- """ - - grads_wrt_weights = np.dot(grads_wrt_outputs.T, inputs) - grads_wrt_biases = np.sum(grads_wrt_outputs, axis=0) - - if self.weights_cost is not None: - grads_wrt_weights += self.weights_cost.grad(self.weights) - - if self.biases_cost is not None: - grads_wrt_biases += self.biases_cost.grads(self.biases) - - return [grads_wrt_weights, grads_wrt_biases] - - def params_cost(self): - """Returns the parameter dependent cost term for this layer. - - If no parameter-dependent cost terms are set this returns zero. - """ - params_cost = 0 - if self.weights_cost is not None: - params_cost += self.weights_cost(self.weights) - if self.biases_cost is not None: - params_cost += self.biases_cost(self.biases) - return params_cost - - @property - def params(self): - """A list of layer parameter values: `[weights, biases]`.""" - return [self.weights, self.biases] - - @params.setter - def params(self, values): - self.weights = values[0] - self.biases = values[1] - - def __repr__(self): - return 'AffineLayer(input_dim={0}, output_dim={1})'.format( - self.input_dim, self.output_dim) - - -class SigmoidLayer(Layer): - """Layer implementing an element-wise logistic sigmoid transformation.""" - - def fprop(self, inputs): - """Forward propagates activations through the layer transformation. - - For inputs `x` and outputs `y` this corresponds to - `y = 1 / (1 + exp(-x))`. - - Args: - inputs: Array of layer inputs of shape (batch_size, input_dim). - - Returns: - outputs: Array of layer outputs of shape (batch_size, output_dim). - """ - return 1. / (1. + np.exp(-inputs)) - - def bprop(self, inputs, outputs, grads_wrt_outputs): - """Back propagates gradients through a layer. - - Given gradients with respect to the outputs of the layer calculates the - gradients with respect to the layer inputs. - - Args: - inputs: Array of layer inputs of shape (batch_size, input_dim). - outputs: Array of layer outputs calculated in forward pass of - shape (batch_size, output_dim). - grads_wrt_outputs: Array of gradients with respect to the layer - outputs of shape (batch_size, output_dim). - - Returns: - Array of gradients with respect to the layer inputs of shape - (batch_size, input_dim). - """ - return grads_wrt_outputs * outputs * (1. - outputs) - - def __repr__(self): - return 'SigmoidLayer' - - -class ReluLayer(Layer): - """Layer implementing an element-wise rectified linear transformation.""" - - def fprop(self, inputs): - """Forward propagates activations through the layer transformation. - - For inputs `x` and outputs `y` this corresponds to `y = max(0, x)`. - - Args: - inputs: Array of layer inputs of shape (batch_size, input_dim). - - Returns: - outputs: Array of layer outputs of shape (batch_size, output_dim). - """ - return np.maximum(inputs, 0.) - - def bprop(self, inputs, outputs, grads_wrt_outputs): - """Back propagates gradients through a layer. - - Given gradients with respect to the outputs of the layer calculates the - gradients with respect to the layer inputs. - - Args: - inputs: Array of layer inputs of shape (batch_size, input_dim). - outputs: Array of layer outputs calculated in forward pass of - shape (batch_size, output_dim). - grads_wrt_outputs: Array of gradients with respect to the layer - outputs of shape (batch_size, output_dim). - - Returns: - Array of gradients with respect to the layer inputs of shape - (batch_size, input_dim). 
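A compact recap of the shape bookkeeping in `AffineLayer` above, where `weights` is stored as `(output_dim, input_dim)`. This is a standalone sketch using plain numpy; the batch size and dimensions are invented.

    import numpy as np

    rng = np.random.RandomState(0)
    batch_size, input_dim, output_dim = 4, 5, 3

    inputs = rng.normal(size=(batch_size, input_dim))
    weights = rng.normal(size=(output_dim, input_dim))
    biases = np.zeros(output_dim)

    # fprop: y = W.dot(x) + b applied row-wise, as in AffineLayer.fprop.
    outputs = weights.dot(inputs.T).T + biases               # (batch_size, output_dim)

    # bprop and grads_wrt_params, given gradients w.r.t. the layer outputs.
    grads_wrt_outputs = rng.normal(size=(batch_size, output_dim))
    grads_wrt_inputs = grads_wrt_outputs.dot(weights)        # (batch_size, input_dim)
    grads_wrt_weights = np.dot(grads_wrt_outputs.T, inputs)  # (output_dim, input_dim)
    grads_wrt_biases = np.sum(grads_wrt_outputs, axis=0)     # (output_dim,)

    print(outputs.shape, grads_wrt_inputs.shape,
          grads_wrt_weights.shape, grads_wrt_biases.shape)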
- """ - return (outputs > 0) * grads_wrt_outputs - - def __repr__(self): - return 'ReluLayer' - - -class TanhLayer(Layer): - """Layer implementing an element-wise hyperbolic tangent transformation.""" - - def fprop(self, inputs): - """Forward propagates activations through the layer transformation. - - For inputs `x` and outputs `y` this corresponds to `y = tanh(x)`. - - Args: - inputs: Array of layer inputs of shape (batch_size, input_dim). - - Returns: - outputs: Array of layer outputs of shape (batch_size, output_dim). - """ - return np.tanh(inputs) - - def bprop(self, inputs, outputs, grads_wrt_outputs): - """Back propagates gradients through a layer. - - Given gradients with respect to the outputs of the layer calculates the - gradients with respect to the layer inputs. - - Args: - inputs: Array of layer inputs of shape (batch_size, input_dim). - outputs: Array of layer outputs calculated in forward pass of - shape (batch_size, output_dim). - grads_wrt_outputs: Array of gradients with respect to the layer - outputs of shape (batch_size, output_dim). - - Returns: - Array of gradients with respect to the layer inputs of shape - (batch_size, input_dim). - """ - return (1. - outputs**2) * grads_wrt_outputs - - def __repr__(self): - return 'TanhLayer' diff --git a/mlp/learning_rules.py b/mlp/learning_rules.py deleted file mode 100644 index 4156c23..0000000 --- a/mlp/learning_rules.py +++ /dev/null @@ -1,161 +0,0 @@ -# -*- coding: utf-8 -*- -"""Learning rules. - -This module contains classes implementing gradient based learning rules. -""" - -import numpy as np - - -class GradientDescentLearningRule(object): - """Simple (stochastic) gradient descent learning rule. - - For a scalar loss function `L(p[0], p_[1] ... )` of some set of potentially - multidimensional parameters this attempts to find a local minimum of the - loss function by applying updates to each parameter of the form - - p[i] := p[i] - learning_rate * dL/dp[i] - - With `learning_rate` a positive scaling parameter. - - The loss function used in successive applications of these updates may be a - stochastic estimator of the true loss function (e.g. when the loss with - respect to only a subset of data-points is calculated) in which case this - will correspond to a stochastic gradient descent learning rule. - """ - - def __init__(self, learning_rate=1e-3): - """Creates a new learning rule object. - - Args: - learning_rate: A postive scalar to scale gradient updates to the - parameters by. This needs to be carefully set - if too large - the learning dynamic will be unstable and may diverge, while - if set too small learning will proceed very slowly. - - """ - assert learning_rate > 0., 'learning_rate should be positive.' - self.learning_rate = learning_rate - - def initialise(self, params): - """Initialises the state of the learning rule for a set or parameters. - - This must be called before `update_params` is first called. - - Args: - params: A list of the parameters to be optimised. Note these will - be updated *in-place* to avoid reallocating arrays on each - update. - """ - self.params = params - - def reset(self): - """Resets any additional state variables to their intial values. - - For this learning rule there are no additional state variables so we - do nothing here. - """ - pass - - def update_params(self, grads_wrt_params): - """Applies a single gradient descent update to all parameters. - - All parameter updates are performed using in-place operations and so - nothing is returned. 
- - Args: - grads_wrt_params: A list of gradients of the scalar loss function - with respect to each of the parameters passed to `initialise` - previously, with this list expected to be in the same order. - """ - for param, grad in zip(self.params, grads_wrt_params): - param -= self.learning_rate * grad - - -class MomentumLearningRule(GradientDescentLearningRule): - """Gradient descent with momentum learning rule. - - This extends the basic gradient learning rule by introducing extra - momentum state variables for each parameter. These can help the learning - dynamic help overcome shallow local minima and speed convergence when - making multiple successive steps in a similar direction in parameter space. - - For parameter p[i] and corresponding momentum m[i] the updates for a - scalar loss function `L` are of the form - - m[i] := mom_coeff * m[i] - learning_rate * dL/dp[i] - p[i] := p[i] + m[i] - - with `learning_rate` a positive scaling parameter for the gradient updates - and `mom_coeff` a value in [0, 1] that determines how much 'friction' there - is the system and so how quickly previous momentum contributions decay. - """ - - def __init__(self, learning_rate=1e-3, mom_coeff=0.9): - """Creates a new learning rule object. - - Args: - learning_rate: A postive scalar to scale gradient updates to the - parameters by. This needs to be carefully set - if too large - the learning dynamic will be unstable and may diverge, while - if set too small learning will proceed very slowly. - mom_coeff: A scalar in the range [0, 1] inclusive. This determines - the contribution of the previous momentum value to the value - after each update. If equal to 0 the momentum is set to exactly - the negative scaled gradient each update and so this rule - collapses to standard gradient descent. If equal to 1 the - momentum will just be decremented by the scaled gradient at - each update. This is equivalent to simulating the dynamic in - a frictionless system. Due to energy conservation the loss - of 'potential energy' as the dynamics moves down the loss - function surface will lead to an increasingly large 'kinetic - energy' and so speed, meaning the updates will become - increasingly large, potentially unstably so. Typically a value - less than but close to 1 will avoid these issues and cause the - dynamic to converge to a local minima where the gradients are - by definition zero. - """ - super(MomentumLearningRule, self).__init__(learning_rate) - assert mom_coeff >= 0. and mom_coeff <= 1., ( - 'mom_coeff should be in the range [0, 1].' - ) - self.mom_coeff = mom_coeff - - def initialise(self, params): - """Initialises the state of the learning rule for a set or parameters. - - This must be called before `update_params` is first called. - - Args: - params: A list of the parameters to be optimised. Note these will - be updated *in-place* to avoid reallocating arrays on each - update. - """ - super(MomentumLearningRule, self).initialise(params) - self.moms = [] - for param in self.params: - self.moms.append(np.zeros_like(param)) - - def reset(self): - """Resets any additional state variables to their intial values. - - For this learning rule this corresponds to zeroing all the momenta. - """ - for mom in zip(self.moms): - mom *= 0. - - def update_params(self, grads_wrt_params): - """Applies a single update to all parameters. - - All parameter updates are performed using in-place operations and so - nothing is returned. 
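The gradient descent and momentum updates documented above are easy to sanity check on a toy problem. Below is a standalone sketch of the in-place update pattern; the learning rate, momentum coefficient and quadratic loss are arbitrary choices for illustration.

    import numpy as np

    learning_rate, mom_coeff = 0.1, 0.9
    param = np.array([5.0, -3.0])    # toy parameters
    mom = np.zeros_like(param)       # momentum state, one per parameter array

    for step in range(200):
        grad = 2.0 * param           # gradient of the toy loss L(p) = sum(p ** 2)
        # m := mom_coeff * m - learning_rate * dL/dp ;  p := p + m
        mom *= mom_coeff
        mom -= learning_rate * grad
        param += mom

    print(param)  # converges towards the minimum at [0., 0.]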
- - Args: - grads_wrt_params: A list of gradients of the scalar loss function - with respect to each of the parameters passed to `initialise` - previously, with this list expected to be in the same order. - """ - for param, mom, grad in zip(self.params, self.moms, grads_wrt_params): - mom *= self.mom_coeff - mom -= self.learning_rate * grad - param += mom diff --git a/mlp/models.py b/mlp/models.py deleted file mode 100644 index f4b1f55..0000000 --- a/mlp/models.py +++ /dev/null @@ -1,145 +0,0 @@ -# -*- coding: utf-8 -*- -"""Model definitions. - -This module implements objects encapsulating learnable models of input-output -relationships. The model objects implement methods for forward propagating -the inputs through the transformation(s) defined by the model to produce -outputs (and intermediate states) and for calculating gradients of scalar -functions of the outputs with respect to the model parameters. -""" - -from mlp.layers import LayerWithParameters - - -class SingleLayerModel(object): - """A model consisting of a single transformation layer.""" - - def __init__(self, layer): - """Create a new single layer model instance. - - Args: - layer: The layer object defining the model architecture. - """ - self.layer = layer - - @property - def params(self): - """A list of all of the parameters of the model.""" - return self.layer.params - - def fprop(self, inputs): - """Calculate the model outputs corresponding to a batch of inputs. - - Args: - inputs: Batch of inputs to the model. - - Returns: - List which is a concatenation of the model inputs and model - outputs, this being done for consistency of the interface with - multi-layer models for which `fprop` returns a list of - activations through all immediate layers of the model and including - the inputs and outputs. - """ - activations = [inputs, self.layer.fprop(inputs)] - return activations - - def grads_wrt_params(self, activations, grads_wrt_outputs): - """Calculates gradients with respect to the model parameters. - - Args: - activations: List of all activations from forward pass through - model using `fprop`. - grads_wrt_outputs: Gradient with respect to the model outputs of - the scalar function parameter gradients are being calculated - for. - - Returns: - List of gradients of the scalar function with respect to all model - parameters. - """ - return self.layer.grads_wrt_params(activations[0], grads_wrt_outputs) - - def params_cost(self): - """Calculates the parameter dependent cost term of the model.""" - return self.layer.params_cost() - - def __repr__(self): - return 'SingleLayerModel(' + str(layer) + ')' - - -class MultipleLayerModel(object): - """A model consisting of multiple layers applied sequentially.""" - - def __init__(self, layers): - """Create a new multiple layer model instance. - - Args: - layers: List of the the layer objecst defining the model in the - order they should be applied from inputs to outputs. - """ - self.layers = layers - - @property - def params(self): - """A list of all of the parameters of the model.""" - params = [] - for layer in self.layers: - if isinstance(layer, LayerWithParameters): - params += layer.params - return params - - def fprop(self, inputs): - """Forward propagates a batch of inputs through the model. - - Args: - inputs: Batch of inputs to the model. - - Returns: - List of the activations at the output of all layers of the model - plus the inputs (to the first layer) as the first element. The - last element of the list corresponds to the model outputs. 
- """ - activations = [inputs] - for i, layer in enumerate(self.layers): - activations.append(self.layers[i].fprop(activations[i])) - return activations - - def grads_wrt_params(self, activations, grads_wrt_outputs): - """Calculates gradients with respect to the model parameters. - - Args: - activations: List of all activations from forward pass through - model using `fprop`. - grads_wrt_outputs: Gradient with respect to the model outputs of - the scalar function parameter gradients are being calculated - for. - - Returns: - List of gradients of the scalar function with respect to all model - parameters. - """ - grads_wrt_params = [] - for i, layer in enumerate(self.layers[::-1]): - inputs = activations[-i - 2] - outputs = activations[-i - 1] - grads_wrt_inputs = layer.bprop(inputs, outputs, grads_wrt_outputs) - if isinstance(layer, LayerWithParameters): - grads_wrt_params += layer.grads_wrt_params( - inputs, grads_wrt_outputs)[::-1] - grads_wrt_outputs = grads_wrt_inputs - return grads_wrt_params[::-1] - - def params_cost(self): - """Calculates the parameter dependent cost term of the model.""" - params_cost = 0. - for layer in self.layers: - if isinstance(layer, LayerWithParameters): - params_cost += layer.params_cost() - return params_cost - - def __repr__(self): - return ( - 'MultiLayerModel(\n ' + - '\n '.join([str(layer) for layer in self.layers]) + - '\n)' - ) diff --git a/mlp/optimisers.py b/mlp/optimisers.py deleted file mode 100644 index 4ce9e4d..0000000 --- a/mlp/optimisers.py +++ /dev/null @@ -1,134 +0,0 @@ -# -*- coding: utf-8 -*- -"""Model optimisers. - -This module contains objects implementing (batched) stochastic gradient descent -based optimisation of models. -""" - -import time -import logging -from collections import OrderedDict -import numpy as np - - -logger = logging.getLogger(__name__) - - -class Optimiser(object): - """Basic model optimiser.""" - - def __init__(self, model, cost, learning_rule, train_dataset, - valid_dataset=None, data_monitors=None): - """Create a new optimiser instance. - - Args: - model: The model to optimise. - cost: The scalar cost function to minimise. - learning_rule: Gradient based learning rule to use to minimise - cost. - train_dataset: Data provider for training set data batches. - valid_dataset: Data provider for validation set data batches. - data_monitors: Dictionary of functions evaluated on targets and - model outputs (averaged across both full training and - validation data sets) to monitor during training in addition - to the cost. Keys should correspond to a string label for - the statistic being evaluated. - """ - self.model = model - self.cost = cost - self.learning_rule = learning_rule - self.learning_rule.initialise(self.model.params) - self.train_dataset = train_dataset - self.valid_dataset = valid_dataset - self.data_monitors = OrderedDict([('cost', cost)]) - if data_monitors is not None: - self.data_monitors.update(data_monitors) - - def do_training_epoch(self): - """Do a single training epoch. - - This iterates through all batches in training dataset, for each - calculating the gradient of the estimated loss given the batch with - respect to all the model parameters and then updates the model - parameters according to the learning rule. 
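The pattern implemented by `MultipleLayerModel` above (collect every intermediate activation on the forward pass, then walk the layers in reverse, turning gradients with respect to outputs into gradients with respect to inputs) can be illustrated with plain numpy functions. The two-layer architecture, dimensions and seed below are invented.

    import numpy as np

    rng = np.random.RandomState(0)
    W1, b1 = rng.normal(size=(6, 4)), np.zeros(6)   # affine layer 1: 4 -> 6
    W2, b2 = rng.normal(size=(2, 6)), np.zeros(2)   # affine layer 2: 6 -> 2

    def affine_fprop(x, W, b):
        return W.dot(x.T).T + b

    def sigmoid_fprop(x):
        return 1. / (1. + np.exp(-x))

    inputs = rng.normal(size=(3, 4))

    # Forward pass: inputs first, then the output of each layer in turn.
    activations = [inputs]
    activations.append(affine_fprop(activations[-1], W1, b1))
    activations.append(sigmoid_fprop(activations[-1]))
    activations.append(affine_fprop(activations[-1], W2, b2))

    # Backward pass: start from gradients w.r.t. the final outputs, walk back.
    grads = np.ones_like(activations[-1])
    grads = grads.dot(W2)                                   # bprop affine 2
    grads = grads * activations[2] * (1. - activations[2])  # bprop sigmoid
    grads = grads.dot(W1)                                   # bprop affine 1
    print(grads.shape)  # (3, 4): gradients w.r.t. the original model inputs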
- """ - for inputs_batch, targets_batch in self.train_dataset: - activations = self.model.fprop(inputs_batch) - grads_wrt_outputs = self.cost.grad(activations[-1], targets_batch) - grads_wrt_params = self.model.grads_wrt_params( - activations, grads_wrt_outputs) - self.learning_rule.update_params(grads_wrt_params) - - def eval_monitors(self, dataset, label): - """Evaluates the monitors for the given dataset. - - Args: - dataset: Dataset to perform evaluation with. - label: Tag to add to end of monitor keys to identify dataset. - - Returns: - OrderedDict of monitor values evaluated on dataset. - """ - data_mon_vals = OrderedDict([(key + label, 0.) for key - in self.data_monitors.keys()]) - for inputs_batch, targets_batch in dataset: - activations = self.model.fprop(inputs_batch) - for key, data_monitor in self.data_monitors.items(): - data_mon_vals[key + label] += data_monitor( - activations[-1], targets_batch) - for key, data_monitor in self.data_monitors.items(): - data_mon_vals[key + label] /= dataset.num_batches - return data_mon_vals - - def get_epoch_stats(self): - """Computes training statistics for an epoch. - - Returns: - An OrderedDict with keys corresponding to the statistic labels and - values corresponding to the value of the statistic. - """ - epoch_stats = OrderedDict() - epoch_stats.update(self.eval_monitors(self.train_dataset, '(train)')) - if self.valid_dataset is not None: - epoch_stats.update(self.eval_monitors( - self.valid_dataset, '(valid)')) - epoch_stats['cost(param)'] = self.model.params_cost() - return epoch_stats - - def log_stats(self, epoch, epoch_time, stats): - """Outputs stats for a training epoch to a logger. - - Args: - epoch (int): Epoch counter. - epoch_time: Time taken in seconds for the epoch to complete. - stats: Monitored stats for the epoch. - """ - logger.info('Epoch {0}: {1:.1f}s to complete\n {2}'.format( - epoch, epoch_time, - ', '.join(['{0}={1:.2e}'.format(k, v) for (k, v) in stats.items()]) - )) - - def train(self, num_epochs, stats_interval=5): - """Trains a model for a set number of epochs. - - Args: - num_epochs: Number of epochs (complete passes through trainin - dataset) to train for. - stats_interval: Training statistics will be recorded and logged - every `stats_interval` epochs. - - Returns: - Tuple with first value being an array of training run statistics - and the second being a dict mapping the labels for the statistics - recorded to their column index in the array. 
- """ - run_stats = [] - for epoch in range(1, num_epochs + 1): - start_time = time.clock() - self.do_training_epoch() - epoch_time = time.clock() - start_time - if epoch % stats_interval == 0: - stats = self.get_epoch_stats() - self.log_stats(epoch, epoch_time, stats) - run_stats.append(stats.values()) - return np.array(run_stats), {k: i for i, k in enumerate(stats.keys())} diff --git a/mlp/schedulers.py b/mlp/schedulers.py deleted file mode 100644 index 6ae9597..0000000 --- a/mlp/schedulers.py +++ /dev/null @@ -1,172 +0,0 @@ -# Machine Learning Practical (INFR11119), -# Pawel Swietojanski, University of Edinburgh - -import logging - - -class LearningRateScheduler(object): - """ - Define an interface for determining learning rates - """ - def __init__(self, max_epochs=100): - self.epoch = 0 - self.max_epochs = max_epochs - - def get_rate(self): - raise NotImplementedError() - - def get_next_rate(self, current_accuracy=None): - self.epoch += 1 - - -class LearningRateList(LearningRateScheduler): - def __init__(self, learning_rates_list, max_epochs): - - super(LearningRateList, self).__init__(max_epochs) - - assert isinstance(learning_rates_list, list), ( - "The learning_rates_list argument expected" - " to be of type list, got %s" % type(learning_rates_list) - ) - self.lr_list = learning_rates_list - - def get_rate(self): - if self.epoch < len(self.lr_list): - return self.lr_list[self.epoch] - return 0.0 - - def get_next_rate(self, current_accuracy=None): - super(LearningRateList, self).get_next_rate(current_accuracy=None) - return self.get_rate() - - -class LearningRateFixed(LearningRateList): - - def __init__(self, learning_rate, max_epochs): - assert learning_rate > 0, ( - "learning rate expected to be > 0, got %f" % learning_rate - ) - super(LearningRateFixed, self).__init__([learning_rate], max_epochs) - - def get_rate(self): - if self.epoch < self.max_epochs: - return self.lr_list[0] - return 0.0 - - def get_next_rate(self, current_accuracy=None): - super(LearningRateFixed, self).get_next_rate(current_accuracy=None) - return self.get_rate() - - -class LearningRateNewBob(LearningRateScheduler): - """ - newbob learning rate schedule. - - Fixed learning rate until validation set stops improving then exponential - decay. 
- """ - - def __init__(self, start_rate, scale_by=.5, max_epochs=99, - min_derror_ramp_start=.5, min_derror_stop=.5, init_error=100.0, - patience=0, zero_rate=None, ramping=False): - """ - :type start_rate: float - :param start_rate: - - :type scale_by: float - :param scale_by: - - :type max_epochs: int - :param max_epochs: - - :type min_error_start: float - :param min_error_start: - - :type min_error_stop: float - :param min_error_stop: - - :type init_error: float - :param init_error: - """ - self.start_rate = start_rate - self.init_error = init_error - self.init_patience = patience - - self.rate = start_rate - self.scale_by = scale_by - self.max_epochs = max_epochs - self.min_derror_ramp_start = min_derror_ramp_start - self.min_derror_stop = min_derror_stop - self.lowest_error = init_error - - self.epoch = 1 - self.ramping = ramping - self.patience = patience - self.zero_rate = zero_rate - - def reset(self): - self.rate = self.start_rate - self.lowest_error = self.init_error - self.epoch = 1 - self.ramping = False - self.patience = self.init_patience - - def get_rate(self): - if (self.epoch==1 and self.zero_rate!=None): - return self.zero_rate - return self.rate - - def get_next_rate(self, current_accuracy): - """ - :type current_accuracy: float - :param current_accuracy: current proportion correctly classified - - """ - - current_error = 1. - current_accuracy - diff_error = 0.0 - - if ( (self.max_epochs > 10000) or (self.epoch >= self.max_epochs) ): - #logging.debug('Setting rate to 0.0. max_epochs or epoch>=max_epochs') - self.rate = 0.0 - else: - diff_error = self.lowest_error - current_error - - if (current_error < self.lowest_error): - self.lowest_error = current_error - - if (self.ramping): - if (diff_error < self.min_derror_stop): - if (self.patience > 0): - #logging.debug('Patience decreased to %f' % self.patience) - self.patience -= 1 - self.rate *= self.scale_by - else: - #logging.debug('diff_error (%f) < min_derror_stop (%f)' % (diff_error, self.min_derror_stop)) - self.rate = 0.0 - else: - self.rate *= self.scale_by - else: - if (diff_error < self.min_derror_ramp_start): - #logging.debug('Start ramping.') - self.ramping = True - self.rate *= self.scale_by - - self.epoch += 1 - - return self.rate - - -class DropoutFixed(LearningRateList): - - def __init__(self, p_inp_keep, p_hid_keep): - assert 0 < p_inp_keep <= 1 and 0 < p_hid_keep <= 1, ( - "Dropout 'keep' probabilites are suppose to be in (0, 1] range" - ) - super(DropoutFixed, self).__init__([(p_inp_keep, p_hid_keep)], max_epochs=999) - - def get_rate(self): - return self.lr_list[0] - - def get_next_rate(self, current_accuracy=None): - return self.get_rate() diff --git a/mlp/utils.py b/mlp/utils.py deleted file mode 100644 index 34d62e5..0000000 --- a/mlp/utils.py +++ /dev/null @@ -1,361 +0,0 @@ -# Machine Learning Practical (INFR11119), -# Pawel Swietojanski, University of Edinburgh - -import numpy -from mlp.layers import Layer - - -def numerical_gradient(f, x, eps=1e-4, **kwargs): - """ - Implements the following numerical gradient rule - df(x)/dx = (f(x+eps)-f(x-eps))/(2eps) - """ - - xc = x.copy() - g = numpy.zeros_like(xc) - xf = xc.ravel() - gf = g.ravel() - - for i in xrange(xf.shape[0]): - xx = xf[i] - xf[i] = xx + eps - fp_eps, ___ = f(xc, **kwargs) - xf[i] = xx - eps - fm_eps, ___ = f(xc, **kwargs) - xf[i] = xx - gf[i] = (fp_eps - fm_eps)/(2*eps) - - return g - - -def verify_gradient(f, x, eps=1e-4, tol=1e-6, **kwargs): - """ - Compares the numerical and analytical gradients. 
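A standalone example of the central-difference rule quoted above, df(x)/dx = (f(x+eps) - f(x-eps)) / (2*eps), applied to a toy function with a known gradient; the function and test point are invented and nothing here depends on the helpers being removed.

    import numpy as np

    def f(x):
        return np.sum(x ** 2)       # known gradient: 2 * x

    x = np.array([1.0, -2.0, 0.5])
    eps = 1e-4

    numeric_grad = np.zeros_like(x)
    for i in range(x.shape[0]):
        x_plus, x_minus = x.copy(), x.copy()
        x_plus[i] += eps
        x_minus[i] -= eps
        numeric_grad[i] = (f(x_plus) - f(x_minus)) / (2 * eps)

    print(np.allclose(numeric_grad, 2 * x))  # expect True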
- """ - fval, fgrad = f(x=x, **kwargs) - ngrad = numerical_gradient(f=f, x=x, eps=eps, tol=tol, **kwargs) - - fgradnorm = numpy.sqrt(numpy.sum(fgrad**2)) - ngradnorm = numpy.sqrt(numpy.sum(ngrad**2)) - diffnorm = numpy.sqrt(numpy.sum((fgrad-ngrad)**2)) - - if fgradnorm > 0 or ngradnorm > 0: - norm = numpy.maximum(fgradnorm, ngradnorm) - if not (diffnorm < tol or diffnorm/norm < tol): - raise Exception("Numerical and analytical gradients " - "are different: %s != %s!" % (ngrad, fgrad)) - else: - if not (diffnorm < tol): - raise Exception("Numerical and analytical gradients " - "are different: %s != %s!" % (ngrad, fgrad)) - return True - - -def verify_layer_gradient(layer, x, eps=1e-4, tol=1e-6): - - assert isinstance(layer, Layer), ( - "Expected to get the instance of Layer class, got" - " %s " % type(layer) - ) - - def grad_layer_wrapper(x, **kwargs): - h = layer.fprop(x) - deltas, ograds = layer.bprop(h=h, igrads=numpy.ones_like(h)) - return numpy.sum(h), ograds - - return verify_gradient(f=grad_layer_wrapper, x=x, eps=eps, tol=tol, layer=layer) - - -def test_conv_linear_fprop(layer, kernel_order='ioxy', kernels_first=True, - dtype=numpy.float): - """ - Tests forward propagation method of a convolutional layer. - - Checks the outputs of `fprop` method for a fixed input against known - reference values for the outputs and raises an AssertionError if - the outputted values are not consistent with the reference values. If - tests are all passed returns True. - - Parameters - ---------- - layer : instance of Layer subclass - Convolutional (linear only) layer implementation. It must implement - the methods `get_params`, `set_params` and `fprop`. - kernel_order : string - Specifes dimension ordering assumed for convolutional kernels - passed to `layer`. Default is `ioxy` which corresponds to: - input channels, output channels, image x, image y - The other option is 'oixy' which corresponds to - output channels, input channels, image x, image y - Any other value will raise a ValueError exception. - kernels_first : boolean - Specifies order in which parameters are passed to and returned from - `get_params` and `set_params`. Default is True which corresponds - to signatures of `get_params` and `set_params` being: - kernels, biases = layer.get_params() - layer.set_params([kernels, biases]) - If False this corresponds to signatures of `get_params` and - `set_params` being: - biases, kernels = layer.get_params() - layer.set_params([biases, kernels]) - dtype : numpy data type - Data type to use in numpy arrays passed to layer methods. Default - is `numpy.float`. - - Raises - ------ - AssertionError - Raised if output of `layer.fprop` is inconsistent with reference - values either in shape or values. - ValueError - Raised if `kernel_order` is not a valid order string. 
- """ - inputs = numpy.arange(96).reshape((2, 3, 4, 4)).astype(dtype) - kernels = numpy.arange(-12, 12).reshape((3, 2, 2, 2)).astype(dtype) - if kernel_order == 'oixy': - kernels = kernels.swapaxes(0, 1) - elif kernel_order != 'ioxy': - raise ValueError('kernel_order must be one of "ioxy" and "oixy"') - biases = numpy.arange(2).astype(dtype) - true_output = numpy.array( - [[[[ 496., 466., 436.], - [ 376., 346., 316.], - [ 256., 226., 196.]], - [[ 1385., 1403., 1421.], - [ 1457., 1475., 1493.], - [ 1529., 1547., 1565.]]], - [[[ -944., -974., -1004.], - [-1064., -1094., -1124.], - [-1184., -1214., -1244.]], - [[ 2249., 2267., 2285.], - [ 2321., 2339., 2357.], - [ 2393., 2411., 2429.]]]], dtype=dtype) - try: - orig_params = layer.get_params() - if kernels_first: - layer.set_params([kernels, biases]) - else: - layer.set_params([biases, kernels]) - layer_output = layer.fprop(inputs) - assert layer_output.shape == true_output.shape, ( - 'Layer fprop gives incorrect shaped output. ' - 'Correct shape is {0} but returned shape is {1}.' - .format(true_output.shape, layer_output.shape) - ) - assert numpy.allclose(layer_output, true_output), ( - 'Layer fprop does not give correct output. ' - 'Correct output is {0}\n but returned output is {1}.' - .format(true_output, layer_output) - ) - finally: - layer.set_params(orig_params) - return True - - -def test_conv_linear_bprop(layer, kernel_order='ioxy', kernels_first=True, - dtype=numpy.float): - """ - Tests input gradients backpropagation method of a convolutional layer. - - Checks the outputs of `bprop` method for a fixed input against known - reference values for the outputs and raises an AssertionError if - the outputted values are not consistent with the reference values. If - tests are all passed returns True. - - Parameters - ---------- - layer : instance of Layer subclass - Convolutional (linear only) layer implementation. It must implement - the methods `get_params`, `set_params` and `bprop`. - kernel_order : string - Specifes dimension ordering assumed for convolutional kernels - passed to `layer`. Default is `ioxy` which corresponds to: - input channels, output channels, image x, image y - The other option is 'oixy' which corresponds to - output channels, input channels, image x, image y - Any other value will raise a ValueError exception. - kernels_first : boolean - Specifies order in which parameters are passed to and returned from - `get_params` and `set_params`. Default is True which corresponds - to signatures of `get_params` and `set_params` being: - kernels, biases = layer.get_params() - layer.set_params([kernels, biases]) - If False this corresponds to signatures of `get_params` and - `set_params` being: - biases, kernels = layer.get_params() - layer.set_params([biases, kernels]) - dtype : numpy data type - Data type to use in numpy arrays passed to layer methods. Default - is `numpy.float`. - - Raises - ------ - AssertionError - Raised if output of `layer.bprop` is inconsistent with reference - values either in shape or values. - ValueError - Raised if `kernel_order` is not a valid order string. 
- """ - inputs = numpy.arange(96).reshape((2, 3, 4, 4)).astype(dtype) - kernels = numpy.arange(-12, 12).reshape((3, 2, 2, 2)).astype(dtype) - if kernel_order == 'oixy': - kernels = kernels.swapaxes(0, 1) - elif kernel_order != 'ioxy': - raise ValueError('kernel_order must be one of "ioxy" and "oixy"') - biases = numpy.arange(2).astype(dtype) - igrads = numpy.arange(-20, 16).reshape((2, 2, 3, 3)).astype(dtype) - true_ograds = numpy.array( - [[[[ 328., 605., 567., 261.], - [ 534., 976., 908., 414.], - [ 426., 772., 704., 318.], - [ 170., 305., 275., 123.]], - [[ 80., 125., 119., 45.], - [ 86., 112., 108., 30.], - [ 74., 100., 96., 30.], - [ 18., 17., 19., 3.]], - [[-168., -355., -329., -171.], - [-362., -752., -692., -354.], - [-278., -572., -512., -258.], - [-134., -271., -237., -117.]]], - [[[ -32., -79., -117., -63.], - [-114., -248., -316., -162.], - [-222., -452., -520., -258.], - [-118., -235., -265., -129.]], - [[ 8., 17., 11., 9.], - [ 14., 40., 36., 30.], - [ 2., 28., 24., 30.], - [ 18., 53., 55., 39.]], - [[ 48., 113., 139., 81.], - [ 142., 328., 388., 222.], - [ 226., 508., 568., 318.], - [ 154., 341., 375., 207.]]]], dtype=dtype) - try: - orig_params = layer.get_params() - if kernels_first: - layer.set_params([kernels, biases]) - else: - layer.set_params([biases, kernels]) - layer_deltas, layer_ograds = layer.bprop(None, igrads) - assert layer_deltas.shape == igrads.shape, ( - 'Layer bprop give incorrectly shaped deltas output.' - 'Correct shape is {0} but returned shape is {1}.' - .format(igrads.shape, layer_deltas.shape) - ) - assert numpy.allclose(layer_deltas, igrads), ( - 'Layer bprop does not give correct deltas output. ' - 'Correct output is {0}\n but returned output is {1}.' - .format(igrads, layer_deltas) - ) - assert layer_ograds.shape == true_ograds.shape, ( - 'Layer bprop gives incorrect shaped ograds output. ' - 'Correct shape is {0} but returned shape is {1}.' - .format(true_ograds.shape, layer_ograds.shape) - ) - assert numpy.allclose(layer_ograds, true_ograds), ( - 'Layer bprop does not give correct ograds output. ' - 'Correct output is {0}\n but returned output is {1}.' - .format(true_ograds, layer_ograds) - ) - finally: - layer.set_params(orig_params) - return True - - -def test_conv_linear_pgrads(layer, kernel_order='ioxy', kernels_first=True, - dtype=numpy.float): - """ - Tests parameter gradients backpropagation method of a convolutional layer. - - Checks the outputs of `pgrads` method for a fixed input against known - reference values for the outputs and raises an AssertionError if - the outputted values are not consistent with the reference values. If - tests are all passed returns True. - - Parameters - ---------- - layer : instance of Layer subclass - Convolutional (linear only) layer implementation. It must implement - the methods `get_params`, `set_params` and `pgrads`. - kernel_order : string - Specifes dimension ordering assumed for convolutional kernels - passed to `layer`. Default is `ioxy` which corresponds to: - input channels, output channels, image x, image y - The other option is 'oixy' which corresponds to - output channels, input channels, image x, image y - Any other value will raise a ValueError exception. - kernels_first : boolean - Specifies order in which parameters are passed to and returned from - `get_params` and `set_params`. 
Default is True which corresponds - to signatures of `get_params` and `set_params` being: - kernels, biases = layer.get_params() - layer.set_params([kernels, biases]) - If False this corresponds to signatures of `get_params` and - `set_params` being: - biases, kernels = layer.get_params() - layer.set_params([biases, kernels]) - dtype : numpy data type - Data type to use in numpy arrays passed to layer methods. Default - is `numpy.float`. - - Raises - ------ - AssertionError - Raised if output of `layer.pgrads` is inconsistent with reference - values either in shape or values. - ValueError - Raised if `kernel_order` is not a valid order string. - """ - inputs = numpy.arange(96).reshape((2, 3, 4, 4)).astype(dtype) - kernels = numpy.arange(-12, 12).reshape((3, 2, 2, 2)).astype(dtype) - biases = numpy.arange(2).astype(dtype) - deltas = numpy.arange(-20, 16).reshape((2, 2, 3, 3)).astype(dtype) - true_kernel_grads = numpy.array( - [[[[ 390., 264.], - [ -114., -240.]], - [[ 5088., 5124.], - [ 5232., 5268.]]], - [[[-1626., -1752.], - [-2130., -2256.]], - [[ 5664., 5700.], - [ 5808., 5844.]]], - [[[-3642., -3768.], - [-4146., -4272.]], - [[ 6240., 6276.], - [ 6384., 6420.]]]], dtype=dtype) - if kernel_order == 'oixy': - kernels = kernels.swapaxes(0, 1) - true_kernel_grads = true_kernel_grads.swapaxes(0, 1) - elif kernel_order != 'ioxy': - raise ValueError('kernel_order must be one of "ioxy" and "oixy"') - true_bias_grads = numpy.array([-126., 36.], dtype=dtype) - try: - orig_params = layer.get_params() - if kernels_first: - layer.set_params([kernels, biases]) - else: - layer.set_params([biases, kernels]) - layer_kernel_grads, layer_bias_grads = layer.pgrads(inputs, deltas) - assert layer_kernel_grads.shape == true_kernel_grads.shape, ( - 'Layer pgrads gives incorrect shaped kernel gradients output. ' - 'Correct shape is {0} but returned shape is {1}.' - .format(true_kernel_grads.shape, layer_kernel_grads.shape) - ) - assert numpy.allclose(layer_kernel_grads, true_kernel_grads), ( - 'Layer pgrads does not give correct kernel gradients output. ' - 'Correct output is {0}\n but returned output is {1}.' - .format(true_kernel_grads, layer_kernel_grads) - ) - assert layer_bias_grads.shape == true_bias_grads.shape, ( - 'Layer pgrads gives incorrect shaped bias gradients output. ' - 'Correct shape is {0} but returned shape is {1}.' - .format(true_bias_grads.shape, layer_bias_grads.shape) - ) - assert numpy.allclose(layer_bias_grads, true_bias_grads), ( - 'Layer pgrads does not give correct bias gradients output. ' - 'Correct output is {0}\n but returned output is {1}.' 
- .format(true_bias_grads, layer_bias_grads) - ) - finally: - layer.set_params(orig_params) - return True - diff --git a/notebooks/02_Linear_models.ipynb b/notebooks/02_Linear_models.ipynb deleted file mode 100644 index 004f3cd..0000000 --- a/notebooks/02_Linear_models.ipynb +++ /dev/null @@ -1,650 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "-" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Single Layer Models\n", - "\n", - "***\n", - "### Note on storing matrices in computer memory\n", - "\n", - "Suppose you want to store the following matrix in memory: $\\left[ \\begin{array}{ccc}\n", - "1 & 2 & 3 \\\\\n", - "4 & 5 & 6 \\\\\n", - "7 & 8 & 9 \\end{array} \\right]$ \n", - "\n", - "If you allocate the memory at once for the whole matrix, then the above matrix would be organised as a vector in one of two possible forms:\n", - "\n", - "* Row-wise layout where the order would look like: $\\left [ \\begin{array}{ccccccccc}\n", - "1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 \\end{array} \\right ]$\n", - "* Column-wise layout where the order would look like: $\\left [ \\begin{array}{ccccccccc}\n", - "1 & 4 & 7 & 2 & 5 & 8 & 3 & 6 & 9 \\end{array} \\right ]$\n", - "\n", - "Although `numpy` can easily handle both formats (possibly with some computational overhead), in our code we will stick with the more modern (and default) `C`-like approach and use the row-wise format (in contrast to Fortran that used a column-wise approach). \n", - "\n", - "This means, that in this tutorial:\n", - "* vectors are kept row-wise $\\mathbf{x} = (x_1, x_1, \\ldots, x_D) $ (rather than $\\mathbf{x} = (x_1, x_1, \\ldots, x_D)^T$)\n", - "* similarly, in case of matrices we will stick to: $\\left[ \\begin{array}{cccc}\n", - "x_{11} & x_{12} & \\ldots & x_{1D} \\\\\n", - "x_{21} & x_{22} & \\ldots & x_{2D} \\\\\n", - "x_{31} & x_{32} & \\ldots & x_{3D} \\\\ \\end{array} \\right]$ and each row (i.e. $\\left[ \\begin{array}{cccc} x_{11} & x_{12} & \\ldots & x_{1D} \\end{array} \\right]$) represents a single data-point (like one MNIST image or one window of observations)\n", - "\n", - "In lecture slides you will find the equations following the conventional mathematical approach, using column vectors, but you can easily map between column-major and row-major organisations using a matrix transpose.\n", - "\n", - "***\n", - "\n", - "## Linear and Affine Transforms\n", - "\n", - "The basis of all linear models is the so called affine transform, which is a transform that implements a linear transformation and translation of the input features. The transforms we are going to use are parameterised by:\n", - "\n", - " * A weight matrix $\\mathbf{W} \\in \\mathbb{R}^{D\\times K}$: where element $w_{ik}$ is the weight from input $x_i$ to output $y_k$\n", - " * A bias vector $\\mathbf{b}\\in R^{K}$ : where element $b_{k}$ is the bias for output $k$\n", - "\n", - "Note, the bias is simply some additive term, and can be easily incorporated into an additional row in weight matrix and an additional input in the inputs which is set to $1.0$ (as in the below picture taken from the lecture slides). 
However, here (and in the code) we will keep them separate.\n", - "\n", - "![Making Predictions](res/singleLayerNetWts-1.png)\n", - "\n", - "For instance, for the above example of 5-dimensional input vector by $\\mathbf{x} = (x_1, x_2, x_3, x_4, x_5)$, weight matrix $\\mathbf{W}=\\left[ \\begin{array}{ccc}\n", - "w_{11} & w_{12} & w_{13} \\\\\n", - "w_{21} & w_{22} & w_{23} \\\\\n", - "w_{31} & w_{32} & w_{33} \\\\\n", - "w_{41} & w_{42} & w_{43} \\\\\n", - "w_{51} & w_{52} & w_{53} \\\\ \\end{array} \\right]$, bias vector $\\mathbf{b} = (b_1, b_2, b_3)$ and outputs $\\mathbf{y} = (y_1, y_2, y_3)$, one can write the transformation as follows:\n", - "\n", - "(for the $i$-th output)\n", - "\n", - "(1) $\n", - "\\begin{equation}\n", - " y_i = b_i + \\sum_j x_jw_{ji}\n", - "\\end{equation}\n", - "$\n", - "\n", - "or the equivalent vector form (where $\\mathbf w_i$ is the $i$-th column of $\\mathbf W$, but note, when we **slice** the $i$th column we will get a **vector** $\\mathbf w_i = (w_{1i}, w_{2i}, w_{3i}, w_{4i}, w_{5i})$, hence the transpose for $\\mathbf w_i$ in the below equation):\n", - "\n", - "(2) $\n", - "\\begin{equation}\n", - " y_i = b_i + \\mathbf x \\mathbf w_i^T\n", - "\\end{equation}\n", - "$\n", - "\n", - "The same operation can be also written in matrix form, to compute all the outputs $\\mathbf{y}$ at the same time:\n", - "\n", - "(3) $\n", - "\\begin{equation}\n", - " \\mathbf y=\\mathbf x\\mathbf W + \\mathbf b\n", - "\\end{equation}\n", - "$\n", - "\n", - "This is equivalent to slides 12/13 in lecture 1, except we are using row vectors.\n", - "\n", - "When $\\mathbf{x}$ is a mini-batch (contains $B$ data-points of dimension $D$ each), i.e. $\\left[ \\begin{array}{cccc}\n", - "x_{11} & x_{12} & \\ldots & x_{1D} \\\\\n", - "x_{21} & x_{22} & \\ldots & x_{2D} \\\\\n", - "\\cdots \\\\\n", - "x_{B1} & x_{B2} & \\ldots & x_{BD} \\\\ \\end{array} \\right]$ equation (3) effectively becomes to be\n", - "\n", - "(4) $\n", - "\\begin{equation}\n", - " \\mathbf Y=\\mathbf X\\mathbf W + \\mathbf b\n", - "\\end{equation}\n", - "$\n", - "\n", - "where $\\mathbf{W} \\in \\mathbb{R}^{D\\times K}$ and both $\\mathbf{X}\\in\\mathbb{R}^{B\\times D}$ and $\\mathbf{Y}\\in\\mathbb{R}^{B\\times K}$ are matrices, and $\\mathbf{b}\\in\\mathbb{R}^{1\\times K}$ needs to be broadcasted $B$ times (numpy will do this by default). However, we will not make an explicit distinction between a special case for $B=1$ and $B>1$ and simply use equation (3) instead, although $\\mathbf{x}$ and hence $\\mathbf{y}$ could be matrices. From an implementation point of view, it does not matter.\n", - "\n", - "The desired functionality for matrix multiplication in numpy is provided by numpy.dot function. If you haven't use it so far, get familiar with it as we will use it extensively." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### A general note on random number generators\n", - "\n", - "It is generally a good practice (for machine learning applications **not** for cryptography!) to seed a pseudo-random number generator once at the beginning of the experiment, and use it later through the code where necesarry. This makes it easier to reproduce results since random initialisations can be replicated. 
As such, within this course we are going use a single random generator object, similar to the below:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "import numpy\n", - "\n", - "#initialise the random generator to be used later\n", - "seed=[2015, 10, 1]\n", - "random_generator = numpy.random.RandomState(seed)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Exercise 1 \n", - "\n", - "Using `numpy.dot`, implement **forward** propagation through the linear transform defined by equations (3) and (4) for $B=1$ and $B>1$ i.e. use parameters $\\mathbf{W}$ and $\\mathbf{b}$ with data $\\mathbf{X}$ to determine $\\mathbf{Y}$. Use `MNISTDataProvider` (introduced last week) to generate $\\mathbf{X}$. We are going to write a function for each equation:\n", - "1. `y1_equation_1`: Return the value of the $1^{st}$ dimension of $\\mathbf{y}$ (the output of the first output node) given a single training data point $\\mathbf{x}$ using a sum\n", - "1. `y1_equation_2`: Repeat above using vector multiplication (use `numpy.dot()`)\n", - "1. `y_equation_3`: Return the value of $\\mathbf{y}$ (the whole output layer) given a single training data point $\\mathbf{x}$\n", - "1. `Y_equation_4`: Return the value of $\\mathbf{Y}$ given $\\mathbf{X}$\n", - "\n", - "We have initialised $\\mathbf{b}$ to zeros and randomly generated $\\mathbf{W}$ for you. The constants introduced above are:\n", - "* The number of data points $B = 3$\n", - "* The dimensionality of the input $D = 784$\n", - "* The dimensionality of the output $K = 10$" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "from mlp.dataset import MNISTDataProvider\n", - "\n", - "mnist_dp = MNISTDataProvider(dset='valid', batch_size=3, max_num_batches=1, randomize=False)\n", - "B = 3\n", - "D = 784\n", - "K = 10\n", - "irange = 0.1\n", - "W = random_generator.uniform(-irange, irange, (D, K)) \n", - "b = numpy.zeros((10,))\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "\n", - "mnist_dp.reset()\n", - "\n", - "#implement following functions, then run the cell\n", - "def y1_equation_1(x, W, b):\n", - " raise NotImplementedError()\n", - " \n", - "def y1_equation_2(x, W, b):\n", - " raise NotImplementedError()\n", - "\n", - "def y_equation_3(x, W, b):\n", - " #use numpy.dot\n", - " raise NotImplementedError()\n", - "\n", - "def Y_equation_4(x, W, b):\n", - " #use numpy.dot\n", - " raise NotImplementedError()\n", - "\n", - "for X, t in mnist_dp:\n", - " n = 0\n", - " y1e1 = y1_equation_1(x[n], W, b)\n", - " y1e2 = y1_equation_2(x[n], W, b)\n", - " ye3 = y_equation_3(x[n], W, b)\n", - " Ye4 = Y_equation_4(x, W, b)\n", - "\n", - "print 'y1e1', y1e1\n", - "print 'y1e1', y1e1\n", - "print 'ye3', ye3\n", - "print 'Ye4', ye4\n", - " " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "## Exercise 2\n", - "\n", - "Modify the examples from Exercise 1 to perform **backward** propagation, that is, given $\\mathbf{y}$ (obtained in the previous step) and weight matrix $\\mathbf{W}$, project $\\mathbf{y}$ onto the input space $\\mathbf{x}$ (ignore or set to zero the biases towards $\\mathbf{x}$ in backward pass, and note, we are **not** trying to reconstruct the original $\\mathbf{x}$). 
Mathematically, we are interested in the following transformation: $\\mathbf{z}=\\mathbf{y}\\mathbf{W}^T$" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "***\n", - "## Exercise 3 (optional)\n", - "\n", - "In case you do not fully understand how matrix-vector and/or matrix-matrix products work, consider implementing `my_dot_mat_mat` function (you have been given `my_dot_vec_mat` code to look at as an example) which takes as the input the following arguments:\n", - "\n", - "* D-dimensional input vector $\\mathbf{x} = (x_1, x_2, \\ldots, x_D) $.\n", - "* Weight matrix $\\mathbf{W}\\in\\mathbb{R}^{D\\times K}$:\n", - "\n", - "and returns:\n", - "\n", - "* K-dimensional output vector $\\mathbf{y} = (y_1, \\ldots, y_K) $\n", - "\n", - "Your job is to write a variant that works in a mini-batch mode where both $\\mathbf{x}\\in\\mathbb{R}^{B\\times D}$ and $\\mathbf{y}\\in\\mathbb{R}^{B\\times K}$ are matrices in which each rows contain one of $B$ data-points from mini-batch (rather than $\\mathbf{x}\\in\\mathbb{R}^{D}$ and $\\mathbf{y}\\in\\mathbb{R}^{K}$)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "def my_dot_vec_mat(x, W):\n", - " J = x.shape[0]\n", - " K = W.shape[1]\n", - " assert (J == W.shape[0]), (\n", - " \"Number of columns of x expected to \"\n", - " \" to be equal to the number of rows in \"\n", - " \"W, bot got shapes %s, %s\" % (x.shape, W.shape)\n", - " )\n", - " y = numpy.zeros((K,))\n", - " for k in xrange(0, K):\n", - " for j in xrange(0, J):\n", - " y[k] += x[j] * W[j,k]\n", - " \n", - " return y" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "irange = 0.1 #+-range from which we draw the random numbers\n", - "\n", - "x = random_generator.uniform(-irange, irange, (5,)) \n", - "W = random_generator.uniform(-irange, irange, (5,3)) \n", - "\n", - "y_my = my_dot_vec_mat(x, W)\n", - "y_np = numpy.dot(x, W)\n", - "\n", - "same = numpy.allclose(y_my, y_np)\n", - "\n", - "if same:\n", - " print 'Well done!'\n", - "else:\n", - " print 'Matrices are different:'\n", - " print 'y_my is: ', y_my\n", - " print 'y_np is: ', y_np" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "def my_dot_mat_mat(x, W):\n", - " I = x.shape[0]\n", - " J = x.shape[1]\n", - " K = W.shape[1]\n", - " assert (J == W.shape[0]), (\n", - " \"Number of columns in of x expected to \"\n", - " \" to be the same as rows in W, got\"\n", - " )\n", - " #allocate the output container\n", - " y = numpy.zeros((I, K))\n", - " \n", - " #implement here matrix-matrix inner product here\n", - " raise NotImplementedError('Write me!')\n", - " \n", - " return y" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Test whether you get comparable numbers to what numpy is producing:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "irange = 0.1 #+-range from which we draw the random numbers\n", - "\n", - "x = random_generator.uniform(-irange, irange, (2,5)) \n", - "W = random_generator.uniform(-irange, irange, (5,3)) \n", - "\n", - "y_my = my_dot_mat_mat(x, W)\n", - "y_np = numpy.dot(x, W)\n", - "\n", 
- "same = numpy.allclose(y_my, y_np)\n", - "\n", - "if same:\n", - " print 'Well done!'\n", - "else:\n", - " print 'Matrices are different:'\n", - " print 'y_my is: ', y_my\n", - " print 'y_np is: ', y_np" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we benchmark each approach (we do it in separate cells, as timeit currently can measure whole cell execuiton only)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "#generate bit bigger matrices, to better evaluate timings\n", - "x = random_generator.uniform(-irange, irange, (10, 1000))\n", - "W = random_generator.uniform(-irange, irange, (1000, 100))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "print 'my_dot timings:'\n", - "%timeit -n10 my_dot_mat_mat(x, W)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "print 'numpy.dot timings:'\n", - "%timeit -n10 numpy.dot(x, W)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Optional section ends here**\n", - "***" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Iterative learning of linear models\n", - "\n", - "We will learn the model with stochastic gradient descent on N data-points using mean square error (MSE) loss function, which is defined as follows:\n", - "\n", - "(5) $\n", - "E = \\frac{1}{2} \\sum_{n=1}^N ||\\mathbf{y}^n - \\mathbf{t}^n||^2 = \\sum_{n=1}^N E^n \\\\\n", - " E^n = \\frac{1}{2} ||\\mathbf{y}^n - \\mathbf{t}^n||^2\n", - "$\n", - "\n", - "(6) $ E^n = \\frac{1}{2} \\sum_{k=1}^K (y_k^n - t_k^n)^2 $\n", - " \n", - "Hence, the gradient w.r.t (with respect to) the $r$ output y of the model is defined as, so called delta function, $\\delta_r$: \n", - "\n", - "(8) $\\frac{\\partial{E^n}}{\\partial{y_{r}}} = (y^n_r - t^n_r) = \\delta^n_r \\quad ; \\quad\n", - " \\delta^n_r = y^n_r - t^n_r \\\\\n", - " \\frac{\\partial{E}}{\\partial{y_{r}}} = \\sum_{n=1}^N \\frac{\\partial{E^n}}{\\partial{y_{r}}} = \\sum_{n=1}^N \\delta^n_r\n", - "$\n", - "\n", - "Similarly, using the above $\\delta^n_r$ one can express the gradient of the weight $w_{sr}$ (from the s-th input to the r-th output) for linear model and MSE cost as follows:\n", - "\n", - "(9) $\n", - " \\frac{\\partial{E^n}}{\\partial{w_{sr}}} = (y^n_r - t^n_r)x_s^n = \\delta^n_r x_s^n \\quad\\\\\n", - " \\frac{\\partial{E}}{\\partial{w_{sr}}} = \\sum_{n=1}^N \\frac{\\partial{E^n}}{\\partial{w_{rs}}} = \\sum_{n=1}^N \\delta^n_r x_s^n\n", - "$\n", - "\n", - "and the gradient for bias parameter at the $r$-th output is:\n", - "\n", - "(10) $\n", - " \\frac{\\partial{E}}{\\partial{b_{r}}} = \\sum_{n=1}^N \\frac{\\partial{E^n}}{\\partial{b_{r}}} = \\sum_{n=1}^N \\delta^n_r\n", - "$" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "![Making Predictions](res/singleLayerNetPredict.png)\n", - " \n", - " * Input vector $\\mathbf{x} = (x_1, x_2, \\ldots, x_D) $\n", - " * Output scalar $y_1$\n", - " * Weight matrix $\\mathbf{W}$: $w_{ik}$ is the weight from input $x_i$ to output $y_k$. 
Note, here this is really a vector since a single scalar output, y_1.\n", - " * Scalar bias $b$ for the only output in our model \n", - " * Scalar target $t$ for the only output in out model\n", - " \n", - "First, ensure you can make use of the data provider (note, for training data has been normalised to zero mean and unit variance, hence different effective range than one can find in file):" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "from mlp.dataset import MetOfficeDataProvider\n", - "\n", - "modp = MetOfficeDataProvider(10, batch_size=10, max_num_batches=2, randomize=False)\n", - "\n", - "%precision 2\n", - "for x, t in modp:\n", - " print 'Observations: ', x\n", - " print 'To predict: ', t" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Exercise 4\n", - "\n", - "The below code implements a very simple variant of stochastic gradient descent for the rainfall prediction example. Your task is to implement 5 functions in the next cell and then run two next cells that 1) build sgd functions and 2) run the actual training." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "\n", - "#When implementing those, take into account the mini-batch case, for which one is\n", - "#expected to sum the errors for each example\n", - "\n", - "def fprop(x, W, b):\n", - " #code implementing eq. (3)\n", - " raise NotImplementedError('Write me!')\n", - "\n", - "def cost(y, t):\n", - " #Mean Square Error cost, equation (5)\n", - " raise NotImplementedError('Write me!')\n", - "\n", - "def cost_grad(y, t):\n", - " #Gradient of the cost w.r.t y equation (8)\n", - " raise NotImplementedError('Write me!')\n", - "\n", - "def cost_wrt_W(cost_grad, x):\n", - " #Gradient of the cost w.r.t W, equation (9)\n", - " raise NotImplementedError('Write me!')\n", - " \n", - "def cost_wrt_b(cost_grad):\n", - " #Gradient of the cost w.r.t to b, equation (10)\n", - " raise NotImplementedError('Write me!')\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "\n", - "def sgd_epoch(data_provider, W, b, learning_rate):\n", - " mse_stats = []\n", - " \n", - " #get the minibatch of data\n", - " for x, t in data_provider:\n", - " \n", - " #1. get the estimate of y\n", - " y = fprop(x, W, b)\n", - "\n", - " #2. compute the loss function\n", - " tmp = cost(y, t)\n", - " mse_stats.append(tmp)\n", - " \n", - " #3. compute the grad of the cost w.r.t the output layer activation y\n", - " #i.e. how the cost changes when output y changes\n", - " cost_grad_deltas = cost_grad(y, t)\n", - "\n", - " #4. compute the gradients w.r.t model's parameters\n", - " grad_W = cost_wrt_W(cost_grad_deltas, x)\n", - " grad_b = cost_wrt_b(cost_grad_deltas)\n", - "\n", - " #4. 
Update the model, we update with the mean gradient\n", - " # over the minibatch, rather than sum of particular gradients\n", - " # in a minibatch, to do so we scale the learning rate by batch_size\n", - " batch_size = x.shape[0]\n", - " effect_learn_rate = learning_rate / batch_size\n", - "\n", - " W = W - effect_learn_rate * grad_W\n", - " b = b - effect_learn_rate * grad_b\n", - " \n", - " return W, b, numpy.mean(mse_stats)\n", - "\n", - "def sgd(data_provider, W, b, learning_rate=0.1, max_epochs=10):\n", - " \n", - " for epoch in xrange(0, max_epochs):\n", - " #reset the data provider\n", - " data_provider.reset()\n", - " \n", - " #train for one epoch\n", - " W, b, mean_cost = \\\n", - " sgd_epoch(data_provider, W, b, learning_rate)\n", - " \n", - " print \"MSE training cost after %d-th epoch is %f\" % (epoch + 1, mean_cost)\n", - " \n", - " return W, b\n", - " \n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "\n", - "#some hyper-parameters\n", - "window_size = 12\n", - "irange = 0.1\n", - "learning_rate = 0.01\n", - "max_epochs=40\n", - "\n", - "# note, while developing you can set max_num_batches to some positive number to limit\n", - "# the number of training data-points (you will get feedback faster)\n", - "mdp = MetOfficeDataProvider(window_size, batch_size=10, max_num_batches=-100, randomize=False)\n", - "\n", - "#initialise the parameters\n", - "W = random_generator.uniform(-irange, irange, (window_size, 1))\n", - "b = random_generator.uniform(-irange, irange, (1, ))\n", - "\n", - "#train the model\n", - "sgd(mdp, W, b, learning_rate=learning_rate, max_epochs=max_epochs)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "## Exercise 5\n", - "\n", - "Modify the above prediction (regression) problem so the model makes a binary classification whether the the weather is going to be one of those \\{rainy, not-rainy} (look at slide 12 of the 2nd lecture)\n", - "\n", - "Tip: You need to introduce the following changes:\n", - "1. Modify `MetOfficeDataProvider` (for example, inherit from MetOfficeDataProvider to create a new class MetOfficeDataProviderBin) and modify `next()` function so it returns as `targets` either 0 (not-rainy - if the the amount of rain [before mean/variance normalisation] is equal to 0) or 1 (rainy -- otherwise).\n", - "2. Modify the functions from previous exercise so the fprop implements `sigmoid` on top of affine transform.\n", - "3. Modify cost function to binary cross-entropy\n", - "4. 
Make sure you compute the gradients correctly (as you have changed both the output and the cost)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 2", - "language": "python", - "name": "python2" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.10" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/notebooks/03_Multi_layer_models.ipynb b/notebooks/03_Multi_layer_models.ipynb deleted file mode 100644 index d5230ac..0000000 --- a/notebooks/03_Multi_layer_models.ipynb +++ /dev/null @@ -1,303 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Introduction\n", - "\n", - "This tutorial is an introduction to the first coursework about multi-layer networks (also known as Multi-Layer Perceptrons - MLPs - or Deep Neural Networks - DNNs). Here, we will show how to build a single layer linear model (similar to the one from the previous lab) for MNIST digit classification using the provided code-base. \n", - "\n", - "The principal purpose of this introduction is to get you familiar with how to connect the code blocks (and what operations each of them implements) in order to set up an experiment that includes 1) building the model structure 2) optimising the model's parameters (weights) and 3) evaluating the model on test data. \n", - "\n", - "## For those affected by notebook kernel issues\n", - "\n", - "In case you are still having issues with running notebook kernels, have a look at [this note](https://github.com/CSTR-Edinburgh/mlpractical/blob/master/kernel_issue_fix.md) on the GitHub.\n", - "\n", - "## Virtual environments\n", - "\n", - "Before you proceed onwards, remember to activate your virtual environment:\n", - " * If you were in last week's Tuesday or Wednesday group type `activate_mlp` or `source ~/mlpractical/venv/bin/activate`\n", - " * If you were in the Monday group:\n", - " + and if you have chosen the **comfy** way type: `workon mlpractical`\n", - " + and if you have chosen the **generic** way, `source` your virutal environment using `source` and specyfing the path to the activate script (you need to localise it yourself, there were not any general recommendations w.r.t dir structure and people have installed it in different places, usually somewhere in the home directories. If you cannot easily find it by yourself, use something like: `find . -iname activate` ):\n", - "\n", - "## Syncing the git repository\n", - "\n", - "Look here for more details. But in short, we recommend to create a separate branch for the coursework, as follows:\n", - "\n", - "1. Enter the mlpractical directory `cd ~/mlpractical/repo-mlp`\n", - "2. List the branches and check which is currently active by typing: `git checkout`\n", - "3. If you are not in `master` branch, switch to it by typing: \n", - "```\n", - "git checkout master\n", - " ```\n", - "4. Then update the repository (note, assuming master does not have any conflicts), if there are some, have a look here\n", - "```\n", - "git pull\n", - "```\n", - "5. 
And now, create the new branch & swith to it by typing:\n", - "```\n", - "git checkout -b coursework1\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Multi Layer Models\n", - "\n", - "Today, we shall build models which can have an arbitrary number of hidden layers. Please have a look at the diagram below, and the corresponding computations (which have an *exact* matrix form as expected by numpy, and row-wise orientation; note that $\\circ$ denotes an element-wise product). In the diagram, we briefly describe how each comptation relates to the code we have provided.\n", - "\n", - "![Making Predictions](res/code_scheme.svg)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "1. Structuring the model\n", - " * The model (for now) is allowed to have a sequence of layers, mapping inputs $\\mathbf{x}$ to outputs $\\mathbf{y}$. \n", - " * This operation is implemented as a special type of a layer in `mlp.layers.MLP` class. It keeps a sequence of other layers (of various typyes like Linear, Sigmoid, Softmax, etc.) as well as the internal state of a model for a mini-batch, that is, the intermediate data produced in *forward* and *backward* passes.\n", - "2. Forward computation\n", - " * `mlp.layers.MLP` provides an `fprop()` method that iterates over defined layers propagates $\\mathbf{x}$ to $\\mathbf{y}$. \n", - " * Each layer (look at `mlp.layers.Linear` attached below) also implements an `fprop()` method, which performs an atomic, for the given layer, operation. Most often, for the $i$-th layer, we want to obtain a linear transform $\\mathbf a^i$ of the inputs, and apply some non-linear transfer function $f^i(\\mathbf a^i)$ to produce the output $\\mathbf h^i$. Note, in general each layer may implement different activation functions $f^i()$, however for now we will use only `sigmoid` and `softmax`\n", - "3. Backward computation\n", - " * Similarly, `mlp.layers.MLP` also implements a `bprop()` function, to back-propagate the errors from the top to the bottom layer. This class also keeps the back-propagated statistics ($\\delta$) to be used later when computing the gradients with respect to the parameters.\n", - " * This functionality is also re-implemented by particular layers (again, have a look at the `bprop` function of `mlp.layers.Linear`). `bprop()` returns both $\\delta$ (needed to update the parameters) but also back-progapates the gradient down to the inputs. Also note, that depending on whether the layer is the top or not (i.e. if it deals directly with the cost function or not) some simplifications may apply ( as with cross-entropy and softmax). That's why when implementing a new type of layer that may be used as an output layer one also need to specify the implementation of `bprop_cost()`.\n", - "4. Learning the model\n", - " * The actual evaluation of the cost as well as the *forward* and *backward* passes may be found in the `train_epoch()` method of `mlp.optimisers.SGDOptimiser`\n", - " * This function also calls the `pgrads()` method on each layer, that given activations and deltas, returns the list of the gradients of the cost with respect to the model parameters, i.e. 
$\\frac{\\partial{\\mathbf{E}}}{\\partial{\\mathbf{W^i}}}$ and $\\frac{\\partial{\\mathbf{E}}}{\\partial{\\mathbf{b}^i}}$ at the above diagram (look at an example implementation in `mlp.layers.Linear`)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "Example code for the above\n", - "```python\n", - "# %load -s Linear mlp/layers.py\n", - "class Linear(Layer):\n", - "\n", - " def __init__(self, idim, odim,\n", - " rng=None,\n", - " irange=0.1):\n", - "\n", - " super(Linear, self).__init__(rng=rng)\n", - "\n", - " self.idim = idim\n", - " self.odim = odim\n", - "\n", - " self.W = self.rng.uniform(\n", - " -irange, irange,\n", - " (self.idim, self.odim))\n", - "\n", - " self.b = numpy.zeros((self.odim,), dtype=numpy.float32)\n", - "\n", - " def fprop(self, inputs):\n", - " \"\"\"\n", - " Implements a forward propagation through the i-th layer, that is\n", - " some form of:\n", - " a^i = xW^i + b^i\n", - " h^i = f^i(a^i)\n", - " with f^i, W^i, b^i denoting a non-linearity, weight matrix and\n", - " biases of this (i-th) layer, respectively and x denoting inputs.\n", - "\n", - " :param inputs: matrix of features (x) or the output of the previous layer h^{i-1}\n", - " :return: h^i, matrix of transformed by layer features\n", - " \"\"\"\n", - " a = numpy.dot(inputs, self.W) + self.b\n", - " # here f() is an identity function, so just return a linear transformation\n", - " return a\n", - "\n", - " def bprop(self, h, igrads):\n", - " \"\"\"\n", - " Implements a backward propagation through the layer, that is, given\n", - " h^i denotes the output of the layer and x^i the input, we compute:\n", - " dh^i/dx^i which by chain rule is dh^i/da^i da^i/dx^i\n", - " x^i could be either features (x) or the output of the lower layer h^{i-1}\n", - " :param h: it's an activation produced in forward pass\n", - " :param igrads, error signal (or gradient) flowing to the layer, note,\n", - " this in general case does not corresponds to 'deltas' used to update\n", - " the layer's parameters, to get deltas ones need to multiply it with\n", - " the dh^i/da^i derivative\n", - " :return: a tuple (deltas, ograds) where:\n", - " deltas = igrads * dh^i/da^i\n", - " ograds = deltas \\times da^i/dx^i\n", - " \"\"\"\n", - "\n", - " # since df^i/da^i = 1 (f is assumed identity function),\n", - " # deltas are in fact the same as igrads\n", - " ograds = numpy.dot(igrads, self.W.T)\n", - " return igrads, ograds\n", - "\n", - " def bprop_cost(self, h, igrads, cost):\n", - " \"\"\"\n", - " Implements a backward propagation in case the layer directly\n", - " deals with the optimised cost (i.e. 
the top layer)\n", - " By default, method should implement a bprop for default cost, that is\n", - " the one that is natural to the layer's output, i.e.:\n", - " here we implement linear -> mse scenario\n", - " :param h: it's an activation produced in forward pass\n", - " :param igrads, error signal (or gradient) flowing to the layer, note,\n", - " this in general case does not corresponds to 'deltas' used to update\n", - " the layer's parameters, to get deltas ones need to multiply it with\n", - " the dh^i/da^i derivative\n", - " :param cost, mlp.costs.Cost instance defining the used cost\n", - " :return: a tuple (deltas, ograds) where:\n", - " deltas = igrads * dh^i/da^i\n", - " ograds = deltas \\times da^i/dx^i\n", - " \"\"\"\n", - "\n", - " if cost is None or cost.get_name() == 'mse':\n", - " # for linear layer and mean square error cost,\n", - " # cost back-prop is the same as standard back-prop\n", - " return self.bprop(h, igrads)\n", - " else:\n", - " raise NotImplementedError('Linear.bprop_cost method not implemented '\n", - " 'for the %s cost' % cost.get_name())\n", - "\n", - " def pgrads(self, inputs, deltas):\n", - " \"\"\"\n", - " Return gradients w.r.t parameters\n", - "\n", - " :param inputs, input to the i-th layer\n", - " :param deltas, deltas computed in bprop stage up to -ith layer\n", - " :return list of grads w.r.t parameters dE/dW and dE/db in *exactly*\n", - " the same order as the params are returned by get_params()\n", - "\n", - " Note: deltas here contain the whole chain rule leading\n", - " from the cost up to the the i-th layer, i.e.\n", - " dE/dy^L dy^L/da^L da^L/dh^{L-1} dh^{L-1}/da^{L-1} ... dh^{i}/da^{i}\n", - " and here we are just asking about\n", - " 1) da^i/dW^i and 2) da^i/db^i\n", - " since W and b are only layer's parameters\n", - " \"\"\"\n", - "\n", - " grad_W = numpy.dot(inputs.T, deltas)\n", - " grad_b = numpy.sum(deltas, axis=0)\n", - "\n", - " return [grad_W, grad_b]\n", - "\n", - " def get_params(self):\n", - " return [self.W, self.b]\n", - "\n", - " def set_params(self, params):\n", - " #we do not make checks here, but the order on the list\n", - " #is assumed to be exactly the same as get_params() returns\n", - " self.W = params[0]\n", - " self.b = params[1]\n", - "\n", - " def get_name(self):\n", - " return 'linear'\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Example 1: Experiment with linear models and MNIST\n", - "\n", - "The below snippet demonstrates how to use the code we have provided for the coursework 1. Get familiar with it, as from now on we will use till the end of the course, including the 2nd coursework.\n", - "\n", - "It should be straightforward to extend the following code to more complex models, like stack more layers, change the cost, the optimiser, learning rate schedules, etc.. But **ask** in case something is not clear.\n", - "\n", - "In this particular example, we use the following components:\n", - " * One layer mapping data-points ($\\mathbf x$) straight to 10 digits classes represented as 10 (linear) outputs ($\\mathbf y$). This operation is implemented as a linear layer in `mlp.layers.Linear`. Get familiar with this class (read the comments, etc.) 
as it is going to be a building block for the coursework.\n", - " * One can stack as many different layers as required through the container `mlp.layers.MLP`\n", - " * As an objective here we use the Mean Square Error cost defined in `mlp.costs.MSECost`\n", - " * Our *Stochastic Gradient Descent* optimiser can be found in `mlp.optimisers.SGDOptimiser`. Its parent `mlp.optimisers.Optimiser` implements validation functionality (and an interface in case one need to implement a different optimiser)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "import numpy\n", - "import logging\n", - "\n", - "logger = logging.getLogger()\n", - "logger.setLevel(logging.INFO)\n", - "\n", - "from mlp.layers import MLP, Linear #import required layer types\n", - "from mlp.optimisers import SGDOptimiser #import the optimiser\n", - "from mlp.dataset import MNISTDataProvider #import data provider\n", - "from mlp.costs import MSECost #import the cost we want to use for optimisation\n", - "from mlp.schedulers import LearningRateFixed\n", - "\n", - "rng = numpy.random.RandomState([2015,10,10])\n", - "\n", - "# define the model structure, here just one linear layer\n", - "# and mean square error cost\n", - "cost = MSECost()\n", - "model = MLP(cost=cost)\n", - "model.add_layer(Linear(idim=784, odim=10, rng=rng))\n", - "#one can stack more layers here\n", - "\n", - "# define the optimiser, here stochasitc gradient descent\n", - "# with fixed learning rate and max_epochs as stopping criterion\n", - "lr_scheduler = LearningRateFixed(learning_rate=0.01, max_epochs=20)\n", - "optimiser = SGDOptimiser(lr_scheduler=lr_scheduler)\n", - "\n", - "logger.info('Initialising data providers...')\n", - "train_dp = MNISTDataProvider(dset='train', batch_size=100, max_num_batches=-10, randomize=True)\n", - "valid_dp = MNISTDataProvider(dset='valid', batch_size=100, max_num_batches=-10, randomize=False)\n", - "\n", - "logger.info('Training started...')\n", - "optimiser.train(model, train_dp, valid_dp)\n", - "\n", - "logger.info('Testing the model on test set:')\n", - "test_dp = MNISTDataProvider(dset='eval', batch_size=100, max_num_batches=-10, randomize=False)\n", - "cost, accuracy = optimiser.validate(model, test_dp)\n", - "logger.info('MNIST test set accuracy is %.2f %% (cost is %.3f)'%(accuracy*100., cost))\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Exercise\n", - "\n", - "Modify the above code by adding an intemediate linear layer of size 200 hidden units between input and output layers." 
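
One possible solution sketch for the above exercise, assuming the same `mlp` API, imports, random generator and data providers as in Example 1 (the layer sizes follow the exercise statement; everything else is unchanged):

```python
# Sketch only: reuses rng, lr_scheduler, optimiser, train_dp and valid_dp from
# Example 1 above. The single 784 -> 10 Linear layer is replaced by two Linear
# layers with a 200-dimensional intermediate representation.
cost = MSECost()
model = MLP(cost=cost)
model.add_layer(Linear(idim=784, odim=200, rng=rng))  # inputs (784) -> 200 intermediate units
model.add_layer(Linear(idim=200, odim=10, rng=rng))   # 200 intermediate units -> 10 outputs

optimiser.train(model, train_dp, valid_dp)
```

Note that stacking two purely linear layers still gives an overall linear (affine) map from inputs to outputs, so this exercise changes the parameterisation rather than the class of functions the model can represent; non-linear transfer functions are introduced later.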
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 2", - "language": "python", - "name": "python2" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.10" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/notebooks/04_Regularisation.ipynb b/notebooks/04_Regularisation.ipynb deleted file mode 100644 index 24f2349..0000000 --- a/notebooks/04_Regularisation.ipynb +++ /dev/null @@ -1,293 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Introduction\n", - "\n", - "This tutorial focuses on implementation of three reqularisaion techniques: two of them add a regularisation term to the cost function based on the *L1* and *L2* norms; the third technique, called *Dropout*, is a form of noise injection by random corruption of information carried by the hidden units during training.\n", - "\n", - "\n", - "## Virtual environments\n", - "\n", - "Before you proceed onwards, remember to activate your virtual environment by typing `activate_mlp` or `source ~/mlpractical/venv/bin/activate` (or if you did the original install the \"comfy way\" type: `workon mlpractical`).\n", - "\n", - "\n", - "## Syncing the git repository\n", - "\n", - "Look here for more details. But in short, we recommend to create a separate branch for this lab, as follows:\n", - "\n", - "1. Enter the mlpractical directory `cd ~/mlpractical/repo-mlp`\n", - "2. List the branches and check which are currently active by typing: `git branch`\n", - "3. If you have followed our recommendations, you should be in the `coursework1` branch, please commit your local changed to the repo index by typing:\n", - "```\n", - "git commit -am \"finished coursework\"\n", - "```\n", - "4. Now you can switch to `master` branch by typing: \n", - "```\n", - "git checkout master\n", - " ```\n", - "5. To update the repository (note, assuming master does not have any conflicts), if there are some, have a look here\n", - "```\n", - "git pull\n", - "```\n", - "6. And now, create the new branch & swith to it by typing:\n", - "```\n", - "git checkout -b lab4\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Regularisation\n", - "\n", - "Regularisation add a *complexity term* to the cost function. Its purpose is to put some prior on the model's parameters, which will penalise complexity. The most common prior is perhaps the one which assumes smoother solutions (the one which are not able to fit training data too well) are better as they are more likely to better generalise to unseen data. \n", - "\n", - "A way to incorporate such a prior in the model is to add some term that penalise certain configurations of the parameters -- either from growing too large ($L_2$) or the one that prefers a solution that could be modelled with fewer parameters ($L_1$), hence encouraging some parameters to become 0. 
One can, of course, combine many such priors when optimising the model, however, in the lab we shall use $L_1$ and/or $L_2$ priors.\n", - "\n", - "$L_1$ and $L_2$ priors can be easily incorporated into the training objective through additive terms, as follows:\n", - "\n", - "(1) $\n", - " \\begin{align*}\n", - " E^n &= \\underbrace{E^n_{\\text{train}}}_{\\text{data term}} + \n", - " \\underbrace{\\beta_{L_1} E^n_{L_1}}_{\\text{prior term}} + \\underbrace{\\beta_{L_2} E^n_{L_2}}_{\\text{prior term}}\n", - "\\end{align*}\n", - "$\n", - "\n", - "where $ E^n_{\\text{train}} = - \\sum_{k=1}^K t^n_k \\ln y^n_k $ is the cross-entropy cost function, $\\beta_{L_1}$ and $\\beta_{L_2}$ are non-negative constants specified in advance (hyper-parameters) and $E^n_{L_1}$ and $E^n_{L_2}$ are norm metrics specifying certain properties of the parameters:\n", - "\n", - "(2) $\n", - " \\begin{align*}\n", - " E^n_{L_p}(\\mathbf{W}) = ||\\mathbf{W}||_p = \\left ( \\sum_{i,j \\in \\mathbf{W}} |w_{i,j}|^p \\right )^{\\frac{1}{p}}\n", - "\\end{align*}\n", - "$\n", - "\n", - "where $p$ denotes the norm-order (for regularisation either 1 or 2). Notice, in practice for computational purposes we will rather compute squared $L_{p=2}$ norm, which omits the square root in (2), that is:\n", - "\n", - "(3)$ \\begin{align*}\n", - " E^n_{L_{p=2}}(\\mathbf{W}) = ||\\mathbf{W}||^2_2 = \\left ( \\left ( \\sum_{i,j \\in \\mathbf{W}} |w_{i,j}|^2 \\right )^{\\frac{1}{2}} \\right )^2 = \\sum_{i,j \\in \\mathbf{W}} |w_{i,j}|^2\n", - "\\end{align*}\n", - "$\n", - "\n", - "## $L_{p=2}$ (Weight Decay)\n", - "\n", - "Our cost with $L_{2}$ regulariser then becomes ($\\frac{1}{2}$ simplifies a derivative later):\n", - "\n", - "(4) $\n", - " \\begin{align*}\n", - " E^n &= \\underbrace{E^n_{\\text{train}}}_{\\text{data term}} + \n", - " \\underbrace{\\beta_{L_2} \\frac{1}{2} E^n_{L_2}}_{\\text{prior term}}\n", - "\\end{align*}\n", - "$\n", - "\n", - "Hence, the gradient of the cost w.r.t parameter $w_i$ is given as follows:\n", - "\n", - "(5) $\n", - "\\begin{align*}\\frac{\\partial E^n}{\\partial w_i} &= \\frac{\\partial (E^n_{\\text{train}} + \\beta_{L_2} 0.5 E^n_{L_2}) }{\\partial w_i} \n", - " = \\left( \\frac{\\partial E^n_{\\text{train}}}{\\partial w_i} + \\beta_{L_2} 0.5 \\frac{\\partial\n", - " E^n_{L_2}}{\\partial w_i} \\right) \n", - " = \\left( \\frac{\\partial E^n_{\\text{train}}}{\\partial w_i} + \\beta_{L_2} w_i \\right)\n", - "\\end{align*}\n", - "$\n", - "\n", - "And the actual update we to the $W_i$ parameter is:\n", - "\n", - "(6) $\n", - "\\begin{align*}\n", - " \\Delta w_i &= -\\eta \\left( \\frac{\\partial E^n_{\\text{train}}}{\\partial w_i} + \\beta_{L_2} w_i \\right) \n", - "\\end{align*}\n", - "$\n", - "\n", - "where $\\eta$ is learning rate. \n", - "\n", - "Exercise 1 gives some more implementational suggestions on how to incorporate this technique into the lab code, the cost related prior contributions (equation (1)) are computed in mlp.optimisers.Optimiser.compute_prior_costs() and your job is to add the relevant optimisation related code when computing the gradients w.r.t parameters. 
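
As a concrete illustration of equations (5) and (6), here is a minimal sketch of how the $L_2$ term could enter the parameter gradients, written in the style of the `pgrads()` method of `mlp.layers.Linear` shown earlier. The function name and the `l2_weight` argument (playing the role of $\beta_{L_2}$) are illustrative assumptions, not the required place in the provided code-base:

```python
import numpy

def pgrads_with_l2(inputs, deltas, W, l2_weight):
    """Sketch: gradients of (data cost + 0.5 * l2_weight * ||W||_2^2) w.r.t. W and b.

    grad_W follows equation (5): the data-term gradient plus beta_{L2} * W.
    Biases are left unregularised here, as is usual for weight decay.
    """
    grad_W = numpy.dot(inputs.T, deltas) + l2_weight * W  # x^T delta + beta_{L2} W
    grad_b = numpy.sum(deltas, axis=0)
    return [grad_W, grad_b]
```

An SGD step of the form $w \leftarrow w - \eta \cdot \mathrm{grad}$ applied to the returned gradients then reproduces the update in equation (6).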
\n", - "\n", - "## $L_{p=1}$ (Sparsity)\n", - "\n", - "Our cost with $L_{1}$ regulariser then becomes:\n", - "\n", - "(7) $\n", - " \\begin{align*}\n", - " E^n &= \\underbrace{E^n_{\\text{train}}}_{\\text{data term}} + \n", - " \\underbrace{\\beta_{L_1} E^n_{L_1}}_{\\text{prior term}} \n", - "\\end{align*}\n", - "$\n", - "\n", - "Hence, the gradient of the cost w.r.t parameter $w_i$ is given as follows:\n", - "\n", - "(8) $\\begin{align*}\n", - " \\frac{\\partial E^n}{\\partial w_i} = \\frac{\\partial E^n_{\\text{train}}}{\\partial w_i} + \\beta_{L_1} \\frac{\\partial E_{L_1}}{\\partial w_i} = \\frac{\\partial E^n_{\\text{train}}}{\\partial w_i} + \\beta_{L_1} \\mbox{sgn}(w_i)\n", - "\\end{align*}\n", - "$\n", - "\n", - "And the actual update we to the $W_i$ parameter is:\n", - "\n", - "(9) $\\begin{align*}\n", - " \\Delta w_i &= -\\eta \\left( \\frac{\\partial E^n_{\\text{train}}}{\\partial w_i} + \\beta_{L_1} \\mbox{sgn}(w_i) \\right) \n", - "\\end{align*}$\n", - "\n", - "Where $\\mbox{sgn}(w_i)$ is the sign of $w_i$: $\\mbox{sgn}(w_i) = 1$ if $w_i>0$ and $\\mbox{sgn}(w_i) = -1$ if $w_i<0$\n", - "\n", - "One can also easily apply those penalty terms for biases, however, this is usually not necessary as biases do not affect the smoothness of the solution (given data).\n", - "\n", - "## Dropout\n", - "\n", - "For a given layer's output $\\mathbf{h}^i \\in \\mathbb{R}^{BxH^l}$ (where $B$ is batch size and $H^l$ is the $l$-th layer output dimensionality), Dropout implements the following transformation:\n", - "\n", - "(10) $\\mathbf{\\hat h}^l = \\mathbf{d}^l\\circ\\mathbf{h}^l$\n", - "\n", - "where $\\circ$ denotes an elementwise product and $\\mathbf{d}^l \\in \\{0,1\\}^{BxH^i}$ is a matrix in which element $d^l_{ij}$ is sampled from the Bernoulli distribution:\n", - "\n", - "(11) $d^l_{ij} \\sim \\mbox{Bernoulli}(p^l_d)$\n", - "\n", - "with $0