Merge pull request #65 from CSTR-Edinburgh/mlp2017-8/coursework_2

Mlp2017 8/coursework 2
This commit is contained in:
AntreasAntoniou 2017-11-15 18:22:46 +00:00 committed by GitHub
commit acefdd11ac
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
34 changed files with 12377 additions and 72 deletions

BIN
data/emnist-test.npz Normal file

Binary file not shown.

BIN
data/emnist-train.npz Normal file

Binary file not shown.

BIN
data/emnist-valid.npz Normal file

Binary file not shown.

View File

@ -153,7 +153,7 @@ class MNISTDataProvider(DataProvider):
rng (RandomState): A seeded random number generator.
"""
# check a valid which_set was provided
assert which_set in ['train', 'valid', 'eval'], (
assert which_set in ['train', 'valid', 'test'], (
'Expected which_set to be either train, valid or eval. '
'Got {0}'.format(which_set)
)
@ -199,6 +199,75 @@ class MNISTDataProvider(DataProvider):
one_of_k_targets[range(int_targets.shape[0]), int_targets] = 1
return one_of_k_targets
class EMNISTDataProvider(DataProvider):
"""Data provider for EMNIST handwritten digit images."""
def __init__(self, which_set='train', batch_size=100, max_num_batches=-1,
shuffle_order=True, rng=None):
"""Create a new EMNIST data provider object.
Args:
which_set: One of 'train', 'valid' or 'eval'. Determines which
portion of the EMNIST data this object should provide.
batch_size (int): Number of data points to include in each batch.
max_num_batches (int): Maximum number of batches to iterate over
in an epoch. If `max_num_batches * batch_size > num_data` then
only as many batches as the data can be split into will be
used. If set to -1 all of the data will be used.
shuffle_order (bool): Whether to randomly permute the order of
the data before each epoch.
rng (RandomState): A seeded random number generator.
"""
# check a valid which_set was provided
assert which_set in ['train', 'valid', 'test'], (
'Expected which_set to be either train, valid or eval. '
'Got {0}'.format(which_set)
)
self.which_set = which_set
self.num_classes = 47
# construct path to data using os.path.join to ensure the correct path
# separator for the current platform / OS is used
# MLP_DATA_DIR environment variable should point to the data directory
data_path = os.path.join(
os.environ['MLP_DATA_DIR'], 'emnist-{0}.npz'.format(which_set))
assert os.path.isfile(data_path), (
'Data file does not exist at expected path: ' + data_path
)
# load data from compressed numpy file
loaded = np.load(data_path)
print(loaded.keys())
inputs, targets = loaded['inputs'], loaded['targets']
inputs = inputs.astype(np.float32)
inputs = np.reshape(inputs, newshape=(-1, 28*28))
inputs = inputs / 255.0
# pass the loaded data to the parent class __init__
super(EMNISTDataProvider, self).__init__(
inputs, targets, batch_size, max_num_batches, shuffle_order, rng)
def next(self):
"""Returns next data batch or raises `StopIteration` if at end."""
inputs_batch, targets_batch = super(EMNISTDataProvider, self).next()
return inputs_batch, self.to_one_of_k(targets_batch)
def to_one_of_k(self, int_targets):
"""Converts integer coded class target to 1 of K coded targets.
Args:
int_targets (ndarray): Array of integer coded class targets (i.e.
where an integer from 0 to `num_classes` - 1 is used to
indicate which is the correct class). This should be of shape
(num_data,).
Returns:
Array of 1 of K coded targets i.e. an array of shape
(num_data, num_classes) where for each row all elements are equal
to zero except for the column corresponding to the correct class
which is equal to one.
"""
one_of_k_targets = np.zeros((int_targets.shape[0], self.num_classes))
one_of_k_targets[range(int_targets.shape[0]), int_targets] = 1
return one_of_k_targets
class MetOfficeDataProvider(DataProvider):
"""South Scotland Met Office weather data provider."""
@ -292,3 +361,41 @@ class CCPPDataProvider(DataProvider):
targets = loaded[which_set + '_targets']
super(CCPPDataProvider, self).__init__(
inputs, targets, batch_size, max_num_batches, shuffle_order, rng)
class AugmentedMNISTDataProvider(MNISTDataProvider):
"""Data provider for MNIST dataset which randomly transforms images."""
def __init__(self, which_set='train', batch_size=100, max_num_batches=-1,
shuffle_order=True, rng=None, transformer=None):
"""Create a new augmented MNIST data provider object.
Args:
which_set: One of 'train', 'valid' or 'test'. Determines which
portion of the MNIST data this object should provide.
batch_size (int): Number of data points to include in each batch.
max_num_batches (int): Maximum number of batches to iterate over
in an epoch. If `max_num_batches * batch_size > num_data` then
only as many batches as the data can be split into will be
used. If set to -1 all of the data will be used.
shuffle_order (bool): Whether to randomly permute the order of
the data before each epoch.
rng (RandomState): A seeded random number generator.
transformer: Function which takes an `inputs` array of shape
(batch_size, input_dim) corresponding to a batch of input
images and a `rng` random number generator object (i.e. a
call signature `transformer(inputs, rng)`) and applies a
potentiall random set of transformations to some / all of the
input images as each new batch is returned when iterating over
the data provider.
"""
super(AugmentedMNISTDataProvider, self).__init__(
which_set, batch_size, max_num_batches, shuffle_order, rng)
self.transformer = transformer
def next(self):
"""Returns next data batch or raises `StopIteration` if at end."""
inputs_batch, targets_batch = super(
AugmentedMNISTDataProvider, self).next()
transformed_inputs_batch = self.transformer(inputs_batch, self.rng)
return transformed_inputs_batch, targets_batch

View File

@ -154,9 +154,9 @@ class CrossEntropySoftmaxError(object):
Returns:
Scalar error function value.
"""
probs = np.exp(outputs)
probs /= probs.sum(-1)[:, None]
return -np.mean(np.sum(targets * np.log(probs), axis=1))
normOutputs = outputs - outputs.max(-1)[:, None]
logProb = normOutputs - np.log(np.sum(np.exp(normOutputs), axis=-1)[:, None])
return -np.mean(np.sum(targets * logProb, axis=1))
def grad(self, outputs, targets):
"""Calculates gradient of error function with respect to outputs.
@ -168,7 +168,7 @@ class CrossEntropySoftmaxError(object):
Returns:
Gradient of error function with respect to outputs.
"""
probs = np.exp(outputs)
probs = np.exp(outputs - outputs.max(-1)[:, None])
probs /= probs.sum(-1)[:, None]
return (probs - targets) / outputs.shape[0]

View File

@ -14,7 +14,7 @@ respect to the layer parameters.
import numpy as np
import mlp.initialisers as init
from mlp import DEFAULT_SEED
class Layer(object):
"""Abstract class defining the interface for a layer."""
@ -68,6 +68,13 @@ class LayerWithParameters(Layer):
"""
raise NotImplementedError()
def params_penalty(self):
"""Returns the parameter dependent penalty term for this layer.
If no parameter-dependent penalty terms are set this returns zero.
"""
raise NotImplementedError()
@property
def params(self):
"""Returns a list of parameters of layer.
@ -88,6 +95,127 @@ class LayerWithParameters(Layer):
"""
raise NotImplementedError()
class StochasticLayerWithParameters(Layer):
"""Specialised layer which uses a stochastic forward propagation."""
def __init__(self, rng=None):
"""Constructs a new StochasticLayer object.
Args:
rng (RandomState): Seeded random number generator object.
"""
if rng is None:
rng = np.random.RandomState(DEFAULT_SEED)
self.rng = rng
def fprop(self, inputs, stochastic=True):
"""Forward propagates activations through the layer transformation.
Args:
inputs: Array of layer inputs of shape (batch_size, input_dim).
stochastic: Flag allowing different deterministic
forward-propagation mode in addition to default stochastic
forward-propagation e.g. for use at test time. If False
a deterministic forward-propagation transformation
corresponding to the expected output of the stochastic
forward-propagation is applied.
Returns:
outputs: Array of layer outputs of shape (batch_size, output_dim).
"""
raise NotImplementedError()
def grads_wrt_params(self, inputs, grads_wrt_outputs):
"""Calculates gradients with respect to layer parameters.
Args:
inputs: Array of inputs to layer of shape (batch_size, input_dim).
grads_wrt_to_outputs: Array of gradients with respect to the layer
outputs of shape (batch_size, output_dim).
Returns:
List of arrays of gradients with respect to the layer parameters
with parameter gradients appearing in same order in tuple as
returned from `get_params` method.
"""
raise NotImplementedError()
def params_penalty(self):
"""Returns the parameter dependent penalty term for this layer.
If no parameter-dependent penalty terms are set this returns zero.
"""
raise NotImplementedError()
@property
def params(self):
"""Returns a list of parameters of layer.
Returns:
List of current parameter values. This list should be in the
corresponding order to the `values` argument to `set_params`.
"""
raise NotImplementedError()
@params.setter
def params(self, values):
"""Sets layer parameters from a list of values.
Args:
values: List of values to set parameters to. This list should be
in the corresponding order to what is returned by `get_params`.
"""
raise NotImplementedError()
class StochasticLayer(Layer):
"""Specialised layer which uses a stochastic forward propagation."""
def __init__(self, rng=None):
"""Constructs a new StochasticLayer object.
Args:
rng (RandomState): Seeded random number generator object.
"""
if rng is None:
rng = np.random.RandomState(DEFAULT_SEED)
self.rng = rng
def fprop(self, inputs, stochastic=True):
"""Forward propagates activations through the layer transformation.
Args:
inputs: Array of layer inputs of shape (batch_size, input_dim).
stochastic: Flag allowing different deterministic
forward-propagation mode in addition to default stochastic
forward-propagation e.g. for use at test time. If False
a deterministic forward-propagation transformation
corresponding to the expected output of the stochastic
forward-propagation is applied.
Returns:
outputs: Array of layer outputs of shape (batch_size, output_dim).
"""
raise NotImplementedError()
def bprop(self, inputs, outputs, grads_wrt_outputs):
"""Back propagates gradients through a layer.
Given gradients with respect to the outputs of the layer calculates the
gradients with respect to the layer inputs. This should correspond to
default stochastic forward-propagation.
Args:
inputs: Array of layer inputs of shape (batch_size, input_dim).
outputs: Array of layer outputs calculated in forward pass of
shape (batch_size, output_dim).
grads_wrt_outputs: Array of gradients with respect to the layer
outputs of shape (batch_size, output_dim).
Returns:
Array of gradients with respect to the layer inputs of shape
(batch_size, input_dim).
"""
raise NotImplementedError()
class AffineLayer(LayerWithParameters):
"""Layer implementing an affine tranformation of its inputs.
@ -97,7 +225,8 @@ class AffineLayer(LayerWithParameters):
def __init__(self, input_dim, output_dim,
weights_initialiser=init.UniformInit(-0.1, 0.1),
biases_initialiser=init.ConstantInit(0.)):
biases_initialiser=init.ConstantInit(0.),
weights_penalty=None, biases_penalty=None):
"""Initialises a parameterised affine layer.
Args:
@ -105,11 +234,17 @@ class AffineLayer(LayerWithParameters):
output_dim (int): Dimension of the layer outputs.
weights_initialiser: Initialiser for the weight parameters.
biases_initialiser: Initialiser for the bias parameters.
weights_penalty: Weights-dependent penalty term (regulariser) or
None if no regularisation is to be applied to the weights.
biases_penalty: Biases-dependent penalty term (regulariser) or
None if no regularisation is to be applied to the biases.
"""
self.input_dim = input_dim
self.output_dim = output_dim
self.weights = weights_initialiser((self.output_dim, self.input_dim))
self.biases = biases_initialiser(self.output_dim)
self.weights_penalty = weights_penalty
self.biases_penalty = biases_penalty
def fprop(self, inputs):
"""Forward propagates activations through the layer transformation.
@ -123,7 +258,7 @@ class AffineLayer(LayerWithParameters):
Returns:
outputs: Array of layer outputs of shape (batch_size, output_dim).
"""
return inputs.dot(self.weights.T) + self.biases
return self.weights.dot(inputs.T).T + self.biases
def bprop(self, inputs, outputs, grads_wrt_outputs):
"""Back propagates gradients through a layer.
@ -159,8 +294,27 @@ class AffineLayer(LayerWithParameters):
grads_wrt_weights = np.dot(grads_wrt_outputs.T, inputs)
grads_wrt_biases = np.sum(grads_wrt_outputs, axis=0)
if self.weights_penalty is not None:
grads_wrt_weights += self.weights_penalty.grad(self.weights)
if self.biases_penalty is not None:
grads_wrt_biases += self.biases_penalty.grad(self.biases)
return [grads_wrt_weights, grads_wrt_biases]
def params_penalty(self):
"""Returns the parameter dependent penalty term for this layer.
If no parameter-dependent penalty terms are set this returns zero.
"""
params_penalty = 0
if self.weights_penalty is not None:
params_penalty += self.weights_penalty(self.weights)
if self.biases_penalty is not None:
params_penalty += self.biases_penalty(self.biases)
return params_penalty
@property
def params(self):
"""A list of layer parameter values: `[weights, biases]`."""
@ -175,6 +329,94 @@ class AffineLayer(LayerWithParameters):
return 'AffineLayer(input_dim={0}, output_dim={1})'.format(
self.input_dim, self.output_dim)
class BatchNormalizationLayer(StochasticLayerWithParameters):
"""Layer implementing an affine tranformation of its inputs.
This layer is parameterised by a weight matrix and bias vector.
"""
def __init__(self, input_dim, rng=None):
"""Initialises a parameterised affine layer.
Args:
input_dim (int): Dimension of inputs to the layer.
output_dim (int): Dimension of the layer outputs.
weights_initialiser: Initialiser for the weight parameters.
biases_initialiser: Initialiser for the bias parameters.
weights_penalty: Weights-dependent penalty term (regulariser) or
None if no regularisation is to be applied to the weights.
biases_penalty: Biases-dependent penalty term (regulariser) or
None if no regularisation is to be applied to the biases.
"""
super(BatchNormalizationLayer, self).__init__(rng)
self.beta = np.random.normal(size=(input_dim))
self.gamma = np.random.normal(size=(input_dim))
self.epsilon = 0.00001
self.cache = None
self.input_dim = input_dim
def fprop(self, inputs, stochastic=True):
"""Forward propagates inputs through a layer."""
raise NotImplementedError
def bprop(self, inputs, outputs, grads_wrt_outputs):
"""Back propagates gradients through a layer.
Given gradients with respect to the outputs of the layer calculates the
gradients with respect to the layer inputs.
Args:
inputs: Array of layer inputs of shape (batch_size, input_dim).
outputs: Array of layer outputs calculated in forward pass of
shape (batch_size, output_dim).
grads_wrt_outputs: Array of gradients with respect to the layer
outputs of shape (batch_size, output_dim).
Returns:
Array of gradients with respect to the layer inputs of shape
(batch_size, input_dim).
"""
raise NotImplementedError
def grads_wrt_params(self, inputs, grads_wrt_outputs):
"""Calculates gradients with respect to layer parameters.
Args:
inputs: array of inputs to layer of shape (batch_size, input_dim)
grads_wrt_to_outputs: array of gradients with respect to the layer
outputs of shape (batch_size, output_dim)
Returns:
list of arrays of gradients with respect to the layer parameters
`[grads_wrt_weights, grads_wrt_biases]`.
"""
raise NotImplementedError
def params_penalty(self):
"""Returns the parameter dependent penalty term for this layer.
If no parameter-dependent penalty terms are set this returns zero.
"""
params_penalty = 0
return params_penalty
@property
def params(self):
"""A list of layer parameter values: `[gammas, betas]`."""
return [self.gamma, self.beta]
@params.setter
def params(self, values):
self.gamma = values[0]
self.beta = values[1]
def __repr__(self):
return 'BatchNormalizationLayer(input_dim={0})'.format(
self.input_dim)
class SigmoidLayer(Layer):
"""Layer implementing an element-wise logistic sigmoid transformation."""
@ -215,6 +457,325 @@ class SigmoidLayer(Layer):
def __repr__(self):
return 'SigmoidLayer'
class ConvolutionalLayer(LayerWithParameters):
"""Layer implementing a 2D convolution-based transformation of its inputs.
The layer is parameterised by a set of 2D convolutional kernels, a four
dimensional array of shape
(num_output_channels, num_input_channels, kernel_dim_1, kernel_dim_2)
and a bias vector, a one dimensional array of shape
(num_output_channels,)
i.e. one shared bias per output channel.
Assuming no-padding is applied to the inputs so that outputs are only
calculated for positions where the kernel filters fully overlap with the
inputs, and that unit strides are used the outputs will have spatial extent
output_dim_1 = input_dim_1 - kernel_dim_1 + 1
output_dim_2 = input_dim_2 - kernel_dim_2 + 1
"""
def __init__(self, num_input_channels, num_output_channels,
input_dim_1, input_dim_2,
kernel_dim_1, kernel_dim_2,
kernels_init=init.UniformInit(-0.01, 0.01),
biases_init=init.ConstantInit(0.),
kernels_penalty=None, biases_penalty=None):
"""Initialises a parameterised convolutional layer.
Args:
num_input_channels (int): Number of channels in inputs to
layer (this may be number of colour channels in the input
images if used as the first layer in a model, or the
number of output channels, a.k.a. feature maps, from a
a previous convolutional layer).
num_output_channels (int): Number of channels in outputs
from the layer, a.k.a. number of feature maps.
input_dim_1 (int): Size of first input dimension of each 2D
channel of inputs.
input_dim_2 (int): Size of second input dimension of each 2D
channel of inputs.
kernel_dim_1 (int): Size of first dimension of each 2D channel of
kernels.
kernel_dim_2 (int): Size of second dimension of each 2D channel of
kernels.
kernels_intialiser: Initialiser for the kernel parameters.
biases_initialiser: Initialiser for the bias parameters.
kernels_penalty: Kernel-dependent penalty term (regulariser) or
None if no regularisation is to be applied to the kernels.
biases_penalty: Biases-dependent penalty term (regulariser) or
None if no regularisation is to be applied to the biases.
"""
self.num_input_channels = num_input_channels
self.num_output_channels = num_output_channels
self.input_dim_1 = input_dim_1
self.input_dim_2 = input_dim_2
self.kernel_dim_1 = kernel_dim_1
self.kernel_dim_2 = kernel_dim_2
self.kernels_init = kernels_init
self.biases_init = biases_init
self.kernels_shape = (
num_output_channels, num_input_channels, kernel_dim_1, kernel_dim_2
)
self.inputs_shape = (
None, num_input_channels, input_dim_1, input_dim_2
)
self.kernels = self.kernels_init(self.kernels_shape)
self.biases = self.biases_init(num_output_channels)
self.kernels_penalty = kernels_penalty
self.biases_penalty = biases_penalty
self.cache = None
def fprop(self, inputs):
"""Forward propagates activations through the layer transformation.
For inputs `x`, outputs `y`, kernels `K` and biases `b` the layer
corresponds to `y = conv2d(x, K) + b`.
Args:
inputs: Array of layer inputs of shape (batch_size, input_dim).
Returns:
outputs: Array of layer outputs of shape (batch_size, output_dim).
"""
raise NotImplementedError
def bprop(self, inputs, outputs, grads_wrt_outputs):
"""Back propagates gradients through a layer.
Given gradients with respect to the outputs of the layer calculates the
gradients with respect to the layer inputs.
Args:
inputs: Array of layer inputs of shape
(batch_size, num_input_channels, input_dim_1, input_dim_2).
outputs: Array of layer outputs calculated in forward pass of
shape
(batch_size, num_output_channels, output_dim_1, output_dim_2).
grads_wrt_outputs: Array of gradients with respect to the layer
outputs of shape
(batch_size, num_output_channels, output_dim_1, output_dim_2).
Returns:
Array of gradients with respect to the layer inputs of shape
(batch_size, input_dim).
"""
# Pad the grads_wrt_outputs
raise NotImplementedError
def grads_wrt_params(self, inputs, grads_wrt_outputs):
"""Calculates gradients with respect to layer parameters.
Args:
inputs: array of inputs to layer of shape (batch_size, input_dim)
grads_wrt_to_outputs: array of gradients with respect to the layer
outputs of shape
(batch_size, num_output-_channels, output_dim_1, output_dim_2).
Returns:
list of arrays of gradients with respect to the layer parameters
`[grads_wrt_kernels, grads_wrt_biases]`.
"""
raise NotImplementedError
def params_penalty(self):
"""Returns the parameter dependent penalty term for this layer.
If no parameter-dependent penalty terms are set this returns zero.
"""
params_penalty = 0
if self.kernels_penalty is not None:
params_penalty += self.kernels_penalty(self.kernels)
if self.biases_penalty is not None:
params_penalty += self.biases_penalty(self.biases)
return params_penalty
@property
def params(self):
"""A list of layer parameter values: `[kernels, biases]`."""
return [self.kernels, self.biases]
@params.setter
def params(self, values):
self.kernels = values[0]
self.biases = values[1]
def __repr__(self):
return (
'ConvolutionalLayer(\n'
' num_input_channels={0}, num_output_channels={1},\n'
' input_dim_1={2}, input_dim_2={3},\n'
' kernel_dim_1={4}, kernel_dim_2={5}\n'
')'
.format(self.num_input_channels, self.num_output_channels,
self.input_dim_1, self.input_dim_2, self.kernel_dim_1,
self.kernel_dim_2)
)
class ReluLayer(Layer):
"""Layer implementing an element-wise rectified linear transformation."""
def fprop(self, inputs):
"""Forward propagates activations through the layer transformation.
For inputs `x` and outputs `y` this corresponds to `y = max(0, x)`.
Args:
inputs: Array of layer inputs of shape (batch_size, input_dim).
Returns:
outputs: Array of layer outputs of shape (batch_size, output_dim).
"""
return np.maximum(inputs, 0.)
def bprop(self, inputs, outputs, grads_wrt_outputs):
"""Back propagates gradients through a layer.
Given gradients with respect to the outputs of the layer calculates the
gradients with respect to the layer inputs.
Args:
inputs: Array of layer inputs of shape (batch_size, input_dim).
outputs: Array of layer outputs calculated in forward pass of
shape (batch_size, output_dim).
grads_wrt_outputs: Array of gradients with respect to the layer
outputs of shape (batch_size, output_dim).
Returns:
Array of gradients with respect to the layer inputs of shape
(batch_size, input_dim).
"""
return (outputs > 0) * grads_wrt_outputs
def __repr__(self):
return 'ReluLayer'
class LeakyReluLayer(Layer):
"""Layer implementing an element-wise rectified linear transformation."""
def __init__(self, alpha=0.01):
self.alpha = alpha
def fprop(self, inputs):
"""Forward propagates activations through the layer transformation.
For inputs `x` and outputs `y` this corresponds to `y = max(0, x)`.
"""
positive_inputs = np.maximum(inputs, 0.)
negative_inputs = inputs
negative_inputs[negative_inputs>0] = 0.
negative_inputs = negative_inputs * self.alpha
outputs = positive_inputs + negative_inputs
return outputs
def bprop(self, inputs, outputs, grads_wrt_outputs):
"""Back propagates gradients through a layer.
Given gradients with respect to the outputs of the layer calculates the
gradients with respect to the layer inputs.
"""
positive_gradients = (outputs > 0) * grads_wrt_outputs
negative_gradients = self.alpha * (outputs < 0) * grads_wrt_outputs
gradients = positive_gradients + negative_gradients
return gradients
def __repr__(self):
return 'LeakyReluLayer'
class ELULayer(Layer):
"""Layer implementing an ELU activation."""
def __init__(self, alpha=1.0):
self.alpha = alpha
def fprop(self, inputs):
"""Forward propagates activations through the layer transformation.
For inputs `x` and outputs `y` this corresponds to `y = max(0, x)`.
"""
positive_inputs = np.maximum(inputs, 0.)
negative_inputs = np.copy(inputs)
negative_inputs[negative_inputs>0] = 0.
negative_inputs = self.alpha * (np.exp(negative_inputs) - 1)
outputs = positive_inputs + negative_inputs
return outputs
def bprop(self, inputs, outputs, grads_wrt_outputs):
"""Back propagates gradients through a layer.
Given gradients with respect to the outputs of the layer calculates the
gradients with respect to the layer inputs.
"""
positive_gradients = (outputs >= 0) * grads_wrt_outputs
outputs_to_use = (outputs < 0) * outputs
negative_gradients = (outputs_to_use + self.alpha)
negative_gradients[outputs >= 0] = 0.
negative_gradients = negative_gradients * grads_wrt_outputs
gradients = positive_gradients + negative_gradients
return gradients
def __repr__(self):
return 'ELULayer'
class SELULayer(Layer):
"""Layer implementing an element-wise rectified linear transformation."""
#α01 ≈ 1.6733 and λ01 ≈ 1.0507
def __init__(self):
self.alpha = 1.6733
self.lamda = 1.0507
self.elu = ELULayer(alpha=self.alpha)
def fprop(self, inputs):
"""Forward propagates activations through the layer transformation.
For inputs `x` and outputs `y` this corresponds to `y = max(0, x)`.
"""
outputs = self.lamda * self.elu.fprop(inputs)
return outputs
def bprop(self, inputs, outputs, grads_wrt_outputs):
"""Back propagates gradients through a layer.
Given gradients with respect to the outputs of the layer calculates the
gradients with respect to the layer inputs.
"""
scaled_outputs = outputs / self.lamda
gradients = self.lamda * self.elu.bprop(inputs=inputs, outputs=scaled_outputs,
grads_wrt_outputs=grads_wrt_outputs)
return gradients
def __repr__(self):
return 'SELULayer'
class TanhLayer(Layer):
"""Layer implementing an element-wise hyperbolic tangent transformation."""
def fprop(self, inputs):
"""Forward propagates activations through the layer transformation.
For inputs `x` and outputs `y` this corresponds to `y = tanh(x)`.
Args:
inputs: Array of layer inputs of shape (batch_size, input_dim).
Returns:
outputs: Array of layer outputs of shape (batch_size, output_dim).
"""
return np.tanh(inputs)
def bprop(self, inputs, outputs, grads_wrt_outputs):
"""Back propagates gradients through a layer.
Given gradients with respect to the outputs of the layer calculates the
gradients with respect to the layer inputs.
Args:
inputs: Array of layer inputs of shape (batch_size, input_dim).
outputs: Array of layer outputs calculated in forward pass of
shape (batch_size, output_dim).
grads_wrt_outputs: Array of gradients with respect to the layer
outputs of shape (batch_size, output_dim).
Returns:
Array of gradients with respect to the layer inputs of shape
(batch_size, input_dim).
"""
return (1. - outputs**2) * grads_wrt_outputs
def __repr__(self):
return 'TanhLayer'
class SoftmaxLayer(Layer):
"""Layer implementing a softmax transformation."""
@ -232,7 +793,9 @@ class SoftmaxLayer(Layer):
Returns:
outputs: Array of layer outputs of shape (batch_size, output_dim).
"""
exp_inputs = np.exp(inputs)
# subtract max inside exponential to improve numerical stability -
# when we divide through by sum this term cancels
exp_inputs = np.exp(inputs - inputs.max(-1)[:, None])
return exp_inputs / exp_inputs.sum(-1)[:, None]
def bprop(self, inputs, outputs, grads_wrt_outputs):
@ -258,6 +821,7 @@ class SoftmaxLayer(Layer):
def __repr__(self):
return 'SoftmaxLayer'
class RadialBasisFunctionLayer(Layer):
"""Layer implementing projection to a grid of radial basis functions."""
@ -317,3 +881,122 @@ class RadialBasisFunctionLayer(Layer):
def __repr__(self):
return 'RadialBasisFunctionLayer(grid_dim={0})'.format(self.grid_dim)
class DropoutLayer(StochasticLayer):
"""Layer which stochastically drops input dimensions in its output."""
def __init__(self, rng=None, incl_prob=0.5, share_across_batch=True):
"""Construct a new dropout layer.
Args:
rng (RandomState): Seeded random number generator.
incl_prob: Scalar value in (0, 1] specifying the probability of
each input dimension being included in the output.
share_across_batch: Whether to use same dropout mask across
all inputs in a batch or use per input masks.
"""
super(DropoutLayer, self).__init__(rng)
assert incl_prob > 0. and incl_prob <= 1.
self.incl_prob = incl_prob
self.share_across_batch = share_across_batch
self.rng = rng
def fprop(self, inputs, stochastic=True):
"""Forward propagates activations through the layer transformation.
Args:
inputs: Array of layer inputs of shape (batch_size, input_dim).
stochastic: Flag allowing different deterministic
forward-propagation mode in addition to default stochastic
forward-propagation e.g. for use at test time. If False
a deterministic forward-propagation transformation
corresponding to the expected output of the stochastic
forward-propagation is applied.
Returns:
outputs: Array of layer outputs of shape (batch_size, output_dim).
"""
if stochastic:
mask_shape = (1,) + inputs.shape[1:] if self.share_across_batch else inputs.shape
self._mask = (self.rng.uniform(size=mask_shape) < self.incl_prob)
return inputs * self._mask
else:
return inputs * self.incl_prob
def bprop(self, inputs, outputs, grads_wrt_outputs):
"""Back propagates gradients through a layer.
Given gradients with respect to the outputs of the layer calculates the
gradients with respect to the layer inputs. This should correspond to
default stochastic forward-propagation.
Args:
inputs: Array of layer inputs of shape (batch_size, input_dim).
outputs: Array of layer outputs calculated in forward pass of
shape (batch_size, output_dim).
grads_wrt_outputs: Array of gradients with respect to the layer
outputs of shape (batch_size, output_dim).
Returns:
Array of gradients with respect to the layer inputs of shape
(batch_size, input_dim).
"""
return grads_wrt_outputs * self._mask
def __repr__(self):
return 'DropoutLayer(incl_prob={0:.1f})'.format(self.incl_prob)
class ReshapeLayer(Layer):
"""Layer which reshapes dimensions of inputs."""
def __init__(self, output_shape=None):
"""Create a new reshape layer object.
Args:
output_shape: Tuple specifying shape each input in batch should
be reshaped to in outputs. This **excludes** the batch size
so the shape of the final output array will be
(batch_size, ) + output_shape
Similarly to numpy.reshape, one shape dimension can be -1. In
this case, the value is inferred from the size of the input
array and remaining dimensions. The shape specified must be
compatible with the input array shape - i.e. the total number
of values in the array cannot be changed. If set to `None` the
output shape will be set to
(batch_size, -1)
which will flatten all the inputs to vectors.
"""
self.output_shape = (-1,) if output_shape is None else output_shape
def fprop(self, inputs):
"""Forward propagates activations through the layer transformation.
Args:
inputs: Array of layer inputs of shape (batch_size, input_dim).
Returns:
outputs: Array of layer outputs of shape (batch_size, output_dim).
"""
return inputs.reshape((inputs.shape[0],) + self.output_shape)
def bprop(self, inputs, outputs, grads_wrt_outputs):
"""Back propagates gradients through a layer.
Given gradients with respect to the outputs of the layer calculates the
gradients with respect to the layer inputs.
Args:
inputs: Array of layer inputs of shape (batch_size, input_dim).
outputs: Array of layer outputs calculated in forward pass of
shape (batch_size, output_dim).
grads_wrt_outputs: Array of gradients with respect to the layer
outputs of shape (batch_size, output_dim).
Returns:
Array of gradients with respect to the layer inputs of shape
(batch_size, input_dim).
"""
return grads_wrt_outputs.reshape(inputs.shape)
def __repr__(self):
return 'ReshapeLayer(output_shape={0})'.format(self.output_shape)

View File

@ -8,7 +8,7 @@ outputs (and intermediate states) and for calculating gradients of scalar
functions of the outputs with respect to the model parameters.
"""
from mlp.layers import LayerWithParameters
from mlp.layers import LayerWithParameters, StochasticLayer, StochasticLayerWithParameters
class SingleLayerModel(object):
@ -60,7 +60,7 @@ class SingleLayerModel(object):
return self.layer.grads_wrt_params(activations[0], grads_wrt_outputs)
def __repr__(self):
return 'SingleLayerModel(' + str(layer) + ')'
return 'SingleLayerModel(' + str(self.layer) + ')'
class MultipleLayerModel(object):
@ -84,7 +84,7 @@ class MultipleLayerModel(object):
params += layer.params
return params
def fprop(self, inputs):
def fprop(self, inputs, evaluation=False):
"""Forward propagates a batch of inputs through the model.
Args:
@ -97,7 +97,19 @@ class MultipleLayerModel(object):
"""
activations = [inputs]
for i, layer in enumerate(self.layers):
activations.append(self.layers[i].fprop(activations[i]))
if evaluation:
if issubclass(type(self.layers[i]), StochasticLayer) or issubclass(type(self.layers[i]),
StochasticLayerWithParameters):
current_activations = self.layers[i].fprop(activations[i], stochastic=False)
else:
current_activations = self.layers[i].fprop(activations[i])
else:
if issubclass(type(self.layers[i]), StochasticLayer) or issubclass(type(self.layers[i]),
StochasticLayerWithParameters):
current_activations = self.layers[i].fprop(activations[i], stochastic=True)
else:
current_activations = self.layers[i].fprop(activations[i])
activations.append(current_activations)
return activations
def grads_wrt_params(self, activations, grads_wrt_outputs):

View File

@ -9,7 +9,7 @@ import time
import logging
from collections import OrderedDict
import numpy as np
import tqdm
logger = logging.getLogger(__name__)
@ -18,7 +18,7 @@ class Optimiser(object):
"""Basic model optimiser."""
def __init__(self, model, error, learning_rule, train_dataset,
valid_dataset=None, data_monitors=None):
valid_dataset=None, data_monitors=None, notebook=False):
"""Create a new optimiser instance.
Args:
@ -43,6 +43,11 @@ class Optimiser(object):
self.data_monitors = OrderedDict([('error', error)])
if data_monitors is not None:
self.data_monitors.update(data_monitors)
self.notebook = notebook
if notebook:
self.tqdm_progress = tqdm.tqdm_notebook
else:
self.tqdm_progress = tqdm.tqdm
def do_training_epoch(self):
"""Do a single training epoch.
@ -52,12 +57,15 @@ class Optimiser(object):
respect to all the model parameters and then updates the model
parameters according to the learning rule.
"""
for inputs_batch, targets_batch in self.train_dataset:
activations = self.model.fprop(inputs_batch)
grads_wrt_outputs = self.error.grad(activations[-1], targets_batch)
grads_wrt_params = self.model.grads_wrt_params(
activations, grads_wrt_outputs)
self.learning_rule.update_params(grads_wrt_params)
with self.tqdm_progress(total=self.train_dataset.num_batches) as train_progress_bar:
train_progress_bar.set_description("Epoch Progress")
for inputs_batch, targets_batch in self.train_dataset:
activations = self.model.fprop(inputs_batch)
grads_wrt_outputs = self.error.grad(activations[-1], targets_batch)
grads_wrt_params = self.model.grads_wrt_params(
activations, grads_wrt_outputs)
self.learning_rule.update_params(grads_wrt_params)
train_progress_bar.update(1)
def eval_monitors(self, dataset, label):
"""Evaluates the monitors for the given dataset.
@ -72,7 +80,7 @@ class Optimiser(object):
data_mon_vals = OrderedDict([(key + label, 0.) for key
in self.data_monitors.keys()])
for inputs_batch, targets_batch in dataset:
activations = self.model.fprop(inputs_batch)
activations = self.model.fprop(inputs_batch, evaluation=True)
for key, data_monitor in self.data_monitors.items():
data_mon_vals[key + label] += data_monitor(
activations[-1], targets_batch)
@ -121,17 +129,20 @@ class Optimiser(object):
and the second being a dict mapping the labels for the statistics
recorded to their column index in the array.
"""
start_train_time = time.clock()
start_train_time = time.time()
run_stats = [list(self.get_epoch_stats().values())]
for epoch in range(1, num_epochs + 1):
start_time = time.clock()
self.do_training_epoch()
epoch_time = time.clock() - start_time
if epoch % stats_interval == 0:
stats = self.get_epoch_stats()
self.log_stats(epoch, epoch_time, stats)
run_stats.append(list(stats.values()))
finish_train_time = time.clock()
with self.tqdm_progress(total=num_epochs) as progress_bar:
progress_bar.set_description("Experiment Progress")
for epoch in range(1, num_epochs + 1):
start_time = time.time()
self.do_training_epoch()
epoch_time = time.time()- start_time
if epoch % stats_interval == 0:
stats = self.get_epoch_stats()
self.log_stats(epoch, epoch_time, stats)
run_stats.append(list(stats.values()))
progress_bar.update(1)
finish_train_time = time.time()
total_train_time = finish_train_time - start_train_time
return np.array(run_stats), {k: i for i, k in enumerate(stats.keys())}, total_train_time

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,152 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from mlp.layers import BatchNormalizationLayer\n",
"test_inputs = np.array([[-1.38066782, -0.94725498, -3.05585424, 2.28644454, 0.85520889,\n",
" 0.10575624, 0.23618609, 0.84723205, 1.06569909, -2.21704034],\n",
" [ 0.11060968, -0.0747448 , 0.56809029, 2.45926149, -2.28677816,\n",
" -0.9964566 , 2.7356007 , 1.98002308, -0.39032315, 1.46515481]])\n",
"test_grads_wrt_outputs = np.array([[-0.43857052, 1.00380109, -1.18425494, 0.00486091, 0.21470207,\n",
" -0.12179054, -0.11508482, 0.738482 , -1.17249238, 0.69188295],\n",
" [ 1.07802015, 0.69901145, 0.81603688, -1.76743026, -1.24418692,\n",
" -0.65729963, -0.50834305, -0.49016145, 1.63749743, -0.71123104]])\n",
"\n",
"#produce BatchNorm fprop and bprop\n",
"activation_layer = BatchNormalizationLayer(input_dim=10)\n",
"\n",
"beta = np.array(10*[0.3])\n",
"gamma = np.array(10*[0.5])\n",
"\n",
"activation_layer.params = [gamma, beta]\n",
"BN_fprop = activation_layer.fprop(test_inputs)\n",
"BN_bprop = activation_layer.bprop(\n",
" test_inputs, BN_fprop, test_grads_wrt_outputs)\n",
"BN_grads_wrt_params = activation_layer.grads_wrt_params(\n",
" test_inputs, test_grads_wrt_outputs)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"true_fprop_outputs = np.array([[-0.1999955 , -0.19998686, -0.19999924, -0.1996655 , 0.79999899,\n",
" 0.79999177, -0.1999984 , -0.19999221, 0.79999528, -0.19999926],\n",
" [ 0.7999955 , 0.79998686, 0.79999924, 0.7996655 , -0.19999899,\n",
" -0.19999177, 0.7999984 , 0.79999221, -0.19999528, 0.79999926]])\n",
"assert BN_fprop.shape == true_fprop_outputs.shape, (\n",
" 'Layer bprop returns incorrect shaped array. '\n",
" 'Correct shape is \\n\\n{0}\\n\\n but returned shape is \\n\\n{1}.'\n",
" .format(true_fprop_outputs.shape, BN_fprop.shape)\n",
")\n",
"assert np.allclose(np.round(BN_fprop, decimals=2), np.round(true_fprop_outputs, decimals=2)), (\n",
"'Layer bprop does not return correct values. '\n",
"'Correct output is \\n\\n{0}\\n\\n but returned output is \\n\\n{1}\\n\\n difference is \\n\\n{2}'\n",
".format(true_fprop_outputs, BN_fprop, BN_fprop-true_fprop_outputs)\n",
")\n",
"\n",
"print(\"Batch Normalization F-prop test passed\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"true_bprop_outputs = np.array([[ -9.14558020e-06, 9.17665617e-06, -8.40575535e-07,\n",
" 6.85384297e-03, 9.40668131e-07, 7.99795574e-06,\n",
" 5.03719464e-07, 1.69038704e-05, -1.82061629e-05,\n",
" 5.62083224e-07],\n",
" [ 9.14558020e-06, -9.17665617e-06, 8.40575535e-07,\n",
" -6.85384297e-03, -9.40668131e-07, -7.99795574e-06,\n",
" -5.03719464e-07, -1.69038704e-05, 1.82061629e-05,\n",
" -5.62083224e-07]])\n",
"assert BN_bprop.shape == true_bprop_outputs.shape, (\n",
" 'Layer bprop returns incorrect shaped array. '\n",
" 'Correct shape is \\n\\n{0}\\n\\n but returned shape is \\n\\n{1}.'\n",
" .format(true_bprop_outputs.shape, BN_bprop.shape)\n",
")\n",
"assert np.allclose(np.round(BN_bprop, decimals=2), np.round(true_bprop_outputs, decimals=2)), (\n",
"'Layer bprop does not return correct values. '\n",
"'Correct output is \\n\\n{0}\\n\\n but returned output is \\n\\n{1}\\n\\n difference is \\n\\n{2}'\n",
".format(true_bprop_outputs, BN_bprop, BN_bprop-true_bprop_outputs)\n",
")\n",
"\n",
"print(\"Batch Normalization B-prop test passed\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"grads_wrt_gamma, grads_wrt_beta = BN_grads_wrt_params\n",
"true_grads_wrt_gamma = np.array(([ 1.51657703, -0.30478163, 2.00028878, -1.77110552, 1.45888603,\n",
" 0.53550028, -0.39325697, -1.2286243 , -2.8099633 , -1.40311192]))\n",
"true_grads_wrt_beta = np.array([ 0.63944963, 1.70281254, -0.36821806, -1.76256935, -1.02948485,\n",
" -0.77909018, -0.62342786, 0.24832055, 0.46500505, -0.01934809])\n",
"\n",
"assert grads_wrt_gamma.shape == true_grads_wrt_gamma.shape, (\n",
" 'Layer bprop returns incorrect shaped array. '\n",
" 'Correct shape is \\n\\n{0}\\n\\n but returned shape is \\n\\n{1}.'\n",
" .format(true_grads_wrt_gamma.shape, grads_wrt_gamma.shape)\n",
")\n",
"assert np.allclose(np.round(grads_wrt_gamma, decimals=2), np.round(true_grads_wrt_gamma, decimals=2)), (\n",
"'Layer bprop does not return correct values. '\n",
"'Correct output is \\n\\n{0}\\n\\n but returned output is \\n\\n{1}\\n\\n difference is \\n\\n{2}'\n",
".format(true_grads_wrt_gamma, grads_wrt_gamma, grads_wrt_gamma-true_grads_wrt_gamma)\n",
")\n",
"\n",
"assert grads_wrt_beta.shape == true_grads_wrt_beta.shape, (\n",
" 'Layer bprop returns incorrect shaped array. '\n",
" 'Correct shape is \\n\\n{0}\\n\\n but returned shape is \\n\\n{1}.'\n",
" .format(true_grads_wrt_beta.shape, grads_wrt_beta.shape)\n",
")\n",
"assert np.allclose(np.round(grads_wrt_beta, decimals=2), np.round(true_grads_wrt_beta, decimals=2)), (\n",
"'Layer bprop does not return correct values. '\n",
"'Correct output is \\n\\n{0}\\n\\n but returned output is \\n\\n{1}\\n\\n difference is \\n\\n{2}'\n",
".format(true_grads_wrt_beta, grads_wrt_beta, grads_wrt_beta-true_grads_wrt_beta)\n",
")\n",
"\n",
"print(\"Batch Normalization grads wrt to params test passed\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2"
}
},
"nbformat": 4,
"nbformat_minor": 1
}

View File

@ -0,0 +1,307 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Below a skeleton class and associated test functions for the `fprop`, `bprop` and `grads_wrt_params` methods of the ConvolutionalLayer class are included.\n",
"\n",
"The test functions assume that in your implementation of `fprop` for the convolutional layer, outputs are calculated only for 'valid' overlaps of the kernel filters with the input - i.e. without any padding.\n",
"\n",
"It is also assumed that if convolutions with non-unit strides are implemented the default behaviour is to take unit-strides, with the test cases only correct for unit strides in both directions."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The three test functions are defined in the cell below. All the functions take as first argument the *class* corresponding to the convolutional layer implementation to be tested (**not** an instance of the class). It is assumed the class being tested has an `__init__` method with at least all of the arguments defined in the skeleton definition above. A boolean second argument to each function can be used to specify if the layer implements a cross-correlation or convolution based operation (see note in [seventh lecture slides](http://www.inf.ed.ac.uk/teaching/courses/mlp/2016/mlp07-cnn.pdf))."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"def test_conv_layer_fprop(layer_class, do_cross_correlation=False):\n",
" \"\"\"Tests `fprop` method of a convolutional layer.\n",
" \n",
" Checks the outputs of `fprop` method for a fixed input against known\n",
" reference values for the outputs and raises an AssertionError if\n",
" the outputted values are not consistent with the reference values. If\n",
" tests are all passed returns True.\n",
" \n",
" Args:\n",
" layer_class: Convolutional layer implementation following the \n",
" interface defined in the provided skeleton class.\n",
" do_cross_correlation: Whether the layer implements an operation\n",
" corresponding to cross-correlation (True) i.e kernels are\n",
" not flipped before sliding over inputs, or convolution\n",
" (False) with filters being flipped.\n",
"\n",
" Raises:\n",
" AssertionError: Raised if output of `layer.fprop` is inconsistent \n",
" with reference values either in shape or values.\n",
" \"\"\"\n",
" inputs = np.arange(96).reshape((2, 3, 4, 4))\n",
" kernels = np.arange(-12, 12).reshape((2, 3, 2, 2))\n",
" if do_cross_correlation:\n",
" kernels = kernels[:, :, ::-1, ::-1]\n",
" biases = np.arange(2)\n",
" true_output = np.array(\n",
" [[[[ -958., -1036., -1114.],\n",
" [-1270., -1348., -1426.],\n",
" [-1582., -1660., -1738.]],\n",
" [[ 1707., 1773., 1839.],\n",
" [ 1971., 2037., 2103.],\n",
" [ 2235., 2301., 2367.]]],\n",
" [[[-4702., -4780., -4858.],\n",
" [-5014., -5092., -5170.],\n",
" [-5326., -5404., -5482.]],\n",
" [[ 4875., 4941., 5007.],\n",
" [ 5139., 5205., 5271.],\n",
" [ 5403., 5469., 5535.]]]]\n",
" )\n",
" \n",
" layer = layer_class(\n",
" num_input_channels=kernels.shape[1], \n",
" num_output_channels=kernels.shape[0], \n",
" input_dim_1=inputs.shape[2], \n",
" input_dim_2=inputs.shape[3],\n",
" kernel_dim_1=kernels.shape[2],\n",
" kernel_dim_2=kernels.shape[3]\n",
" )\n",
" layer.params = [kernels, biases]\n",
" layer_output = layer.fprop(inputs)\n",
" \n",
" assert layer_output.shape == true_output.shape, (\n",
" 'Layer fprop gives incorrect shaped output. '\n",
" 'Correct shape is \\n\\n{0}\\n\\n but returned shape is \\n\\n{1}.'\n",
" .format(true_output.shape, layer_output.shape)\n",
" )\n",
" assert np.allclose(layer_output, true_output), (\n",
" 'Layer fprop does not give correct output. '\n",
" 'Correct output is \\n\\n{0}\\n\\n but returned output is \\n\\n{1}\\n\\n difference is \\n\\n{2}.'\n",
" .format(true_output, layer_output, true_output-layer_output)\n",
" )\n",
" return True\n",
"\n",
"def test_conv_layer_bprop(layer_class, do_cross_correlation=False):\n",
" \"\"\"Tests `bprop` method of a convolutional layer.\n",
" \n",
" Checks the outputs of `bprop` method for a fixed input against known\n",
" reference values for the gradients with respect to inputs and raises \n",
" an AssertionError if the returned values are not consistent with the\n",
" reference values. If tests are all passed returns True.\n",
" \n",
" Args:\n",
" layer_class: Convolutional layer implementation following the \n",
" interface defined in the provided skeleton class.\n",
" do_cross_correlation: Whether the layer implements an operation\n",
" corresponding to cross-correlation (True) i.e kernels are\n",
" not flipped before sliding over inputs, or convolution\n",
" (False) with filters being flipped.\n",
"\n",
" Raises:\n",
" AssertionError: Raised if output of `layer.bprop` is inconsistent \n",
" with reference values either in shape or values.\n",
" \"\"\"\n",
" inputs = np.arange(96).reshape((2, 3, 4, 4))\n",
" kernels = np.arange(-12, 12).reshape((2, 3, 2, 2))\n",
" if do_cross_correlation:\n",
" kernels = kernels[:, :, ::-1, ::-1]\n",
" biases = np.arange(2)\n",
" grads_wrt_outputs = np.arange(-20, 16).reshape((2, 2, 3, 3))\n",
" outputs = np.array(\n",
" [[[[ -958., -1036., -1114.],\n",
" [-1270., -1348., -1426.],\n",
" [-1582., -1660., -1738.]],\n",
" [[ 1707., 1773., 1839.],\n",
" [ 1971., 2037., 2103.],\n",
" [ 2235., 2301., 2367.]]],\n",
" [[[-4702., -4780., -4858.],\n",
" [-5014., -5092., -5170.],\n",
" [-5326., -5404., -5482.]],\n",
" [[ 4875., 4941., 5007.],\n",
" [ 5139., 5205., 5271.],\n",
" [ 5403., 5469., 5535.]]]]\n",
" )\n",
" true_grads_wrt_inputs = np.array(\n",
" [[[[ 147., 319., 305., 162.],\n",
" [ 338., 716., 680., 354.],\n",
" [ 290., 608., 572., 294.],\n",
" [ 149., 307., 285., 144.]],\n",
" [[ 23., 79., 81., 54.],\n",
" [ 114., 284., 280., 162.],\n",
" [ 114., 272., 268., 150.],\n",
" [ 73., 163., 157., 84.]],\n",
" [[-101., -161., -143., -54.],\n",
" [-110., -148., -120., -30.],\n",
" [ -62., -64., -36., 6.],\n",
" [ -3., 19., 29., 24.]]],\n",
" [[[ 39., 67., 53., 18.],\n",
" [ 50., 68., 32., -6.],\n",
" [ 2., -40., -76., -66.],\n",
" [ -31., -89., -111., -72.]],\n",
" [[ 59., 115., 117., 54.],\n",
" [ 114., 212., 208., 90.],\n",
" [ 114., 200., 196., 78.],\n",
" [ 37., 55., 49., 12.]],\n",
" [[ 79., 163., 181., 90.],\n",
" [ 178., 356., 384., 186.],\n",
" [ 226., 440., 468., 222.],\n",
" [ 105., 199., 209., 96.]]]])\n",
" layer = layer_class(\n",
" num_input_channels=kernels.shape[1], \n",
" num_output_channels=kernels.shape[0], \n",
" input_dim_1=inputs.shape[2], \n",
" input_dim_2=inputs.shape[3],\n",
" kernel_dim_1=kernels.shape[2],\n",
" kernel_dim_2=kernels.shape[3]\n",
" )\n",
" layer.params = [kernels, biases]\n",
" layer_grads_wrt_inputs = layer.bprop(inputs, outputs, grads_wrt_outputs)\n",
" assert layer_grads_wrt_inputs.shape == true_grads_wrt_inputs.shape, (\n",
" 'Layer bprop returns incorrect shaped array. '\n",
" 'Correct shape is \\n\\n{0}\\n\\n but returned shape is \\n\\n{1}.'\n",
" .format(true_grads_wrt_inputs.shape, layer_grads_wrt_inputs.shape)\n",
" )\n",
" assert np.allclose(layer_grads_wrt_inputs, true_grads_wrt_inputs), (\n",
" 'Layer bprop does not return correct values. '\n",
" 'Correct output is \\n\\n{0}\\n\\n but returned output is \\n\\n{1}\\n\\n difference is \\n\\n{2}'\n",
" .format(true_grads_wrt_inputs, layer_grads_wrt_inputs, layer_grads_wrt_inputs-true_grads_wrt_inputs)\n",
" )\n",
" return True\n",
"\n",
"def test_conv_layer_grad_wrt_params(\n",
" layer_class, do_cross_correlation=False):\n",
" \"\"\"Tests `grad_wrt_params` method of a convolutional layer.\n",
" \n",
" Checks the outputs of `grad_wrt_params` method for fixed inputs \n",
" against known reference values for the gradients with respect to \n",
" kernels and biases, and raises an AssertionError if the returned\n",
" values are not consistent with the reference values. If tests\n",
" are all passed returns True.\n",
" \n",
" Args:\n",
" layer_class: Convolutional layer implementation following the \n",
" interface defined in the provided skeleton class.\n",
" do_cross_correlation: Whether the layer implements an operation\n",
" corresponding to cross-correlation (True) i.e kernels are\n",
" not flipped before sliding over inputs, or convolution\n",
" (False) with filters being flipped.\n",
"\n",
" Raises:\n",
" AssertionError: Raised if output of `layer.bprop` is inconsistent \n",
" with reference values either in shape or values.\n",
" \"\"\"\n",
" inputs = np.arange(96).reshape((2, 3, 4, 4))\n",
" kernels = np.arange(-12, 12).reshape((2, 3, 2, 2))\n",
" biases = np.arange(2)\n",
" grads_wrt_outputs = np.arange(-20, 16).reshape((2, 2, 3, 3))\n",
" true_kernel_grads = np.array(\n",
" [[[[ -240., -114.],\n",
" [ 264., 390.]],\n",
" [[-2256., -2130.],\n",
" [-1752., -1626.]],\n",
" [[-4272., -4146.],\n",
" [-3768., -3642.]]],\n",
" [[[ 5268., 5232.],\n",
" [ 5124., 5088.]],\n",
" [[ 5844., 5808.],\n",
" [ 5700., 5664.]],\n",
" [[ 6420., 6384.],\n",
" [ 6276., 6240.]]]])\n",
" if do_cross_correlation:\n",
" kernels = kernels[:, :, ::-1, ::-1]\n",
" true_kernel_grads = true_kernel_grads[:, :, ::-1, ::-1]\n",
" true_bias_grads = np.array([-126., 36.])\n",
" layer = layer_class(\n",
" num_input_channels=kernels.shape[1], \n",
" num_output_channels=kernels.shape[0], \n",
" input_dim_1=inputs.shape[2], \n",
" input_dim_2=inputs.shape[3],\n",
" kernel_dim_1=kernels.shape[2],\n",
" kernel_dim_2=kernels.shape[3]\n",
" )\n",
" layer.params = [kernels, biases]\n",
" layer_kernel_grads, layer_bias_grads = (\n",
" layer.grads_wrt_params(inputs, grads_wrt_outputs))\n",
" assert layer_kernel_grads.shape == true_kernel_grads.shape, (\n",
" 'grads_wrt_params gives incorrect shaped kernel gradients output. '\n",
" 'Correct shape is \\n\\n{0}\\n\\n but returned shape is \\n\\n{1}.'\n",
" .format(true_kernel_grads.shape, layer_kernel_grads.shape)\n",
" )\n",
" assert np.allclose(layer_kernel_grads, true_kernel_grads), (\n",
" 'grads_wrt_params does not give correct kernel gradients output. '\n",
" 'Correct output is \\n\\n{0}\\n\\n but returned output is \\n\\n{1}.'\n",
" .format(true_kernel_grads, layer_kernel_grads)\n",
" )\n",
" assert layer_bias_grads.shape == true_bias_grads.shape, (\n",
" 'grads_wrt_params gives incorrect shaped bias gradients output. '\n",
" 'Correct shape is \\n\\n{0}\\n\\n but returned shape is \\n\\n{1}.'\n",
" .format(true_bias_grads.shape, layer_bias_grads.shape)\n",
" )\n",
" assert np.allclose(layer_bias_grads, true_bias_grads), (\n",
" 'grads_wrt_params does not give correct bias gradients output. '\n",
" 'Correct output is \\n\\n{0}\\n\\n but returned output is \\n\\n{1}.'\n",
" .format(true_bias_grads, layer_bias_grads)\n",
" )\n",
" return True"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"An example of using the test functions if given in the cell below. This assumes you implement a convolution (rather than cross-correlation) operation. If the implementation is correct "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from mlp.layers import ConvolutionalLayer\n",
"fprop_correct = test_conv_layer_fprop(ConvolutionalLayer, False)\n",
"bprop_correct = test_conv_layer_bprop(ConvolutionalLayer, False)\n",
"grads_wrt_param_correct = test_conv_layer_grad_wrt_params(ConvolutionalLayer, False)\n",
"if fprop_correct and grads_wrt_param_correct and bprop_correct:\n",
" print('All tests passed.')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2"
}
},
"nbformat": 4,
"nbformat_minor": 1
}

View File

@ -0,0 +1,147 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Coursework 2\n",
"\n",
"This notebook is intended to be used as a starting point for your experiments. The instructions can be found in the instructions file located under spec/coursework2.pdf. The methods provided here are just helper functions. If you want more complex graphs such as side by side comparisons of different experiments you should learn more about matplotlib and implement them. Before each experiment remember to re-initialize neural network weights and reset the data providers so you get a properly initialized experiment. For each experiment try to keep most hyperparameters the same except the one under investigation so you can understand what the effects of each are."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"%matplotlib inline\n",
"plt.style.use('ggplot')\n",
"\n",
"def train_model_and_plot_stats(\n",
" model, error, learning_rule, train_data, valid_data, num_epochs, stats_interval, notebook=True):\n",
" \n",
" # As well as monitoring the error over training also monitor classification\n",
" # accuracy i.e. proportion of most-probable predicted classes being equal to targets\n",
" data_monitors={'acc': lambda y, t: (y.argmax(-1) == t.argmax(-1)).mean()}\n",
"\n",
" # Use the created objects to initialise a new Optimiser instance.\n",
" optimiser = Optimiser(\n",
" model, error, learning_rule, train_data, valid_data, data_monitors, notebook=notebook)\n",
"\n",
" # Run the optimiser for 5 epochs (full passes through the training set)\n",
" # printing statistics every epoch.\n",
" stats, keys, run_time = optimiser.train(num_epochs=num_epochs, stats_interval=stats_interval)\n",
"\n",
" # Plot the change in the validation and training set error over training.\n",
" fig_1 = plt.figure(figsize=(8, 4))\n",
" ax_1 = fig_1.add_subplot(111)\n",
" for k in ['error(train)', 'error(valid)']:\n",
" ax_1.plot(np.arange(1, stats.shape[0]) * stats_interval, \n",
" stats[1:, keys[k]], label=k)\n",
" ax_1.legend(loc=0)\n",
" ax_1.set_xlabel('Epoch number')\n",
"\n",
" # Plot the change in the validation and training set accuracy over training.\n",
" fig_2 = plt.figure(figsize=(8, 4))\n",
" ax_2 = fig_2.add_subplot(111)\n",
" for k in ['acc(train)', 'acc(valid)']:\n",
" ax_2.plot(np.arange(1, stats.shape[0]) * stats_interval, \n",
" stats[1:, keys[k]], label=k)\n",
" ax_2.legend(loc=0)\n",
" ax_2.set_xlabel('Epoch number')\n",
" \n",
" return stats, keys, run_time, fig_1, ax_1, fig_2, ax_2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# The below code will set up the data providers, random number\n",
"# generator and logger objects needed for training runs. As\n",
"# loading the data from file take a little while you generally\n",
"# will probably not want to reload the data providers on\n",
"# every training run. If you wish to reset their state you\n",
"# should instead use the .reset() method of the data providers.\n",
"import numpy as np\n",
"import logging\n",
"from mlp.data_providers import MNISTDataProvider, EMNISTDataProvider\n",
"\n",
"# Seed a random number generator\n",
"seed = 10102016 \n",
"rng = np.random.RandomState(seed)\n",
"batch_size = 100\n",
"# Set up a logger object to print info about the training run to stdout\n",
"logger = logging.getLogger()\n",
"logger.setLevel(logging.INFO)\n",
"logger.handlers = [logging.StreamHandler()]\n",
"\n",
"# Create data provider objects for the MNIST data set\n",
"train_data = EMNISTDataProvider('train', batch_size=batch_size, rng=rng)\n",
"valid_data = EMNISTDataProvider('valid', batch_size=batch_size, rng=rng)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# The model set up code below is provided as a starting point.\n",
"# You will probably want to add further code cells for the\n",
"# different experiments you run.\n",
"\n",
"from mlp.layers import AffineLayer, SoftmaxLayer, SigmoidLayer, ReluLayer, LeakyReluLayer, ELULayer, SELULayer\n",
"from mlp.errors import CrossEntropySoftmaxError\n",
"from mlp.models import MultipleLayerModel\n",
"from mlp.initialisers import ConstantInit, GlorotUniformInit\n",
"from mlp.learning_rules import GradientDescentLearningRule\n",
"from mlp.optimisers import Optimiser\n",
"\n",
"#setup hyperparameters\n",
"learning_rate = 0.1\n",
"num_epochs = 100\n",
"stats_interval = 1\n",
"input_dim, output_dim, hidden_dim = 784, 47, 100\n",
"\n",
"weights_init = GlorotUniformInit(rng=rng)\n",
"biases_init = ConstantInit(0.)\n",
"model = MultipleLayerModel([\n",
" AffineLayer(input_dim, hidden_dim, weights_init, biases_init), \n",
" ReluLayer(),\n",
" AffineLayer(hidden_dim, hidden_dim, weights_init, biases_init), \n",
" ReluLayer(),\n",
" AffineLayer(hidden_dim, output_dim, weights_init, biases_init)\n",
"])\n",
"\n",
"error = CrossEntropySoftmaxError()\n",
"# Use a basic gradient descent learning rule\n",
"learning_rule = GradientDescentLearningRule(learning_rate=learning_rate)\n",
"\n",
"#Remember to use notebook=False when you write a script to be run in a terminal\n",
"_ = train_model_and_plot_stats(\n",
" model, error, learning_rule, train_data, valid_data, num_epochs, stats_interval, notebook=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
}
},
"nbformat": 4,
"nbformat_minor": 1
}

79
report/algorithm.sty Normal file
View File

@ -0,0 +1,79 @@
% ALGORITHM STYLE -- Released 8 April 1996
% for LaTeX-2e
% Copyright -- 1994 Peter Williams
% E-mail Peter.Williams@dsto.defence.gov.au
\NeedsTeXFormat{LaTeX2e}
\ProvidesPackage{algorithm}
\typeout{Document Style `algorithm' - floating environment}
\RequirePackage{float}
\RequirePackage{ifthen}
\newcommand{\ALG@within}{nothing}
\newboolean{ALG@within}
\setboolean{ALG@within}{false}
\newcommand{\ALG@floatstyle}{ruled}
\newcommand{\ALG@name}{Algorithm}
\newcommand{\listalgorithmname}{List of \ALG@name s}
% Declare Options
% first appearance
\DeclareOption{plain}{
\renewcommand{\ALG@floatstyle}{plain}
}
\DeclareOption{ruled}{
\renewcommand{\ALG@floatstyle}{ruled}
}
\DeclareOption{boxed}{
\renewcommand{\ALG@floatstyle}{boxed}
}
% then numbering convention
\DeclareOption{part}{
\renewcommand{\ALG@within}{part}
\setboolean{ALG@within}{true}
}
\DeclareOption{chapter}{
\renewcommand{\ALG@within}{chapter}
\setboolean{ALG@within}{true}
}
\DeclareOption{section}{
\renewcommand{\ALG@within}{section}
\setboolean{ALG@within}{true}
}
\DeclareOption{subsection}{
\renewcommand{\ALG@within}{subsection}
\setboolean{ALG@within}{true}
}
\DeclareOption{subsubsection}{
\renewcommand{\ALG@within}{subsubsection}
\setboolean{ALG@within}{true}
}
\DeclareOption{nothing}{
\renewcommand{\ALG@within}{nothing}
\setboolean{ALG@within}{true}
}
\DeclareOption*{\edef\ALG@name{\CurrentOption}}
% ALGORITHM
%
\ProcessOptions
\floatstyle{\ALG@floatstyle}
\ifthenelse{\boolean{ALG@within}}{
\ifthenelse{\equal{\ALG@within}{part}}
{\newfloat{algorithm}{htbp}{loa}[part]}{}
\ifthenelse{\equal{\ALG@within}{chapter}}
{\newfloat{algorithm}{htbp}{loa}[chapter]}{}
\ifthenelse{\equal{\ALG@within}{section}}
{\newfloat{algorithm}{htbp}{loa}[section]}{}
\ifthenelse{\equal{\ALG@within}{subsection}}
{\newfloat{algorithm}{htbp}{loa}[subsection]}{}
\ifthenelse{\equal{\ALG@within}{subsubsection}}
{\newfloat{algorithm}{htbp}{loa}[subsubsection]}{}
\ifthenelse{\equal{\ALG@within}{nothing}}
{\newfloat{algorithm}{htbp}{loa}}{}
}{
\newfloat{algorithm}{htbp}{loa}
}
\floatname{algorithm}{\ALG@name}
\newcommand{\listofalgorithms}{\listof{algorithm}{\listalgorithmname}}

201
report/algorithmic.sty Normal file
View File

@ -0,0 +1,201 @@
% ALGORITHMIC STYLE -- Released 8 APRIL 1996
% for LaTeX version 2e
% Copyright -- 1994 Peter Williams
% E-mail PeterWilliams@dsto.defence.gov.au
%
% Modified by Alex Smola (08/2000)
% E-mail Alex.Smola@anu.edu.au
%
\NeedsTeXFormat{LaTeX2e}
\ProvidesPackage{algorithmic}
\typeout{Document Style `algorithmic' - environment}
%
\RequirePackage{ifthen}
\RequirePackage{calc}
\newboolean{ALC@noend}
\setboolean{ALC@noend}{false}
\newcounter{ALC@line}
\newcounter{ALC@rem}
\newlength{\ALC@tlm}
%
\DeclareOption{noend}{\setboolean{ALC@noend}{true}}
%
\ProcessOptions
%
% ALGORITHMIC
\newcommand{\algorithmicrequire}{\textbf{Require:}}
\newcommand{\algorithmicensure}{\textbf{Ensure:}}
\newcommand{\algorithmiccomment}[1]{\{#1\}}
\newcommand{\algorithmicend}{\textbf{end}}
\newcommand{\algorithmicif}{\textbf{if}}
\newcommand{\algorithmicthen}{\textbf{then}}
\newcommand{\algorithmicelse}{\textbf{else}}
\newcommand{\algorithmicelsif}{\algorithmicelse\ \algorithmicif}
\newcommand{\algorithmicendif}{\algorithmicend\ \algorithmicif}
\newcommand{\algorithmicfor}{\textbf{for}}
\newcommand{\algorithmicforall}{\textbf{for all}}
\newcommand{\algorithmicdo}{\textbf{do}}
\newcommand{\algorithmicendfor}{\algorithmicend\ \algorithmicfor}
\newcommand{\algorithmicwhile}{\textbf{while}}
\newcommand{\algorithmicendwhile}{\algorithmicend\ \algorithmicwhile}
\newcommand{\algorithmicloop}{\textbf{loop}}
\newcommand{\algorithmicendloop}{\algorithmicend\ \algorithmicloop}
\newcommand{\algorithmicrepeat}{\textbf{repeat}}
\newcommand{\algorithmicuntil}{\textbf{until}}
%changed by alex smola
\newcommand{\algorithmicinput}{\textbf{input}}
\newcommand{\algorithmicoutput}{\textbf{output}}
\newcommand{\algorithmicset}{\textbf{set}}
\newcommand{\algorithmictrue}{\textbf{true}}
\newcommand{\algorithmicfalse}{\textbf{false}}
\newcommand{\algorithmicand}{\textbf{and\ }}
\newcommand{\algorithmicor}{\textbf{or\ }}
\newcommand{\algorithmicfunction}{\textbf{function}}
\newcommand{\algorithmicendfunction}{\algorithmicend\ \algorithmicfunction}
\newcommand{\algorithmicmain}{\textbf{main}}
\newcommand{\algorithmicendmain}{\algorithmicend\ \algorithmicmain}
%end changed by alex smola
\def\ALC@item[#1]{%
\if@noparitem \@donoparitem
\else \if@inlabel \indent \par \fi
\ifhmode \unskip\unskip \par \fi
\if@newlist \if@nobreak \@nbitem \else
\addpenalty\@beginparpenalty
\addvspace\@topsep \addvspace{-\parskip}\fi
\else \addpenalty\@itempenalty \addvspace\itemsep
\fi
\global\@inlabeltrue
\fi
\everypar{\global\@minipagefalse\global\@newlistfalse
\if@inlabel\global\@inlabelfalse \hskip -\parindent \box\@labels
\penalty\z@ \fi
\everypar{}}\global\@nobreakfalse
\if@noitemarg \@noitemargfalse \if@nmbrlist \refstepcounter{\@listctr}\fi \fi
\sbox\@tempboxa{\makelabel{#1}}%
\global\setbox\@labels
\hbox{\unhbox\@labels \hskip \itemindent
\hskip -\labelwidth \hskip -\ALC@tlm
\ifdim \wd\@tempboxa >\labelwidth
\box\@tempboxa
\else \hbox to\labelwidth {\unhbox\@tempboxa}\fi
\hskip \ALC@tlm}\ignorespaces}
%
\newenvironment{algorithmic}[1][0]{
\let\@item\ALC@item
\newcommand{\ALC@lno}{%
\ifthenelse{\equal{\arabic{ALC@rem}}{0}}
{{\footnotesize \arabic{ALC@line}:}}{}%
}
\let\@listii\@listi
\let\@listiii\@listi
\let\@listiv\@listi
\let\@listv\@listi
\let\@listvi\@listi
\let\@listvii\@listi
\newenvironment{ALC@g}{
\begin{list}{\ALC@lno}{ \itemsep\z@ \itemindent\z@
\listparindent\z@ \rightmargin\z@
\topsep\z@ \partopsep\z@ \parskip\z@\parsep\z@
\leftmargin 1em
\addtolength{\ALC@tlm}{\leftmargin}
}
}
{\end{list}}
\newcommand{\ALC@it}{\addtocounter{ALC@line}{1}\addtocounter{ALC@rem}{1}\ifthenelse{\equal{\arabic{ALC@rem}}{#1}}{\setcounter{ALC@rem}{0}}{}\item}
\newcommand{\ALC@com}[1]{\ifthenelse{\equal{##1}{default}}%
{}{\ \algorithmiccomment{##1}}}
\newcommand{\REQUIRE}{\item[\algorithmicrequire]}
\newcommand{\ENSURE}{\item[\algorithmicensure]}
\newcommand{\STATE}{\ALC@it}
\newcommand{\COMMENT}[1]{\algorithmiccomment{##1}}
%changes by alex smola
\newcommand{\INPUT}{\item[\algorithmicinput]}
\newcommand{\OUTPUT}{\item[\algorithmicoutput]}
\newcommand{\SET}{\item[\algorithmicset]}
% \newcommand{\TRUE}{\algorithmictrue}
% \newcommand{\FALSE}{\algorithmicfalse}
\newcommand{\AND}{\algorithmicand}
\newcommand{\OR}{\algorithmicor}
\newenvironment{ALC@func}{\begin{ALC@g}}{\end{ALC@g}}
\newenvironment{ALC@main}{\begin{ALC@g}}{\end{ALC@g}}
%end changes by alex smola
\newenvironment{ALC@if}{\begin{ALC@g}}{\end{ALC@g}}
\newenvironment{ALC@for}{\begin{ALC@g}}{\end{ALC@g}}
\newenvironment{ALC@whl}{\begin{ALC@g}}{\end{ALC@g}}
\newenvironment{ALC@loop}{\begin{ALC@g}}{\end{ALC@g}}
\newenvironment{ALC@rpt}{\begin{ALC@g}}{\end{ALC@g}}
\renewcommand{\\}{\@centercr}
\newcommand{\IF}[2][default]{\ALC@it\algorithmicif\ ##2\ \algorithmicthen%
\ALC@com{##1}\begin{ALC@if}}
\newcommand{\SHORTIF}[2]{\ALC@it\algorithmicif\ ##1\
\algorithmicthen\ {##2}}
\newcommand{\ELSE}[1][default]{\end{ALC@if}\ALC@it\algorithmicelse%
\ALC@com{##1}\begin{ALC@if}}
\newcommand{\ELSIF}[2][default]%
{\end{ALC@if}\ALC@it\algorithmicelsif\ ##2\ \algorithmicthen%
\ALC@com{##1}\begin{ALC@if}}
\newcommand{\FOR}[2][default]{\ALC@it\algorithmicfor\ ##2\ \algorithmicdo%
\ALC@com{##1}\begin{ALC@for}}
\newcommand{\FORALL}[2][default]{\ALC@it\algorithmicforall\ ##2\ %
\algorithmicdo%
\ALC@com{##1}\begin{ALC@for}}
\newcommand{\SHORTFORALL}[2]{\ALC@it\algorithmicforall\ ##1\ %
\algorithmicdo\ {##2}}
\newcommand{\WHILE}[2][default]{\ALC@it\algorithmicwhile\ ##2\ %
\algorithmicdo%
\ALC@com{##1}\begin{ALC@whl}}
\newcommand{\LOOP}[1][default]{\ALC@it\algorithmicloop%
\ALC@com{##1}\begin{ALC@loop}}
%changed by alex smola
\newcommand{\FUNCTION}[2][default]{\ALC@it\algorithmicfunction\ ##2\ %
\ALC@com{##1}\begin{ALC@func}}
\newcommand{\MAIN}[2][default]{\ALC@it\algorithmicmain\ ##2\ %
\ALC@com{##1}\begin{ALC@main}}
%end changed by alex smola
\newcommand{\REPEAT}[1][default]{\ALC@it\algorithmicrepeat%
\ALC@com{##1}\begin{ALC@rpt}}
\newcommand{\UNTIL}[1]{\end{ALC@rpt}\ALC@it\algorithmicuntil\ ##1}
\ifthenelse{\boolean{ALC@noend}}{
\newcommand{\ENDIF}{\end{ALC@if}}
\newcommand{\ENDFOR}{\end{ALC@for}}
\newcommand{\ENDWHILE}{\end{ALC@whl}}
\newcommand{\ENDLOOP}{\end{ALC@loop}}
\newcommand{\ENDFUNCTION}{\end{ALC@func}}
\newcommand{\ENDMAIN}{\end{ALC@main}}
}{
\newcommand{\ENDIF}{\end{ALC@if}\ALC@it\algorithmicendif}
\newcommand{\ENDFOR}{\end{ALC@for}\ALC@it\algorithmicendfor}
\newcommand{\ENDWHILE}{\end{ALC@whl}\ALC@it\algorithmicendwhile}
\newcommand{\ENDLOOP}{\end{ALC@loop}\ALC@it\algorithmicendloop}
\newcommand{\ENDFUNCTION}{\end{ALC@func}\ALC@it\algorithmicendfunction}
\newcommand{\ENDMAIN}{\end{ALC@main}\ALC@it\algorithmicendmain}
}
\renewcommand{\@toodeep}{}
\begin{list}{\ALC@lno}{\setcounter{ALC@line}{0}\setcounter{ALC@rem}{0}%
\itemsep\z@ \itemindent\z@ \listparindent\z@%
\partopsep\z@ \parskip\z@ \parsep\z@%
\labelsep 0.5em \topsep 0.2em%
\ifthenelse{\equal{#1}{0}}
{\labelwidth 0.5em }
{\labelwidth 1.2em }
\leftmargin\labelwidth \addtolength{\leftmargin}{\labelsep}
\ALC@tlm\labelsep
}
}
{\end{list}}

75
report/example-refs.bib Normal file
View File

@ -0,0 +1,75 @@
@inproceedings{langley00,
author = {P. Langley},
title = {Crafting Papers on Machine Learning},
year = {2000},
pages = {1207--1216},
editor = {Pat Langley},
booktitle = {Proceedings of the 17th International Conference
on Machine Learning (ICML 2000)},
address = {Stanford, CA},
publisher = {Morgan Kaufmann}
}
@TechReport{mitchell80,
author = "T. M. Mitchell",
title = "The Need for Biases in Learning Generalizations",
institution = "Computer Science Department, Rutgers University",
year = "1980",
address = "New Brunswick, MA",
}
@phdthesis{kearns89,
author = {M. J. Kearns},
title = {Computational Complexity of Machine Learning},
school = {Department of Computer Science, Harvard University},
year = {1989}
}
@Book{MachineLearningI,
editor = "R. S. Michalski and J. G. Carbonell and T.
M. Mitchell",
title = "Machine Learning: An Artificial Intelligence
Approach, Vol. I",
publisher = "Tioga",
year = "1983",
address = "Palo Alto, CA"
}
@Book{DudaHart2nd,
author = "R. O. Duda and P. E. Hart and D. G. Stork",
title = "Pattern Classification",
publisher = "John Wiley and Sons",
edition = "2nd",
year = "2000"
}
@misc{anonymous,
title= {Suppressed for Anonymity},
author= {Author, N. N.},
year= {2011},
}
@InCollection{Newell81,
author = "A. Newell and P. S. Rosenbloom",
title = "Mechanisms of Skill Acquisition and the Law of
Practice",
booktitle = "Cognitive Skills and Their Acquisition",
pages = "1--51",
publisher = "Lawrence Erlbaum Associates, Inc.",
year = "1981",
editor = "J. R. Anderson",
chapter = "1",
address = "Hillsdale, NJ"
}
@Article{Samuel59,
author = "A. L. Samuel",
title = "Some Studies in Machine Learning Using the Game of
Checkers",
journal = "IBM Journal of Research and Development",
year = "1959",
volume = "3",
number = "3",
pages = "211--229"
}

485
report/fancyhdr.sty Normal file
View File

@ -0,0 +1,485 @@
% fancyhdr.sty version 3.2
% Fancy headers and footers for LaTeX.
% Piet van Oostrum,
% Dept of Computer and Information Sciences, University of Utrecht,
% Padualaan 14, P.O. Box 80.089, 3508 TB Utrecht, The Netherlands
% Telephone: +31 30 2532180. Email: piet@cs.uu.nl
% ========================================================================
% LICENCE:
% This file may be distributed under the terms of the LaTeX Project Public
% License, as described in lppl.txt in the base LaTeX distribution.
% Either version 1 or, at your option, any later version.
% ========================================================================
% MODIFICATION HISTORY:
% Sep 16, 1994
% version 1.4: Correction for use with \reversemargin
% Sep 29, 1994:
% version 1.5: Added the \iftopfloat, \ifbotfloat and \iffloatpage commands
% Oct 4, 1994:
% version 1.6: Reset single spacing in headers/footers for use with
% setspace.sty or doublespace.sty
% Oct 4, 1994:
% version 1.7: changed \let\@mkboth\markboth to
% \def\@mkboth{\protect\markboth} to make it more robust
% Dec 5, 1994:
% version 1.8: corrections for amsbook/amsart: define \@chapapp and (more
% importantly) use the \chapter/sectionmark definitions from ps@headings if
% they exist (which should be true for all standard classes).
% May 31, 1995:
% version 1.9: The proposed \renewcommand{\headrulewidth}{\iffloatpage...
% construction in the doc did not work properly with the fancyplain style.
% June 1, 1995:
% version 1.91: The definition of \@mkboth wasn't restored on subsequent
% \pagestyle{fancy}'s.
% June 1, 1995:
% version 1.92: The sequence \pagestyle{fancyplain} \pagestyle{plain}
% \pagestyle{fancy} would erroneously select the plain version.
% June 1, 1995:
% version 1.93: \fancypagestyle command added.
% Dec 11, 1995:
% version 1.94: suggested by Conrad Hughes <chughes@maths.tcd.ie>
% CJCH, Dec 11, 1995: added \footruleskip to allow control over footrule
% position (old hardcoded value of .3\normalbaselineskip is far too high
% when used with very small footer fonts).
% Jan 31, 1996:
% version 1.95: call \@normalsize in the reset code if that is defined,
% otherwise \normalsize.
% this is to solve a problem with ucthesis.cls, as this doesn't
% define \@currsize. Unfortunately for latex209 calling \normalsize doesn't
% work as this is optimized to do very little, so there \@normalsize should
% be called. Hopefully this code works for all versions of LaTeX known to
% mankind.
% April 25, 1996:
% version 1.96: initialize \headwidth to a magic (negative) value to catch
% most common cases that people change it before calling \pagestyle{fancy}.
% Note it can't be initialized when reading in this file, because
% \textwidth could be changed afterwards. This is quite probable.
% We also switch to \MakeUppercase rather than \uppercase and introduce a
% \nouppercase command for use in headers. and footers.
% May 3, 1996:
% version 1.97: Two changes:
% 1. Undo the change in version 1.8 (using the pagestyle{headings} defaults
% for the chapter and section marks. The current version of amsbook and
% amsart classes don't seem to need them anymore. Moreover the standard
% latex classes don't use \markboth if twoside isn't selected, and this is
% confusing as \leftmark doesn't work as expected.
% 2. include a call to \ps@empty in ps@@fancy. This is to solve a problem
% in the amsbook and amsart classes, that make global changes to \topskip,
% which are reset in \ps@empty. Hopefully this doesn't break other things.
% May 7, 1996:
% version 1.98:
% Added % after the line \def\nouppercase
% May 7, 1996:
% version 1.99: This is the alpha version of fancyhdr 2.0
% Introduced the new commands \fancyhead, \fancyfoot, and \fancyhf.
% Changed \headrulewidth, \footrulewidth, \footruleskip to
% macros rather than length parameters, In this way they can be
% conditionalized and they don't consume length registers. There is no need
% to have them as length registers unless you want to do calculations with
% them, which is unlikely. Note that this may make some uses of them
% incompatible (i.e. if you have a file that uses \setlength or \xxxx=)
% May 10, 1996:
% version 1.99a:
% Added a few more % signs
% May 10, 1996:
% version 1.99b:
% Changed the syntax of \f@nfor to be resistent to catcode changes of :=
% Removed the [1] from the defs of \lhead etc. because the parameter is
% consumed by the \@[xy]lhead etc. macros.
% June 24, 1997:
% version 1.99c:
% corrected \nouppercase to also include the protected form of \MakeUppercase
% \global added to manipulation of \headwidth.
% \iffootnote command added.
% Some comments added about \@fancyhead and \@fancyfoot.
% Aug 24, 1998
% version 1.99d
% Changed the default \ps@empty to \ps@@empty in order to allow
% \fancypagestyle{empty} redefinition.
% Oct 11, 2000
% version 2.0
% Added LPPL license clause.
%
% A check for \headheight is added. An errormessage is given (once) if the
% header is too large. Empty headers don't generate the error even if
% \headheight is very small or even 0pt.
% Warning added for the use of 'E' option when twoside option is not used.
% In this case the 'E' fields will never be used.
%
% Mar 10, 2002
% version 2.1beta
% New command: \fancyhfoffset[place]{length}
% defines offsets to be applied to the header/footer to let it stick into
% the margins (if length > 0).
% place is like in fancyhead, except that only E,O,L,R can be used.
% This replaces the old calculation based on \headwidth and the marginpar
% area.
% \headwidth will be dynamically calculated in the headers/footers when
% this is used.
%
% Mar 26, 2002
% version 2.1beta2
% \fancyhfoffset now also takes h,f as possible letters in the argument to
% allow the header and footer widths to be different.
% New commands \fancyheadoffset and \fancyfootoffset added comparable to
% \fancyhead and \fancyfoot.
% Errormessages and warnings have been made more informative.
%
% Dec 9, 2002
% version 2.1
% The defaults for \footrulewidth, \plainheadrulewidth and
% \plainfootrulewidth are changed from \z@skip to 0pt. In this way when
% someone inadvertantly uses \setlength to change any of these, the value
% of \z@skip will not be changed, rather an errormessage will be given.
% March 3, 2004
% Release of version 3.0
% Oct 7, 2004
% version 3.1
% Added '\endlinechar=13' to \fancy@reset to prevent problems with
% includegraphics in header when verbatiminput is active.
% March 22, 2005
% version 3.2
% reset \everypar (the real one) in \fancy@reset because spanish.ldf does
% strange things with \everypar between << and >>.
\def\ifancy@mpty#1{\def\temp@a{#1}\ifx\temp@a\@empty}
\def\fancy@def#1#2{\ifancy@mpty{#2}\fancy@gbl\def#1{\leavevmode}\else
\fancy@gbl\def#1{#2\strut}\fi}
\let\fancy@gbl\global
\def\@fancyerrmsg#1{%
\ifx\PackageError\undefined
\errmessage{#1}\else
\PackageError{Fancyhdr}{#1}{}\fi}
\def\@fancywarning#1{%
\ifx\PackageWarning\undefined
\errmessage{#1}\else
\PackageWarning{Fancyhdr}{#1}{}\fi}
% Usage: \@forc \var{charstring}{command to be executed for each char}
% This is similar to LaTeX's \@tfor, but expands the charstring.
\def\@forc#1#2#3{\expandafter\f@rc\expandafter#1\expandafter{#2}{#3}}
\def\f@rc#1#2#3{\def\temp@ty{#2}\ifx\@empty\temp@ty\else
\f@@rc#1#2\f@@rc{#3}\fi}
\def\f@@rc#1#2#3\f@@rc#4{\def#1{#2}#4\f@rc#1{#3}{#4}}
% Usage: \f@nfor\name:=list\do{body}
% Like LaTeX's \@for but an empty list is treated as a list with an empty
% element
\newcommand{\f@nfor}[3]{\edef\@fortmp{#2}%
\expandafter\@forloop#2,\@nil,\@nil\@@#1{#3}}
% Usage: \def@ult \cs{defaults}{argument}
% sets \cs to the characters from defaults appearing in argument
% or defaults if it would be empty. All characters are lowercased.
\newcommand\def@ult[3]{%
\edef\temp@a{\lowercase{\edef\noexpand\temp@a{#3}}}\temp@a
\def#1{}%
\@forc\tmpf@ra{#2}%
{\expandafter\if@in\tmpf@ra\temp@a{\edef#1{#1\tmpf@ra}}{}}%
\ifx\@empty#1\def#1{#2}\fi}
%
% \if@in <char><set><truecase><falsecase>
%
\newcommand{\if@in}[4]{%
\edef\temp@a{#2}\def\temp@b##1#1##2\temp@b{\def\temp@b{##1}}%
\expandafter\temp@b#2#1\temp@b\ifx\temp@a\temp@b #4\else #3\fi}
\newcommand{\fancyhead}{\@ifnextchar[{\f@ncyhf\fancyhead h}%
{\f@ncyhf\fancyhead h[]}}
\newcommand{\fancyfoot}{\@ifnextchar[{\f@ncyhf\fancyfoot f}%
{\f@ncyhf\fancyfoot f[]}}
\newcommand{\fancyhf}{\@ifnextchar[{\f@ncyhf\fancyhf{}}%
{\f@ncyhf\fancyhf{}[]}}
% New commands for offsets added
\newcommand{\fancyheadoffset}{\@ifnextchar[{\f@ncyhfoffs\fancyheadoffset h}%
{\f@ncyhfoffs\fancyheadoffset h[]}}
\newcommand{\fancyfootoffset}{\@ifnextchar[{\f@ncyhfoffs\fancyfootoffset f}%
{\f@ncyhfoffs\fancyfootoffset f[]}}
\newcommand{\fancyhfoffset}{\@ifnextchar[{\f@ncyhfoffs\fancyhfoffset{}}%
{\f@ncyhfoffs\fancyhfoffset{}[]}}
% The header and footer fields are stored in command sequences with
% names of the form: \f@ncy<x><y><z> with <x> for [eo], <y> from [lcr]
% and <z> from [hf].
\def\f@ncyhf#1#2[#3]#4{%
\def\temp@c{}%
\@forc\tmpf@ra{#3}%
{\expandafter\if@in\tmpf@ra{eolcrhf,EOLCRHF}%
{}{\edef\temp@c{\temp@c\tmpf@ra}}}%
\ifx\@empty\temp@c\else
\@fancyerrmsg{Illegal char `\temp@c' in \string#1 argument:
[#3]}%
\fi
\f@nfor\temp@c{#3}%
{\def@ult\f@@@eo{eo}\temp@c
\if@twoside\else
\if\f@@@eo e\@fancywarning
{\string#1's `E' option without twoside option is useless}\fi\fi
\def@ult\f@@@lcr{lcr}\temp@c
\def@ult\f@@@hf{hf}{#2\temp@c}%
\@forc\f@@eo\f@@@eo
{\@forc\f@@lcr\f@@@lcr
{\@forc\f@@hf\f@@@hf
{\expandafter\fancy@def\csname
f@ncy\f@@eo\f@@lcr\f@@hf\endcsname
{#4}}}}}}
\def\f@ncyhfoffs#1#2[#3]#4{%
\def\temp@c{}%
\@forc\tmpf@ra{#3}%
{\expandafter\if@in\tmpf@ra{eolrhf,EOLRHF}%
{}{\edef\temp@c{\temp@c\tmpf@ra}}}%
\ifx\@empty\temp@c\else
\@fancyerrmsg{Illegal char `\temp@c' in \string#1 argument:
[#3]}%
\fi
\f@nfor\temp@c{#3}%
{\def@ult\f@@@eo{eo}\temp@c
\if@twoside\else
\if\f@@@eo e\@fancywarning
{\string#1's `E' option without twoside option is useless}\fi\fi
\def@ult\f@@@lcr{lr}\temp@c
\def@ult\f@@@hf{hf}{#2\temp@c}%
\@forc\f@@eo\f@@@eo
{\@forc\f@@lcr\f@@@lcr
{\@forc\f@@hf\f@@@hf
{\expandafter\setlength\csname
f@ncyO@\f@@eo\f@@lcr\f@@hf\endcsname
{#4}}}}}%
\fancy@setoffs}
% Fancyheadings version 1 commands. These are more or less deprecated,
% but they continue to work.
\newcommand{\lhead}{\@ifnextchar[{\@xlhead}{\@ylhead}}
\def\@xlhead[#1]#2{\fancy@def\f@ncyelh{#1}\fancy@def\f@ncyolh{#2}}
\def\@ylhead#1{\fancy@def\f@ncyelh{#1}\fancy@def\f@ncyolh{#1}}
\newcommand{\chead}{\@ifnextchar[{\@xchead}{\@ychead}}
\def\@xchead[#1]#2{\fancy@def\f@ncyech{#1}\fancy@def\f@ncyoch{#2}}
\def\@ychead#1{\fancy@def\f@ncyech{#1}\fancy@def\f@ncyoch{#1}}
\newcommand{\rhead}{\@ifnextchar[{\@xrhead}{\@yrhead}}
\def\@xrhead[#1]#2{\fancy@def\f@ncyerh{#1}\fancy@def\f@ncyorh{#2}}
\def\@yrhead#1{\fancy@def\f@ncyerh{#1}\fancy@def\f@ncyorh{#1}}
\newcommand{\lfoot}{\@ifnextchar[{\@xlfoot}{\@ylfoot}}
\def\@xlfoot[#1]#2{\fancy@def\f@ncyelf{#1}\fancy@def\f@ncyolf{#2}}
\def\@ylfoot#1{\fancy@def\f@ncyelf{#1}\fancy@def\f@ncyolf{#1}}
\newcommand{\cfoot}{\@ifnextchar[{\@xcfoot}{\@ycfoot}}
\def\@xcfoot[#1]#2{\fancy@def\f@ncyecf{#1}\fancy@def\f@ncyocf{#2}}
\def\@ycfoot#1{\fancy@def\f@ncyecf{#1}\fancy@def\f@ncyocf{#1}}
\newcommand{\rfoot}{\@ifnextchar[{\@xrfoot}{\@yrfoot}}
\def\@xrfoot[#1]#2{\fancy@def\f@ncyerf{#1}\fancy@def\f@ncyorf{#2}}
\def\@yrfoot#1{\fancy@def\f@ncyerf{#1}\fancy@def\f@ncyorf{#1}}
\newlength{\fancy@headwidth}
\let\headwidth\fancy@headwidth
\newlength{\f@ncyO@elh}
\newlength{\f@ncyO@erh}
\newlength{\f@ncyO@olh}
\newlength{\f@ncyO@orh}
\newlength{\f@ncyO@elf}
\newlength{\f@ncyO@erf}
\newlength{\f@ncyO@olf}
\newlength{\f@ncyO@orf}
\newcommand{\headrulewidth}{0.4pt}
\newcommand{\footrulewidth}{0pt}
\newcommand{\footruleskip}{.3\normalbaselineskip}
% Fancyplain stuff shouldn't be used anymore (rather
% \fancypagestyle{plain} should be used), but it must be present for
% compatibility reasons.
\newcommand{\plainheadrulewidth}{0pt}
\newcommand{\plainfootrulewidth}{0pt}
\newif\if@fancyplain \@fancyplainfalse
\def\fancyplain#1#2{\if@fancyplain#1\else#2\fi}
\headwidth=-123456789sp %magic constant
% Command to reset various things in the headers:
% a.o. single spacing (taken from setspace.sty)
% and the catcode of ^^M (so that epsf files in the header work if a
% verbatim crosses a page boundary)
% It also defines a \nouppercase command that disables \uppercase and
% \Makeuppercase. It can only be used in the headers and footers.
\let\fnch@everypar\everypar% save real \everypar because of spanish.ldf
\def\fancy@reset{\fnch@everypar{}\restorecr\endlinechar=13
\def\baselinestretch{1}%
\def\nouppercase##1{{\let\uppercase\relax\let\MakeUppercase\relax
\expandafter\let\csname MakeUppercase \endcsname\relax##1}}%
\ifx\undefined\@newbaseline% NFSS not present; 2.09 or 2e
\ifx\@normalsize\undefined \normalsize % for ucthesis.cls
\else \@normalsize \fi
\else% NFSS (2.09) present
\@newbaseline%
\fi}
% Initialization of the head and foot text.
% The default values still contain \fancyplain for compatibility.
\fancyhf{} % clear all
% lefthead empty on ``plain'' pages, \rightmark on even, \leftmark on odd pages
% evenhead empty on ``plain'' pages, \leftmark on even, \rightmark on odd pages
\if@twoside
\fancyhead[el,or]{\fancyplain{}{\sl\rightmark}}
\fancyhead[er,ol]{\fancyplain{}{\sl\leftmark}}
\else
\fancyhead[l]{\fancyplain{}{\sl\rightmark}}
\fancyhead[r]{\fancyplain{}{\sl\leftmark}}
\fi
\fancyfoot[c]{\rm\thepage} % page number
% Use box 0 as a temp box and dimen 0 as temp dimen.
% This can be done, because this code will always
% be used inside another box, and therefore the changes are local.
\def\@fancyvbox#1#2{\setbox0\vbox{#2}\ifdim\ht0>#1\@fancywarning
{\string#1 is too small (\the#1): ^^J Make it at least \the\ht0.^^J
We now make it that large for the rest of the document.^^J
This may cause the page layout to be inconsistent, however\@gobble}%
\dimen0=#1\global\setlength{#1}{\ht0}\ht0=\dimen0\fi
\box0}
% Put together a header or footer given the left, center and
% right text, fillers at left and right and a rule.
% The \lap commands put the text into an hbox of zero size,
% so overlapping text does not generate an errormessage.
% These macros have 5 parameters:
% 1. LEFTSIDE BEARING % This determines at which side the header will stick
% out. When \fancyhfoffset is used this calculates \headwidth, otherwise
% it is \hss or \relax (after expansion).
% 2. \f@ncyolh, \f@ncyelh, \f@ncyolf or \f@ncyelf. This is the left component.
% 3. \f@ncyoch, \f@ncyech, \f@ncyocf or \f@ncyecf. This is the middle comp.
% 4. \f@ncyorh, \f@ncyerh, \f@ncyorf or \f@ncyerf. This is the right component.
% 5. RIGHTSIDE BEARING. This is always \relax or \hss (after expansion).
\def\@fancyhead#1#2#3#4#5{#1\hbox to\headwidth{\fancy@reset
\@fancyvbox\headheight{\hbox
{\rlap{\parbox[b]{\headwidth}{\raggedright#2}}\hfill
\parbox[b]{\headwidth}{\centering#3}\hfill
\llap{\parbox[b]{\headwidth}{\raggedleft#4}}}\headrule}}#5}
\def\@fancyfoot#1#2#3#4#5{#1\hbox to\headwidth{\fancy@reset
\@fancyvbox\footskip{\footrule
\hbox{\rlap{\parbox[t]{\headwidth}{\raggedright#2}}\hfill
\parbox[t]{\headwidth}{\centering#3}\hfill
\llap{\parbox[t]{\headwidth}{\raggedleft#4}}}}}#5}
\def\headrule{{\if@fancyplain\let\headrulewidth\plainheadrulewidth\fi
\hrule\@height\headrulewidth\@width\headwidth \vskip-\headrulewidth}}
\def\footrule{{\if@fancyplain\let\footrulewidth\plainfootrulewidth\fi
\vskip-\footruleskip\vskip-\footrulewidth
\hrule\@width\headwidth\@height\footrulewidth\vskip\footruleskip}}
\def\ps@fancy{%
\@ifundefined{@chapapp}{\let\@chapapp\chaptername}{}%for amsbook
%
% Define \MakeUppercase for old LaTeXen.
% Note: we used \def rather than \let, so that \let\uppercase\relax (from
% the version 1 documentation) will still work.
%
\@ifundefined{MakeUppercase}{\def\MakeUppercase{\uppercase}}{}%
\@ifundefined{chapter}{\def\sectionmark##1{\markboth
{\MakeUppercase{\ifnum \c@secnumdepth>\z@
\thesection\hskip 1em\relax \fi ##1}}{}}%
\def\subsectionmark##1{\markright {\ifnum \c@secnumdepth >\@ne
\thesubsection\hskip 1em\relax \fi ##1}}}%
{\def\chaptermark##1{\markboth {\MakeUppercase{\ifnum \c@secnumdepth>\m@ne
\@chapapp\ \thechapter. \ \fi ##1}}{}}%
\def\sectionmark##1{\markright{\MakeUppercase{\ifnum \c@secnumdepth >\z@
\thesection. \ \fi ##1}}}}%
%\csname ps@headings\endcsname % use \ps@headings defaults if they exist
\ps@@fancy
\gdef\ps@fancy{\@fancyplainfalse\ps@@fancy}%
% Initialize \headwidth if the user didn't
%
\ifdim\headwidth<0sp
%
% This catches the case that \headwidth hasn't been initialized and the
% case that the user added something to \headwidth in the expectation that
% it was initialized to \textwidth. We compensate this now. This loses if
% the user intended to multiply it by a factor. But that case is more
% likely done by saying something like \headwidth=1.2\textwidth.
% The doc says you have to change \headwidth after the first call to
% \pagestyle{fancy}. This code is just to catch the most common cases were
% that requirement is violated.
%
\global\advance\headwidth123456789sp\global\advance\headwidth\textwidth
\fi}
\def\ps@fancyplain{\ps@fancy \let\ps@plain\ps@plain@fancy}
\def\ps@plain@fancy{\@fancyplaintrue\ps@@fancy}
\let\ps@@empty\ps@empty
\def\ps@@fancy{%
\ps@@empty % This is for amsbook/amsart, which do strange things with \topskip
\def\@mkboth{\protect\markboth}%
\def\@oddhead{\@fancyhead\fancy@Oolh\f@ncyolh\f@ncyoch\f@ncyorh\fancy@Oorh}%
\def\@oddfoot{\@fancyfoot\fancy@Oolf\f@ncyolf\f@ncyocf\f@ncyorf\fancy@Oorf}%
\def\@evenhead{\@fancyhead\fancy@Oelh\f@ncyelh\f@ncyech\f@ncyerh\fancy@Oerh}%
\def\@evenfoot{\@fancyfoot\fancy@Oelf\f@ncyelf\f@ncyecf\f@ncyerf\fancy@Oerf}%
}
% Default definitions for compatibility mode:
% These cause the header/footer to take the defined \headwidth as width
% And to shift in the direction of the marginpar area
\def\fancy@Oolh{\if@reversemargin\hss\else\relax\fi}
\def\fancy@Oorh{\if@reversemargin\relax\else\hss\fi}
\let\fancy@Oelh\fancy@Oorh
\let\fancy@Oerh\fancy@Oolh
\let\fancy@Oolf\fancy@Oolh
\let\fancy@Oorf\fancy@Oorh
\let\fancy@Oelf\fancy@Oelh
\let\fancy@Oerf\fancy@Oerh
% New definitions for the use of \fancyhfoffset
% These calculate the \headwidth from \textwidth and the specified offsets.
\def\fancy@offsolh{\headwidth=\textwidth\advance\headwidth\f@ncyO@olh
\advance\headwidth\f@ncyO@orh\hskip-\f@ncyO@olh}
\def\fancy@offselh{\headwidth=\textwidth\advance\headwidth\f@ncyO@elh
\advance\headwidth\f@ncyO@erh\hskip-\f@ncyO@elh}
\def\fancy@offsolf{\headwidth=\textwidth\advance\headwidth\f@ncyO@olf
\advance\headwidth\f@ncyO@orf\hskip-\f@ncyO@olf}
\def\fancy@offself{\headwidth=\textwidth\advance\headwidth\f@ncyO@elf
\advance\headwidth\f@ncyO@erf\hskip-\f@ncyO@elf}
\def\fancy@setoffs{%
% Just in case \let\headwidth\textwidth was used
\fancy@gbl\let\headwidth\fancy@headwidth
\fancy@gbl\let\fancy@Oolh\fancy@offsolh
\fancy@gbl\let\fancy@Oelh\fancy@offselh
\fancy@gbl\let\fancy@Oorh\hss
\fancy@gbl\let\fancy@Oerh\hss
\fancy@gbl\let\fancy@Oolf\fancy@offsolf
\fancy@gbl\let\fancy@Oelf\fancy@offself
\fancy@gbl\let\fancy@Oorf\hss
\fancy@gbl\let\fancy@Oerf\hss}
\newif\iffootnote
\let\latex@makecol\@makecol
\def\@makecol{\ifvoid\footins\footnotetrue\else\footnotefalse\fi
\let\topfloat\@toplist\let\botfloat\@botlist\latex@makecol}
\def\iftopfloat#1#2{\ifx\topfloat\empty #2\else #1\fi}
\def\ifbotfloat#1#2{\ifx\botfloat\empty #2\else #1\fi}
\def\iffloatpage#1#2{\if@fcolmade #1\else #2\fi}
\newcommand{\fancypagestyle}[2]{%
\@namedef{ps@#1}{\let\fancy@gbl\relax#2\relax\ps@fancy}}

1441
report/icml2017.bst Normal file

File diff suppressed because it is too large Load Diff

BIN
report/icml_numpapers.pdf Normal file

Binary file not shown.

BIN
report/mlp-cw1-template.pdf Normal file

Binary file not shown.

207
report/mlp-cw1-template.tex Normal file
View File

@ -0,0 +1,207 @@
%% Template for MLP Coursework 1 / 16 October 2017
%% Based on LaTeX template for ICML 2017 - example_paper.tex at
%% https://2017.icml.cc/Conferences/2017/StyleAuthorInstructions
\documentclass{article}
\usepackage[T1]{fontenc}
\usepackage{amssymb,amsmath}
\usepackage{txfonts}
\usepackage{microtype}
% For figures
\usepackage{graphicx}
\usepackage{subfigure}
% For citations
\usepackage{natbib}
% For algorithms
\usepackage{algorithm}
\usepackage{algorithmic}
% the hyperref package is used to produce hyperlinks in the
% resulting PDF. If this breaks your system, please commend out the
% following usepackage line and replace \usepackage{mlp2017} with
% \usepackage[nohyperref]{mlp2017} below.
\usepackage{hyperref}
\usepackage{url}
\urlstyle{same}
% Packages hyperref and algorithmic misbehave sometimes. We can fix
% this with the following command.
\newcommand{\theHalgorithm}{\arabic{algorithm}}
% Set up MLP coursework style (based on ICML style)
\usepackage{mlp2017}
\mlptitlerunning{MLP Coursework 1 (\studentNumber)}
\bibliographystyle{icml2017}
\DeclareMathOperator{\softmax}{softmax}
\DeclareMathOperator{\sigmoid}{sigmoid}
\DeclareMathOperator{\sgn}{sgn}
\DeclareMathOperator{\relu}{relu}
\DeclareMathOperator{\lrelu}{lrelu}
\DeclareMathOperator{\elu}{elu}
\DeclareMathOperator{\selu}{selu}
\DeclareMathOperator{\maxout}{maxout}
%% You probably do not need to change anything above this comment
%% REPLACE this with your student number
\def\studentNumber{s1754321}
\begin{document}
\twocolumn[
\mlptitle{MLP Coursework 1: Activation Functions}
\centerline{\studentNumber}
\vskip 7mm
]
\begin{abstract}
The abstract should be 100--200 words long, providing a concise summary of the contents of your report.
\end{abstract}
\section{Introduction}
\label{sec:intro}
This document provides a template for the MLP coursework 1 report. In particular, it structures the document into five sections (plus an abstract and the references) -- you should keep to this structure for your report. If you want to use subsections within a section that is fine, but please do not use any deeper structuring. In this template the text in each section will include an outline of what you should include in each section, along with some practical LaTeX examples (for example figures, tables, algorithms). Your document should be no longer than \textbf{six pages}, with an additional page allowed for references.
The introduction should place your work in context, giving the overall motivation for the work, and clearly outlining the research questions you have explored -- in this case comparison of the behaviour of the different activation functions, experimental investigation of the impact of the depth of the network with respect to accuracy, and experimental investigation of different approaches to weight initialisation. This section should also include a concise description of the MNIST task and data -- be precise: for example state the size of the training and validation sets.
\section{Activation functions}
\label{sec:actfn}
This section should cover the theoretical methodology -- in this case you should present the four activation functions: ReLU, Leaky ReLU, ELU, and SELU. I didn't do it in this document, but the first time you use an acronym you should say what it stands for, for example Restricted Linear Unit (ReLU). You should use equations to concisely describe each activation function. For example, ReLU:
\begin{equation}
\relu(x) = \max(0, x) ,
\end{equation}
which has the gradient:
\begin{equation}
\frac{d}{dx} \relu(x) =
\begin{cases}
0 & \quad \text{if } x \leq 0 \\
1 & \quad \text{if } x > 0 .
\end{cases}
\end{equation}
The \LaTeX for the derivatives is slightly more complicated. We provided definitions near the top of the file (the part before \verb+\begin{document}+) for \verb+\relu+, \verb+\lrelu+, \verb+\elu+, and \verb+\selu+. There is no need to discuss the unit tests for these activation functions in this report.
It is probably not needed in this report, but if you would like to include an algorithm in your report, please use the \verb+algorithm+ and \verb+algorithmic+ environments to format pseudocode (for instance, Algorithm~\ref{alg:example}). These require the corresponding style files, \verb+algorithm.sty+ and \verb+algorithmic.sty+ which are supplied with this package.
\begin{algorithm}[ht]
\begin{algorithmic}
\STATE {\bfseries Input:} data $x_i$, size $m$
\REPEAT
\STATE Initialize $noChange = true$.
\FOR{$i=1$ {\bfseries to} $m-1$}
\IF{$x_i > x_{i+1}$}
\STATE Swap $x_i$ and $x_{i+1}$
\STATE $noChange = false$
\ENDIF
\ENDFOR
\UNTIL{$noChange$ is $true$}
\end{algorithmic}
\caption{Bubble Sort}
\label{alg:example}
\end{algorithm}
\section{Experimental comparison of activation functions}
\label{sec:actexpts}
In this section you should present the results and discussion of your experiments comparing networks using the different activation functions on the MNIST task. As explained in the coursework document, you should use 2 hidden layers with 100 hidden units per layer for these experiments. You can compare the learning curves (error vs epoch) for training and/or validation, and the validation set accuracies.
Your experimental sections should include graphs (for instance, figure~\ref{fig:sample-graph}) and/or tables (for instance, table~\ref{tab:sample-table})\footnote{These examples were taken from the ICML template paper.}, using the \verb+figure+ and \verb+table+ environments, in which you use \verb+\includegraphics+ to include an image (pdf, png, or jpg formats). Please export graphs as
\href{https://en.wikipedia.org/wiki/Vector_graphics}{vector graphics}
rather than \href{https://en.wikipedia.org/wiki/Raster_graphics}{raster
files} as this will make sure all detail in the plot is visible.
Matplotlib supports saving high quality figures in a wide range of
common image formats using the
\href{http://matplotlib.org/api/pyplot_api.html\#matplotlib.pyplot.savefig}{\texttt{savefig}}
function. \textbf{You should use \texttt{savefig} rather than copying
the screen-resolution raster images outputted in the notebook.} An
example of using \texttt{savefig} to save a figure as a PDF file (which
can be included as graphics in a \LaTeX document is given in the coursework document.
If you need a figure or table to stretch across two columns use the \verb+figure*+ or \verb+table*+ environment instead of the \verb+figure+ or \verb+table+ environment. Use the \verb+subfigure+ environment if you want to include multiple graphics in a single figure.
\begin{figure}[tb]
\vskip 5mm
\begin{center}
\centerline{\includegraphics[width=\columnwidth]{icml_numpapers}}
\caption{Historical locations and number of accepted papers for International
Machine Learning Conferences (ICML 1993 -- ICML 2008) and
International Workshops on Machine Learning (ML 1988 -- ML
1992). At the time this figure was produced, the number of
accepted papers for ICML 2008 was unknown and instead estimated.}
\label{fig:sample-graph}
\end{center}
\vskip -5mm
\end{figure}
\begin{table}[tb]
\vskip 3mm
\begin{center}
\begin{small}
\begin{sc}
\begin{tabular}{lcccr}
\hline
\abovespace\belowspace
Data set & Naive & Flexible & Better? \\
\hline
\abovespace
Breast & 95.9$\pm$ 0.2& 96.7$\pm$ 0.2& $\surd$ \\
Cleveland & 83.3$\pm$ 0.6& 80.0$\pm$ 0.6& $\times$\\
Glass2 & 61.9$\pm$ 1.4& 83.8$\pm$ 0.7& $\surd$ \\
Credit & 74.8$\pm$ 0.5& 78.3$\pm$ 0.6& \\
Horse & 73.3$\pm$ 0.9& 69.7$\pm$ 1.0& $\times$\\
Meta & 67.1$\pm$ 0.6& 76.5$\pm$ 0.5& $\surd$ \\
Pima & 75.1$\pm$ 0.6& 73.9$\pm$ 0.5& \\
\belowspace
Vehicle & 44.9$\pm$ 0.6& 61.5$\pm$ 0.4& $\surd$ \\
\hline
\end{tabular}
\end{sc}
\end{small}
\caption{Classification accuracies for naive Bayes and flexible
Bayes on various data sets.}
\label{tab:sample-table}
\end{center}
\vskip -3mm
\end{table}
\section{Deep neural network experiments}
\label{sec:dnnexpts}
This section should report on your experiments on deeper networks for MNIST. The two sets of experiments are to explore the impact of the depth of the network (number of hidden layers), and a comparison of different approaches to weight initialisation.
In this section, and in the previous section, you should present your experimental results clearly and concisely, followed by an interpretation and discussion of results. You need to present your results in a way that makes it easy for a reader to understand what they mean. You should facilitate comparisons either using graphs with multiple curves or (if appropriate, e.g. for accuracies) a results table. You need to avoid having too many figures, poorly labelled graphs, and graphs which should be comparable but which use different axis scales. A good presentation will enable the reader to compare trends in the same graph -- each graph should summarise the results relating to a particular research (sub)question.
Your discussion should interpret the results, both in terms of summarising the outcomes of a particular experiment, and attempting to relate to the underlying models. A good report would have some analysis, resulting in an understanding of why particular results are observed, perhaps with reference to the literature. Use bibtex to organise your references -- in this case the references are in the file \verb+example-refs.bib+. Here is a an example reference \citep{langley00}.
\section{Conclusions}
\label{sec:concl}
You should draw conclusions from the experiments, related to the research questions outlined in the introduction (section~\ref{sec:intro}). You should state the conclusions clearly and concisely. It is good if the conclusion from one experiment influenced what you did in later experiments -- your aim is to learn from your experiments. Extra credit if you relate your findings to what has been reported in the literature.
A good conclusions section would also include a further work discussion, building on work done so far, and referencing the literature where appropriate.
\bibliography{example-refs}
\end{document}
% This document was modified from the file originally made available by
% Pat Langley and Andrea Danyluk for ICML-2K. This version was
% created by Lise Getoor and Tobias Scheffer, it was slightly modified
% from the 2010 version by Thorsten Joachims & Johannes Fuernkranz,
% slightly modified from the 2009 version by Kiri Wagstaff and
% Sam Roweis's 2008 version, which is slightly modified from
% Prasad Tadepalli's 2007 version which is a lightly
% changed version of the previous year's version by Andrew Moore,
% which was in turn edited from those of Kristian Kersting and
% Codrina Lauth. Alex Smola contributed to the algorithmic style files.

BIN
report/mlp-cw2-template.pdf Normal file

Binary file not shown.

195
report/mlp-cw2-template.tex Normal file
View File

@ -0,0 +1,195 @@
%% Template for MLP Coursework 2 / 6 November 2017
%% Based on LaTeX template for ICML 2017 - example_paper.tex at
%% https://2017.icml.cc/Conferences/2017/StyleAuthorInstructions
\documentclass{article}
\usepackage[T1]{fontenc}
\usepackage{amssymb,amsmath}
\usepackage{txfonts}
\usepackage{microtype}
% For figures
\usepackage{graphicx}
\usepackage{subfigure}
% For citations
\usepackage{natbib}
% For algorithms
\usepackage{algorithm}
\usepackage{algorithmic}
% the hyperref package is used to produce hyperlinks in the
% resulting PDF. If this breaks your system, please commend out the
% following usepackage line and replace \usepackage{mlp2017} with
% \usepackage[nohyperref]{mlp2017} below.
\usepackage{hyperref}
\usepackage{url}
\urlstyle{same}
% Packages hyperref and algorithmic misbehave sometimes. We can fix
% this with the following command.
\newcommand{\theHalgorithm}{\arabic{algorithm}}
% Set up MLP coursework style (based on ICML style)
\usepackage{mlp2017}
\mlptitlerunning{MLP Coursework 2 (\studentNumber)}
\bibliographystyle{icml2017}
\DeclareMathOperator{\softmax}{softmax}
\DeclareMathOperator{\sigmoid}{sigmoid}
\DeclareMathOperator{\sgn}{sgn}
\DeclareMathOperator{\relu}{relu}
\DeclareMathOperator{\lrelu}{lrelu}
\DeclareMathOperator{\elu}{elu}
\DeclareMathOperator{\selu}{selu}
\DeclareMathOperator{\maxout}{maxout}
%% You probably do not need to change anything above this comment
%% REPLACE this with your student number
\def\studentNumber{sXXXXXXX}
\begin{document}
\twocolumn[
\mlptitle{MLP Coursework 2: Learning rules, BatchNorm, and ConvNets}
\centerline{\studentNumber}
\vskip 7mm
]
\begin{abstract}
The abstract should be 100--200 words long, providing a concise summary of the contents of your report.
\end{abstract}
\section{Introduction}
\label{sec:intro}
This document provides a template for the MLP coursework 2 report. This template structures the report into sections, which you are recommended to use, but can change if you wish. If you want to use subsections within a section that is fine, but please do not use any deeper structuring. In this template the text in each section will include an outline of what you should include in each section, along with some practical LaTeX examples (for example figures, tables, algorithms). Your document should be no longer than \textbf{seven pages}, with an additional page allowed for references.
The introduction should place your work in context, giving the overall motivation for the work, and clearly outlining the research questions you have explored. This section should also include a concise description of the Balanced EMNIST task and data -- be precise: for example state the size of the training, validation, and test sets.
\section{Baseline systems}
In this section you should report your baseline experiments for EMNIST. No need for theoretical explanations of things covered in the course, but should you go beyond what was covered please explain what you did with references to relevant paper(s) if appropriate. In this section you should aim to cover the both the ``what'' and the ``why'': \emph{what} you did, giving sufficient information (hyperparameter settings, etc.) so that someone else (e.g. another student on the course) could reproduce your results; and \emph{why} you performed the experiments you are reporting - what you are aiming to discover what is the motivation for the particular experiments you undertook. You should also provide some discussion and interpretation of your results.
As before, your experimental sections should include graphs (for instance, figure~\ref{fig:sample-graph}) and/or tables (for instance, table~\ref{tab:sample-table})\footnote{These examples were taken from the ICML template paper.}, using the \verb+figure+ and \verb+table+ environments, in which you use \verb+\includegraphics+ to include an image (pdf, png, or jpg formats). Please export graphs as
\href{https://en.wikipedia.org/wiki/Vector_graphics}{vector graphics}
rather than \href{https://en.wikipedia.org/wiki/Raster_graphics}{raster
files} as this will make sure all detail in the plot is visible.
Matplotlib supports saving high quality figures in a wide range of
common image formats using the
\href{http://matplotlib.org/api/pyplot_api.html\#matplotlib.pyplot.savefig}{\texttt{savefig}}
function. \textbf{You should use \texttt{savefig} rather than copying
the screen-resolution raster images outputted in the notebook.} An
example of using \texttt{savefig} to save a figure as a PDF file (which
can be included as graphics in a \LaTeX document is given in the coursework 1 document.
If you need a figure or table to stretch across two columns use the \verb+figure*+ or \verb+table*+ environment instead of the \verb+figure+ or \verb+table+ environment. Use the \verb+subfigure+ environment if you want to include multiple graphics in a single figure.
\begin{figure}[tb]
\vskip 5mm
\begin{center}
\centerline{\includegraphics[width=\columnwidth]{icml_numpapers}}
\caption{Historical locations and number of accepted papers for International
Machine Learning Conferences (ICML 1993 -- ICML 2008) and
International Workshops on Machine Learning (ML 1988 -- ML
1992). At the time this figure was produced, the number of
accepted papers for ICML 2008 was unknown and instead estimated.}
\label{fig:sample-graph}
\end{center}
\vskip -5mm
\end{figure}
\begin{table}[tb]
\vskip 3mm
\begin{center}
\begin{small}
\begin{sc}
\begin{tabular}{lcccr}
\hline
\abovespace\belowspace
Data set & Naive & Flexible & Better? \\
\hline
\abovespace
Breast & 95.9$\pm$ 0.2& 96.7$\pm$ 0.2& $\surd$ \\
Cleveland & 83.3$\pm$ 0.6& 80.0$\pm$ 0.6& $\times$\\
Glass2 & 61.9$\pm$ 1.4& 83.8$\pm$ 0.7& $\surd$ \\
Credit & 74.8$\pm$ 0.5& 78.3$\pm$ 0.6& \\
Horse & 73.3$\pm$ 0.9& 69.7$\pm$ 1.0& $\times$\\
Meta & 67.1$\pm$ 0.6& 76.5$\pm$ 0.5& $\surd$ \\
Pima & 75.1$\pm$ 0.6& 73.9$\pm$ 0.5& \\
\belowspace
Vehicle & 44.9$\pm$ 0.6& 61.5$\pm$ 0.4& $\surd$ \\
\hline
\end{tabular}
\end{sc}
\end{small}
\caption{Classification accuracies for naive Bayes and flexible
Bayes on various data sets.}
\label{tab:sample-table}
\end{center}
\vskip -3mm
\end{table}
\section{Learning rules}
In this section you should compare RMSProp and Adam with gradient descent, introducing these learning rules either as equations or as algorithmic pseudocode. If you present the different approaches as algorithms, you can use the \verb+algorithm+ and \verb+algorithmic+ environments to format pseudocode (for instance, Algorithm~\ref{alg:example}). These require the corresponding style files, \verb+algorithm.sty+ and \verb+algorithmic.sty+ which are supplied with this package.
\begin{algorithm}[ht]
\begin{algorithmic}
\STATE {\bfseries Input:} data $x_i$, size $m$
\REPEAT
\STATE Initialize $noChange = true$.
\FOR{$i=1$ {\bfseries to} $m-1$}
\IF{$x_i > x_{i+1}$}
\STATE Swap $x_i$ and $x_{i+1}$
\STATE $noChange = false$
\ENDIF
\ENDFOR
\UNTIL{$noChange$ is $true$}
\end{algorithmic}
\caption{Bubble Sort}
\label{alg:example}
\end{algorithm}
You should, in your own words, explain what the different learning rules do, and how they differ. You should then present your experiments and results, comparing and contrasting stochastic gradient descent, RMSProp, and Adam. As before concentrate on the ``what'' (remember give enough information so someone can reproduce your experiments), the ``why'' (why did you choose the experiments that you performed -- you may have been motivated by your earlier results, by the literature, or by a specific research question), and the interpretation of your results.
In every section, you should present your results in a way that makes it easy for a reader to understand what they mean. You should facilitate comparisons either using graphs with multiple curves or (if appropriate, e.g. for accuracies) a results table. You need to avoid having too many figures, poorly labelled graphs, and graphs which should be comparable but which use different axis scales. A good presentation will enable the reader to compare trends in the same graph -- each graph should summarise the results relating to a particular research (sub)question.
Your discussion should interpret the results, both in terms of summarising the outcomes of a particular experiment, and attempting to relate to the underlying models. A good report would have some analysis, resulting in an understanding of why particular results are observed, perhaps with reference to the literature. Use bibtex to organise your references -- in this case the references are in the file \verb+example-refs.bib+. Here is a an example reference \citep{langley00}.
\section{Batch normalisation}
In this section you should present batch normalisation, supported using equations or algorithmic pseudocode. Following this present your experiments, again remembering to include the ``what'', the ``why'', and the interpretation of results.
\section{Convolutional networks}
In this section you should present your experiments with convolutional networks. Explain the idea of convolutional layers and pooling layers, and briefly explain how you did the implementation. There is no need to include chunks of code. You should report the experiments you have undertaken, again remembering to include \emph{what} experiments you performed (include details of hyperparameters, etc.), \emph{why} you performed them (what was the motivation for the experiments, what research questions are you exploring), and the interpretation and discussion of your results.
\section{Test results}
The results reported in the previous sections should be on the validation set. You should finally report results on the EMNIST test set using what you judge to the be the best deep neural network (without convolutional layers) and the best convolutional network. Again focus on what the experiments were (be precise), why you chose to do them (in particular, how did you choose the architectures/settings to use with the test set), and a discussion/interpretation of the results.
\section{Conclusions}
\label{sec:concl}
You should draw conclusions from the experiments, related to the research questions outlined in the introduction (section~\ref{sec:intro}). You should state the conclusions clearly and concisely. It is good if the conclusion from one experiment influenced what you did in later experiments -- your aim is to learn from your experiments. Extra credit if you relate your findings to what has been reported in the literature.
A good conclusions section would also include a further work discussion, building on work done so far, and referencing the literature where appropriate.
\bibliography{example-refs}
\end{document}
% This document was modified from the file originally made available by
% Pat Langley and Andrea Danyluk for ICML-2K. This version was
% created by Lise Getoor and Tobias Scheffer, it was slightly modified
% from the 2010 version by Thorsten Joachims & Johannes Fuernkranz,
% slightly modified from the 2009 version by Kiri Wagstaff and
% Sam Roweis's 2008 version, which is slightly modified from
% Prasad Tadepalli's 2007 version which is a lightly
% changed version of the previous year's version by Andrew Moore,
% which was in turn edited from those of Kristian Kersting and
% Codrina Lauth. Alex Smola contributed to the algorithmic style files.

720
report/mlp2017.sty Normal file
View File

@ -0,0 +1,720 @@
% File: mlp2017.sty (LaTeX style file for ICML-2017, version of 2017-05-31)
% Modified by Daniel Roy 2017: changed byline to use footnotes for affiliations, and removed emails
% This file contains the LaTeX formatting parameters for a two-column
% conference proceedings that is 8.5 inches wide by 11 inches high.
%
% Modified by Percy Liang 12/2/2013: changed the year, location from the previous template for ICML 2014
% Modified by Fei Sha 9/2/2013: changed the year, location form the previous template for ICML 2013
%
% Modified by Fei Sha 4/24/2013: (1) remove the extra whitespace after the first author's email address (in %the camera-ready version) (2) change the Proceeding ... of ICML 2010 to 2014 so PDF's metadata will show up % correctly
%
% Modified by Sanjoy Dasgupta, 2013: changed years, location
%
% Modified by Francesco Figari, 2012: changed years, location
%
% Modified by Christoph Sawade and Tobias Scheffer, 2011: added line
% numbers, changed years
%
% Modified by Hal Daume III, 2010: changed years, added hyperlinks
%
% Modified by Kiri Wagstaff, 2009: changed years
%
% Modified by Sam Roweis, 2008: changed years
%
% Modified by Ricardo Silva, 2007: update of the ifpdf verification
%
% Modified by Prasad Tadepalli and Andrew Moore, merely changing years.
%
% Modified by Kristian Kersting, 2005, based on Jennifer Dy's 2004 version
% - running title. If the original title is to long or is breaking a line,
% use \mlptitlerunning{...} in the preamble to supply a shorter form.
% Added fancyhdr package to get a running head.
% - Updated to store the page size because pdflatex does compile the
% page size into the pdf.
%
% Hacked by Terran Lane, 2003:
% - Updated to use LaTeX2e style file conventions (ProvidesPackage,
% etc.)
% - Added an ``appearing in'' block at the base of the first column
% (thus keeping the ``appearing in'' note out of the bottom margin
% where the printer should strip in the page numbers).
% - Added a package option [accepted] that selects between the ``Under
% review'' notice (default, when no option is specified) and the
% ``Appearing in'' notice (for use when the paper has been accepted
% and will appear).
%
% Originally created as: ml2k.sty (LaTeX style file for ICML-2000)
% by P. Langley (12/23/99)
%%%%%%%%%%%%%%%%%%%%
%% This version of the style file supports both a ``review'' version
%% and a ``final/accepted'' version. The difference is only in the
%% text that appears in the note at the bottom of the first column of
%% the first page. The default behavior is to print a note to the
%% effect that the paper is under review and don't distribute it. The
%% final/accepted version prints an ``Appearing in'' note. To get the
%% latter behavior, in the calling file change the ``usepackage'' line
%% from:
%% \usepackage{icml2017}
%% to
%% \usepackage[accepted]{icml2017}
%%%%%%%%%%%%%%%%%%%%
\NeedsTeXFormat{LaTeX2e}
\ProvidesPackage{mlp2017}[2017/01/01 MLP Coursework Style File]
% Use fancyhdr package
\RequirePackage{fancyhdr}
\RequirePackage{color}
\RequirePackage{algorithm}
\RequirePackage{algorithmic}
\RequirePackage{natbib}
\RequirePackage{eso-pic} % used by \AddToShipoutPicture
\RequirePackage{forloop}
%%%%%%%% Options
%\DeclareOption{accepted}{%
% \renewcommand{\Notice@String}{\ICML@appearing}
\gdef\isaccepted{1}
%}
\DeclareOption{nohyperref}{%
\gdef\nohyperref{1}
}
\ifdefined\nohyperref\else\ifdefined\hypersetup
\definecolor{mydarkblue}{rgb}{0,0.08,0.45}
\hypersetup{ %
pdftitle={},
pdfauthor={},
pdfsubject={MLP Coursework 2017-18},
pdfkeywords={},
pdfborder=0 0 0,
pdfpagemode=UseNone,
colorlinks=true,
linkcolor=mydarkblue,
citecolor=mydarkblue,
filecolor=mydarkblue,
urlcolor=mydarkblue,
pdfview=FitH}
\ifdefined\isaccepted \else
\hypersetup{pdfauthor={Anonymous Submission}}
\fi
\fi\fi
%%%%%%%%%%%%%%%%%%%%
% This string is printed at the bottom of the page for the
% final/accepted version of the ``appearing in'' note. Modify it to
% change that text.
%%%%%%%%%%%%%%%%%%%%
\newcommand{\ICML@appearing}{\textit{MLP Coursework 1 2017-18}}
%%%%%%%%%%%%%%%%%%%%
% This string is printed at the bottom of the page for the draft/under
% review version of the ``appearing in'' note. Modify it to change
% that text.
%%%%%%%%%%%%%%%%%%%%
\newcommand{\Notice@String}{MLP Coursework 1 2017-18}
% Cause the declared options to actually be parsed and activated
\ProcessOptions\relax
% Uncomment the following for debugging. It will cause LaTeX to dump
% the version of the ``appearing in'' string that will actually appear
% in the document.
%\typeout{>> Notice string='\Notice@String'}
% Change citation commands to be more like old ICML styles
\newcommand{\yrcite}[1]{\citeyearpar{#1}}
\renewcommand{\cite}[1]{\citep{#1}}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% to ensure the letter format is used. pdflatex does compile the
% page size into the pdf. This is done using \pdfpagewidth and
% \pdfpageheight. As Latex does not know this directives, we first
% check whether pdflatex or latex is used.
%
% Kristian Kersting 2005
%
% in order to account for the more recent use of pdfetex as the default
% compiler, I have changed the pdf verification.
%
% Ricardo Silva 2007
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\paperwidth=210mm
\paperheight=297mm
% old PDFLaTex verification, circa 2005
%
%\newif\ifpdf\ifx\pdfoutput\undefined
% \pdffalse % we are not running PDFLaTeX
%\else
% \pdfoutput=1 % we are running PDFLaTeX
% \pdftrue
%\fi
\newif\ifpdf %adapted from ifpdf.sty
\ifx\pdfoutput\undefined
\else
\ifx\pdfoutput\relax
\else
\ifcase\pdfoutput
\else
\pdftrue
\fi
\fi
\fi
\ifpdf
% \pdfpagewidth=\paperwidth
% \pdfpageheight=\paperheight
\setlength{\pdfpagewidth}{210mm}
\setlength{\pdfpageheight}{297mm}
\fi
% Physical page layout
\evensidemargin -5.5mm
\oddsidemargin -5.5mm
\setlength\textheight{248mm}
\setlength\textwidth{170mm}
\setlength\columnsep{6.5mm}
\setlength\headheight{10pt}
\setlength\headsep{10pt}
\addtolength{\topmargin}{-20pt}
%\setlength\headheight{1em}
%\setlength\headsep{1em}
\addtolength{\topmargin}{-6mm}
%\addtolength{\topmargin}{-2em}
%% The following is adapted from code in the acmconf.sty conference
%% style file. The constants in it are somewhat magical, and appear
%% to work well with the two-column format on US letter paper that
%% ICML uses, but will break if you change that layout, or if you use
%% a longer block of text for the copyright notice string. Fiddle with
%% them if necessary to get the block to fit/look right.
%%
%% -- Terran Lane, 2003
%%
%% The following comments are included verbatim from acmconf.sty:
%%
%%% This section (written by KBT) handles the 1" box in the lower left
%%% corner of the left column of the first page by creating a picture,
%%% and inserting the predefined string at the bottom (with a negative
%%% displacement to offset the space allocated for a non-existent
%%% caption).
%%%
\def\ftype@copyrightbox{8}
\def\@copyrightspace{
% Create a float object positioned at the bottom of the column. Note
% that because of the mystical nature of floats, this has to be called
% before the first column is populated with text (e.g., from the title
% or abstract blocks). Otherwise, the text will force the float to
% the next column. -- TDRL.
\@float{copyrightbox}[b]
\begin{center}
\setlength{\unitlength}{1pc}
\begin{picture}(20,1.5)
% Create a line separating the main text from the note block.
% 4.818pc==0.8in.
\put(0,2.5){\line(1,0){4.818}}
% Insert the text string itself. Note that the string has to be
% enclosed in a parbox -- the \put call needs a box object to
% position. Without the parbox, the text gets splattered across the
% bottom of the page semi-randomly. The 19.75pc distance seems to be
% the width of the column, though I can't find an appropriate distance
% variable to substitute here. -- TDRL.
\put(0,0){\parbox[b]{19.75pc}{\small \Notice@String}}
\end{picture}
\end{center}
\end@float}
% Note: A few Latex versions need the next line instead of the former.
% \addtolength{\topmargin}{0.3in}
% \setlength\footheight{0pt}
\setlength\footskip{0pt}
%\pagestyle{empty}
\flushbottom \twocolumn
\sloppy
% Clear out the addcontentsline command
\def\addcontentsline#1#2#3{}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% commands for formatting paper title, author names, and addresses.
%%start%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%% title as running head -- Kristian Kersting 2005 %%%%%%%%%%%%%
%\makeatletter
%\newtoks\mytoksa
%\newtoks\mytoksb
%\newcommand\addtomylist[2]{%
% \mytoksa\expandafter{#1}%
% \mytoksb{#2}%
% \edef#1{\the\mytoksa\the\mytoksb}%
%}
%\makeatother
% box to check the size of the running head
\newbox\titrun
% general page style
\pagestyle{fancy}
\fancyhf{}
\fancyhead{}
\fancyfoot{}
% set the width of the head rule to 1 point
\renewcommand{\headrulewidth}{1pt}
% definition to set the head as running head in the preamble
\def\mlptitlerunning#1{\gdef\@mlptitlerunning{#1}}
% main definition adapting \mlptitle from 2004
\long\def\mlptitle#1{%
%check whether @mlptitlerunning exists
% if not \mlptitle is used as running head
\ifx\undefined\@mlptitlerunning%
\gdef\@mlptitlerunning{#1}
\fi
%add it to pdf information
\ifdefined\nohyperref\else\ifdefined\hypersetup
\hypersetup{pdftitle={#1}}
\fi\fi
%get the dimension of the running title
\global\setbox\titrun=\vbox{\small\bf\@mlptitlerunning}
% error flag
\gdef\@runningtitleerror{0}
% running title too long
\ifdim\wd\titrun>\textwidth%
{\gdef\@runningtitleerror{1}}%
% running title breaks a line
\else\ifdim\ht\titrun>6.25pt
{\gdef\@runningtitleerror{2}}%
\fi
\fi
% if there is somthing wrong with the running title
\ifnum\@runningtitleerror>0
\typeout{}%
\typeout{}%
\typeout{*******************************************************}%
\typeout{Title exceeds size limitations for running head.}%
\typeout{Please supply a shorter form for the running head}
\typeout{with \string\mlptitlerunning{...}\space prior to \string\begin{document}}%
\typeout{*******************************************************}%
\typeout{}%
\typeout{}%
% set default running title
\chead{\small\bf Title Suppressed Due to Excessive Size}%
\else
% 'everything' fine, set provided running title
\chead{\small\bf\@mlptitlerunning}%
\fi
% no running title on the first page of the paper
\thispagestyle{empty}
%%%%%%%%%%%%%%%%%%%% Kristian Kersting %%%%%%%%%%%%%%%%%%%%%%%%%
%end%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
{\center\baselineskip 18pt
\toptitlebar{\Large\bf #1}\bottomtitlebar}
}
\gdef\icmlfullauthorlist{}
\newcommand\addstringtofullauthorlist{\g@addto@macro\icmlfullauthorlist}
\newcommand\addtofullauthorlist[1]{%
\ifdefined\icmlanyauthors%
\addstringtofullauthorlist{, #1}%
\else%
\addstringtofullauthorlist{#1}%
\gdef\icmlanyauthors{1}%
\fi%
\ifdefined\nohyperref\else\ifdefined\hypersetup%
\hypersetup{pdfauthor=\icmlfullauthorlist}%
\fi\fi}
\def\toptitlebar{\hrule height1pt \vskip .25in}
\def\bottomtitlebar{\vskip .22in \hrule height1pt \vskip .3in}
\newenvironment{icmlauthorlist}{%
\setlength\topsep{0pt}
\setlength\parskip{0pt}
\begin{center}
}{%
\end{center}
}
\newcounter{@affiliationcounter}
\newcommand{\@pa}[1]{%
% ``#1''
\ifcsname the@affil#1\endcsname
% do nothing
\else
\ifcsname @icmlsymbol#1\endcsname
% nothing
\else
\stepcounter{@affiliationcounter}%
\newcounter{@affil#1}%
\setcounter{@affil#1}{\value{@affiliationcounter}}%
\fi
\fi%
\ifcsname @icmlsymbol#1\endcsname
\textsuperscript{\csname @icmlsymbol#1\endcsname\,}%
\else
%\expandafter\footnotemark[\arabic{@affil#1}\,]%
\textsuperscript{\arabic{@affil#1}\,}%
\fi
}
%\newcommand{\icmlauthor}[2]{%
%\addtofullauthorlist{#1}%
%#1\@for\theaffil:=#2\do{\pa{\theaffil}}%
%}
\newcommand{\icmlauthor}[2]{%
\ifdefined\isaccepted
\mbox{\bf #1}\,\@for\theaffil:=#2\do{\@pa{\theaffil}} \addtofullauthorlist{#1}%
\else
\ifdefined\@icmlfirsttime
\else
\gdef\@icmlfirsttime{1}
\mbox{\bf Anonymous Authors}\@pa{@anon} \addtofullauthorlist{Anonymous Authors}
\fi
\fi
}
\newcommand{\icmlsetsymbol}[2]{%
\expandafter\gdef\csname @icmlsymbol#1\endcsname{#2}
}
\newcommand{\icmlaffiliation}[2]{%
\ifdefined\isaccepted
\ifcsname the@affil#1\endcsname
\expandafter\gdef\csname @affilname\csname the@affil#1\endcsname\endcsname{#2}%
\else
{\bf AUTHORERR: Error in use of \textbackslash{}icmlaffiliation command. Label ``#1'' not mentioned in some \textbackslash{}icmlauthor\{author name\}\{labels here\} command beforehand. }
\typeout{}%
\typeout{}%
\typeout{*******************************************************}%
\typeout{Affiliation label undefined. }%
\typeout{Make sure \string\icmlaffiliation\space follows }
\typeout{all of \string\icmlauthor\space commands}%
\typeout{*******************************************************}%
\typeout{}%
\typeout{}%
\fi
\else % \isaccepted
% can be called multiple times... it's idempotent
\expandafter\gdef\csname @affilname1\endcsname{Anonymous Institution, Anonymous City, Anonymous Region, Anonymous Country}
\fi
}
\newcommand{\icmlcorrespondingauthor}[2]{
\ifdefined\isaccepted
\ifdefined\icmlcorrespondingauthor@text
\g@addto@macro\icmlcorrespondingauthor@text{, #1 \textless{}#2\textgreater{}}
\else
\gdef\icmlcorrespondingauthor@text{#1 \textless{}#2\textgreater{}}
\fi
\else
\gdef\icmlcorrespondingauthor@text{Anonymous Author \textless{}anon.email@domain.com\textgreater{}}
\fi
}
\newcommand{\icmlEqualContribution}{\textsuperscript{*}Equal contribution }
\newcounter{@affilnum}
\newcommand{\printAffiliationsAndNotice}[1]{%
\stepcounter{@affiliationcounter}%
{\let\thefootnote\relax\footnotetext{\hspace*{-\footnotesep}#1%
\forloop{@affilnum}{1}{\value{@affilnum} < \value{@affiliationcounter}}{
\textsuperscript{\arabic{@affilnum}}\ifcsname @affilname\the@affilnum\endcsname%
\csname @affilname\the@affilnum\endcsname%
\else
{\bf AUTHORERR: Missing \textbackslash{}icmlaffiliation.}
\fi
}.
\ifdefined\icmlcorrespondingauthor@text
Correspondence to: \icmlcorrespondingauthor@text.
\else
{\bf AUTHORERR: Missing \textbackslash{}icmlcorrespondingauthor.}
\fi
\ \\
\Notice@String
}
}
}
%\makeatother
\long\def\icmladdress#1{%
{\bf The \textbackslash{}icmladdress command is no longer used. See the example\_paper PDF .tex for usage of \textbackslash{}icmlauther and \textbackslash{}icmlaffiliation.}
}
%% keywords as first class citizens
\def\icmlkeywords#1{%
% \ifdefined\isaccepted \else
% \par {\bf Keywords:} #1%
% \fi
% \ifdefined\nohyperref\else\ifdefined\hypersetup
% \hypersetup{pdfkeywords={#1}}
% \fi\fi
% \ifdefined\isaccepted \else
% \par {\bf Keywords:} #1%
% \fi
\ifdefined\nohyperref\else\ifdefined\hypersetup
\hypersetup{pdfkeywords={#1}}
\fi\fi
}
% modification to natbib citations
\setcitestyle{authoryear,round,citesep={;},aysep={,},yysep={;}}
% Redefinition of the abstract environment.
\renewenvironment{abstract}
{%
% Insert the ``appearing in'' copyright notice.
%\@copyrightspace
\centerline{\large\bf Abstract}
\vspace{-0.12in}\begin{quote}}
{\par\end{quote}\vskip 0.12in}
% numbered section headings with different treatment of numbers
\def\@startsection#1#2#3#4#5#6{\if@noskipsec \leavevmode \fi
\par \@tempskipa #4\relax
\@afterindenttrue
% Altered the following line to indent a section's first paragraph.
% \ifdim \@tempskipa <\z@ \@tempskipa -\@tempskipa \@afterindentfalse\fi
\ifdim \@tempskipa <\z@ \@tempskipa -\@tempskipa \fi
\if@nobreak \everypar{}\else
\addpenalty{\@secpenalty}\addvspace{\@tempskipa}\fi \@ifstar
{\@ssect{#3}{#4}{#5}{#6}}{\@dblarg{\@sict{#1}{#2}{#3}{#4}{#5}{#6}}}}
\def\@sict#1#2#3#4#5#6[#7]#8{\ifnum #2>\c@secnumdepth
\def\@svsec{}\else
\refstepcounter{#1}\edef\@svsec{\csname the#1\endcsname}\fi
\@tempskipa #5\relax
\ifdim \@tempskipa>\z@
\begingroup #6\relax
\@hangfrom{\hskip #3\relax\@svsec.~}{\interlinepenalty \@M #8\par}
\endgroup
\csname #1mark\endcsname{#7}\addcontentsline
{toc}{#1}{\ifnum #2>\c@secnumdepth \else
\protect\numberline{\csname the#1\endcsname}\fi
#7}\else
\def\@svsechd{#6\hskip #3\@svsec #8\csname #1mark\endcsname
{#7}\addcontentsline
{toc}{#1}{\ifnum #2>\c@secnumdepth \else
\protect\numberline{\csname the#1\endcsname}\fi
#7}}\fi
\@xsect{#5}}
\def\@sect#1#2#3#4#5#6[#7]#8{\ifnum #2>\c@secnumdepth
\def\@svsec{}\else
\refstepcounter{#1}\edef\@svsec{\csname the#1\endcsname\hskip 0.4em }\fi
\@tempskipa #5\relax
\ifdim \@tempskipa>\z@
\begingroup #6\relax
\@hangfrom{\hskip #3\relax\@svsec}{\interlinepenalty \@M #8\par}
\endgroup
\csname #1mark\endcsname{#7}\addcontentsline
{toc}{#1}{\ifnum #2>\c@secnumdepth \else
\protect\numberline{\csname the#1\endcsname}\fi
#7}\else
\def\@svsechd{#6\hskip #3\@svsec #8\csname #1mark\endcsname
{#7}\addcontentsline
{toc}{#1}{\ifnum #2>\c@secnumdepth \else
\protect\numberline{\csname the#1\endcsname}\fi
#7}}\fi
\@xsect{#5}}
% section headings with less space above and below them
\def\thesection {\arabic{section}}
\def\thesubsection {\thesection.\arabic{subsection}}
\def\section{\@startsection{section}{1}{\z@}{-0.12in}{0.02in}
{\large\bf\raggedright}}
\def\subsection{\@startsection{subsection}{2}{\z@}{-0.10in}{0.01in}
{\normalsize\bf\raggedright}}
\def\subsubsection{\@startsection{subsubsection}{3}{\z@}{-0.08in}{0.01in}
{\normalsize\sc\raggedright}}
\def\paragraph{\@startsection{paragraph}{4}{\z@}{1.5ex plus
0.5ex minus .2ex}{-1em}{\normalsize\bf}}
\def\subparagraph{\@startsection{subparagraph}{5}{\z@}{1.5ex plus
0.5ex minus .2ex}{-1em}{\normalsize\bf}}
% Footnotes
\footnotesep 6.65pt %
\skip\footins 9pt
\def\footnoterule{\kern-3pt \hrule width 0.8in \kern 2.6pt }
\setcounter{footnote}{0}
% Lists and paragraphs
\parindent 0pt
\topsep 4pt plus 1pt minus 2pt
\partopsep 1pt plus 0.5pt minus 0.5pt
\itemsep 2pt plus 1pt minus 0.5pt
\parsep 2pt plus 1pt minus 0.5pt
\parskip 6pt
\leftmargin 2em \leftmargini\leftmargin \leftmarginii 2em
\leftmarginiii 1.5em \leftmarginiv 1.0em \leftmarginv .5em
\leftmarginvi .5em
\labelwidth\leftmargini\advance\labelwidth-\labelsep \labelsep 5pt
\def\@listi{\leftmargin\leftmargini}
\def\@listii{\leftmargin\leftmarginii
\labelwidth\leftmarginii\advance\labelwidth-\labelsep
\topsep 2pt plus 1pt minus 0.5pt
\parsep 1pt plus 0.5pt minus 0.5pt
\itemsep \parsep}
\def\@listiii{\leftmargin\leftmarginiii
\labelwidth\leftmarginiii\advance\labelwidth-\labelsep
\topsep 1pt plus 0.5pt minus 0.5pt
\parsep \z@ \partopsep 0.5pt plus 0pt minus 0.5pt
\itemsep \topsep}
\def\@listiv{\leftmargin\leftmarginiv
\labelwidth\leftmarginiv\advance\labelwidth-\labelsep}
\def\@listv{\leftmargin\leftmarginv
\labelwidth\leftmarginv\advance\labelwidth-\labelsep}
\def\@listvi{\leftmargin\leftmarginvi
\labelwidth\leftmarginvi\advance\labelwidth-\labelsep}
\abovedisplayskip 7pt plus2pt minus5pt%
\belowdisplayskip \abovedisplayskip
\abovedisplayshortskip 0pt plus3pt%
\belowdisplayshortskip 4pt plus3pt minus3pt%
% Less leading in most fonts (due to the narrow columns)
% The choices were between 1-pt and 1.5-pt leading
\def\@normalsize{\@setsize\normalsize{11pt}\xpt\@xpt}
\def\small{\@setsize\small{10pt}\ixpt\@ixpt}
\def\footnotesize{\@setsize\footnotesize{10pt}\ixpt\@ixpt}
\def\scriptsize{\@setsize\scriptsize{8pt}\viipt\@viipt}
\def\tiny{\@setsize\tiny{7pt}\vipt\@vipt}
\def\large{\@setsize\large{14pt}\xiipt\@xiipt}
\def\Large{\@setsize\Large{16pt}\xivpt\@xivpt}
\def\LARGE{\@setsize\LARGE{20pt}\xviipt\@xviipt}
\def\huge{\@setsize\huge{23pt}\xxpt\@xxpt}
\def\Huge{\@setsize\Huge{28pt}\xxvpt\@xxvpt}
% Revised formatting for figure captions and table titles.
\newsavebox\newcaptionbox\newdimen\newcaptionboxwid
\long\def\@makecaption#1#2{
\vskip 10pt
\baselineskip 11pt
\setbox\@tempboxa\hbox{#1. #2}
\ifdim \wd\@tempboxa >\hsize
\sbox{\newcaptionbox}{\small\sl #1.~}
\newcaptionboxwid=\wd\newcaptionbox
\usebox\newcaptionbox {\footnotesize #2}
% \usebox\newcaptionbox {\small #2}
\else
\centerline{{\small\sl #1.} {\small #2}}
\fi}
\def\fnum@figure{Figure \thefigure}
\def\fnum@table{Table \thetable}
% Strut macros for skipping spaces above and below text in tables.
\def\abovestrut#1{\rule[0in]{0in}{#1}\ignorespaces}
\def\belowstrut#1{\rule[-#1]{0in}{#1}\ignorespaces}
\def\abovespace{\abovestrut{0.20in}}
\def\aroundspace{\abovestrut{0.20in}\belowstrut{0.10in}}
\def\belowspace{\belowstrut{0.10in}}
% Various personal itemization commands.
\def\texitem#1{\par\noindent\hangindent 12pt
\hbox to 12pt {\hss #1 ~}\ignorespaces}
\def\icmlitem{\texitem{$\bullet$}}
% To comment out multiple lines of text.
\long\def\comment#1{}
%% Line counter (not in final version). Adapted from NIPS style file by Christoph Sawade
% Vertical Ruler
% This code is, largely, from the CVPR 2010 conference style file
% ----- define vruler
\makeatletter
\newbox\icmlrulerbox
\newcount\icmlrulercount
\newdimen\icmlruleroffset
\newdimen\cv@lineheight
\newdimen\cv@boxheight
\newbox\cv@tmpbox
\newcount\cv@refno
\newcount\cv@tot
% NUMBER with left flushed zeros \fillzeros[<WIDTH>]<NUMBER>
\newcount\cv@tmpc@ \newcount\cv@tmpc
\def\fillzeros[#1]#2{\cv@tmpc@=#2\relax\ifnum\cv@tmpc@<0\cv@tmpc@=-\cv@tmpc@\fi
\cv@tmpc=1 %
\loop\ifnum\cv@tmpc@<10 \else \divide\cv@tmpc@ by 10 \advance\cv@tmpc by 1 \fi
\ifnum\cv@tmpc@=10\relax\cv@tmpc@=11\relax\fi \ifnum\cv@tmpc@>10 \repeat
\ifnum#2<0\advance\cv@tmpc1\relax-\fi
\loop\ifnum\cv@tmpc<#1\relax0\advance\cv@tmpc1\relax\fi \ifnum\cv@tmpc<#1 \repeat
\cv@tmpc@=#2\relax\ifnum\cv@tmpc@<0\cv@tmpc@=-\cv@tmpc@\fi \relax\the\cv@tmpc@}%
% \makevruler[<SCALE>][<INITIAL_COUNT>][<STEP>][<DIGITS>][<HEIGHT>]
\def\makevruler[#1][#2][#3][#4][#5]{
\begingroup\offinterlineskip
\textheight=#5\vbadness=10000\vfuzz=120ex\overfullrule=0pt%
\global\setbox\icmlrulerbox=\vbox to \textheight{%
{
\parskip=0pt\hfuzz=150em\cv@boxheight=\textheight
\cv@lineheight=#1\global\icmlrulercount=#2%
\cv@tot\cv@boxheight\divide\cv@tot\cv@lineheight\advance\cv@tot2%
\cv@refno1\vskip-\cv@lineheight\vskip1ex%
\loop\setbox\cv@tmpbox=\hbox to0cm{ % side margin
\hfil {\hfil\fillzeros[#4]\icmlrulercount}
}%
\ht\cv@tmpbox\cv@lineheight\dp\cv@tmpbox0pt\box\cv@tmpbox\break
\advance\cv@refno1\global\advance\icmlrulercount#3\relax
\ifnum\cv@refno<\cv@tot\repeat
}
}
\endgroup
}%
\makeatother
% ----- end of vruler
% \makevruler[<SCALE>][<INITIAL_COUNT>][<STEP>][<DIGITS>][<HEIGHT>]
\def\icmlruler#1{\makevruler[12pt][#1][1][3][\textheight]\usebox{\icmlrulerbox}}
\AddToShipoutPicture{%
\icmlruleroffset=\textheight
\advance\icmlruleroffset by 5.2pt % top margin
\color[rgb]{.7,.7,.7}
\ifdefined\isaccepted \else
\AtTextUpperLeft{%
\put(\LenToUnit{-35pt},\LenToUnit{-\icmlruleroffset}){%left ruler
\icmlruler{\icmlrulercount}}
% \put(\LenToUnit{1.04\textwidth},\LenToUnit{-\icmlruleroffset}){%right ruler
% \icmlruler{\icmlrulercount}}
}
\fi
}
\endinput

1246
report/natbib.sty Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,43 @@
import numpy as np
from mlp.layers import BatchNormalizationLayer
import argparse
parser = argparse.ArgumentParser(description='Welcome to GAN-Shot-Learning script')
parser.add_argument('--student_id', nargs="?", type=str, help='Your student id in the format "sxxxxxxx"')
args = parser.parse_args()
student_id = args.student_id
def generate_inputs(student_id):
student_number = student_id
tests = np.arange(96).reshape((2, 3, 4, 4))
tests[:, 0, :, :] = float(student_number[1:3]) / 10 - 5
tests[:, :, 1, :] = float(student_number[3:5]) / 10 - 5
tests[:, 2, :, :] = float(student_number[5:7]) / 10 - 5
tests[0, 1, :, :] = float(student_number[7]) / 10 - 5
return tests
test_inputs = generate_inputs(student_id)
test_inputs = np.reshape(test_inputs, newshape=(2, -1))
test_grads_wrt_outputs = np.arange(-48, 48).reshape((2, -1))
#produce BatchNorm Layer fprop and bprop
activation_layer = BatchNormalizationLayer(input_dim=48)
beta = np.array(48*[0.3])
gamma = np.array(48*[0.8])
activation_layer.params = [gamma, beta]
BN_fprop = activation_layer.fprop(test_inputs)
BN_bprop = activation_layer.bprop(
test_inputs, BN_fprop, test_grads_wrt_outputs)
BN_grads_wrt_params = activation_layer.grads_wrt_params(
test_inputs, test_grads_wrt_outputs)
test_output = "BatchNormalization:\nFprop: {}\nBprop: {}\nGrads_wrt_params: {}\n"\
.format(BN_fprop, BN_bprop, BN_grads_wrt_params)
with open("{}_batchnorm_test_file.txt".format(student_id), "w+") as out_file:
out_file.write(test_output)

View File

@ -0,0 +1,46 @@
import numpy as np
from mlp.layers import ConvolutionalLayer
import argparse
parser = argparse.ArgumentParser(description='Welcome to GAN-Shot-Learning script')
parser.add_argument('--student_id', nargs="?", type=str, help='Your student id in the format "sxxxxxxx"')
args = parser.parse_args()
student_id = args.student_id
def generate_inputs(student_id):
student_number = student_id
tests = np.arange(96).reshape((2, 3, 4, 4))
tests[:, 0, :, :] = float(student_number[1:3]) / 10 - 5
tests[:, :, 1, :] = float(student_number[3:5]) / 10 - 5
tests[:, 2, :, :] = float(student_number[5:7]) / 10 - 5
tests[0, 1, :, :] = float(student_number[7]) / 10 - 5
return tests
test_inputs = generate_inputs(student_id)
test_grads_wrt_outputs = np.arange(-20, 16).reshape((2, 2, 3, 3))
inputs = np.arange(96).reshape((2, 3, 4, 4))
kernels = np.arange(-12, 12).reshape((2, 3, 2, 2))
biases = np.arange(2)
#produce ConvolutionalLayer fprop, bprop and grads_wrt_params
activation_layer = ConvolutionalLayer(num_input_channels=3, num_output_channels=2, input_dim_1=4, input_dim_2=4,
kernel_dim_1=2, kernel_dim_2=2)
activation_layer.params = [kernels, biases]
conv_fprop = activation_layer.fprop(test_inputs)
conv_bprop = activation_layer.bprop(
test_inputs, conv_fprop, test_grads_wrt_outputs)
conv_grads_wrt_params = activation_layer.grads_wrt_params(test_inputs,
test_grads_wrt_outputs)
test_output = "ConvolutionalLayer:\nFprop: {}\nBprop: {}\n" \
"Grads_wrt_params: {}\n".format(conv_fprop,
conv_bprop,
conv_grads_wrt_params)
with open("{}_conv_test_file.txt".format(student_id), "w+") as out_file:
out_file.write(test_output)

BIN
spec/coursework1.pdf Normal file

Binary file not shown.

493
spec/coursework1.tex Normal file
View File

@ -0,0 +1,493 @@
\documentclass[11pt,]{article}
\usepackage[T1]{fontenc}
\usepackage{amssymb,amsmath}
\usepackage{txfonts}
\usepackage{microtype}
\usepackage{amssymb,amsmath}
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{natbib}
\usepackage{paralist}
\usepackage{hyperref}
\usepackage{url}
\urlstyle{same}
\usepackage{color}
\usepackage{fancyvrb}
\newcommand{\VerbBar}{|}
\newcommand{\VERB}{\Verb[commandchars=\\\{\}]}
\DefineVerbatimEnvironment{Highlighting}{Verbatim}{commandchars=\\\{\}}
% Add ',fontsize=\small' for more characters per line
\newenvironment{Shaded}{}{}
\newcommand{\KeywordTok}[1]{\textcolor[rgb]{0.00,0.44,0.13}{\textbf{{#1}}}}
\newcommand{\DataTypeTok}[1]{\textcolor[rgb]{0.56,0.13,0.00}{{#1}}}
\newcommand{\DecValTok}[1]{\textcolor[rgb]{0.25,0.63,0.44}{{#1}}}
\newcommand{\BaseNTok}[1]{\textcolor[rgb]{0.25,0.63,0.44}{{#1}}}
\newcommand{\FloatTok}[1]{\textcolor[rgb]{0.25,0.63,0.44}{{#1}}}
\newcommand{\CharTok}[1]{\textcolor[rgb]{0.25,0.44,0.63}{{#1}}}
\newcommand{\StringTok}[1]{\textcolor[rgb]{0.25,0.44,0.63}{{#1}}}
\newcommand{\CommentTok}[1]{\textcolor[rgb]{0.38,0.63,0.69}{\textit{{#1}}}}
\newcommand{\OtherTok}[1]{\textcolor[rgb]{0.00,0.44,0.13}{{#1}}}
\newcommand{\AlertTok}[1]{\textcolor[rgb]{1.00,0.00,0.00}{\textbf{{#1}}}}
\newcommand{\FunctionTok}[1]{\textcolor[rgb]{0.02,0.16,0.49}{{#1}}}
\newcommand{\RegionMarkerTok}[1]{{#1}}
\newcommand{\ErrorTok}[1]{\textcolor[rgb]{1.00,0.00,0.00}{\textbf{{#1}}}}
\newcommand{\NormalTok}[1]{{#1}}
\hypersetup{breaklinks=true,
pdfauthor={},
pdftitle={},
colorlinks=true,
citecolor=blue,
urlcolor=blue,
linkcolor=magenta,
pdfborder={0 0 0}}
\setlength{\parindent}{0pt}
\setlength{\parskip}{6pt plus 2pt minus 1pt}
\setlength{\emergencystretch}{3em} % prevent overfull lines
\setcounter{secnumdepth}{0}
\usepackage[a4paper,body={170mm,250mm},top=25mm,left=25mm]{geometry}
\usepackage[sf,bf,small]{titlesec}
\usepackage{fancyhdr}
\pagestyle{fancy}
\lhead{\sffamily MLP Coursework 1}
\rhead{\sffamily Due: 30 October 2017}
\cfoot{\sffamily \thepage}
\author{}
\date{}
\DeclareMathOperator{\softmax}{softmax}
\DeclareMathOperator{\sigmoid}{sigmoid}
\DeclareMathOperator{\sgn}{sgn}
\DeclareMathOperator{\relu}{relu}
\DeclareMathOperator{\lrelu}{lrelu}
\DeclareMathOperator{\elu}{elu}
\DeclareMathOperator{\selu}{selu}
\DeclareMathOperator{\maxout}{maxout}
\begin{document}
\section{Machine Learning Practical: Coursework
1}
\label{sec:machine-learning-practical-coursework-1}
\textbf{Release date: Monday 16th October 2017}\\
\textbf{Due date: 16:00 Monday 30th October 2017}
\subsection{Introduction}
\label{sec:introduction}
This coursework is concerned with training multi-layer networks to
address the MNIST digit classification problem. It builds on the
material covered in the first three lab notebooks and the first four
lectures. \textbf{You should complete the first three lab
notebooks before starting the coursework.} The aim of the coursework is
to investigate variants of the ReLU activation function for hidden units
in multi-layer networks, with respect to the validation set accuracies
achieved by the trained models.
\subsection{Code}
\label{sec:code}
You should run all of the experiments for the coursework inside the
Conda environment you set up in first labs. The code for the coursework is available on the course
\href{https://github.com/CSTR-Edinburgh/mlpractical/}{Github repository}
on a branch \texttt{mlp2017-8/coursework1}. To create a local working
copy of this branch in your local repository you need to do the
following.
\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\itemsep1pt\parskip0pt\parsep0pt
\item
Make sure all modified files on the branch you are currently have been
committed
(\href{https://github.com/CSTR-Edinburgh/mlpractical/blob/mlp2017-8/master/notes/getting-started-in-a-lab.md}{see
details here} if you are unsure how to do this).
\item
Fetch changes to the upstream \texttt{origin} repository by running\\
\texttt{git fetch origin}
\item
Checkout a new local branch from the fetched branch using\\
\texttt{git checkout -b coursework1 origin/mlp2017-8/coursework1}
\end{enumerate}
You will now have a new branch in your local repository with all the
code necessary for the coursework in it. In the \texttt{notebooks}
directory there is a notebook \texttt{Coursework\_1.ipynb} which is
intended as a starting point for structuring the code for your
experiments. You will probably want to add additional code cells to this
as you go along and run new experiments (e.g.~doing each new training
run in a new cell). You may also wish to use Markdown cells to keep
notes on the results of experiments.
There will also be a \verb+report+ directory which contains the LaTeX template and style files for the report. You should copy all these files into the directory which will contain your report.
\subsection{Standard network
architecture}
\label{sec:standard-network-architecture}
To make the results of your experiments more easily comparable, you
should try to keep as many of the free choices in the specification of
the model and learning problem the same across different experiments. If
you vary only a small number of aspects of the problem at a time this
will make it easier to interpret the effect of those changes.
In these experiments you should use a multi-layer network with two hidden layers
(corresponding to three affine transformations) and a softmax output layer. The initial baseline
should use a sigmoid activation function for the hidden layer; other experiments will explore
different nonlinear activation functions. The hidden layers should each contain 100 hidden units.
The baseline network can this be defined with the following code (which should be familiar to you from Lab 3):
\begin{Shaded}
\begin{Highlighting}[]
\CharTok{import} \NormalTok{numpy }\CharTok{as} \NormalTok{np}
\CharTok{from} \NormalTok{mlp.layers }\CharTok{import} \NormalTok{AffineLayer, SoftmaxLayer, SigmoidLayer}
\CharTok{from} \NormalTok{mlp.errors }\CharTok{import} \NormalTok{CrossEntropySoftmaxError}
\CharTok{from} \NormalTok{mlp.models }\CharTok{import} \NormalTok{MultipleLayerModel}
\CharTok{from} \NormalTok{mlp.initialisers }\CharTok{import} \NormalTok{ConstantInit, GlorotUniformInit}
\NormalTok{seed = }\DecValTok{10102016}
\NormalTok{rng = np.random.RandomState(seed)}
\NormalTok{input_dim, output_dim, hidden_dim = }\DecValTok{784}\NormalTok{, }\DecValTok{10}\NormalTok{, }\DecValTok{100}
\NormalTok{weights_init = GlorotUniformInit(rng=rng)}
\NormalTok{biases_init = ConstantInit(}\DecValTok{0}\NormalTok{.)}
\NormalTok{model = MultipleLayerModel([}
\NormalTok{AffineLayer(input_dim, hidden_dim, weights_init, biases_init),}
\NormalTok{SigmoidLayer(),}
\NormalTok{AffineLayer(hidden_dim, hidden_dim, weights_init, biases_init),}
\NormalTok{SigmoidLayer(),}
\NormalTok{AffineLayer(hidden_dim, output_dim, weights_init, biases_init)}
\NormalTok{])}
\NormalTok{error = CrossEntropySoftmaxError()}
\end{Highlighting}
\end{Shaded}
Here we are using the Glorot initialisation scheme, discussed in lecture 4. In part 2B of this coursework you will explore the effect of different initialisation schemes.
The above code creates a network using sigmoid hidden layers; you should modify it to also create a network using ReLU activation functions (see Lab 3). These two networks will form your baseline systems.
As well as standardising the network architecture, you should also fix
the hyperparameters of the training procedure not being investigated to
be the same across different runs. In particular for all experiments you
should use a \textbf{batch size of 50 and train for a total of 100
epochs} for all reported runs. You may of course use a smaller number of
epochs for initial pilot runs.
\subsection{Part 1: Implementing Activation Functions}
\label{sec:actfns}
In the first part of the assignment you will implement three further
activation functions, each of which is related to ReLU \citep{nair2010rectified}: Leaky ReLU, ELU (Exponential Linear Unit), and SELU (Scaled Exponential Linear Unit). Each of these units defines an activation function for which $f(x) = x$ when $x>0$, as for ReLU, but avoid having a zero gradient when $x<0$.
\textbf{Leaky ReLU} ($\lrelu(x)$) \citep{maas2013rectifier} has the following form:
\begin{equation}
\lrelu(x) =
\begin{cases}
\alpha x & \quad \text{if } x \leq 0 \\
x & \quad \text{if } x > 0 \\
\end{cases}
\end{equation}
Where $\alpha$ is a constant; typically $\alpha=0.01$, and you can use this value in this coursework. Note that $\alpha$ can be taken to be a parameter which is learned by back-propagation along with the weights and biases -- this is called Parametric ReLU (PReLU).
\textbf{ELU} ($\elu(x)$) \citep{clevert2015fast} has the following form:
\begin{equation}
\elu(x) =
\begin{cases}
\alpha (\exp(x) - 1) & \quad \text{if } x \leq 0 \\
x & \quad \text{if } x > 0 \\
\end{cases}
\end{equation}
Again $\alpha$ can be taken as a constant or a tunable parameter. Typically $\alpha=1$, which results in a smooth function, and you can use this value in this coursework.
\textbf{SELU} ($\selu(x)$) \citep{klambauer2017self} has the following form:
\begin{equation}
\selu(x) =
\lambda \begin{cases}
\alpha (\exp(x) - 1) & \quad \text{if } x \leq 0 \\
x & \quad \text{if } x > 0 \\
\end{cases}
\end{equation}
In the case of SELU, there is a theoretical argument for optimal values of the two parameters: $\alpha \approx 1.6733$ and $\lambda \approx 1.0507$, and you can use these values in this coursework.
\begin{enumerate}
\item Implement each of these activations function as classes \verb+LeakyReluLayer+, \verb+EluLayer+ and \verb+SeluLayer+. You need to implement \verb+fprop+ and \verb+bprop+ methods for each class.
\item Verify the correctness of your implementation using the supplied unit tests in \verb+Activation\_Tests.ipynb+
\item Automatically create a test file \verb+sXXXXXXX_test_file.txt+, by running the provided program \verb+generate_inputs.py+ which uses your code for \verb+LeakyReluLayer+, \verb+EluLayer+ and \verb+SeluLayer+ to run your fprop and bprop methods for each layer on a unique test vector generated using your student ID number.
\end{enumerate}
For Part 1 of the coursework you need to submit the test file \verb+sXXXXXXX_test_file.txt+ (where sXXXXXXX is replaced with your student number) created in step 3 above.
\subsection{Part 2: MNIST Experiments}
\label{sec:expts}
In Part 2 of the coursework you will experiment with \verb+LeakyReluLayer+, \verb+EluLayer+ and \verb+SeluLayer+ in multi-layer networks trained on MNIST.
\subsubsection{2A: Comparing activation functions}
In this sub-part you should compare the behaviour of Leaky ReLU, ELU, and SELU activation functions on the MNIST task. Carry out all experiments using 2 hidden layers, with 100 units per hidden layer. You should compare the results with baseline systems of the same architecture using sigmoid units and using ReLU units.
\subsubsection{2B: Deep neural network experiments}
In this subpart you will explore the behaviour of deeper networks. Based on the results of Part 2A, choose one activation function, and compare networks with 2--8 hidden layers, using 100 hidden units per hidden layer.
Also compare the effect of different initialisation strategies, as discussed in lecture 4. First look at the effect of weight initialisation based on
\begin{compactitem}
\item Fan-in: $w_i \sim U(-\sqrt{3/n_{in}}, \sqrt{3/n_{in}}$
\item Fan-out: $w_i \sim U(-\sqrt{3/n_{out}}, \sqrt{3/n_{out}}$
\item Fan-in and Fan-out: $w_i \sim U \left(-\sqrt{6/(n_{in}+n_{out})}, \sqrt{6/(n_{in}+n_{out})}\right)$
\end{compactitem}
where $U$ is the uniform distribution. The first of these corresponds to constraining the estimated variance of a unit to be independent of the number of incoming connections ($n_{in}$); the second to constraining the estimated variance of a unit's gradient to be independent of the number of outgoing connections ($n_{out}$); the third corresponds to Glorot and Bengio's combined initialisation.
Additionally you could also explore the effect of drawing from a Gaussian distribution compared with a uniform distribution. In particular you might like to explore initialising a SELU layer drawing from a Gaussian with mean 0 and variance $1/n_{out}$ as recommended by \cite{klambauer2017self}.
For Part 2 of the coursework you need to write and submit a report, using the template provided, in the directory \verb+report+. Please read the template document \verb+mlp-cw1-template.pdf+ very carefully, as it provides advice and instructions on writing your report. You can use the LaTeX source file \verb+mlp-cw1-template.tex+ as a template for your report (see below, in the section 'Report').
It is highly recommended that you use LaTeX for your report. If you have not used LaTeX previously, now is a good time to learn how to use it!
\subsection{Backing up your work}
\label{sec:backing-up-your-work}
It is \textbf{strongly recommended} you use some method for backing up
your work. Those working in their AFS homespace on DICE will have their
work automatically backed up as part of the
\href{http://computing.help.inf.ed.ac.uk/backups-and-mirrors}{routine
backup} of all user homespaces. If you are working on a personal
computer you should have your own backup method in place (e.g.~saving
additional copies to an external drive, syncing to a cloud service or
pushing commits to your local Git repository to a private repository on
Github). \textbf{Loss of work through failure to back up
\href{http://tinyurl.com/edinflate}{does not consitute a good reason for
late submission}}.
You may \emph{additionally} wish to keep your coursework under version
control in your local Git repository on the \texttt{coursework1} branch.
% This does not need to be limited to the coursework notebook and
% \texttt{mlp} Python modules - you can also add your report document to
% the repository.
If you make regular commits of your work on the coursework this will
allow you to better keep track of the changes you have made and if
necessary revert to previous versions of files and/or restore
accidentally deleted work. This is not however required and you should
note that keeping your work under version control is a distinct issue
from backing up to guard against hard drive failure. If you are working
on a personal computer you should still keep an additional back up of
your work as described above.
\subsection{Report}
\label{sec:report}
Part two of your coursework submission, worth 70 marks will be a report. The directory
\verb+coursework1/report+ contains a template for your report (\verb+mlp-cw1-template.txt+); the generated pdf file (\verb+mlp-cw1-template.pdf+) is also provided, and you should read this file carefully as it contains information about the required structure and experimentation. The template is written in LaTeX, and we strongly recommend that you write your own report using LaTeX, using the supplied document style \verb+mlp2017+ (as in the template).
You should copy the files in the \verb+report+ directory to the directory containing the LaTeX file of your report, as \verb+pdflatex+ will need to access these files when building the pdf document from the LaTeX source file.
Your report should be in a 2-column format, based on the document format used for the ICML conference. The report should be a \textbf{maximum of 6 pages long}, with a further page for references. We will not read or assess any parts of the report beyond the allowed 6+1 pages.
Ideally, all figures should be included in your report file as
\href{https://en.wikipedia.org/wiki/Vector_graphics}{vector graphics}
rather than \href{https://en.wikipedia.org/wiki/Raster_graphics}{raster
files} as this will make sure all detail in the plot is visible.
Matplotlib supports saving high quality figures in a wide range of
common image formats using the
\href{http://matplotlib.org/api/pyplot_api.html\#matplotlib.pyplot.savefig}{\texttt{savefig}}
function. \textbf{You should use \texttt{savefig} rather than copying
the screen-resolution raster images outputted in the notebook.} An
example of using \texttt{savefig} to save a figure as a PDF file (which
can be included as graphics in
\href{https://en.wikibooks.org/wiki/LaTeX/Importing_Graphics}{LaTeX}
compiled with \texttt{pdflatex} and in Apple Pages and
\href{https://support.office.com/en-us/article/Add-a-PDF-to-your-Office-file-74819342-8f00-4ab4-bcbe-0f3df15ab0dc}{Microsoft
Word} documents) is given below.
\begin{Shaded}
\begin{Highlighting}[]
\CharTok{import} \NormalTok{matplotlib.pyplot }\CharTok{as} \NormalTok{plt}
\CharTok{import} \NormalTok{numpy }\CharTok{as} \NormalTok{np}
\CommentTok{# Generate some example data to plot}
\NormalTok{x = np.linspace(}\DecValTok{0}\NormalTok{., }\DecValTok{1}\NormalTok{., }\DecValTok{100}\NormalTok{)}
\NormalTok{y1 = np.sin(}\DecValTok{2}\NormalTok{. * np.pi * x)}
\NormalTok{y2 = np.cos(}\DecValTok{2}\NormalTok{. * np.pi * x)}
\NormalTok{fig_size = (}\DecValTok{6}\NormalTok{, }\DecValTok{3}\NormalTok{) }\CommentTok{# Set figure size in inches (width, height)}
\NormalTok{fig = plt.figure(figsize=fig_size) }\CommentTok{# Create a new figure object}
\NormalTok{ax = fig.add_subplot(}\DecValTok{1}\NormalTok{, }\DecValTok{1}\NormalTok{, }\DecValTok{1}\NormalTok{) }\CommentTok{# Add a single axes to the figure}
\CommentTok{# Plot lines giving each a label for the legend and setting line width to 2}
\NormalTok{ax.plot(x, y1, linewidth=}\DecValTok{2}\NormalTok{, label=}\StringTok{'$y = \textbackslash{}sin(2\textbackslash{}pi x)$'}\NormalTok{)}
\NormalTok{ax.plot(x, y2, linewidth=}\DecValTok{2}\NormalTok{, label=}\StringTok{'$y = \textbackslash{}cos(2\textbackslash{}pi x)$'}\NormalTok{)}
\CommentTok{# Set the axes labels. Can use LaTeX in labels within $...$ delimiters.}
\NormalTok{ax.set_xlabel(}\StringTok{'$x$'}\NormalTok{, fontsize=}\DecValTok{12}\NormalTok{)}
\NormalTok{ax.set_ylabel(}\StringTok{'$y$'}\NormalTok{, fontsize=}\DecValTok{12}\NormalTok{)}
\NormalTok{ax.grid(}\StringTok{'on'}\NormalTok{) }\CommentTok{# Turn axes grid on}
\NormalTok{ax.legend(loc=}\StringTok{'best'}\NormalTok{, fontsize=}\DecValTok{11}\NormalTok{) }\CommentTok{# Add a legend}
\NormalTok{fig.tight_layout() }\CommentTok{# This minimises whitespace around the axes.}
\NormalTok{fig.savefig(}\StringTok{'file-name.pdf'}\NormalTok{) }\CommentTok{# Save figure to current directory in PDF format}
\end{Highlighting}
\end{Shaded}
(If you are using Libre/OpenOffice you should use Scalable Vector Format
plots instead using \\
\texttt{fig.savefig('file-name.svg')}. If the
document editor you are using for the report does not support including
either PDF or SVG graphics you can instead output high-resolution raster
images using \texttt{fig.savefig('file-name.png', dpi=200)} however note
these files will generally be larger than either SVG or PDF formatted
graphics.)
However to emphasise again: \textbf{It is highly recommended that you use LaTeX.}
If you make use of any any books, articles, web pages or other resources
you should appropriately cite these in your report. You do not need to
cite material from the course lecture slides or lab notebooks.
To create a pdf file \verb+mlp-cw1-template.pdf+ from a LaTeX source file (\verb+mlp-cw1-template.tex+), you can run the following in a terminal:
\begin{verbatim}
pdflatex mlp-cw1-template
bibtex mlp-cw1-template
pdflatex mlp-cw1-template
pdflatex mlp-cw1-template
\end{verbatim}
(Yes, you have to run pdflatex multiple times, in order for latex to construct the internal document references.)
An alternative, simpler approach uses the \verb+latexmk+ program:
\begin{verbatim}
latexmk -pdf mlp-cw1-template
\end{verbatim}
It is worth learning how to use LaTeX effectively, as it is particularly powerful for mathematical and academic writing. There are many tutorials on the web.
\subsection{Mechanics}
\label{sec:mechanics}
\textbf{Marks:}
This assignment will be assessed out of 100 marks and
forms 10\% of your final grade for the course.
\textbf{Academic conduct:}
Assessed work is subject to University
regulations on academic
conduct:\\\url{http://web.inf.ed.ac.uk/infweb/admin/policies/academic-misconduct}
\textbf{Submission:}
You can submit more than once up until the submission deadline. All
submissions are timestamped automatically. Identically named files
will overwrite earlier submitted versions, so we will mark the latest
submission that comes in before the deadline.
If you submit anything before the deadline, you may not resubmit
afterward. (This policy allows us to begin marking submissions
immediately after the deadline, without having to worry that some may
need to be re-marked).
If you do not submit anything before the deadline, you may submit {\em
exactly once} after the deadline, and a late penalty will be applied
to this submission unless you have received an approved extension.
Please be aware that late submissions may receive lower priority for
marking, and marks may not be returned within the same timeframe as
for on-time submissions.
{\em Warning:} Unfortunately the \verb+submit+ command will technically
allow you to submit late even if you submitted before the deadline
(i.e.\ it does not enforce the above policy). Don't do this! We will
mark the version that we retrieve just after the deadline, and (even
worse) you may still be penalized for submitting late because the
timestamp will update.
For additional information about late penalties and extension
requests, see the School web page below. Do {\bf not} email any course
staff directly about extension requests; you must follow the
instructions on the web page.
\url{http://web.inf.ed.ac.uk/infweb/student-services/ito/admin/coursework-projects/late-coursework-extension-requests}
\textbf{Late submission penalty:}
Following the University guidelines,
late coursework submitted without an authorised extension will be
recorded as late and the following penalties will apply: 5
percentage points will be deducted for every calendar day or part
thereof it is late, up to a maximum of 7 calendar days. After this
time a mark of zero will be recorded.
\subsection{Submission}
\label{sec:submission}
Your coursework submission should be done electronically using the
\href{http://computing.help.inf.ed.ac.uk/submit}{\texttt{submit}}
command available on DICE machines.
Your submission should include
\begin{itemize}
\itemsep1pt\parskip0pt\parsep0pt
\item
the unit test file generated in part 1, \verb+sXXXXXXX_test_file.txt+, where your student number replaces \verb+sXXXXXXX+
\item
your completed report as a PDF file, using the provided template
\item
the notebook (\verb+.ipynb+) file you used to run the experiments in
\item
and your local version of the \texttt{mlp} code including any changes
you made to the modules (\texttt{.py} files).
\end{itemize}
You should copy all of the files to a single directory, \verb+coursework1+, e.g.
\begin{verbatim}
mkdir coursework1
cp notebooks/Coursework_1.ipynb mlp/*.py coursework1
cp reports/coursework1.pdf reports/sXXXXXXX_test_file.txt coursework1
\end{verbatim}
and then submit this directory using
\begin{verbatim}
submit mlp cw1 coursework1
\end{verbatim}
The \texttt{submit} command will prompt you with the details of the
submission including the name of the files / directories you are
submitting and the name of the course and exercise you are submitting
for and ask you to check if these details are correct. You should check
these carefully and reply \texttt{y} to submit if you are sure the files
are correct and \texttt{n} otherwise.
You can amend an existing submission by rerunning the \texttt{submit}
command any time up to the deadline. It is therefore a good idea
(particularly if this is your first time using the DICE submit
mechanism) to do an initial run of the \texttt{submit} command early on
and then rerun the command if you make any further updates to your
submisison rather than leaving submission to the last minute.
\subsection{Marking Scheme}
\label{sec:marking-scheme}
\begin{itemize}
\item
Part 1, Activation function implementation (30 marks). Based on your submitted test file.
\item
Part 2, Report (70 marks). The following aspects will contribute to the mark for your report:
\begin{itemize}
\item Abstract - how clear is it? does it cover what is reported in the document
\item Introduction - do you clear outline and motivate the paper, and describe the research questions investigated?
\item Description of activation functions -- is it clear and correct?
\item Experiments -- did you carry out the experiments correctly? are the results clearly presented and described?
\item Interpretation and discussion of results
\item Conclusions
\item Presentation and clarity of report
\end{itemize}
\end{itemize}
\bibliographystyle{plainnat}
\bibliography{cw1-references}
\end{document}

BIN
spec/coursework2.pdf Normal file

Binary file not shown.

408
spec/coursework2.tex Normal file
View File

@ -0,0 +1,408 @@
\documentclass[11pt,]{article}
\usepackage[T1]{fontenc}
\usepackage{amssymb,amsmath}
\usepackage{txfonts}
\usepackage{microtype}
\usepackage{amssymb,amsmath}
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{natbib}
\usepackage{paralist}
\usepackage{hyperref}
\usepackage{url}
\urlstyle{same}
\usepackage{color}
\usepackage{fancyvrb}
\newcommand{\VerbBar}{|}
\newcommand{\VERB}{\Verb[commandchars=\\\{\}]}
\DefineVerbatimEnvironment{Highlighting}{Verbatim}{commandchars=\\\{\}}
% Add ',fontsize=\small' for more characters per line
\newenvironment{Shaded}{}{}
\newcommand{\KeywordTok}[1]{\textcolor[rgb]{0.00,0.44,0.13}{\textbf{{#1}}}}
\newcommand{\DataTypeTok}[1]{\textcolor[rgb]{0.56,0.13,0.00}{{#1}}}
\newcommand{\DecValTok}[1]{\textcolor[rgb]{0.25,0.63,0.44}{{#1}}}
\newcommand{\BaseNTok}[1]{\textcolor[rgb]{0.25,0.63,0.44}{{#1}}}
\newcommand{\FloatTok}[1]{\textcolor[rgb]{0.25,0.63,0.44}{{#1}}}
\newcommand{\CharTok}[1]{\textcolor[rgb]{0.25,0.44,0.63}{{#1}}}
\newcommand{\StringTok}[1]{\textcolor[rgb]{0.25,0.44,0.63}{{#1}}}
\newcommand{\CommentTok}[1]{\textcolor[rgb]{0.38,0.63,0.69}{\textit{{#1}}}}
\newcommand{\OtherTok}[1]{\textcolor[rgb]{0.00,0.44,0.13}{{#1}}}
\newcommand{\AlertTok}[1]{\textcolor[rgb]{1.00,0.00,0.00}{\textbf{{#1}}}}
\newcommand{\FunctionTok}[1]{\textcolor[rgb]{0.02,0.16,0.49}{{#1}}}
\newcommand{\RegionMarkerTok}[1]{{#1}}
\newcommand{\ErrorTok}[1]{\textcolor[rgb]{1.00,0.00,0.00}{\textbf{{#1}}}}
\newcommand{\NormalTok}[1]{{#1}}
\hypersetup{breaklinks=true,
pdfauthor={},
pdftitle={},
colorlinks=true,
citecolor=blue,
urlcolor=blue,
linkcolor=magenta,
pdfborder={0 0 0}}
\setlength{\parindent}{0pt}
\setlength{\parskip}{6pt plus 2pt minus 1pt}
\setlength{\emergencystretch}{3em} % prevent overfull lines
\setcounter{secnumdepth}{1}
\usepackage[a4paper,body={170mm,250mm},top=25mm,left=25mm]{geometry}
\usepackage[sf,bf,small]{titlesec}
\usepackage{fancyhdr}
\pagestyle{fancy}
\lhead{\sffamily MLP Coursework 2}
\rhead{\sffamily Due: 28 November 2017}
\cfoot{\sffamily \thepage}
\author{}
\date{}
\DeclareMathOperator{\softmax}{softmax}
\DeclareMathOperator{\sigmoid}{sigmoid}
\DeclareMathOperator{\sgn}{sgn}
\DeclareMathOperator{\relu}{relu}
\DeclareMathOperator{\lrelu}{lrelu}
\DeclareMathOperator{\elu}{elu}
\DeclareMathOperator{\selu}{selu}
\DeclareMathOperator{\maxout}{maxout}
\begin{document}
\begin{center}
\textsf{\textbf{\Large Machine Learning Practical: Coursework 2}}
\bigskip
\textbf{Release date: Monday 6th November 2017}
\textbf{Due date: 16:00 Tuesday 28th November 2017}
\end{center}
\section{Introduction}
\label{sec:introduction}
% This coursework is concerned with training multi-layer networks to
% address the MNIST digit classification problem. It builds on the
% material covered in the first three lab notebooks and the first four
% lectures. \textbf{You should complete the first three lab
% notebooks before starting the coursework.} The aim of the coursework is
% to investigate variants of the ReLU activation function for hidden units
% in multi-layer networks, with respect to the validation set accuracies
% achieved by the trained models.
The aim of this coursework is to further explore the classification of images of handwritten digits using neural networks. We'll be using an extended version of the MNIST database, the EMNIST Balanced dataset, described in Section~\ref{sec:emnist}. Part A of the coursework will consist of building baseline deep neural networks for the EMNIST classification task, implementation and experimentation of the Adam and RMSProp learning rules, and implementation and experimentation of Batch Normalisation. Part B will concern implementation and experimentation of convolutional networks. As with the previous coursework, you will need to hand in test files generated from your code, and a report.
\section{Dataset}
\label{sec:emnist}
In this coursework we shall use the EMNIST (Extended MNIST) Balanced dataset, \url{https://www.nist.gov/itl/iad/image-group/emnist-dataset} \citep{cohen2017emnist}. EMNIST extends MNIST by including images of handwritten letters (upper and lower case) as well as handwritten digits. Both EMNIST and MNIST are extracted from the same underlying dataset, referred to as NIST Special Database 19. Both use the same conversion process resulting in centred images of dimension 28$\times$28.
Although there are 62 potential classes for EMNIST (10 digits, 26 lower case letters, and 26 upper case letters) we shall use a reduced label set of 47 different labels. This is because of confusions which arise when trying to discriminate upper-case and lower-case versions of the same letter, following the data conversion process. In the 47 label set, upper- and lower-case labels are merged for the following letters: C, I, J, K, L, M, O, P, S, U, V, W, X, Y and Z.
The training set for Balanced EMNIST has about twice the number of examples as MNIST, thus you should expect the run-time of your experiments to be about twice as long. The expected accuracy rates are lower for EMNIST than for MNIST (as EMNIST has more classes, and more confusable examples), and differences in accuracy between different systems should be larger. See \citet{cohen2017emnist} for some baseline results on EMNIST, as well as a description of the dataset.
You don't need to download the EMNIST database from the NIST website, it will be part of the \verb+coursework_2+ branch from the \verb+mlpractical+ Github repository, discussed in Section~\ref{sec:code} below.
\section{Code}
\label{sec:code}
You should run all of the experiments for the coursework inside the
Conda environment you set up in the first labs. The code for the coursework
is available on the course
\href{https://github.com/CSTR-Edinburgh/mlpractical/}{Github repository}
on a branch \verb+mlp2017-8/coursework_2+. To create a local working
copy of this branch in your local repository you need to do the
following.
\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\itemsep1pt\parskip0pt\parsep0pt
\item
Make sure all modified files on the branch you are currently have been
committed
(\href{https://github.com/CSTR-Edinburgh/mlpractical/blob/mlp2017-8/master/notes/getting-started-in-a-lab.md}{see
details here} if you are unsure how to do this).
\item
Fetch changes to the upstream \texttt{origin} repository by running\\
\texttt{git fetch origin}
\item
Checkout a new local branch from the fetched branch using\\
\verb+git checkout -b coursework_2 origin/mlp2017-8/coursework_2+
\end{enumerate}
You will now have a new branch in your local repository with all the
code necessary for the coursework in it.
This branch includes the following additions to your setup:
\begin{itemize}
\itemsep1pt\parskip0pt\parsep0pt
\item
A notebook \verb+BatchNormalizationLayer_tests.ipynb+ which includes
test functions to check the implementations of the BatchNorm layer
\texttt{fprop}, \texttt{bprop} and \texttt{grads\_wrt\_params}
methods. The BatchNormalizationLayer skeleton code can be found in mlp.layers.
The tests use the mlp.layers implementation so be sure to reload your notebook
when you update your mlp.layers code.
\item
A notebook \verb+ConvolutionalLayer_tests.ipynb+ which includes
test functions to check the implementations of the Convolutional layer
\texttt{fprop}, \texttt{bprop} and \texttt{grads\_wrt\_params}
methods. The ConvolutionalLayer skeleton code can be found in mlp.layers.
The tests use the mlp.layers implementation so be sure to reload your notebook
when you update your mlp.layers code.
\item
A new \texttt{ReshapeLayer} class in the \verb+mlp.layers+ module.
When included in a a multiple layer model, this allows the output of
the previous layer to be reshaped before being forward propagated to
the next layer.
\item
A new \texttt{EMNISTDataProvider} class in the \verb+mlp.data_providers+ module.
This class is a small change to the \texttt{MNISTDataProvider} class, linking to the Balanced EMNIST data, and setting the number of classes to 47.
\item
Training, validation, and test sets for the \texttt{EMNIST Balanced} dataset that
you will use in this coursework
\end{itemize}
% In the \texttt{notebooks}
% directory there is a notebook \verb+Coursework_1.ipynb+ which is
% intended as a starting point for structuring the code for your
% experiments. You will probably want to add additional code cells to this
% as you go along and run new experiments (e.g.~doing each new training
% run in a new cell). You may also wish to use Markdown cells to keep
% notes on the results of experiments.
There will also be a \verb+coursework_2/report+ directory which contains the LaTeX template and style files for the report. You should copy all these files into the directory which will contain your report.
\section{Tasks}
\subsection*{Part A: Deep Neural Networks}
In part A of the coursework you will focus on using deep neural networks on EMNIST, and you should implement the Adam and RMSProp learning rules, and Batch Normalisation.
\begin{enumerate}
\item Perform baseline experiments using DNNs trained on EMNIST. Obviously there are a lot things that could be explored including hidden unit activation functions, network architectures, training hyperparameters, and the use of regularisation and dropout. You cannot explore everything and is best to carefully investigate a few things in depth.
\item Implement the RMSProp \citep{tieleman2012rmsprop} and Adam \citep{kingma2015adam} learning rules, by defining new classes inheriting from \texttt{GradientDescendLearningRule} in the \texttt{mlp/learning\_rules.py} module. The \texttt{MomentumLearningRule} class is an example of how to define a learning rules which uses an additional state variable to calculate the updates to the parameters.
\item Perform experiments to compare stochastic gradient descent, RMSProp, and Adam for deep neural network training on EMNIST, building on your earlier baseline experiments.
\item Implement batch normalisation \citep{ioffe2015batch} as a class \verb+BatchNormalizationLayer+. You need to implement \texttt{fprop}, \texttt{bprop} and \texttt{grads\_wrt\_params} methods for this class.
\item Verify the correctness of your implementation using the supplied unit tests in\\\verb+BatchNormalizationLayer_tests.ipynb+.
\item Automatically create a test file \verb+sXXXXXXX_batchnorm_test.txt+, by running the provided program \verb+generate_batchnorm_test.py+ which uses your \verb+BatchNormalizationLayer+ class methods on a unique test vector generated using your student ID number.
\item Perform experiments on EMNIST to investigate the impact of using batch normalisation in deep neural networks, building on your earlier experiments.
\end{enumerate}
In the above experiments you should use the validation set to assess accuracy. Use the test set at the end to assess the accuracy of the deep neural network architecture and training setup that you judge to be the best.
\subsection*{Part B: Convolutional Networks}
In part B of the coursework you should implement convolutional and max-pooling layers, and carry out experiments using a convolutional networks with one and two convolutional layers.
\begin{enumerate}
\item Implement a convolutional layer as a class \verb+ConvolutionalLayer+. You need to implement \texttt{fprop}, \texttt{bprop} and \texttt{grads\_wrt\_params} methods for this class.
\item Verify the correctness of your implementation using the supplied unit tests in\\\verb+ConvolutionalLayer_tests.ipynb+.
\item Automatically create a test file \verb+sXXXXXXX_conv_test.txt+, by running the provided program \verb+generate_conv_test.py+ which uses your \verb+ConvolutionalLayer+ class methods on a unique test vector generated using your student ID number.
\item Implement a max-pooling layer. Non-overlapping pooling (which was assumed in the lecture presentation) is required. You may also implement a more generic solution with striding as well.
\item Construct and train networks containing one and two convolutional layers, and max-pooling layers, using the Balanced EMNIST data, reporting your experimental results. As a default use convolutional kernels of dimension 5x5 (stride 1) and pooling regions of 2x2 (stride 2, hence non-overlapping). As a default convolutional networks with two convolutional layers, investigate a network with two convolutional+maxpooling layers with 5 feature maps in the first convolutional layer, and 10 feature maps in the second convolutional layer.
\end{enumerate}
As before you should mainly use the validation set to assess accuracy, using the test set to assess the accuracy of the convolutional network you judge to be the best.
\section{Unit Tests}
\label{sec:tests}
Part one of your coursework submission will be the test files generated for batch normalisation (\verb+sXXXXXXX_batchnorm_test.txt+) and for the convolutional layer (\verb+sXXXXXXX_conv_test.txt+), as described above. Please do not change the names of these files as they will be automatically verified.
\section{Report}
\label{sec:report}
Part two of your coursework submission, worth 70 marks will be a report. The directory
\verb+coursework_2/report+ contains a template for your report (\verb+mlp-cw2-template.txt+); the generated pdf file (\verb+mlp-cw2-template.pdf+) is also provided, and you should read this file carefully as it contains information about the required structure and experimentation. The template is written in LaTeX, and we strongly recommend that you write your own report using LaTeX, using the supplied document style \verb+mlp2017+ (as in the template).
You should copy the files in the \verb+report+ directory to the directory containing the LaTeX file of your report, as \verb+pdflatex+ will need to access these files when building the pdf document from the LaTeX source file.
Your report should be in a 2-column format, based on the document format used for the ICML conference. The report should be a \textbf{maximum of 7 pages long}, with a further page for references. We will not read or assess any parts of the report beyond the allowed 7+1 pages.
As before, all figures should be included in your report file as vector graphics;
please see the section in \verb+coursework1.pdf+ about how to do this.
If you make use of any any books, articles, web pages or other resources
you should appropriately cite these in your report. You do not need to
cite material from the course lecture slides or lab notebooks.
To create a pdf file \verb+mlp-cw2-template.pdf+ from a LaTeX source file (\verb+mlp-cw2-template.tex+), you can run the following in a terminal:
\begin{verbatim}
pdflatex mlp-cw2-template
bibtex mlp-cw2-template
pdflatex mlp-cw2-template
pdflatex mlp-cw2-template
\end{verbatim}
(Yes, you have to run pdflatex multiple times, in order for latex to construct the internal document references.)
An alternative, simpler approach uses the \verb+latexmk+ program:
\begin{verbatim}
latexmk -pdf mlp-cw2-template
\end{verbatim}
It is worth learning how to use LaTeX effectively, as it is particularly powerful for mathematical and academic writing. There are many tutorials on the web.
\section{Mechanics}
\label{sec:mechanics}
\textbf{Marks:}
This assignment will be assessed out of 100 marks and
forms 25\% of your final grade for the course.
\textbf{Academic conduct:}
Assessed work is subject to University
regulations on academic
conduct:\\\url{http://web.inf.ed.ac.uk/infweb/admin/policies/academic-misconduct}
\textbf{Submission:}
You can submit more than once up until the submission deadline. All
submissions are timestamped automatically. Identically named files
will overwrite earlier submitted versions, so we will mark the latest
submission that comes in before the deadline.
If you submit anything before the deadline, you may not resubmit
afterward. (This policy allows us to begin marking submissions
immediately after the deadline, without having to worry that some may
need to be re-marked).
If you do not submit anything before the deadline, you may submit {\em
exactly once} after the deadline, and a late penalty will be applied
to this submission unless you have received an approved extension.
Please be aware that late submissions may receive lower priority for
marking, and marks may not be returned within the same timeframe as
for on-time submissions.
{\em Warning:} Unfortunately the \verb+submit+ command will technically
allow you to submit late even if you submitted before the deadline
(i.e.\ it does not enforce the above policy). Don't do this! We will
mark the version that we retrieve just after the deadline, and (even
worse) you may still be penalized for submitting late because the
timestamp will update.
For additional information about late penalties and extension
requests, see the School web page below. Do {\bf not} email any course
staff directly about extension requests; you must follow the
instructions on the web page.
\url{http://web.inf.ed.ac.uk/infweb/student-services/ito/admin/coursework-projects/late-coursework-extension-requests}
\textbf{Late submission penalty:}
Following the University guidelines,
late coursework submitted without an authorised extension will be
recorded as late and the following penalties will apply: 5
percentage points will be deducted for every calendar day or part
thereof it is late, up to a maximum of 7 calendar days. After this
time a mark of zero will be recorded.
\section{Backing up your work}
\label{sec:backing-up-your-work}
It is \textbf{strongly recommended} you use some method for backing up
your work. Those working in their AFS homespace on DICE will have their
work automatically backed up as part of the
\href{http://computing.help.inf.ed.ac.uk/backups-and-mirrors}{routine
backup} of all user homespaces. If you are working on a personal
computer you should have your own backup method in place (e.g.~saving
additional copies to an external drive, syncing to a cloud service or
pushing commits to your local Git repository to a private repository on
Github). \textbf{Loss of work through failure to back up
\href{http://tinyurl.com/edinflate}{does not consitute a good reason for
late submission}}.
You may \emph{additionally} wish to keep your coursework under version
control in your local Git repository on the \verb+coursework_2+ branch.
If you make regular commits of your work on the coursework this will
allow you to better keep track of the changes you have made and if
necessary revert to previous versions of files and/or restore
accidentally deleted work. This is not however required and you should
note that keeping your work under version control is a distinct issue
from backing up to guard against hard drive failure. If you are working
on a personal computer you should still keep an additional back up of
your work as described above.
\section{Submission}
\label{sec:submission}
Your coursework submission should be done electronically using the
\href{http://computing.help.inf.ed.ac.uk/submit}{\texttt{submit}}
command available on DICE machines.
Your submission should include
\begin{itemize}
\itemsep1pt\parskip0pt\parsep0pt
\item
the unit test files generated for part 1, \verb+sXXXXXXX_batchnorm_test.txt+ and \verb+sXXXXXXX_conv_test.txt+, where your student number replaces \verb+sXXXXXXX+. Please do not
change the names of these files.
\item
your completed report as a PDF file, using the provided template
\item
any notebook (\verb+.ipynb+) files you used to run the experiments in
\item
and your local version of the \texttt{mlp} code including any changes
you made to the modules (\texttt{.py} files).
\end{itemize}
Please do not submit anything else (e.g. log files).
You should copy all of the files to a single directory, \verb+coursework2+, e.g.
\begin{verbatim}
mkdir coursework2
cp reports/coursework2.pdf sXXXXXXX_batchnorm_test.txt sXXXXXXX_conv_test.txt coursework2
\end{verbatim}
and then submit this directory using
\begin{verbatim}
submit mlp cw2 coursework2
\end{verbatim}
Please submit the directory, not a zip file, not a tar file.
The \texttt{submit} command will prompt you with the details of the
submission including the name of the files / directories you are
submitting and the name of the course and exercise you are submitting
for and ask you to check if these details are correct. You should check
these carefully and reply \texttt{y} to submit if you are sure the files
are correct and \texttt{n} otherwise.
You can amend an existing submission by rerunning the \texttt{submit}
command any time up to the deadline. It is therefore a good idea
(particularly if this is your first time using the DICE submit
mechanism) to do an initial run of the \texttt{submit} command early on
and then rerun the command if you make any further updates to your
submisison rather than leaving submission to the last minute.
\section{Marking Scheme}
\label{sec:marking-scheme}
\begin{itemize}
\item
Part 1, Unit tests (30 marks).
\item
Part 2, Report (70 marks). The following aspects will contribute to the mark for your report:
\begin{itemize}
\item Abstract - how clear is it? does it cover what is reported in the document
\item Introduction - do you clearly outline and motivate the paper, and describe the research questions investigated?
\item Methods -- have you carefully described the approaches you have used?
\item Experiments -- did you carry out the experiments correctly? are the results clearly presented and described?
\item Interpretation and discussion of results
\item Conclusions
\item Presentation and clarity of report
\end{itemize}
\end{itemize}
\bibliographystyle{plainnat}
\bibliography{cw2-references}
\end{document}

29
spec/cw1-references.bib Normal file
View File

@ -0,0 +1,29 @@
@inproceedings{maas2013rectifier,
title={Rectifier nonlinearities improve neural network acoustic models},
author={Maas, Andrew L and Hannun, Awni Y and Ng, Andrew Y},
booktitle={Proc. ICML},
volume={30},
number={1},
year={2013}
}
@inproceedings{nair2010rectified,
title={Rectified linear units improve restricted {Boltzmann} machines},
author={Nair, Vinod and Hinton, Geoffrey E},
booktitle={Proc ICML},
pages={807--814},
year={2010}
}
@article{clevert2015fast,
title={Fast and accurate deep network learning by exponential linear units ({ELU}s)},
author={Clevert, Djork-Arn{\'e} and Unterthiner, Thomas and Hochreiter, Sepp},
journal={arXiv preprint arXiv:1511.07289},
year={2015}
}
@article{klambauer2017self,
title={Self-Normalizing Neural Networks},
author={Klambauer, G{\"u}nter and Unterthiner, Thomas and Mayr, Andreas and Hochreiter, Sepp},
journal={arXiv preprint arXiv:1706.02515},
year={2017}
}

64
spec/cw2-references.bib Normal file
View File

@ -0,0 +1,64 @@
@inproceedings{maas2013rectifier,
title={Rectifier nonlinearities improve neural network acoustic models},
author={Maas, Andrew L and Hannun, Awni Y and Ng, Andrew Y},
booktitle={Proc. ICML},
volume={30},
number={1},
year={2013}
}
@inproceedings{nair2010rectified,
title={Rectified linear units improve restricted {Boltzmann} machines},
author={Nair, Vinod and Hinton, Geoffrey E},
booktitle={Proc ICML},
pages={807--814},
year={2010}
}
@article{clevert2015fast,
title={Fast and accurate deep network learning by exponential linear units ({ELU}s)},
author={Clevert, Djork-Arn{\'e} and Unterthiner, Thomas and Hochreiter, Sepp},
journal={arXiv preprint arXiv:1511.07289},
year={2015}
}
@article{klambauer2017self,
title={Self-Normalizing Neural Networks},
author={Klambauer, G{\"u}nter and Unterthiner, Thomas and Mayr, Andreas and Hochreiter, Sepp},
journal={arXiv preprint arXiv:1706.02515},
year={2017}
}
@article{cohen2017emnist,
title = {{EMNIST}: an extension of {MNIST} to handwritten letters},
author = {Cohen, G. and Afshar, S. and Tapson, J. and van Schaik, A.},
journal={arXiv preprint arXiv:1702.05373},
year={2017},
url = {https://arxiv.org/abs/1702.05373}
}
@inproceedings{kingma2015adam,
title = {Adam: A Method for Stochastic Optimization},
author = {Diederik P. Kingma and Jimmy Ba},
booktitle = {ICML},
year = {2015},
url = {https://arxiv.org/abs/1412.6980}
}
@article{tieleman2012rmsprop,
title={Lecture 6.5-rmsprop: Divide the gradient by a running average of its recent magnitude},
author={Tieleman, T. and Hinton, G. E.},
journal={COURSERA: Neural Networks for Machine Learning},
volume={4},
number={2},
year={2012},
url = {https://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf}
}
@inproceedings{ioffe2015batch,
title={Batch normalization: Accelerating deep network training by reducing internal covariate shift},
author={Ioffe, Sergey and Szegedy, Christian},
booktitle={ICML},
pages={448--456},
year={2015},
url = {http://proceedings.mlr.press/v37/ioffe15.html}
}