added code-base

This commit is contained in:
pswietojanski 2015-10-12 01:50:05 +01:00
parent 4404649ba1
commit 2b1700180e
5 changed files with 2700 additions and 0 deletions

64
mlp/costs.py Normal file

@@ -0,0 +1,64 @@
# Machine Learning Practical (INFR11119),
# Pawel Swietojanski, University of Edinburgh
import numpy
class Cost(object):
"""
Defines an interface for the cost object
"""
def cost(self, y, t, **kwargs):
"""
Implements a cost for monitoring purposes
:param y: matrix -- an output of the model
:param t: matrix -- an expected output the model should produce
:param kwargs: -- some optional parameters required by the cost
:return: the scalar value representing the cost given y and t
"""
raise NotImplementedError()
def grad(self, y, t, **kwargs):
"""
Implements a gradient of the cost w.r.t y
:param y: matrix -- an output of the model
:param t: matrix -- an expected output the model should produce
:param kwargs: -- some optional parameters required by the cost
:return: matrix - the gradient of the cost w.r.t y
"""
raise NotImplementedError()
def get_name(self):
return 'cost'
class MSECost(Cost):
def cost(self, y, t, **kwargs):
se = 0.5*numpy.sum((y - t)**2, axis=1)
return numpy.mean(se)
def grad(self, y, t, **kwargs):
return y - t
def get_name(self):
return 'mse'
class CECost(Cost):
"""
Cross Entropy (Negative log-likelihood) cost for multiple classes
"""
def cost(self, y, t, **kwargs):
#assumes t is 1-of-K coded and y is a softmax
#transformed estimate at the output layer
nll = t * numpy.log(y)
return -numpy.mean(numpy.sum(nll, axis=1))
def grad(self, y, t, **kwargs):
#assumes t is 1-of-K coded and y is a softmax
#transformed estimate at the output layer
return y - t
def get_name(self):
return 'ce'

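A minimal usage sketch for the cost classes above, with made-up y and t values, assuming this checkout is on the PYTHONPATH so that mlp.costs imports:

import numpy
from mlp.costs import MSECost, CECost

y = numpy.array([[0.7, 0.2, 0.1],
                 [0.1, 0.8, 0.1]])  # model outputs for a mini-batch of 2 examples
t = numpy.array([[1.0, 0.0, 0.0],
                 [0.0, 1.0, 0.0]])  # 1-of-K coded targets

mse = MSECost()
print mse.cost(y, t)  # mean over examples of 0.5*sum((y - t)**2)
print mse.grad(y, t)  # elementwise y - t

ce = CECost()
print ce.cost(y, t)   # -mean over examples of sum(t * log(y))
print ce.grad(y, t)   # y - t, assuming y is a softmax-transformed output

Both grad methods return y - t, which matches the natural cost/output pairings listed later in Layer.bprop_cost (linear -> mse, softmax -> cross-entropy).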
281
mlp/layers.py Normal file

@@ -0,0 +1,281 @@
# Machine Learning Practical (INFR11119),
# Pawel Swietojanski, University of Edinburgh
import numpy
import logging
from mlp.costs import Cost
logger = logging.getLogger(__name__)
class MLP(object):
"""
This is a container for an arbitrary sequence of other transforms.
On top of this, the class also keeps the state of the model, i.e.
the result of forward (activations) and backward (deltas) passes
through the model (for a mini-batch), which is required to compute
the gradients for the parameters
"""
def __init__(self, cost):
assert isinstance(cost, Cost), (
"Cost needs to be of type mlp.costs.Cost, got %s" % type(cost)
)
self.layers = [] #the actual list of network layers
self.activations = [] #keeps forward-pass activations (h from equations)
# for a given minibatch (or features at 0th index)
self.deltas = [] #keeps back-propagated error signals (deltas from equations)
# for a given minibatch and each layer
self.cost = cost
def fprop(self, x):
"""
:param x: mini-batch of data-points x
:return: y (top layer activation) which is an estimate of y given x
"""
if len(self.activations) != len(self.layers) + 1:
self.activations = [None]*(len(self.layers) + 1)
self.activations[0] = x
for i in xrange(0, len(self.layers)):
self.activations[i+1] = self.layers[i].fprop(self.activations[i])
return self.activations[-1]
def bprop(self, cost_grad):
"""
:param cost_grad: matrix -- grad of the cost w.r.t y
:return: None, the deltas are kept in the model
"""
# allocate the list of deltas for each layer
# note, we do not use all of those fields but
# want to keep it aligned 1:1 with activations,
# which will simplify indexing later on when
# computing grads w.r.t parameters
if len(self.deltas) != len(self.activations):
self.deltas = [None]*len(self.activations)
# treat the top layer in special way, as it deals with the
# cost, which may lead to some simplifications
top_layer_idx = len(self.layers)
self.deltas[top_layer_idx], ograds = self.layers[top_layer_idx - 1].\
bprop_cost(self.activations[top_layer_idx], cost_grad, self.cost)
# then back-prop through remaining layers
for i in xrange(top_layer_idx - 1, 0, -1):
self.deltas[i], ograds = self.layers[i - 1].\
bprop(self.activations[i], ograds)
def add_layer(self, layer):
self.layers.append(layer)
def set_layers(self, layers):
self.layers = layers
def get_name(self):
return 'mlp'
class Layer(object):
"""
Abstract class defining an interface for
other transforms.
"""
def __init__(self, rng=None):
if rng is None:
seed=[2015, 10, 1]
self.rng = numpy.random.RandomState(seed)
else:
self.rng = rng
def fprop(self, inputs):
"""
Implements forward propagation through the i-th layer, that is,
some form of:
a^i = xW^i + b^i
h^i = f^i(a^i)
with f^i, W^i, b^i denoting a non-linearity, weight matrix and
biases at the i-th layer, respectively, and x denoting the inputs.
:param inputs: matrix of features (x) or the output of the previous layer h^{i-1}
:return: h^i, matrix of features transformed by the layer
"""
raise NotImplementedError()
def bprop(self, h, igrads):
"""
Implements backward propagation through the layer, that is, given
that h^i denotes the output of the layer and x^i its input, we compute:
dh^i/dx^i, which by the chain rule is dh^i/da^i da^i/dx^i
x^i could be either the features (x) or the output of the lower layer h^{i-1}
:param h: the activation produced in the forward pass
:param igrads: error signal (or gradient) flowing into the layer; note,
in the general case this does not correspond to the 'deltas' used to update
the layer's parameters; to get the deltas one needs to multiply it by
the dh^i/da^i derivative
:return: a tuple (deltas, ograds) where:
deltas = igrads * dh^i/da^i
ograds = deltas \times da^i/dx^i
"""
raise NotImplementedError()
def bprop_cost(self, h, igrads, cost=None):
"""
Implements backward propagation in case the layer directly
deals with the optimised cost (i.e. it is the top layer).
By default, the method should implement back-prop for the default cost, that is,
the one that is natural to the layer's output, i.e.:
linear -> mse, softmax -> cross-entropy, sigmoid -> binary cross-entropy
:param h: the activation produced in the forward pass
:param igrads: error signal (or gradient) flowing into the layer; note,
in the general case this does not correspond to the 'deltas' used to update
the layer's parameters; to get the deltas one needs to multiply it by
the dh^i/da^i derivative
:return: a tuple (deltas, ograds) where:
deltas = igrads * dh^i/da^i
ograds = deltas \times da^i/dx^i
"""
raise NotImplementedError()
def pgrads(self, inputs, deltas):
"""
Return gradients w.r.t parameters
"""
raise NotImplementedError()
def get_params(self):
raise NotImplementedError()
def set_params(self):
raise NotImplementedError()
def get_name(self):
return 'abstract_layer'
class Linear(Layer):
def __init__(self, idim, odim,
rng=None,
irange=0.1):
super(Linear, self).__init__(rng=rng)
self.idim = idim
self.odim = odim
self.W = self.rng.uniform(
-irange, irange,
(self.idim, self.odim))
self.b = numpy.zeros((self.odim,), dtype=numpy.float32)
def fprop(self, inputs):
"""
Implements forward propagation through the i-th layer, that is,
some form of:
a^i = xW^i + b^i
h^i = f^i(a^i)
with f^i, W^i, b^i denoting a non-linearity, weight matrix and
biases of this (i-th) layer, respectively, and x denoting the inputs.
:param inputs: matrix of features (x) or the output of the previous layer h^{i-1}
:return: h^i, matrix of features transformed by the layer
"""
a = numpy.dot(inputs, self.W) + self.b
# here f() is an identity function, so just return a linear transformation
return a
def bprop(self, h, igrads):
"""
Implements backward propagation through the layer, that is, given
that h^i denotes the output of the layer and x^i its input, we compute:
dh^i/dx^i, which by the chain rule is dh^i/da^i da^i/dx^i
x^i could be either the features (x) or the output of the lower layer h^{i-1}
:param h: the activation produced in the forward pass
:param igrads: error signal (or gradient) flowing into the layer; note,
in the general case this does not correspond to the 'deltas' used to update
the layer's parameters; to get the deltas one needs to multiply it by
the dh^i/da^i derivative
:return: a tuple (deltas, ograds) where:
deltas = igrads * dh^i/da^i
ograds = deltas \times da^i/dx^i
"""
# since df^i/da^i = 1 (f is assumed identity function),
# deltas are in fact the same as igrads
ograds = numpy.dot(igrads, self.W.T)
return igrads, ograds
def bprop_cost(self, h, igrads, cost):
"""
Implements backward propagation in case the layer directly
deals with the optimised cost (i.e. it is the top layer).
By default, the method should implement bprop for the default cost, that is,
the one that is natural to the layer's output; here we implement
the linear -> mse scenario
:param h: the activation produced in the forward pass
:param igrads: error signal (or gradient) flowing into the layer; note,
in the general case this does not correspond to the 'deltas' used to update
the layer's parameters; to get the deltas one needs to multiply it by
the dh^i/da^i derivative
:param cost: mlp.costs.Cost instance defining the optimised cost
:return: a tuple (deltas, ograds) where:
deltas = igrads * dh^i/da^i
ograds = deltas \times da^i/dx^i
"""
if cost is None or cost.get_name() == 'mse':
# for linear layer and mean square error cost,
# cost back-prop is the same as standard back-prop
return self.bprop(h, igrads)
else:
raise NotImplementedError('Linear.bprop_cost method not implemented '
'for the %s cost' % cost.get_name())
def pgrads(self, inputs, deltas):
"""
Return gradients w.r.t parameters
:param inputs: input to the i-th layer
:param deltas: deltas computed in the bprop stage up to the i-th layer
:return: list of grads w.r.t. parameters dE/dW and dE/db, in *exactly*
the same order as the params are returned by get_params()
Note: deltas here contain the whole chain rule leading
from the cost up to the i-th layer, i.e.
dE/dy^L dy^L/da^L da^L/dh^{L-1} dh^{L-1}/da^{L-1} ... dh^{i}/da^{i}
and here we are just asking about
1) da^i/dW^i and 2) da^i/db^i
since W and b are the layer's only parameters
"""
grad_W = numpy.dot(inputs.T, deltas)
grad_b = numpy.sum(deltas, axis=0)
return [grad_W, grad_b]
def get_params(self):
return [self.W, self.b]
def set_params(self, params):
#we do not make checks here, but the order on the list
#is assumed to be exactly the same as get_params() returns
self.W = params[0]
self.b = params[1]
def get_name(self):
return 'linear'

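A minimal sketch wiring the classes above together: one Linear layer with an MSE cost, a forward pass, a backward pass, and the parameter gradients. The data is made up; the sketch assumes mlp.costs and mlp.layers import from this checkout.

import numpy
from mlp.costs import MSECost
from mlp.layers import MLP, Linear

rng = numpy.random.RandomState([2015, 10, 1])
model = MLP(cost=MSECost())
model.add_layer(Linear(idim=4, odim=2, rng=rng, irange=0.1))

x = rng.uniform(-1., 1., (3, 4))    # mini-batch of 3 examples, 4 features each
t = rng.uniform(-1., 1., (3, 2))    # matching regression targets

y = model.fprop(x)                  # fills model.activations: [x, y]
model.bprop(model.cost.grad(y, t))  # fills model.deltas for each layer
grad_W, grad_b = model.layers[0].pgrads(model.activations[0], model.deltas[1])
print grad_W.shape, grad_b.shape    # (4, 2) and (2,)

Note how bprop leaves the per-layer error signals in model.deltas, so pgrads only needs the layer's input (model.activations[i]) and the delta one index above it (model.deltas[i + 1]).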
170
mlp/optimisers.py Normal file

@@ -0,0 +1,170 @@
# Machine Learning Practical (INFR11119),
# Pawel Swietojanski, University of Edinburgh
import numpy
import time
import logging
from mlp.layers import MLP
from mlp.dataset import DataProvider
from mlp.schedulers import LearningRateScheduler
logger = logging.getLogger(__name__)
class Optimiser(object):
def train_epoch(self, model, train_iter):
raise NotImplementedError()
def train(self, model, train_iter, valid_iter=None):
raise NotImplementedError()
def validate(self, model, valid_iterator):
assert isinstance(model, MLP), (
"Expected model to be a subclass of 'mlp.layers.MLP'"
" class but got %s " % type(model)
)
assert isinstance(valid_iterator, DataProvider), (
"Expected iterator to be a subclass of 'mlp.dataset.DataProvider'"
" class but got %s " % type(valid_iterator)
)
acc_list, nll_list = [], []
for x, t in valid_iterator:
y = model.fprop(x)
nll_list.append(model.cost.cost(y, t))
acc_list.append(numpy.mean(self.classification_accuracy(y, t)))
acc = numpy.mean(acc_list)
nll = numpy.mean(nll_list)
return nll, acc
@staticmethod
def classification_accuracy(y, t):
"""
Returns classification accuracy given the estimate y and targets t
:param y: matrix -- estimate produced by the model in fprop
:param t: matrix -- target 1-of-K coded
:return: vector of size y.shape[0] with binary values set to 0
if the example was misclassified or 1 otherwise
"""
y_idx = numpy.argmax(y, axis=1)
t_idx = numpy.argmax(t, axis=1)
rval = numpy.equal(y_idx, t_idx)
return rval
class SGDOptimiser(Optimiser):
def __init__(self, lr_scheduler):
super(SGDOptimiser, self).__init__()
assert isinstance(lr_scheduler, LearningRateScheduler), (
"Expected lr_scheduler to be a subclass of 'mlp.schedulers.LearningRateScheduler'"
" class but got %s " % type(lr_scheduler)
)
self.lr_scheduler = lr_scheduler
def train_epoch(self, model, train_iterator, learning_rate):
assert isinstance(model, MLP), (
"Expected model to be a subclass of 'mlp.layers.MLP'"
" class but got %s " % type(model)
)
assert isinstance(train_iterator, DataProvider), (
"Expected iterator to be a subclass of 'mlp.dataset.DataProvider'"
" class but got %s " % type(train_iterator)
)
acc_list, nll_list = [], []
for x, t in train_iterator:
# get the prediction
y = model.fprop(x)
# compute the cost and grad of the cost w.r.t y
cost = model.cost.cost(y, t)
cost_grad = model.cost.grad(y, t)
# do backward pass through the model
model.bprop(cost_grad)
#update the model, here we iterate over layers
#and then over each parameter in the layer
effective_learning_rate = learning_rate / x.shape[0]
for i in xrange(0, len(model.layers)):
params = model.layers[i].get_params()
grads = model.layers[i].pgrads(model.activations[i], model.deltas[i + 1])
uparams = []
for param, grad in zip(params, grads):
param = param - effective_learning_rate * grad
uparams.append(param)
model.layers[i].set_params(uparams)
nll_list.append(cost)
acc_list.append(numpy.mean(self.classification_accuracy(y, t)))
return numpy.mean(nll_list), numpy.mean(acc_list)
def train(self, model, train_iterator, valid_iterator=None):
converged = False
cost_name = model.cost.get_name()
tr_stats, valid_stats = [], []
# do the initial validation
tr_nll, tr_acc = self.validate(model, train_iterator)
logger.info('Epoch %i: Training cost (%s) for random model is %.3f. Accuracy is %.2f%%'
% (self.lr_scheduler.epoch, cost_name, tr_nll, tr_acc * 100.))
tr_stats.append((tr_nll, tr_acc))
if valid_iterator is not None:
valid_iterator.reset()
valid_nll, valid_acc = self.validate(model, valid_iterator)
logger.info('Epoch %i: Validation cost (%s) for random model is %.3f. Accuracy is %.2f%%'
% (self.lr_scheduler.epoch, cost_name, valid_nll, valid_acc * 100.))
valid_stats.append((valid_nll, valid_acc))
while not converged:
train_iterator.reset()
tstart = time.clock()
tr_nll, tr_acc = self.train_epoch(model=model,
train_iterator=train_iterator,
learning_rate=self.lr_scheduler.get_rate())
tstop = time.clock()
tr_stats.append((tr_nll, tr_acc))
logger.info('Epoch %i: Training cost (%s) is %.3f. Accuracy is %.2f%%'
% (self.lr_scheduler.epoch + 1, cost_name, tr_nll, tr_acc * 100.))
vstart = time.clock()
if valid_iterator is not None:
valid_iterator.reset()
valid_nll, valid_acc = self.validate(model, valid_iterator)
logger.info('Epoch %i: Validation cost (%s) is %.3f. Accuracy is %.2f%%'
% (self.lr_scheduler.epoch + 1, cost_name, valid_nll, valid_acc * 100.))
self.lr_scheduler.get_next_rate(valid_acc)
valid_stats.append((valid_nll, valid_acc))
else:
self.lr_scheduler.get_next_rate(None)
vstop = time.clock()
train_speed = train_iterator.num_examples() / (tstop - tstart)
valid_speed = valid_iterator.num_examples() / (vstop - vstart) if valid_iterator is not None else 0.0
tot_time = vstop - tstart
#pps = presentations per second
logger.info("Epoch %i: Took %.0f seconds. Training speed %.0f pps. "
"Validation speed %.0f pps."
% (self.lr_scheduler.epoch, tot_time, train_speed, valid_speed))
# we stop training when the learning rate, as returned by the lr scheduler, is 0;
# this is implementation dependent and, depending on the lr schedule, could happen,
# for example, when max_epochs has been reached or when the progress between
# two consecutive epochs is too small, etc.
converged = (self.lr_scheduler.get_rate() == 0)
return tr_stats, valid_stats

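A minimal sketch of the per-minibatch update that SGDOptimiser.train_epoch performs, spelled out for a single Linear layer so it runs without an mlp.dataset.DataProvider (not among the files in this commit). The data and learning rate are made up; the sketch assumes mlp.costs and mlp.layers import from this checkout.

import numpy
from mlp.costs import MSECost
from mlp.layers import MLP, Linear

rng = numpy.random.RandomState([2015, 10, 1])
model = MLP(cost=MSECost())
model.add_layer(Linear(idim=4, odim=2, rng=rng))
x = rng.uniform(-1., 1., (8, 4))  # one mini-batch of 8 examples
t = rng.uniform(-1., 1., (8, 2))

learning_rate = 0.1
y = model.fprop(x)
cost_before = model.cost.cost(y, t)
model.bprop(model.cost.grad(y, t))
effective_learning_rate = learning_rate / x.shape[0]  # normalise by batch size
for i in xrange(len(model.layers)):
    params = model.layers[i].get_params()
    grads = model.layers[i].pgrads(model.activations[i], model.deltas[i + 1])
    model.layers[i].set_params([p - effective_learning_rate * g
                                for p, g in zip(params, grads)])
print cost_before, model.cost.cost(model.fprop(x), t)  # cost should typically drop after the step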
155
mlp/schedulers.py Normal file

@@ -0,0 +1,155 @@
# Machine Learning Practical (INFR11119),
# Pawel Swietojanski, University of Edinburgh
import logging
class LearningRateScheduler(object):
"""
Define an interface for determining learning rates
"""
def __init__(self, max_epochs=100):
self.epoch = 0
self.max_epochs = max_epochs
def get_rate(self):
raise NotImplementedError()
def get_next_rate(self, current_error=None):
self.epoch += 1
class LearningRateList(LearningRateScheduler):
def __init__(self, learning_rates_list, max_epochs):
super(LearningRateList, self).__init__(max_epochs)
assert isinstance(learning_rates_list, list), (
"The learning_rates_list argument expected"
" to be of type list, got %s" % type(learning_rates_list)
)
self.lr_list = learning_rates_list
def get_rate(self):
if self.epoch < len(self.lr_list):
return self.lr_list[self.epoch]
return 0.0
def get_next_rate(self, current_error=None):
super(LearningRateList, self).get_next_rate(current_error=None)
return self.get_rate()
class LearningRateFixed(LearningRateList):
def __init__(self, learning_rate, max_epochs):
assert learning_rate > 0, (
"learning rate expected to be > 0, got %f" % learning_rate
)
super(LearningRateFixed, self).__init__([learning_rate], max_epochs)
def get_rate(self):
if self.epoch < self.max_epochs:
return self.lr_list[0]
return 0.0
def get_next_rate(self, current_error=None):
super(LearningRateFixed, self).get_next_rate(current_error=None)
return self.get_rate()
class LearningRateNewBob(LearningRateScheduler):
"""
Exponential (newbob-style) learning rate schedule
"""
def __init__(self, start_rate, scale_by=.5, max_epochs=99, \
min_derror_ramp_start=.5, min_derror_stop=.5, init_error=100.0, \
patience=0, zero_rate=None, ramping=False):
"""
:type start_rate: float
:param start_rate:
:type scale_by: float
:param scale_by:
:type max_epochs: int
:param max_epochs:
:type min_derror_ramp_start: float
:param min_derror_ramp_start:
:type min_derror_stop: float
:param min_derror_stop:
:type init_error: float
:param init_error:
"""
self.start_rate = start_rate
self.init_error = init_error
self.init_patience = patience
self.rate = start_rate
self.scale_by = scale_by
self.max_epochs = max_epochs
self.min_derror_ramp_start = min_derror_ramp_start
self.min_derror_stop = min_derror_stop
self.lowest_error = init_error
self.epoch = 1
self.ramping = ramping
self.patience = patience
self.zero_rate = zero_rate
def reset(self):
self.rate = self.start_rate
self.lowest_error = self.init_error
self.epoch = 1
self.ramping = False
self.patience = self.init_patience
def get_rate(self):
if (self.epoch==1 and self.zero_rate!=None):
return self.zero_rate
return self.rate
def get_next_rate(self, current_error):
"""
:type current_error: float
:param current_error: percentage error
"""
diff_error = 0.0
if ( (self.max_epochs > 10000) or (self.epoch >= self.max_epochs) ):
#logging.debug('Setting rate to 0.0. max_epochs or epoch>=max_epochs')
self.rate = 0.0
else:
diff_error = self.lowest_error - current_error
if (current_error < self.lowest_error):
self.lowest_error = current_error
if (self.ramping):
if (diff_error < self.min_derror_stop):
if (self.patience > 0):
#logging.debug('Patience decreased to %f' % self.patience)
self.patience -= 1
self.rate *= self.scale_by
else:
#logging.debug('diff_error (%f) < min_derror_stop (%f)' % (diff_error, self.min_derror_stop))
self.rate = 0.0
else:
self.rate *= self.scale_by
else:
if (diff_error < self.min_derror_ramp_start):
#logging.debug('Start ramping.')
self.ramping = True
self.rate *= self.scale_by
self.epoch += 1
return self.rate

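A minimal sketch of how LearningRateNewBob above reacts to a made-up sequence of validation errors, assuming mlp.schedulers imports from this checkout: the rate is held while the per-epoch improvement exceeds min_derror_ramp_start, then halved each epoch, and finally set to 0.0 once the improvement while ramping falls below min_derror_stop (here with patience=0).

from mlp.schedulers import LearningRateNewBob

scheduler = LearningRateNewBob(start_rate=0.5, scale_by=0.5,
                               min_derror_ramp_start=1.0,
                               min_derror_stop=0.1)
for error in [40.0, 30.0, 29.5, 29.2, 29.19]:  # hypothetical validation errors (%)
    rate = scheduler.get_next_rate(error)
    print scheduler.epoch, error, rate         # rates: 0.5, 0.5, 0.25, 0.125, 0.0

Once the returned rate reaches 0.0, the convergence check at the end of SGDOptimiser.train stops the training loop.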
2030
res/code_scheme.svg Normal file

File diff suppressed because it is too large. Size: 200 KiB