diff --git a/mlp/schedulers.py b/mlp/schedulers.py index 6ae9597..4f53e7e 100644 --- a/mlp/schedulers.py +++ b/mlp/schedulers.py @@ -1,172 +1,34 @@ -# Machine Learning Practical (INFR11119), -# Pawel Swietojanski, University of Edinburgh +# -*- coding: utf-8 -*- +"""Training schedulers. -import logging +This module contains classes implementing schedulers which control the +evolution of learning rule hyperparameters (such as learning rate) over a +training run. +""" + +import numpy as np -class LearningRateScheduler(object): - """ - Define an interface for determining learning rates - """ - def __init__(self, max_epochs=100): - self.epoch = 0 - self.max_epochs = max_epochs +class ConstantLearningRateScheduler(object): + """Example of scheduler interface which sets a constant learning rate.""" - def get_rate(self): - raise NotImplementedError() + def __init__(self, learning_rate): + """Construct a new constant learning rate scheduler object. - def get_next_rate(self, current_accuracy=None): - self.epoch += 1 - - -class LearningRateList(LearningRateScheduler): - def __init__(self, learning_rates_list, max_epochs): - - super(LearningRateList, self).__init__(max_epochs) - - assert isinstance(learning_rates_list, list), ( - "The learning_rates_list argument expected" - " to be of type list, got %s" % type(learning_rates_list) - ) - self.lr_list = learning_rates_list - - def get_rate(self): - if self.epoch < len(self.lr_list): - return self.lr_list[self.epoch] - return 0.0 - - def get_next_rate(self, current_accuracy=None): - super(LearningRateList, self).get_next_rate(current_accuracy=None) - return self.get_rate() - - -class LearningRateFixed(LearningRateList): - - def __init__(self, learning_rate, max_epochs): - assert learning_rate > 0, ( - "learning rate expected to be > 0, got %f" % learning_rate - ) - super(LearningRateFixed, self).__init__([learning_rate], max_epochs) - - def get_rate(self): - if self.epoch < self.max_epochs: - return self.lr_list[0] - return 0.0 - - def get_next_rate(self, current_accuracy=None): - super(LearningRateFixed, self).get_next_rate(current_accuracy=None) - return self.get_rate() - - -class LearningRateNewBob(LearningRateScheduler): - """ - newbob learning rate schedule. - - Fixed learning rate until validation set stops improving then exponential - decay. - """ - - def __init__(self, start_rate, scale_by=.5, max_epochs=99, - min_derror_ramp_start=.5, min_derror_stop=.5, init_error=100.0, - patience=0, zero_rate=None, ramping=False): + Args: + learning_rate: Learning rate to use in learning rule. """ - :type start_rate: float - :param start_rate: - - :type scale_by: float - :param scale_by: - - :type max_epochs: int - :param max_epochs: - - :type min_error_start: float - :param min_error_start: - - :type min_error_stop: float - :param min_error_stop: - - :type init_error: float - :param init_error: + self.learning_rate = learning_rate + + def update_learning_rule(self, learning_rule, epoch_number): + """Update the hyperparameters of the learning rule. + + Run at the beginning of each epoch. + + Args: + learning_rule: Learning rule object being used in training run, + any scheduled hyperparameters to be altered should be + attributes of this object. + epoch_number: Integer index of training epoch about to be run. """ - self.start_rate = start_rate - self.init_error = init_error - self.init_patience = patience - - self.rate = start_rate - self.scale_by = scale_by - self.max_epochs = max_epochs - self.min_derror_ramp_start = min_derror_ramp_start - self.min_derror_stop = min_derror_stop - self.lowest_error = init_error - - self.epoch = 1 - self.ramping = ramping - self.patience = patience - self.zero_rate = zero_rate - - def reset(self): - self.rate = self.start_rate - self.lowest_error = self.init_error - self.epoch = 1 - self.ramping = False - self.patience = self.init_patience - - def get_rate(self): - if (self.epoch==1 and self.zero_rate!=None): - return self.zero_rate - return self.rate - - def get_next_rate(self, current_accuracy): - """ - :type current_accuracy: float - :param current_accuracy: current proportion correctly classified - - """ - - current_error = 1. - current_accuracy - diff_error = 0.0 - - if ( (self.max_epochs > 10000) or (self.epoch >= self.max_epochs) ): - #logging.debug('Setting rate to 0.0. max_epochs or epoch>=max_epochs') - self.rate = 0.0 - else: - diff_error = self.lowest_error - current_error - - if (current_error < self.lowest_error): - self.lowest_error = current_error - - if (self.ramping): - if (diff_error < self.min_derror_stop): - if (self.patience > 0): - #logging.debug('Patience decreased to %f' % self.patience) - self.patience -= 1 - self.rate *= self.scale_by - else: - #logging.debug('diff_error (%f) < min_derror_stop (%f)' % (diff_error, self.min_derror_stop)) - self.rate = 0.0 - else: - self.rate *= self.scale_by - else: - if (diff_error < self.min_derror_ramp_start): - #logging.debug('Start ramping.') - self.ramping = True - self.rate *= self.scale_by - - self.epoch += 1 - - return self.rate - - -class DropoutFixed(LearningRateList): - - def __init__(self, p_inp_keep, p_hid_keep): - assert 0 < p_inp_keep <= 1 and 0 < p_hid_keep <= 1, ( - "Dropout 'keep' probabilites are suppose to be in (0, 1] range" - ) - super(DropoutFixed, self).__init__([(p_inp_keep, p_hid_keep)], max_epochs=999) - - def get_rate(self): - return self.lr_list[0] - - def get_next_rate(self, current_accuracy=None): - return self.get_rate() + learning_rule.learning_rate = self.learning_rate diff --git a/mlp/utils.py b/mlp/utils.py deleted file mode 100644 index 34d62e5..0000000 --- a/mlp/utils.py +++ /dev/null @@ -1,361 +0,0 @@ -# Machine Learning Practical (INFR11119), -# Pawel Swietojanski, University of Edinburgh - -import numpy -from mlp.layers import Layer - - -def numerical_gradient(f, x, eps=1e-4, **kwargs): - """ - Implements the following numerical gradient rule - df(x)/dx = (f(x+eps)-f(x-eps))/(2eps) - """ - - xc = x.copy() - g = numpy.zeros_like(xc) - xf = xc.ravel() - gf = g.ravel() - - for i in xrange(xf.shape[0]): - xx = xf[i] - xf[i] = xx + eps - fp_eps, ___ = f(xc, **kwargs) - xf[i] = xx - eps - fm_eps, ___ = f(xc, **kwargs) - xf[i] = xx - gf[i] = (fp_eps - fm_eps)/(2*eps) - - return g - - -def verify_gradient(f, x, eps=1e-4, tol=1e-6, **kwargs): - """ - Compares the numerical and analytical gradients. - """ - fval, fgrad = f(x=x, **kwargs) - ngrad = numerical_gradient(f=f, x=x, eps=eps, tol=tol, **kwargs) - - fgradnorm = numpy.sqrt(numpy.sum(fgrad**2)) - ngradnorm = numpy.sqrt(numpy.sum(ngrad**2)) - diffnorm = numpy.sqrt(numpy.sum((fgrad-ngrad)**2)) - - if fgradnorm > 0 or ngradnorm > 0: - norm = numpy.maximum(fgradnorm, ngradnorm) - if not (diffnorm < tol or diffnorm/norm < tol): - raise Exception("Numerical and analytical gradients " - "are different: %s != %s!" % (ngrad, fgrad)) - else: - if not (diffnorm < tol): - raise Exception("Numerical and analytical gradients " - "are different: %s != %s!" % (ngrad, fgrad)) - return True - - -def verify_layer_gradient(layer, x, eps=1e-4, tol=1e-6): - - assert isinstance(layer, Layer), ( - "Expected to get the instance of Layer class, got" - " %s " % type(layer) - ) - - def grad_layer_wrapper(x, **kwargs): - h = layer.fprop(x) - deltas, ograds = layer.bprop(h=h, igrads=numpy.ones_like(h)) - return numpy.sum(h), ograds - - return verify_gradient(f=grad_layer_wrapper, x=x, eps=eps, tol=tol, layer=layer) - - -def test_conv_linear_fprop(layer, kernel_order='ioxy', kernels_first=True, - dtype=numpy.float): - """ - Tests forward propagation method of a convolutional layer. - - Checks the outputs of `fprop` method for a fixed input against known - reference values for the outputs and raises an AssertionError if - the outputted values are not consistent with the reference values. If - tests are all passed returns True. - - Parameters - ---------- - layer : instance of Layer subclass - Convolutional (linear only) layer implementation. It must implement - the methods `get_params`, `set_params` and `fprop`. - kernel_order : string - Specifes dimension ordering assumed for convolutional kernels - passed to `layer`. Default is `ioxy` which corresponds to: - input channels, output channels, image x, image y - The other option is 'oixy' which corresponds to - output channels, input channels, image x, image y - Any other value will raise a ValueError exception. - kernels_first : boolean - Specifies order in which parameters are passed to and returned from - `get_params` and `set_params`. Default is True which corresponds - to signatures of `get_params` and `set_params` being: - kernels, biases = layer.get_params() - layer.set_params([kernels, biases]) - If False this corresponds to signatures of `get_params` and - `set_params` being: - biases, kernels = layer.get_params() - layer.set_params([biases, kernels]) - dtype : numpy data type - Data type to use in numpy arrays passed to layer methods. Default - is `numpy.float`. - - Raises - ------ - AssertionError - Raised if output of `layer.fprop` is inconsistent with reference - values either in shape or values. - ValueError - Raised if `kernel_order` is not a valid order string. - """ - inputs = numpy.arange(96).reshape((2, 3, 4, 4)).astype(dtype) - kernels = numpy.arange(-12, 12).reshape((3, 2, 2, 2)).astype(dtype) - if kernel_order == 'oixy': - kernels = kernels.swapaxes(0, 1) - elif kernel_order != 'ioxy': - raise ValueError('kernel_order must be one of "ioxy" and "oixy"') - biases = numpy.arange(2).astype(dtype) - true_output = numpy.array( - [[[[ 496., 466., 436.], - [ 376., 346., 316.], - [ 256., 226., 196.]], - [[ 1385., 1403., 1421.], - [ 1457., 1475., 1493.], - [ 1529., 1547., 1565.]]], - [[[ -944., -974., -1004.], - [-1064., -1094., -1124.], - [-1184., -1214., -1244.]], - [[ 2249., 2267., 2285.], - [ 2321., 2339., 2357.], - [ 2393., 2411., 2429.]]]], dtype=dtype) - try: - orig_params = layer.get_params() - if kernels_first: - layer.set_params([kernels, biases]) - else: - layer.set_params([biases, kernels]) - layer_output = layer.fprop(inputs) - assert layer_output.shape == true_output.shape, ( - 'Layer fprop gives incorrect shaped output. ' - 'Correct shape is {0} but returned shape is {1}.' - .format(true_output.shape, layer_output.shape) - ) - assert numpy.allclose(layer_output, true_output), ( - 'Layer fprop does not give correct output. ' - 'Correct output is {0}\n but returned output is {1}.' - .format(true_output, layer_output) - ) - finally: - layer.set_params(orig_params) - return True - - -def test_conv_linear_bprop(layer, kernel_order='ioxy', kernels_first=True, - dtype=numpy.float): - """ - Tests input gradients backpropagation method of a convolutional layer. - - Checks the outputs of `bprop` method for a fixed input against known - reference values for the outputs and raises an AssertionError if - the outputted values are not consistent with the reference values. If - tests are all passed returns True. - - Parameters - ---------- - layer : instance of Layer subclass - Convolutional (linear only) layer implementation. It must implement - the methods `get_params`, `set_params` and `bprop`. - kernel_order : string - Specifes dimension ordering assumed for convolutional kernels - passed to `layer`. Default is `ioxy` which corresponds to: - input channels, output channels, image x, image y - The other option is 'oixy' which corresponds to - output channels, input channels, image x, image y - Any other value will raise a ValueError exception. - kernels_first : boolean - Specifies order in which parameters are passed to and returned from - `get_params` and `set_params`. Default is True which corresponds - to signatures of `get_params` and `set_params` being: - kernels, biases = layer.get_params() - layer.set_params([kernels, biases]) - If False this corresponds to signatures of `get_params` and - `set_params` being: - biases, kernels = layer.get_params() - layer.set_params([biases, kernels]) - dtype : numpy data type - Data type to use in numpy arrays passed to layer methods. Default - is `numpy.float`. - - Raises - ------ - AssertionError - Raised if output of `layer.bprop` is inconsistent with reference - values either in shape or values. - ValueError - Raised if `kernel_order` is not a valid order string. - """ - inputs = numpy.arange(96).reshape((2, 3, 4, 4)).astype(dtype) - kernels = numpy.arange(-12, 12).reshape((3, 2, 2, 2)).astype(dtype) - if kernel_order == 'oixy': - kernels = kernels.swapaxes(0, 1) - elif kernel_order != 'ioxy': - raise ValueError('kernel_order must be one of "ioxy" and "oixy"') - biases = numpy.arange(2).astype(dtype) - igrads = numpy.arange(-20, 16).reshape((2, 2, 3, 3)).astype(dtype) - true_ograds = numpy.array( - [[[[ 328., 605., 567., 261.], - [ 534., 976., 908., 414.], - [ 426., 772., 704., 318.], - [ 170., 305., 275., 123.]], - [[ 80., 125., 119., 45.], - [ 86., 112., 108., 30.], - [ 74., 100., 96., 30.], - [ 18., 17., 19., 3.]], - [[-168., -355., -329., -171.], - [-362., -752., -692., -354.], - [-278., -572., -512., -258.], - [-134., -271., -237., -117.]]], - [[[ -32., -79., -117., -63.], - [-114., -248., -316., -162.], - [-222., -452., -520., -258.], - [-118., -235., -265., -129.]], - [[ 8., 17., 11., 9.], - [ 14., 40., 36., 30.], - [ 2., 28., 24., 30.], - [ 18., 53., 55., 39.]], - [[ 48., 113., 139., 81.], - [ 142., 328., 388., 222.], - [ 226., 508., 568., 318.], - [ 154., 341., 375., 207.]]]], dtype=dtype) - try: - orig_params = layer.get_params() - if kernels_first: - layer.set_params([kernels, biases]) - else: - layer.set_params([biases, kernels]) - layer_deltas, layer_ograds = layer.bprop(None, igrads) - assert layer_deltas.shape == igrads.shape, ( - 'Layer bprop give incorrectly shaped deltas output.' - 'Correct shape is {0} but returned shape is {1}.' - .format(igrads.shape, layer_deltas.shape) - ) - assert numpy.allclose(layer_deltas, igrads), ( - 'Layer bprop does not give correct deltas output. ' - 'Correct output is {0}\n but returned output is {1}.' - .format(igrads, layer_deltas) - ) - assert layer_ograds.shape == true_ograds.shape, ( - 'Layer bprop gives incorrect shaped ograds output. ' - 'Correct shape is {0} but returned shape is {1}.' - .format(true_ograds.shape, layer_ograds.shape) - ) - assert numpy.allclose(layer_ograds, true_ograds), ( - 'Layer bprop does not give correct ograds output. ' - 'Correct output is {0}\n but returned output is {1}.' - .format(true_ograds, layer_ograds) - ) - finally: - layer.set_params(orig_params) - return True - - -def test_conv_linear_pgrads(layer, kernel_order='ioxy', kernels_first=True, - dtype=numpy.float): - """ - Tests parameter gradients backpropagation method of a convolutional layer. - - Checks the outputs of `pgrads` method for a fixed input against known - reference values for the outputs and raises an AssertionError if - the outputted values are not consistent with the reference values. If - tests are all passed returns True. - - Parameters - ---------- - layer : instance of Layer subclass - Convolutional (linear only) layer implementation. It must implement - the methods `get_params`, `set_params` and `pgrads`. - kernel_order : string - Specifes dimension ordering assumed for convolutional kernels - passed to `layer`. Default is `ioxy` which corresponds to: - input channels, output channels, image x, image y - The other option is 'oixy' which corresponds to - output channels, input channels, image x, image y - Any other value will raise a ValueError exception. - kernels_first : boolean - Specifies order in which parameters are passed to and returned from - `get_params` and `set_params`. Default is True which corresponds - to signatures of `get_params` and `set_params` being: - kernels, biases = layer.get_params() - layer.set_params([kernels, biases]) - If False this corresponds to signatures of `get_params` and - `set_params` being: - biases, kernels = layer.get_params() - layer.set_params([biases, kernels]) - dtype : numpy data type - Data type to use in numpy arrays passed to layer methods. Default - is `numpy.float`. - - Raises - ------ - AssertionError - Raised if output of `layer.pgrads` is inconsistent with reference - values either in shape or values. - ValueError - Raised if `kernel_order` is not a valid order string. - """ - inputs = numpy.arange(96).reshape((2, 3, 4, 4)).astype(dtype) - kernels = numpy.arange(-12, 12).reshape((3, 2, 2, 2)).astype(dtype) - biases = numpy.arange(2).astype(dtype) - deltas = numpy.arange(-20, 16).reshape((2, 2, 3, 3)).astype(dtype) - true_kernel_grads = numpy.array( - [[[[ 390., 264.], - [ -114., -240.]], - [[ 5088., 5124.], - [ 5232., 5268.]]], - [[[-1626., -1752.], - [-2130., -2256.]], - [[ 5664., 5700.], - [ 5808., 5844.]]], - [[[-3642., -3768.], - [-4146., -4272.]], - [[ 6240., 6276.], - [ 6384., 6420.]]]], dtype=dtype) - if kernel_order == 'oixy': - kernels = kernels.swapaxes(0, 1) - true_kernel_grads = true_kernel_grads.swapaxes(0, 1) - elif kernel_order != 'ioxy': - raise ValueError('kernel_order must be one of "ioxy" and "oixy"') - true_bias_grads = numpy.array([-126., 36.], dtype=dtype) - try: - orig_params = layer.get_params() - if kernels_first: - layer.set_params([kernels, biases]) - else: - layer.set_params([biases, kernels]) - layer_kernel_grads, layer_bias_grads = layer.pgrads(inputs, deltas) - assert layer_kernel_grads.shape == true_kernel_grads.shape, ( - 'Layer pgrads gives incorrect shaped kernel gradients output. ' - 'Correct shape is {0} but returned shape is {1}.' - .format(true_kernel_grads.shape, layer_kernel_grads.shape) - ) - assert numpy.allclose(layer_kernel_grads, true_kernel_grads), ( - 'Layer pgrads does not give correct kernel gradients output. ' - 'Correct output is {0}\n but returned output is {1}.' - .format(true_kernel_grads, layer_kernel_grads) - ) - assert layer_bias_grads.shape == true_bias_grads.shape, ( - 'Layer pgrads gives incorrect shaped bias gradients output. ' - 'Correct shape is {0} but returned shape is {1}.' - .format(true_bias_grads.shape, layer_bias_grads.shape) - ) - assert numpy.allclose(layer_bias_grads, true_bias_grads), ( - 'Layer pgrads does not give correct bias gradients output. ' - 'Correct output is {0}\n but returned output is {1}.' - .format(true_bias_grads, layer_bias_grads) - ) - finally: - layer.set_params(orig_params) - return True -