From 2b204846493fc2242d357608a6996a1c2c31aaa1 Mon Sep 17 00:00:00 2001
From: Matt Graham
Date: Thu, 3 Nov 2016 18:31:37 +0000
Subject: [PATCH] Adding adaptive learning rules.

---
 mlp/learning_rules.py | 250 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 250 insertions(+)

diff --git a/mlp/learning_rules.py b/mlp/learning_rules.py
index c7fa3a6..4bfa330 100644
--- a/mlp/learning_rules.py
+++ b/mlp/learning_rules.py
@@ -270,3 +270,253 @@ class NesterovMomentumLearningRule(GradientDescentLearningRule):
         mom *= self.mom_coeff
         mom -= self.learning_rate * grad
         param += (1. + self.mom_coeff) * mom - self.mom_coeff * mom_prev
+
+
+class AdamLearningRule(GradientDescentLearningRule):
+    """Adaptive moments (Adam) learning rule.
+
+    First-order gradient-descent based learning rule which uses adaptive
+    estimates of first and second moments of the parameter gradients to
+    calculate the parameter updates.
+
+    References:
+      [1]: Adam: a method for stochastic optimisation
+           Kingma and Ba, 2015
+    """
+
+    def __init__(self, learning_rate=1e-3, beta_1=0.9, beta_2=0.999,
+                 epsilon=1e-8):
+        """Creates a new learning rule object.
+
+        Args:
+            learning_rate: A positive scalar to scale gradient updates to the
+                parameters by. This needs to be carefully set - if too large
+                the learning dynamic will be unstable and may diverge, while
+                if set too small learning will proceed very slowly.
+            beta_1: Exponential decay rate for gradient first moment estimates.
+                This should be a scalar value in [0, 1]. The running gradient
+                first moment estimate is calculated using
+                `m_1 = beta_1 * m_1_prev + (1 - beta_1) * g`
+                where `m_1_prev` is the previous estimate and `g` the current
+                parameter gradients.
+            beta_2: Exponential decay rate for gradient second moment
+                estimates. This should be a scalar value in [0, 1]. The
+                running gradient second moment estimate is calculated using
+                `m_2 = beta_2 * m_2_prev + (1 - beta_2) * g**2`
+                where `m_2_prev` is the previous estimate and `g` the current
+                parameter gradients.
+            epsilon: 'Softening' parameter to stop updates diverging when
+                second moment estimates are close to zero. Should be set to
+                a small positive value.
+        """
+        super(AdamLearningRule, self).__init__(learning_rate)
+        assert beta_1 >= 0. and beta_1 <= 1., 'beta_1 should be in [0, 1].'
+        assert beta_2 >= 0. and beta_2 <= 1., 'beta_2 should be in [0, 1].'
+        assert epsilon > 0., 'epsilon should be > 0.'
+        self.beta_1 = beta_1
+        self.beta_2 = beta_2
+        self.epsilon = epsilon
+
+    def initialise(self, params):
+        """Initialises the state of the learning rule for a set of parameters.
+
+        This must be called before `update_params` is first called.
+
+        Args:
+            params: A list of the parameters to be optimised. Note these will
+                be updated *in-place* to avoid reallocating arrays on each
+                update.
+        """
+        super(AdamLearningRule, self).initialise(params)
+        self.moms_1 = []
+        for param in self.params:
+            self.moms_1.append(np.zeros_like(param))
+        self.moms_2 = []
+        for param in self.params:
+            self.moms_2.append(np.zeros_like(param))
+        self.step_count = 0
+
+    def reset(self):
+        """Resets any additional state variables to their initial values.
+
+        For this learning rule this corresponds to zeroing the estimates of
+        the first and second moments of the gradients.
+        """
+        for mom_1, mom_2 in zip(self.moms_1, self.moms_2):
+            mom_1 *= 0.
+            mom_2 *= 0.
+        self.step_count = 0
+
+    def update_params(self, grads_wrt_params):
+        """Applies a single update to all parameters.
+
+        All parameter updates are performed using in-place operations and so
+        nothing is returned.
+
+        Args:
+            grads_wrt_params: A list of gradients of the scalar loss function
+                with respect to each of the parameters passed to `initialise`
+                previously, with this list expected to be in the same order.
+        """
+        for param, mom_1, mom_2, grad in zip(
+                self.params, self.moms_1, self.moms_2, grads_wrt_params):
+            mom_1 *= self.beta_1
+            mom_1 += (1. - self.beta_1) * grad
+            mom_2 *= self.beta_2
+            mom_2 += (1. - self.beta_2) * grad**2
+            alpha_t = (
+                self.learning_rate *
+                (1. - self.beta_2**(self.step_count + 1))**0.5 /
+                (1. - self.beta_1**(self.step_count + 1))
+            )
+            param -= alpha_t * mom_1 / (mom_2**0.5 + self.epsilon)
+        self.step_count += 1
+
+
+class AdaGradLearningRule(GradientDescentLearningRule):
+    """Adaptive gradients (AdaGrad) learning rule.
+
+    First-order gradient-descent based learning rule which normalises gradient
+    updates by a running sum of the past squared gradients.
+
+    References:
+      [1]: Adaptive Subgradient Methods for Online Learning and Stochastic
+           Optimization. Duchi, Hazan and Singer, 2011
+    """
+
+    def __init__(self, learning_rate=1e-2, epsilon=1e-8):
+        """Creates a new learning rule object.
+
+        Args:
+            learning_rate: A positive scalar to scale gradient updates to the
+                parameters by. This needs to be carefully set - if too large
+                the learning dynamic will be unstable and may diverge, while
+                if set too small learning will proceed very slowly.
+            epsilon: 'Softening' parameter to stop updates diverging when
+                sums of squared gradients are close to zero. Should be set to
+                a small positive value.
+        """
+        super(AdaGradLearningRule, self).__init__(learning_rate)
+        assert epsilon > 0., 'epsilon should be > 0.'
+        self.epsilon = epsilon
+
+    def initialise(self, params):
+        """Initialises the state of the learning rule for a set of parameters.
+
+        This must be called before `update_params` is first called.
+
+        Args:
+            params: A list of the parameters to be optimised. Note these will
+                be updated *in-place* to avoid reallocating arrays on each
+                update.
+        """
+        super(AdaGradLearningRule, self).initialise(params)
+        self.sum_sq_grads = []
+        for param in self.params:
+            self.sum_sq_grads.append(np.zeros_like(param))
+
+    def reset(self):
+        """Resets any additional state variables to their initial values.
+
+        For this learning rule this corresponds to zeroing all the sum of
+        squared gradient states.
+        """
+        for sum_sq_grad in self.sum_sq_grads:
+            sum_sq_grad *= 0.
+
+    def update_params(self, grads_wrt_params):
+        """Applies a single update to all parameters.
+
+        All parameter updates are performed using in-place operations and so
+        nothing is returned.
+
+        Args:
+            grads_wrt_params: A list of gradients of the scalar loss function
+                with respect to each of the parameters passed to `initialise`
+                previously, with this list expected to be in the same order.
+        """
+        for param, sum_sq_grad, grad in zip(
+                self.params, self.sum_sq_grads, grads_wrt_params):
+            sum_sq_grad += grad**2
+            param -= (self.learning_rate * grad /
+                      (sum_sq_grad + self.epsilon)**0.5)
+
+
+class RMSPropLearningRule(GradientDescentLearningRule):
+    """Root mean squared gradient normalised learning rule (RMSProp).
+
+    First-order gradient-descent based learning rule which normalises gradient
+    updates by an exponentially smoothed estimate of the gradient second
+    moments.
+
+    References:
+      [1]: Neural Networks for Machine Learning: Lecture 6a slides
+           University of Toronto, Computer Science Course CSC321
+           http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
+    """
+
+    def __init__(self, learning_rate=1e-3, beta=0.9, epsilon=1e-8):
+        """Creates a new learning rule object.
+
+        Args:
+            learning_rate: A positive scalar to scale gradient updates to the
+                parameters by. This needs to be carefully set - if too large
+                the learning dynamic will be unstable and may diverge, while
+                if set too small learning will proceed very slowly.
+            beta: Exponential decay rate for gradient second moment
+                estimates. This should be a scalar value in [0, 1]. The
+                running gradient second moment estimate is calculated using
+                `m_2 = beta * m_2_prev + (1 - beta) * g**2`
+                where `m_2_prev` is the previous estimate and `g` the current
+                parameter gradients.
+            epsilon: 'Softening' parameter to stop updates diverging when
+                gradient second moment estimates are close to zero. Should be
+                set to a small positive value.
+        """
+        super(RMSPropLearningRule, self).__init__(learning_rate)
+        assert beta >= 0. and beta <= 1., 'beta should be in [0, 1].'
+        assert epsilon > 0., 'epsilon should be > 0.'
+        self.beta = beta
+        self.epsilon = epsilon
+
+    def initialise(self, params):
+        """Initialises the state of the learning rule for a set of parameters.
+
+        This must be called before `update_params` is first called.
+
+        Args:
+            params: A list of the parameters to be optimised. Note these will
+                be updated *in-place* to avoid reallocating arrays on each
+                update.
+        """
+        super(RMSPropLearningRule, self).initialise(params)
+        self.moms_2 = []
+        for param in self.params:
+            self.moms_2.append(np.zeros_like(param))
+
+    def reset(self):
+        """Resets any additional state variables to their initial values.
+
+        For this learning rule this corresponds to zeroing all gradient
+        second moment estimates.
+        """
+        for mom_2 in self.moms_2:
+            mom_2 *= 0.
+
+    def update_params(self, grads_wrt_params):
+        """Applies a single update to all parameters.
+
+        All parameter updates are performed using in-place operations and so
+        nothing is returned.
+
+        Args:
+            grads_wrt_params: A list of gradients of the scalar loss function
+                with respect to each of the parameters passed to `initialise`
+                previously, with this list expected to be in the same order.
+        """
+        for param, mom_2, grad in zip(
+                self.params, self.moms_2, grads_wrt_params):
+            mom_2 *= self.beta
+            mom_2 += (1. - self.beta) * grad**2
+            param -= (self.learning_rate * grad /
+                      (mom_2 + self.epsilon)**0.5)
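
As a quick sanity check, the new rules can be exercised directly against a toy
quadratic objective without the rest of the framework. The sketch below is
illustrative only: it uses just the interface added in this patch (the
constructors, `initialise` and `update_params`) plus NumPy, assumes the base
class's `initialise` stores the parameter list as `self.params` (as the loops
over `self.params` above imply), and the learning rates and step count are
arbitrary choices for the example rather than recommendations.

import numpy as np

from mlp.learning_rules import (
    AdamLearningRule, AdaGradLearningRule, RMSPropLearningRule)

# Toy objective: 0.5 * ||param - target||**2, whose gradient is (param - target).
target = np.array([1., -2., 3.])

for rule in [AdamLearningRule(learning_rate=1e-1),
             AdaGradLearningRule(learning_rate=5e-1),
             RMSPropLearningRule(learning_rate=1e-2)]:
    param = np.zeros(3)       # updated in-place by the learning rule
    rule.initialise([param])  # must be called before update_params
    for step in range(1000):
        grad = param - target
        rule.update_params([grad])
    # Each rule should drive param close to target on this toy problem.
    print('{0}: {1}'.format(type(rule).__name__, param))

Because the rules mutate the parameter arrays in place, the same `param`
object passed to `initialise` is the one read back after the loop; `reset`
can be called between runs to zero the moment / sum-of-squares state without
constructing a new rule object.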