Adding adaptive learning rules.

Matt Graham 2016-11-03 18:31:37 +00:00
parent 6f4aa5fd5f
commit 2b20484649


@@ -270,3 +270,253 @@ class NesterovMomentumLearningRule(GradientDescentLearningRule):
mom *= self.mom_coeff
mom -= self.learning_rate * grad
param += (1. + self.mom_coeff) * mom - self.mom_coeff * mom_prev
class AdamLearningRule(GradientDescentLearningRule):
"""Adaptive moments (Adam) learning rule.
First-order gradient-descent based learning rule which uses adaptive
estimates of first and second moments of the parameter gradients to
calculate the parameter updates.
References:
[1]: Adam: a method for stochastic optimisation
Kingma and Ba, 2015
"""
def __init__(self, learning_rate=1e-3, beta_1=0.9, beta_2=0.999,
epsilon=1e-8):
"""Creates a new learning rule object.
Args:
learning_rate: A positive scalar to scale gradient updates to the
parameters by. This needs to be carefully set - if too large
the learning dynamic will be unstable and may diverge, while
if set too small learning will proceed very slowly.
beta_1: Exponential decay rate for gradient first moment estimates.
This should be a scalar value in [0, 1]. The running gradient
first moment estimate is calculated using
`m_1 = beta_1 * m_1_prev + (1 - beta_1) * g`
where `m_1_prev` is the previous estimate and `g` the current
parameter gradients.
beta_2: Exponential decay rate for gradient second moment
estimates. This should be a scalar value in [0, 1]. The running
gradient second moment estimate is calculated using
`m_2 = beta_2 * m_2_prev + (1 - beta_2) * g**2`
where `m_2_prev` is the previous estimate and `g` the current
parameter gradients.
epsilon: 'Softening' parameter to stop updates diverging when
second moment estimates are close to zero. Should be set to
a small positive value.
"""
super(AdamLearningRule, self).__init__(learning_rate)
assert beta_1 >= 0. and beta_1 <= 1., 'beta_1 should be in [0, 1].'
assert beta_2 >= 0. and beta_2 <= 1., 'beta_2 should be in [0, 1].'
assert epsilon > 0., 'epsilon should be > 0.'
self.beta_1 = beta_1
self.beta_2 = beta_2
self.epsilon = epsilon
def initialise(self, params):
"""Initialises the state of the learning rule for a set or parameters.
This must be called before `update_params` is first called.
Args:
params: A list of the parameters to be optimised. Note these will
be updated *in-place* to avoid reallocating arrays on each
update.
"""
super(AdamLearningRule, self).initialise(params)
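# moms_1 and moms_2 hold the running first and second moment estimates
# for each parameter; step_count tracks the time step used for the
# bias-corrected step size in update_params.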
self.moms_1 = []
for param in self.params:
self.moms_1.append(np.zeros_like(param))
self.moms_2 = []
for param in self.params:
self.moms_2.append(np.zeros_like(param))
self.step_count = 0
def reset(self):
"""Resets any additional state variables to their initial values.
For this learning rule this corresponds to zeroing the estimates of
the first and second moments of the gradients.
"""
for mom_1, mom_2 in zip(self.moms_1, self.moms_2):
mom_1 *= 0.
mom_2 *= 0.
self.step_count = 0
def update_params(self, grads_wrt_params):
"""Applies a single update to all parameters.
All parameter updates are performed using in-place operations and so
nothing is returned.
Args:
grads_wrt_params: A list of gradients of the scalar loss function
with respect to each of the parameters passed to `initialise`
previously, with this list expected to be in the same order.
"""
for param, mom_1, mom_2, grad in zip(
self.params, self.moms_1, self.moms_2, grads_wrt_params):
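# Update the (biased) first and second moment estimates in place as
# exponentially decaying averages of the gradients and their
# elementwise squares.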
mom_1 *= self.beta_1
mom_1 += (1. - self.beta_1) * grad
mom_2 *= self.beta_2
mom_2 += (1. - self.beta_2) * grad**2
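# Bias correction: rather than forming explicit bias-corrected moment
# estimates, the correction factors (1 - beta_1**t) and (1 - beta_2**t)
# are folded into the step size alpha_t, following the efficient form
# suggested in the Adam paper.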
alpha_t = (
self.learning_rate *
(1. - self.beta_2**(self.step_count + 1))**0.5 /
(1. - self.beta_1**(self.step_count + 1))
)
param -= alpha_t * mom_1 / (mom_2**0.5 + self.epsilon)
self.step_count += 1
class AdaGradLearningRule(GradientDescentLearningRule):
"""Adaptive gradients (AdaGrad) learning rule.
First-order gradient-descent based learning rule which normalises gradient
updates by a running sum of the past squared gradients.
References:
[1]: Adaptive Subgradient Methods for Online Learning and Stochastic
Optimization. Duchi, Hazan and Singer, 2011
"""
def __init__(self, learning_rate=1e-2, epsilon=1e-8):
"""Creates a new learning rule object.
Args:
learning_rate: A positive scalar to scale gradient updates to the
parameters by. This needs to be carefully set - if too large
the learning dynamic will be unstable and may diverge, while
if set too small learning will proceed very slowly.
epsilon: 'Softening' parameter to stop updates diverging when
sums of squared gradients are close to zero. Should be set to
a small positive value.
"""
super(AdaGradLearningRule, self).__init__(learning_rate)
assert epsilon > 0., 'epsilon should be > 0.'
self.epsilon = epsilon
def initialise(self, params):
"""Initialises the state of the learning rule for a set or parameters.
This must be called before `update_params` is first called.
Args:
params: A list of the parameters to be optimised. Note these will
be updated *in-place* to avoid reallocating arrays on each
update.
"""
super(AdaGradLearningRule, self).initialise(params)
self.sum_sq_grads = []
for param in self.params:
self.sum_sq_grads.append(np.zeros_like(param))
def reset(self):
"""Resets any additional state variables to their initial values.
For this learning rule this corresponds to zeroing all the sum of
squared gradient states.
"""
for sum_sq_grad in self.sum_sq_grads:
sum_sq_grad *= 0.
def update_params(self, grads_wrt_params):
"""Applies a single update to all parameters.
All parameter updates are performed using in-place operations and so
nothing is returned.
Args:
grads_wrt_params: A list of gradients of the scalar loss function
with respect to each of the parameters passed to `initialise`
previously, with this list expected to be in the same order.
"""
for param, sum_sq_grad, grad in zip(
self.params, self.sum_sq_grads, grads_wrt_params):
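# Accumulate the elementwise sum of squared gradients: since this sum
# only ever grows, the effective per-parameter step size shrinks
# monotonically over training.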
sum_sq_grad += grad**2
param -= (self.learning_rate * grad /
(sum_sq_grad + self.epsilon)**0.5)
class RMSPropLearningRule(GradientDescentLearningRule):
"""Root mean squared gradient normalised learning rule (RMSProp).
First-order gradient-descent based learning rule which normalises gradient
updates by an exponentially smoothed estimate of the gradient second
moments.
References:
[1]: Neural Networks for Machine Learning: Lecture 6a slides
University of Toronto, Computer Science Course CSC321
http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
"""
def __init__(self, learning_rate=1e-3, beta=0.9, epsilon=1e-8):
"""Creates a new learning rule object.
Args:
learning_rate: A positive scalar to scale gradient updates to the
parameters by. This needs to be carefully set - if too large
the learning dynamic will be unstable and may diverge, while
if set too small learning will proceed very slowly.
beta: Exponential decay rate for gradient second moment
estimates. This should be a scalar value in [0, 1]. The running
gradient second moment estimate is calculated using
`m_2 = beta * m_2_prev + (1 - beta) * g**2`
where `m_2_prev` is the previous estimate and `g` the current
parameter gradients.
epsilon: 'Softening' parameter to stop updates diverging when
gradient second moment estimates are close to zero. Should be
set to a small positive value.
"""
super(RMSPropLearningRule, self).__init__(learning_rate)
assert beta >= 0. and beta <= 1., 'beta should be in [0, 1].'
assert epsilon > 0., 'epsilon should be > 0.'
self.beta = beta
self.epsilon = epsilon
def initialise(self, params):
"""Initialises the state of the learning rule for a set or parameters.
This must be called before `update_params` is first called.
Args:
params: A list of the parameters to be optimised. Note these will
be updated *in-place* to avoid reallocating arrays on each
update.
"""
super(RMSPropLearningRule, self).initialise(params)
self.moms_2 = []
for param in self.params:
self.moms_2.append(np.zeros_like(param))
def reset(self):
"""Resets any additional state variables to their initial values.
For this learning rule this corresponds to zeroing all gradient
second moment estimates.
"""
for mom_2 in self.moms_2:
mom_2 *= 0.
def update_params(self, grads_wrt_params):
"""Applies a single update to all parameters.
All parameter updates are performed using in-place operations and so
nothing is returned.
Args:
grads_wrt_params: A list of gradients of the scalar loss function
with respect to each of the parameters passed to `initialise`
previously, with this list expected to be in the same order.
"""
for param, mom_2, grad in zip(
self.params, self.moms_2, grads_wrt_params):
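# Unlike AdaGrad's ever-growing sum, the squared-gradient statistic here
# is an exponential moving average, so the effective step size need not
# shrink towards zero over training.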
mom_2 *= self.beta
mom_2 += (1. - self.beta) * grad**2
param -= (self.learning_rate * grad /
(mom_2 + self.epsilon)**0.5)
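# Minimal usage sketch (illustrative only, not part of the learning rule
# interfaces above): minimise the quadratic f(p) = 0.5 * sum(p**2), whose
# gradient with respect to p is simply p. Any of the rules defined above
# could be substituted for AdamLearningRule here. numpy is assumed to be
# imported as np at module level, as the classes above require.
if __name__ == '__main__':
    params = [np.random.randn(5)]
    rule = AdamLearningRule(learning_rate=1e-2)
    rule.initialise(params)
    for step in range(500):
        grads_wrt_params = [params[0].copy()]  # grad of 0.5 * ||p||**2 is p
        rule.update_params(grads_wrt_params)
    # After enough steps the parameters should be close to zero.
    print(params[0])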