Adding adaptive learning rules.
This commit is contained in:
parent
6f4aa5fd5f
commit
2b20484649
@@ -270,3 +270,253 @@ class NesterovMomentumLearningRule(GradientDescentLearningRule):
            mom *= self.mom_coeff
            mom -= self.learning_rate * grad
            param += (1. + self.mom_coeff) * mom - self.mom_coeff * mom_prev


class AdamLearningRule(GradientDescentLearningRule):
    """Adaptive moments (Adam) learning rule.

    First-order gradient-descent based learning rule which uses adaptive
    estimates of first and second moments of the parameter gradients to
    calculate the parameter updates.

    References:
      [1]: Adam: a method for stochastic optimisation
           Kingma and Ba, 2015
    """

    def __init__(self, learning_rate=1e-3, beta_1=0.9, beta_2=0.999,
                 epsilon=1e-8):
        """Creates a new learning rule object.

        Args:
            learning_rate: A positive scalar to scale gradient updates to the
                parameters by. This needs to be carefully set - if too large
                the learning dynamic will be unstable and may diverge, while
                if set too small learning will proceed very slowly.
            beta_1: Exponential decay rate for gradient first moment estimates.
                This should be a scalar value in [0, 1]. The running gradient
                first moment estimate is calculated using
                `m_1 = beta_1 * m_1_prev + (1 - beta_1) * g`
                where `m_1_prev` is the previous estimate and `g` the current
                parameter gradients.
            beta_2: Exponential decay rate for gradient second moment
                estimates. This should be a scalar value in [0, 1]. The
                running gradient second moment estimate is calculated using
                `m_2 = beta_2 * m_2_prev + (1 - beta_2) * g**2`
                where `m_2_prev` is the previous estimate and `g` the current
                parameter gradients.
            epsilon: 'Softening' parameter to stop updates diverging when
                second moment estimates are close to zero. Should be set to
                a small positive value.
        """
        super(AdamLearningRule, self).__init__(learning_rate)
        assert beta_1 >= 0. and beta_1 <= 1., 'beta_1 should be in [0, 1].'
        assert beta_2 >= 0. and beta_2 <= 1., 'beta_2 should be in [0, 1].'
        assert epsilon > 0., 'epsilon should be > 0.'
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.epsilon = epsilon

    def initialise(self, params):
        """Initialises the state of the learning rule for a set of parameters.

        This must be called before `update_params` is first called.

        Args:
            params: A list of the parameters to be optimised. Note these will
                be updated *in-place* to avoid reallocating arrays on each
                update.
        """
        super(AdamLearningRule, self).initialise(params)
        self.moms_1 = []
        for param in self.params:
            self.moms_1.append(np.zeros_like(param))
        self.moms_2 = []
        for param in self.params:
            self.moms_2.append(np.zeros_like(param))
        self.step_count = 0

    def reset(self):
        """Resets any additional state variables to their initial values.

        For this learning rule this corresponds to zeroing the estimates of
        the first and second moments of the gradients.
        """
        for mom_1, mom_2 in zip(self.moms_1, self.moms_2):
            mom_1 *= 0.
            mom_2 *= 0.
        self.step_count = 0

    def update_params(self, grads_wrt_params):
        """Applies a single update to all parameters.

        All parameter updates are performed using in-place operations and so
        nothing is returned.

        Args:
            grads_wrt_params: A list of gradients of the scalar loss function
                with respect to each of the parameters passed to `initialise`
                previously, with this list expected to be in the same order.
        """
        for param, mom_1, mom_2, grad in zip(
                self.params, self.moms_1, self.moms_2, grads_wrt_params):
            mom_1 *= self.beta_1
            mom_1 += (1. - self.beta_1) * grad
            mom_2 *= self.beta_2
            mom_2 += (1. - self.beta_2) * grad**2
            alpha_t = (
                self.learning_rate *
                (1. - self.beta_2**(self.step_count + 1))**0.5 /
                (1. - self.beta_1**(self.step_count + 1))
            )
            param -= alpha_t * mom_1 / (mom_2**0.5 + self.epsilon)
        self.step_count += 1

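For reference, a minimal standalone numpy sketch of a single Adam step that mirrors the in-place update in `update_params` above; the parameter and gradient values are hypothetical, and the snippet is an illustration rather than part of the commit:

import numpy as np

# One Adam step on hypothetical values; hyperparameters match the class defaults.
learning_rate, beta_1, beta_2, epsilon = 1e-3, 0.9, 0.999, 1e-8
param = np.array([0.5, -0.3])
grad = np.array([0.2, -0.1])
mom_1 = np.zeros_like(param)  # first moment estimate, zeroed as in initialise
mom_2 = np.zeros_like(param)  # second moment estimate, zeroed as in initialise
step_count = 0

mom_1 = beta_1 * mom_1 + (1. - beta_1) * grad
mom_2 = beta_2 * mom_2 + (1. - beta_2) * grad**2
# Bias-corrected step size, equivalent to the alpha_t expression above.
alpha_t = (learning_rate * (1. - beta_2**(step_count + 1))**0.5 /
           (1. - beta_1**(step_count + 1)))
param = param - alpha_t * mom_1 / (mom_2**0.5 + epsilon)
step_count += 1
print(param)  # first step moves each entry by roughly learning_rate against the gradient sign
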
class AdaGradLearningRule(GradientDescentLearningRule):
    """Adaptive gradients (AdaGrad) learning rule.

    First-order gradient-descent based learning rule which normalises gradient
    updates by a running sum of the past squared gradients.

    References:
      [1]: Adaptive Subgradient Methods for Online Learning and Stochastic
           Optimization. Duchi, Hazan and Singer, 2011
    """

    def __init__(self, learning_rate=1e-2, epsilon=1e-8):
        """Creates a new learning rule object.

        Args:
            learning_rate: A positive scalar to scale gradient updates to the
                parameters by. This needs to be carefully set - if too large
                the learning dynamic will be unstable and may diverge, while
                if set too small learning will proceed very slowly.
            epsilon: 'Softening' parameter to stop updates diverging when
                sums of squared gradients are close to zero. Should be set to
                a small positive value.
        """
        super(AdaGradLearningRule, self).__init__(learning_rate)
        assert epsilon > 0., 'epsilon should be > 0.'
        self.epsilon = epsilon

    def initialise(self, params):
        """Initialises the state of the learning rule for a set of parameters.

        This must be called before `update_params` is first called.

        Args:
            params: A list of the parameters to be optimised. Note these will
                be updated *in-place* to avoid reallocating arrays on each
                update.
        """
        super(AdaGradLearningRule, self).initialise(params)
        self.sum_sq_grads = []
        for param in self.params:
            self.sum_sq_grads.append(np.zeros_like(param))

    def reset(self):
        """Resets any additional state variables to their initial values.

        For this learning rule this corresponds to zeroing all the sum of
        squared gradient states.
        """
        for sum_sq_grad in self.sum_sq_grads:
            sum_sq_grad *= 0.

    def update_params(self, grads_wrt_params):
        """Applies a single update to all parameters.

        All parameter updates are performed using in-place operations and so
        nothing is returned.

        Args:
            grads_wrt_params: A list of gradients of the scalar loss function
                with respect to each of the parameters passed to `initialise`
                previously, with this list expected to be in the same order.
        """
        for param, sum_sq_grad, grad in zip(
                self.params, self.sum_sq_grads, grads_wrt_params):
            sum_sq_grad += grad**2
            param -= (self.learning_rate * grad /
                      (sum_sq_grad + self.epsilon)**0.5)

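Similarly, a minimal standalone numpy sketch of one AdaGrad step, mirroring `update_params` above (hypothetical values, not part of the commit):

import numpy as np

# One AdaGrad step on hypothetical values; hyperparameters match the class defaults.
learning_rate, epsilon = 1e-2, 1e-8
param = np.array([0.5, -0.3])
grad = np.array([0.2, -0.1])
sum_sq_grad = np.zeros_like(param)  # accumulated squared gradients, zeroed as in initialise

sum_sq_grad = sum_sq_grad + grad**2
param = param - learning_rate * grad / (sum_sq_grad + epsilon)**0.5
print(param)  # the effective step size shrinks as sum_sq_grad accumulates over updates
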
class RMSPropLearningRule(GradientDescentLearningRule):
    """Root mean squared gradient normalised learning rule (RMSProp).

    First-order gradient-descent based learning rule which normalises gradient
    updates by an exponentially smoothed estimate of the gradient second
    moments.

    References:
      [1]: Neural Networks for Machine Learning: Lecture 6a slides
           University of Toronto, Computer Science Course CSC321
           http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
    """

    def __init__(self, learning_rate=1e-3, beta=0.9, epsilon=1e-8):
        """Creates a new learning rule object.

        Args:
            learning_rate: A positive scalar to scale gradient updates to the
                parameters by. This needs to be carefully set - if too large
                the learning dynamic will be unstable and may diverge, while
                if set too small learning will proceed very slowly.
            beta: Exponential decay rate for gradient second moment
                estimates. This should be a scalar value in [0, 1]. The running
                gradient second moment estimate is calculated using
                `m_2 = beta * m_2_prev + (1 - beta) * g**2`
                where `m_2_prev` is the previous estimate and `g` the current
                parameter gradients.
            epsilon: 'Softening' parameter to stop updates diverging when
                gradient second moment estimates are close to zero. Should be
                set to a small positive value.
        """
        super(RMSPropLearningRule, self).__init__(learning_rate)
        assert beta >= 0. and beta <= 1., 'beta should be in [0, 1].'
        assert epsilon > 0., 'epsilon should be > 0.'
        self.beta = beta
        self.epsilon = epsilon

    def initialise(self, params):
        """Initialises the state of the learning rule for a set of parameters.

        This must be called before `update_params` is first called.

        Args:
            params: A list of the parameters to be optimised. Note these will
                be updated *in-place* to avoid reallocating arrays on each
                update.
        """
        super(RMSPropLearningRule, self).initialise(params)
        self.moms_2 = []
        for param in self.params:
            self.moms_2.append(np.zeros_like(param))

    def reset(self):
        """Resets any additional state variables to their initial values.

        For this learning rule this corresponds to zeroing all gradient
        second moment estimates.
        """
        for mom_2 in self.moms_2:
            mom_2 *= 0.

    def update_params(self, grads_wrt_params):
        """Applies a single update to all parameters.

        All parameter updates are performed using in-place operations and so
        nothing is returned.

        Args:
            grads_wrt_params: A list of gradients of the scalar loss function
                with respect to each of the parameters passed to `initialise`
                previously, with this list expected to be in the same order.
        """
        for param, mom_2, grad in zip(
                self.params, self.moms_2, grads_wrt_params):
            mom_2 *= self.beta
            mom_2 += (1. - self.beta) * grad**2
            param -= (self.learning_rate * grad /
                      (mom_2 + self.epsilon)**0.5)

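Finally, a minimal standalone numpy sketch of one RMSProp step, mirroring `update_params` above (hypothetical values, not part of the commit):

import numpy as np

# One RMSProp step on hypothetical values; hyperparameters match the class defaults.
learning_rate, beta, epsilon = 1e-3, 0.9, 1e-8
param = np.array([0.5, -0.3])
grad = np.array([0.2, -0.1])
mom_2 = np.zeros_like(param)  # smoothed squared-gradient estimate, zeroed as in initialise

mom_2 = beta * mom_2 + (1. - beta) * grad**2
param = param - learning_rate * grad / (mom_2 + epsilon)**0.5
print(param)  # gradients are rescaled by their recent root-mean-square magnitude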