From 2b204846493fc2242d357608a6996a1c2c31aaa1 Mon Sep 17 00:00:00 2001
From: Matt Graham
Date: Thu, 3 Nov 2016 18:31:37 +0000
Subject: [PATCH] Adding adaptive learning rules.

---
 mlp/learning_rules.py | 250 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 250 insertions(+)

diff --git a/mlp/learning_rules.py b/mlp/learning_rules.py
index c7fa3a6..4bfa330 100644
--- a/mlp/learning_rules.py
+++ b/mlp/learning_rules.py
@@ -270,3 +270,253 @@ class NesterovMomentumLearningRule(GradientDescentLearningRule):
         mom *= self.mom_coeff
         mom -= self.learning_rate * grad
         param += (1. + self.mom_coeff) * mom - self.mom_coeff * mom_prev
+
+
+class AdamLearningRule(GradientDescentLearningRule):
+    """Adaptive moments (Adam) learning rule.
+
+    First-order gradient-descent based learning rule which uses adaptive
+    estimates of first and second moments of the parameter gradients to
+    calculate the parameter updates.
+
+    References:
+      [1]: Adam: a method for stochastic optimisation
+           Kingma and Ba, 2015
+    """
+
+    def __init__(self, learning_rate=1e-3, beta_1=0.9, beta_2=0.999,
+                 epsilon=1e-8):
+        """Creates a new learning rule object.
+
+        Args:
+            learning_rate: A positive scalar to scale gradient updates to the
+                parameters by. This needs to be carefully set - if too large
+                the learning dynamic will be unstable and may diverge, while
+                if set too small learning will proceed very slowly.
+            beta_1: Exponential decay rate for gradient first moment estimates.
+                This should be a scalar value in [0, 1]. The running gradient
+                first moment estimate is calculated using
+                `m_1 = beta_1 * m_1_prev + (1 - beta_1) * g`
+                where `m_1_prev` is the previous estimate and `g` the current
+                parameter gradients.
+            beta_2: Exponential decay rate for gradient second moment
+                estimates. This should be a scalar value in [0, 1]. The
+                running gradient second moment estimate is calculated using
+                `m_2 = beta_2 * m_2_prev + (1 - beta_2) * g**2`
+                where `m_2_prev` is the previous estimate and `g` the current
+                parameter gradients.
+            epsilon: 'Softening' parameter to stop updates diverging when
+                second moment estimates are close to zero. Should be set to
+                a small positive value.
+        """
+        super(AdamLearningRule, self).__init__(learning_rate)
+        assert beta_1 >= 0. and beta_1 <= 1., 'beta_1 should be in [0, 1].'
+        assert beta_2 >= 0. and beta_2 <= 1., 'beta_2 should be in [0, 1].'
+        assert epsilon > 0., 'epsilon should be > 0.'
+        self.beta_1 = beta_1
+        self.beta_2 = beta_2
+        self.epsilon = epsilon
+
+    def initialise(self, params):
+        """Initialises the state of the learning rule for a set of parameters.
+
+        This must be called before `update_params` is first called.
+
+        Args:
+            params: A list of the parameters to be optimised. Note these will
+                be updated *in-place* to avoid reallocating arrays on each
+                update.
+        """
+        super(AdamLearningRule, self).initialise(params)
+        self.moms_1 = []
+        for param in self.params:
+            self.moms_1.append(np.zeros_like(param))
+        self.moms_2 = []
+        for param in self.params:
+            self.moms_2.append(np.zeros_like(param))
+        self.step_count = 0
+
+    def reset(self):
+        """Resets any additional state variables to their initial values.
+
+        For this learning rule this corresponds to zeroing the estimates of
+        the first and second moments of the gradients.
+        """
+        for mom_1, mom_2 in zip(self.moms_1, self.moms_2):
+            mom_1 *= 0.
+            mom_2 *= 0.
+        self.step_count = 0
+
+    def update_params(self, grads_wrt_params):
+        """Applies a single update to all parameters.
+
+        All parameter updates are performed using in-place operations and so
+        nothing is returned.
+
+        Args:
+            grads_wrt_params: A list of gradients of the scalar loss function
+                with respect to each of the parameters passed to `initialise`
+                previously, with this list expected to be in the same order.
+        """
+        for param, mom_1, mom_2, grad in zip(
+                self.params, self.moms_1, self.moms_2, grads_wrt_params):
+            mom_1 *= self.beta_1
+            mom_1 += (1. - self.beta_1) * grad
+            mom_2 *= self.beta_2
+            mom_2 += (1. - self.beta_2) * grad**2
+            alpha_t = (
+                self.learning_rate *
+                (1. - self.beta_2**(self.step_count + 1))**0.5 /
+                (1. - self.beta_1**(self.step_count + 1))
+            )
+            param -= alpha_t * mom_1 / (mom_2**0.5 + self.epsilon)
+        self.step_count += 1
+
+
+class AdaGradLearningRule(GradientDescentLearningRule):
+    """Adaptive gradients (AdaGrad) learning rule.
+
+    First-order gradient-descent based learning rule which normalises gradient
+    updates by a running sum of the past squared gradients.
+
+    References:
+      [1]: Adaptive Subgradient Methods for Online Learning and Stochastic
+           Optimization. Duchi, Hazan and Singer, 2011
+    """
+
+    def __init__(self, learning_rate=1e-2, epsilon=1e-8):
+        """Creates a new learning rule object.
+
+        Args:
+            learning_rate: A positive scalar to scale gradient updates to the
+                parameters by. This needs to be carefully set - if too large
+                the learning dynamic will be unstable and may diverge, while
+                if set too small learning will proceed very slowly.
+            epsilon: 'Softening' parameter to stop updates diverging when
+                sums of squared gradients are close to zero. Should be set to
+                a small positive value.
+        """
+        super(AdaGradLearningRule, self).__init__(learning_rate)
+        assert epsilon > 0., 'epsilon should be > 0.'
+        self.epsilon = epsilon
+
+    def initialise(self, params):
+        """Initialises the state of the learning rule for a set of parameters.
+
+        This must be called before `update_params` is first called.
+
+        Args:
+            params: A list of the parameters to be optimised. Note these will
+                be updated *in-place* to avoid reallocating arrays on each
+                update.
+        """
+        super(AdaGradLearningRule, self).initialise(params)
+        self.sum_sq_grads = []
+        for param in self.params:
+            self.sum_sq_grads.append(np.zeros_like(param))
+
+    def reset(self):
+        """Resets any additional state variables to their initial values.
+
+        For this learning rule this corresponds to zeroing all the sum of
+        squared gradient states.
+        """
+        for sum_sq_grad in self.sum_sq_grads:
+            sum_sq_grad *= 0.
+
+    def update_params(self, grads_wrt_params):
+        """Applies a single update to all parameters.
+
+        All parameter updates are performed using in-place operations and so
+        nothing is returned.
+
+        Args:
+            grads_wrt_params: A list of gradients of the scalar loss function
+                with respect to each of the parameters passed to `initialise`
+                previously, with this list expected to be in the same order.
+        """
+        for param, sum_sq_grad, grad in zip(
+                self.params, self.sum_sq_grads, grads_wrt_params):
+            sum_sq_grad += grad**2
+            param -= (self.learning_rate * grad /
+                      (sum_sq_grad + self.epsilon)**0.5)
+
+
+class RMSPropLearningRule(GradientDescentLearningRule):
+    """Root mean squared gradient normalised learning rule (RMSProp).
+
+    First-order gradient-descent based learning rule which normalises gradient
+    updates by an exponentially smoothed estimate of the gradient second
+    moments.
+
+    References:
+      [1]: Neural Networks for Machine Learning: Lecture 6a slides
+           University of Toronto, Computer Science Course CSC321
+           http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
+    """
+
+    def __init__(self, learning_rate=1e-3, beta=0.9, epsilon=1e-8):
+        """Creates a new learning rule object.
+
+        Args:
+            learning_rate: A positive scalar to scale gradient updates to the
+                parameters by. This needs to be carefully set - if too large
+                the learning dynamic will be unstable and may diverge, while
+                if set too small learning will proceed very slowly.
+            beta: Exponential decay rate for gradient second moment
+                estimates. This should be a scalar value in [0, 1]. The
+                running gradient second moment estimate is calculated using
+                `m_2 = beta * m_2_prev + (1 - beta) * g**2`
+                where `m_2_prev` is the previous estimate and `g` the current
+                parameter gradients.
+            epsilon: 'Softening' parameter to stop updates diverging when
+                gradient second moment estimates are close to zero. Should be
+                set to a small positive value.
+        """
+        super(RMSPropLearningRule, self).__init__(learning_rate)
+        assert beta >= 0. and beta <= 1., 'beta should be in [0, 1].'
+        assert epsilon > 0., 'epsilon should be > 0.'
+        self.beta = beta
+        self.epsilon = epsilon
+
+    def initialise(self, params):
+        """Initialises the state of the learning rule for a set of parameters.
+
+        This must be called before `update_params` is first called.
+
+        Args:
+            params: A list of the parameters to be optimised. Note these will
+                be updated *in-place* to avoid reallocating arrays on each
+                update.
+        """
+        super(RMSPropLearningRule, self).initialise(params)
+        self.moms_2 = []
+        for param in self.params:
+            self.moms_2.append(np.zeros_like(param))
+
+    def reset(self):
+        """Resets any additional state variables to their initial values.
+
+        For this learning rule this corresponds to zeroing all gradient
+        second moment estimates.
+        """
+        for mom_2 in self.moms_2:
+            mom_2 *= 0.
+
+    def update_params(self, grads_wrt_params):
+        """Applies a single update to all parameters.
+
+        All parameter updates are performed using in-place operations and so
+        nothing is returned.
+
+        Args:
+            grads_wrt_params: A list of gradients of the scalar loss function
+                with respect to each of the parameters passed to `initialise`
+                previously, with this list expected to be in the same order.
+        """
+        for param, mom_2, grad in zip(
+                self.params, self.moms_2, grads_wrt_params):
+            mom_2 *= self.beta
+            mom_2 += (1. - self.beta) * grad**2
+            param -= (self.learning_rate * grad /
+                      (mom_2 + self.epsilon)**0.5)
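
As a quick sanity check, the new rules can be exercised directly against a toy
quadratic objective without the rest of the framework. The sketch below is
illustrative only: it uses just the interface added in this patch (the
constructors, `initialise` and `update_params`) plus NumPy, assumes the base
class's `initialise` stores the parameter list as `self.params` (as the loops
over `self.params` above imply), and the learning rates and step count are
arbitrary choices for the example rather than recommendations.

import numpy as np

from mlp.learning_rules import (
    AdamLearningRule, AdaGradLearningRule, RMSPropLearningRule)

# Toy objective: 0.5 * ||param - target||**2, whose gradient is (param - target).
target = np.array([1., -2., 3.])

for rule in [AdamLearningRule(learning_rate=1e-1),
             AdaGradLearningRule(learning_rate=5e-1),
             RMSPropLearningRule(learning_rate=1e-2)]:
    param = np.zeros(3)       # updated in-place by the learning rule
    rule.initialise([param])  # must be called before update_params
    for step in range(1000):
        grad = param - target
        rule.update_params([grad])
    # Each rule should drive param close to target on this toy problem.
    print('{0}: {1}'.format(type(rule).__name__, param))

Because the rules mutate the parameter arrays in place, the same `param`
object passed to `initialise` is the one read back after the loop; `reset`
can be called between runs to zero the moment / sum-of-squares state without
constructing a new rule object.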