From d87f8b05aa638c110b15b22a9315509ae6cf1462 Mon Sep 17 00:00:00 2001
From: pswietojanski
Date: Mon, 2 Nov 2015 12:59:36 +0000
Subject: [PATCH] some clarifications

---
 04_Regularisation.ipynb | 12 +++++++-----
 mlp/layers.py           |  2 +-
 mlp/optimisers.py       | 41 +++++++++++++++++++++++++++++++++++------
 3 files changed, 43 insertions(+), 12 deletions(-)

diff --git a/04_Regularisation.ipynb b/04_Regularisation.ipynb
index 6ceff5d..24f2349 100644
--- a/04_Regularisation.ipynb
+++ b/04_Regularisation.ipynb
@@ -86,9 +86,9 @@
     "Hence, the gradient of the cost w.r.t parameter $w_i$ is given as follows:\n",
     "\n",
     "(5) $\n",
-    "\begin{align*}\frac{\partial E^n}{\partial w_i} &= \frac{\partial (E^n_{\text{train}} + \beta_{L_2} \frac{1}[2} E^n_{L_2}) }{\partial w_i} \n",
-    "  = \left( \frac{\partial E^n_{\text{train}}}{\partial w_i} + \beta_{L_2} \frac{\partial\n",
-    "  \frac{1}{2}E^n_{L_2}}{\partial w_i} \right) \n",
+    "\begin{align*}\frac{\partial E^n}{\partial w_i} &= \frac{\partial (E^n_{\text{train}} + \beta_{L_2} 0.5 E^n_{L_2}) }{\partial w_i} \n",
+    "  = \left( \frac{\partial E^n_{\text{train}}}{\partial w_i} + \beta_{L_2} 0.5 \frac{\partial\n",
+    "  E^n_{L_2}}{\partial w_i} \right) \n",
     "  = \left( \frac{\partial E^n_{\text{train}}}{\partial w_i} + \beta_{L_2} w_i \right)\n",
     "\end{align*}\n",
     "$\n",
@@ -101,7 +101,9 @@
     "\end{align*}\n",
     "$\n",
     "\n",
-    "where $\eta$ is learning rate.\n",
+    "where $\eta$ is the learning rate.\n",
+    "\n",
+    "Exercise 1 gives some more implementation suggestions on how to incorporate this technique into the lab code: the cost-related prior contributions (equation (1)) are computed in mlp.optimisers.Optimiser.compute_prior_costs(), and your job is to add the relevant optimisation-related code when computing the gradients w.r.t. the parameters.\n",
     "\n",
     "## $L_{p=1}$ (Sparsity)\n",
     "\n",
@@ -283,7 +285,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython2",
-   "version": "2.7.10"
+   "version": "2.7.9"
   }
  },
 "nbformat": 4,
diff --git a/mlp/layers.py b/mlp/layers.py
index e7590f8..d548c9c 100644
--- a/mlp/layers.py
+++ b/mlp/layers.py
@@ -290,7 +290,7 @@ class Sigmoid(Linear):
         #'a' get very negative. We limit both tails, however only
         #negative values may lead to numerical issues -- exp(-a)
         #clip() function does the following operation faster:
-        # a[a < -30.] = 30,
+        # a[a < -30.] = -30,
         # a[a > 30.] = 30.
         numpy.clip(a, -30.0, 30.0, out=a)
         h = 1.0/(1 + numpy.exp(-a))
diff --git a/mlp/optimisers.py b/mlp/optimisers.py
index 4b70f0c..77b4db6 100644
--- a/mlp/optimisers.py
+++ b/mlp/optimisers.py
@@ -20,7 +20,7 @@ class Optimiser(object):
     def train(self, model, train_iter, valid_iter=None):
         raise NotImplementedError()

-    def validate(self, model, valid_iterator):
+    def validate(self, model, valid_iterator, l1_weight=0, l2_weight=0):
         assert isinstance(model, MLP), (
             "Expected model to be a subclass of 'mlp.layers.MLP'"
             " class but got %s " % type(model)
@@ -40,7 +40,9 @@ class Optimiser(object):
         acc = numpy.mean(acc_list)
         nll = numpy.mean(nll_list)

-        return nll, acc
+        prior_costs = Optimiser.compute_prior_costs(model, l1_weight, l2_weight)
+
+        return nll + sum(prior_costs), acc

     @staticmethod
     def classification_accuracy(y, t):
@@ -56,6 +58,28 @@ class Optimiser(object):
         rval = numpy.equal(y_idx, t_idx)
         return rval

+    @staticmethod
+    def compute_prior_costs(model, l1_weight, l2_weight):
+        """
+        Computes the cost contributions coming from parameter-dependent only
+        regularisation penalties
+        """
+        assert isinstance(model, MLP), (
+            "Expected model to be a subclass of 'mlp.layers.MLP'"
+            " class but got %s " % type(model)
+        )
+
+        l1_cost, l2_cost = 0, 0
+        for i in xrange(0, len(model.layers)):
+            params = model.layers[i].get_params()
+            for param in params:
+                if l2_weight > 0:
+                    l2_cost += 0.5 * l2_weight * numpy.sum(param**2)
+                if l1_weight > 0:
+                    l1_cost += l1_weight * numpy.sum(numpy.abs(param))
+
+        return l1_cost, l2_cost
+

 class SGDOptimiser(Optimiser):
     def __init__(self, lr_scheduler,
@@ -117,7 +141,11 @@ class SGDOptimiser(Optimiser):
             nll_list.append(cost)
             acc_list.append(numpy.mean(self.classification_accuracy(y, t)))

-        return numpy.mean(nll_list), numpy.mean(acc_list)
+        #compute the prior penalties contribution (parameter dependent only)
+        prior_costs = Optimiser.compute_prior_costs(model, self.l1_weight, self.l2_weight)
+        training_cost = numpy.mean(nll_list) + sum(prior_costs)
+
+        return training_cost, numpy.mean(acc_list)

     def train(self, model, train_iterator, valid_iterator=None):

@@ -127,14 +155,14 @@ class SGDOptimiser(Optimiser):

         # do the initial validation
         train_iterator.reset()
-        tr_nll, tr_acc = self.validate(model, train_iterator)
+        tr_nll, tr_acc = self.validate(model, train_iterator, self.l1_weight, self.l2_weight)
         logger.info('Epoch %i: Training cost (%s) for initial model is %.3f. Accuracy is %.2f%%'
                     % (self.lr_scheduler.epoch, cost_name, tr_nll, tr_acc * 100.))
         tr_stats.append((tr_nll, tr_acc))

         if valid_iterator is not None:
             valid_iterator.reset()
-            valid_nll, valid_acc = self.validate(model, valid_iterator)
+            valid_nll, valid_acc = self.validate(model, valid_iterator, self.l1_weight, self.l2_weight)
             logger.info('Epoch %i: Validation cost (%s) for initial model is %.3f. Accuracy is %.2f%%'
                         % (self.lr_scheduler.epoch, cost_name, valid_nll, valid_acc * 100.))
             valid_stats.append((valid_nll, valid_acc))
@@ -155,7 +183,8 @@ class SGDOptimiser(Optimiser):
             vstart = time.clock()
             if valid_iterator is not None:
                 valid_iterator.reset()
-                valid_nll, valid_acc = self.validate(model, valid_iterator)
+                valid_nll, valid_acc = self.validate(model, valid_iterator,
+                                                     self.l1_weight, self.l2_weight)
                 logger.info('Epoch %i: Validation cost (%s) is %.3f. Accuracy is %.2f%%'
                             % (self.lr_scheduler.epoch + 1, cost_name, valid_nll, valid_acc * 100.))
                 self.lr_scheduler.get_next_rate(valid_acc)
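
A note on the exercise referred to in the notebook hunk above: the patch only adds the cost-side bookkeeping (compute_prior_costs()), and folding the penalty terms into the parameter gradients is left to the reader. Below is a minimal sketch of that step, assuming the data-term gradients for one layer are available as a list of numpy arrays aligned with get_params(); the helper name add_prior_gradients and its signature are illustrative only and are not identifiers from the mlp package.

import numpy

def add_prior_gradients(grads, params, l1_weight=0.0, l2_weight=0.0):
    """Return gradients with the L1/L2 penalty terms added.

    `grads` and `params` are matching lists of numpy arrays for one layer,
    e.g. the data-term gradients and the values returned by get_params().
    """
    reg_grads = []
    for grad, param in zip(grads, params):
        if l2_weight > 0:
            # d/dw of 0.5 * l2_weight * sum(w**2) is l2_weight * w, cf. equation (5)
            grad = grad + l2_weight * param
        if l1_weight > 0:
            # (sub-)gradient of l1_weight * sum(|w|) is l1_weight * sign(w)
            grad = grad + l1_weight * numpy.sign(param)
        reg_grads.append(grad)
    return reg_grads

With only weight decay switched on (l1_weight=0) this reduces to adding l2_weight * w to each gradient before the usual w := w - eta * grad step, i.e. the update rule quoted in the notebook text, with l2_weight playing the role of beta_L2.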