From d87f8b05aa638c110b15b22a9315509ae6cf1462 Mon Sep 17 00:00:00 2001
From: pswietojanski
Date: Mon, 2 Nov 2015 12:59:36 +0000
Subject: [PATCH] some clarifications
---
04_Regularisation.ipynb | 12 +++++++-----
mlp/layers.py | 2 +-
mlp/optimisers.py | 41 +++++++++++++++++++++++++++++++++++------
3 files changed, 43 insertions(+), 12 deletions(-)
diff --git a/04_Regularisation.ipynb b/04_Regularisation.ipynb
index 6ceff5d..24f2349 100644
--- a/04_Regularisation.ipynb
+++ b/04_Regularisation.ipynb
@@ -86,9 +86,9 @@
"Hence, the gradient of the cost w.r.t parameter $w_i$ is given as follows:\n",
"\n",
"(5) $\n",
- "\\begin{align*}\\frac{\\partial E^n}{\\partial w_i} &= \\frac{\\partial (E^n_{\\text{train}} + \\beta_{L_2} \\frac{1}[2} E^n_{L_2}) }{\\partial w_i} \n",
- " = \\left( \\frac{\\partial E^n_{\\text{train}}}{\\partial w_i} + \\beta_{L_2} \\frac{\\partial\n",
- " \\frac{1}{2}E^n_{L_2}}{\\partial w_i} \\right) \n",
+ "\\begin{align*}\\frac{\\partial E^n}{\\partial w_i} &= \\frac{\\partial (E^n_{\\text{train}} + \\beta_{L_2} 0.5 E^n_{L_2}) }{\\partial w_i} \n",
+ " = \\left( \\frac{\\partial E^n_{\\text{train}}}{\\partial w_i} + \\beta_{L_2} 0.5 \\frac{\\partial\n",
+ " E^n_{L_2}}{\\partial w_i} \\right) \n",
" = \\left( \\frac{\\partial E^n_{\\text{train}}}{\\partial w_i} + \\beta_{L_2} w_i \\right)\n",
"\\end{align*}\n",
"$\n",
@@ -101,7 +101,9 @@
"\\end{align*}\n",
"$\n",
"\n",
- "where $\\eta$ is learning rate.\n",
+ "where $\\eta$ is learning rate. \n",
+ "\n",
+ "Exercise 1 gives some more implementational suggestions on how to incorporate this technique into the lab code, the cost related prior contributions (equation (1)) are computed in mlp.optimisers.Optimiser.compute_prior_costs() and your job is to add the relevant optimisation related code when computing the gradients w.r.t parameters. \n",
"\n",
"## $L_{p=1}$ (Sparsity)\n",
"\n",
@@ -283,7 +285,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
- "version": "2.7.10"
+ "version": "2.7.9"
}
},
"nbformat": 4,
diff --git a/mlp/layers.py b/mlp/layers.py
index e7590f8..d548c9c 100644
--- a/mlp/layers.py
+++ b/mlp/layers.py
@@ -290,7 +290,7 @@ class Sigmoid(Linear):
#'a' get very negative. We limit both tails, however only
#negative values may lead to numerical issues -- exp(-a)
#clip() function does the following operation faster:
- # a[a < -30.] = 30,
+ # a[a < -30.] = -30,
# a[a > 30.] = 30.
numpy.clip(a, -30.0, 30.0, out=a)
h = 1.0/(1 + numpy.exp(-a))
diff --git a/mlp/optimisers.py b/mlp/optimisers.py
index 4b70f0c..77b4db6 100644
--- a/mlp/optimisers.py
+++ b/mlp/optimisers.py
@@ -20,7 +20,7 @@ class Optimiser(object):
def train(self, model, train_iter, valid_iter=None):
raise NotImplementedError()
- def validate(self, model, valid_iterator):
+ def validate(self, model, valid_iterator, l1_weight=0, l2_weight=0):
assert isinstance(model, MLP), (
"Expected model to be a subclass of 'mlp.layers.MLP'"
" class but got %s " % type(model)
@@ -40,7 +40,9 @@ class Optimiser(object):
acc = numpy.mean(acc_list)
nll = numpy.mean(nll_list)
- return nll, acc
+ prior_costs = Optimiser.compute_prior_costs(model, l1_weight, l2_weight)
+
+ return nll + sum(prior_costs), acc
@staticmethod
def classification_accuracy(y, t):
@@ -56,6 +58,28 @@ class Optimiser(object):
rval = numpy.equal(y_idx, t_idx)
return rval
+ @staticmethod
+ def compute_prior_costs(model, l1_weight, l2_weight):
+ """
+ Computes the cost contributions coming from the regularisation penalties
+ that depend only on the parameters (here the L1 and L2 terms)
+ """
+ assert isinstance(model, MLP), (
+ "Expected model to be a subclass of 'mlp.layers.MLP'"
+ " class but got %s " % type(model)
+ )
+
+ l1_cost, l2_cost = 0, 0
+ for i in xrange(0, len(model.layers)):
+ params = model.layers[i].get_params()
+ for param in params:
+ if l2_weight > 0:
+ l2_cost += 0.5 * l2_weight * numpy.sum(param**2)
+ if l1_weight > 0:
+ l1_cost += l1_weight * numpy.sum(numpy.abs(param))
+
+ return l1_cost, l2_cost
+
class SGDOptimiser(Optimiser):
def __init__(self, lr_scheduler,
@@ -117,7 +141,11 @@ class SGDOptimiser(Optimiser):
nll_list.append(cost)
acc_list.append(numpy.mean(self.classification_accuracy(y, t)))
- return numpy.mean(nll_list), numpy.mean(acc_list)
+ # compute the prior penalties' contribution (parameter-dependent only)
+ prior_costs = Optimiser.compute_prior_costs(model, self.l1_weight, self.l2_weight)
+ training_cost = numpy.mean(nll_list) + sum(prior_costs)
+
+ return training_cost, numpy.mean(acc_list)
def train(self, model, train_iterator, valid_iterator=None):
@@ -127,14 +155,14 @@ class SGDOptimiser(Optimiser):
# do the initial validation
train_iterator.reset()
- tr_nll, tr_acc = self.validate(model, train_iterator)
+ tr_nll, tr_acc = self.validate(model, train_iterator, self.l1_weight, self.l2_weight)
logger.info('Epoch %i: Training cost (%s) for initial model is %.3f. Accuracy is %.2f%%'
% (self.lr_scheduler.epoch, cost_name, tr_nll, tr_acc * 100.))
tr_stats.append((tr_nll, tr_acc))
if valid_iterator is not None:
valid_iterator.reset()
- valid_nll, valid_acc = self.validate(model, valid_iterator)
+ valid_nll, valid_acc = self.validate(model, valid_iterator, self.l1_weight, self.l2_weight)
logger.info('Epoch %i: Validation cost (%s) for initial model is %.3f. Accuracy is %.2f%%'
% (self.lr_scheduler.epoch, cost_name, valid_nll, valid_acc * 100.))
valid_stats.append((valid_nll, valid_acc))
@@ -155,7 +183,8 @@ class SGDOptimiser(Optimiser):
vstart = time.clock()
if valid_iterator is not None:
valid_iterator.reset()
- valid_nll, valid_acc = self.validate(model, valid_iterator)
+ valid_nll, valid_acc = self.validate(model, valid_iterator,
+ self.l1_weight, self.l2_weight)
logger.info('Epoch %i: Validation cost (%s) is %.3f. Accuracy is %.2f%%'
% (self.lr_scheduler.epoch + 1, cost_name, valid_nll, valid_acc * 100.))
self.lr_scheduler.get_next_rate(valid_acc)
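For reference, the penalty arithmetic performed by compute_prior_costs() and the way it enters the reported cost can be checked in isolation; the toy weight matrix and bias below are assumptions used only to keep the snippet self-contained:

    import numpy

    # toy parameters for a single layer
    W = numpy.array([[0.5, -1.0], [2.0, 0.0]])
    b = numpy.array([0.1, -0.1])
    l1_weight, l2_weight = 0.001, 0.01

    # same arithmetic as Optimiser.compute_prior_costs() above
    l2_cost = 0.5 * l2_weight * (numpy.sum(W**2) + numpy.sum(b**2))
    l1_cost = l1_weight * (numpy.sum(numpy.abs(W)) + numpy.sum(numpy.abs(b)))

    # validate()/train() then report the data cost plus these penalties:
    # cost = numpy.mean(nll_list) + l1_cost + l2_cost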