diff --git a/04_Regularisation.ipynb b/04_Regularisation.ipynb index 6ceff5d..24f2349 100644 --- a/04_Regularisation.ipynb +++ b/04_Regularisation.ipynb @@ -86,9 +86,9 @@ "Hence, the gradient of the cost w.r.t parameter $w_i$ is given as follows:\n", "\n", "(5) $\n", - "\\begin{align*}\\frac{\\partial E^n}{\\partial w_i} &= \\frac{\\partial (E^n_{\\text{train}} + \\beta_{L_2} \\frac{1}[2} E^n_{L_2}) }{\\partial w_i} \n", - " = \\left( \\frac{\\partial E^n_{\\text{train}}}{\\partial w_i} + \\beta_{L_2} \\frac{\\partial\n", - " \\frac{1}{2}E^n_{L_2}}{\\partial w_i} \\right) \n", + "\\begin{align*}\\frac{\\partial E^n}{\\partial w_i} &= \\frac{\\partial (E^n_{\\text{train}} + \\beta_{L_2} 0.5 E^n_{L_2}) }{\\partial w_i} \n", + " = \\left( \\frac{\\partial E^n_{\\text{train}}}{\\partial w_i} + \\beta_{L_2} 0.5 \\frac{\\partial\n", + " E^n_{L_2}}{\\partial w_i} \\right) \n", " = \\left( \\frac{\\partial E^n_{\\text{train}}}{\\partial w_i} + \\beta_{L_2} w_i \\right)\n", "\\end{align*}\n", "$\n", @@ -101,7 +101,9 @@ "\\end{align*}\n", "$\n", "\n", - "where $\\eta$ is learning rate.\n", + "where $\\eta$ is the learning rate. \n", + "\n", + "Exercise 1 gives some more implementation suggestions on how to incorporate this technique into the lab code. The cost-related prior contributions (equation (1)) are computed in `mlp.optimisers.Optimiser.compute_prior_costs()`; your job is to add the relevant optimisation-related code when computing the gradients w.r.t. the parameters. \n", "\n", "## $L_{p=1}$ (Sparsity)\n", "\n", @@ -283,7 +285,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", - "version": "2.7.10" + "version": "2.7.9" } }, "nbformat": 4, diff --git a/mlp/layers.py b/mlp/layers.py index e7590f8..d548c9c 100644 --- a/mlp/layers.py +++ b/mlp/layers.py @@ -290,7 +290,7 @@ class Sigmoid(Linear): #'a' get very negative. 
We limit both tails, however only #negative values may lead to numerical issues -- exp(-a) #clip() function does the following operation faster: - # a[a < -30.] = 30, + # a[a < -30.] = -30, # a[a > 30.] = 30. numpy.clip(a, -30.0, 30.0, out=a) h = 1.0/(1 + numpy.exp(-a)) diff --git a/mlp/optimisers.py b/mlp/optimisers.py index 4b70f0c..77b4db6 100644 --- a/mlp/optimisers.py +++ b/mlp/optimisers.py @@ -20,7 +20,7 @@ class Optimiser(object): def train(self, model, train_iter, valid_iter=None): raise NotImplementedError() - def validate(self, model, valid_iterator): + def validate(self, model, valid_iterator, l1_weight=0, l2_weight=0): assert isinstance(model, MLP), ( "Expected model to be a subclass of 'mlp.layers.MLP'" " class but got %s " % type(model) @@ -40,7 +40,9 @@ class Optimiser(object): acc = numpy.mean(acc_list) nll = numpy.mean(nll_list) - return nll, acc + prior_costs = Optimiser.compute_prior_costs(model, l1_weight, l2_weight) + + return nll + sum(prior_costs), acc @staticmethod def classification_accuracy(y, t): @@ -56,6 +58,28 @@ class Optimiser(object): rval = numpy.equal(y_idx, t_idx) return rval + @staticmethod + def compute_prior_costs(model, l1_weight, l2_weight): + """ + Computes the (l1_cost, l2_cost) regularisation penalties, which depend on the + model parameters only: l1_weight * sum(|w|) and 0.5 * l2_weight * sum(w**2) + """ + assert isinstance(model, MLP), ( + "Expected model to be a subclass of 'mlp.layers.MLP'" + " class but got %s " % type(model) + ) + + l1_cost, l2_cost = 0, 0 + for i in xrange(0, len(model.layers)): + params = model.layers[i].get_params() + for param in params: + if l2_weight > 0: + l2_cost += 0.5 * l2_weight * numpy.sum(param**2) + if l1_weight > 0: + l1_cost += l1_weight * numpy.sum(numpy.abs(param)) + + return l1_cost, l2_cost + class SGDOptimiser(Optimiser): def __init__(self, lr_scheduler, @@ -117,7 +141,11 @@ class SGDOptimiser(Optimiser): nll_list.append(cost) acc_list.append(numpy.mean(self.classification_accuracy(y, t))) - return numpy.mean(nll_list), 
numpy.mean(acc_list) + #compute the prior penalties contribution (parameter dependent only) + prior_costs = Optimiser.compute_prior_costs(model, self.l1_weight, self.l2_weight) + training_cost = numpy.mean(nll_list) + sum(prior_costs) + + return training_cost, numpy.mean(acc_list) def train(self, model, train_iterator, valid_iterator=None): @@ -127,14 +155,14 @@ class SGDOptimiser(Optimiser): # do the initial validation train_iterator.reset() - tr_nll, tr_acc = self.validate(model, train_iterator) + tr_nll, tr_acc = self.validate(model, train_iterator, self.l1_weight, self.l2_weight) logger.info('Epoch %i: Training cost (%s) for initial model is %.3f. Accuracy is %.2f%%' % (self.lr_scheduler.epoch, cost_name, tr_nll, tr_acc * 100.)) tr_stats.append((tr_nll, tr_acc)) if valid_iterator is not None: valid_iterator.reset() - valid_nll, valid_acc = self.validate(model, valid_iterator) + valid_nll, valid_acc = self.validate(model, valid_iterator, self.l1_weight, self.l2_weight) logger.info('Epoch %i: Validation cost (%s) for initial model is %.3f. Accuracy is %.2f%%' % (self.lr_scheduler.epoch, cost_name, valid_nll, valid_acc * 100.)) valid_stats.append((valid_nll, valid_acc)) @@ -155,7 +183,8 @@ class SGDOptimiser(Optimiser): vstart = time.clock() if valid_iterator is not None: valid_iterator.reset() - valid_nll, valid_acc = self.validate(model, valid_iterator) + valid_nll, valid_acc = self.validate(model, valid_iterator, + self.l1_weight, self.l2_weight) logger.info('Epoch %i: Validation cost (%s) is %.3f. Accuracy is %.2f%%' % (self.lr_scheduler.epoch + 1, cost_name, valid_nll, valid_acc * 100.)) self.lr_scheduler.get_next_rate(valid_acc)