From 2b516f2f97b64b1232751c1e732fae86bbe08b57 Mon Sep 17 00:00:00 2001
From: pswietojanski
Date: Wed, 28 Oct 2015 16:59:11 +0000
Subject: [PATCH 1/4] lab4 work
---
03_MLP_Coursework1.ipynb | 67 +++++++++++++++--
04_Regularisation.ipynb | 156 +++++++++++++++++++++++++++++++++++++++
mlp/layers.py | 104 ++++++++++++++++++++++++--
mlp/optimisers.py | 1 +
4 files changed, 316 insertions(+), 12 deletions(-)
create mode 100644 04_Regularisation.ipynb
diff --git a/03_MLP_Coursework1.ipynb b/03_MLP_Coursework1.ipynb
index 0f786ae..021b761 100644
--- a/03_MLP_Coursework1.ipynb
+++ b/03_MLP_Coursework1.ipynb
@@ -78,7 +78,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
"metadata": {
"collapsed": false
},
@@ -142,11 +142,43 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
- "collapsed": true
+ "collapsed": false
},
"outputs": [],
"source": [
- "%load -s Sigmoid mlp/layers.py\n"
+ "# %load -s Sigmoid mlp/layers.py\n",
+ "class Sigmoid(Linear):\n",
+ " def __init__(self, idim, odim,\n",
+ " rng=None,\n",
+ " irange=0.1):\n",
+ "\n",
+ " super(Sigmoid, self).__init__(idim, odim, rng, irange)\n",
+ " \n",
+ " def fprop(self, inputs):\n",
+ " a = super(Sigmoid, self).fprop(inputs)\n",
+ " h = 1.0/(1 + numpy.exp(-a))\n",
+ " return h\n",
+ " \n",
+ " def bprop(self, h, igrads):\n",
+ " dsigm = h*(1.0 - h)\n",
+ " deltas = igrads*dsigm\n",
+ " ___, ograds = super(Sigmoid, self).bprop(h=None, igrads=deltas)\n",
+ " return deltas, ograds\n",
+ "\n",
+ " def cost_bprop(self, h, igrads, cost):\n",
+ " if cost is None or cost.get_name() == 'bce':\n",
+ " return super(Sigmoid, self).bprop(h=h, igrads=igrads)\n",
+ " else:\n",
+ " raise NotImplementedError('Sigmoid.bprop_cost method not implemented '\n",
+ " 'for the %s cost' % cost.get_name())\n",
+ "\n",
+ " def pgrads(self, inputs, deltas):\n",
+ " \"Return list of gradients w.r.t parameters\"\n",
+ " gparams = super(Sigmoid, self).pgrads(inputs, deltas)\n",
+ " return gparams\n",
+ "\n",
+ " def get_name(self):\n",
+ " return 'sigmoid'\n"
]
},
{
@@ -162,11 +194,26 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 3,
"metadata": {
"collapsed": false
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "1.0\n",
+ "0.0\n",
+ "0.0744177068753\n",
+ "[ 4.571e-05 1.697e-03 9.877e-01 6.631e-04 1.194e-04 8.880e-04\n",
+ " 1.977e-04 8.671e-03]\n",
+ "[ 4.571e-05 1.697e-03 9.877e-01 6.631e-04 1.194e-04 8.880e-04\n",
+ " 1.977e-04 -9.913e-01]\n",
+ "[-0.089 0.03 0.079 0.011 0.017 0.027]\n"
+ ]
+ }
+ ],
"source": [
"from mlp.layers import Softmax\n",
"\n",
@@ -204,7 +251,15 @@
},
"outputs": [],
"source": [
- "%load -s Softmax mlp/layers.py"
+ "%load -s Softmax mlp/layers.py\n",
+ "1.0\n",
+ "-1.11022302463e-16\n",
+ "0.0744177068753\n",
+ "[ 4.571e-05 1.697e-03 9.877e-01 6.631e-04 1.194e-04 8.880e-04\n",
+ " 1.977e-04 8.671e-03]\n",
+ "[ 4.571e-05 1.697e-03 9.877e-01 6.631e-04 1.194e-04 8.880e-04\n",
+ " 1.977e-04 -9.913e-01]\n",
+ "[-0.089 0.03 0.079 0.011 0.017 0.027]"
]
},
{
diff --git a/04_Regularisation.ipynb b/04_Regularisation.ipynb
new file mode 100644
index 0000000..bac175e
--- /dev/null
+++ b/04_Regularisation.ipynb
@@ -0,0 +1,156 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Introduction\n",
+ "\n",
+ "This tutorial focuses on implementation of three reqularisaion techniques, two of them are norm based approaches - L2 and L1 as well as technique called droput, that.\n",
+ "\n",
+ "\n",
+ "## Virtual environments\n",
+ "\n",
+ "Before you proceed onwards, remember to activate your virtual environment:\n",
+ " * If you were in last week's Tuesday or Wednesday group type `activate_mlp` or `source ~/mlpractical/venv/bin/activate`\n",
+ " * If you were in the Monday group:\n",
+ " + and if you have chosen the **comfy** way type: `workon mlpractical`\n",
+ " + and if you have chosen the **generic** way, activate your virtual environment using `source`, specifying the path to the activate script (you need to locate it yourself: there were no general recommendations w.r.t. directory structure and people have installed it in different places, usually somewhere in their home directories. If you cannot easily find it, use something like: `find . -iname activate`).\n",
+ "\n",
+ "## Syncing the git repository\n",
+ "\n",
+ "Look here for more details. But in short, we recommend to create a separate branch for this lab, as follows:\n",
+ "\n",
+ "1. Enter the mlpractical directory `cd ~/mlpractical/repo-mlp`\n",
+ "2. List the branches and check which is currently active by typing: `git branch`\n",
+ "3. If you have followed recommendations, you should be in the `coursework1` branch, please commit your local changed to the repo index by typing:\n",
+ "```\n",
+ "git commit -am \"stuff I did for the coursework\"\n",
+ "```\n",
+ "4. Now you can switch to `master` branch by typing: \n",
+ "```\n",
+ "git checkout master\n",
+ " ```\n",
+ "5. Update the repository (this assumes `master` does not have any conflicts; if there are some, have a look here):\n",
+ "```\n",
+ "git pull\n",
+ "```\n",
+ "6. And now, create the new branch & switch to it by typing:\n",
+ "```\n",
+ "git checkout -b lab4\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Regularisation\n",
+ "\n",
+ "Today, we shall build models which can have an arbitrary number of hidden layers. Please have a look at the diagram below, and the corresponding computations (which have an *exact* matrix form as expected by numpy, and row-wise orientation; note that $\\circ$ denotes an element-wise product). In the diagram, we briefly describe how each comptation relates to the code we have provided.\n",
+ "\n",
+ "(1) $E = \\log(\\mathbf{y}|\\mathbf{x}; \\theta) + \\alpha J_{L2}(\\theta) + \\beta J_{L1}(\\theta)$\n",
+ "\n",
+ "## L2 Weight Decay\n",
+ "\n",
+ "(1) $J_{L2}(\\theta) = \\frac{1}{2}||\\theta||^2$\n",
+ "\n",
+ "(1) $\\frac{\\partial J_{L2}}{\\partial\\theta} = \\frac{1}{2}||\\theta||^2$\n",
+ "\n",
+ "## L1 Sparsity \n",
+ "\n",
+ "## Dropout\n",
+ "\n",
+ "Dropout, for a given layer's output $\\mathbf{h}^i \\in \\mathbb{R}^{BxH^l}$ (where $B$ is batch size and $H^l$ is the $l$-th layer output dimensionality) implements the following transformation:\n",
+ "\n",
+ "(1) $\\mathbf{\\hat h}^l = \\mathbf{d}^l\\circ\\mathbf{h}^l$\n",
+ "\n",
+ "where $\\circ$ denotes an elementwise product and $\\mathbf{d}^l \\in \\{0,1\\}^{BxH^i}$ is a matrix in which $d^l_{ij}$ element is sampled from the Bernoulli distribution:\n",
+ "\n",
+ "(2) $d^l_{ij} \\sim \\mbox{Bernoulli}(p^l_d)$\n",
+ "\n",
+ "with $0 30.] = 30.
+ numpy.clip(a, -30.0, 30.0, out=a)
+ h = 1.0/(1 + numpy.exp(-a))
+ return h
+
+ def bprop(self, h, igrads):
+ dsigm = h * (1.0 - h)
+ deltas = igrads * dsigm
+ ___, ograds = super(Sigmoid, self).bprop(h=None, igrads=deltas)
+ return deltas, ograds
+
+ def cost_bprop(self, h, igrads, cost):
+ if cost is None or cost.get_name() == 'bce':
+ return super(Sigmoid, self).bprop(h=h, igrads=igrads)
+ else:
+ raise NotImplementedError('Sigmoid.bprop_cost method not implemented '
+ 'for the %s cost' % cost.get_name())
+
+ def get_name(self):
+ return 'sigmoid'
+
+
+class Softmax(Linear):
+
+ def __init__(self,idim, odim,
+ rng=None,
+ irange=0.1):
+
+ super(Softmax, self).__init__(idim,
+ odim,
+ rng=rng,
+ irange=irange)
+
+ def fprop(self, inputs):
+
+ # compute the linear outputs
+ a = super(Softmax, self).fprop(inputs)
+ # apply numerical stabilisation by subtracting max
+ # from each row (not required for the coursework)
+ # then compute exponent
+ assert a.ndim in [1, 2], (
+ "Expected the linear activation in Softmax layer to be either "
+ "vector or matrix, got %ith dimensional tensor" % a.ndim
+ )
+ axis = a.ndim - 1
+ exp_a = numpy.exp(a - numpy.max(a, axis=axis, keepdims=True))
+ # finally, normalise by the sum within each example
+ y = exp_a/numpy.sum(exp_a, axis=axis, keepdims=True)
+
+ return y
+
+ def bprop(self, h, igrads):
+ raise NotImplementedError()
+
+ def bprop_cost(self, h, igrads, cost):
+
+ if cost is None or cost.get_name() == 'ce':
+ return super(Softmax, self).bprop(h=h, igrads=igrads)
+ else:
+ raise NotImplementedError('Softmax.bprop_cost method not implemented '
+ 'for %s cost' % cost.get_name())
+
+ def get_name(self):
+ return 'softmax'
-
-
-
-
\ No newline at end of file
diff --git a/mlp/optimisers.py b/mlp/optimisers.py
index 9d4b947..f03c3cc 100644
--- a/mlp/optimisers.py
+++ b/mlp/optimisers.py
@@ -116,6 +116,7 @@ class SGDOptimiser(Optimiser):
tr_stats, valid_stats = [], []
# do the initial validation
+ train_iterator.reset()
tr_nll, tr_acc = self.validate(model, train_iterator)
logger.info('Epoch %i: Training cost (%s) for random model is %.3f. Accuracy is %.2f%%'
% (self.lr_scheduler.epoch, cost_name, tr_nll, tr_acc * 100.))
From 18b36babde4d52e25204089025f4ffcebfea8e79 Mon Sep 17 00:00:00 2001
From: pswietojanski
Date: Sun, 1 Nov 2015 15:50:26 +0000
Subject: [PATCH 2/4] lab4
---
04_Regularisation.ipynb | 180 +++++++++++++++++++++++++++++++++-------
mlp/dataset.py | 26 +++++-
mlp/layers.py | 31 +++----
mlp/optimisers.py | 22 +++--
4 files changed, 201 insertions(+), 58 deletions(-)
diff --git a/04_Regularisation.ipynb b/04_Regularisation.ipynb
index bac175e..d1e4c05 100644
--- a/04_Regularisation.ipynb
+++ b/04_Regularisation.ipynb
@@ -6,7 +6,7 @@
"source": [
"# Introduction\n",
"\n",
- "This tutorial focuses on implementation of three reqularisaion techniques, two of them are norm based approaches - L2 and L1 as well as technique called droput, that.\n",
+ "This tutorial focuses on the implementation of three regularisation techniques: two of them are norm-based approaches which are added to the optimised objective, and the third technique, called *dropout*, is a form of noise injection by random corruption of the information carried by hidden units during training.\n",
"\n",
"\n",
"## Virtual environments\n",
@@ -23,9 +23,9 @@
"\n",
"1. Enter the mlpractical directory `cd ~/mlpractical/repo-mlp`\n",
"2. List the branches and check which is currently active by typing: `git branch`\n",
- "3. If you have followed recommendations, you should be in the `coursework1` branch, please commit your local changed to the repo index by typing:\n",
+ "3. If you have followed our recommendations, you should be in the `coursework1` branch; please commit your local changes to the repo index by typing:\n",
"```\n",
- "git commit -am \"stuff I did for the coursework\"\n",
+ "git commit -am \"finished coursework\"\n",
"```\n",
"4. Now you can switch to `master` branch by typing: \n",
"```\n",
@@ -47,61 +47,133 @@
"source": [
"# Regularisation\n",
"\n",
- "Today, we shall build models which can have an arbitrary number of hidden layers. Please have a look at the diagram below, and the corresponding computations (which have an *exact* matrix form as expected by numpy, and row-wise orientation; note that $\\circ$ denotes an element-wise product). In the diagram, we briefly describe how each comptation relates to the code we have provided.\n",
+ "Regularisation adds a *complexity term* to the cost function. Its purpose is to put a prior on the model's parameters. The most common prior is perhaps the one which assumes that smoother solutions (those which are not able to fit the training data too well) are better, as they are more likely to generalise to unseen data. \n",
"\n",
- "(1) $E = \\log(\\mathbf{y}|\\mathbf{x}; \\theta) + \\alpha J_{L2}(\\theta) + \\beta J_{L1}(\\theta)$\n",
+ "A way to incorporate such a prior in the model is to add a term that penalises certain configurations of the parameters -- either one that prevents them from growing too large ($L_2$) or one that prefers solutions which can be modelled with fewer parameters ($L_1$), hence encouraging some parameters to become 0. One can, of course, combine many such priors when optimising the model; however, in the lab we shall use $L_1$ and/or $L_2$ priors.\n",
"\n",
- "## L2 Weight Decay\n",
+ "They can be easily incorporated into the training objective by adding some additive terms, as follows:\n",
"\n",
- "(1) $J_{L2}(\\theta) = \\frac{1}{2}||\\theta||^2$\n",
+ "(1) $\n",
+ " \\begin{align*}\n",
+ " E^n &= \\underbrace{E^n_{\\text{train}}}_{\\text{data term}} + \n",
+ " \\underbrace{\\beta_{L_1} E^n_{L_1}}_{\\text{prior term}} + \\underbrace{\\beta_{L_2} E^n_{L_2}}_{\\text{prior term}}\n",
+ "\\end{align*}\n",
+ "$\n",
"\n",
- "(1) $\\frac{\\partial J_{L2}}{\\partial\\theta} = \\frac{1}{2}||\\theta||^2$\n",
+ "where $ E^n_{\\text{train}} = - \\sum_{k=1}^K t^n_k \\ln y^n_k $, $\\beta_{L_1}$ and $\\beta_{L_2}$ are non-negative constants specified a priori (hyper-parameters), and $E^n_{L_1}$ and $E^n_{L_2}$ are norm metrics specifying certain properties of the parameters:\n",
"\n",
- "## L1 Sparsity \n",
+ "(2) $\n",
+ " \\begin{align*}\n",
+ " E^n_{L_p}(\\mathbf{W}) = \\left ( \\sum_{i,j \\in \\mathbf{W}} |w_{i,j}|^p \\right )^{\\frac{1}{p}}\n",
+ "\\end{align*}\n",
+ "$\n",
+ "\n",
+ "where $p$ denotes the norm-order (for regularisation either 1 or 2). (TODO: explain here why we usualy skip square root for p=2)\n",
+ "\n",
+ "## $L_{p=2}$ (Weight Decay)\n",
+ "\n",
+ "(3) $\n",
+ " \\begin{align*}\n",
+ " E^n &= \\underbrace{E^n_{\\text{train}}}_{\\text{data term}} + \n",
+ " \\underbrace{\\beta E^n_{L_2}}_{\\text{prior term}} = E^n_{\\text{train}} + \\beta_{L_2} \\frac{1}{2}|\\mathbf{W}|^2\n",
+ "\\end{align*}\n",
+ "$\n",
+ "\n",
+ "(4) $\n",
+ "\\begin{align*}\\frac{\\partial E^n}{\\partial w_i} &= \\frac{\\partial (E^n_{\\text{train}} + \\beta_{L_2} E_{L_2}) }{\\partial w_i} \n",
+ " = \\left( \\frac{\\partial E^n_{\\text{train}}}{\\partial w_i} + \\beta_{L_2} \\frac{\\partial\n",
+ " E_{L_2}}{\\partial w_i} \\right) \n",
+ " = \\left( \\frac{\\partial E^n_{\\text{train}}}{\\partial w_i} + \\beta_{L_2} w_i \\right)\n",
+ "\\end{align*}\n",
+ "$\n",
+ "\n",
+ "(5) $\n",
+ "\\begin{align*}\n",
+ " \\Delta w_i &= -\\eta \\left( \\frac{\\partial E^n_{\\text{train}}}{\\partial w_i} + \\beta_{L_2} w_i \\right) \n",
+ "\\end{align*}\n",
+ "$\n",
+ "\n",
+ "where $\\eta$ is the learning rate.\n",
+ "\n",
+ "## $L_{p=1}$ (Sparsity)\n",
+ "\n",
+ "(6) $\n",
+ " \\begin{align*}\n",
+ " E^n &= \\underbrace{E^n_{\\text{train}}}_{\\text{data term}} + \n",
+ " \\underbrace{\\beta E^n_{L_1}}_{\\text{prior term}} \n",
+ " = E^n_{\\text{train}} + \\beta_{L_1} |\\mathbf{W}|\n",
+ "\\end{align*}\n",
+ "$\n",
+ "\n",
+ "(7) $\\begin{align*}\n",
+ " \\frac{\\partial E^n}{\\partial w_i} = \\frac{\\partial E^n_{\\text{train}}}{\\partial w_i} + \\beta_{L_1} \\frac{\\partial E_{L_1}}{\\partial w_i} = \\frac{\\partial E^n_{\\text{train}}}{\\partial w_i} + \\beta_{L_1} \\mbox{sgn}(w_i)\n",
+ "\\end{align*}\n",
+ "$\n",
+ "\n",
+ "(8) $\\begin{align*}\n",
+ " \\Delta w_i &= -\\eta \\left( \\frac{\\partial E^n_{\\text{train}}}{\\partial w_i} + \\beta_{L_1} \\mbox{sgn}(w_i) \\right) \n",
+ "\\end{align*}$\n",
+ "\n",
+ "Where $\\mbox{sgn}(w_i)$ is the sign of $w_i$: $\\mbox{sgn}(w_i) = 1$ if $w_i>0$ and $\\mbox{sgn}(w_i) = -1$ if $w_i<0$\n",
+ "\n",
+ "One can also apply those penalty terms for biases, however, this is usually not necessary as biases have secondary impact on smoothnes of the given solution.\n",
"\n",
"## Dropout\n",
"\n",
"Dropout, for a given layer's output $\\mathbf{h}^i \\in \\mathbb{R}^{BxH^l}$ (where $B$ is batch size and $H^l$ is the $l$-th layer output dimensionality) implements the following transformation:\n",
"\n",
- "(1) $\\mathbf{\\hat h}^l = \\mathbf{d}^l\\circ\\mathbf{h}^l$\n",
+ "(9) $\\mathbf{\\hat h}^l = \\mathbf{d}^l\\circ\\mathbf{h}^l$\n",
"\n",
"where $\\circ$ denotes an elementwise product and $\\mathbf{d}^l \\in \\{0,1\\}^{BxH^i}$ is a matrix in which $d^l_{ij}$ element is sampled from the Bernoulli distribution:\n",
"\n",
- "(2) $d^l_{ij} \\sim \\mbox{Bernoulli}(p^l_d)$\n",
+ "(10) $d^l_{ij} \\sim \\mbox{Bernoulli}(p^l_d)$\n",
"\n",
- "with $0 0:
+ return self.rng.permutation(self._rand_idx)
+ else:
+ #the max_to_present secures that random examples
+ #are returned from the same pool each time (in case
+ #the total num of examples was limited by max_num_batches)
+ max_to_present = self.batch_size*self._max_num_batches \
+ if self._max_num_batches > 0 else self.x.shape[0]
+ return self.rng.permutation(numpy.arange(0, self.x.shape[0]))[0:max_to_present]
def next(self):
@@ -152,6 +167,9 @@ class MNISTDataProvider(DataProvider):
def num_examples(self):
return self.x.shape[0]
+ def num_examples_presented(self):
+ return self._curr_idx + 1
+
def __to_one_of_k(self, y):
rval = numpy.zeros((y.shape[0], self.num_classes), dtype=numpy.float32)
for i in xrange(y.shape[0]):
diff --git a/mlp/layers.py b/mlp/layers.py
index b03e2a3..e6a379b 100644
--- a/mlp/layers.py
+++ b/mlp/layers.py
@@ -46,21 +46,6 @@ class MLP(object):
self.activations[i+1] = self.layers[i].fprop(self.activations[i])
return self.activations[-1]
- def fprop_droput(self, x, dropout_probabilites=None):
- """
-
- :param inputs: mini-batch of data-points x
- :return: y (top layer activation) which is an estimate of y given x
- """
-
- if len(self.activations) != len(self.layers) + 1:
- self.activations = [None]*(len(self.layers) + 1)
-
- self.activations[0] = x
- for i in xrange(0, len(self.layers)):
- self.activations[i+1] = self.layers[i].fprop(self.activations[i])
- return self.activations[-1]
-
def bprop(self, cost_grad):
"""
:param cost_grad: matrix -- grad of the cost w.r.t y
@@ -255,7 +240,7 @@ class Linear(Layer):
raise NotImplementedError('Linear.bprop_cost method not implemented '
'for the %s cost' % cost.get_name())
- def pgrads(self, inputs, deltas, **kwargs):
+ def pgrads(self, inputs, deltas, l1_weight=0, l2_weight=0):
"""
Return gradients w.r.t parameters
@@ -272,9 +257,18 @@ class Linear(Layer):
1) da^i/dW^i and 2) da^i/db^i
since W and b are only layer's parameters
"""
+ l2_W_penalty, l2_b_penalty = 0, 0
+ if l2_weight > 0:
+ l2_W_penalty = l2_weight*self.W
+ l2_b_penalty = l2_weight*self.b
- grad_W = numpy.dot(inputs.T, deltas)
- grad_b = numpy.sum(deltas, axis=0)
+ l1_W_penalty, l1_b_penalty = 0, 0
+ if l1_weight > 0:
+ l1_W_penalty = l1_weight*numpy.sign(self.W)
+ l1_b_penalty = l1_weight*numpy.sign(self.b)
+
+ grad_W = numpy.dot(inputs.T, deltas) + l2_W_penalty + l1_W_penalty
+ grad_b = numpy.sum(deltas, axis=0) + l2_b_penalty + l1_b_penalty
return [grad_W, grad_b]
@@ -370,4 +364,3 @@ class Softmax(Linear):
def get_name(self):
return 'softmax'
-
diff --git a/mlp/optimisers.py b/mlp/optimisers.py
index f03c3cc..4b70f0c 100644
--- a/mlp/optimisers.py
+++ b/mlp/optimisers.py
@@ -58,7 +58,11 @@ class Optimiser(object):
class SGDOptimiser(Optimiser):
- def __init__(self, lr_scheduler):
+ def __init__(self, lr_scheduler,
+ dp_scheduler=None,
+ l1_weight=0.0,
+ l2_weight=0.0):
+
super(SGDOptimiser, self).__init__()
assert isinstance(lr_scheduler, LearningRateScheduler), (
@@ -67,6 +71,9 @@ class SGDOptimiser(Optimiser):
)
self.lr_scheduler = lr_scheduler
+ self.dp_scheduler = dp_scheduler
+ self.l1_weight = l1_weight
+ self.l2_weight = l2_weight
def train_epoch(self, model, train_iterator, learning_rate):
@@ -97,7 +104,10 @@ class SGDOptimiser(Optimiser):
for i in xrange(0, len(model.layers)):
params = model.layers[i].get_params()
- grads = model.layers[i].pgrads(model.activations[i], model.deltas[i + 1])
+ grads = model.layers[i].pgrads(inputs=model.activations[i],
+ deltas=model.deltas[i + 1],
+ l1_weight=self.l1_weight,
+ l2_weight=self.l2_weight)
uparams = []
for param, grad in zip(params, grads):
param = param - effective_learning_rate * grad
@@ -118,14 +128,14 @@ class SGDOptimiser(Optimiser):
# do the initial validation
train_iterator.reset()
tr_nll, tr_acc = self.validate(model, train_iterator)
- logger.info('Epoch %i: Training cost (%s) for random model is %.3f. Accuracy is %.2f%%'
+ logger.info('Epoch %i: Training cost (%s) for initial model is %.3f. Accuracy is %.2f%%'
% (self.lr_scheduler.epoch, cost_name, tr_nll, tr_acc * 100.))
tr_stats.append((tr_nll, tr_acc))
if valid_iterator is not None:
valid_iterator.reset()
valid_nll, valid_acc = self.validate(model, valid_iterator)
- logger.info('Epoch %i: Validation cost (%s) for random model is %.3f. Accuracy is %.2f%%'
+ logger.info('Epoch %i: Validation cost (%s) for initial model is %.3f. Accuracy is %.2f%%'
% (self.lr_scheduler.epoch, cost_name, valid_nll, valid_acc * 100.))
valid_stats.append((valid_nll, valid_acc))
@@ -154,8 +164,8 @@ class SGDOptimiser(Optimiser):
self.lr_scheduler.get_next_rate(None)
vstop = time.clock()
- train_speed = train_iterator.num_examples() / (tstop - tstart)
- valid_speed = valid_iterator.num_examples() / (vstop - vstart)
+ train_speed = train_iterator.num_examples_presented() / (tstop - tstart)
+ valid_speed = valid_iterator.num_examples_presented() / (vstop - vstart)
tot_time = vstop - tstart
#pps = presentations per second
logger.info("Epoch %i: Took %.0f seconds. Training speed %.0f pps. "
From 35490a68fc4587f1aa4139effd39c6ef711de84f Mon Sep 17 00:00:00 2001
From: pswietojanski
Date: Sun, 1 Nov 2015 16:49:35 +0000
Subject: [PATCH 3/4] Merge branches 'lab4_solved' and 'master'
From faaa6cb172530d8746a6c980de1d81a380e36cf5 Mon Sep 17 00:00:00 2001
From: pswietojanski
Date: Sun, 1 Nov 2015 19:24:35 +0000
Subject: [PATCH 4/4] lab 4
---
04_Regularisation.ipynb | 56 ++++++++++++++++++++++++++---------------
mlp/layers.py | 15 +++--------
2 files changed, 39 insertions(+), 32 deletions(-)
diff --git a/04_Regularisation.ipynb b/04_Regularisation.ipynb
index d1e4c05..f3e8f25 100644
--- a/04_Regularisation.ipynb
+++ b/04_Regularisation.ipynb
@@ -64,22 +64,31 @@
"\n",
"(2) $\n",
" \\begin{align*}\n",
- " E^n_{L_p}(\\mathbf{W}) = \\left ( \\sum_{i,j \\in \\mathbf{W}} |w_{i,j}|^p \\right )^{\\frac{1}{p}}\n",
+ " E^n_{L_p}(\\mathbf{W}) = ||\\mathbf{W}||_p = \\left ( \\sum_{i,j \\in \\mathbf{W}} |w_{i,j}|^p \\right )^{\\frac{1}{p}}\n",
"\\end{align*}\n",
"$\n",
"\n",
- "where $p$ denotes the norm-order (for regularisation either 1 or 2). (TODO: explain here why we usualy skip square root for p=2)\n",
+ "where $p$ denotes the norm-order (for regularisation either 1 or 2). Notice that, in practice, for computational purposes we will rather compute the squared $L_{p=2}$ norm, which omits the square root in (2), that is:\n",
+ "\n",
+ "(3)$ \\begin{align*}\n",
+ " E^n_{L_{p=2}}(\\mathbf{W}) = ||\\mathbf{W}||^2_2 = \\left ( \\left ( \\sum_{i,j \\in \\mathbf{W}} |w_{i,j}|^2 \\right )^{\\frac{1}{2}} \\right )^2 = \\sum_{i,j \\in \\mathbf{W}} |w_{i,j}|^2\n",
+ "\\end{align*}\n",
+ "$\n",
"\n",
"## $L_{p=2}$ (Weight Decay)\n",
"\n",
- "(3) $\n",
+ "Our cost with $L_{2}$ regulariser then becomes ($\\frac{1}{2}$ simplifies a derivative later):\n",
+ "\n",
+ "(4) $\n",
" \\begin{align*}\n",
" E^n &= \\underbrace{E^n_{\\text{train}}}_{\\text{data term}} + \n",
- " \\underbrace{\\beta E^n_{L_2}}_{\\text{prior term}} = E^n_{\\text{train}} + \\beta_{L_2} \\frac{1}{2}|\\mathbf{W}|^2\n",
+ " \\underbrace{\\beta_{L_2} \\frac{1}{2} E^n_{L_2}}_{\\text{prior term}}\n",
"\\end{align*}\n",
"$\n",
"\n",
- "(4) $\n",
+ "Hence, the gradient of the cost w.r.t parameter $w_i$ is given as follows:\n",
+ "\n",
+ "(5) $\n",
"\\begin{align*}\\frac{\\partial E^n}{\\partial w_i} &= \\frac{\\partial (E^n_{\\text{train}} + \\beta_{L_2} E_{L_2}) }{\\partial w_i} \n",
" = \\left( \\frac{\\partial E^n_{\\text{train}}}{\\partial w_i} + \\beta_{L_2} \\frac{\\partial\n",
" E_{L_2}}{\\partial w_i} \\right) \n",
@@ -87,7 +96,9 @@
"\\end{align*}\n",
"$\n",
"\n",
- "(5) $\n",
+ "And the actual update we apply to the $w_i$ parameter is:\n",
+ "\n",
+ "(6) $\n",
"\\begin{align*}\n",
" \\Delta w_i &= -\\eta \\left( \\frac{\\partial E^n_{\\text{train}}}{\\partial w_i} + \\beta_{L_2} w_i \\right) \n",
"\\end{align*}\n",
@@ -97,49 +108,54 @@
"\n",
"## $L_{p=1}$ (Sparsity)\n",
"\n",
- "(6) $\n",
+ "Our cost with $L_{1}$ regulariser then becomes:\n",
+ "\n",
+ "(7) $\n",
" \\begin{align*}\n",
" E^n &= \\underbrace{E^n_{\\text{train}}}_{\\text{data term}} + \n",
- " \\underbrace{\\beta E^n_{L_1}}_{\\text{prior term}} \n",
- " = E^n_{\\text{train}} + \\beta_{L_1} |\\mathbf{W}|\n",
+ " \\underbrace{\\beta_{L_1} E^n_{L_1}}_{\\text{prior term}} \n",
"\\end{align*}\n",
"$\n",
"\n",
- "(7) $\\begin{align*}\n",
+ "Hence, the gradient of the cost w.r.t parameter $w_i$ is given as follows:\n",
+ "\n",
+ "(8) $\\begin{align*}\n",
" \\frac{\\partial E^n}{\\partial w_i} = \\frac{\\partial E^n_{\\text{train}}}{\\partial w_i} + \\beta_{L_1} \\frac{\\partial E_{L_1}}{\\partial w_i} = \\frac{\\partial E^n_{\\text{train}}}{\\partial w_i} + \\beta_{L_1} \\mbox{sgn}(w_i)\n",
"\\end{align*}\n",
"$\n",
"\n",
- "(8) $\\begin{align*}\n",
+ "And the actual update we apply to the $w_i$ parameter is:\n",
+ "\n",
+ "(9) $\\begin{align*}\n",
" \\Delta w_i &= -\\eta \\left( \\frac{\\partial E^n_{\\text{train}}}{\\partial w_i} + \\beta_{L_1} \\mbox{sgn}(w_i) \\right) \n",
"\\end{align*}$\n",
"\n",
"Where $\\mbox{sgn}(w_i)$ is the sign of $w_i$: $\\mbox{sgn}(w_i) = 1$ if $w_i>0$ and $\\mbox{sgn}(w_i) = -1$ if $w_i<0$\n",
"\n",
- "One can also apply those penalty terms for biases, however, this is usually not necessary as biases have secondary impact on smoothnes of the given solution.\n",
+ "One can also easily apply those penalty terms to the biases; however, this is usually not necessary, as biases do not affect the smoothness of the solution (given data).\n",
"\n",
"## Dropout\n",
"\n",
"Dropout, for a given layer's output $\\mathbf{h}^i \\in \\mathbb{R}^{BxH^l}$ (where $B$ is batch size and $H^l$ is the $l$-th layer output dimensionality) implements the following transformation:\n",
"\n",
- "(9) $\\mathbf{\\hat h}^l = \\mathbf{d}^l\\circ\\mathbf{h}^l$\n",
+ "(10) $\\mathbf{\\hat h}^l = \\mathbf{d}^l\\circ\\mathbf{h}^l$\n",
"\n",
"where $\\circ$ denotes an elementwise product and $\\mathbf{d}^l \\in \\{0,1\\}^{BxH^i}$ is a matrix in which $d^l_{ij}$ element is sampled from the Bernoulli distribution:\n",
"\n",
- "(10) $d^l_{ij} \\sim \\mbox{Bernoulli}(p^l_d)$\n",
+ "(11) $d^l_{ij} \\sim \\mbox{Bernoulli}(p^l_d)$\n",
"\n",
"with $0 0:
- l2_W_penalty = l2_weight*self.W
- l2_b_penalty = l2_weight*self.b
- l1_W_penalty, l1_b_penalty = 0, 0
- if l1_weight > 0:
- l1_W_penalty = l1_weight*numpy.sign(self.W)
- l1_b_penalty = l1_weight*numpy.sign(self.b)
-
- grad_W = numpy.dot(inputs.T, deltas) + l2_W_penalty + l1_W_penalty
- grad_b = numpy.sum(deltas, axis=0) + l2_b_penalty + l1_b_penalty
+ grad_W = numpy.dot(inputs.T, deltas)
+ grad_b = numpy.sum(deltas, axis=0)
return [grad_W, grad_b]
@@ -352,7 +343,7 @@ class Softmax(Linear):
return y
def bprop(self, h, igrads):
- raise NotImplementedError()
+ raise NotImplementedError('Softmax.bprop not implemented for hidden layer.')
def bprop_cost(self, h, igrads, cost):