From 2b516f2f97b64b1232751c1e732fae86bbe08b57 Mon Sep 17 00:00:00 2001
From: pswietojanski <p.swietojanski@gmail.com>
Date: Wed, 28 Oct 2015 16:59:11 +0000
Subject: [PATCH 1/4] lab4 work

---
 03_MLP_Coursework1.ipynb |  67 +++++++++++++++--
 04_Regularisation.ipynb  | 156 +++++++++++++++++++++++++++++++++++++++
 mlp/layers.py            | 104 ++++++++++++++++++++++++--
 mlp/optimisers.py        |   1 +
 4 files changed, 316 insertions(+), 12 deletions(-)
 create mode 100644 04_Regularisation.ipynb

diff --git a/03_MLP_Coursework1.ipynb b/03_MLP_Coursework1.ipynb
index 0f786ae..021b761 100644
--- a/03_MLP_Coursework1.ipynb
+++ b/03_MLP_Coursework1.ipynb
@@ -78,7 +78,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {
     "collapsed": false
    },
@@ -142,11 +142,43 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "collapsed": true
+    "collapsed": false
    },
    "outputs": [],
    "source": [
-    "%load -s Sigmoid mlp/layers.py\n"
+    "# %load -s Sigmoid mlp/layers.py\n",
+    "class Sigmoid(Linear):\n",
+    "    def __init__(self,  idim, odim,\n",
+    "                 rng=None,\n",
+    "                 irange=0.1):\n",
+    "\n",
+    "        super(Sigmoid, self).__init__(idim, odim, rng, irange)\n",
+    "    \n",
+    "    def fprop(self, inputs):\n",
+    "        a = super(Sigmoid, self).fprop(inputs)\n",
+    "        h = 1.0/(1 + numpy.exp(-a))\n",
+    "        return h\n",
+    "    \n",
+    "    def bprop(self, h, igrads):\n",
+    "        dsigm = h*(1.0 - h)\n",
+    "        deltas = igrads*dsigm\n",
+    "        ___, ograds = super(Sigmoid, self).bprop(h=None, igrads=deltas)\n",
+    "        return deltas, ograds\n",
+    "\n",
+    "    def cost_bprop(self, h, igrads, cost):\n",
+    "        if cost is None or cost.get_name() == 'bce':\n",
+    "            return super(Sigmoid, self).bprop(h=h, igrads=igrads)\n",
+    "        else:\n",
+    "            raise NotImplementedError('Sigmoid.bprop_cost method not implemented '\n",
+    "                                      'for the %s cost' % cost.get_name())\n",
+    "\n",
+    "    def pgrads(self, inputs, deltas):\n",
+    "        \"Return list of gradients w.r.t parameters\"\n",
+    "        gparams = super(Sigmoid, self).pgrads(inputs, deltas)\n",
+    "        return gparams\n",
+    "\n",
+    "    def get_name(self):\n",
+    "        return 'sigmoid'\n"
    ]
   },
   {
@@ -162,11 +194,26 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "metadata": {
     "collapsed": false
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1.0\n",
+      "0.0\n",
+      "0.0744177068753\n",
+      "[  4.571e-05   1.697e-03   9.877e-01   6.631e-04   1.194e-04   8.880e-04\n",
+      "   1.977e-04   8.671e-03]\n",
+      "[  4.571e-05   1.697e-03   9.877e-01   6.631e-04   1.194e-04   8.880e-04\n",
+      "   1.977e-04  -9.913e-01]\n",
+      "[-0.089  0.03   0.079  0.011  0.017  0.027]\n"
+     ]
+    }
+   ],
    "source": [
     "from mlp.layers import Softmax\n",
     "\n",
@@ -204,7 +251,15 @@
    },
    "outputs": [],
    "source": [
-    "%load -s Softmax mlp/layers.py"
+    "%load -s Softmax mlp/layers.py\n",
+    "1.0\n",
+    "-1.11022302463e-16\n",
+    "0.0744177068753\n",
+    "[  4.571e-05   1.697e-03   9.877e-01   6.631e-04   1.194e-04   8.880e-04\n",
+    "   1.977e-04   8.671e-03]\n",
+    "[  4.571e-05   1.697e-03   9.877e-01   6.631e-04   1.194e-04   8.880e-04\n",
+    "   1.977e-04  -9.913e-01]\n",
+    "[-0.089  0.03   0.079  0.011  0.017  0.027]"
    ]
   },
   {
diff --git a/04_Regularisation.ipynb b/04_Regularisation.ipynb
new file mode 100644
index 0000000..bac175e
--- /dev/null
+++ b/04_Regularisation.ipynb
@@ -0,0 +1,156 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Introduction\n",
+    "\n",
+    "This tutorial focuses on implementation of three reqularisaion techniques, two of them are norm based approaches - L2 and L1 as well as technique called droput, that.\n",
+    "\n",
+    "\n",
+    "## Virtual environments\n",
+    "\n",
+    "Before you proceed onwards, remember to activate your virtual environment:\n",
+    "   * If you were in last week's Tuesday or Wednesday group type `activate_mlp` or `source ~/mlpractical/venv/bin/activate`\n",
+    "   * If you were in the Monday group:\n",
+    "      + and if you have chosen the **comfy** way type: `workon mlpractical`\n",
+    "      + and if you have chosen the **generic** way, `source` your virtual environment using `source` and specifying the path to the activate script (you need to locate it yourself, as there were no general recommendations w.r.t. the directory structure and people have installed it in different places, usually somewhere in their home directories. If you cannot easily find it by yourself, use something like: `find . -iname activate`):\n",
+    "\n",
+    "## Syncing the git repository\n",
+    "\n",
+    "Look <a href=\"https://github.com/CSTR-Edinburgh/mlpractical/blob/master/gitFAQ.md\">here</a> for more details. But in short, we recommend to create a separate branch for this lab, as follows:\n",
+    "\n",
+    "1. Enter the mlpractical directory `cd ~/mlpractical/repo-mlp`\n",
+    "2. List the branches and check which is currently active by typing: `git branch`\n",
+    "3. If you have followed recommendations, you should be in the `coursework1` branch, please commit your local changed to the repo index by typing:\n",
+    "```\n",
+    "git commit -am \"stuff I did for the coursework\"\n",
+    "```\n",
+    "4. Now you can switch to `master` branch by typing: \n",
+    "```\n",
+    "git checkout master\n",
+    " ```\n",
+    "5. To update the repository (note, assuming master does not have any conflicts), if there are some, have a look <a href=\"https://github.com/CSTR-Edinburgh/mlpractical/blob/master/gitFAQ.md\">here</a>\n",
+    "```\n",
+    "git pull\n",
+    "```\n",
+    "6. And now, create the new branch & switch to it by typing:\n",
+    "```\n",
+    "git checkout -b lab4\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Regularisation\n",
+    "\n",
+    "Today, we shall build models which can have an arbitrary number of hidden layers.  Please have a look at the  diagram below, and the corresponding computations (which have an *exact* matrix form as expected by numpy, and row-wise orientation; note that $\\circ$ denotes an element-wise product). In the diagram, we briefly describe how each comptation relates to the code we have provided.\n",
+    "\n",
+    "(1) $E = \\log(\\mathbf{y}|\\mathbf{x}; \\theta) + \\alpha J_{L2}(\\theta) + \\beta J_{L1}(\\theta)$\n",
+    "\n",
+    "## L2 Weight Decay\n",
+    "\n",
+    "(1) $J_{L2}(\\theta) = \\frac{1}{2}||\\theta||^2$\n",
+    "\n",
+    "(1) $\\frac{\\partial J_{L2}}{\\partial\\theta} = \\frac{1}{2}||\\theta||^2$\n",
+    "\n",
+    "## L1 Sparsity \n",
+    "\n",
+    "## Dropout\n",
+    "\n",
+    "Dropout, for a given layer's output $\\mathbf{h}^i \\in \\mathbb{R}^{BxH^l}$ (where $B$ is batch size and $H^l$ is the $l$-th layer output dimensionality) implements the following transformation:\n",
+    "\n",
+    "(1) $\\mathbf{\\hat h}^l = \\mathbf{d}^l\\circ\\mathbf{h}^l$\n",
+    "\n",
+    "where $\\circ$ denotes an elementwise product and $\\mathbf{d}^l \\in \\{0,1\\}^{BxH^i}$ is a matrix in which $d^l_{ij}$ element is sampled from the Bernoulli distribution:\n",
+    "\n",
+    "(2) $d^l_{ij} \\sim \\mbox{Bernoulli}(p^l_d)$\n",
+    "\n",
+    "with $0<p^l_d<1$ denoting the probability the unit is kept unchanged (dropping probability is thus $1-p^l_d$). We ignore here edge scenarios where $p^l_d=1$ and there is no dropout applied (and the training is exactly the same as in standard SGD) and $p^l_d=0$ where all units would have been dropped, hence the model would not learn anything.\n",
+    "\n",
+    "The probability $p^l_d$ is a hyperparameter (like learning rate) meaning it needs to be provided before training and also very often tuned for the given task. As the notation suggest, it can be specified separately for each layer, including scenario where $l=0$ and one randomply drops also input features.\n",
+    "\n",
+    "### Keeping the $l$-th layer output $\\mathbf{\\hat h}^l$ (input to the upper layer) appropiately scaled at test-time\n",
+    "\n",
+    "The other issue one needs to take into account is the mismatch that arises between training and test (runtime) stages of when dropout is applied. It is due to the fact that droput is not applied when testing hence the average input to the next layer is gonna be bigger when compared to training stage, in average $1/p^l_d$ times bigger. \n",
+    "\n",
+    "So to account for this you can either (it's up to you which way you decide to implement):\n",
+    "\n",
+    "1. When training is finished scale the final weight matrices $\\mathbf{W}^l, l=1,\\ldots,L$ by $p^{l-1}_d$ (remember, $p^{0}_d$ is the probability related to the input features)\n",
+    "2. Scale the activations in equation (1) during training, that is, for each mini-batch multiply $\\mathbf{\\hat h}^l$ by $1/p^l_d$ to compensate for dropped units and then at run-time use the model as usual, **without** scaling. Make sure the $1/p^l_d$ scaler is taken into account for both forward and backward passes."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Exercise 1: Implement L1 based regularisation\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Exercise 2:  Implement L2 based regularisation\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Exercise 3: Implement Dropout \n",
+    "\n",
+    "Modify the above code by adding an intemediate linear layer of size 200 hidden units between input and output layers."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 2",
+   "language": "python",
+   "name": "python2"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/mlp/layers.py b/mlp/layers.py
index c511792..b03e2a3 100644
--- a/mlp/layers.py
+++ b/mlp/layers.py
@@ -46,6 +46,21 @@ class MLP(object):
             self.activations[i+1] = self.layers[i].fprop(self.activations[i])
         return self.activations[-1]
 
+    def fprop_droput(self, x, dropout_probabilites=None):
+        """
+
+        :param inputs: mini-batch of data-points x
+        :return: y (top layer activation) which is an estimate of y given x
+        """
+
+        if len(self.activations) != len(self.layers) + 1:
+            self.activations = [None]*(len(self.layers) + 1)
+
+        self.activations[0] = x
+        for i in xrange(0, len(self.layers)):
+            self.activations[i+1] = self.layers[i].fprop(self.activations[i])
+        return self.activations[-1]
+
     def bprop(self, cost_grad):
         """
         :param cost_grad: matrix -- grad of the cost w.r.t y
@@ -144,7 +159,7 @@ class Layer(object):
 
         raise NotImplementedError()
 
-    def pgrads(self, inputs, deltas):
+    def pgrads(self, inputs, deltas, **kwargs):
         """
         Return gradients w.r.t parameters
         """
@@ -240,12 +255,13 @@ class Linear(Layer):
             raise NotImplementedError('Linear.bprop_cost method not implemented '
                                       'for the %s cost' % cost.get_name())
 
-    def pgrads(self, inputs, deltas):
+    def pgrads(self, inputs, deltas, **kwargs):
         """
         Return gradients w.r.t parameters
 
         :param inputs, input to the i-th layer
         :param deltas, deltas computed in bprop stage up to -ith layer
+        :param kwargs, key-value optional arguments
         :return list of grads w.r.t parameters dE/dW and dE/db in *exactly*
                 the same order as the params are returned by get_params()
 
@@ -274,8 +290,84 @@ class Linear(Layer):
     def get_name(self):
         return 'linear'
 
+
+class Sigmoid(Linear):
+    def __init__(self,  idim, odim,
+                 rng=None,
+                 irange=0.1):
+
+        super(Sigmoid, self).__init__(idim, odim, rng, irange)
+    
+    def fprop(self, inputs):
+        #get the linear activations
+        a = super(Sigmoid, self).fprop(inputs)
+        #stabilise the exp() computation in case some values in
+        #'a' get very negative. We limit both tails, however only
+        #negative values may lead to numerical issues -- exp(-a)
+        #clip() function does the following operation faster:
+        # a[a < -30.] = -30,
+        # a[a > 30.] = 30.
+        numpy.clip(a, -30.0, 30.0, out=a)
+        h = 1.0/(1 + numpy.exp(-a))
+        return h
+    
+    def bprop(self, h, igrads):
+        dsigm = h * (1.0 - h)
+        deltas = igrads * dsigm
+        ___, ograds = super(Sigmoid, self).bprop(h=None, igrads=deltas)
+        return deltas, ograds
+
+    def cost_bprop(self, h, igrads, cost):
+        if cost is None or cost.get_name() == 'bce':
+            return super(Sigmoid, self).bprop(h=h, igrads=igrads)
+        else:
+            raise NotImplementedError('Sigmoid.bprop_cost method not implemented '
+                                      'for the %s cost' % cost.get_name())
+
+    def get_name(self):
+        return 'sigmoid'
+
+
+class Softmax(Linear):
+
+    def __init__(self,idim, odim,
+                 rng=None,
+                 irange=0.1):
+
+        super(Softmax, self).__init__(idim,
+                                      odim,
+                                      rng=rng,
+                                      irange=irange)
+    
+    def fprop(self, inputs):
+
+        # compute the linear outputs
+        a = super(Softmax, self).fprop(inputs)
+        # apply numerical stabilisation by subtracting max 
+        # from each row (not required for the coursework)
+        # then compute exponent
+        assert a.ndim in [1, 2], (
+            "Expected the linear activation in Softmax layer to be either "
+            "vector or matrix, got %ith dimensional tensor" % a.ndim
+        )
+        axis = a.ndim - 1
+        exp_a = numpy.exp(a - numpy.max(a, axis=axis, keepdims=True))
+        # finally, normalise by the sum within each example
+        y = exp_a/numpy.sum(exp_a, axis=axis, keepdims=True)
+
+        return y
+
+    def bprop(self, h, igrads):
+        raise NotImplementedError()
+
+    def bprop_cost(self, h, igrads, cost):
+
+        if cost is None or cost.get_name() == 'ce':
+            return super(Softmax, self).bprop(h=h, igrads=igrads)
+        else:
+            raise NotImplementedError('Softmax.bprop_cost method not implemented '
+                                      'for %s cost' % cost.get_name())
+
+    def get_name(self):
+        return 'softmax'
         
-        
-        
-        
-        
\ No newline at end of file
diff --git a/mlp/optimisers.py b/mlp/optimisers.py
index 9d4b947..f03c3cc 100644
--- a/mlp/optimisers.py
+++ b/mlp/optimisers.py
@@ -116,6 +116,7 @@ class SGDOptimiser(Optimiser):
         tr_stats, valid_stats = [], []
 
         # do the initial validation
+        train_iterator.reset()
         tr_nll, tr_acc = self.validate(model, train_iterator)
         logger.info('Epoch %i: Training cost (%s) for random model is %.3f. Accuracy is %.2f%%'
                     % (self.lr_scheduler.epoch, cost_name, tr_nll, tr_acc * 100.))

From 18b36babde4d52e25204089025f4ffcebfea8e79 Mon Sep 17 00:00:00 2001
From: pswietojanski <p.swietojanski@gmail.com>
Date: Sun, 1 Nov 2015 15:50:26 +0000
Subject: [PATCH 2/4] lab4

---
 04_Regularisation.ipynb | 180 +++++++++++++++++++++++++++++++++-------
 mlp/dataset.py          |  26 +++++-
 mlp/layers.py           |  31 +++----
 mlp/optimisers.py       |  22 +++--
 4 files changed, 201 insertions(+), 58 deletions(-)

diff --git a/04_Regularisation.ipynb b/04_Regularisation.ipynb
index bac175e..d1e4c05 100644
--- a/04_Regularisation.ipynb
+++ b/04_Regularisation.ipynb
@@ -6,7 +6,7 @@
    "source": [
     "# Introduction\n",
     "\n",
-    "This tutorial focuses on implementation of three reqularisaion techniques, two of them are norm based approaches - L2 and L1 as well as technique called droput, that.\n",
+    "This tutorial focuses on the implementation of three regularisation techniques: two of them are norm-based penalties which are added to the optimised objective, and the third, called *dropout*, is a form of noise injection by random corruption of the information carried by hidden units during training.\n",
     "\n",
     "\n",
     "## Virtual environments\n",
@@ -23,9 +23,9 @@
     "\n",
     "1. Enter the mlpractical directory `cd ~/mlpractical/repo-mlp`\n",
     "2. List the branches and check which is currently active by typing: `git branch`\n",
-    "3. If you have followed recommendations, you should be in the `coursework1` branch, please commit your local changed to the repo index by typing:\n",
+    "3. If you have followed our recommendations, you should be in the `coursework1` branch, please commit your local changed to the repo index by typing:\n",
     "```\n",
-    "git commit -am \"stuff I did for the coursework\"\n",
+    "git commit -am \"finished coursework\"\n",
     "```\n",
     "4. Now you can switch to `master` branch by typing: \n",
     "```\n",
@@ -47,61 +47,133 @@
    "source": [
     "# Regularisation\n",
     "\n",
-    "Today, we shall build models which can have an arbitrary number of hidden layers.  Please have a look at the  diagram below, and the corresponding computations (which have an *exact* matrix form as expected by numpy, and row-wise orientation; note that $\\circ$ denotes an element-wise product). In the diagram, we briefly describe how each comptation relates to the code we have provided.\n",
+    "Regularisation adds a *complexity term* to the cost function. Its purpose is to put some prior on the model's parameters. The most common prior is perhaps the one which assumes that smoother solutions (the ones which are not able to fit the training data too well) are better, as they are more likely to generalise well to unseen data. \n",
     "\n",
-    "(1) $E = \\log(\\mathbf{y}|\\mathbf{x}; \\theta) + \\alpha J_{L2}(\\theta) + \\beta J_{L1}(\\theta)$\n",
+    "A way to incorporate such a prior in the model is to add a term that penalises certain configurations of the parameters -- either one that keeps them from growing too large ($L_2$) or one that prefers solutions that can be modelled with fewer parameters ($L_1$), hence encouraging some parameters to become 0. One can, of course, combine many such priors when optimising the model; however, in the lab we shall use only the $L_1$ and/or $L_2$ priors.\n",
     "\n",
-    "## L2 Weight Decay\n",
+    "They can be easily incorporated into the training objective by adding some additive terms, as follows:\n",
     "\n",
-    "(1) $J_{L2}(\\theta) = \\frac{1}{2}||\\theta||^2$\n",
+    "(1) $\n",
+    " \\begin{align*}\n",
+    "        E^n &= \\underbrace{E^n_{\\text{train}}}_{\\text{data term}} + \n",
+    "    \\underbrace{\\beta_{L_1} E^n_{L_1}}_{\\text{prior term}} + \\underbrace{\\beta_{L_2} E^n_{L_2}}_{\\text{prior term}}\n",
+    "\\end{align*}\n",
+    "$\n",
     "\n",
-    "(1) $\\frac{\\partial J_{L2}}{\\partial\\theta} = \\frac{1}{2}||\\theta||^2$\n",
+    "where $ E^n_{\\text{train}} = - \\sum_{k=1}^K t^n_k \\ln y^n_k $,  $\\beta_{L_1}$ and $\\beta_{L_2}$ some non-negative constants specified a priori (hyper-parameters) and $E^n_{L_1}$ and $E^n_{L_2}$ norm metric specifying certain properties of parameters:\n",
     "\n",
-    "## L1 Sparsity \n",
+    "(2) $\n",
+    " \\begin{align*}\n",
+    " E^n_{L_p}(\\mathbf{W}) = \\left ( \\sum_{i,j \\in \\mathbf{W}} |w_{i,j}|^p \\right )^{\\frac{1}{p}}\n",
+    "\\end{align*}\n",
+    "$\n",
+    "\n",
+    "where $p$ denotes the norm-order (for regularisation either 1 or 2). (TODO: explain here why we usualy skip square root for p=2)\n",
+    "\n",
+    "## $L_{p=2}$ (Weight Decay)\n",
+    "\n",
+    "(3) $\n",
+    " \\begin{align*}\n",
+    "        E^n &= \\underbrace{E^n_{\\text{train}}}_{\\text{data term}} + \n",
+    "    \\underbrace{\\beta_{L_2} E^n_{L_2}}_{\\text{prior term}} = E^n_{\\text{train}} + \\beta_{L_2} \\frac{1}{2}|\\mathbf{W}|^2\n",
+    "\\end{align*}\n",
+    "$\n",
+    "\n",
+    "(4) $\n",
+    "\\begin{align*}\\frac{\\partial E^n}{\\partial w_i} &= \\frac{\\partial (E^n_{\\text{train}} + \\beta_{L_2} E_{L_2}) }{\\partial w_i} \n",
+    "  = \\left( \\frac{\\partial E^n_{\\text{train}}}{\\partial w_i}  + \\beta_{L_2} \\frac{\\partial\n",
+    "      E_{L_2}}{\\partial w_i} \\right) \n",
+    "  = \\left( \\frac{\\partial E^n_{\\text{train}}}{\\partial w_i}  + \\beta_{L_2} w_i \\right)\n",
+    "\\end{align*}\n",
+    "$\n",
+    "\n",
+    "(5) $\n",
+    "\\begin{align*}\n",
+    "  \\Delta w_i &= -\\eta \\left( \\frac{\\partial E^n_{\\text{train}}}{\\partial w_i}  + \\beta_{L_2} w_i \\right) \n",
+    "\\end{align*}\n",
+    "$\n",
+    "\n",
+    "where $\\eta$ is learning rate.\n",
+    "\n",
+    "## $L_{p=1}$ (Sparsity)\n",
+    "\n",
+    "(6) $\n",
+    " \\begin{align*}\n",
+    "        E^n &= \\underbrace{E^n_{\\text{train}}}_{\\text{data term}} + \n",
+    "    \\underbrace{\\beta_{L_1} E^n_{L_1}}_{\\text{prior term}} \n",
+    "        = E^n_{\\text{train}} + \\beta_{L_1} |\\mathbf{W}|\n",
+    "\\end{align*}\n",
+    "$\n",
+    "\n",
+    "(7) $\\begin{align*}\n",
+    "  \\frac{\\partial E^n}{\\partial w_i} =  \\frac{\\partial E^n_{\\text{train}}}{\\partial w_i}  + \\beta_{L_1} \\frac{\\partial E_{L_1}}{\\partial w_i}  =  \\frac{\\partial E^n_{\\text{train}}}{\\partial w_i}  + \\beta_{L_1}  \\mbox{sgn}(w_i)\n",
+    "\\end{align*}\n",
+    "$\n",
+    "\n",
+    "(8) $\\begin{align*}\n",
+    "  \\Delta w_i &= -\\eta \\left( \\frac{\\partial E^n_{\\text{train}}}{\\partial w_i}  + \\beta_{L_1} \\mbox{sgn}(w_i) \\right) \n",
+    "\\end{align*}$\n",
+    "\n",
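+    "where $\\mbox{sgn}(w_i)$ is the sign of $w_i$: $\\mbox{sgn}(w_i) = 1$ if $w_i>0$, $\\mbox{sgn}(w_i) = -1$ if $w_i<0$, and $\\mbox{sgn}(w_i) = 0$ if $w_i=0$ (the usual convention, since $|w_i|$ is not differentiable at 0).\n",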
+    "\n",
+    "One can also apply those penalty terms to the biases; however, this is usually not necessary, as biases have only a secondary impact on the smoothness of the given solution.\n",
     "\n",
     "## Dropout\n",
     "\n",
     "Dropout, for a given layer's output $\\mathbf{h}^i \\in \\mathbb{R}^{BxH^l}$ (where $B$ is batch size and $H^l$ is the $l$-th layer output dimensionality) implements the following transformation:\n",
     "\n",
-    "(1) $\\mathbf{\\hat h}^l = \\mathbf{d}^l\\circ\\mathbf{h}^l$\n",
+    "(9) $\\mathbf{\\hat h}^l = \\mathbf{d}^l\\circ\\mathbf{h}^l$\n",
     "\n",
     "where $\\circ$ denotes an elementwise product and $\\mathbf{d}^l \\in \\{0,1\\}^{BxH^i}$ is a matrix in which $d^l_{ij}$ element is sampled from the Bernoulli distribution:\n",
     "\n",
-    "(2) $d^l_{ij} \\sim \\mbox{Bernoulli}(p^l_d)$\n",
+    "(10) $d^l_{ij} \\sim \\mbox{Bernoulli}(p^l_d)$\n",
     "\n",
-    "with $0<p^l_d<1$ denoting the probability the unit is kept unchanged (dropping probability is thus $1-p^l_d$). We ignore here edge scenarios where $p^l_d=1$ and there is no dropout applied (and the training is exactly the same as in standard SGD) and $p^l_d=0$ where all units would have been dropped, hence the model would not learn anything.\n",
+    "with $0<p^l_d<1$ denoting the probability the given unit is kept unchanged (dropping probability is thus $1-p^l_d$). We ignore here edge scenarios where $p^l_d=1$ and there is no dropout applied (and the training would be exactly the same as in standard SGD) and $p^l_d=0$ where all units would have been dropped, hence the model would not learn anything.\n",
     "\n",
-    "The probability $p^l_d$ is a hyperparameter (like learning rate) meaning it needs to be provided before training and also very often tuned for the given task. As the notation suggest, it can be specified separately for each layer, including scenario where $l=0$ and one randomply drops also input features.\n",
+    "The probability $p^l_d$ is a hyperparameter (like the learning rate), meaning it needs to be provided before training and very often also tuned for the given task. As the notation suggests, it can be specified separately for each layer, including the scenario where $l=0$, in which some random input features (pixels in the image for MNIST) are also omitted.\n",
     "\n",
     "### Keeping the $l$-th layer output $\\mathbf{\\hat h}^l$ (input to the upper layer) appropiately scaled at test-time\n",
     "\n",
-    "The other issue one needs to take into account is the mismatch that arises between training and test (runtime) stages of when dropout is applied. It is due to the fact that droput is not applied when testing hence the average input to the next layer is gonna be bigger when compared to training stage, in average $1/p^l_d$ times bigger. \n",
+    "The other issue one needs to take into account is the mismatch that arises between training and test (runtime) stages when dropout is applied. It is due to the fact that dropout is not applied when testing, hence the average input to a unit in the upper layer is going to be bigger when compared to the training stage (where some inputs are set to 0) -- on average $1/p^l_d$ times bigger. \n",
     "\n",
-    "So to account for this you can either (it's up to you which way you decide to implement):\n",
+    "So to account for this mismatch one could either:\n",
     "\n",
     "1. When training is finished scale the final weight matrices $\\mathbf{W}^l, l=1,\\ldots,L$ by $p^{l-1}_d$ (remember, $p^{0}_d$ is the probability related to the input features)\n",
-    "2. Scale the activations in equation (1) during training, that is, for each mini-batch multiply $\\mathbf{\\hat h}^l$ by $1/p^l_d$ to compensate for dropped units and then at run-time use the model as usual, **without** scaling. Make sure the $1/p^l_d$ scaler is taken into account for both forward and backward passes."
+    "2. Scale the activations in equation (9) during training, that is, for each mini-batch multiply $\\mathbf{\\hat h}^l$ by $1/p^l_d$ to compensate for dropped units and then at run-time use the model as usual, **without** scaling. Make sure the $1/p^l_d$ scaler is taken into account for both forward and backward passes.\n",
+    "\n",
+    "Our recommendation is option 2 as it will make some things easier from implementation perspective. "
    ]
   },
   {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": []
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
    "source": [
-    "## Exercise 1: Implement L1 based regularisation\n",
-    "\n"
+    "from mlp.dataset import MNISTDataProvider\n",
+    "\n",
+    "train_dp = MNISTDataProvider(dset='train', batch_size=10, max_num_batches=100, randomize=True)\n",
+    "valid_dp = MNISTDataProvider(dset='valid', batch_size=10000, randomize=False)\n",
+    "test_dp = MNISTDataProvider(dset='eval', batch_size=10000, randomize=False)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Exercise 2:  Implement L2 based regularisation\n",
-    "\n"
+    "# Exercise 1:  Implement L2 based regularisation\n",
+    "\n",
+    "Implement L2 regularisation method (for weight matrices, optionally for biases). Test your solution on one hidden layer model similar to the one from coursework's Task 4 (800 hidden units) but limit training data to 1000 (random) data-points (keep validation and test sets the same). You may use data providers specified in the above cell. \n",
+    "\n",
+    "*Note (optional): We limit both the amount of data as well as the size of a mini-batch - it is due to the fact that those two parameters directly affect the number of updates we do to the model's parameters per epoch (e.g. for `batch_size=100` and `max_num_batches=10` one can only adjust parameters `10` times per epoch versus `100` times in case those parameters are swapped -- `batch_size=10` and `max_num_batches=100`). Since SGD relies on making many small updates, this ratio (number of updates given data) is another hyper-parameter one needs to consider before optimisation.*\n",
+    "\n",
+    "To proceed with this exercise, first build and train a non-regularised model as a baseline. Then train regularised models starting with $\\beta_{L2}$ set to 0.0001 and do some grid search for better values. Observe how different $L_2$ penalties affect the model's ability to fit training and validation data.\n",
+    "\n",
+    "Implementation tips:\n",
+    "* Have a look at the constructor of the mlp.optimisers.SGDOptimiser class; it has been modified to take more optimisation-related arguments.\n",
+    "* The best place to implement regularisation terms is `pgrads` method of mlp.layers.Layer (sub)-classes "
    ]
   },
   {
@@ -117,9 +189,59 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Exercise 3: Implement Dropout \n",
+    "# Exercise 2: Implement L1 based regularisation\n",
     "\n",
-    "Modify the above code by adding an intemediate linear layer of size 200 hidden units between input and output layers."
+    "Implement L1 regularisation penalty. Test your solution on one hidden layer model similar to the one from Exercise 1. Then train $L_1$ regularised model starting with $\\beta_{L1}$ set to 0.0001 and do some grid search for better values. Observe how different $L_1$ penalties affect the model ability to fit training and validation data."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Exercise 3:\n",
+    "    \n",
+    "Dropout applied to input features (turning on/off some random pixels) may also be viewed as a form of data augmentation -- we effectively create images that differ in some way from the training ones, and the model is tasked with properly classifying these imperfect data-points.\n",
+    "\n",
+    "Your task in this exercise is to pick a random digit from MNIST dataset (use MNISTDataProvider) and corrupt it pixel-wise with different levels of probabilities $p_{d} \\in \\{0.9, 0.7, 0.5, 0.2, 0.1\\}$ (reminder, dropout probability is $1-p_d$) that is, for each pixel $x_{i,j}$ in image $\\mathbf{X} \\in \\mathbb{R}^{W\\times H}$:\n",
+    "\n",
+    "$\\begin{align}\n",
+    "d_{i,j} & \\sim\\ \\mbox{Bernoulli}(p_{d}) \\\\\n",
+    "x_{i,j} &=\n",
+    "\\begin{cases}\n",
+    "     0     & \\quad \\text{if } d_{i,j} = 0\\\\\n",
+    "     x_{i,j}       & \\quad \\text{if } d_{i,j} = 1\\\\\n",
+    "\\end{cases}\n",
+    "\\end{align}\n",
+    "$\n",
+    "\n",
+    "Plot the solution as a 2x3 grid of images for each $p_d$ scenario, at position (0, 0) plot an original (uncorrupted) image.\n",
+    "\n",
+    "Tip: You may use numpy.random.binomial function to draw samples from Bernoulli distribution."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Exercise 4: Implement Dropout \n",
+    "\n",
+    "Implement the dropout regularisation technique. Then, for the same initial configuration as in Exercise 1, investigate the effectiveness of different dropout rates applied to input features and/or hidden layers. Start with $p_{inp}=0.5$ and $p_{hid}=0.5$ and do some search for better settings.\n",
+    "\n",
+    "Implementation tips:\n",
+    "* Add a function `fprop_dropout` to `mlp.layers.MLP` class which (on top of `inputs` argument) also takes dropout-related argument(s) and performs dropout forward propagation through the model.\n",
+    "* One would also have to introduce the required modifications to the `mlp.optimisers.SGDOptimiser.train_epoch()` function.\n",
+    "* Design and implement a dropout scheduler in a similar way to how learning rates are handled (that is, allowing for some implementation dependent schedule which is kept independent of implementation in `mlp.optimisers.SGDOptimiser.train()`). \n",
+    "   +  For this exercise implement only fixed dropout scheduler - `DropoutFixed`, but implementation should allow to easily add other schedules in the future. \n",
+    "   +  Dropout scheduler of any type should return a tuple of two numbers $(p_{inp},\\; p_{hid})$, the first one is dropout factor for input features (data-points), and the latter dropout factor for hidden layers (assumed the same for all hidden layers)."
    ]
   },
   {
diff --git a/mlp/dataset.py b/mlp/dataset.py
index 86857f2..d081cc4 100644
--- a/mlp/dataset.py
+++ b/mlp/dataset.py
@@ -17,7 +17,7 @@ class DataProvider(object):
     Data provider defines an interface for our
     generic data-independent readers.
     """
-    def __init__(self, batch_size, randomize=True):
+    def __init__(self, batch_size, randomize=True, rng=None):
         """
         :param batch_size: int, specifies the number
                of elements returned at each step
@@ -29,6 +29,11 @@ class DataProvider(object):
         self.batch_size = batch_size
         self.randomize = randomize
         self._curr_idx = 0
+        self.rng = rng
+
+        if self.rng is None:
+            seed=[2015, 10, 1]
+            self.rng = numpy.random.RandomState(seed)
 
     def reset(self):
         """
@@ -77,10 +82,11 @@ class MNISTDataProvider(DataProvider):
                  batch_size=10,
                  max_num_batches=-1,
                  max_num_examples=-1,
-                 randomize=True):
+                 randomize=True,
+                 rng=None):
 
         super(MNISTDataProvider, self).\
-            __init__(batch_size, randomize)
+            __init__(batch_size, randomize, rng)
 
         assert dset in ['train', 'valid', 'eval'], (
             "Expected dset to be either 'train', "
@@ -125,7 +131,16 @@ class MNISTDataProvider(DataProvider):
 
     def __randomize(self):
         assert isinstance(self.x, numpy.ndarray)
-        return numpy.random.permutation(numpy.arange(0, self.x.shape[0]))
+
+        if self._rand_idx is not None and self._max_num_batches > 0:
+            return self.rng.permutation(self._rand_idx)
+        else:
+            #the max_to_present secures that random examples
+            #are returned from the same pool each time (in case
+            #the total num of examples was limited by max_num_batches)
+            max_to_present = self.batch_size*self._max_num_batches \
+                                if self._max_num_batches > 0 else self.x.shape[0]
+            return self.rng.permutation(numpy.arange(0, self.x.shape[0]))[0:max_to_present]
 
     def next(self):
 
@@ -152,6 +167,9 @@ class MNISTDataProvider(DataProvider):
     def num_examples(self):
         return self.x.shape[0]
 
+    def num_examples_presented(self):
+        return self._curr_idx + 1
+
     def __to_one_of_k(self, y):
         rval = numpy.zeros((y.shape[0], self.num_classes), dtype=numpy.float32)
         for i in xrange(y.shape[0]):
diff --git a/mlp/layers.py b/mlp/layers.py
index b03e2a3..e6a379b 100644
--- a/mlp/layers.py
+++ b/mlp/layers.py
@@ -46,21 +46,6 @@ class MLP(object):
             self.activations[i+1] = self.layers[i].fprop(self.activations[i])
         return self.activations[-1]
 
-    def fprop_droput(self, x, dropout_probabilites=None):
-        """
-
-        :param inputs: mini-batch of data-points x
-        :return: y (top layer activation) which is an estimate of y given x
-        """
-
-        if len(self.activations) != len(self.layers) + 1:
-            self.activations = [None]*(len(self.layers) + 1)
-
-        self.activations[0] = x
-        for i in xrange(0, len(self.layers)):
-            self.activations[i+1] = self.layers[i].fprop(self.activations[i])
-        return self.activations[-1]
-
     def bprop(self, cost_grad):
         """
         :param cost_grad: matrix -- grad of the cost w.r.t y
@@ -255,7 +240,7 @@ class Linear(Layer):
             raise NotImplementedError('Linear.bprop_cost method not implemented '
                                       'for the %s cost' % cost.get_name())
 
-    def pgrads(self, inputs, deltas, **kwargs):
+    def pgrads(self, inputs, deltas, l1_weight=0, l2_weight=0):
         """
         Return gradients w.r.t parameters
 
@@ -272,9 +257,18 @@ class Linear(Layer):
           1) da^i/dW^i and 2) da^i/db^i
         since W and b are only layer's parameters
         """
+        l2_W_penalty, l2_b_penalty = 0, 0
+        if l2_weight > 0:
+            l2_W_penalty = l2_weight*self.W
+            l2_b_penalty = l2_weight*self.b
 
-        grad_W = numpy.dot(inputs.T, deltas)
-        grad_b = numpy.sum(deltas, axis=0)
+        l1_W_penalty, l1_b_penalty = 0, 0
+        if l1_weight > 0:
+            l1_W_penalty = l1_weight*numpy.sign(self.W)
+            l1_b_penalty = l1_weight*numpy.sign(self.b)
+
+        grad_W = numpy.dot(inputs.T, deltas) + l2_W_penalty + l1_W_penalty
+        grad_b = numpy.sum(deltas, axis=0) + l2_b_penalty + l1_b_penalty
 
         return [grad_W, grad_b]
 
@@ -370,4 +364,3 @@ class Softmax(Linear):
 
     def get_name(self):
         return 'softmax'
-        
diff --git a/mlp/optimisers.py b/mlp/optimisers.py
index f03c3cc..4b70f0c 100644
--- a/mlp/optimisers.py
+++ b/mlp/optimisers.py
@@ -58,7 +58,11 @@ class Optimiser(object):
 
 
 class SGDOptimiser(Optimiser):
-    def __init__(self, lr_scheduler):
+    def __init__(self, lr_scheduler,
+                 dp_scheduler=None,
+                 l1_weight=0.0,
+                 l2_weight=0.0):
+
         super(SGDOptimiser, self).__init__()
 
         assert isinstance(lr_scheduler, LearningRateScheduler), (
@@ -67,6 +71,9 @@ class SGDOptimiser(Optimiser):
         )
 
         self.lr_scheduler = lr_scheduler
+        self.dp_scheduler = dp_scheduler
+        self.l1_weight = l1_weight
+        self.l2_weight = l2_weight
 
     def train_epoch(self, model, train_iterator, learning_rate):
 
@@ -97,7 +104,10 @@ class SGDOptimiser(Optimiser):
 
             for i in xrange(0, len(model.layers)):
                 params = model.layers[i].get_params()
-                grads = model.layers[i].pgrads(model.activations[i], model.deltas[i + 1])
+                grads = model.layers[i].pgrads(inputs=model.activations[i],
+                                               deltas=model.deltas[i + 1],
+                                               l1_weight=self.l1_weight,
+                                               l2_weight=self.l2_weight)
                 uparams = []
                 for param, grad in zip(params, grads):
                     param = param - effective_learning_rate * grad
@@ -118,14 +128,14 @@ class SGDOptimiser(Optimiser):
         # do the initial validation
         train_iterator.reset()
         tr_nll, tr_acc = self.validate(model, train_iterator)
-        logger.info('Epoch %i: Training cost (%s) for random model is %.3f. Accuracy is %.2f%%'
+        logger.info('Epoch %i: Training cost (%s) for initial model is %.3f. Accuracy is %.2f%%'
                     % (self.lr_scheduler.epoch, cost_name, tr_nll, tr_acc * 100.))
         tr_stats.append((tr_nll, tr_acc))
 
         if valid_iterator is not None:
             valid_iterator.reset()
             valid_nll, valid_acc = self.validate(model, valid_iterator)
-            logger.info('Epoch %i: Validation cost (%s) for random model is %.3f. Accuracy is %.2f%%'
+            logger.info('Epoch %i: Validation cost (%s) for initial model is %.3f. Accuracy is %.2f%%'
                         % (self.lr_scheduler.epoch, cost_name, valid_nll, valid_acc * 100.))
             valid_stats.append((valid_nll, valid_acc))
 
@@ -154,8 +164,8 @@ class SGDOptimiser(Optimiser):
                 self.lr_scheduler.get_next_rate(None)
             vstop = time.clock()
 
-            train_speed = train_iterator.num_examples() / (tstop - tstart)
-            valid_speed = valid_iterator.num_examples() / (vstop - vstart)
+            train_speed = train_iterator.num_examples_presented() / (tstop - tstart)
+            valid_speed = valid_iterator.num_examples_presented() / (vstop - vstart)
             tot_time = vstop - tstart
             #pps = presentations per second
             logger.info("Epoch %i: Took %.0f seconds. Training speed %.0f pps. "

From 35490a68fc4587f1aa4139effd39c6ef711de84f Mon Sep 17 00:00:00 2001
From: pswietojanski <p.swietojanski@gmail.com>
Date: Sun, 1 Nov 2015 16:49:35 +0000
Subject: [PATCH 3/4] Merge branches 'lab4_solved' and 'master'


From faaa6cb172530d8746a6c980de1d81a380e36cf5 Mon Sep 17 00:00:00 2001
From: pswietojanski <p.swietojanski@gmail.com>
Date: Sun, 1 Nov 2015 19:24:35 +0000
Subject: [PATCH 4/4] lab 4

---
 04_Regularisation.ipynb | 56 ++++++++++++++++++++++++++---------------
 mlp/layers.py           | 15 +++--------
 2 files changed, 39 insertions(+), 32 deletions(-)

diff --git a/04_Regularisation.ipynb b/04_Regularisation.ipynb
index d1e4c05..f3e8f25 100644
--- a/04_Regularisation.ipynb
+++ b/04_Regularisation.ipynb
@@ -64,22 +64,31 @@
     "\n",
     "(2) $\n",
     " \\begin{align*}\n",
-    " E^n_{L_p}(\\mathbf{W}) = \\left ( \\sum_{i,j \\in \\mathbf{W}} |w_{i,j}|^p \\right )^{\\frac{1}{p}}\n",
+    " E^n_{L_p}(\\mathbf{W}) = ||\\mathbf{W}||_p = \\left ( \\sum_{i,j \\in \\mathbf{W}} |w_{i,j}|^p \\right )^{\\frac{1}{p}}\n",
     "\\end{align*}\n",
     "$\n",
     "\n",
-    "where $p$ denotes the norm-order (for regularisation either 1 or 2). (TODO: explain here why we usualy skip square root for p=2)\n",
+    "where $p$ denotes the norm-order (for regularisation either 1 or 2). Note that in practice, for computational purposes, we will rather compute the squared $L_{p=2}$ norm, which omits the square root in (2), that is:\n",
+    "\n",
+    "(3)$ \\begin{align*}\n",
+    " E^n_{L_{p=2}}(\\mathbf{W}) = ||\\mathbf{W}||^2_2 = \\left ( \\left ( \\sum_{i,j \\in \\mathbf{W}} |w_{i,j}|^2 \\right )^{\\frac{1}{2}} \\right )^2 = \\sum_{i,j \\in \\mathbf{W}} |w_{i,j}|^2\n",
+    "\\end{align*}\n",
+    "$\n",
     "\n",
     "## $L_{p=2}$ (Weight Decay)\n",
     "\n",
-    "(3) $\n",
+    "Our cost with $L_{2}$ regulariser then becomes ($\\frac{1}{2}$ simplifies a derivative later):\n",
+    "\n",
+    "(4) $\n",
     " \\begin{align*}\n",
     "        E^n &= \\underbrace{E^n_{\\text{train}}}_{\\text{data term}} + \n",
-    "    \\underbrace{\\beta E^n_{L_2}}_{\\text{prior term}} = E^n_{\\text{train}} + \\beta_{L_2} \\frac{1}{2}|\\mathbf{W}|^2\n",
+    "    \\underbrace{\\beta_{L_2} \\frac{1}{2} E^n_{L_2}}_{\\text{prior term}}\n",
     "\\end{align*}\n",
     "$\n",
     "\n",
-    "(4) $\n",
+    "Hence, the gradient of the cost w.r.t parameter $w_i$ is given as follows:\n",
+    "\n",
+    "(5) $\n",
     "\\begin{align*}\\frac{\\partial E^n}{\\partial w_i} &= \\frac{\\partial (E^n_{\\text{train}} + \\beta_{L_2} E_{L_2}) }{\\partial w_i} \n",
     "  = \\left( \\frac{\\partial E^n_{\\text{train}}}{\\partial w_i}  + \\beta_{L_2} \\frac{\\partial\n",
     "      E_{L_2}}{\\partial w_i} \\right) \n",
@@ -87,7 +96,9 @@
     "\\end{align*}\n",
     "$\n",
     "\n",
-    "(5) $\n",
+    "And the actual update we apply to the $w_i$ parameter is:\n",
+    "\n",
+    "(6) $\n",
     "\\begin{align*}\n",
     "  \\Delta w_i &= -\\eta \\left( \\frac{\\partial E^n_{\\text{train}}}{\\partial w_i}  + \\beta_{L_2} w_i \\right) \n",
     "\\end{align*}\n",
@@ -97,49 +108,54 @@
     "\n",
     "## $L_{p=1}$ (Sparsity)\n",
     "\n",
-    "(6) $\n",
+    "Our cost with $L_{1}$ regulariser then becomes:\n",
+    "\n",
+    "(7) $\n",
     " \\begin{align*}\n",
     "        E^n &= \\underbrace{E^n_{\\text{train}}}_{\\text{data term}} + \n",
-    "    \\underbrace{\\beta E^n_{L_1}}_{\\text{prior term}} \n",
-    "        = E^n_{\\text{train}} + \\beta_{L_1} |\\mathbf{W}|\n",
+    "    \\underbrace{\\beta_{L_1} E^n_{L_1}}_{\\text{prior term}} \n",
     "\\end{align*}\n",
     "$\n",
     "\n",
-    "(7) $\\begin{align*}\n",
+    "Hence, the gradient of the cost w.r.t parameter $w_i$ is given as follows:\n",
+    "\n",
+    "(8) $\\begin{align*}\n",
     "  \\frac{\\partial E^n}{\\partial w_i} =  \\frac{\\partial E^n_{\\text{train}}}{\\partial w_i}  + \\beta_{L_1} \\frac{\\partial E_{L_1}}{\\partial w_i}  =  \\frac{\\partial E^n_{\\text{train}}}{\\partial w_i}  + \\beta_{L_1}  \\mbox{sgn}(w_i)\n",
     "\\end{align*}\n",
     "$\n",
     "\n",
-    "(8) $\\begin{align*}\n",
+    "And the actual update we apply to the $w_i$ parameter is:\n",
+    "\n",
+    "(9) $\\begin{align*}\n",
     "  \\Delta w_i &= -\\eta \\left( \\frac{\\partial E^n_{\\text{train}}}{\\partial w_i}  + \\beta_{L_1} \\mbox{sgn}(w_i) \\right) \n",
     "\\end{align*}$\n",
     "\n",
     "Where $\\mbox{sgn}(w_i)$ is the sign of $w_i$: $\\mbox{sgn}(w_i) = 1$ if $w_i>0$ and $\\mbox{sgn}(w_i) = -1$ if $w_i<0$\n",
     "\n",
-    "One can also apply those penalty terms for biases, however, this is usually not necessary as biases have secondary impact on smoothnes of the given solution.\n",
+    "One can also easily apply those penalty terms to biases, however, this is usually not necessary as biases do not affect the smoothness of the solution (given data).\n",
     "\n",
     "## Dropout\n",
     "\n",
     "Dropout, for a given layer's output $\\mathbf{h}^i \\in \\mathbb{R}^{BxH^l}$ (where $B$ is batch size and $H^l$ is the $l$-th layer output dimensionality) implements the following transformation:\n",
     "\n",
-    "(9) $\\mathbf{\\hat h}^l = \\mathbf{d}^l\\circ\\mathbf{h}^l$\n",
+    "(10) $\\mathbf{\\hat h}^l = \\mathbf{d}^l\\circ\\mathbf{h}^l$\n",
     "\n",
     "where $\\circ$ denotes an elementwise product and $\\mathbf{d}^l \\in \\{0,1\\}^{BxH^i}$ is a matrix in which $d^l_{ij}$ element is sampled from the Bernoulli distribution:\n",
     "\n",
-    "(10) $d^l_{ij} \\sim \\mbox{Bernoulli}(p^l_d)$\n",
+    "(11) $d^l_{ij} \\sim \\mbox{Bernoulli}(p^l_d)$\n",
     "\n",
     "with $0<p^l_d<1$ denoting the probability the given unit is kept unchanged (dropping probability is thus $1-p^l_d$). We ignore here edge scenarios where $p^l_d=1$ and there is no dropout applied (and the training would be exactly the same as in standard SGD) and $p^l_d=0$ where all units would have been dropped, hence the model would not learn anything.\n",
     "\n",
-    "The probability $p^l_d$ is a hyperparameter (like learning rate) meaning it needs to be provided before training and also very often tuned for the given task. As the notation suggest, it can be specified separately for each layer, including scenario where $l=0$ when some random input features (pixels in the image for MNIST) are being also ommitted.\n",
+    "The probability $p^l_d$ is a hyperparameter (like the learning rate), meaning it needs to be provided before training and is very often tuned for the given task. As the notation suggests, it can be specified separately for each layer, including the scenario where $l=0$, in which some random dimensions of the input features (pixels in the image for MNIST) are also corrupted.\n",
     "\n",
     "### Keeping the $l$-th layer output $\\mathbf{\\hat h}^l$ (input to the upper layer) appropiately scaled at test-time\n",
     "\n",
-    "The other issue one needs to take into account is the mismatch that arises between training and test (runtime) stages when dropout is applied. It is due to the fact that droput is not applied when testing hence the average input to the unit in upper layer is going to be bigger when compared to training stage (where some inputs are set to 0), in average $1/p^l_d$ times bigger. \n",
+    "The other issue one needs to take into account is the mismatch that arises between training and test (runtime) stages when dropout is applied. It is due to the fact that dropout is not applied at the testing (run-time) stage, hence the average input to the unit in the upper layer is going to be bigger compared to the training stage (where some inputs were set to 0), on average $1/p^l_d$ times bigger. \n",
     "\n",
     "So to account for this mismatch one could either:\n",
     "\n",
-    "1. When training is finished scale the final weight matrices $\\mathbf{W}^l, l=1,\\ldots,L$ by $p^{l-1}_d$ (remember, $p^{0}_d$ is the probability related to the input features)\n",
-    "2. Scale the activations in equation (9) during training, that is, for each mini-batch multiply $\\mathbf{\\hat h}^l$ by $1/p^l_d$ to compensate for dropped units and then at run-time use the model as usual, **without** scaling. Make sure the $1/p^l_d$ scaler is taken into account for both forward and backward passes.\n",
+    "1. When training is finished scale the final weight matrices $\\mathbf{W}^l, l=1,\\ldots,L$ by $p^{l-1}_d$ (remember, $p^{0}_d$ is the probability related to dropping input features)\n",
+    "2. Scale the activations in equation (10) during training, that is, for each mini-batch multiply $\\mathbf{\\hat h}^l$ by $1/p^l_d$ to compensate for dropped units and then at run-time use the model as usual, **without** scaling. Make sure the $1/p^l_d$ scaler is taken into account for both forward and backward passes.\n",
     "\n",
     "Our recommendation is option 2 as it will make some things easier from implementation perspective. "
    ]
@@ -173,7 +189,7 @@
     "\n",
     "Implementation tips:\n",
     "* Have a look at the constructor of mlp.optimiser.SGDOptimiser class, it has been modified to take more optimisation-related arguments.\n",
-    "* The best place to implement regularisation terms is `pgrads` method of mlp.layers.Layer (sub)-classes "
+    "* The best place to implement regularisation terms is `pgrads` method of mlp.layers.Layer (sub)-classes. See equations (6) and (9) for why."
    ]
   },
   {
@@ -234,7 +250,7 @@
    "source": [
     "# Exercise 4: Implement Dropout \n",
     "\n",
-    "Implement dropout regularisation technique. Then for the same initial configuration as in Exercise 1. investigate effectivness of different dropout rates applied to input features and/or hidden layers. Start with $p_{inp}=0.5$ and $p_{hid}=0.5$ and do some search for better settings.\n",
+    "Implement dropout regularisation technique. Then, for the same initial configuration as in Exercise 1, investigate the effectiveness of different dropout rates applied to input features and/or hidden layers. Start with $p_{inp}=0.5$ and $p_{hid}=0.5$ and do some search for better settings. Dropout usually slows training down (approximately two times) so train dropout models for around twice as many epochs as the baseline model.\n",
     "\n",
     "Implementation tips:\n",
     "* Add a function `fprop_dropout` to `mlp.layers.MLP` class which (on top of `inputs` argument) takes also dropout-related argument(s) and perform dropout forward propagation through the model.\n",
diff --git a/mlp/layers.py b/mlp/layers.py
index e6a379b..e7590f8 100644
--- a/mlp/layers.py
+++ b/mlp/layers.py
@@ -257,18 +257,9 @@ class Linear(Layer):
           1) da^i/dW^i and 2) da^i/db^i
         since W and b are only layer's parameters
         """
-        l2_W_penalty, l2_b_penalty = 0, 0
-        if l2_weight > 0:
-            l2_W_penalty = l2_weight*self.W
-            l2_b_penalty = l2_weight*self.b
 
-        l1_W_penalty, l1_b_penalty = 0, 0
-        if l1_weight > 0:
-            l1_W_penalty = l1_weight*numpy.sign(self.W)
-            l1_b_penalty = l1_weight*numpy.sign(self.b)
-
-        grad_W = numpy.dot(inputs.T, deltas) + l2_W_penalty + l1_W_penalty
-        grad_b = numpy.sum(deltas, axis=0) + l2_b_penalty + l1_b_penalty
+        grad_W = numpy.dot(inputs.T, deltas)
+        grad_b = numpy.sum(deltas, axis=0)
 
         return [grad_W, grad_b]
 
@@ -352,7 +343,7 @@ class Softmax(Linear):
         return y
 
     def bprop(self, h, igrads):
-        raise NotImplementedError()
+        raise NotImplementedError('Softmax.bprop not implemented for hidden layer.')
 
     def bprop_cost(self, h, igrads, cost):