From 2b516f2f97b64b1232751c1e732fae86bbe08b57 Mon Sep 17 00:00:00 2001
From: pswietojanski
Date: Wed, 28 Oct 2015 16:59:11 +0000
Subject: [PATCH] lab4 work
---
03_MLP_Coursework1.ipynb | 67 +++++++++++++++--
04_Regularisation.ipynb | 156 +++++++++++++++++++++++++++++++++++++++
mlp/layers.py | 104 ++++++++++++++++++++++++--
mlp/optimisers.py | 1 +
4 files changed, 316 insertions(+), 12 deletions(-)
create mode 100644 04_Regularisation.ipynb
diff --git a/03_MLP_Coursework1.ipynb b/03_MLP_Coursework1.ipynb
index 0f786ae..021b761 100644
--- a/03_MLP_Coursework1.ipynb
+++ b/03_MLP_Coursework1.ipynb
@@ -78,7 +78,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
"metadata": {
"collapsed": false
},
@@ -142,11 +142,43 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
- "collapsed": true
+ "collapsed": false
},
"outputs": [],
"source": [
- "%load -s Sigmoid mlp/layers.py\n"
+ "# %load -s Sigmoid mlp/layers.py\n",
+ "class Sigmoid(Linear):\n",
+ " def __init__(self, idim, odim,\n",
+ " rng=None,\n",
+ " irange=0.1):\n",
+ "\n",
+ " super(Sigmoid, self).__init__(idim, odim, rng, irange)\n",
+ " \n",
+ " def fprop(self, inputs):\n",
+ " a = super(Sigmoid, self).fprop(inputs)\n",
+ " h = 1.0/(1 + numpy.exp(-a))\n",
+ " return h\n",
+ " \n",
+ " def bprop(self, h, igrads):\n",
+ " dsigm = h*(1.0 - h)\n",
+ " deltas = igrads*dsigm\n",
+ " ___, ograds = super(Sigmoid, self).bprop(h=None, igrads=deltas)\n",
+ " return deltas, ograds\n",
+ "\n",
+ " def cost_bprop(self, h, igrads, cost):\n",
+ " if cost is None or cost.get_name() == 'bce':\n",
+ " return super(Sigmoid, self).bprop(h=h, igrads=igrads)\n",
+ " else:\n",
+ " raise NotImplementedError('Sigmoid.bprop_cost method not implemented '\n",
+ " 'for the %s cost' % cost.get_name())\n",
+ "\n",
+ " def pgrads(self, inputs, deltas):\n",
+ " \"Return list of gradients w.r.t parameters\"\n",
+ " gparams = super(Sigmoid, self).pgrads(inputs, deltas)\n",
+ " return gparams\n",
+ "\n",
+ " def get_name(self):\n",
+ " return 'sigmoid'\n"
]
},
{
@@ -162,11 +194,26 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 3,
"metadata": {
"collapsed": false
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "1.0\n",
+ "0.0\n",
+ "0.0744177068753\n",
+ "[ 4.571e-05 1.697e-03 9.877e-01 6.631e-04 1.194e-04 8.880e-04\n",
+ " 1.977e-04 8.671e-03]\n",
+ "[ 4.571e-05 1.697e-03 9.877e-01 6.631e-04 1.194e-04 8.880e-04\n",
+ " 1.977e-04 -9.913e-01]\n",
+ "[-0.089 0.03 0.079 0.011 0.017 0.027]\n"
+ ]
+ }
+ ],
"source": [
"from mlp.layers import Softmax\n",
"\n",
@@ -204,7 +251,15 @@
},
"outputs": [],
"source": [
- "%load -s Softmax mlp/layers.py"
+ "%load -s Softmax mlp/layers.py\n",
+ "1.0\n",
+ "-1.11022302463e-16\n",
+ "0.0744177068753\n",
+ "[ 4.571e-05 1.697e-03 9.877e-01 6.631e-04 1.194e-04 8.880e-04\n",
+ " 1.977e-04 8.671e-03]\n",
+ "[ 4.571e-05 1.697e-03 9.877e-01 6.631e-04 1.194e-04 8.880e-04\n",
+ " 1.977e-04 -9.913e-01]\n",
+ "[-0.089 0.03 0.079 0.011 0.017 0.027]"
]
},
{
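A quick property worth checking for the new `Softmax` layer is that each output row forms a valid probability distribution. A minimal sketch, assuming the `mlp` package from this repository is importable and using a made-up random input batch:

```python
import numpy
from mlp.layers import Softmax

rng = numpy.random.RandomState(1234)
softmax = Softmax(idim=10, odim=8, rng=rng)

# forward-propagate a random batch and check that every row of the
# output is non-negative and sums to one
x = rng.uniform(-1.0, 1.0, size=(5, 10))
y = softmax.fprop(x)
assert numpy.all(y >= 0.0)
assert numpy.allclose(y.sum(axis=1), 1.0)
```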
diff --git a/04_Regularisation.ipynb b/04_Regularisation.ipynb
new file mode 100644
index 0000000..bac175e
--- /dev/null
+++ b/04_Regularisation.ipynb
@@ -0,0 +1,156 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Introduction\n",
+ "\n",
+ "This tutorial focuses on implementation of three reqularisaion techniques, two of them are norm based approaches - L2 and L1 as well as technique called droput, that.\n",
+ "\n",
+ "\n",
+ "## Virtual environments\n",
+ "\n",
+ "Before you proceed onwards, remember to activate your virtual environment:\n",
+ " * If you were in last week's Tuesday or Wednesday group type `activate_mlp` or `source ~/mlpractical/venv/bin/activate`\n",
+ " * If you were in the Monday group:\n",
+ " + and if you have chosen the **comfy** way type: `workon mlpractical`\n",
+ " + and if you have chosen the **generic** way, `source` your virutal environment using `source` and specyfing the path to the activate script (you need to localise it yourself, there were not any general recommendations w.r.t dir structure and people have installed it in different places, usually somewhere in the home directories. If you cannot easily find it by yourself, use something like: `find . -iname activate` ):\n",
+ "\n",
+ "## Syncing the git repository\n",
+ "\n",
+ "Look here for more details. But in short, we recommend to create a separate branch for this lab, as follows:\n",
+ "\n",
+ "1. Enter the mlpractical directory `cd ~/mlpractical/repo-mlp`\n",
+ "2. List the branches and check which is currently active by typing: `git branch`\n",
+ "3. If you have followed recommendations, you should be in the `coursework1` branch, please commit your local changed to the repo index by typing:\n",
+ "```\n",
+ "git commit -am \"stuff I did for the coursework\"\n",
+ "```\n",
+ "4. Now you can switch to `master` branch by typing: \n",
+ "```\n",
+ "git checkout master\n",
+ " ```\n",
+ "5. To update the repository (note, assuming master does not have any conflicts), if there are some, have a look here\n",
+ "```\n",
+ "git pull\n",
+ "```\n",
+ "6. And now, create the new branch & swith to it by typing:\n",
+ "```\n",
+ "git checkout -b lab4\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Regularisation\n",
+ "\n",
+ "Today, we shall build models which can have an arbitrary number of hidden layers. Please have a look at the diagram below, and the corresponding computations (which have an *exact* matrix form as expected by numpy, and row-wise orientation; note that $\\circ$ denotes an element-wise product). In the diagram, we briefly describe how each comptation relates to the code we have provided.\n",
+ "\n",
+ "(1) $E = \\log(\\mathbf{y}|\\mathbf{x}; \\theta) + \\alpha J_{L2}(\\theta) + \\beta J_{L1}(\\theta)$\n",
+ "\n",
+ "## L2 Weight Decay\n",
+ "\n",
+ "(1) $J_{L2}(\\theta) = \\frac{1}{2}||\\theta||^2$\n",
+ "\n",
+ "(1) $\\frac{\\partial J_{L2}}{\\partial\\theta} = \\frac{1}{2}||\\theta||^2$\n",
+ "\n",
+ "## L1 Sparsity \n",
+ "\n",
+ "## Dropout\n",
+ "\n",
+ "Dropout, for a given layer's output $\\mathbf{h}^i \\in \\mathbb{R}^{BxH^l}$ (where $B$ is batch size and $H^l$ is the $l$-th layer output dimensionality) implements the following transformation:\n",
+ "\n",
+ "(1) $\\mathbf{\\hat h}^l = \\mathbf{d}^l\\circ\\mathbf{h}^l$\n",
+ "\n",
+ "where $\\circ$ denotes an elementwise product and $\\mathbf{d}^l \\in \\{0,1\\}^{BxH^i}$ is a matrix in which $d^l_{ij}$ element is sampled from the Bernoulli distribution:\n",
+ "\n",
+ "(2) $d^l_{ij} \\sim \\mbox{Bernoulli}(p^l_d)$\n",
+ "\n",
+ "with $0 30.] = 30.
+ numpy.clip(a, -30.0, 30.0, out=a)
+ h = 1.0/(1 + numpy.exp(-a))
+ return h
+
+ def bprop(self, h, igrads):
+ dsigm = h * (1.0 - h)
+ deltas = igrads * dsigm
+ ___, ograds = super(Sigmoid, self).bprop(h=None, igrads=deltas)
+ return deltas, ograds
+
+    def bprop_cost(self, h, igrads, cost):
+ if cost is None or cost.get_name() == 'bce':
+ return super(Sigmoid, self).bprop(h=h, igrads=igrads)
+ else:
+ raise NotImplementedError('Sigmoid.bprop_cost method not implemented '
+ 'for the %s cost' % cost.get_name())
+
+ def get_name(self):
+ return 'sigmoid'
+
+
+class Softmax(Linear):
+
+    def __init__(self, idim, odim,
+ rng=None,
+ irange=0.1):
+
+ super(Softmax, self).__init__(idim,
+ odim,
+ rng=rng,
+ irange=irange)
+
+ def fprop(self, inputs):
+
+ # compute the linear outputs
+ a = super(Softmax, self).fprop(inputs)
+ # apply numerical stabilisation by subtracting max
+ # from each row (not required for the coursework)
+ # then compute exponent
+ assert a.ndim in [1, 2], (
+ "Expected the linear activation in Softmax layer to be either "
+ "vector or matrix, got %ith dimensional tensor" % a.ndim
+ )
+ axis = a.ndim - 1
+ exp_a = numpy.exp(a - numpy.max(a, axis=axis, keepdims=True))
+ # finally, normalise by the sum within each example
+ y = exp_a/numpy.sum(exp_a, axis=axis, keepdims=True)
+
+ return y
+
+ def bprop(self, h, igrads):
+ raise NotImplementedError()
+
+ def bprop_cost(self, h, igrads, cost):
+
+ if cost is None or cost.get_name() == 'ce':
+ return super(Softmax, self).bprop(h=h, igrads=igrads)
+ else:
+ raise NotImplementedError('Softmax.bprop_cost method not implemented '
+ 'for %s cost' % cost.get_name())
+
+ def get_name(self):
+ return 'softmax'
-
-
-
-
\ No newline at end of file
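A brief note on the numerical stabilisation used in `Softmax.fprop` above: subtracting the row-wise maximum before exponentiation does not change the result, because the common factor cancels in the normalisation, while it keeps the arguments of `exp` non-positive and so avoids overflow. A small standalone check (plain numpy, independent of the `mlp` package):

```python
import numpy


def softmax(a):
    exp_a = numpy.exp(a)
    return exp_a / exp_a.sum(axis=-1, keepdims=True)


def softmax_stable(a):
    # shifting each row by its maximum leaves the ratios unchanged
    # but keeps every exponent argument <= 0
    exp_a = numpy.exp(a - a.max(axis=-1, keepdims=True))
    return exp_a / exp_a.sum(axis=-1, keepdims=True)


a = numpy.random.RandomState(0).uniform(-5.0, 5.0, size=(3, 4))
assert numpy.allclose(softmax(a), softmax_stable(a))
```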
diff --git a/mlp/optimisers.py b/mlp/optimisers.py
index 9d4b947..f03c3cc 100644
--- a/mlp/optimisers.py
+++ b/mlp/optimisers.py
@@ -116,6 +116,7 @@ class SGDOptimiser(Optimiser):
tr_stats, valid_stats = [], []
# do the initial validation
+        # rewind the iterator so the initial validation sees the full training set
+        train_iterator.reset()
tr_nll, tr_acc = self.validate(model, train_iterator)
logger.info('Epoch %i: Training cost (%s) for random model is %.3f. Accuracy is %.2f%%'
% (self.lr_scheduler.epoch, cost_name, tr_nll, tr_acc * 100.))
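For context on this one-line change: the added `reset()` makes the initial validation pass start from the beginning of the training data; an iterator that has already been partially consumed would otherwise yield fewer batches, or none at all. The toy class below only illustrates that reset-before-reuse pattern; it is not the actual `mlp` data provider:

```python
class ToyProvider(object):
    """Minimal epoch-style iterator that must be reset before reuse."""

    def __init__(self, batches):
        self.batches = batches
        self._idx = 0

    def reset(self):
        self._idx = 0

    def __iter__(self):
        return self

    def __next__(self):
        if self._idx >= len(self.batches):
            raise StopIteration()
        batch = self.batches[self._idx]
        self._idx += 1
        return batch

    next = __next__  # alias for the Python 2 iterator protocol


provider = ToyProvider([1, 2, 3])
assert list(provider) == [1, 2, 3]
assert list(provider) == []         # already exhausted
provider.reset()
assert list(provider) == [1, 2, 3]  # reset() makes the data visible again
```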