From 18b36babde4d52e25204089025f4ffcebfea8e79 Mon Sep 17 00:00:00 2001
From: pswietojanski
Date: Sun, 1 Nov 2015 15:50:26 +0000
Subject: [PATCH] lab4

---
 04_Regularisation.ipynb | 180 +++++++++++++++++++++++++++++++++-------
 mlp/dataset.py          |  26 +++++-
 mlp/layers.py           |  31 +++----
 mlp/optimisers.py       |  22 +++--
 4 files changed, 201 insertions(+), 58 deletions(-)

diff --git a/04_Regularisation.ipynb b/04_Regularisation.ipynb
index bac175e..d1e4c05 100644
--- a/04_Regularisation.ipynb
+++ b/04_Regularisation.ipynb
@@ -6,7 +6,7 @@ "source": [
 "# Introduction\n",
 "\n",
- "This tutorial focuses on implementation of three reqularisaion techniques, two of them are norm based approaches - L2 and L1 as well as technique called droput, that.\n",
+ "This tutorial focuses on the implementation of three regularisation techniques: two of them are norm-based approaches which are added to the optimised objective, while the third, called *dropout*, is a form of noise injection that randomly corrupts the information carried by the hidden units during training.\n",
 "\n",
 "\n",
 "## Virtual environments\n",
@@ -23,9 +23,9 @@ "\n",
 "1. Enter the mlpractical directory `cd ~/mlpractical/repo-mlp`\n",
 "2. List the branches and check which is currently active by typing: `git branch`\n",
- "3. If you have followed recommendations, you should be in the `coursework1` branch, please commit your local changed to the repo index by typing:\n",
+ "3. If you have followed our recommendations, you should be in the `coursework1` branch. Please commit your local changes by typing:\n",
 "```\n",
- "git commit -am \"stuff I did for the coursework\"\n",
+ "git commit -am \"finished coursework\"\n",
 "```\n",
 "4. Now you can switch to `master` branch by typing: \n",
 "```\n",
@@ -47,61 +47,133 @@ "source": [
 "# Regularisation\n",
 "\n",
- "Today, we shall build models which can have an arbitrary number of hidden layers. Please have a look at the diagram below, and the corresponding computations (which have an *exact* matrix form as expected by numpy, and row-wise orientation; note that $\circ$ denotes an element-wise product). In the diagram, we briefly describe how each comptation relates to the code we have provided.\n",
+ "Regularisation adds a *complexity term* to the cost function. Its purpose is to put a prior on the model's parameters. The most common prior is perhaps the one which assumes that smoother solutions (those which do not fit the training data too closely) are better, as they are more likely to generalise well to unseen data. \n",
 "\n",
- "(1) $E = \log(\mathbf{y}|\mathbf{x}; \theta) + \alpha J_{L2}(\theta) + \beta J_{L1}(\theta)$\n",
 "\n",
- "## L2 Weight Decay\n",
 "\n",
- "(1) $J_{L2}(\theta) = \frac{1}{2}||\theta||^2$\n",
 "\n",
- "(1) $\frac{\partial J_{L2}}{\partial\theta} = \frac{1}{2}||\theta||^2$\n",
 "\n",
- "## L1 Sparsity \n",
 "\n",
+ "A way to incorporate such a prior into the model is to add a term that penalises certain configurations of the parameters -- either keeping them from growing too large ($L_2$) or preferring solutions that can be modelled with fewer parameters ($L_1$), hence encouraging some parameters to become 0. One can, of course, combine several such priors when optimising the model; however, in this lab we shall use the $L_1$ and/or $L_2$ priors.\n",
 "\n",
 "They can be easily incorporated into the training objective by adding the corresponding additive terms, as follows:\n",
 "\n",
 "(1) $\n",
 " \begin{align*}\n",
 "  E^n &= \underbrace{E^n_{\text{train}}}_{\text{data term}} + \n",
 " \underbrace{\beta_{L_1} E^n_{L_1}}_{\text{prior term}} + \underbrace{\beta_{L_2} E^n_{L_2}}_{\text{prior term}}\n",
 "\end{align*}\n",
 "$\n",
 "\n",
 "where $ E^n_{\text{train}} = - \sum_{k=1}^K t^n_k \ln y^n_k $ is the usual data term, $\beta_{L_1}$ and $\beta_{L_2}$ are non-negative constants specified a priori (hyper-parameters), and $E^n_{L_1}$ and $E^n_{L_2}$ are norm-based penalties expressing certain properties of the parameters:\n",
 "\n",
 "(2) $\n",
 " \begin{align*}\n",
 " E^n_{L_p}(\mathbf{W}) = \left ( \sum_{i,j \in \mathbf{W}} |w_{i,j}|^p \right )^{\frac{1}{p}}\n",
 "\end{align*}\n",
 "$\n",
 "\n",
 "where $p$ denotes the norm order (for regularisation, either 1 or 2). For $p=2$ the square root is usually skipped and the penalty is scaled by $\frac{1}{2}$ instead: the squared norm penalises the same configurations (the square root is monotonic) but yields the much simpler gradient $\partial E_{L_2}/\partial w_i = w_i$, which is what we use below.\n",
 "\n",
 "## $L_{p=2}$ (Weight Decay)\n",
 "\n",
 "(3) $\n",
 " \begin{align*}\n",
 " E^n &= \underbrace{E^n_{\text{train}}}_{\text{data term}} + \n",
 " \underbrace{\beta_{L_2} E^n_{L_2}}_{\text{prior term}} = E^n_{\text{train}} + \beta_{L_2} \frac{1}{2}\|\mathbf{W}\|^2_2\n",
 "\end{align*}\n",
 "$\n",
 "\n",
 "(4) $\n",
 "\begin{align*}\frac{\partial E^n}{\partial w_i} &= \frac{\partial (E^n_{\text{train}} + \beta_{L_2} E_{L_2}) }{\partial w_i} \n",
 " = \frac{\partial E^n_{\text{train}}}{\partial w_i} + \beta_{L_2} \frac{\partial E_{L_2}}{\partial w_i} \n",
 " = \frac{\partial E^n_{\text{train}}}{\partial w_i} + \beta_{L_2} w_i\n",
 "\end{align*}\n",
 "$\n",
 "\n",
 "(5) $\n",
 "\begin{align*}\n",
 " \Delta w_i &= -\eta \left( \frac{\partial E^n_{\text{train}}}{\partial w_i} + \beta_{L_2} w_i \right) \n",
 "\end{align*}\n",
 "$\n",
 "\n",
 "where $\eta$ is the learning rate.\n",
 "\n",
 "## $L_{p=1}$ (Sparsity)\n",
 "\n",
 "(6) $\n",
 " \begin{align*}\n",
 " E^n &= \underbrace{E^n_{\text{train}}}_{\text{data term}} + \n",
 " \underbrace{\beta_{L_1} E^n_{L_1}}_{\text{prior term}} \n",
 " = E^n_{\text{train}} + \beta_{L_1} \|\mathbf{W}\|_1\n",
 "\end{align*}\n",
 "$\n",
 "\n",
 "(7) $\begin{align*}\n",
 " \frac{\partial E^n}{\partial w_i} = \frac{\partial E^n_{\text{train}}}{\partial w_i} + \beta_{L_1} \frac{\partial E_{L_1}}{\partial w_i} = \frac{\partial E^n_{\text{train}}}{\partial w_i} + \beta_{L_1} \mbox{sgn}(w_i)\n",
 "\end{align*}\n",
 "$\n",
 "\n",
 "(8) $\begin{align*}\n",
 " \Delta w_i &= -\eta \left( \frac{\partial E^n_{\text{train}}}{\partial w_i} + \beta_{L_1} \mbox{sgn}(w_i) \right) \n",
 "\end{align*}$\n",
 "\n",
 "where $\mbox{sgn}(w_i)$ is the sign of $w_i$: $\mbox{sgn}(w_i) = 1$ if $w_i>0$, $\mbox{sgn}(w_i) = -1$ if $w_i<0$, and $\mbox{sgn}(w_i) = 0$ if $w_i=0$.\n",
 "\n",
 "One can also apply these penalty terms to the biases; however, this is usually not necessary, as the biases have only a secondary impact on the smoothness of the solution.\n",
 "\n",
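 "As a quick illustration of how the two penalties enter a parameter update, here is a minimal `numpy` sketch of equations (4), (5), (7) and (8). It is not the `mlp` framework interface; the names `regularised_weight_update`, `grad_train`, `beta_l1` and `beta_l2` are illustrative placeholders.\n",
 "\n",
 "```python\n",
 "import numpy as np\n",
 "\n",
 "def regularised_weight_update(W, grad_train, learning_rate=0.1, beta_l1=0.0, beta_l2=0.0):\n",
 "    # dE_L2/dw_i = w_i (eq. 4) and dE_L1/dw_i = sgn(w_i) (eq. 7)\n",
 "    grad = grad_train + beta_l2 * W + beta_l1 * np.sign(W)\n",
 "    # Delta w_i = -eta * (dE_train/dw_i + beta_L2 w_i + beta_L1 sgn(w_i)), eqs. (5) and (8)\n",
 "    return W - learning_rate * grad\n",
 "\n",
 "# Toy usage: two weights and a made-up data-term gradient.\n",
 "W = np.array([0.5, -2.0])\n",
 "grad_train = np.array([0.1, -0.3])\n",
 "print(regularised_weight_update(W, grad_train, beta_l1=0.01, beta_l2=0.001))\n",
 "```\n",
 "\n",
 "Setting either coefficient to zero recovers the corresponding unregularised update, which is a convenient way to sanity-check an implementation.\n",
 "\n",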
 "## Dropout\n",
 "\n",
 "Dropout, for a given layer's output $\mathbf{h}^l \in \mathbb{R}^{B\times H^l}$ (where $B$ is the batch size and $H^l$ is the dimensionality of the $l$-th layer's output) implements the following transformation:\n",
 "\n",
 "(9) $\mathbf{\hat h}^l = \mathbf{d}^l\circ\mathbf{h}^l$\n",
 "\n",
 "where $\circ$ denotes an element-wise product and $\mathbf{d}^l \in \{0,1\}^{B\times H^l}$ is a matrix whose elements $d^l_{ij}$ are sampled from the Bernoulli distribution:\n",
 "\n",
 "(10) $d^l_{ij} \sim \mbox{Bernoulli}(p^l_d)$\n",
 "\n",
 "with $0