From faaa6cb172530d8746a6c980de1d81a380e36cf5 Mon Sep 17 00:00:00 2001
From: pswietojanski
Date: Sun, 1 Nov 2015 19:24:35 +0000
Subject: [PATCH] lab 4
---
04_Regularisation.ipynb | 56 ++++++++++++++++++++++++++---------------
mlp/layers.py | 15 +++--------
2 files changed, 39 insertions(+), 32 deletions(-)
diff --git a/04_Regularisation.ipynb b/04_Regularisation.ipynb
index d1e4c05..f3e8f25 100644
--- a/04_Regularisation.ipynb
+++ b/04_Regularisation.ipynb
@@ -64,22 +64,31 @@
"\n",
"(2) $\n",
" \\begin{align*}\n",
- " E^n_{L_p}(\\mathbf{W}) = \\left ( \\sum_{i,j \\in \\mathbf{W}} |w_{i,j}|^p \\right )^{\\frac{1}{p}}\n",
+ " E^n_{L_p}(\\mathbf{W}) = ||\\mathbf{W}||_p = \\left ( \\sum_{i,j \\in \\mathbf{W}} |w_{i,j}|^p \\right )^{\\frac{1}{p}}\n",
"\\end{align*}\n",
"$\n",
"\n",
- "where $p$ denotes the norm-order (for regularisation either 1 or 2). (TODO: explain here why we usualy skip square root for p=2)\n",
+ "where $p$ denotes the norm-order (for regularisation either 1 or 2). Notice, in practice for computational purposes we will rather compute squared $L_{p=2}$ norm, which omits the square root in (2), that is:\n",
+ "\n",
+ "(3)$ \\begin{align*}\n",
+ " E^n_{L_{p=2}}(\\mathbf{W}) = ||\\mathbf{W}||^2_2 = \\left ( \\left ( \\sum_{i,j \\in \\mathbf{W}} |w_{i,j}|^2 \\right )^{\\frac{1}{2}} \\right )^2 = \\sum_{i,j \\in \\mathbf{W}} |w_{i,j}|^2\n",
+ "\\end{align*}\n",
+ "$\n",
"\n",
"## $L_{p=2}$ (Weight Decay)\n",
"\n",
- "(3) $\n",
+ "Our cost with $L_{2}$ regulariser then becomes ($\\frac{1}{2}$ simplifies a derivative later):\n",
+ "\n",
+ "(4) $\n",
" \\begin{align*}\n",
" E^n &= \\underbrace{E^n_{\\text{train}}}_{\\text{data term}} + \n",
- " \\underbrace{\\beta E^n_{L_2}}_{\\text{prior term}} = E^n_{\\text{train}} + \\beta_{L_2} \\frac{1}{2}|\\mathbf{W}|^2\n",
+ " \\underbrace{\\beta_{L_2} \\frac{1}{2} E^n_{L_2}}_{\\text{prior term}}\n",
"\\end{align*}\n",
"$\n",
"\n",
- "(4) $\n",
+ "Hence, the gradient of the cost w.r.t parameter $w_i$ is given as follows:\n",
+ "\n",
+ "(5) $\n",
"\\begin{align*}\\frac{\\partial E^n}{\\partial w_i} &= \\frac{\\partial (E^n_{\\text{train}} + \\beta_{L_2} E_{L_2}) }{\\partial w_i} \n",
" = \\left( \\frac{\\partial E^n_{\\text{train}}}{\\partial w_i} + \\beta_{L_2} \\frac{\\partial\n",
" E_{L_2}}{\\partial w_i} \\right) \n",
@@ -87,7 +96,9 @@
"\\end{align*}\n",
"$\n",
"\n",
- "(5) $\n",
+ "And the actual update we to the $W_i$ parameter is:\n",
+ "\n",
+ "(6) $\n",
"\\begin{align*}\n",
" \\Delta w_i &= -\\eta \\left( \\frac{\\partial E^n_{\\text{train}}}{\\partial w_i} + \\beta_{L_2} w_i \\right) \n",
"\\end{align*}\n",
@@ -97,49 +108,54 @@
"\n",
"## $L_{p=1}$ (Sparsity)\n",
"\n",
- "(6) $\n",
+ "Our cost with $L_{1}$ regulariser then becomes:\n",
+ "\n",
+ "(7) $\n",
" \\begin{align*}\n",
" E^n &= \\underbrace{E^n_{\\text{train}}}_{\\text{data term}} + \n",
- " \\underbrace{\\beta E^n_{L_1}}_{\\text{prior term}} \n",
- " = E^n_{\\text{train}} + \\beta_{L_1} |\\mathbf{W}|\n",
+ " \\underbrace{\\beta_{L_1} E^n_{L_1}}_{\\text{prior term}} \n",
"\\end{align*}\n",
"$\n",
"\n",
- "(7) $\\begin{align*}\n",
+ "Hence, the gradient of the cost w.r.t parameter $w_i$ is given as follows:\n",
+ "\n",
+ "(8) $\\begin{align*}\n",
" \\frac{\\partial E^n}{\\partial w_i} = \\frac{\\partial E^n_{\\text{train}}}{\\partial w_i} + \\beta_{L_1} \\frac{\\partial E_{L_1}}{\\partial w_i} = \\frac{\\partial E^n_{\\text{train}}}{\\partial w_i} + \\beta_{L_1} \\mbox{sgn}(w_i)\n",
"\\end{align*}\n",
"$\n",
"\n",
- "(8) $\\begin{align*}\n",
+ "And the actual update we to the $W_i$ parameter is:\n",
+ "\n",
+ "(9) $\\begin{align*}\n",
" \\Delta w_i &= -\\eta \\left( \\frac{\\partial E^n_{\\text{train}}}{\\partial w_i} + \\beta_{L_1} \\mbox{sgn}(w_i) \\right) \n",
"\\end{align*}$\n",
"\n",
"Where $\\mbox{sgn}(w_i)$ is the sign of $w_i$: $\\mbox{sgn}(w_i) = 1$ if $w_i>0$ and $\\mbox{sgn}(w_i) = -1$ if $w_i<0$\n",
"\n",
- "One can also apply those penalty terms for biases, however, this is usually not necessary as biases have secondary impact on smoothnes of the given solution.\n",
+ "One can also easily apply those penalty terms for biases, however, this is usually not necessary as biases do not affect the smoothness of the solution (given data).\n",
"\n",
"## Dropout\n",
"\n",
"Dropout, for a given layer's output $\\mathbf{h}^i \\in \\mathbb{R}^{BxH^l}$ (where $B$ is batch size and $H^l$ is the $l$-th layer output dimensionality) implements the following transformation:\n",
"\n",
- "(9) $\\mathbf{\\hat h}^l = \\mathbf{d}^l\\circ\\mathbf{h}^l$\n",
+ "(10) $\\mathbf{\\hat h}^l = \\mathbf{d}^l\\circ\\mathbf{h}^l$\n",
"\n",
"where $\\circ$ denotes an elementwise product and $\\mathbf{d}^l \\in \\{0,1\\}^{BxH^i}$ is a matrix in which $d^l_{ij}$ element is sampled from the Bernoulli distribution:\n",
"\n",
- "(10) $d^l_{ij} \\sim \\mbox{Bernoulli}(p^l_d)$\n",
+ "(11) $d^l_{ij} \\sim \\mbox{Bernoulli}(p^l_d)$\n",
"\n",
"with $0 0:
diff --git a/mlp/layers.py b/mlp/layers.py
--- a/mlp/layers.py
+++ b/mlp/layers.py
-        if l2_weight > 0:
-            l2_W_penalty = l2_weight*self.W
-            l2_b_penalty = l2_weight*self.b
-        l1_W_penalty, l1_b_penalty = 0, 0
-        if l1_weight > 0:
-            l1_W_penalty = l1_weight*numpy.sign(self.W)
-            l1_b_penalty = l1_weight*numpy.sign(self.b)
-
-        grad_W = numpy.dot(inputs.T, deltas) + l2_W_penalty + l1_W_penalty
-        grad_b = numpy.sum(deltas, axis=0) + l2_b_penalty + l1_b_penalty
+        grad_W = numpy.dot(inputs.T, deltas)
+        grad_b = numpy.sum(deltas, axis=0)
         return [grad_W, grad_b]
@@ -352,7 +343,7 @@ class Softmax(Linear):
         return y

     def bprop(self, h, igrads):
-        raise NotImplementedError()
+        raise NotImplementedError('Softmax.bprop not implemented for hidden layer.')

     def bprop_cost(self, h, igrads, cost):