diff --git a/04_Regularisation.ipynb b/04_Regularisation.ipynb
index d1e4c05..f3e8f25 100644
--- a/04_Regularisation.ipynb
+++ b/04_Regularisation.ipynb
@@ -64,22 +64,31 @@
 "\n",
 "(2) $\n",
 " \\begin{align*}\n",
- " E^n_{L_p}(\mathbf{W}) = \left ( \sum_{i,j \in \mathbf{W}} |w_{i,j}|^p \right )^{\frac{1}{p}}\n",
+ " E^n_{L_p}(\mathbf{W}) = ||\mathbf{W}||_p = \left ( \sum_{i,j \in \mathbf{W}} |w_{i,j}|^p \right )^{\frac{1}{p}}\n",
 "\end{align*}\n",
 "$\n",
 "\n",
- "where $p$ denotes the norm-order (for regularisation either 1 or 2). (TODO: explain here why we usualy skip square root for p=2)\n",
+ "where $p$ denotes the norm order (for regularisation either 1 or 2). Notice that in practice, for computational convenience, we usually compute the squared $L_{p=2}$ norm, which simply omits the square root in (2), that is:\n",
+ "\n",
+ "(3) $\begin{align*}\n",
+ " E^n_{L_{p=2}}(\mathbf{W}) = ||\mathbf{W}||^2_2 = \left ( \left ( \sum_{i,j \in \mathbf{W}} |w_{i,j}|^2 \right )^{\frac{1}{2}} \right )^2 = \sum_{i,j \in \mathbf{W}} |w_{i,j}|^2\n",
+ "\end{align*}\n",
+ "$\n",
 "\n",
 "## $L_{p=2}$ (Weight Decay)\n",
 "\n",
- "(3) $\n",
+ "Our cost with the $L_{2}$ regulariser then becomes (the factor of $\frac{1}{2}$ simplifies the derivative later):\n",
+ "\n",
+ "(4) $\n",
 " \begin{align*}\n",
 " E^n &= \underbrace{E^n_{\text{train}}}_{\text{data term}} + \n",
- " \underbrace{\beta E^n_{L_2}}_{\text{prior term}} = E^n_{\text{train}} + \beta_{L_2} \frac{1}{2}|\mathbf{W}|^2\n",
+ " \underbrace{\beta_{L_2} \frac{1}{2} E^n_{L_2}}_{\text{prior term}}\n",
 "\end{align*}\n",
 "$\n",
 "\n",
- "(4) $\n",
+ "Hence, the gradient of the cost w.r.t. the parameter $w_i$ is given as follows:\n",
+ "\n",
+ "(5) $\n",
 "\begin{align*}\frac{\partial E^n}{\partial w_i} &= \frac{\partial (E^n_{\text{train}} + \beta_{L_2} E_{L_2}) }{\partial w_i} \n",
 " = \left( \frac{\partial E^n_{\text{train}}}{\partial w_i} + \beta_{L_2} \frac{\partial\n",
 " E_{L_2}}{\partial w_i} \right) \n",
@@ -87,7 +96,9 @@
 "\end{align*}\n",
 "$\n",
 "\n",
- "(5) $\n",
+ "And the actual update we make to the parameter $w_i$ is:\n",
+ "\n",
+ "(6) $\n",
 "\begin{align*}\n",
 " \Delta w_i &= -\eta \left( \frac{\partial E^n_{\text{train}}}{\partial w_i} + \beta_{L_2} w_i \right) \n",
 "\end{align*}\n",
 "$\n",
@@ -97,49 +108,54 @@
 "\n",
 "## $L_{p=1}$ (Sparsity)\n",
 "\n",
- "(6) $\n",
+ "Our cost with the $L_{1}$ regulariser then becomes:\n",
+ "\n",
+ "(7) $\n",
 " \begin{align*}\n",
 " E^n &= \underbrace{E^n_{\text{train}}}_{\text{data term}} + \n",
- " \underbrace{\beta E^n_{L_1}}_{\text{prior term}} \n",
- " = E^n_{\text{train}} + \beta_{L_1} |\mathbf{W}|\n",
+ " \underbrace{\beta_{L_1} E^n_{L_1}}_{\text{prior term}} \n",
 "\end{align*}\n",
 "$\n",
 "\n",
- "(7) $\begin{align*}\n",
+ "Hence, the gradient of the cost w.r.t. the parameter $w_i$ is given as follows:\n",
+ "\n",
+ "(8) $\begin{align*}\n",
 " \frac{\partial E^n}{\partial w_i} = \frac{\partial E^n_{\text{train}}}{\partial w_i} + \beta_{L_1} \frac{\partial E_{L_1}}{\partial w_i} = \frac{\partial E^n_{\text{train}}}{\partial w_i} + \beta_{L_1} \mbox{sgn}(w_i)\n",
 "\end{align*}\n",
 "$\n",
 "\n",
- "(8) $\begin{align*}\n",
+ "And the actual update we make to the parameter $w_i$ is:\n",
+ "\n",
+ "(9) $\begin{align*}\n",
 " \Delta w_i &= -\eta \left( \frac{\partial E^n_{\text{train}}}{\partial w_i} + \beta_{L_1} \mbox{sgn}(w_i) \right) \n",
 "\end{align*}$\n",
 "\n",
 "Where $\mbox{sgn}(w_i)$ is the sign of $w_i$: $\mbox{sgn}(w_i) = 1$ if $w_i>0$ and $\mbox{sgn}(w_i) = -1$ if $w_i<0$\n",
 "\n",
- "One can also apply those penalty terms for biases, however, this is usually not necessary as biases have secondary impact on smoothnes of the given solution.\n",
+ "One can also easily apply those penalty terms to the biases; however, this is usually not necessary, as the biases do not affect the smoothness of the solution (given the data).\n",
 "\n",
 "## Dropout\n",
 "\n",
 "Dropout, for a given layer's output $\mathbf{h}^i \in \mathbb{R}^{BxH^l}$ (where $B$ is batch size and $H^l$ is the $l$-th layer output dimensionality) implements the following transformation:\n",
 "\n",
- "(9) $\mathbf{\hat h}^l = \mathbf{d}^l\circ\mathbf{h}^l$\n",
+ "(10) $\mathbf{\hat h}^l = \mathbf{d}^l\circ\mathbf{h}^l$\n",
 "\n",
 "where $\circ$ denotes an elementwise product and $\mathbf{d}^l \in \{0,1\}^{BxH^i}$ is a matrix in which $d^l_{ij}$ element is sampled from the Bernoulli distribution:\n",
 "\n",
- "(10) $d^l_{ij} \sim \mbox{Bernoulli}(p^l_d)$\n",
+ "(11) $d^l_{ij} \sim \mbox{Bernoulli}(p^l_d)$\n",
 "\n",
 "with $0
-        if l2_weight > 0:
-            l2_W_penalty = l2_weight*self.W
-            l2_b_penalty = l2_weight*self.b
-        l1_W_penalty, l1_b_penalty = 0, 0
-        if l1_weight > 0:
-            l1_W_penalty = l1_weight*numpy.sign(self.W)
-            l1_b_penalty = l1_weight*numpy.sign(self.b)
-
-        grad_W = numpy.dot(inputs.T, deltas) + l2_W_penalty + l1_W_penalty
-        grad_b = numpy.sum(deltas, axis=0) + l2_b_penalty + l1_b_penalty
+        grad_W = numpy.dot(inputs.T, deltas)
+        grad_b = numpy.sum(deltas, axis=0)
 
         return [grad_W, grad_b]
@@ -352,7 +343,7 @@ class Softmax(Linear):
         return y
 
     def bprop(self, h, igrads):
-        raise NotImplementedError()
+        raise NotImplementedError('Softmax.bprop not implemented for hidden layer.')
 
     def bprop_cost(self, h, igrads, cost):
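The markdown hunks above spell out the penalised gradients and parameter updates (equations (5)-(6) for $L_2$, (8)-(9) for $L_1$) and the dropout masking (equations (10)-(11)). For reference while reviewing the patch, here is a minimal numpy sketch of those formulas. It is not part of the patch, and every name in it (`data_grad_W`, `beta_l2`, `beta_l1`, `eta`, `p_keep`) is made up for illustration only.

```python
import numpy as np

rng = np.random.RandomState(42)

W = rng.randn(5, 3)              # a weight matrix
data_grad_W = rng.randn(5, 3)    # stand-in for dE_train/dW coming from backprop
beta_l2, beta_l1 = 1e-2, 1e-3    # regularisation strengths
eta = 0.1                        # learning rate

# Penalty gradients: beta_L2 * w (weight decay) and beta_L1 * sgn(w) (sparsity),
# added to the data-term gradient as in equations (5) and (8).
grad_W = data_grad_W + beta_l2 * W + beta_l1 * np.sign(W)

# Gradient-descent update, equations (6) and (9).
W -= eta * grad_W

# Dropout mask for a layer output h of shape (batch, units), equations (10)-(11):
# each mask entry is an independent Bernoulli(p_keep) sample, applied elementwise.
h = rng.randn(4, 3)
p_keep = 0.8
d = rng.binomial(n=1, p=p_keep, size=h.shape)
h_dropped = d * h
```

Note that the code hunk above removes the `l1_weight`/`l2_weight` penalty terms from the gradient computation, so after this patch the penalty gradients would have to be applied elsewhere; the sketch only illustrates the maths, not where the course code hooks it in.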