|
|
|
|
|
|
|
\section{Introduction to Neural Networks}
|
|
|
|
|
|
|
|
Neural networks (NN) are mathematical constructs inspired by the way
neurons are connected in nature. A neural network consists of an input
and an output layer with an arbitrary number of hidden layers between
them. Each layer consists of a number of neurons (nodes), where the
number of nodes in the input and output layers corresponds to the
dimension of the input and output, respectively.\par
|
|
|
|
Each neuron receives the outputs of all neurons in the previous layer,
except for the neurons in the input layer, which receive the components
of the input.
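
One way to make this structure concrete is to write a fully connected
network as a composition of layer mappings. The following is only a
schematic sketch, with weight matrices $W^{(k)}$ and bias vectors
$b^{(k)}$ introduced here for illustration:
\[
  \mathcal{NN}(x) = W^{(l+1)} \sigma\left( W^{(l)} \sigma\left( \cdots
  \sigma\left( W^{(1)} x + b^{(1)} \right) \cdots \right) + b^{(l)}
  \right) + b^{(l+1)},
\]
where $x \in \mathbb{R}^{d_i}$ is the input, the result lies in
$\mathbb{R}^{d_o}$, $l$ is the number of hidden layers and $\sigma$
denotes the activation function applied componentwise in each neuron
(cf.\ the following subsection).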
|
|
|
|
|
|
|
|
\tikzset{%
|
|
|
|
every neuron/.style={
|
|
|
|
circle,
|
|
|
|
draw,
|
|
|
|
minimum size=1cm
|
|
|
|
},
|
|
|
|
neuron missing/.style={
|
|
|
|
draw=none,
|
|
|
|
scale=1.5,
|
|
|
|
text height=0.333cm,
|
|
|
|
execute at begin node=\color{black}$\vdots$
|
|
|
|
},
|
|
|
|
}
|
|
|
|
\begin{figure}[h!]
|
|
|
|
\centering
|
|
|
|
|
|
|
|
% \fbox{
|
|
|
|
|
|
|
|
\resizebox{\textwidth}{!}{%
|
|
|
|
\begin{tikzpicture}[x=1.75cm, y=1.75cm, >=stealth]
|
|
|
|
\tikzset{myptr/.style={decoration={markings,mark=at position 1 with %
|
|
|
|
{\arrow[scale=1.5,>=stealth]{>}}},postaction={decorate}}}
|
|
|
|
|
|
|
|
\foreach \m/\l [count=\y] in {1,2,3,missing,4}
|
|
|
|
\node [every neuron/.try, neuron \m/.try] (input-\m) at (0,2.5-\y) {};
|
|
|
|
|
|
|
|
\foreach \m [count=\y] in {1,missing,2}
|
|
|
|
\node [every neuron/.try, neuron \m/.try ] (hidden1-\m) at (2,2-\y*1.25) {};
|
|
|
|
|
|
|
|
\foreach \m [count=\y] in {1,missing,2}
|
|
|
|
\node [every neuron/.try, neuron \m/.try ] (hidden2-\m) at (5,2-\y*1.25) {};
|
|
|
|
|
|
|
|
\foreach \m [count=\y] in {1,missing,2}
|
|
|
|
\node [every neuron/.try, neuron \m/.try ] (output-\m) at (7,1.5-\y) {};
|
|
|
|
|
|
|
|
\foreach \l [count=\i] in {1,2,3,d_i}
|
|
|
|
\draw [myptr] (input-\i)+(-1,0) -- (input-\i)
|
|
|
|
node [above, midway] {$x_{\l}$};
|
|
|
|
|
|
|
|
\foreach \l [count=\i] in {1,n_1}
|
|
|
|
\node [above] at (hidden1-\i.north) {$\mathcal{N}_{1,\l}$};
|
|
|
|
|
|
|
|
\foreach \l [count=\i] in {1,n_l}
|
|
|
|
\node [above] at (hidden2-\i.north) {$\mathcal{N}_{l,\l}$};
|
|
|
|
|
|
|
|
\foreach \l [count=\i] in {1,d_o}
|
|
|
|
\draw [myptr] (output-\i) -- ++(1,0)
|
|
|
|
node [above, midway] {$O_{\l}$};
|
|
|
|
|
|
|
|
\foreach \i in {1,...,4}
|
|
|
|
\foreach \j in {1,...,2}
|
|
|
|
\draw [myptr] (input-\i) -- (hidden1-\j);
|
|
|
|
|
|
|
|
\foreach \i in {1,...,2}
|
|
|
|
\foreach \j in {1,...,2}
|
|
|
|
\draw [myptr] (hidden1-\i) -- (hidden2-\j);
|
|
|
|
|
|
|
|
\foreach \i in {1,...,2}
|
|
|
|
\foreach \j in {1,...,2}
|
|
|
|
\draw [myptr] (hidden2-\i) -- (output-\j);
|
|
|
|
|
|
|
|
\node [align=center, above] at (0,2) {Input\\layer};
|
|
|
|
\node [align=center, above] at (2,2) {Hidden \\layer $1$};
|
|
|
|
\node [align=center, above] at (5,2) {Hidden \\layer $l$};
|
|
|
|
\node [align=center, above] at (7,2) {Output \\layer};
|
|
|
|
|
|
|
|
\node[fill=white,scale=1.5,inner xsep=10pt,inner ysep=10mm] at ($(hidden1-1)!.5!(hidden2-2)$) {$\dots$};
|
|
|
|
|
|
|
|
\end{tikzpicture}}%}
|
|
|
|
\caption{Illustration of a neural network with $d_i$ inputs, $l$
|
|
|
|
hidden layers with $n_{\cdot}$ nodes in each layer, as well as
|
|
|
|
$d_o$ outputs.
|
|
|
|
}
|
|
|
|
\end{figure}
|
|
|
|
|
|
|
|
\subsection{Nonlinearity of Neural Networks}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
\begin{figure}
|
|
|
|
\begin{tikzpicture}[x=1.5cm, y=1.5cm, >=stealth]
|
|
|
|
|
|
|
|
|
|
|
|
\tikzset{myptr/.style={decoration={markings,mark=at position 1 with %
|
|
|
|
{\arrow[scale=1.5,>=stealth]{>}}},postaction={decorate}}}
|
|
|
|
|
|
|
|
\node [circle, draw, fill=black, inner sep = 0pt, minimum size =
|
|
|
|
1.5mm, left] (i_1) at (0, 2.5) {};
|
|
|
|
\node [align=left, left] at (-0.125, 2.5) {\(i_1\)};
|
|
|
|
\node [circle, draw, fill=black, inner sep = 0pt, minimum size =
|
|
|
|
1.5mm] (i_2) at (0, 1.25) {};
|
|
|
|
\node [align=left, left] at (-0.125, 1.25) {\(i_2\)};
|
|
|
|
\node [neuron missing] (i_3) at (0, 0) {};
|
|
|
|
\node [circle, draw, fill=black, inner sep = 0pt, minimum size =
|
|
|
|
1.5mm] (i_4) at (0, -1.25) {};
|
|
|
|
\node [align=left, left] at (-0.125, -1.25) {\(i_m\)};
|
|
|
|
\draw[decoration={calligraphic brace,amplitude=5pt, mirror}, decorate, line width=1.25pt]
|
|
|
|
(-0.6,2.7) -- (-0.6,-1.45) node [black, midway, xshift=-0.6cm, left] {Inputs};
|
|
|
|
|
|
|
|
\node [align = center, above] at (1.25, 3) {Synaptic\\weights};
|
|
|
|
\node [every neuron] (w_1) at (1.25, 2.5) {\(w_{k, 1}\)};
|
|
|
|
\node [every neuron] (w_2) at (1.25, 1.25) {\(w_{k, 2}\)};
|
|
|
|
\node [neuron missing] (w_3) at (1.25, 0) {};
|
|
|
|
\node [every neuron] (w_4) at (1.25, -1.25) {\(w_{k, m}\)};
|
|
|
|
|
|
|
|
\node [circle, draw] (sig) at (3, 0.625) {\Large\(\sum\)};
|
|
|
|
\node [align = center, below] at (3, 0) {Summing \\junction};
|
|
|
|
|
|
|
|
\node [draw, minimum size = 1.25cm] (act) at (4.5, 0.625)
|
|
|
|
{\(\sigma(.)\)};
|
|
|
|
\node [align = center, above] at (4.5, 1.25) {Activation \\function};
|
|
|
|
|
|
|
|
\node [circle, draw, fill=black, inner sep = 0pt, minimum size =
|
|
|
|
1.5mm] (b) at (3, 2.5) {};
|
|
|
|
\node [align = center, above] at (3, 2.75) {Bias \\\(b_k\)};
|
|
|
|
|
|
|
|
\node [align = center] (out) at (6, 0.625) {Output \\\(o_k\)};
|
|
|
|
|
|
|
|
|
|
|
|
\draw [myptr] (i_1) -- (w_1);
|
|
|
|
\draw [myptr] (i_2) -- (w_2);
|
|
|
|
\draw [myptr] (i_4) -- (w_4);
|
|
|
|
|
|
|
|
\draw [myptr] (w_1) -- (sig);
|
|
|
|
\draw [myptr] (w_2) -- (sig);
|
|
|
|
\draw [myptr] (w_4) -- (sig);
|
|
|
|
|
|
|
|
\draw [myptr] (b) -- (sig);
|
|
|
|
|
|
|
|
\draw [myptr] (sig) -- (act);
|
|
|
|
|
|
|
|
\draw [myptr] (act) -- (out);
|
|
|
|
|
|
|
|
% \foreach \m [count=\y] in {1,2,missing,3,4}
|
|
|
|
% \node [every neuron/.try, neuron \m/.try ] (hidden-\m) at (1.25,3.25-\y*1.25) {\(w_{k,\y}\)};
|
|
|
|
|
|
|
|
% \foreach \m [count=\y] in {1}
|
|
|
|
% \node [every neuron/.try, neuron \m/.try ] (output-\m) at (2.5,0.5-\y) {};
|
|
|
|
|
|
|
|
% \foreach \l [count=\i] in {1}
|
|
|
|
% \draw [<-] (input-\i) -- ++(-1,0)
|
|
|
|
% node [above, midway] {$x$};
|
|
|
|
|
|
|
|
% \foreach \l [count=\i] in {1,2,n-1,n}
|
|
|
|
% \node [above] at (hidden-\i.north) {$\mathcal{N}_{\l}$};
|
|
|
|
|
|
|
|
% \foreach \l [count=\i] in {1,n_l}
|
|
|
|
% \node [above] at (output-\i.north) {};
|
|
|
|
|
|
|
|
% \foreach \l [count=\i] in {1}
|
|
|
|
% \draw [->] (output-\i) -- ++(1,0)
|
|
|
|
% node [above, midway] {$y$};
|
|
|
|
|
|
|
|
% \foreach \i in {1}
|
|
|
|
% \foreach \j in {1,2,...,3,4}
|
|
|
|
% \draw [->] (input-\i) -- (hidden-\j);
|
|
|
|
|
|
|
|
% \foreach \i in {1,2,...,3,4}
|
|
|
|
% \foreach \j in {1}
|
|
|
|
% \draw [->] (hidden-\i) -- (output-\j);
|
|
|
|
|
|
|
|
\end{tikzpicture}
|
|
|
|
\caption{Structure of a single neuron}
|
|
|
|
\end{figure}
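
The figure above translates into the following computation for a single
neuron $k$, restated here briefly in the notation of the figure: the
inputs $i_1, \dots, i_m$ are weighted by the synaptic weights
$w_{k,1}, \dots, w_{k,m}$, summed up together with the bias $b_k$ and
passed through the activation function $\sigma$,
\[
  o_k = \sigma\left( \sum_{j=1}^{m} w_{k,j}\, i_j + b_k \right).
\]
As the weighted sum itself is an affine function of the inputs, it is
the activation function $\sigma$ (for example a sigmoid function) that
introduces the nonlinearity of the network; without it, any composition
of layers would collapse into a single affine mapping.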
|
|
|
|
|
|
|
|
\clearpage
|
|
|
|
\subsection{Training Neural Networks}
|
|
|
|
|
|
|
|
After a neural network model is designed it has to be fitted to the
data, as is the case for most statistical models. In the machine
learning context this is often called ``training'': due to the
complexity and the number of variables in these models, they are fitted
to the data iteratively, ``learning'' the properties of the data better
with each iteration.
|
|
|
|
|
|
|
|
There are two main categories of machine learning models: supervised
and unsupervised learners. Unsupervised learners discover structure in
the data without outside guidance (such as labeling the data beforehand
for training); popular examples of this are clustering
algorithms\todo{citation}. Supervised learners, on the other hand, are,
as the name suggests, supervised during learning. This generally amounts
to fitting the model on data with the expected response (label) attached
to each data point, where usually some distance between the model output
and the labels is minimized.
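
Schematically, supervised training of a model $f_\theta$ on labeled
data $(x_i, y_i),\, i = 1, \dots, n$ can thus be written as the
minimization problem
\[
  \hat{\theta} \in \arg\min_{\theta} \sum_{i=1}^{n}
  d\left(f_\theta(x_i), y_i\right)
\]
for some distance (or loss) function $d$; concrete choices of $d$ are
discussed in the section on error measurement below.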
|
|
|
|
|
|
|
|
\subsubsection{Interpreting the Output / Classification vs.\ Regression /
  Nonlinearity in the Last Layer}
|
|
|
|
|
|
|
|
Given the nature of the neural network, the outputs of the last layer
are real numbers. For regression tasks this is desirable; for
classification problems, however, some transformation might be
necessary.
As the goal in the latter is to predict one or more classes for an
object, the output needs to be of a form that allows this
interpretation.
Commonly each node in the output layer corresponds to a class, and the
class chosen as prediction is the one with the highest value at the
corresponding output node.
The naive transformation to achieve this is to turn the output vector
$o$ into a one-hot vector
|
|
|
|
\[
|
|
|
|
\text{pred}_i =
|
|
|
|
\begin{cases}
|
|
|
|
1,& \text{if } o_i = \max_j o_j \\
|
|
|
|
0,& \text{else}.
|
|
|
|
\end{cases}
|
|
|
|
\]
|
|
|
|
This, however, makes training the model with gradient-based methods
impossible, as the derivative of the transformation is either zero or
undefined.
A continuous transformation that approximates this argmax behaviour is
given by the softmax function
|
|
|
|
\[
|
|
|
|
\text{softmax}(o)_i = \frac{e^{o_i}}{\sum_j e^{o_j}}.
|
|
|
|
\]
|
|
|
|
The softmax function maps the output values to the interval $[0,1]$,
and the individual values sum to one; thus the output can be interpreted
as a probability for each class given the input.
In addition to being differentiable, this allows evaluating the
certainty of a prediction, rather than just whether it is accurate.
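
As a small numerical illustration (an example constructed here purely
for demonstration), consider the output vector $o = (2, 1, 0.1)$ for a
problem with three classes. With $e^{2} \approx 7.39$, $e^{1} \approx
2.72$ and $e^{0.1} \approx 1.11$ the softmax transformation yields
\[
  \text{softmax}(o) \approx (0.66,\, 0.24,\, 0.10),
\]
so the first class is predicted with an estimated probability of about
$66\%$, whereas the argmax transformation above would only return
$(1, 0, 0)$.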
|
|
|
|
|
|
|
|
\todo{maybe additive invariance}
|
|
|
|
% Another property that makes softmax attractive is the invariance to addition
|
|
|
|
% \[
|
|
|
|
% \text{sofmax}(o) = \text{softmax}(o + c
|
|
|
|
% \]
|
|
|
|
|
|
|
|
|
|
|
|
% In order to properly interpret the output of a neural network and
|
|
|
|
% training it, depending on the problem it might be advantageous to
|
|
|
|
% transform the output form the last layer. Given the nature of the
|
|
|
|
% neural network the value at each output node is a real number. This is
|
|
|
|
% desirable for applications where the desired output is a real numbered
|
|
|
|
% vector (e.g. steering inputs for a autonomous car), however for
|
|
|
|
% classification problems it is desirable to transform this
|
|
|
|
% output. Often classification problems are modeled in such a way that
|
|
|
|
% each output node corresponds to a class. Then the output vector needs
|
|
|
|
% to be normalized in order to give a prediction. The naive approach is
|
|
|
|
% to transform the output vector $o$ into a one-hot vector $p$
|
|
|
|
% corresponding to a $0$
|
|
|
|
% entry for all classes except one, which is the predicted class.
|
|
|
|
|
|
|
|
% \[
|
|
|
|
% p_i =
|
|
|
|
% \begin{cases}
|
|
|
|
% 1,& i < j, \forall i,j \in \text{arg}\max o_i, \\
|
|
|
|
% 0,& \text{else.}
|
|
|
|
% \end{cases}
|
|
|
|
% \]\todo{besser formulieren}
|
|
|
|
|
|
|
|
% However this imposes difficulties in training the network as with this
|
|
|
|
% addition the model is no longer differentiable which imitates the
|
|
|
|
% ways the model can be trained. Additionally information about the
|
|
|
|
% ``certainty'' for each class in the prediction gets lost. A popular
|
|
|
|
% way to circumvent this problem is to normalize the output vector is
|
|
|
|
% such a way that the entries add up to one, this allows for the
|
|
|
|
% interpretation of probabilities assigned to each class.
|
|
|
|
|
|
|
|
\subsubsection{Error Measurement}
|
|
|
|
|
|
|
|
In order to assess the quality of a network $\mathcal{NN}$ and to train
it, we need to discuss how we measure error. The choice of the error
function depends strongly on the type of problem. For regression
problems a commonly used error measure is the mean squared error (MSE),
which for a function $f$ and data $(x_i, y_i),\, i = 1, \dots, n$ is
given by
\[
  MSE(f) = \frac{1}{n} \sum_{i=1}^n \left(f(x_i) - y_i\right)^2.
\]
|
|
|
|
However, depending on the problem, error measures with different
properties might be needed; for example, in some contexts, as is common
in time series models, a proportional rather than an absolute error has
to be considered.\todo{reword}
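
One measure of this kind is, for instance, the mean absolute percentage
error (stated here only as an illustration of a proportional error
measure),
\[
  MAPE(f) = \frac{1}{n} \sum_{i=1}^n
  \left| \frac{f(x_i) - y_i}{y_i} \right|,
\]
which relates the deviation at each data point to the magnitude of the
true value $y_i$ (and hence requires $y_i \neq 0$).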
|
|
|
|
|
|
|
|
As discussed above, the output of a neural network for a classification
problem can be interpreted as a probability distribution over the
classes, conditioned on the input. In this case it can be desirable to
use error functions designed to compare probability distributions. A
widespread error function for this use case is the cross entropy
(\textcite{PRML}), which for two discrete distributions $p, q$ over the
same set of classes $C$ is given by
\[
  H(p, q) = \sum_{c \in C} p(c) \ln\left(\frac{1}{q(c)}\right)
\]
and compares a distribution $q$ to a true underlying distribution $p$.
|
|
|
|
For a data set $(x_i, y_i),\, i = 1, \dots, n$, where each entry
$y_{i,c}$ corresponds to the probability of class $c$ given $x_i$, and a
predictor $f$ we get the loss function
\[
  L(f) = \sum_{i=1}^n H(y_i, f(x_i)).
\]
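
To give an idea of how this loss behaves, consider a toy example (chosen
here only for illustration) with a single observation whose true label
is the one-hot vector $y_i = (1, 0, 0)$. A correct and confident
prediction $f(x_i) = (0.7, 0.2, 0.1)$ contributes
$H(y_i, f(x_i)) = \ln(1/0.7) \approx 0.36$ to the loss, while a
confidently wrong prediction $f(x_i) = (0.1, 0.2, 0.7)$ contributes
$\ln(1/0.1) \approx 2.30$; predictions that place little probability on
the true class are thus penalized heavily.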
|
|
|
|
|
|
|
|
\todo{Maximum likelihood; derivative with softmax is pseudo-linear
  $\rightarrow$ fast improvements possible}
|
|
|
|
|
|
|
|
\subsubsection{Gradient Descent Algorithm}
|
|
|
|
|
|
|
|
When trying to fit a neural network it is hard to predict the impact of
the individual parameters on the accuracy of the output. Thus, applying
numerical optimization algorithms is the only feasible way to fit the
model. An attractive algorithm for training neural networks is gradient
descent, where each parameter $\theta_i$ is changed iteratively
according to the gradient of the error measure and a step size
$\gamma$. For this, all parameters are initialized (often randomly or
close to zero) and then updated iteratively until a certain termination
criterion is met, most commonly either a fixed number of iterations or a
desired upper bound on the error measure.
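
As a small illustration of this update scheme, consider a toy example
(constructed here only for demonstration) of minimizing
$f(\theta) = (\theta - 2)^2$ over a single parameter with step size
$\gamma = 0.25$ and starting value $\theta^0 = 0$. With
$f'(\theta) = 2(\theta - 2)$ the updates read
\[
  \theta^1 = 0 - 0.25 \cdot (-4) = 1, \quad
  \theta^2 = 1 - 0.25 \cdot (-2) = 1.5, \quad
  \theta^3 = 1.5 - 0.25 \cdot (-1) = 1.75, \quad \dots
\]
halving the distance to the minimizer $\theta = 2$ in every step. In a
neural network the same update rule is applied simultaneously to all
weights and biases; the general procedure is formalized in
Algorithm~\ref{alg:gd}.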
|
|
|
|
% For a function $f_\theta$ with parameters $\theta \in \mathbb{R}^n$
|
|
|
|
% and a error function $L(f_\theta)$ the gradient descent algorithm is
|
|
|
|
% given in \ref{alg:gd}.
|
|
|
|
|
|
|
|
\begin{algorithm}[H]
|
|
|
|
\SetAlgoLined
|
|
|
|
\KwInput{function $f_\theta$ with parameters $\theta \in
|
|
|
|
\mathbb{R}^n$ \newline step size $\gamma$}
|
|
|
|
initialize $\theta^0$\;
|
|
|
|
$i \leftarrow 1$\;
|
|
|
|
\While{termination condition is not met}{
|
|
|
|
$\nabla \leftarrow \frac{\mathrm{d}f_\theta}{\mathrm{d} \theta}\vert_{\theta^{i-1}}$\;
|
|
|
|
$\theta^i \leftarrow \theta^{i-1} - \gamma \nabla $\;
|
|
|
|
$i \leftarrow i +1$\;
|
|
|
|
}
|
|
|
|
|
|
|
|
\caption{Gradient Descent}
|
|
|
|
\label{alg:gd}
|
|
|
|
\end{algorithm}
|
|
|
|
|
|
|
|
The algorithm for gradient descent is given in
Algorithm~\ref{alg:gd}. In the context of fitting a neural network,
$f_\theta$ corresponds to the error measure of the network,
$L\left(\mathcal{NN}_{\theta}\right)$, where $\theta$ is a vector
containing all the weights and biases of the network.
As can be seen, this requires computing the derivative of the network
with respect to each variable. As the number of variables gets large in
networks with multiple layers and high neuron counts, naively computing
these derivatives becomes quite expensive in terms of memory and
computation. But by using the chain rule and exploiting the layered
structure, the gradient can be computed much more efficiently using
backpropagation, first introduced by \textcite{backprop}.
|
|
|
|
|
|
|
|
\subsubsection{Backpropagation}
|
|
|
|
|
|
|
|
As the derivative of a loss function with respect to a certain variable
becomes more expensive to compute with an increasing number of layers,
there have been efforts to increase the efficiency of computing these
derivatives. Today the backpropagation algorithm is widely used to
compute the derivatives needed for the optimization algorithms. Here,
instead of naively calculating the derivative for each variable
separately, the chain rule is used to compute the derivatives layer by
layer, from the output layer towards the first layer, reusing the
factors already computed for the subsequent layers. Writing $a_k$ for
the output of layer $k$, $\theta_k$ for its parameters and $o$ for the
output of the final layer, the derivative of the loss $L$ with respect
to $\theta_k$ factors as
\[
  \frac{\partial L}{\partial \theta_k}
  = \frac{\partial L}{\partial o}
    \frac{\partial o}{\partial a_{l}}
    \frac{\partial a_{l}}{\partial a_{l-1}} \cdots
    \frac{\partial a_{k+1}}{\partial a_{k}}
    \frac{\partial a_{k}}{\partial \theta_k},
\]
where the leading factors coincide for all layers up to layer $k$ and
therefore only need to be computed once when traversing the network from
the output layer backwards.
|
|
|
|
|
|
|
|
%%% Local Variables:
|
|
|
|
%%% mode: latex
|
|
|
|
%%% TeX-master: "main"
|
|
|
|
%%% End:
|