\section{Introduction to Neural Networks}
This chapter is based on \textcite[Chapter~6]{Goodfellow} and \textcite{Haykin}.
Neural networks are a mathematical construct inspired by the
structure of brains in mammals. They consist of an array of neurons that
receive inputs and compute an accumulated output. These neurons are
arranged in layers, with one input and one output layer
and an arbitrary
number of hidden layers between them.
The numbers of neurons in the input and output layers correspond to the
desired dimensions of the inputs and outputs of the model.
In conventional neural networks, the information is fed forward from the
input layer towards the output layer, hence they are often called
feedforward networks. Each neuron in a layer receives the outputs of all
neurons in the preceding layer as inputs and computes an accumulated
value from these (the layers are fully connected).
% An illustration of an example neural network is given in
% Figure~\ref{fig:nn} and one of a neuron in Figure~\ref{fig:neuron}.
Illustrations of a neural network and the structure of a neuron are given
in Figure~\ref{fig:nn} and Figure~\ref{fig:neuron}.
\tikzset{%
every neuron/.style={
circle,
draw,
minimum size=1cm
},
neuron missing/.style={
draw=none,
scale=1.5,
text height=0.333cm,
execute at begin node=\color{black}$\vdots$
},
}
\begin{figure}[h!]
\centering
% \fbox{
\resizebox{\textwidth}{!}{%
\begin{tikzpicture}[x=1.75cm, y=1.75cm, >=stealth]
\tikzset{myptr/.style={decoration={markings,mark=at position 1 with %
{\arrow[scale=1.5,>=stealth]{>}}},postaction={decorate}}}
\foreach \m/\l [count=\y] in {1,2,3,missing,4}
\node [every neuron/.try, neuron \m/.try] (input-\m) at (0,2.55-\y*0.85) {};
\foreach \m [count=\y] in {1,missing,2}
\node [every neuron/.try, neuron \m/.try ] (hidden1-\m) at (2.5,2.5-\y*1.25) {};
\foreach \m [count=\y] in {1,missing,2}
\node [every neuron/.try, neuron \m/.try ] (hidden2-\m) at (5,2.5-\y*1.25) {};
\foreach \m [count=\y] in {1,missing,2}
\node [every neuron/.try, neuron \m/.try ] (output-\m) at (7,1.5-\y*0.75) {};
\foreach \l [count=\i] in {1,2,3,d_i}
\draw [myptr] (input-\i)+(-1,0) -- (input-\i)
node [above, midway] {$x_{\l}$};
\foreach \l [count=\i] in {1,n_1}
\node [above] at (hidden1-\i.north) {$\mathcal{N}_{1,\l}$};
\foreach \l [count=\i] in {1,n_l}
\node [above] at (hidden2-\i.north) {$\mathcal{N}_{l,\l}$};
\foreach \l [count=\i] in {1,d_o}
\draw [myptr] (output-\i) -- ++(1,0)
node [above, midway] {$O_{\l}$};
\foreach \i in {1,...,4}
\foreach \j in {1,...,2}
\draw [myptr] (input-\i) -- (hidden1-\j);
\foreach \i in {1,...,2}
\foreach \j in {1,...,2}
\draw [myptr] (hidden1-\i) -- (hidden2-\j);
\foreach \i in {1,...,2}
\foreach \j in {1,...,2}
\draw [myptr] (hidden2-\i) -- (output-\j);
\node [align=center, above] at (0,2) {Input\\layer};
\node [align=center, above] at (2.5,2) {Hidden \\layer $1$};
\node [align=center, above] at (5,2) {Hidden \\layer $l$};
\node [align=center, above] at (7,2) {Output \\layer};
\node[fill=white,scale=1.5,inner xsep=10pt,inner ysep=10mm] at ($(hidden1-1)!.5!(hidden2-2)$) {$\dots$};
\end{tikzpicture}}%}
\caption[Illustration of a Neural Network]{Illustration of a neural network with $d_i$ inputs, $l$
hidden layers with $n_{\cdot}$ nodes in each layer, as well as
$d_o$ outputs.
}
\label{fig:nn}
\end{figure}
\subsection{Nonlinearity of Neural Networks}
Arguably the most important feature of neural networks, setting them
apart from linear models, is the activation function implemented in the
neurons. As illustrated in Figure~\ref{fig:neuron}, an activation
function $\sigma$ is applied to the weighted sum of the inputs, so that
the output of the $k$-th neuron in a layer $l$ with $m$ nodes in layer $l-1$
is given by
\begin{align*}
o_{l,k} = \sigma\left(b_{l,k} + \sum_{j=1}^{m} w_{l,k,j}
o_{l-1,j}\right),
\end{align*}
for weights $w_{l,k,j}$ and biases $b_{l,k}$. For a network with $L$
hidden layers and inputs $o_{0}$ the final outputs of the network
are thus given by $o_{L+1}$.
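To make this notation concrete, the following Python/NumPy listing
sketches the forward computation of a single fully connected layer
(purely illustrative; the function name \texttt{layer\_forward} and the
example dimensions are chosen for exposition only):
\begin{verbatim}
import numpy as np

def layer_forward(o_prev, W, b, sigma):
    """Forward pass of one fully connected layer.

    o_prev : outputs of layer l-1, shape (m,)
    W      : weights of layer l, shape (n_l, m), W[k, j] = w_{l,k,j}
    b      : biases of layer l, shape (n_l,)
    sigma  : activation function, applied elementwise
    """
    z = b + W @ o_prev   # weighted sums
    return sigma(z)      # outputs o_{l,k} = sigma(z_k)

# Example: a layer with 3 inputs and 2 neurons, logistic activation.
rng = np.random.default_rng(0)
W, b = rng.normal(size=(2, 3)), np.zeros(2)
sigmoid = lambda x: 1.0 / (1.0 + np.exp(-x))
print(layer_forward(np.array([1.0, 2.0, 3.0]), W, b, sigmoid))
\end{verbatim}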
The activation function is usually chosen to be nonlinear (a linear one
would result in the entire network collapsing into a linear model), which
allows the network to better model data where the relation between input
and output is of a nonlinear nature.
There are two types of activation functions, saturating and
non-saturating ones. Popular examples of the former are sigmoid
functions, where most commonly the standard logistic function or the
hyperbolic tangent is used,
as they have easy-to-compute derivatives, which is desirable for
gradient-based optimization algorithms. The standard logistic function
(often simply referred to as the sigmoid function) is given by
\[
  f(x) = \frac{1}{1+e^{-x}}
\]
and has a range of $(0,1)$. The hyperbolic tangent is given by
\[
  \tanh(x) = \frac{e^{2x}-1}{e^{2x}+1}
\]
and has a range of $(-1,1)$. Both functions result in neurons that are
close to inactive until a certain threshold is reached, where they grow
until saturation.
The downside of these saturating activation functions is that their
derivatives are close to zero on most of their domain, only assuming
larger values in proximity to zero.
This can hinder the progress of gradient-based methods.
The non-saturating activation functions commonly used are the rectified
linear unit (ReLU) and the leaky ReLU. The ReLU is given by
\begin{equation}
  r(x) = \max\left\{0, x\right\}.
  \label{eq:relu}
\end{equation}
This has the benefit of having a constant derivative for values larger
than zero. However, the derivative being zero for negative values has
the same downside for
fitting the model with gradient-based methods. The leaky ReLU is
an attempt to counteract this problem by assigning a small constant
derivative to all values smaller than zero; for a scalar $\alpha$ it is given by
\[
  l(x) = \max\left\{0, x\right\} + \alpha \min \left\{0, x\right\}.
\]
In Figure~\ref{fig:activation} visualizations of these functions are given.
%In order to illustrate these functions plots of them are given in Figure~\ref{fig:activation}.
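For reference, the four activation functions can also be written down
directly in code; the following Python/NumPy sketch (illustrative only)
mirrors the definitions above:
\begin{verbatim}
import numpy as np

def logistic(x):
    # standard logistic (sigmoid) function, range (0, 1)
    return 1.0 / (1.0 + np.exp(-x))

def tanh(x):
    # hyperbolic tangent, range (-1, 1); equivalent to np.tanh
    return (np.exp(2 * x) - 1.0) / (np.exp(2 * x) + 1.0)

def relu(x):
    # rectified linear unit
    return np.maximum(0.0, x)

def leaky_relu(x, alpha=0.1):
    # leaky ReLU with constant slope alpha for negative inputs
    return np.maximum(0.0, x) + alpha * np.minimum(0.0, x)

x = np.linspace(-5, 5, 5)
print(logistic(x), tanh(x), relu(x), leaky_relu(x), sep="\n")
\end{verbatim}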
\begin{figure}
\begin{tikzpicture}[x=1.5cm, y=1.5cm, >=stealth]
\tikzset{myptr/.style={decoration={markings,mark=at position 1 with %
{\arrow[scale=1.5,>=stealth]{>}}},postaction={decorate}}}
\node [circle, draw, fill=black, inner sep = 0pt, minimum size =
1.5mm, left] (i_1) at (0, 2.5) {};
\node [align=left, left] at (-0.125, 2.5) {\(i_1\)};
\node [circle, draw, fill=black, inner sep = 0pt, minimum size =
1.5mm] (i_2) at (0, 1.25) {};
\node [align=left, left] at (-0.125, 1.25) {\(i_2\)};
\node [neuron missing] (i_3) at (0, 0) {};
\node [circle, draw, fill=black, inner sep = 0pt, minimum size =
1.5mm] (i_4) at (0, -1.25) {};
\node [align=left, left] at (-0.125, -1.25) {\(i_m\)};
\draw[decoration={calligraphic brace,amplitude=5pt, mirror}, decorate, line width=1.25pt]
(-0.6,2.7) -- (-0.6,-1.45) node [black, midway, xshift=-0.6cm, left] {Inputs};
\node [align = center, above] at (1.25, 3) {Synaptic\\weights};
\node [every neuron] (w_1) at (1.25, 2.5) {\(w_{k, 1}\)};
\node [every neuron] (w_2) at (1.25, 1.25) {\(w_{k, 2}\)};
\node [neuron missing] (w_3) at (1.25, 0) {};
\node [every neuron] (w_4) at (1.25, -1.25) {\(w_{k, m}\)};
\node [circle, draw] (sig) at (3, 0.625) {\Large\(\sum\)};
\node [align = center, below] at (3, 0) {Summing \\junction};
\node [draw, minimum size = 1.25cm] (act) at (4.5, 0.625)
{\(\sigma(.)\)};
\node [align = center, above] at (4.5, 1.25) {Activation \\function};
\node [circle, draw, fill=black, inner sep = 0pt, minimum size =
1.5mm] (b) at (3, 2.5) {};
\node [align = center, above] at (3, 2.75) {Bias \\\(b_k\)};
\node [align = center] (out) at (6, 0.625) {Output \\\(o_k\)};
\draw [myptr] (i_1) -- (w_1);
\draw [myptr] (i_2) -- (w_2);
\draw [myptr] (i_4) -- (w_4);
\draw [myptr] (w_1) -- (sig);
\draw [myptr] (w_2) -- (sig);
\draw [myptr] (w_4) -- (sig);
\draw [myptr] (b) -- (sig);
\draw [myptr] (sig) -- (act);
\draw [myptr] (act) -- (out);
% \foreach \m [count=\y] in {1,2,missing,3,4}
% \node [every neuron/.try, neuron \m/.try ] (hidden-\m) at (1.25,3.25-\y*1.25) {\(w_{k,\y}\)};
% \foreach \m [count=\y] in {1}
% \node [every neuron/.try, neuron \m/.try ] (output-\m) at (2.5,0.5-\y) {};
% \foreach \l [count=\i] in {1}
% \draw [<-] (input-\i) -- ++(-1,0)
% node [above, midway] {$x$};
% \foreach \l [count=\i] in {1,2,n-1,n}
% \node [above] at (hidden-\i.north) {$\mathcal{N}_{\l}$};
% \foreach \l [count=\i] in {1,n_l}
% \node [above] at (output-\i.north) {};
% \foreach \l [count=\i] in {1}
% \draw [->] (output-\i) -- ++(1,0)
% node [above, midway] {$y$};
% \foreach \i in {1}
% \foreach \j in {1,2,...,3,4}
% \draw [->] (input-\i) -- (hidden-\j);
% \foreach \i in {1,2,...,3,4}
% \foreach \j in {1}
% \draw [->] (hidden-\i) -- (output-\j);
\end{tikzpicture}
\caption[Structure of a Single Neuron]{Structure of a single neuron.}
\label{fig:neuron}
\end{figure}
\begin{figure}
\centering
\begin{subfigure}{.45\linewidth}
\centering
\begin{tikzpicture}
\begin{axis}[enlargelimits=false, ymin=0, ymax = 1, width=\textwidth]
\addplot [domain=-5:5, samples=101,unbounded coords=jump]{1/(1+exp(-x))};
\end{axis}
\end{tikzpicture}
\caption{Standard Logistic Function}
\end{subfigure}
\begin{subfigure}{.45\linewidth}
\centering
\begin{tikzpicture}
\begin{axis}[enlargelimits=false, width=\textwidth]
\addplot[domain=-5:5, samples=100]{tanh(x)};
\end{axis}
\end{tikzpicture}
\caption{Hyperbolic Tangent}
\end{subfigure}
\begin{subfigure}{.45\linewidth}
\centering
\begin{tikzpicture}
\begin{axis}[enlargelimits=false, width=\textwidth,
ytick={0,2,4},yticklabels={\hphantom{4.}0,2,4}, ymin=-1]
\addplot[domain=-5:5, samples=100]{max(0,x)};
\end{axis}
\end{tikzpicture}
\caption{ReLU}
\end{subfigure}
\begin{subfigure}{.45\linewidth}
\centering
\begin{tikzpicture}
\begin{axis}[enlargelimits=false, width=\textwidth, ymin=-1,
ytick={0,2,4},yticklabels={$\hphantom{-5.}0$,2,4}]
\addplot[domain=-5:5, samples=100]{max(0,x)+ 0.1*min(0,x)};
\end{axis}
\end{tikzpicture}
\caption{Leaky ReLU, $\alpha = 0.1$}
\end{subfigure}
\caption[Plots of the Activation Functions]{Plots of the activation functions.}
\label{fig:activation}
\end{figure}
\clearpage
\subsection{Training Neural Networks}
As neural networks are parametric models, we need to fit the
parameters to the input
data to get meaningful predictions from the network. In order
to accomplish this we need to discuss how we interpret the output of the
neural network and how we assess the quality of predictions.
% After a neural network model is designed, like most statistical models
% it has to be fit to the data. In the machine learning context this is
% often called ``training'' as due to the complexity and amount of
% variables in these models they are fitted iteratively to the data,
% ``learing'' the properties of the data better with each iteration.
% There are two main categories of machine learning models, being
% supervised and unsupervised learners. Unsupervised learners learn
% structure in the data without guidance form outside (as labeling data
% beforehand for training) popular examples of this are clustering
% algorithms\todo{quelle}. Supervised learners on the other hand are as
% the name suggest supervised during learning. This generally amounts to
% using data with the expected response (label) attached to each
% data-point in fitting the model, where usually some distance between
% the model output and the labels is minimized.
\subsubsection{Nonlinearity in the Last Layer}
Given the nature of the neural network, the outputs of the last layer are
real numbers. For regression tasks this is desirable; for
classification problems, however, some transformation might be
necessary.
As the goal in the latter is to predict a certain class or classes for
an object, the output needs to be of a form that allows this
interpretation.
Commonly, each node in the output layer corresponds to a class, and
the class chosen as prediction is the one with the highest value at
the corresponding output node.
This can be modeled as a transformation of the output
vector $o \in \mathbb{R}^n$ into a one-hot vector
\[
\text{pred}_i =
\begin{cases}
1,& \text{if } o_i = \max_j o_j \\
0,& \text{else}.
\end{cases}
\]
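Written as code, this transformation amounts to the following small
Python/NumPy sketch (illustrative only; ties are resolved in favour of
the first maximal entry):
\begin{verbatim}
import numpy as np

def one_hot_argmax(o):
    # 1 at the position of the largest output, 0 elsewhere
    pred = np.zeros_like(o)
    pred[np.argmax(o)] = 1.0
    return pred

print(one_hot_argmax(np.array([0.3, 2.0, -1.0])))   # [0. 1. 0.]
\end{verbatim}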
This transformation, however, makes training the model with gradient-based methods impossible, as the derivative of
the transformation is either zero or undefined.
A continuous transformation that is close to the argmax is given by the
softmax function
\begin{equation}
\text{softmax}(o)_i = \frac{e^{o_i}}{\sum_j e^{o_j}}.
\label{eq:softmax}
\end{equation}
The softmax function maps the outputs into the interval $[0,1]$,
and the individual values sum to one; thus the output can be interpreted as
a probability for each class conditioned on the input.
In addition to being differentiable, this allows one to evaluate the
certainty of a prediction, rather than just whether it is accurate.
A similar effect is obtained when, for a binary or two-class problem, the
sigmoid function
\[
  f(x) = \frac{1}{1 + e^{-x}}
\]
is used and the output $f(x)$ is interpreted as the probability of
the first class and $1-f(x)$ as that of the second class.
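As an illustration, a minimal Python/NumPy sketch of the softmax
transformation and the resulting class prediction could look as follows
(subtracting the maximum is a common numerical safeguard and does not
change the result):
\begin{verbatim}
import numpy as np

def softmax(o):
    # softmax(o)_i = exp(o_i) / sum_j exp(o_j)
    e = np.exp(o - np.max(o))   # shift for numerical stability
    return e / np.sum(e)

o = np.array([2.0, 1.0, 0.1])   # raw outputs of the last layer
p = softmax(o)                  # class probabilities, p.sum() == 1
print(p, np.argmax(p))          # approx. [0.659 0.242 0.099] and class 0
\end{verbatim}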
% Another property that makes softmax attractive is the invariance to addition
% \[
% \text{sofmax}(o) = \text{softmax}(o + c
% \]
% In order to properly interpret the output of a neural network and
% training it, depending on the problem it might be advantageous to
% transform the output form the last layer. Given the nature of the
% neural network the value at each output node is a real number. This is
% desirable for applications where the desired output is a real numbered
% vector (e.g. steering inputs for a autonomous car), however for
% classification problems it is desirable to transform this
% output. Often classification problems are modeled in such a way that
% each output node corresponds to a class. Then the output vector needs
% to be normalized in order to give a prediction. The naive approach is
% to transform the output vector $o$ into a one-hot vector $p$
% corresponding to a $0$
% entry for all classes except one, which is the predicted class.
% \[
% p_i =
% \begin{cases}
% 1,& i < j, \forall i,j \in \text{arg}\max o_i, \\
% 0,& \text{else.}
% \end{cases}
% \]\todo{besser formulieren}
% However this imposes difficulties in training the network as with this
% addition the model is no longer differentiable which imitates the
% ways the model can be trained. Additionally information about the
% ``certainty'' for each class in the prediction gets lost. A popular
% way to circumvent this problem is to normalize the output vector is
% such a way that the entries add up to one, this allows for the
% interpretation of probabilities assigned to each class.
\clearpage
\subsubsection{Error Measurement}
In order to train the network we need to be able to assess the quality
of predictions using some error measure.
The choice of the error
function is highly dependent on the type of problem. For
regression problems, a commonly used error measure is the mean squared
error (MSE),
which for a function $f$ and data $(x_i,y_i), i \in \left\{1,\dots,n\right\}$ is given by
\[
  \mathrm{MSE}(f) = \frac{1}{n} \sum_{i=1}^{n} \left(f(x_i) - y_i\right)^2.
\]
However, depending on the problem, error measures with different
properties might be needed. For example, in some contexts it is
required to consider a proportional rather than an absolute error.
As discussed above, the output of a neural network for a classification
problem can be interpreted as a probability distribution over the classes
conditioned on the input. In this case, it is desirable to
use error functions designed to compare probability distributions. A
widespread error function for this use case is the categorical cross entropy \parencite{PRML},
which for two discrete distributions $p, q$ defined on the same set $C$ is given by
\[
  H(p, q) = \sum_{c \in C} p(c) \ln\left(\frac{1}{q(c)}\right),
\]
comparing $q$ to a target distribution $p$.
For a data set $(x_i,y_i), i \in \left\{1,\dots,n\right\}$, where each $y_{i,c}$
corresponds to the probability of class $c$ given $x_i$, and a predictor
$f$ we get the loss function
\begin{equation}
  CE(f) = \sum_{i=1}^n H(y_i, f(x_i)).
  \label{eq:cross_entropy}
\end{equation}
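Both error measures can be sketched in a few lines of Python/NumPy; the
following is an illustrative sketch only (the small constant
\texttt{eps} guards against taking the logarithm of zero and is not part
of the definition):
\begin{verbatim}
import numpy as np

def mse(y_pred, y_true):
    # mean squared error over n data points
    return np.mean((y_pred - y_true) ** 2)

def cross_entropy(y_true, y_pred, eps=1e-12):
    # categorical cross entropy, summed over the data points;
    # each row of y_true and y_pred is a distribution over the classes
    return -np.sum(y_true * np.log(y_pred + eps))

# Example: two data points, three classes, one-hot targets.
y_true = np.array([[1.0, 0.0, 0.0],
                   [0.0, 1.0, 0.0]])
y_pred = np.array([[0.7, 0.2, 0.1],
                   [0.3, 0.5, 0.2]])
print(mse(y_pred, y_true), cross_entropy(y_true, y_pred))
\end{verbatim}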
% \todo{Den satz einbauen}
% -Maximum Likelihood
% -Ableitung mit softmax pseudo linear -> fast improvemtns possible
\subsubsection{Gradient Descent Algorithm}
Trying to find the optimal parameters for fitting the model to the data
can be a hard problem. Given the complex nature of a neural network
with many layers and neurons, it is hard to predict the impact of
single parameters on the accuracy of the output.
Thus, using numerical optimization algorithms is the only
feasible way to fit the model.
An attractive algorithm for training
neural networks is gradient descent. Here all parameters are
initialized with certain values (often random or close to zero) and
then iteratively updated. The updates are made in the direction of the
negative gradient of the error with a step size $\gamma$ until a
specified stopping criterion is met.
% This mostly either being a fixed
% number of iterations or a desired upper limit for the error measure.
% For a function $f_\theta$ with parameters $\theta \in \mathbb{R}^n$
% and a error function $L(f_\theta)$ the gradient descent algorithm is
% given in \ref{alg:gd}.
\begin{algorithm}[H]
\SetAlgoLined
\KwInput{function $f_\theta$ with parameters $\theta \in
\mathbb{R}^n$ \newline step size $\gamma$}
initialize $\theta^0$\;
$i \leftarrow 1$\;
\While{termination condition is not met}{
$\nabla \leftarrow \frac{\mathrm{d}f_\theta}{\mathrm{d} \theta}\vert_{\theta^{i-1}}$\;
$\theta^i \leftarrow \theta^{i-1} - \gamma \nabla $\;
$i \leftarrow i +1$\;
}
\caption{Gradient Descent}
\label{alg:gd}
\end{algorithm}
The algorithm for gradient descent is given in
Algorithm~\ref{alg:gd}. In the context of fitting a neural network,
$f_\theta$ corresponds to an error measurement of a neural network
$\mathcal{NN}_{\theta}$, where $\theta$ is a vector
containing all the weights and biases of the network.
As can be seen, this requires computing the derivative of the network
with respect to each variable. With the number of variables getting
large in networks with multiple layers of high neuron count, naively
computing the derivatives can become quite expensive in terms of memory
and computation.
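For illustration, the following Python/NumPy sketch implements the
generic update loop of Algorithm~\ref{alg:gd} for a simple quadratic
error (purely illustrative; a fixed number of iterations stands in for
the termination condition):
\begin{verbatim}
import numpy as np

def gradient_descent(grad, theta0, gamma=0.1, n_iter=100):
    # theta^i = theta^(i-1) - gamma * grad(theta^(i-1))
    theta = np.asarray(theta0, dtype=float)
    for _ in range(n_iter):
        theta = theta - gamma * grad(theta)
    return theta

# Example: minimize L(theta) = ||theta - c||^2, with gradient 2 (theta - c).
c = np.array([1.0, -2.0])
grad = lambda theta: 2.0 * (theta - c)
print(gradient_descent(grad, theta0=np.zeros(2)))   # approaches c
\end{verbatim}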
By using the chain rule and exploiting the layered structure, we can
compute the parameter updates much more efficiently. This practice is
called backpropagation and was introduced for use in neural networks by
\textcite{backprop}. The algorithm
for one data point is given in Algorithm~\ref{alg:backprop}; for all error
functions that are sums of errors over single data points (MSE, cross
entropy) backpropagation works analogously for larger training sets.
% \subsubsection{Backpropagation}
% As with an increasing amount of layers the derivative of a loss
% function with respect to a certain variable becomes more intensive to
% compute there have been efforts in increasing the efficiency of
% computing these derivatives. Today the BACKPROPAGATION algorithm is
% widely used to compute the derivatives needed for the optimization
% algorithms. Here instead of naively calculating the derivative for
% each variable, the chain rule is used in order to compute derivatives
% for each layer from output layer towards the first layer while only
% needing to ....
\begin{algorithm}[H]
\SetAlgoLined
\KwInput{Inputs $o_0$, neural network
with $L$ hidden layers, weights $w$, and biases $b$ for $n_l$
nodes as well as an activation function $\sigma_l$ in layer $l$
and loss function $\tilde{L}$.}
Forward Propagation:
\For{$l \in \left\{1, \dots, L+1\right\}$}{
Compute values for layer $l$:
$z_{l,k} \leftarrow b_{l,k} + w_{l,k}^{\mathrm{T}} o_{l-1}, k \in \left\{1,\dots,n_l\right\}$\;
$o_{l,k} \leftarrow \sigma_l(z_{l,k}), k \in \left\{1,\dots,n_l\right\}$ \;
}
Calculate derivative for output layer: $\delta_{L+1, k} \leftarrow
\frac{\partial\tilde{L}(o_{L+1})}{\partial o_{L+1,k}} \sigma_{L+1}'(z_{L+1,k})$\;
Back propagate the error:
\For{$l \in \left\{L,\dots,1\right\}$}{
$\delta_{l,k} \leftarrow \left(\sum_{j=1}^{n_{l+1}} w_{l+1,j,k}
\delta_{l+1,j}\right) \sigma_{l}'(z_{l,k}), k=1,\dots,n_l$
}
Calculate gradients:
$\frac{\partial\tilde{L}}{\partial w_{l,k,j}} =
\delta_{l,k}o_{l-1,j}$,
$\frac{\partial\tilde{L}}{\partial b_{l,k}} =
\delta_{l,k}$\;
\caption{Backpropagation for one data point}
\label{alg:backprop}
\end{algorithm}
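To complement Algorithm~\ref{alg:backprop}, the following Python/NumPy
sketch carries out forward and backward propagation for one data point
in a small network with a single hidden layer, logistic activations, and
squared-error loss (the architecture, names, and loss are chosen for
illustration only):
\begin{verbatim}
import numpy as np

sigmoid = lambda z: 1.0 / (1.0 + np.exp(-z))
d_sigmoid = lambda z: sigmoid(z) * (1.0 - sigmoid(z))

def backprop_single(x, y, W1, b1, W2, b2):
    # Gradients of the squared error 0.5 * ||o2 - y||^2 for one data point.
    # Forward propagation, storing weighted sums z and outputs o:
    z1 = b1 + W1 @ x
    o1 = sigmoid(z1)
    z2 = b2 + W2 @ o1
    o2 = sigmoid(z2)
    # Derivative at the output layer: dL/do2 * sigma'(z2)
    delta2 = (o2 - y) * d_sigmoid(z2)
    # Backpropagate the error to the hidden layer:
    delta1 = (W2.T @ delta2) * d_sigmoid(z1)
    # Gradients with respect to weights and biases:
    return {"W2": np.outer(delta2, o1), "b2": delta2,
            "W1": np.outer(delta1, x),  "b1": delta1}

# Example dimensions: 3 inputs, 4 hidden neurons, 2 outputs.
rng = np.random.default_rng(1)
W1, b1 = rng.normal(size=(4, 3)), np.zeros(4)
W2, b2 = rng.normal(size=(2, 4)), np.zeros(2)
grads = backprop_single(np.ones(3), np.array([1.0, 0.0]), W1, b1, W2, b2)
print({name: g.shape for name, g in grads.items()})
\end{verbatim}
For deeper networks the same computations are carried out in a loop over
the layers, as in Algorithm~\ref{alg:backprop}.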
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "main"
%%% End: