\section{Introduction to Neural Networks}

Neural networks (NN) are a mathematical construct inspired by the
connections between neurons in nature. A network consists of an input
and an output layer with an arbitrary number of hidden layers between
them. Each layer consists of a number of neurons (nodes), with the
number of nodes in the input and output layers corresponding to the
dimensions of the input and output.\par
Each neuron receives the outputs of all neurons in the previous layer,
except for the neurons in the input layer, which receive the
components of the input.

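As a sketch of this layered structure (the matrix--vector notation
here is introduced only for illustration), a network with $l$ hidden
layers applied to an input $x$ can be written as the composition
\[
  \mathcal{NN}(x) = \sigma_{l+1}\left( W_{l+1}\, \sigma_{l}\left(
      \cdots \sigma_{1}\left( W_{1} x + b_{1} \right) \cdots \right) +
    b_{l+1} \right),
\]
where each $W_j$ collects the weights of layer $j$, $b_j$ its biases
and $\sigma_j$ its activation function.
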
\tikzset{%
  every neuron/.style={
    circle,
    draw,
    minimum size=1cm
  },
  neuron missing/.style={
    draw=none,
    scale=1.5,
    text height=0.333cm,
    execute at begin node=\color{black}$\vdots$
  },
}

\begin{figure}[h!]
  \centering
  % \fbox{
  \resizebox{\textwidth}{!}{%
  \begin{tikzpicture}[x=1.75cm, y=1.75cm, >=stealth]
    \tikzset{myptr/.style={decoration={markings,mark=at position 1 with %
      {\arrow[scale=1.5,>=stealth]{>}}},postaction={decorate}}}

    \foreach \m/\l [count=\y] in {1,2,3,missing,4}
      \node [every neuron/.try, neuron \m/.try] (input-\m) at (0,2.5-\y) {};

    \foreach \m [count=\y] in {1,missing,2}
      \node [every neuron/.try, neuron \m/.try ] (hidden1-\m) at (2,2-\y*1.25) {};

    \foreach \m [count=\y] in {1,missing,2}
      \node [every neuron/.try, neuron \m/.try ] (hidden2-\m) at (5,2-\y*1.25) {};

    \foreach \m [count=\y] in {1,missing,2}
      \node [every neuron/.try, neuron \m/.try ] (output-\m) at (7,1.5-\y) {};

    \foreach \l [count=\i] in {1,2,3,d_i}
      \draw [myptr] (input-\i)+(-1,0) -- (input-\i)
        node [above, midway] {$x_{\l}$};

    \foreach \l [count=\i] in {1,n_1}
      \node [above] at (hidden1-\i.north) {$\mathcal{N}_{1,\l}$};

    \foreach \l [count=\i] in {1,n_l}
      \node [above] at (hidden2-\i.north) {$\mathcal{N}_{l,\l}$};

    \foreach \l [count=\i] in {1,d_o}
      \draw [myptr] (output-\i) -- ++(1,0)
        node [above, midway] {$O_{\l}$};

    \foreach \i in {1,...,4}
      \foreach \j in {1,...,2}
        \draw [myptr] (input-\i) -- (hidden1-\j);

    \foreach \i in {1,...,2}
      \foreach \j in {1,...,2}
        \draw [myptr] (hidden1-\i) -- (hidden2-\j);

    \foreach \i in {1,...,2}
      \foreach \j in {1,...,2}
        \draw [myptr] (hidden2-\i) -- (output-\j);

    \node [align=center, above] at (0,2) {Input\\layer};
    \node [align=center, above] at (2,2) {Hidden \\layer $1$};
    \node [align=center, above] at (5,2) {Hidden \\layer $l$};
    \node [align=center, above] at (7,2) {Output \\layer};

    \node[fill=white,scale=1.5,inner xsep=10pt,inner ysep=10mm] at ($(hidden1-1)!.5!(hidden2-2)$) {$\dots$};
  \end{tikzpicture}}%}
  \caption{Illustration of a neural network with $d_i$ inputs, $l$
    hidden layers with $n_{\cdot}$ nodes in each layer, as well as
    $d_o$ outputs.}
\end{figure}

\subsection{Nonlinearity of Neural Networks}

\begin{figure}
  \begin{tikzpicture}[x=1.5cm, y=1.5cm, >=stealth]
    \tikzset{myptr/.style={decoration={markings,mark=at position 1 with %
      {\arrow[scale=1.5,>=stealth]{>}}},postaction={decorate}}}

    \node [circle, draw, fill=black, inner sep = 0pt, minimum size = 1.5mm, left] (i_1) at (0, 2.5) {};
    \node [align=left, left] at (-0.125, 2.5) {\(i_1\)};
    \node [circle, draw, fill=black, inner sep = 0pt, minimum size = 1.5mm] (i_2) at (0, 1.25) {};
    \node [align=left, left] at (-0.125, 1.25) {\(i_2\)};
    \node [neuron missing] (i_3) at (0, 0) {};
    \node [circle, draw, fill=black, inner sep = 0pt, minimum size = 1.5mm] (i_4) at (0, -1.25) {};
    \node [align=left, left] at (-0.125, -1.25) {\(i_m\)};
    \draw[decoration={calligraphic brace,amplitude=5pt, mirror}, decorate, line width=1.25pt]
      (-0.6,2.7) -- (-0.6,-1.45) node [black, midway, xshift=-0.6cm, left] {Inputs};

    \node [align = center, above] at (1.25, 3) {Synaptic\\weights};
    \node [every neuron] (w_1) at (1.25, 2.5) {\(w_{k, 1}\)};
    \node [every neuron] (w_2) at (1.25, 1.25) {\(w_{k, 2}\)};
    \node [neuron missing] (w_3) at (1.25, 0) {};
    \node [every neuron] (w_4) at (1.25, -1.25) {\(w_{k, m}\)};

    \node [circle, draw] (sig) at (3, 0.625) {\Large\(\sum\)};
    \node [align = center, below] at (3, 0) {Summing \\junction};

    \node [draw, minimum size = 1.25cm] (act) at (4.5, 0.625) {\(\sigma(.)\)};
    \node [align = center, above] at (4.5, 1.25) {Activation \\function};

    \node [circle, draw, fill=black, inner sep = 0pt, minimum size = 1.5mm] (b) at (3, 2.5) {};
    \node [align = center, above] at (3, 2.75) {Bias \\\(b_k\)};

    \node [align = center] (out) at (6, 0.625) {Output \\\(o_k\)};

    \draw [myptr] (i_1) -- (w_1);
    \draw [myptr] (i_2) -- (w_2);
    \draw [myptr] (i_4) -- (w_4);

    \draw [myptr] (w_1) -- (sig);
    \draw [myptr] (w_2) -- (sig);
    \draw [myptr] (w_4) -- (sig);

    \draw [myptr] (b) -- (sig);

    \draw [myptr] (sig) -- (act);

    \draw [myptr] (act) -- (out);

    % \foreach \m [count=\y] in {1,2,missing,3,4}
    %   \node [every neuron/.try, neuron \m/.try ] (hidden-\m) at (1.25,3.25-\y*1.25) {\(w_{k,\y}\)};

    % \foreach \m [count=\y] in {1}
    %   \node [every neuron/.try, neuron \m/.try ] (output-\m) at (2.5,0.5-\y) {};

    % \foreach \l [count=\i] in {1}
    %   \draw [<-] (input-\i) -- ++(-1,0)
    %     node [above, midway] {$x$};

    % \foreach \l [count=\i] in {1,2,n-1,n}
    %   \node [above] at (hidden-\i.north) {$\mathcal{N}_{\l}$};

    % \foreach \l [count=\i] in {1,n_l}
    %   \node [above] at (output-\i.north) {};

    % \foreach \l [count=\i] in {1}
    %   \draw [->] (output-\i) -- ++(1,0)
    %     node [above, midway] {$y$};

    % \foreach \i in {1}
    %   \foreach \j in {1,2,...,3,4}
    %     \draw [->] (input-\i) -- (hidden-\j);

    % \foreach \i in {1,2,...,3,4}
    %   \foreach \j in {1}
    %     \draw [->] (hidden-\i) -- (output-\j);

  \end{tikzpicture}
  \caption{Structure of a single neuron}
\end{figure}

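In the notation of the figure, a single neuron $k$ computes its output
by forming the weighted sum of its inputs at the summing junction,
adding the bias and applying the activation function,
\[
  o_k = \sigma\left( \sum_{j=1}^{m} w_{k,j}\, i_j + b_k \right),
\]
so that any nonlinearity of the network stems from the activation
function $\sigma$.
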
\clearpage

\subsection{Training Neural Networks}

After a neural network model is designed, like most statistical models
it has to be fitted to the data. In the machine learning context this
is often called ``training'', as due to the complexity and the number
of variables in these models they are fitted to the data iteratively,
``learning'' the properties of the data better with each iteration.

There are two main categories of machine learning models, namely
supervised and unsupervised learners. Unsupervised learners learn
structure in the data without outside guidance (such as labeling the
data beforehand for training); popular examples of this are clustering
algorithms\todo{add citation}. Supervised learners, on the other hand,
are, as the name suggests, supervised during learning. This generally
amounts to using data with the expected response (label) attached to
each data point when fitting the model, where usually some distance
between the model output and the labels is minimized.

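In the supervised setting the fitting procedure can thus be sketched,
for labeled data points $(x_k, y_k)$, $k = 1, \dots, N$ (notation
introduced here for illustration), as the minimization problem
\[
  \min_\theta \sum_{k=1}^{N} d\left( \mathcal{NN}_\theta(x_k), y_k \right),
\]
where $\mathcal{NN}_\theta$ denotes the network with all its weights
and biases collected in $\theta$ and $d$ is some distance between
model output and label.
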
\subsubsection{Interpreting the Output}

In order to properly interpret the output of a neural network and to
train it, it might be advantageous, depending on the problem, to
transform the output of the last layer. Given the nature of the neural
network, the value at each output node is a real number. This is
desirable for applications where the desired output is a real-valued
vector (e.g.\ steering inputs for an autonomous car); for
classification problems, however, it is desirable to transform this
output. Classification problems are often modeled in such a way that
each output node corresponds to a class. Then the output vector needs
to be normalized in order to give a prediction. The naive approach is
to transform the output vector $o$ into a one-hot vector $p$ with a
$1$ entry for the predicted class and $0$ entries for all other
classes:
\[
  p_i =
  \begin{cases}
    1,& i = \min\left( \text{arg}\max_j o_j \right), \\
    0,& \text{else,}
  \end{cases}
\]
where the minimum breaks ties in favor of the class with the smallest
index.

However, this imposes difficulties in training the network, as with
this addition the model is no longer differentiable, which limits the
ways the model can be trained. Additionally, information about the
``certainty'' of the prediction for each class is lost. A popular way
to circumvent this problem is to normalize the output vector in such a
way that its entries add up to one; this allows the entries to be
interpreted as probabilities assigned to the classes.

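A common choice for such a normalization, given here as an example
since the text does not single out a specific one, is the softmax
function
\[
  p_i = \frac{e^{o_i}}{\sum_{j=1}^{d_o} e^{o_j}}, \qquad i = 1, \dots, d_o,
\]
which maps the output vector $o \in \mathbb{R}^{d_o}$ to a vector with
positive entries that sum to one.
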
\subsubsection{Error Measurement}

In order to assess the quality of a network $\mathcal{NN}$ and to
train it, we need to discuss how we measure error. As the output of a
regression problem is continuous, in contrast to the class predictions
of a classification problem, we need to discuss these two settings
separately.

\paragraph{Regression Problems}

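One widely used error measure for regression problems, given here as
an example, is the mean squared error between the network outputs and
the labels,
\[
  L\left( \mathcal{NN}_\theta \right) = \frac{1}{N} \sum_{k=1}^{N}
  \left\| \mathcal{NN}_\theta(x_k) - y_k \right\|_2^2,
\]
where $(x_k, y_k)$, $k = 1, \dots, N$, again denote the labeled data
points.
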
\subsubsection{Gradient Descent Algorithm}

When fitting a neural network it is hard to predict the impact of the
individual parameters on the accuracy of the output. Thus applying
numerical optimization algorithms is the only feasible way to fit the
model. An attractive algorithm for training neural networks is
gradient descent, where each parameter $\theta_i$ is iteratively
changed according to the gradient of the error measure and a step size
$\gamma$. For this, all parameters are initialized (often randomly or
close to zero) and then iteratively updated until a termination
criterion is met, most commonly a fixed number of iterations or a
desired upper limit on the error measure.

% For a function $f_\theta$ with parameters $\theta \in \mathbb{R}^n$
% and an error function $L(f_\theta)$ the gradient descent algorithm is
% given in \ref{alg:gd}.

\begin{algorithm}[H]
  \SetAlgoLined
  \KwInput{function $f_\theta$ with parameters $\theta \in
    \mathbb{R}^n$ \newline step size $\gamma$}
  initialize $\theta^0$\;
  $i \leftarrow 1$\;
  \While{termination condition is not met}{
    $\nabla \leftarrow \frac{\mathrm{d}f_\theta}{\mathrm{d} \theta}\vert_{\theta^{i-1}}$\;
    $\theta^i \leftarrow \theta^{i-1} - \gamma \nabla$\;
    $i \leftarrow i + 1$\;
  }
  \caption{Gradient Descent}
  \label{alg:gd}
\end{algorithm}

The algorithm for gradient descent is given in
Algorithm~\ref{alg:gd}. In the context of fitting a neural network,
$f_\theta$ corresponds to the error measure of the network,
$L\left(\mathcal{NN}_{\theta}\right)$, where $\theta$ is a vector
containing all the weights and biases of the network.
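Written out for this case, the update step of Algorithm~\ref{alg:gd}
reads
\[
  \theta^i \leftarrow \theta^{i-1} - \gamma \left.
    \frac{\mathrm{d} L\left(\mathcal{NN}_\theta\right)}{\mathrm{d}
      \theta} \right\vert_{\theta^{i-1}}.
\]
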
As can be seen, this requires computing the derivative of the network
with respect to each variable. As the number of variables becomes
large in networks with multiple layers of high neuron count, naively
computing these derivatives can become quite expensive in terms of
memory and computation. By using the chain rule and exploiting the
layered structure, the gradient can be computed much more efficiently
with backpropagation, first introduced by \textcite{backprop}.

\subsubsection{Backpropagation}

As the derivative of a loss function with respect to a given variable
becomes increasingly expensive to compute with a growing number of
layers, there have been efforts to increase the efficiency of
computing these derivatives. Today the backpropagation algorithm is
widely used to compute the derivatives needed by the optimization
algorithms. Instead of naively calculating the derivative
\[
  \frac{\partial L\left(\mathcal{NN}_\theta\right)}{\partial \theta_i}
\]
separately for each variable $\theta_i$, the chain rule is used to
compute the derivatives layer by layer, from the output layer towards
the first layer, reusing the derivatives already computed for the
layers closer to the output.
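As a sketch of this decomposition (the per-layer notation is
introduced here only for illustration), write $a^{(k)}$ for the vector
of outputs of layer $k$, so that $a^{(k)}$ depends on $a^{(k-1)}$ and
on the weights and biases of layer $k$. For a parameter $\theta_i$
belonging to layer $k$ the chain rule then gives
\[
  \frac{\partial L\left(\mathcal{NN}_\theta\right)}{\partial \theta_i}
  = \frac{\partial L}{\partial a^{(l+1)}}
    \frac{\partial a^{(l+1)}}{\partial a^{(l)}} \cdots
    \frac{\partial a^{(k+1)}}{\partial a^{(k)}}
    \frac{\partial a^{(k)}}{\partial \theta_i},
\]
where the leading factors are shared by all parameters of layers
$1, \dots, k$ and therefore only need to be computed once per layer
when traversing the network from the output towards the input.
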

%%% Local Variables:
%%% mode: latex
%%% TeX-master: "main"
%%% End: