new commit

main
Tobias Arndt 4 years ago
parent cb9777f037
commit 06d93ef937

.gitignore vendored

@ -27,3 +27,7 @@ main-blx.bib
# no slurm logs
*slurm*.out
# no plot data
*.csv
*.mean

@ -0,0 +1,37 @@
\boolfalse {citerequest}\boolfalse {citetracker}\boolfalse {pagetracker}\boolfalse {backtracker}\relax
\babel@toc {english}{}
\defcounter {refsection}{0}\relax
\contentsline {figure}{\numberline {2.1}{\ignorespaces Illustration of a neural network}}{2}%
\defcounter {refsection}{0}\relax
\contentsline {figure}{\numberline {2.2}{\ignorespaces Plots of the activation functions\relax }}{4}%
\defcounter {refsection}{0}\relax
\contentsline {figure}{\numberline {2.3}{\ignorespaces Structure of a single neuron\relax }}{4}%
\defcounter {refsection}{0}\relax
\contentsline {figure}{\numberline {3.1}{\ignorespaces Overfitting of shallow neural networks}}{10}%
\defcounter {refsection}{0}\relax
\contentsline {figure}{\numberline {3.2}{\ignorespaces Comparison of shallow neural networks and regression splines}}{21}%
\defcounter {refsection}{0}\relax
\contentsline {figure}{\numberline {4.1}{\ignorespaces Signal smoothing using convolution}}{23}%
\defcounter {refsection}{0}\relax
\contentsline {figure}{\numberline {4.2}{\ignorespaces Channel separation of color image}}{24}%
\defcounter {refsection}{0}\relax
\contentsline {figure}{\numberline {4.3}{\ignorespaces Convolution applied on image}}{25}%
\defcounter {refsection}{0}\relax
\contentsline {figure}{\numberline {4.4}{\ignorespaces MNIST data set}}{29}%
\defcounter {refsection}{0}\relax
\contentsline {figure}{\numberline {4.5}{\ignorespaces architecture\relax }}{29}%
\defcounter {refsection}{0}\relax
\contentsline {figure}{\numberline {4.6}{\ignorespaces Performance comparison of SGD and GD}}{30}%
\defcounter {refsection}{0}\relax
\contentsline {figure}{\numberline {4.7}{\ignorespaces Performance comparison of training algorithms}}{35}%
\defcounter {refsection}{0}\relax
\contentsline {figure}{\numberline {4.8}{\ignorespaces Image data generation}}{37}%
\defcounter {refsection}{0}\relax
\contentsline {figure}{\numberline {4.9}{\ignorespaces Performance comparison of overfitting measures}}{38}%
\defcounter {refsection}{0}\relax
\contentsline {figure}{\numberline {4.10}{\ignorespaces Fashion MNIST data set}}{39}%
\defcounter {refsection}{0}\relax
\contentsline {figure}{\numberline {4.11}{\ignorespaces \relax }}{41}%
\defcounter {refsection}{0}\relax
\contentsline {figure}{\numberline {4.12}{\ignorespaces Sample pictures of the MNIST fashion dataset, one per class.\relax }}{41}%

@ -0,0 +1,58 @@
datagen_dropout_02_1
test
0.6604& 0.5175& 0.60136& 0.002348447
datagen_dropout_00_1
test
0.6704& 0.4878& 0.58621& 0.003600539
dropout_02_1
test
0.5312& 0.4224& 0.47137& 0.001175149
default_1
test
0.5633& 0.3230& 0.45702& 0.004021449
datagen_dropout_02_10
test
0.9441& 0.9061& 0.92322& 0.00015
train
1& 0.97& 0.989& 1e-04
datagen_dropout_00_10
test
0.931& 0.9018& 0.9185& 6e-05
train
1& 0.97& 0.99& 0.00013
dropout_02_10
test
0.9423& 0.9081& 0.92696& 0.00013
train
1& 0.99& 0.992& 2e-05
default_10
test
0.8585& 0.8148& 0.83771& 0.00027
train
1& 1& 1& 0
datagen_dropout_02_100
test
0.9805& 0.9727& 0.97826& 0
train
datagen_dropout_00_100
test
0.981& 0.9702& 0.9769& 1e-05
train
dropout_02_100
test
0.9796& 0.9719& 0.97703& 1e-05
train
default_100
test
0.9637& 0.9506& 0.95823& 2e-05

@ -115,7 +115,9 @@ plot coordinates {
\caption{$\lambda = 3.0$}
\end{subfigure}
\end{subfigure}
\caption[Comparison of shallow neural networks and regression
splines]{% In these Figures the behaviour stated in ... is
% visualized
% in two examples. For $(a), (b), (c)$ six values of the sine function equidistantly
% spaced on $[-\pi, \pi]$ have been used as training data. For
% $(d),(e),(f)$ 15 equidistant values have been used, where
@ -131,6 +133,7 @@ plot coordinates {
$\text{data}_B$ in d), e), f).
The parameters of each are given above.
}
\label{fig:rn_vs_rs}
\end{figure}
%%% Local Variables:
%%% mode: latex

@ -65,7 +65,7 @@ plot coordinates {
\caption{Performance metrics during training}
\end{subfigure}
% \\~\\
\caption[Performance comparison of SGD and GD]{The neural network given in ?? trained with different
algorithms on the MNIST handwritten digits data set. For gradient
descent the learning rates 0.01, 0.05 and 0.1 are used (GD$_{\cdot}$). For
stochastic gradient descent a batch size of 32 and learning rate

@ -40,7 +40,7 @@
\includegraphics[width=\textwidth]{Plots/Data/fashion_mnist9.pdf}
\caption{Ankle boot}
\end{subfigure}
\caption[Fashion MNIST data set]{The Fashion MNIST data set contains 70,000
preprocessed product images from Zalando, which are categorized as
T-shirt/top, Trouser, Pullover, Dress, Coat, Sandal, Shirt,
Sneaker, Bag, Ankle boot. Of these images 60,000 are used as training images, while

@ -51,7 +51,7 @@ plot coordinates {
\begin{tabu} to \textwidth {@{}lc*5{X[c]}@{}}
\Tstrut \Bstrut & \textsc{\,Adam\,} & D. 0.2 & D. 0.4 & G. &G.+D.\,0.2 & G.+D.\,0.4 \\
\hline
\multicolumn{7}{c}{Test Accuracy}\Bstrut \\
\cline{2-7}
mean \Tstrut & 0.9914 & 0.9923 & 0.9930 & 0.9937 & 0.9938 & 0.9943 \\
max & 0.9926 & 0.9930 & 0.9934 & 0.9946 & 0.9955 & 0.9956 \\
@ -64,8 +64,9 @@ plot coordinates {
min & 0.9992 & 0.9990 & 0.9984 & 0.9947 & 0.9926 & 0.9908 \\
\end{tabu}
\caption{Mean, maximum and minimum accuracy after 48 epochs of training.}
\label{fig:gen_dropout_b}
\end{subfigure}
\caption[Performance comparison of overfitting measures]{Accuracy for the net given in ... with Dropout (D.),
data generation (G.), a combination, or neither (Default) implemented and trained
with \textsc{Adam}. For each epoch the 60,000 training samples
were used, or for data generation 10,000 steps with each using
@ -73,6 +74,7 @@ plot coordinates {
model was trained 5 times and the average accuracies at each epoch
are given in (a). Mean, maximum and minimum values of accuracy on
the test and training set are given in (b).}
\label{fig:gen_dropout}
\end{figure}
%%% Local Variables:
%%% mode: latex

@ -30,7 +30,7 @@
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/mnist9.pdf}
\end{subfigure}
\caption[MNIST data set]{The MNIST data set contains 70,000 images of preprocessed handwritten
digits. Of these images 60,000 are used as training images, while
the rest are used to validate the models trained.}
\label{fig:MNIST}

@ -5,7 +5,9 @@
\usepackage{adjustbox}
\usepackage{xcolor}
\usepackage{tabu}
\usepackage{showframe}
\usepackage{graphicx}
\usepackage{titlecaps}
\usetikzlibrary{calc, 3d}
\usepgfplotslibrary{colorbrewer}
@ -29,33 +31,62 @@ plot coordinates {
\begin{tikzpicture}
\begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed,
/pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth,
height = 0.35\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east},
ylabel = {Test Accuracy}, cycle
list/Dark2, every axis plot/.append style={line width
=1.25pt}]
% \addplot [dashed] table
% [x=epoch, y=accuracy, col sep=comma, mark = none]
% {Data/adam_datagen_full.log};
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Data/adam_1.mean};
% \addplot [dashed] table
% [x=epoch, y=accuracy, col sep=comma, mark = none]
% {Data/adam_datagen_dropout_02_full.log};
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Data/adam_datagen_1.mean};
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Data/adam_datagen_dropout_02_1.mean};
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Data/adam_dropout_02_1.mean};
\addlegendentry{\footnotesize{G.}}
\addlegendentry{\footnotesize{G. + D. 0.2}}
\addlegendentry{\footnotesize{G. + D. 0.4}}
\addlegendentry{\footnotesize{D. 0.2}}
\addlegendentry{\footnotesize{D. 0.4}}
\addlegendentry{\footnotesize{Default}}
\end{axis}
\end{tikzpicture}
\caption{1 sample per class}
\vspace{0.25cm}
\end{subfigure}
\begin{subfigure}[h]{\textwidth}
\begin{tikzpicture}
\begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed,
/pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth,
height = 0.35\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east},
ylabel = {Test Accuracy}, cycle
list/Dark2, every axis plot/.append style={line width
=1.25pt}]
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Data/adam_dropout_00_10.mean};
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Data/adam_dropout_02_10.mean};
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Data/adam_datagen_dropout_00_10.mean};
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Data/adam_datagen_dropout_02_10.mean};
\addlegendentry{\footnotesize{G.}}
\addlegendentry{\footnotesize{G. + D. 0.2}}
@ -65,26 +96,39 @@ plot coordinates {
\addlegendentry{\footnotesize{Default}}
\end{axis}
\end{tikzpicture}
\caption{10 samples per class}
\vspace{.25cm}
\end{subfigure}
\begin{subfigure}[h]{\textwidth}
\begin{tikzpicture}
\begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed,
/pgf/number format/precision=3},tick style = {draw = none}, width = 0.9875\textwidth,
height = 0.35\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east},
xlabel = {epoch}, ylabel = {Test Accuracy}, cycle
list/Dark2, every axis plot/.append style={line width
=1.25pt}, ymin = {0.92}]
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Data/adam_dropout_00_100.mean};
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Data/adam_dropout_02_100.mean};
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Data/adam_datagen_dropout_00_100.mean};
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Data/adam_datagen_dropout_02_100.mean};
\addlegendentry{\footnotesize{G.}}
\addlegendentry{\footnotesize{G. + D. 0.2}}
\addlegendentry{\footnotesize{G. + D. 0.4}}
\addlegendentry{\footnotesize{D. 0.2}}
\addlegendentry{\footnotesize{D. 0.4}}
\addlegendentry{\footnotesize{Default}}
\end{axis}
\end{tikzpicture}
\caption{100 samples per class}
\vspace{.25cm}
\end{subfigure}
\caption{Accuracy for the net given in ... with Dropout (D.),
data generation (G.), a combination, or neither (Default) implemented and trained
@ -95,6 +139,40 @@ plot coordinates {
are given in (a). Mean, maximum and minimum values of accuracy on
the test and training set are given in (b).}
\end{figure}
\begin{table}
\centering
\begin{tabu} to \textwidth {@{}l*4{X[c]}@{}}
\Tstrut \Bstrut & \textsc{Adam} & D. 0.2 & Gen & Gen.+D. 0.2 \\
\hline
&
\multicolumn{4}{c}{\titlecap{test accuracy for 1 sample}}\Bstrut \\
\cline{2-5}
max \Tstrut & 0.5633 & 0.5312 & 0.6704 & 0.6604 \\
min & 0.3230 & 0.4224 & 0.4878 & 0.5175 \\
mean & 0.4570 & 0.4714 & 0.5862 & 0.6014 \\
var & 0.0040 & 0.0012 & 0.0036 & 0.0023 \\
\hline
&
\multicolumn{4}{c}{\titlecap{test accuracy for 10 samples}}\Bstrut \\
\cline{2-5}
max \Tstrut & 0.8585 & 0.9423 & 0.9310 & 0.9441 \\
min & 0.8148 & 0.9081 & 0.9018 & 0.9061 \\
mean & 0.8377 & 0.9270 & 0.9185 & 0.9232 \\
var & 2.7e-4 & 1.3e-4 & 6e-05 & 1.5e-4 \\
\hline
&
\multicolumn{4}{c}{\titlecap{test accuracy for 100 samples}}\Bstrut \\
\cline{2-5}
max & 0.9637 & 0.9796 & 0.9810 & 0.9805 \\
min & 0.9506 & 0.9719 & 0.9702 & 0.9727 \\
mean & 0.9582 & 0.9770 & 0.9769 & 0.9783 \\
var & 2e-05 & 1e-05 & 1e-05 & 0 \\
\hline
\end{tabu}
\caption{Values of the test accuracy of the model trained 10 times
on random training sets containing 1, 10 and 100 data points per
class.}
\end{table}
\begin{center}
\begin{figure}[h]

@ -10,7 +10,7 @@ plot coordinates {
}
}
\begin{figure}
\begin{subfigure}[h]{\textwidth}
\begin{tikzpicture}
\begin{axis}[tick style = {draw = none}, width = \textwidth,
height = 0.6\textwidth, ymin = 0.92, legend style={at={(0.9825,0.75)},anchor=north east},
@ -32,30 +32,31 @@ plot coordinates {
\end{axis}
\end{tikzpicture}
%\caption{Classification accuracy}
\vspace{.25cm}
\end{subfigure}
% \begin{subfigure}[b]{\textwidth}
% \begin{tikzpicture}
% \begin{axis}[tick style = {draw = none}, width = \textwidth,
% height = 0.6\textwidth, ymax = 0.5,
% xlabel = {epoch}, ylabel = {Error Measure\vphantom{y}},ytick ={0,0.1,0.2,0.3,0.4,0.45,0.5}, yticklabels =
% {0,0.1,0.2,0.3,0.4,\phantom{0.94},0.5}]
% \addplot table
% [x=epoch, y=val_loss, col sep=comma, mark = none] {Plots/Data/adagrad.log};
% \addplot table
% [x=epoch, y=val_loss, col sep=comma, mark = none] {Plots/Data/adadelta.log};
% \addplot table
% [x=epoch, y=val_loss, col sep=comma, mark = none] {Plots/Data/adam.log};
% \addlegendentry{\footnotesize{ADAGRAD}}
% \addlegendentry{\footnotesize{ADADELTA}}
% \addlegendentry{\footnotesize{ADAM}}
% \addlegendentry{SGD$_{0.01}$}
% \end{axis}
% \end{tikzpicture}
% \caption{Performance metrics during training}
% \vspace{.25cm}
% \end{subfigure}
\begin{subfigure}[b]{1.0\linewidth}
\begin{tabu} to \textwidth {@{} *3{X[c]}c*3{X[c]} @{}}
\multicolumn{3}{c}{Classification Accuracy}
@ -67,8 +68,9 @@ plot coordinates {
\end{tabu}
\caption{Performance metrics after 20 epochs}
\end{subfigure}
\caption[Performance comparison of training algorithms]{Classification accuracy on the test set and ... Performance metrics of the network given in ... trained
with different optimization algorithms}
\label{fig:comp_alg}
\end{figure}
%%% Local Variables:
%%% mode: latex

@ -28,7 +28,7 @@
\end{adjustbox}
\caption{True position (\textcolor{red}{red}), filtered position data (black)}
\end{subfigure}
\caption[Signal smoothing using convolution]{Example for noise reduction using convolution with simulated
positional data. As filter
$g(i)=\left(\nicefrac{1}{3},\nicefrac{1}{4},\nicefrac{1}{5},\nicefrac{1}{6},\nicefrac{1}{20}\right)_{(i-1)}$
is chosen and applied to the $x$ and $y$ coordinate

@ -177,3 +177,28 @@ url={https://openreview.net/forum?id=rkgz2aEKDr}
biburl = {https://dblp.org/rec/journals/corr/KingmaB14.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{transfer_learning,
author = {Zhao,Wei},
title = {Research on the deep learning of the small sample data based on transfer learning},
journal = {AIP Conference Proceedings},
volume = {1864},
number = {1},
pages = {020018},
year = {2017},
doi = {10.1063/1.4992835},
URL = {https://aip.scitation.org/doi/abs/10.1063/1.4992835},
eprint = {https://aip.scitation.org/doi/pdf/10.1063/1.4992835}
}
@article{gan,
title = "GAN-based synthetic medical image augmentation for increased CNN performance in liver lesion classification",
journal = "Neurocomputing",
volume = 321,
pages = "321 - 331",
year = 2018,
issn = "0925-2312",
doi = "https://doi.org/10.1016/j.neucom.2018.09.013",
url = "http://www.sciencedirect.com/science/article/pii/S0925231218310749",
author = "Maayan Frid-Adar and Idit Diamant and Eyal Klang and Michal Amitai and Jacob Goldberger and Hayit Greenspan"
}

@ -85,7 +85,7 @@ channel (color) $c$ to the respective value $v$
\end{scope}
\end{tikzpicture}
\end{adjustbox}
\caption[Channel separation of color image]{On the right the red, green and blue channels of the picture
are displayed. In order to better visualize the color channels the
black and white picture of each channel has been colored in the
respective color. Combining the layers results in the image on the
@ -177,7 +177,7 @@ wise. Examples of convolution with both kernels are given in Figure~\ref{fig:img
% \includegraphics[width=\textwidth]{Plots/Data/image_conv6.png}
% \caption{test}
% \end{subfigure}
\caption[Convolution applied on image]{Convolution of original greyscale image (a) with different
kernels. In (b) and (c) Gaussian kernels of size 11 and stated
$\sigma^2$ are used. In (d) - (f) the above defined Sobel Operator
kernels are used.}
@ -186,7 +186,7 @@ wise. Examples of convolution with both kernels are given in Figure~\ref{fig:img
\clearpage
\newpage
\subsection{Convolutional NN}
\todo{Introduction to CNN, amount of parameters}
% Conventional neural network as described in chapter .. are made up of
% fully connected layers, meaning each node in a layer is influenced by
% all nodes of the previous layer. If one wants to extract information
@ -219,11 +219,11 @@ The usage of multiple filters results in multiple outputs of the same
size as the input. These are often called channels. Depending on the
size of the filters this can result in the dimension of the output
being one larger than the input.
However for convolutional layers that are preceded by convolutional layers the
size of the filter is often chosen to coincide with the amount of channels
of the output of the previous layer without using padding in this
direction in order to prevent gaining additional
dimensions\todo{explain filters spanning the full depth better} in the output.
This can also be used to flatten certain less interesting channels of
the input, such as for example the color channels.
Thus filters used in convolutional networks usually have the same
@ -264,11 +264,11 @@ reduced in size by extracting a single value from a
neighborhood \todo{moving...}... . The resulting output size is dependent on
the offset of the neighborhoods used. Popular is max-pooling, where the
largest value in a neighborhood is used.

The combination of convolution and pooling layers allows for
extraction of features from the input in the form of feature maps while
using relatively few parameters that need to be trained.
\todo{example of feature maps}
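As a concrete illustration of the pooling operation, the following
minimal NumPy sketch performs $2\times 2$ max-pooling with stride 2 on a
single channel input; it is only meant as an illustration and is not
the implementation used for the experiments in this thesis.
\begin{verbatim}
import numpy as np

def max_pool_2x2(x):
    """2x2 max-pooling with stride 2 on a single-channel 2D array."""
    h, w = x.shape
    x = x[:h - h % 2, :w - w % 2]          # crop to even dimensions
    h, w = x.shape
    return x.reshape(h // 2, 2, w // 2, 2).max(axis=(1, 3))

x = np.arange(16.0).reshape(4, 4)
print(max_pool_2x2(x))                      # [[ 5.  7.] [13. 15.]]
\end{verbatim}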
\subsubsection{Parallels to the Visual Cortex in Mammals} \subsubsection{Parallels to the Visual Cortex in Mammals}
@ -447,11 +447,15 @@ algorithm (\textsc{AdaGrad}, \textcite{ADAGRAD})
laying the base work. Here for each parameter update the learning rate
is given by a constant
$\gamma$ divided by the square root of the sum of the squares of the past partial
derivatives in this parameter. This results in a monotonically decaying
learning rate, with faster
decay for parameters with large updates, whereas
parameters with small updates experience smaller decay. The \textsc{AdaGrad}
algorithm is given in Algorithm~\ref{alg:ADAGRAD}. Note that while
this algorithm is still based upon the idea of gradient descent it no
longer takes steps in the direction of the gradient while
updating. Due to the individual learning rates for each parameter only
the direction/sign for single parameters remains the same.
\begin{algorithm}[H]
\SetAlgoLined
@ -461,29 +465,64 @@ algorithm is given in Algorithm~\ref{alg:ADAGRAD}.
\For{$t \in \left\{1,\dots,T\right\};\, t+1$}{
Compute Gradient: $g_t$\;
Compute Update: $\Delta x_{t,i} \leftarrow
-\frac{\gamma}{\norm{g_{1:t,i}}_2 + \varepsilon} g_{t,i}, \forall i =
1, \dots,p$\;
Apply Update: $x_{t+1} \leftarrow x_t + \Delta x_t$\;
}
\caption{\textsc{AdaGrad}}
\label{alg:ADAGRAD}
\end{algorithm}
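To make the update rule explicit, a minimal NumPy sketch of the
\textsc{AdaGrad} step follows; the variable names and the toy objective
are chosen for illustration only and do not correspond to any
particular implementation.
\begin{verbatim}
import numpy as np

def adagrad_step(x, grad, accum, gamma=0.01, eps=1e-8):
    """One AdaGrad update with a per-parameter learning rate."""
    accum += grad ** 2                      # running sum of squared gradients
    x -= gamma / (np.sqrt(accum) + eps) * grad
    return x, accum

x, accum = np.zeros(3), np.zeros(3)
target = np.array([1.0, -2.0, 0.5])
for _ in range(200):
    grad = 2 * (x - target)                 # gradient of a toy quadratic
    x, accum = adagrad_step(x, grad, accum, gamma=0.5)
print(x)                                    # approaches the target
\end{verbatim}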
Building on \textsc{AdaGrad}, \textcite{ADADELTA} developed the
\textsc{AdaDelta} algorithm
in order to improve upon the two main drawbacks of \textsc{AdaGrad}, being the
continual decay of the learning rate and the need for a manually
selected global learning rate $\gamma$.
As \textsc{AdaGrad} uses division by the accumulated squared gradients, the learning rate will
eventually become arbitrarily small.
In order to ensure that learning continues to make progress even after a significant
number of iterations, instead of summing the squared gradients an
exponentially decaying average of the past squared gradients is used for
regularizing the learning rate, resulting in
\begin{align*}
E[g^2]_t & = \rho E[g^2]_{t-1} + (1-\rho) g_t^2, \\
\Delta x_t & = -\frac{\gamma}{\sqrt{E[g^2]_t + \varepsilon}} g_t,
\end{align*}
for a decay rate $\rho$.
Additionally the fixed global learning rate $\gamma$ is substituted by
an exponentially decaying average of the past parameter updates.
The usage of the past parameter updates is motivated by ensuring that
hypothetical units of the parameter vector match those of the
parameter update $\Delta x_t$. When only using the
gradient with a scalar learning rate as in SGD the resulting unit of
the parameter update is:
\[
\text{units of } \Delta x \propto \text{units of } g \propto
\frac{\partial f}{\partial x} \propto \frac{1}{\text{units of } x},
\]
assuming the cost function $f$ is unitless. \textsc{AdaGrad} does not
have correct units either, since the update is given by a ratio of gradient
quantities, resulting in a unitless parameter update. If however
Hessian information or an approximation thereof is used to scale the
gradients, the unit of the updates will be correct:
\[
\text{units of } \Delta x \propto H^{-1} g \propto
\frac{\frac{\partial f}{\partial x}}{\frac{\partial ^2 f}{\partial
x^2}} \propto \text{units of } x
\]
Since using the second derivative results in correct units, Newton's
method (assuming a diagonal Hessian) is rearranged to determine the
quantities involved in the inverse of the second derivative:
\[
\Delta x = \frac{\frac{\partial f}{\partial x}}{\frac{\partial ^2
f}{\partial x^2}} \iff \frac{1}{\frac{\partial^2 f}{\partial
x^2}} = \frac{\Delta x}{\frac{\partial f}{\partial x}}.
\]
As the root mean square of the past gradients is already used in the
denominator of the learning rate, an exponentially decaying root mean
square of the past updates is used to obtain a $\Delta x$ quantity for
the numerator, resulting in the correct unit of the update. The full
algorithm is given by Algorithm~\ref{alg:adadelta}.
\begin{algorithm}[H]
\SetAlgoLined
@ -501,23 +540,24 @@ by these of the parameter update $\Delta x_t$. This proper
Apply Update: $x_{t+1} \leftarrow x_t + \Delta x_t$\;
}
\caption{\textsc{AdaDelta}, \textcite{ADADELTA}}
\label{alg:adadelta}
\end{algorithm}
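The interplay of the two running averages can be summarized in a short
NumPy sketch of a single \textsc{AdaDelta} step; the values of the
decay rate and the small constant are chosen for illustration and are
not necessarily the ones used in the experiments.
\begin{verbatim}
import numpy as np

def adadelta_step(x, grad, E_g2, E_dx2, rho=0.95, eps=1e-6):
    """One AdaDelta update using decaying averages of squared
    gradients and squared parameter updates."""
    E_g2 = rho * E_g2 + (1 - rho) * grad ** 2            # E[g^2]_t
    dx = -np.sqrt(E_dx2 + eps) / np.sqrt(E_g2 + eps) * grad
    E_dx2 = rho * E_dx2 + (1 - rho) * dx ** 2            # E[dx^2]_t
    return x + dx, E_g2, E_dx2

x = np.array([2.0, -1.0])
E_g2, E_dx2 = np.zeros(2), np.zeros(2)
for _ in range(100):
    grad = 2 * x                                         # toy quadratic
    x, E_g2, E_dx2 = adadelta_step(x, grad, E_g2, E_dx2)
\end{verbatim}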
While the stochastic gradient algorithm is less susceptible to getting
stuck in local
extrema than gradient descent the problem still persists, especially
for saddle points with steep .... \textcite{DBLP:journals/corr/Dauphinpgcgb14}
An approach to the problem of ``getting stuck'' in saddle points or
local minima/maxima is the addition of momentum to SGD. Instead of
using the actual gradient for the parameter update an average over the
past gradients is used. In order to avoid the need to store the past
values usually an exponentially decaying average is used, resulting in
Algorithm~\ref{alg:sgd_m}. This is comparable to following the path
of a marble with mass rolling down the slope of the error
function. The decay rate for the average is comparable to the inertia
of the marble.
This results in the algorithm being able to escape some local extrema due to the
built up momentum gained from approaching them.
% \begin{itemize}
@ -539,14 +579,26 @@ build up momentum from approaching it.
Apply Update: $x_{t+1} \leftarrow x_t + \Delta x_t$\;
}
\caption{SGD with momentum}
\label{alg:sgd_m}
\end{algorithm}
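The momentum update itself only requires keeping one additional vector
of past update directions, as the following NumPy sketch shows; the
learning rate, the decay rate and the toy objective are made up for the
example.
\begin{verbatim}
import numpy as np

def sgd_momentum_step(x, grad, velocity, lr=0.01, beta=0.9):
    """SGD with momentum: exponentially decaying average of gradients."""
    velocity = beta * velocity - lr * grad   # accumulate momentum
    return x + velocity, velocity

x = np.array([2.0, -1.0])
velocity = np.zeros_like(x)
for _ in range(100):
    grad = 2 * x                             # gradient of f(x) = ||x||^2
    x, velocity = sgd_momentum_step(x, grad, velocity)
\end{verbatim}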
In an effort to combine the properties of the momentum method and the
automatically adapted learning rate of \textsc{AdaDelta}, \textcite{ADAM}
developed the \textsc{Adam} algorithm, given in
Algorithm~\ref{alg:adam}. Here the exponentially decaying
root mean square of the gradients is still used for regularizing the
learning rate and combined with the momentum method. Both terms are normalized such that
the ... are the first and second moment of the gradient. However the term used in
\textsc{AdaDelta} to ensure correct units is dropped for a scalar
global learning rate. This results in .. hyperparameters, however the
algorithm seems to be exceptionally stable with the recommended
parameters of ... and is a very reliable algorithm for training
neural networks.
However the \textsc{Adam} algorithm can have problems with high
variance of the adaptive learning rate early in training.
\textcite{rADAM} try to address these issues with the Rectified Adam
algorithm.
\todo{do I want to include this?}
\begin{algorithm}[H]
@ -556,21 +608,27 @@ Problems / Improvements ADAM \textcite{rADAM}
Initialize accumulation variables $m_0 = 0$, $v_0 = 0$\;
\For{$t \in \left\{1,\dots,T\right\};\, t+1$}{
Compute Gradient: $g_t$\;
Accumulate first Moment of the Gradient and correct for bias:
$m_t \leftarrow \beta_1 m_{t-1} + (1 - \beta_1) g_t;$\hspace{\linewidth}
$\hat{m}_t \leftarrow \frac{m_t}{1-\beta_1^t}$\;
Accumulate second Moment of the Gradient and correct for bias:
$v_t \leftarrow \beta_2 v_{t-1} + (1 - \beta_2)g_t^2;$\hspace{\linewidth}
$\hat{v}_t \leftarrow \frac{v_t}{1-\beta_2^t}$\;
Compute Update: $\Delta x_t \leftarrow
-\frac{\alpha}{\sqrt{\hat{v}_t + \varepsilon}}
\hat{m}_t$\;
Apply Update: $x_{t+1} \leftarrow x_t + \Delta x_t$\;
}
\caption{ADAM, \cite{ADAM}}
\label{alg:adam}
\end{algorithm}
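A single \textsc{Adam} step can be sketched in a few lines of NumPy
(with $\varepsilon$ placed inside the square root as in
Algorithm~\ref{alg:adam}); the default parameter values are the ones
recommended by \textcite{ADAM}, and the toy objective is made up for
the example.
\begin{verbatim}
import numpy as np

def adam_step(x, grad, m, v, t, alpha=0.001, beta1=0.9,
              beta2=0.999, eps=1e-8):
    """One Adam update with bias-corrected moment estimates."""
    m = beta1 * m + (1 - beta1) * grad          # first moment
    v = beta2 * v + (1 - beta2) * grad ** 2     # second moment
    m_hat = m / (1 - beta1 ** t)                # bias correction
    v_hat = v / (1 - beta2 ** t)
    x = x - alpha * m_hat / np.sqrt(v_hat + eps)
    return x, m, v

x = np.zeros(3)
m, v = np.zeros(3), np.zeros(3)
for t in range(1, 201):                         # t starts at 1
    grad = 2 * (x - 1.0)                        # toy quadratic objective
    x, m, v = adam_step(x, grad, m, v, t)
\end{verbatim}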
In order to get an understanding of the performance of the above
discussed training algorithms the neural network given in ... has been
trained on the ... and the results are given in
Figure~\ref{fig:comp_alg}.
Here it can be seen that the \textsc{Adam} algorithm performs far better than
the other algorithms, with \textsc{AdaGrad} and \textsc{AdaDelta} following ...
\input{Plots/sdg_comparison.tex} \input{Plots/sdg_comparison.tex}
@ -594,15 +652,27 @@ Problems / Improvements ADAM \textcite{rADAM}
% \cite{Dropout}
Similarly to shallow networks overfitting still can impact the quality of
convolutional neural networks.
Popular ways to combat this problem for a .. of models are averaging
over multiple models trained on subsets (bootstrapping) or introducing
noise directly during the training (for example in random forests, where a
conglomerate of decision trees benefits greatly from randomizing the
features available to use in each training iteration).
We explore implementations of these approaches for neural networks,
being dropout for simulating a conglomerate of networks, and
introducing noise during training by slightly altering the input
pictures.
% A popular way to combat this problem is
% by introducing noise into the training of the model.
% This can be done in a variety
% This is a
% successful strategy for ofter models as well, the a conglomerate of
% descision trees grown on bootstrapped trainig samples benefit greatly
% of randomizing the features available to use in each training
% iteration (Hastie, Bachelorarbeit??).
% There are two approaches to introduce noise to the model during
% learning, either by manipulating the model it self or by manipulating
% the input data.
\subsubsection{Dropout}
If a neural network has enough hidden nodes there will be sets of
weights that accurately fit the training set (proof for a small
@ -690,21 +760,35 @@ mirroring.
\includegraphics[width=\textwidth]{Plots/Data/mnist_gen_shift.pdf}
\caption{random\\positional shift}
\end{subfigure}
\caption[Image data generation]{Example for the manipulations used in ... As all images are
of the same intensity brightness manipulation does not seem
... Additionally mirroring is not used for ... reasons.}
\end{figure}
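The manipulations shown above can, for example, be produced with the
\texttt{ImageDataGenerator} class from Keras; the following sketch uses
hypothetical parameter values and random stand-in data and is not
necessarily the exact configuration used for the experiments.
\begin{verbatim}
import numpy as np
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# illustrative parameter values, not the ones used in the experiments
datagen = ImageDataGenerator(
    rotation_range=10,        # random rotation in degrees
    width_shift_range=0.1,    # random horizontal shift (fraction of width)
    height_shift_range=0.1,   # random vertical shift (fraction of height)
    zoom_range=0.1,           # random zoom
)

x_train = np.random.rand(32, 28, 28, 1)    # stand-in for MNIST images
y_train = np.random.randint(0, 10, 32)     # stand-in labels
batches = datagen.flow(x_train, y_train, batch_size=32)
x_batch, y_batch = next(batches)           # randomly altered images
\end{verbatim}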
In order to compare the benefits obtained from implementing these
measures we have trained the network given in ... on the same problem
and implemented different combinations of data generation and dropout. The results
are given in Figure~\ref{fig:gen_dropout}. For each scenario the
model was trained five times and the performance measures were
averaged. It can be seen that implementing the measures does indeed
increase the performance of the model. Implementing data generation on
its own seems to have a larger impact than dropout and applying both
increases the accuracy even further.
The better performance stems most likely from reduced overfitting. The
reduction in overfitting can be seen in
\ref{fig:gen_dropout}~(\subref{fig:gen_dropout_b}) as the training
accuracy decreases while the test accuracy increases. However utilizing
data generation as well as dropout with a probability of 0.4 seems to
be too aggressive an approach, as the training accuracy drops below the
test accuracy.
\input{Plots/gen_dropout.tex}
\todo{Compare different dropout rates on MNIST or similar, subset as
training set?}
\clearpage
\subsubsection{\titlecap{effectiveness for small training sets}}
For some applications (medical problems with a small number of patients)
@ -726,13 +810,141 @@ full dataset: ... per class\\
100 per class
10 per class
the results for training .. are given in ... Here it can be seen that
for small training sets data generation has a large impact on the accuracy.
\begin{table}
\centering
\begin{tabu} to \textwidth {@{}l*4{X[c]}@{}}
\Tstrut \Bstrut & \textsc{Adam} & D. 0.2 & Gen & Gen.+D. 0.2 \\
\hline
&
\multicolumn{4}{c}{\titlecap{test accuracy for 1 sample}}\Bstrut \\
\cline{2-5}
max \Tstrut & 0.5633 & 0.5312 & 0.6704 & 0.6604 \\
min & 0.3230 & 0.4224 & 0.4878 & 0.5175 \\
mean & 0.4570 & 0.4714 & 0.5862 & 0.6014 \\
var & 0.0040 & 0.0012 & 0.0036 & 0.0023 \\
\hline
&
\multicolumn{4}{c}{\titlecap{test accuracy for 10 samples}}\Bstrut \\
\cline{2-5}
max \Tstrut & 0.8585 & 0.9423 & 0.9310 & 0.9441 \\
min & 0.8148 & 0.9081 & 0.9018 & 0.9061 \\
mean & 0.8377 & 0.9270 & 0.9185 & 0.9232 \\
var & 2.7e-4 & 1.3e-4 & 6e-05 & 1.5e-4 \\
\hline
&
\multicolumn{4}{c}{\titlecap{test accuracy for 100 samples}}\Bstrut \\
\cline{2-5}
max & 0.9637 & 0.9796 & 0.9810 & 0.9805 \\
min & 0.9506 & 0.9719 & 0.9702 & 0.9727 \\
mean & 0.9582 & 0.9770 & 0.9769 & 0.9783 \\
var & 2e-05 & 1e-05 & 1e-05 & 0 \\
\hline
\end{tabu}
\caption{Values of the test accuracy of the model trained 10 times
on random training sets containing 1, 10 and 100 data points per
class.}
\end{table}
\begin{figure}[h]
\centering
\begin{subfigure}[h]{\textwidth}
\begin{tikzpicture}
\begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed,
/pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth,
height = 0.35\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east},
ylabel = {Test Accuracy}, cycle
list/Dark2, every axis plot/.append style={line width
=1.25pt}]
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Plots/Data/adam_1.mean};
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Plots/Data/adam_dropout_02_1.mean};
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Plots/Data/adam_datagen_1.mean};
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Plots/Data/adam_datagen_dropout_02_1.mean};
\addlegendentry{\footnotesize{Default}}
\addlegendentry{\footnotesize{D. 0.2}}
\addlegendentry{\footnotesize{G.}}
\addlegendentry{\footnotesize{G. + D. 0.2}}
\end{axis}
\end{tikzpicture}
\caption{1 sample per class}
\vspace{0.25cm}
\end{subfigure}
\begin{subfigure}[h]{\textwidth}
\begin{tikzpicture}
\begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed,
/pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth,
height = 0.35\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east},
ylabel = {Test Accuracy}, cycle
list/Dark2, every axis plot/.append style={line width
=1.25pt}]
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Plots/Data/adam_dropout_00_10.mean};
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Plots/Data/adam_dropout_02_10.mean};
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Plots/Data/adam_datagen_dropout_00_10.mean};
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Plots/Data/adam_datagen_dropout_02_10.mean};
\addlegendentry{\footnotesize{Default.}}
\addlegendentry{\footnotesize{D. 0.2}}
\addlegendentry{\footnotesize{G.}}
\addlegendentry{\footnotesize{G + D. 0.2}}
\end{axis}
\end{tikzpicture}
\caption{10 samples per class}
\end{subfigure}
\begin{subfigure}[h]{\textwidth}
\begin{tikzpicture}
\begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed,
/pgf/number format/precision=3},tick style = {draw = none}, width = 0.9875\textwidth,
height = 0.35\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east},
xlabel = {epoch}, ylabel = {Test Accuracy}, cycle
list/Dark2, every axis plot/.append style={line width
=1.25pt}, ymin = {0.92}]
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Plots/Data/adam_dropout_00_100.mean};
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Plots/Data/adam_dropout_02_100.mean};
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Plots/Data/adam_datagen_dropout_00_100.mean};
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Plots/Data/adam_datagen_dropout_02_100.mean};
\addlegendentry{\footnotesize{Default.}}
\addlegendentry{\footnotesize{D. 0.2}}
\addlegendentry{\footnotesize{G.}}
\addlegendentry{\footnotesize{G + D. 0.2}}
\end{axis}
\end{tikzpicture}
\caption{100 samples per class}
\vspace{.25cm}
\end{subfigure}
\caption{Mean test accuracy during training for models trained on
random subsets containing 1, 10 and 100 samples per class, with
dropout (D.), data generation (G.), a combination, or neither (Default).}
\label{mnist fashion} \label{mnist fashion}
\end{figure} \end{figure}
@ -752,6 +964,8 @@ the results for training .. are given in ... Here can be seen...
\item Transfer learning, use network trained on different task and
repurpose it / train it with the training data
\end{itemize}
\textcite{transfer_learning}
\textcite{gan}
%%% Local Variables:
%%% mode: latex

@ -2,13 +2,18 @@
\section{Introduction to Neural Networks}
Neural Networks (NN) are a mathematical construct inspired by the
... of brains in mammals. They consist of an array of neurons that
receive inputs and compute an accumulated output. These neurons are
arranged in layers, with one input and one output layer and an
arbitrary amount of hidden layers between them.
The amount of neurons in the in- and output layers corresponds to the
desired dimensions of the in- and outputs of the model.
In conventional neural networks the information is passed ... from the
input layer towards the output layer, hence they are often called feed
forward networks. Each neuron in a layer has the outputs of all
neurons in the preceding layer as input (fully connected). An
illustration of an example neural network is given in
Figure~\ref{fig:nn} and one of a neuron in Figure~\ref{fig:neuron}.
\tikzset{%
every neuron/.style={
@ -79,10 +84,11 @@ except for the input layer, which recieves the components of the input.
\node[fill=white,scale=1.5,inner xsep=10pt,inner ysep=10mm] at ($(hidden1-1)!.5!(hidden2-2)$) {$\dots$};
\end{tikzpicture}}%}
\caption[Illustration of a neural network]{Illustration of a neural network with $d_i$ inputs, $l$
hidden layers with $n_{\cdot}$ nodes in each layer, as well as
$d_o$ outputs.
}
\label{fig:nn}
\end{figure}
\subsection{Nonlinearity of Neural Networks}
@ -91,17 +97,21 @@ The arguably most important feature of neural networks that sets them
apart from linear models is the activation function implemented in the
neurons. As seen in Figure~\ref{fig:neuron}, an activation function
$\sigma$ is applied to the weighted sum of the inputs in order to obtain
the output, resulting in the output of the $k$-th neuron in a layer
being given by
\[
o_k = \sigma\left(b_k + \sum_{j=1}^m w_{k,j} i_j\right)
\]
for weights $w_{k,j}$ and biases $b_k$.
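As a small numerical illustration, the following NumPy snippet
evaluates this formula for one layer of two neurons with three inputs;
the weights, biases and inputs are arbitrary example values.
\begin{verbatim}
import numpy as np

def sigma(x):
    return 1.0 / (1.0 + np.exp(-x))        # standard logistic activation

i = np.array([0.5, -1.0, 2.0])             # inputs i_j
W = np.array([[0.1, -0.4, 0.2],            # weights w_{k,j}, one row per neuron
              [0.7,  0.3, -0.1]])
b = np.array([0.05, -0.2])                 # biases b_k
o = sigma(b + W @ i)                       # outputs o_k of the layer
print(o)
\end{verbatim}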
The activation function is usually chosen nonlinear (a linear one
would result in the entire model collapsing into a linear one\todo{proof?}), which
allows the network to better model data where the relation of in- and output is
of nonlinear nature.
There are two types of activation functions, saturating and
non-saturating ones. Popular examples for the former are sigmoid
functions, where most commonly the standard logistic function or the
hyperbolic tangent are used,
as they have easy to compute derivatives which is desirable for gradient
based optimization algorithms. The standard logistic function (often
referred to simply as sigmoid function) is given by
\[ \[
@ -111,15 +121,15 @@ and has a realm of $[0,1]$. Its usage as an activation function is
motivated by modeling neurons which
are close to inactive until a certain threshold where they grow in
intensity until they are fully
active, which is similar to the behavior of neurons in
brains\todo{phrase better}. The hyperbolic tangent is given by
\[
\tanh(x) = \frac{e^{2x}-1}{e^{2x}+1}.
\]
The downside of these saturating activation functions is that given
their saturating nature their derivatives are close to zero for large or small
input values, which can slow or hinder the progress of gradient based methods.
The nonsaturating activation functions commonly used are the rectified
linear unit (ReLU) or the leaky ReLU. The ReLU is given by
@ -127,11 +137,12 @@ linear using (ReLU) or the leaky RelU. The ReLU is given by
r(x) = \max\left\{0, x\right\}.
\]
This has the benefit of having a constant derivative for values larger
than zero. However the derivative being zero for negative values has the same downside for
fitting the model with gradient based methods. The leaky ReLU is
an attempt to counteract this problem by assigning a small constant
derivative to all values smaller than zero and for scalar $\alpha$ is given by
\[
l(x) = \max\left\{0, x\right\} + \alpha \min \left\{0, x\right\}.
\]
In order to illustrate these functions plots of them are given in Figure~\ref{fig:activation}.
@ -144,6 +155,7 @@ In order to illustrate these functions plots of them are given in Figure~\ref{fi
\addplot [domain=-5:5, samples=101,unbounded coords=jump]{1/(1+exp(-x))};
\end{axis}
\end{tikzpicture}
\caption{\titlecap{standard logistic function}}
\end{subfigure}
\begin{subfigure}{.45\linewidth}
\centering
@ -152,6 +164,7 @@ In order to illustrate these functions plots of them are given in Figure~\ref{fi
\addplot[domain=-5:5, samples=100]{tanh(x)};
\end{axis}
\end{tikzpicture}
\caption{\titlecap{hyperbolic tangent}}
\end{subfigure}
\begin{subfigure}{.45\linewidth}
\centering
@ -161,6 +174,7 @@ In order to illustrate these functions plots of them are given in Figure~\ref{fi
\addplot[domain=-5:5, samples=100]{max(0,x)};
\end{axis}
\end{tikzpicture}
\caption{ReLU}
\end{subfigure}
\begin{subfigure}{.45\linewidth}
\centering
@ -170,8 +184,9 @@ In order to illustrate these functions plots of them are given in Figure~\ref{fi
\addplot[domain=-5:5, samples=100]{max(0,x)+ 0.1*min(0,x)};
\end{axis}
\end{tikzpicture}
\caption{Leaky ReLU, $\alpha = 0.1$}
\end{subfigure}
\caption{Plots of the activation functions}
\label{fig:activation}
\end{figure}
@ -266,24 +281,28 @@ In order to illustrate these functions plots of them are given in Figure~\ref{fi
\clearpage
\subsection{Training Neural Networks}
As neural networks are a parametric model we need to fit them to input
data in order to get meaningful output from the network. In order to
do this we first need to discuss how we interpret the output of the
neural network.

% After a neural network model is designed, like most statistical models
% it has to be fit to the data. In the machine learning context this is
% often called ``training'' as due to the complexity and amount of
% variables in these models they are fitted iteratively to the data,
% ``learning'' the properties of the data better with each iteration.

% There are two main categories of machine learning models, being
% supervised and unsupervised learners. Unsupervised learners learn
% structure in the data without guidance from outside (as labeling data
% beforehand for training), popular examples of this are clustering
% algorithms\todo{source}. Supervised learners on the other hand are, as
% the name suggests, supervised during learning. This generally amounts to
% using data with the expected response (label) attached to each
% data-point in fitting the model, where usually some distance between
% the model output and the labels is minimized.

\subsubsection{\titlecap{nonlinearity in last layer}}
Given the nature of the neural net the outputs of the last layer are
real numbers. For regression tasks this is desirable, for
@ -316,6 +335,13 @@ and the individual values sum to one, thus the output can be interpreted as
a probability for each class given the input.
In addition to being differentiable, this allows for evaluating the
certainty of a prediction, rather than just whether it is accurate.
A similar effect is obtained for a binary (two class) problem when the
sigmoid function
\[
f(x) = \frac{1}{1 + e^{-x}}
\]
is used and the output $f(x)$ is interpreted as the probability for
the first class and $1-f(x)$ for the second class.
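A minimal NumPy sketch of both transformations (the numbers are
illustrative and not taken from the experiments):
\begin{verbatim}
import numpy as np

def softmax(z):
    # subtracting the maximum does not change the result (additive
    # invariance) but avoids numerical overflow
    e = np.exp(z - np.max(z))
    return e / e.sum()

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

logits = np.array([2.0, -1.0, 0.5])    # raw outputs of the last layer
print(softmax(logits))                 # class probabilities, sum to 1
p = sigmoid(0.8)                       # two class case
print(p, 1 - p)
\end{verbatim}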
\todo{perhaps additive invariance}
% Another property that makes softmax attractive is the invariance to addition % Another property that makes softmax attractive is the invariance to addition
@ -372,7 +398,7 @@ common in time series models. \todo{komisch}
As discussed above the output of a neural network for a classification As discussed above the output of a neural network for a classification
problem can be interpreted as a probability distribution over the classes problem can be interpreted as a probability distribution over the classes
conditioned on the input. In this case it is desirable to
use error functions designed to compare probability distributions. A
widespread error function for this use case is the cross entropy (\textcite{PRML}),
which for two discrete distributions $p, q$ over the same set $C$ is given by
@ -392,15 +418,17 @@ $f$ we get the loss function
\subsubsection{Gradient Descent Algorithm} \subsubsection{Gradient Descent Algorithm}
Trying to find the optimal parameters for fitting the model to the data
can be a hard problem. Given the complex nature of a neural network
with many layers and neurons, it is hard to predict the impact of
single parameters on the accuracy of the output.
Thus applying numeric optimization algorithms is the only
feasible way to fit the model. An attractive algorithm for training
neural networks is gradient descent, where each parameter $\theta_i$ is
iteratively changed according to the gradient regarding the error
measure and a step size $\gamma$. For this all parameters are
initialized (often randomly or close to zero) and then iteratively
updated until a certain stopping criterion is hit, mostly either being a fixed
number of iterations or a desired upper limit for the error measure.
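A minimal sketch of this procedure in NumPy (the quadratic toy
objective and all names are illustrative):
\begin{verbatim}
import numpy as np

def gradient_descent(grad, theta0, gamma=0.1, max_iter=1000, tol=1e-8):
    theta = np.asarray(theta0, dtype=float)   # initialization
    for _ in range(max_iter):                 # fixed iteration budget
        g = grad(theta)
        theta = theta - gamma * g             # step against the gradient
        if np.linalg.norm(g) < tol:           # alternative stopping criterion
            break
    return theta

# toy example: L(theta) = ||theta - 3||^2 with gradient 2 * (theta - 3)
print(gradient_descent(lambda t: 2.0 * (t - 3.0), theta0=[0.0, 0.0]))
\end{verbatim}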
% For a function $f_\theta$ with parameters $\theta \in \mathbb{R}^n$ % For a function $f_\theta$ with parameters $\theta \in \mathbb{R}^n$
% and a error function $L(f_\theta)$ the gradient descent algorithm is % and a error function $L(f_\theta)$ the gradient descent algorithm is
@ -450,6 +478,7 @@ introduced by \textcite{backprop}.
\[ \[
\frac{\partial L(...)}{} \frac{\partial L(...)}{}
\] \]
\todo{write out backpropagation properly}
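A rough sketch of the chain rule recursion that backpropagation is
based on (generic layer-wise notation, not the notation used in this
thesis): for pre-activations $z^{(l)} = W^{(l)} a^{(l-1)} + b^{(l)}$
and activations $a^{(l)} = \sigma(z^{(l)})$ the gradients of the loss
$L$ satisfy
\begin{align*}
  \delta^{(l)}_j &\coloneqq \frac{\partial L}{\partial z^{(l)}_j}
    = \sigma'\left(z^{(l)}_j\right) \sum_k W^{(l+1)}_{kj}\,\delta^{(l+1)}_k,\\
  \frac{\partial L}{\partial W^{(l)}_{jk}} &= \delta^{(l)}_j\, a^{(l-1)}_k,
  \qquad
  \frac{\partial L}{\partial b^{(l)}_j} = \delta^{(l)}_j,
\end{align*}
so all gradients can be computed in one backward pass through the layers.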
%%% Local Variables: %%% Local Variables:
%%% mode: latex %%% mode: latex

@ -34,7 +34,7 @@
\usepackage{todonotes} \usepackage{todonotes}
\usepackage{lipsum} \usepackage{lipsum}
\usepackage[ruled,vlined]{algorithm2e} \usepackage[ruled,vlined]{algorithm2e}
%\usepackage{showframe} \usepackage{showframe}
\usepackage[protrusion=true, expansion=true, kerning=true, letterspace \usepackage[protrusion=true, expansion=true, kerning=true, letterspace
= 150]{microtype} = 150]{microtype}
\usepackage{titlecaps} \usepackage{titlecaps}
@ -113,6 +113,8 @@
\newpage \newpage
%\setcounter{tocdepth}{4} %\setcounter{tocdepth}{4}
\tableofcontents \tableofcontents
\clearpage
\listoffigures
\listoftodos \listoftodos
\newpage \newpage
\pagenumbering{arabic} \pagenumbering{arabic}

@ -6,14 +6,15 @@
%%% End: %%% End:
\section{Shallow Neural Networks} \section{Shallow Neural Networks}
% In order to get some understanding of the behavior of neural
% networks we study a simplified class of networks called shallow neural
% networks in this chapter.
% We consider shallow neural networks consisting of a single
% hidden layer and
In order to get some understanding of the behavior of neural networks
we examine a simple class of networks in this chapter. We consider
networks that contain only one hidden layer and have a single output
node. We call these networks shallow neural networks.
\begin{Definition}[Shallow neural network] \begin{Definition}[Shallow neural network]
  For an input dimension $d$ and a Lipschitz continuous activation function $\sigma:
  \mathbb{R} \to \mathbb{R}$ we define a shallow neural network with
@ -84,15 +85,16 @@ with
% \end{figure} % \end{figure}
As neural networks with a large amount of nodes have a large amount of
parameters that can be tuned, they can often fit the data quite well. If
a ReLU activation function
\[
  \sigma(x) \coloneqq \max{(0, x)}
\]
is chosen, one can easily prove that if the
amount of hidden nodes exceeds the
amount of data points in the training data, a shallow network trained
on MSE will perfectly fit the data.
\begin{Theorem}[Shallow neural network can fit data perfectly]
  For training data of size $t$
\[ \[
\left(x_i^{\text{train}}, y_i^{\text{train}}\right) \in \mathbb{R}^d \left(x_i^{\text{train}}, y_i^{\text{train}}\right) \in \mathbb{R}^d
@ -150,17 +152,18 @@ on MSE will perfectly fit the data.
\label{theo:overfit} \label{theo:overfit}
\end{Theorem} \end{Theorem}
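A small numerical illustration of this statement (a NumPy sketch; the
random placement of the kinks differs from the explicit construction
used in the proof):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
t, n = 10, 20                              # training points, hidden nodes
x = np.sort(rng.uniform(-np.pi, np.pi, t))
y = np.sin(x) + rng.normal(0, 0.4, t)

xi = rng.uniform(-np.pi, np.pi, n)         # kink positions in the data range
v = rng.choice([-1.0, 1.0], n) * rng.uniform(0.5, 1.5, n)
b = -xi * v                                # b_k + v_k * xi_k = 0
Phi = np.maximum(0.0, x[:, None] * v + b)  # hidden layer outputs, t x n

w, *_ = np.linalg.lstsq(Phi, y, rcond=None)  # fit output weights only
print(np.mean((Phi @ w - y) ** 2))           # training MSE is (numerically) zero
\end{verbatim}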
However this behavior is often not desired as overfitted models generally
have bad generalization properties, especially if noise is present in
the data. This effect is illustrated in
Figure~\ref{fig:overfit}. Here a shallow neural network that perfectly fits the
training data regarding MSE is
constructed according to the proof of Theorem~\ref{theo:overfit}\todo{phrasing} and
compared to a regression spline
(Definition~\ref{def:wrs}). While the neural network
fits the data better than the spline, the spline represents the
underlying mechanism that was used to generate the data more accurately. The better
generalization of the spline compared to the network is further
demonstrated by the better validation error computed on newly generated
test data.
In order to improve the accuracy of the model we want to reduce In order to improve the accuracy of the model we want to reduce
overfitting. A possible way to achieve this is by explicitly overfitting. A possible way to achieve this is by explicitly
@ -168,7 +171,7 @@ regularizing the network through the cost function as done with
ridge penalized networks ridge penalized networks
(Definition~\ref{def:rpnn}) where large weights $w$ are punished. In (Definition~\ref{def:rpnn}) where large weights $w$ are punished. In
Theorem~\ref{theo:main1} we will Theorem~\ref{theo:main1} we will
prove that this will result in the shallow neural network converging to
regression splines as the amount of nodes in the hidden layer is
increased.
@ -205,7 +208,7 @@ plot coordinates {
\addlegendentry{\footnotesize{spline}}; \addlegendentry{\footnotesize{spline}};
\end{axis} \end{axis}
\end{tikzpicture} \end{tikzpicture}
  \caption[Overfitting of shallow neural networks]{For data of the form $y=\sin(\frac{x+\pi}{2 \pi}) +
\varepsilon,~ \varepsilon \sim \mathcal{N}(0,0.4)$ \varepsilon,~ \varepsilon \sim \mathcal{N}(0,0.4)$
(\textcolor{blue}{blue dots}) the neural network constructed (\textcolor{blue}{blue dots}) the neural network constructed
according to the proof of Theorem~\ref{theo:overfit} (black) and the according to the proof of Theorem~\ref{theo:overfit} (black) and the
@ -224,14 +227,24 @@ plot coordinates {
Networks} Networks}
This section is based on \textcite{heiss2019}. We will analyze the
connection between regression splines and randomized shallow
Neural Networks with one dimensional input that use a ReLU as
activation function for all neurons.
% \[
%   \sigma(x) = \max\left\{0,x\right\}.
% \]
We will see that penalizing the size of the weights in training
the randomized shallow
Neural Network will result in a learned function that minimizes the second
derivative as the amount of hidden nodes is grown to infinity. In order
to properly formulate this relation we will first need to introduce
some definitions; all neural networks introduced in the following will
use a ReLU as activation at all neurons.

A randomized shallow network is characterized by only the weight
parameters of the output layer being trainable, whereas the other
parameters are random numbers.
\begin{Definition}[Randomized shallow neural network] \begin{Definition}[Randomized shallow neural network]
For an input dimension $d$, let $n \in \mathbb{N}$ be the number of For an input dimension $d$, let $n \in \mathbb{N}$ be the number of
@ -244,11 +257,20 @@ some definitions.
\] \]
\label{def:rsnn} \label{def:rsnn}
\end{Definition} \end{Definition}
We call a one dimensional randomized shallow neural network where the
$L^2$ norm of the trainable weights $w$ is penalized in the loss
function a ridge penalized neural network.
% We call a randomized shallow neural network where the size of the trainable
% weights is punished in the error function a ridge penalized
% neural network. For a tuning parameter $\tilde{\lambda}$ .. the extent
% of penalization we get:
\begin{Definition}[Ridge penalized Neural Network] \begin{Definition}[Ridge penalized Neural Network]
\label{def:rpnn} \label{def:rpnn}
  Let $\mathcal{RN}_{w, \omega}$ be a randomized shallow neural
  network, as introduced in Definition~\ref{def:rsnn}, and let
  $\tilde{\lambda} \in \mathbb{R}$ be a tuning parameter. Then the optimal ridge
  penalized
  network is given by
\[ \[
\mathcal{RN}^{*, \tilde{\lambda}}_{\omega}(x) \coloneqq \mathcal{RN}^{*, \tilde{\lambda}}_{\omega}(x) \coloneqq
@ -263,9 +285,8 @@ some definitions.
\tilde{\lambda} \norm{w}_2^2\right\}}_{\eqqcolon F_n^{\tilde{\lambda}}(\mathcal{RN}_{w,\omega})}. \tilde{\lambda} \norm{w}_2^2\right\}}_{\eqqcolon F_n^{\tilde{\lambda}}(\mathcal{RN}_{w,\omega})}.
\] \]
\end{Definition} \end{Definition}
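Since only the output weights $w$ are trainable, minimizing
$F_n^{\tilde{\lambda}}$ amounts to a ridge regression on the realized
hidden-layer features, which for small examples can be solved in
closed form. A minimal sketch (assuming the squared training error is
summed without an additional normalizing factor; the variable names
are illustrative):
\begin{verbatim}
import numpy as np

def ridge_output_weights(Phi, y, tlam):
    # minimize ||Phi w - y||^2 + tlam * ||w||^2 in closed form
    n = Phi.shape[1]
    return np.linalg.solve(Phi.T @ Phi + tlam * np.eye(n), Phi.T @ y)

# Phi[i, k] = sigma(b_k + v_k * x_i) for the realized random parameters,
# y contains the training responses, tlam is the tuning parameter
\end{verbatim}
For $\tilde{\lambda} \to 0$ (and $n$ larger than the number of training
samples) this solution approaches the minimum norm interpolant, which
corresponds to the minimum norm network discussed next.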
If the amount of hidden nodes $n$ is larger than the amount of
training samples $N$, then for
$\tilde{\lambda} \to 0$ the network will interpolate the data while
having minimal weights, resulting in the \textit{minimum norm
network} $\mathcal{RN}_{w^{\text{min}}, \omega}$.
@ -280,15 +301,109 @@ having minimal weights, resulting in the \textit{minimum norm
\left\{1,\dots,N\right\}. \left\{1,\dots,N\right\}.
\] \]
For $\tilde{\lambda} \to \infty$ the learned
function will resemble the data less and less and, with the weights
approaching $0$, will converge to the constant $0$ function.

In order to make the notation more convenient, in the following the
$\omega$ used to express the realised random parameters will no longer
be explicitly mentioned.
We call a function that minimizes the squared distance between the
training values and the function, penalized by the integral of the
squared second derivative of the function, a regression spline.
\begin{Definition}[Regression Spline]
  Let $x_i^{\text{train}}, y_i^{\text{train}} \in \mathbb{R}, i \in
  \left\{1,\dots,N\right\}$ be training data. For a given $\lambda \in
  \mathbb{R}$ the regression spline is given by
\[
f^{*,\lambda} :\in \argmin_{f \in
\mathcal{C}^2}\left\{\sum_{i=1}^N
\left(f\left(x_i^{\text{train}}\right) -
y_i^{\text{train}}\right)^2 + \lambda \int f^{''}(x)^2dx\right\}.
\]
\end{Definition}
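For one dimensional data this minimization problem is known to be
solved by a natural cubic smoothing spline, for which implementations
are readily available; a minimal sketch, assuming SciPy version 1.10
or newer (the function name and its \texttt{lam} argument belong to
that library and are not part of the definition above):
\begin{verbatim}
import numpy as np
from scipy.interpolate import make_smoothing_spline

x = np.linspace(-np.pi, np.pi, 15)                # strictly increasing
y = np.sin(x) + np.random.default_rng(1).normal(0, 0.4, x.size)

spline = make_smoothing_spline(x, y, lam=1.0)     # penalty weight lambda
print(spline(0.0))                                # evaluate the fit
\end{verbatim}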
We will show that for specific hyperparameters the ridge penalized
shallow neural networks converge to a slightly modified variant of the
regression spline. We will need to incorporate the densities of the
random parameters in the loss function of the spline to ensure
convergence. Thus we define
the adapted weighted regression spline, where the loss for the second
derivative is weighted by a function $g$ and the support of the second
derivative of $f$ has to be a subset of the support of $g$. The formal
definition is given in Definition~\ref{def:wrs}.
% We will later ... the converging .. of the ridge penalized shallow
% neural network, in order to do so we will need a slightly modified
% version of the regression
% spline that allows for weighting the penalty term for the second
% derivative with a weight function $g$. This is needed to ...the
% distributions of the random parameters ... We call this the adapted
% weighted regression spline.
% Now we take a look at weighted regression splines. Later we will prove
% that the ridge penalized neural network as defined in
% Definition~\ref{def:rpnn} converges a weighted regression spline, as
% the amount of hidden nodes is grown to inifity.
\begin{Definition}[Adapted Weighted regression spline]
\label{def:wrs}
  Let $x_i^{\text{train}}, y_i^{\text{train}} \in \mathbb{R}, i \in
  \left\{1,\dots,N\right\}$ be training data. For a given $\lambda \in \mathbb{R}_{>0}$
and a function $g: \mathbb{R} \to \mathbb{R}_{>0}$ the weighted
regression spline $f^{*, \lambda}_g$ is given by
\[
f^{*, \lambda}_g :\in \argmin_{\substack{f \in \mathcal{C}^2(\mathbb{R})
\\ \supp(f'') \subseteq \supp(g)}} \underbrace{\left\{ \overbrace{\sum_{i =
1}^N \left(f(x_i^{\text{train}}) - y_i^{\text{train}}\right)^2}^{L(f)} +
\lambda g(0) \int_{\supp(g)}\frac{\left(f''(x)\right)^2}{g(x)}
dx\right\}}_{\eqqcolon F^{\lambda, g}(f)}.
\]
  \todo{requirement on the derivative of f, or not after all?}
\end{Definition}
Similarly to ridge weight penalized neural networks the parameter
$\lambda$ controls a trade-off between accuracy on the training data
and smoothness or low second derivative. For $g \equiv 1$ and $\lambda \to 0$ the
resulting function $f^{*, 0+}$ will interpolate the training data while minimizing
the second derivative. Such a function is known as cubic spline
interpolation.
\todo{cite cubic spline}
\[
  f^{*, 0+} \coloneqq \lim_{\lambda \to 0+} f^{*, \lambda}_1 \in
  \argmin_{\substack{f \in \mathcal{C}^2(\mathbb{R}), \\ f(x_i^{\text{train}}) =
      y_i^{\text{train}}}} \left( \int _{\mathbb{R}} (f''(x))^2dx\right).
\]
For $\lambda \to \infty$ on the other hand $f_g^{*,\lambda}$ converges
to linear regression of the data.
We use two intermediary functions in order to show the convergence of
the ridge penalized shallow neural network to adapted regression splines.
% In order to show that ridge penalized shallow neural networks converge
% to adapted regression splines for a growing amount of hidden nodes we
% define two intermediary functions.
One of them is a smooth approximation of
the neural network, the other is a randomized shallow neural network
designed to approximate a spline.
In order to properly construct these functions we need to take into
consideration the points where the first derivative of the learned
function changes abruptly.
As we use the ReLU activation, the function learned by the
network possesses such points wherever a neuron in the hidden
layer switches from being inactive to being active (i.e. $b_k + v_k x$
changes sign from negative to positive). We formalize these points
as kinks in Definition~\ref{def:kink}.
\begin{Definition} \begin{Definition}
\label{def:kink} \label{def:kink}
Let $\mathcal{RN}_w$ be a randomized shallow Neural Let $\mathcal{RN}_w$ be a randomized shallow Neural
  Network according to Definition~\ref{def:rsnn}, then kinks depending
  on the random parameters can
be observed. be observed.
\[ \[
\mathcal{RN}_w(x) = \sum_{k = 1}^n w_k \sigma(b_k + v_kx) \mathcal{RN}_w(x) = \sum_{k = 1}^n w_k \sigma(b_k + v_kx)
@ -307,15 +422,14 @@ be explizitly mentioned.
\end{enumerate} \end{enumerate}
\end{Definition} \end{Definition}
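Assuming the kinks are the points where the argument of the ReLU
changes sign, i.e. $\xi_k = -\nicefrac{b_k}{v_k}$, they can be read off
directly from the realized random parameters; a small sketch:
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(2)
n = 5
v = rng.normal(size=n)        # realized inner weights
b = rng.normal(size=n)        # realized biases

xi = -b / v                   # kink positions, b_k + v_k * xi_k = 0
print(np.sort(xi))
\end{verbatim}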
Using the density of the kinks we construct a kernel and smooth the
network by applying the kernel similarly to a convolution.
\begin{Definition}[Smooth Approximation of Randomized Shallow Neural \begin{Definition}[Smooth Approximation of Randomized Shallow Neural
Network] Network]
\label{def:srsnn} \label{def:srsnn}
Let $RS_{w}$ be a randomized shallow Neural Network according to Let $RS_{w}$ be a randomized shallow Neural Network according to
  Definition~\ref{def:rsnn} with weights $w$ and kinks $\xi_k$ with
corresponding kink density $g_{\xi}$ as given by corresponding kink density $g_{\xi}$ as given by
Definition~\ref{def:kink}. Definition~\ref{def:kink}.
  In order to smooth the RSNN consider the following kernel for every $x$:
@ -338,53 +452,19 @@ satisfies $\int_{\mathbb{R}}\kappa_x dx = 1$. While $f^w$ looks highly
similar to a convolution, it differs slightly as the kernel $\kappa_x(s)$ similar to a convolution, it differs slightly as the kernel $\kappa_x(s)$
is dependent on $x$. Therefore only $f^w = (\mathcal{RN}_w * is dependent on $x$. Therefore only $f^w = (\mathcal{RN}_w *
\kappa_x)(x)$ is well defined, while $\mathcal{RN}_w * \kappa$ is not. \kappa_x)(x)$ is well defined, while $\mathcal{RN}_w * \kappa$ is not.
We use $f^{w^{*,\tilde{\lambda}}}$ to describe the smooth approximation
of the optimal ridge penalized network
$\mathcal{RN}^{*,\tilde{\lambda}}$.
Next we construct a randomized shallow neural network which
approximates a spline independently of the realization of the random
parameters. In order to achieve this the weights are chosen depending
on the second derivative of the spline at the kinks and the density of
the kinks.
\begin{Definition}[Spline approximating Randomised Shallow Neural \begin{Definition}[Spline approximating Randomised Shallow Neural
Network] Network]
\label{def:sann} \label{def:sann}
Let $\mathcal{RN}$ be a randomised shallow Neural Network according Let $\mathcal{RN}$ be a randomised shallow Neural Network according
  to Definition~\ref{def:rsnn} and $f^{*, \lambda}_g$ be the weighted
regression spline as introduced in Definition~\ref{def:wrs}. Then regression spline as introduced in Definition~\ref{def:wrs}. Then
the randomised shallow neural network approximating $f^{*, the randomised shallow neural network approximating $f^{*,
\lambda}_g$ is given by \lambda}_g$ is given by
@ -399,9 +479,8 @@ to linear regression of the data.
\end{Definition} \end{Definition}
The approximating nature of the network in The approximating nature of the network in
Definition~\ref{def:sann} can be seen by examining the first
derivative of $\mathcal{RN}_{\tilde{w}}(x)$, which is given by
\begin{align} \begin{align}
  \frac{\partial \mathcal{RN}_{\tilde{w}}}{\partial x}
  \Big{|}_{x} &= \sum_k^n \tilde{w}_k \mathds{1}_{\left\{b_k + v_k x >
@ -411,16 +490,18 @@ by
  \xi_k < x}} \frac{v_k^2}{g_{\xi}(\xi_k) \mathbb{E}[v^2 \vert \xi
  = \xi_k]} (f_g^{*, \lambda})''(\xi_k). \label{eq:derivnn}
\end{align} \end{align}
\todo{proper derivative notation}
As the expression (\ref{eq:derivnn}) behaves similarly to a
Riemann-sum for $n \to \infty$, it will converge in probability to the
first derivative of $f^{*,\lambda}_g$. A formal proof of this behaviour
is given in Lemma~\ref{lem:s0}.
In order to ensure the functions used in the proof of the convergence
are well defined, we need to assume some properties of the random
parameters and their densities.
In order to formulate the theorem describing the convergence of $RN_w$ % In order to formulate the theorem describing the convergence of $RN_w$
we need to make a couple of assumptions. % we need to make a couple of assumptions.
\todo{Bessere Formulierung} % \todo{Bessere Formulierung}
\begin{Assumption}~ \begin{Assumption}~
\label{ass:theo38} \label{ass:theo38}
@ -440,8 +521,8 @@ we need to make a couple of assumptions.
\end{enumerate} \end{enumerate}
\end{Assumption} \end{Assumption}
As we will prove the convergence in the Sobolev space, we hereby
introduce it and the corresponding induced norm.
\begin{Definition}[Sobolev Space] \begin{Definition}[Sobolev Space]
For $K \subset \mathbb{R}^n$ open and $1 \leq p \leq \infty$ we For $K \subset \mathbb{R}^n$ open and $1 \leq p \leq \infty$ we
@ -473,9 +554,10 @@ introduce it and its inuced\todo{richtiges wort?} norm.
\] \]
\end{Definition} \end{Definition}
With the important definitions and assumptions in place we can now
formulate the main theorem describing the convergence of ridge penalized
random neural networks to adapted regression splines when the
parameters are chosen accordingly.
\begin{Theorem}[Ridge weight penalty corresponds to weighted regression spline]
\label{theo:main1} \label{theo:main1}
@ -498,7 +580,8 @@ With these assumption in place we can formulate the main theorem.
\tilde{\lambda} & \coloneqq \lambda n g(0). \tilde{\lambda} & \coloneqq \lambda n g(0).
\end{align*} \end{align*}
\end{Theorem} \end{Theorem}
As mentioned above we will prove Theorem~\ref{theo:main1} utilizing
the intermediary functions introduced above. We show that
\begin{equation} \begin{equation}
\label{eq:main2} \label{eq:main2}
\plimn \norm{\mathcal{RN}^{*, \tilde{\lambda}} - f^{w^*}}_{W^{1, \plimn \norm{\mathcal{RN}^{*, \tilde{\lambda}} - f^{w^*}}_{W^{1,
@ -509,10 +592,10 @@ and
\label{eq:main3} \label{eq:main3}
\plimn \norm{f^{w^*} - f_g^{*, \lambda}}_{W^{1,\infty}(K)} = 0 \plimn \norm{f^{w^*} - f_g^{*, \lambda}}_{W^{1,\infty}(K)} = 0
\end{equation} \end{equation}
and then get (\ref{eq:main1}) using the triangle inequality. In
order to prove (\ref{eq:main2}) and (\ref{eq:main3}) we will need to
introduce a number of auxiliary lemmata; the proofs of these will be
provided in the appendix.
@ -534,7 +617,7 @@ provided in the appendix, as they would SPRENGEN DEN RAHMEN.
\exists C_K^2 \in \mathbb{R}_{>0} : \norm{f}_{W^{1,\infty}(K)} \leq \exists C_K^2 \in \mathbb{R}_{>0} : \norm{f}_{W^{1,\infty}(K)} \leq
C_K^2 \norm{f''}_{L^2(K)}. C_K^2 \norm{f''}_{L^2(K)}.
\end{equation*} \end{equation*}
  \proof The proof is given in the appendix...
% With the fundamental theorem of calculus, if % With the fundamental theorem of calculus, if
% \(\norm{f}_{L^{\infty}(K)}<\infty\) we get % \(\norm{f}_{L^{\infty}(K)}<\infty\) we get
% \begin{equation} % \begin{equation}
@ -584,7 +667,7 @@ provided in the appendix, as they would SPRENGEN DEN RAHMEN.
\mathbb{E}\left[\varphi(\xi, v) \vert \xi = x \right] dx \mathbb{E}\left[\varphi(\xi, v) \vert \xi = x \right] dx
\] \]
uniformly in \(T \in K\). uniformly in \(T \in K\).
  \proof The proof is given in the appendix...
% For \(T \leq C_{g_{\xi}}^l\) both sides equal 0, so it is sufficient to % For \(T \leq C_{g_{\xi}}^l\) both sides equal 0, so it is sufficient to
% consider \(T > C_{g_{\xi}}^l\). With \(\varphi\) and % consider \(T > C_{g_{\xi}}^l\). With \(\varphi\) and
% \(\nicefrac{1}{g_{\xi}}\) uniformly continous in \(\xi\), % \(\nicefrac{1}{g_{\xi}}\) uniformly continous in \(\xi\),
@ -620,7 +703,7 @@ provided in the appendix, as they would SPRENGEN DEN RAHMEN.
% \kappa : \xi_m \in [\delta l, \delta(l + 1)]\right\}}}{\abs{\left\{m \in % \kappa : \xi_m \in [\delta l, \delta(l + 1)]\right\}}}{\abs{\left\{m \in
% \kappa : \xi_m \in [\delta l, \delta(l + 1)]\right\}}}}_{= % \kappa : \xi_m \in [\delta l, \delta(l + 1)]\right\}}}}_{=
% 1}\right) \\ % 1}\right) \\
% % \intertext{} % \intertext{}
% &= \sum_{l \in I_{\delta}} \left( \frac{ \sum_{ \substack{k \in \kappa\\ % &= \sum_{l \in I_{\delta}} \left( \frac{ \sum_{ \substack{k \in \kappa\\
% \xi_k \in \left[\delta l, \delta (l + 1)\right]}} % \xi_k \in \left[\delta l, \delta (l + 1)\right]}}
% \varphi\left(l\delta, v_k\right)} % \varphi\left(l\delta, v_k\right)}
@ -685,6 +768,7 @@ provided in the appendix, as they would SPRENGEN DEN RAHMEN.
By the fundamental theorem of calculus and $\supp(f') \subset By the fundamental theorem of calculus and $\supp(f') \subset
\supp(f)$, (\ref{eq:s0}) follows with Lemma~\ref{lem:pieq}. \supp(f)$, (\ref{eq:s0}) follows with Lemma~\ref{lem:pieq}.
\qed \qed
\label{lem:s0}
\end{Lemma} \end{Lemma}
\begin{Lemma}[Step 2] \begin{Lemma}[Step 2]
@ -696,19 +780,22 @@ provided in the appendix, as they would SPRENGEN DEN RAHMEN.
F^{\lambda, g}(f^{*, \lambda}_g) = 0. F^{\lambda, g}(f^{*, \lambda}_g) = 0.
\] \]
  \proof
  The proof is given in the appendix...
\label{lem:s2}
\end{Lemma} \end{Lemma}
\begin{Lemma}[Step 3] \begin{Lemma}[Step 3]
For any $\lambda > 0$ and training data $(x_i^{\text{train}}, For any $\lambda > 0$ and training data $(x_i^{\text{train}},
y_i^{\text{train}}) \in \mathbb{R}^2, \, i \in y_i^{\text{train}}) \in \mathbb{R}^2, \, i \in
  \left\{1,\dots,N\right\}$, with $w^*$ as
  defined in Definition~\ref{def:rpnn} and $\tilde{\lambda}$ as
  defined in Theorem~\ref{theo:main1}, it holds
\[ \[
\plimn \norm{\mathcal{RN}^{*,\tilde{\lambda}} - \plimn \norm{\mathcal{RN}^{*,\tilde{\lambda}} -
f^{w*, \tilde{\lambda}}}_{W^{1,\infty}(K)} = 0. f^{w*, \tilde{\lambda}}}_{W^{1,\infty}(K)} = 0.
\] \]
  \proof The proof is given in the appendix...
\label{lem:s3}
\end{Lemma} \end{Lemma}
\begin{Lemma}[Step 4] \begin{Lemma}[Step 4]
@ -718,9 +805,11 @@ provided in the appendix, as they would SPRENGEN DEN RAHMEN.
  defined in Definition~\ref{def:rpnn} and Theorem~\ref{theo:main1}
respectively, it holds respectively, it holds
\[ \[
    \plimn \abs{F_n^{\tilde{\lambda}}(\mathcal{RN}^{*,\tilde{\lambda}}) -
F^{\lambda, g}(f^{w*, \tilde{\lambda}})} = 0. F^{\lambda, g}(f^{w*, \tilde{\lambda}})} = 0.
\] \]
\proof The proof is given in appendix...
\label{lem:s4}
\end{Lemma} \end{Lemma}
\begin{Lemma}[Step 7] \begin{Lemma}[Step 7]
@ -735,11 +824,81 @@ provided in the appendix, as they would SPRENGEN DEN RAHMEN.
\[ \[
\plimn \norm{f^n - f^{*, \lambda}} = 0. \plimn \norm{f^n - f^{*, \lambda}} = 0.
\] \]
\proof The proof is given in appendix ...
\label{lem:s7}
\end{Lemma} \end{Lemma}
Using these lemmata we can now prove Theorem~\ref{theo:main1}. We
start by showing that the error measure of the smooth approximation of
the ridge penalized randomized shallow neural network, $F^{\lambda,
  g}\left(f^{w^{*,\tilde{\lambda}}}\right)$,
will converge in probability to the error measure of the adapted weighted regression
spline, $F^{\lambda, g}\left(f^{*,\lambda}_g\right)$, for the specified
parameters.
Using Lemma~\ref{lem:s4} we get that for every $P \in (0,1)$ and
$\varepsilon > 0$ there exists an $n_1 \in \mathbb{N}$ such that
\[
\mathbb{P}\left[F^{\lambda, g}\left(f^{w^{*,\tilde{\lambda}}}\right) \in
F_n^{\tilde{\lambda}}\left(\mathcal{RN}^{*,\tilde{\lambda}}\right)
+[-\varepsilon, \varepsilon]\right] > P, \forall n \in \mathbb{N}_{> n_1}.
\]
As $\mathcal{RN}^{*,\tilde{\lambda}}$ is the optimal network for
$F_n^{\tilde{\lambda}}$ we know that
\[
F_n^{\tilde{\lambda}}\left(\mathcal{RN}^{*,\tilde{\lambda}}\right)
\leq F_n^{\tilde{\lambda}}\left(\mathcal{RN}_{\tilde{w}}\right).
\]
Using Lemma~\ref{lem:s2} we get that for every $P \in (0,1)$ and
$\varepsilon > 0$ there exists an $n_2 \in \mathbb{N}$ such that
\[
\mathbb{P}\left[F_n^{\tilde{\lambda}}\left(\mathcal{RN}_{\tilde{w}}\right)
\in F^{\lambda, g}\left(f^{*,\lambda}_g\right)+[-\varepsilon,
\varepsilon]\right] > P, \forall n \in \mathbb{N}_{> n_2}.
\]
If we combine these two bounds we get that for every $P \in (0,1)$
and $\varepsilon > 0$, with $n_3 \coloneqq
\max\left\{n_1,n_2\right\}$,
\[
\mathbb{P}\left[F^{\lambda,
g}\left(f^{w^{*,\tilde{\lambda}}}\right) \leq F^{\lambda,
g}\left(f^{*,\lambda}_g\right)+2\varepsilon\right] > P, \forall
n \in \mathbb{N}_{> n_3}.
\]
As $f^{w^{*,\tilde{\lambda}}}$ is contained in the set of functions the
adapted weighted regression spline minimizes over and $f^{*,\lambda}_g$
is optimal, we know that
\[
F^{\lambda, g}\left(f^{*,\lambda}_g\right) \leq F^{\lambda, g}\left(f^{w^{*,\tilde{\lambda}}}\right)
\]
and thus get with the squeeze theorem
\[
\plimn F^{\lambda, g}\left(f^{w^{*,\tilde{\lambda}}}\right) = F^{\lambda, g}\left(f^{*,\lambda}_g\right).
\]
We can now use Lemma~\ref{lem:s7} to conclude that
\[
  \plimn \norm{f^{w^{*,\tilde{\lambda}}} - f^{*,\lambda}_g}
  _{W^{1,\infty}} = 0,
\]
which is (\ref{eq:main3}).
Now by using the triangle inequality with Lemma~\ref{lem:s3} and
(\ref{eq:main3}) we get
\begin{align*}
  \plimn \norm{\mathcal{RN}^{*, \tilde{\lambda}} - f_g^{*,\lambda}}_{W^{1,\infty}}
  \leq& \plimn \bigg(\norm{\mathcal{RN}^{*, \tilde{\lambda}} -
          f^{w^{*,\tilde{\lambda}}}}_{W^{1,\infty}}\\
        &+ \norm{f^{w^{*,\tilde{\lambda}}} - f^{*,\lambda}_g}
          _{W^{1,\infty}}\bigg) = 0
\end{align*}
and thus have proven Theorem~\ref{theo:main1}.
We now know that randomized shallow neural networks behave similarly to
spline regression if we regularize the size of the weights during
training.
\textcite{heiss2019} further explore a connection between ridge penalized
networks and randomized shallow neural networks which are
only trained for a certain number of epochs using gradient
descent.
They argue that the effect of weight regularization can be achieved by
training for a limited number of iterations, which results in a similar
connection between adapted weighted regression splines and randomized
shallow neural networks where training is stopped early.
\newpage \newpage
\subsection{Simulations} \subsection{Simulations}
@ -755,7 +914,7 @@ data have been generated.
y_{i, A}^{\text{train}} &\coloneqq \sin( x_{i, A}^{\text{train}}). \phantom{(i - 1), y_{i, A}^{\text{train}} &\coloneqq \sin( x_{i, A}^{\text{train}}). \phantom{(i - 1),
i \in \left\{1, \dots, 6\right\}} i \in \left\{1, \dots, 6\right\}}
\end{align*} \end{align*}
\item $\text{data}_B = (x_{i, B}^{\text{train}}, y_{i,
B}^{\text{train}})$ with B}^{\text{train}})$ with
\begin{align*} \begin{align*}
x_{i, B}^{\text{train}} &\coloneqq \pi\frac{i - 8}{7}, x_{i, B}^{\text{train}} &\coloneqq \pi\frac{i - 8}{7},
@ -785,9 +944,9 @@ been calculated with Matlab's ..... As ... minimizes
the smoothing parameter used for fitting is $\bar{\lambda} =
\frac{1}{1 + \lambda}$. The parameter $\tilde{\lambda}$ for training
the networks is chosen as defined in Theorem~\ref{theo:main1} and each
one is trained on the full training data for 5000 epochs using
gradient descent. The
results are given in Figure~\ref{fig:rs_vs_rs}. Here it can be seen that on
the interval of the training data, $[-\pi, \pi]$, the neural network and
smoothing spline are nearly identical, coinciding with the proposition.
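A condensed sketch of the network part of this simulation in NumPy
(the training loop is simplified compared to the actual experiments,
and the value used for $g(0)$ is a placeholder):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(3)
x = np.linspace(-np.pi, np.pi, 15)            # training inputs
y = np.sin(x)                                 # sine data as above

n, lam = 1000, 0.1                            # hidden nodes, spline parameter
v = rng.standard_normal(n)
b = rng.standard_normal(n)
Phi = np.maximum(0.0, np.outer(x, v) + b)     # fixed random ReLU features

g0 = 1.0 / (2.0 * np.pi)                      # placeholder for g(0)
tlam = lam * n * g0                           # tilde lambda as in the theorem
step = 0.9 / (np.linalg.norm(Phi, 2) ** 2 + tlam)

w = np.zeros(n)
for _ in range(5000):                         # plain gradient descent
    w -= step * (2 * Phi.T @ (Phi @ w - y) + 2 * tlam * w)

print(Phi @ w)                                # network values at the x_i
\end{verbatim}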
