new commit
This commit is contained in:
parent
cb9777f037
commit
06d93ef937
4
.gitignore
vendored
4
.gitignore
vendored
@ -27,3 +27,7 @@ main-blx.bib
|
||||
|
||||
# no slurm logs
|
||||
*slurm*.out
|
||||
|
||||
# no plot data
|
||||
*.csv
|
||||
*.mean
|
||||
|
37
TeX/#main.lof#
Normal file
37
TeX/#main.lof#
Normal file
@ -0,0 +1,37 @@
|
||||
|
||||
\boolfalse {citerequest}\boolfalse {citetracker}\boolfalse {pagetracker}\boolfalse {backtracker}\relax
|
||||
\babel@toc {english}{}
|
||||
\defcounter {refsection}{0}\relax
|
||||
\contentsline {figure}{\numberline {2.1}{\ignorespaces Illustration of a neural network}}{2}%
|
||||
\defcounter {refsection}{0}\relax
|
||||
\contentsline {figure}{\numberline {2.2}{\ignorespaces Plots of the activation functions\relax }}{4}%
|
||||
\defcounter {refsection}{0}\relax
|
||||
\contentsline {figure}{\numberline {2.3}{\ignorespaces Structure of a single neuron\relax }}{4}%
|
||||
\defcounter {refsection}{0}\relax
|
||||
\contentsline {figure}{\numberline {3.1}{\ignorespaces Overfitting of shallow neural networks}}{10}%
|
||||
\defcounter {refsection}{0}\relax
|
||||
\contentsline {figure}{\numberline {3.2}{\ignorespaces Comparison of shallow neural networks and regression splines}}{21}%
|
||||
\defcounter {refsection}{0}\relax
|
||||
\contentsline {figure}{\numberline {4.1}{\ignorespaces Signal smoothing using convolution}}{23}%
|
||||
\defcounter {refsection}{0}\relax
|
||||
\contentsline {figure}{\numberline {4.2}{\ignorespaces Channel separation of color image}}{24}%
|
||||
\defcounter {refsection}{0}\relax
|
||||
\contentsline {figure}{\numberline {4.3}{\ignorespaces Convolution applied on image}}{25}%
|
||||
\defcounter {refsection}{0}\relax
|
||||
\contentsline {figure}{\numberline {4.4}{\ignorespaces MNIST data set}}{29}%
|
||||
\defcounter {refsection}{0}\relax
|
||||
\contentsline {figure}{\numberline {4.5}{\ignorespaces architecture\relax }}{29}%
|
||||
\defcounter {refsection}{0}\relax
|
||||
\contentsline {figure}{\numberline {4.6}{\ignorespaces Performance comparison of SGD and GD}}{30}%
|
||||
\defcounter {refsection}{0}\relax
|
||||
\contentsline {figure}{\numberline {4.7}{\ignorespaces Performance comparison of training algorithms}}{35}%
|
||||
\defcounter {refsection}{0}\relax
|
||||
\contentsline {figure}{\numberline {4.8}{\ignorespaces Image data generation}}{37}%
|
||||
\defcounter {refsection}{0}\relax
|
||||
\contentsline {figure}{\numberline {4.9}{\ignorespaces Performance comparison of overfitting measures}}{38}%
|
||||
\defcounter {refsection}{0}\relax
|
||||
\contentsline {figure}{\numberline {4.10}{\ignorespaces Fashion MNIST data set}}{39}%
|
||||
\defcounter {refsection}{0}\relax
|
||||
\contentsline {figure}{\numberline {4.11}{\ignorespaces \relax }}{41}%
|
||||
\defcounter {refsection}{0}\relax
|
||||
\contentsline {figure}{\numberline {4.12}{\ignorespaces Sample pictures of the MNIST fashion dataset, one per class.\relax }}{41}%
|
58
TeX/Plots/Data/min_max.txt
Executable file
58
TeX/Plots/Data/min_max.txt
Executable file
@ -0,0 +1,58 @@
|
||||
datagen_dropout_02_1
|
||||
test
|
||||
0.6604& 0.5175& 0.60136& 0.002348447
|
||||
|
||||
datagen_dropout_00_1
|
||||
test
|
||||
0.6704& 0.4878& 0.58621& 0.003600539
|
||||
|
||||
dropout_02_1
|
||||
test
|
||||
0.5312& 0.4224& 0.47137& 0.001175149
|
||||
|
||||
default_1
|
||||
test
|
||||
0.5633& 0.3230& 0.45702& 0.004021449
|
||||
|
||||
datagen_dropout_02_10
|
||||
test
|
||||
0.9441& 0.9061& 0.92322& 0.00015
|
||||
train
|
||||
1& 0.97& 0.989& 1e-04
|
||||
|
||||
datagen_dropout_00_10
|
||||
test
|
||||
0.931& 0.9018& 0.9185& 6e-05
|
||||
train
|
||||
1& 0.97& 0.99& 0.00013
|
||||
|
||||
dropout_02_10
|
||||
test
|
||||
0.9423& 0.9081& 0.92696& 0.00013
|
||||
train
|
||||
1& 0.99& 0.992& 2e-05
|
||||
|
||||
default_10
|
||||
test
|
||||
0.8585& 0.8148& 0.83771& 0.00027
|
||||
train
|
||||
1& 1& 1& 0
|
||||
|
||||
datagen_dropout_02_100
|
||||
test
|
||||
0.9805& 0.9727& 0.97826& 0
|
||||
train
|
||||
|
||||
datagen_dropout_00_100
|
||||
test
|
||||
0.981& 0.9702& 0.9769& 1e-05
|
||||
train
|
||||
|
||||
dropout_02_100
|
||||
test
|
||||
0.9796& 0.9719& 0.97703& 1e-05
|
||||
train
|
||||
|
||||
default_100
|
||||
test
|
||||
0.9637& 0.9506& 0.95823& 2e-05
|
@ -115,7 +115,9 @@ plot coordinates {
|
||||
\caption{$\lambda = 3.0$}
|
||||
\end{subfigure}
|
||||
\end{subfigure}
|
||||
\caption{% In these Figures the behaviour stated in ... is visualized
|
||||
\caption[Comparison of shallow neural networks and regression
|
||||
splines]{% In these Figures the behaviour stated in ... is
|
||||
% visualized
|
||||
% in two exaples. For $(a), (b), (c)$ six values of sinus equidistantly
|
||||
% spaced on $[-\pi, \pi]$ have been used as training data. For
|
||||
% $(d),(e),(f)$ 15 equidistant values have been used, where
|
||||
@ -131,6 +133,7 @@ plot coordinates {
|
||||
$\text{data}_B$ in d), e), f).
|
||||
The Parameters of each are given above.
|
||||
}
|
||||
\label{fig:rn_vs_rs}
|
||||
\end{figure}
|
||||
%%% Local Variables:
|
||||
%%% mode: latex
|
||||
|
@ -65,7 +65,7 @@ plot coordinates {
|
||||
\caption{Performance metrics during training}
|
||||
\end{subfigure}
|
||||
% \\~\\
|
||||
\caption{The neural network given in ?? trained with different
|
||||
\caption[Performance comparison of SGD and GD]{The neural network given in ?? trained with different
|
||||
algorithms on the MNIST handwritten digits data set. For gradient
|
||||
descent the learning rates 0.01, 0.05 and 0.1 are used (GD$_{\cdot}$). For
|
||||
stochastic gradient descent a batch size of 32 and learning rate
|
||||
|
@ -40,7 +40,7 @@
|
||||
\includegraphics[width=\textwidth]{Plots/Data/fashion_mnist9.pdf}
|
||||
\caption{Ankle boot}
|
||||
\end{subfigure}
|
||||
\caption{The fashtion MNIST data set contains 70.000 images of
|
||||
\caption[Fashion MNIST data set]{The fashion MNIST data set contains 70.000 images of
|
||||
preprocessed product images from Zalando, which are categorized as
|
||||
T-shirt/top, Trouser, Pullover, Dress, Coat, Sandal, Shirt,
|
||||
Sneaker, Bag, Ankle boot. Of these images 60.000 are used as training images, while
|
||||
|
@ -51,7 +51,7 @@ plot coordinates {
|
||||
\begin{tabu} to \textwidth {@{}lc*5{X[c]}@{}}
|
||||
\Tstrut \Bstrut & \textsc{\,Adam\,} & D. 0.2 & D. 0.4 & G. &G.+D.\,0.2 & G.+D.\,0.4 \\
|
||||
\hline
|
||||
\multicolumn{7}{c}{Classification Accuracy}\Bstrut \\
|
||||
\multicolumn{7}{c}{Test Accuracy}\Bstrut \\
|
||||
\cline{2-7}
|
||||
mean \Tstrut & 0.9914 & 0.9923 & 0.9930 & 0.9937 & 0.9938 & 0.9943 \\
|
||||
max & 0.9926 & 0.9930 & 0.9934 & 0.9946 & 0.9955 & 0.9956 \\
|
||||
@ -64,8 +64,9 @@ plot coordinates {
|
||||
min & 0.9992 & 0.9990 & 0.9984 & 0.9947 & 0.9926 & 0.9908 \\
|
||||
\end{tabu}
|
||||
\caption{Mean and maximum accuracy after 48 epochs of training.}
|
||||
\label{fig:gen_dropout_b}
|
||||
\end{subfigure}
|
||||
\caption{Accuracy for the net given in ... with Dropout (D.),
|
||||
\caption[Performance comparison of overfitting measures]{Accuracy for the net given in ... with Dropout (D.),
|
||||
data generation (G.), a combination, or neither (Default) implemented and trained
|
||||
with \textsc{Adam}. For each epoch the 60.000 training samples
|
||||
were used, or for data generation 10.000 steps with each using
|
||||
@ -73,6 +74,7 @@ plot coordinates {
|
||||
model was trained 5 times and the average accuracies at each epoch
|
||||
are given in (a). Mean, maximum and minimum values of accuracy on
|
||||
the test and training set are given in (b).}
|
||||
\label{fig:gen_dropout}
|
||||
\end{figure}
|
||||
%%% Local Variables:
|
||||
%%% mode: latex
|
||||
|
@ -30,7 +30,7 @@
|
||||
\begin{subfigure}{0.19\textwidth}
|
||||
\includegraphics[width=\textwidth]{Plots/Data/mnist9.pdf}
|
||||
\end{subfigure}
|
||||
\caption{The MNIST data set contains 70.000 images of preprocessed handwritten
|
||||
\caption[MNIST data set]{The MNIST data set contains 70.000 images of preprocessed handwritten
|
||||
digits. Of these images 60.000 are used as training images, while
|
||||
the rest are used to validate the models trained.}
|
||||
\label{fig:MNIST}
|
||||
|
@ -5,7 +5,9 @@
|
||||
\usepackage{adjustbox}
|
||||
\usepackage{xcolor}
|
||||
\usepackage{tabu}
|
||||
\usepackage{showframe}
|
||||
\usepackage{graphicx}
|
||||
\usepackage{titlecaps}
|
||||
\usetikzlibrary{calc, 3d}
|
||||
\usepgfplotslibrary{colorbrewer}
|
||||
|
||||
@ -29,33 +31,29 @@ plot coordinates {
|
||||
\begin{tikzpicture}
|
||||
\begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed,
|
||||
/pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth,
|
||||
height = 0.6\textwidth, ymin = 0.988, legend style={at={(0.9825,0.0175)},anchor=south east},
|
||||
xlabel = {epoch}, ylabel = {Classification Accuracy}, cycle
|
||||
list/Dark2, every axis plot/.append style={line width =1.25pt}]
|
||||
height = 0.35\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east},
|
||||
ylabel = {Test Accuracy}, cycle
|
||||
list/Dark2, every axis plot/.append style={line width
|
||||
=1.25pt}]
|
||||
% \addplot [dashed] table
|
||||
% [x=epoch, y=accuracy, col sep=comma, mark = none]
|
||||
% {Data/adam_datagen_full.log};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Data/adam_datagen_full_mean.log};
|
||||
{Data/adam_1.mean};
|
||||
% \addplot [dashed] table
|
||||
% [x=epoch, y=accuracy, col sep=comma, mark = none]
|
||||
% {Data/adam_datagen_dropout_02_full.log};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Data/adam_datagen_dropout_02_full_mean.log};
|
||||
{Data/adam_datagen_1.mean};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Data/adam_datagen_dropout_04_full_mean.log};
|
||||
{Data/adam_datagen_dropout_02_1.mean};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Data/adam_dropout_02_full_mean.log};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Data/adam_dropout_04_full_mean.log};
|
||||
\addplot [dashed] table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Data/adam_full_mean.log};
|
||||
{Data/adam_dropout_02_1.mean};
|
||||
|
||||
|
||||
\addlegendentry{\footnotesize{G.}}
|
||||
\addlegendentry{\footnotesize{G. + D. 0.2}}
|
||||
@ -65,26 +63,72 @@ plot coordinates {
|
||||
\addlegendentry{\footnotesize{Default}}
|
||||
\end{axis}
|
||||
\end{tikzpicture}
|
||||
\caption{Classification accuracy}
|
||||
\vspace{.25cm}
|
||||
\caption{1 sample per class}
|
||||
\vspace{0.25cm}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}[h]{1.0\linewidth}
|
||||
\begin{tabu} to \textwidth {@{}lc*5{X[c]}@{}}
|
||||
\Tstrut \Bstrut & \textsc{\,Adam\,} & D. 0.2 & D. 0.4 & G. &G.+D.\,0.2 & G.+D.\,0.4 \\
|
||||
\hline
|
||||
\multicolumn{7}{c}{Classification Accuracy}\Bstrut \\
|
||||
\cline{2-7}
|
||||
mean \Tstrut & 0.9914 & 0.9923 & 0.9930 & 0.9937 & 0.9938 & 0.9943 \\
|
||||
max & 0.9926 & 0.9930 & 0.9934 & 0.9946 & 0.9955 & 0.9956 \\
|
||||
min & 0.9887 & 0.9909 & 0.9922 & 0.9929 & 0.9929 & 0.9934 \\
|
||||
\hline
|
||||
\multicolumn{7}{c}{Training Accuracy}\Bstrut \\
|
||||
\cline{2-7}
|
||||
mean \Tstrut & 0.9994 & 0.9991 & 0.9989 & 0.9967 & 0.9954 & 0.9926 \\
|
||||
max & 0.9996 & 0.9996 & 0.9992 & 0.9979 & 0.9971 & 0.9937 \\
|
||||
min & 0.9992 & 0.9990 & 0.9984 & 0.9947 & 0.9926 & 0.9908 \\
|
||||
\end{tabu}
|
||||
\caption{Mean and maximum accuracy after 48 epochs of training.}
|
||||
\begin{subfigure}[h]{\textwidth}
|
||||
\begin{tikzpicture}
|
||||
\begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed,
|
||||
/pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth,
|
||||
height = 0.35\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east},
|
||||
ylabel = {Test Accuracy}, cycle
|
||||
list/Dark2, every axis plot/.append style={line width
|
||||
=1.25pt}]
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Data/adam_dropout_00_10.mean};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Data/adam_dropout_02_10.mean};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Data/adam_datagen_dropout_00_10.mean};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Data/adam_datagen_dropout_02_10.mean};
|
||||
|
||||
|
||||
\addlegendentry{\footnotesize{G.}}
|
||||
\addlegendentry{\footnotesize{G. + D. 0.2}}
|
||||
\addlegendentry{\footnotesize{G. + D. 0.4}}
|
||||
\addlegendentry{\footnotesize{D. 0.2}}
|
||||
\addlegendentry{\footnotesize{D. 0.4}}
|
||||
\addlegendentry{\footnotesize{Default}}
|
||||
\end{axis}
|
||||
\end{tikzpicture}
|
||||
\caption{10 samples per class}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}[h]{\textwidth}
|
||||
\begin{tikzpicture}
|
||||
\begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed,
|
||||
/pgf/number format/precision=3},tick style = {draw = none}, width = 0.9875\textwidth,
|
||||
height = 0.35\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east},
|
||||
xlabel = {epoch}, ylabel = {Test Accuracy}, cycle
|
||||
list/Dark2, every axis plot/.append style={line width
|
||||
=1.25pt}, ymin = {0.92}]
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Data/adam_dropout_00_100.mean};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Data/adam_dropout_02_100.mean};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Data/adam_datagen_dropout_00_100.mean};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Data/adam_datagen_dropout_02_100.mean};
|
||||
|
||||
\addlegendentry{\footnotesize{G.}}
|
||||
\addlegendentry{\footnotesize{G. + D. 0.2}}
|
||||
\addlegendentry{\footnotesize{G. + D. 0.4}}
|
||||
\addlegendentry{\footnotesize{D. 0.2}}
|
||||
\addlegendentry{\footnotesize{D. 0.4}}
|
||||
\addlegendentry{\footnotesize{Default}}
|
||||
\end{axis}
|
||||
\end{tikzpicture}
|
||||
\caption{100 samples per class}
|
||||
\vspace{.25cm}
|
||||
\end{subfigure}
|
||||
\caption{Accuracy for the net given in ... with Dropout (D.),
|
||||
data generation (G.), a combination, or neither (Default) implemented and trained
|
||||
@ -95,6 +139,40 @@ plot coordinates {
|
||||
are given in (a). Mean, maximum and minimum values of accuracy on
|
||||
the test and training set are given in (b).}
|
||||
\end{figure}
|
||||
\begin{table}
|
||||
\centering
|
||||
\begin{tabu} to \textwidth {@{}l*4{X[c]}@{}}
|
||||
\Tstrut \Bstrut & \textsc{Adam} & D. 0.2 & Gen & Gen.+D. 0.2 \\
|
||||
\hline
|
||||
&
|
||||
\multicolumn{4}{c}{\titlecap{test accuracy for 1 sample}}\Bstrut \\
|
||||
\cline{2-5}
|
||||
max \Tstrut & 0.5633 & 0.5312 & 0.6704 & 0.6604 \\
|
||||
min & 0.3230 & 0.4224 & 0.4878 & 0.5175 \\
|
||||
mean & 0.4570 & 0.4714 & 0.5862 & 0.6014 \\
|
||||
var & 0.0040 & 0.0012 & 0.0036 & 0.0023 \\
|
||||
\hline
|
||||
&
|
||||
\multicolumn{4}{c}{\titlecap{test accuracy for 10 samples}}\Bstrut \\
|
||||
\cline{2-5}
|
||||
max \Tstrut & 0.8585 & 0.9423 & 0.9310 & 0.9441 \\
|
||||
min & 0.8148 & 0.9081 & 0.9018 & 0.9061 \\
|
||||
mean & 0.8377 & 0.9270 & 0.9185 & 0.9232 \\
|
||||
var & 2.7e-4 & 1.3e-4 & 6e-05 & 1.5e-4 \\
|
||||
\hline
|
||||
&
|
||||
\multicolumn{4}{c}{\titlecap{test accuracy for 100 samples}}\Bstrut \\
|
||||
\cline{2-5}
|
||||
max & 0.9637 & 0.9796 & 0.9810 & 0.9805 \\
|
||||
min & 0.9506 & 0.9719 & 0.9702 & 0.9727 \\
|
||||
mean & 0.9582 & 0.9770 & 0.9769 & 0.9783 \\
|
||||
var & 2e-05 & 1e-05 & 1e-05 & 0 \\
|
||||
\hline
|
||||
\end{tabu}
|
||||
\caption{Values of the test accuracy of the model trained 10 times
|
||||
of random training sets containing 1, 10 and 100 data points per
|
||||
class.}
|
||||
\end{table}
|
||||
|
||||
\begin{center}
|
||||
\begin{figure}[h]
|
||||
|
@ -10,7 +10,7 @@ plot coordinates {
|
||||
}
|
||||
}
|
||||
\begin{figure}
|
||||
\begin{subfigure}[b]{\textwidth}
|
||||
\begin{subfigure}[h]{\textwidth}
|
||||
\begin{tikzpicture}
|
||||
\begin{axis}[tick style = {draw = none}, width = \textwidth,
|
||||
height = 0.6\textwidth, ymin = 0.92, legend style={at={(0.9825,0.75)},anchor=north east},
|
||||
@ -32,30 +32,31 @@ plot coordinates {
|
||||
\end{axis}
|
||||
\end{tikzpicture}
|
||||
%\caption{Classification accuracy}
|
||||
\vspace{.25cm}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}[b]{\textwidth}
|
||||
\begin{tikzpicture}
|
||||
\begin{axis}[tick style = {draw = none}, width = \textwidth,
|
||||
height = 0.6\textwidth, ymax = 0.5,
|
||||
xlabel = {epoch}, ylabel = {Error Measure\vphantom{y}},ytick ={0,0.1,0.2,0.3,0.4,0.45,0.5}, yticklabels =
|
||||
{0,0.1,0.2,0.3,0.4,\phantom{0.94},0.5}]
|
||||
\addplot table
|
||||
[x=epoch, y=val_loss, col sep=comma, mark = none] {Plots/Data/adagrad.log};
|
||||
\addplot table
|
||||
[x=epoch, y=val_loss, col sep=comma, mark = none] {Plots/Data/adadelta.log};
|
||||
\addplot table
|
||||
[x=epoch, y=val_loss, col sep=comma, mark = none] {Plots/Data/adam.log};
|
||||
% \begin{subfigure}[b]{\textwidth}
|
||||
% \begin{tikzpicture}
|
||||
% \begin{axis}[tick style = {draw = none}, width = \textwidth,
|
||||
% height = 0.6\textwidth, ymax = 0.5,
|
||||
% xlabel = {epoch}, ylabel = {Error Measure\vphantom{y}},ytick ={0,0.1,0.2,0.3,0.4,0.45,0.5}, yticklabels =
|
||||
% {0,0.1,0.2,0.3,0.4,\phantom{0.94},0.5}]
|
||||
% \addplot table
|
||||
% [x=epoch, y=val_loss, col sep=comma, mark = none] {Plots/Data/adagrad.log};
|
||||
% \addplot table
|
||||
% [x=epoch, y=val_loss, col sep=comma, mark = none] {Plots/Data/adadelta.log};
|
||||
% \addplot table
|
||||
% [x=epoch, y=val_loss, col sep=comma, mark = none] {Plots/Data/adam.log};
|
||||
|
||||
\addlegendentry{\footnotesize{ADAGRAD}}
|
||||
\addlegendentry{\footnotesize{ADADELTA}}
|
||||
\addlegendentry{\footnotesize{ADAM}}
|
||||
\addlegendentry{SGD$_{0.01}$}
|
||||
% \addlegendentry{\footnotesize{ADAGRAD}}
|
||||
% \addlegendentry{\footnotesize{ADADELTA}}
|
||||
% \addlegendentry{\footnotesize{ADAM}}
|
||||
% \addlegendentry{SGD$_{0.01}$}
|
||||
|
||||
\end{axis}
|
||||
\end{tikzpicture}
|
||||
\caption{Performance metrics during training}
|
||||
\end{subfigure}
|
||||
\\~\\
|
||||
% \end{axis}
|
||||
% \end{tikzpicture}
|
||||
% \caption{Performance metrics during training}
|
||||
% \vspace{.25cm}
|
||||
% \end{subfigure}
|
||||
\begin{subfigure}[b]{1.0\linewidth}
|
||||
\begin{tabu} to \textwidth {@{} *3{X[c]}c*3{X[c]} @{}}
|
||||
\multicolumn{3}{c}{Classification Accuracy}
|
||||
@ -67,8 +68,9 @@ plot coordinates {
|
||||
\end{tabu}
|
||||
\caption{Performance metrics after 20 epochs}
|
||||
\end{subfigure}
|
||||
\caption{Classification accuracy on the test set and ...Performance metrics of the network given in ... trained
|
||||
\caption[Performance comparison of training algorithms]{Classification accuracy on the test set and ...Performance metrics of the network given in ... trained
|
||||
with different optimization algorithms}
|
||||
\label{fig:comp_alg}
|
||||
\end{figure}
|
||||
%%% Local Variables:
|
||||
%%% mode: latex
|
||||
|
@ -28,7 +28,7 @@
|
||||
\end{adjustbox}
|
||||
\caption{True position (\textcolor{red}{red}), filtered position data (black)}
|
||||
\end{subfigure}
|
||||
\caption{Example for noise reduction using convolution with simulated
|
||||
\caption[Signal smoothing using convolution]{Example for noise reduction using convolution with simulated
|
||||
positional data. As filter
|
||||
$g(i)=\left(\nicefrac{1}{3},\nicefrac{1}{4},\nicefrac{1}{5},\nicefrac{1}{6},\nicefrac{1}{20}\right)_{(i-1)}$
|
||||
is chosen and applied to the $x$ and $y$ coordinate
|
||||
|
@ -176,4 +176,29 @@ url={https://openreview.net/forum?id=rkgz2aEKDr}
|
||||
timestamp = {Thu, 25 Jul 2019 14:25:37 +0200},
|
||||
biburl = {https://dblp.org/rec/journals/corr/KingmaB14.bib},
|
||||
bibsource = {dblp computer science bibliography, https://dblp.org}
|
||||
}
|
||||
|
||||
@article{transfer_learning,
|
||||
author = {Zhao,Wei},
|
||||
title = {Research on the deep learning of the small sample data based on transfer learning},
|
||||
journal = {AIP Conference Proceedings},
|
||||
volume = {1864},
|
||||
number = {1},
|
||||
pages = {020018},
|
||||
year = {2017},
|
||||
doi = {10.1063/1.4992835},
|
||||
URL = {https://aip.scitation.org/doi/abs/10.1063/1.4992835},
|
||||
eprint = {https://aip.scitation.org/doi/pdf/10.1063/1.4992835}
|
||||
}
|
||||
|
||||
@article{gan,
|
||||
title = "GAN-based synthetic medical image augmentation for increased CNN performance in liver lesion classification",
|
||||
journal = "Neurocomputing",
|
||||
volume = 321,
|
||||
pages = "321 - 331",
|
||||
year = 2018,
|
||||
issn = "0925-2312",
|
||||
doi = "https://doi.org/10.1016/j.neucom.2018.09.013",
|
||||
url = "http://www.sciencedirect.com/science/article/pii/S0925231218310749",
|
||||
author = "Maayan Frid-Adar and Idit Diamant and Eyal Klang and Michal Amitai and Jacob Goldberger and Hayit Greenspan"
|
||||
}
|
@ -85,7 +85,7 @@ channel (color) $c$ to the respective value $v$
|
||||
\end{scope}
|
||||
\end{tikzpicture}
|
||||
\end{adjustbox}
|
||||
\caption{On the right the red, green and blue chances of the picture
|
||||
\caption[Channel separation of color image]{On the right the red, green and blue channels of the picture
|
||||
are displayed. In order to better visualize the color channels the
|
||||
black and white picture of each channel has been colored in the
|
||||
respective color. Combining the layers results in the image on the
|
||||
@ -177,7 +177,7 @@ wise. Examples of convolution with both kernels are given in Figure~\ref{fig:img
|
||||
% \includegraphics[width=\textwidth]{Plots/Data/image_conv6.png}
|
||||
% \caption{test}
|
||||
% \end{subfigure}
|
||||
\caption{Convolution of original greyscale Image (a) with different
|
||||
\caption[Convolution applied on image]{Convolution of original greyscale Image (a) with different
|
||||
kernels. In (b) and (c) Gaussian kernels of size 11 and stated
|
||||
$\sigma^2$ are used. In (d) - (f) the above defined Sobel Operator
|
||||
kernels are used.}
|
||||
@ -186,7 +186,7 @@ wise. Examples of convolution with both kernels are given in Figure~\ref{fig:img
|
||||
\clearpage
|
||||
\newpage
|
||||
\subsection{Convolutional NN}
|
||||
\todo{Eileitung zu CNN}
|
||||
\todo{Eileitung zu CNN amout of parameters}
|
||||
% Conventional neural network as described in chapter .. are made up of
|
||||
% fully connected layers, meaning each node in a layer is influenced by
|
||||
% all nodes of the previous layer. If one wants to extract information
|
||||
@ -219,11 +219,11 @@ The usage of multiple filters results in multiple outputs of the same
|
||||
size as the input. These are often called channels. Depending on the
|
||||
size of the filters this can result in the dimension of the output
|
||||
being one larger than the input.
|
||||
However for convolutional layers following a convolutional layer the
|
||||
However for convolutional layers that are preceded by convolutional layers the
|
||||
size of the filter is often chosen to coincide with the amount of channels
|
||||
of the output of the previous layer without using padding in this
|
||||
direction in order to prevent gaining additional
|
||||
dimensions\todo{komisch} in the output.
|
||||
dimensions\todo{filter mit ganzer tiefe besser erklären} in the output.
|
||||
This can also be used to flatten certain less interesting channels of
|
||||
the input as for example color channels.
|
||||
Thus filters used in convolutional networks usually have the same
|
||||
@ -264,11 +264,11 @@ reduced in size by extracting a single value from a
|
||||
neighborhood \todo{moving...}... . The resulting output size is dependent on
|
||||
the offset of the neighborhoods used. Popular is max-pooling where the
|
||||
largest value in a neighborhood is used.
|
||||
|
||||
This construct allows for extraction of features from the input while
|
||||
using far less input variables.
|
||||
|
||||
... \todo{Beispiel mit kleinem Bild, am besten das von oben}
|
||||
\todo{kleine grafik}
|
||||
The combination of convolution and pooling layers allows for
|
||||
extraction of features from the input in the form of feature maps while
|
||||
using relatively few parameters that need to be trained.
|
||||
\todo{Beispiel feature maps}
|
||||
|
||||
\subsubsection{Parallels to the Visual Cortex in Mammals}
|
||||
|
||||
@ -447,11 +447,15 @@ algorithm (\textsc{AdaGrad}, \textcite{ADAGRAD})
|
||||
laying the base work. Here for each parameter update the learning rate
|
||||
is given by a constant
|
||||
$\gamma$ is divided by the sum of the squares of the past partial
|
||||
derivatives in this parameter. This results in a monotonously
|
||||
decreasing learning rate for each parameter. This results in a faster
|
||||
decaying learning rate for parameters with large updates, where as
|
||||
derivatives in this parameter. This results in a monotonous decaying
|
||||
learning rate with faster
|
||||
decay for parameters with large updates, whereas
|
||||
parameters with small updates experience smaller decay. The \textsc{AdaGrad}
|
||||
algorithm is given in Algorithm~\ref{alg:ADAGRAD}.
|
||||
algorithm is given in Algorithm~\ref{alg:ADAGRAD}. Note that while
|
||||
this algorithm is still based upon the idea of gradient descent it no
|
||||
longer takes steps in the direction of the gradient while
|
||||
updating. Due to the individual learning rates for each parameter only
|
||||
the direction/sign for single parameters remain the same.
|
||||
|
||||
\begin{algorithm}[H]
|
||||
\SetAlgoLined
|
||||
@ -461,29 +465,64 @@ algorithm is given in Algorithm~\ref{alg:ADAGRAD}.
|
||||
\For{$t \in \left\{1,\dots,T\right\};\, t+1$}{
|
||||
Compute Gradient: $g_t$\;
|
||||
Compute Update: $\Delta x_{t,i} \leftarrow
|
||||
-\frac{\gamma}{\norm{g_{1:t,i}}_2 + \varepsilon} g_t, \forall i =
|
||||
-\frac{\gamma}{\norm{g_{1:t,i}}_2 + \varepsilon} g_{t,i}, \forall i =
|
||||
1, \dots,p$\;
|
||||
Apply Update: $x_{t+1} \leftarrow x_t + \Delta x_t$\;
|
||||
}
|
||||
\caption{\textls{\textsc{AdaGrad}}}
|
||||
\caption{\textsc{AdaGrad}}
|
||||
\label{alg:ADAGRAD}
|
||||
\end{algorithm}
|
||||
|
||||
Building on \textsc{AdaGrad} \textcite{ADADELTA} developed the ... (\textsc{AdaDelta})
|
||||
Building on \textsc{AdaGrad} \textcite{ADADELTA} developed the
|
||||
\textsc{AdaDelta} algorithm
|
||||
in order to improve upon the two main drawbacks of \textsc{AdaGrad}, being the
|
||||
continual decay of the learning rate and the need for a manually
|
||||
selected global learning rate $\gamma$.
|
||||
As \textsc{AdaGrad} uses division by the accumulated squared gradients the learning rate will
|
||||
eventually become infinitely small.
|
||||
In order to ensure that even after a significant number of iterations
|
||||
learning continues to make progress instead of summing the gradients a
|
||||
exponentially decaying average of the past gradients is used to ....
|
||||
learning continues to make progress, instead of summing the squared gradients an
|
||||
exponentially decaying average of the past squared gradients is used for
|
||||
regularizing the learning rate resulting in
|
||||
\begin{align*}
|
||||
E[g^2]_t & = \rho E[g^2]_{t-1} + (1-\rho) g_t^2, \\
|
||||
\Delta x_t & = -\frac{\gamma}{\sqrt{E[g^2]_t + \varepsilon}} g_t,
|
||||
\end{align*}
|
||||
for a decay rate $\rho$.
|
||||
Additionally the fixed global learning rate $\gamma$ is substituted by
|
||||
an exponentially decaying average of the past parameter updates.
|
||||
The usage of the past parameter updates is motivated by ensuring that
|
||||
if the parameter vector had some hypothetical units they would be matched
|
||||
by these of the parameter update $\Delta x_t$. This proper
|
||||
\todo{erklärung unit}
|
||||
hypothetical units of the parameter vector match those of the
|
||||
parameter update $\Delta x_t$. When only using the
|
||||
gradient with a scalar learning rate as in SGD the resulting unit of
|
||||
the parameter update is:
|
||||
\[
|
||||
\text{units of } \Delta x \propto \text{units of } g \propto
|
||||
\frac{\partial f}{\partial x} \propto \frac{1}{\text{units of } x},
|
||||
\]
|
||||
assuming the cost function $f$ is unitless. \textsc{AdaGrad} does not
|
||||
have correct units either, since the update is given by a ratio of gradient
|
||||
quantities resulting in a unitless parameter update. If however
|
||||
Hessian information or an approximation thereof is used to scale the
|
||||
gradients the unit of the updates will be correct:
|
||||
\[
|
||||
\text{units of } \Delta x \propto H^{-1} g \propto
|
||||
\frac{\frac{\partial f}{\partial x}}{\frac{\partial ^2 f}{\partial
|
||||
x^2}} \propto \text{units of } x
|
||||
\]
|
||||
Since using the second derivative results in correct units, Newton's
|
||||
method (assuming diagonal hessian) is rearranged to determine the
|
||||
quantities involved in the inverse of the second derivative:
|
||||
\[
|
||||
\Delta x = \frac{\frac{\partial f}{\partial x}}{\frac{\partial ^2
|
||||
f}{\partial x^2}} \iff \frac{1}{\frac{\partial^2 f}{\partial
|
||||
x^2}} = \frac{\Delta x}{\frac{\partial f}{\partial x}}.
|
||||
\]
|
||||
As the root mean square of the past gradients is already used in the
|
||||
denominator of the learning rate an exponentially decaying root mean
|
||||
square of the past updates is used to obtain a $\Delta x$ quantity for
|
||||
the denominator resulting in the correct unit of the update. The full
|
||||
algorithm is given by Algorithm~\ref{alg:adadelta}.
|
||||
|
||||
\begin{algorithm}[H]
|
||||
\SetAlgoLined
|
||||
@ -501,23 +540,24 @@ by these of the parameter update $\Delta x_t$. This proper
|
||||
Apply Update: $x_{t+1} \leftarrow x_t + \Delta x_t$\;
|
||||
}
|
||||
\caption{\textsc{AdaDelta}, \textcite{ADADELTA}}
|
||||
\label{alg:gd}
|
||||
\label{alg:adadelta}
|
||||
\end{algorithm}
|
||||
|
||||
While the stochastic gradient algorithm is less susceptible to local
|
||||
While the stochastic gradient algorithm is less susceptible to getting
|
||||
stuck in local
|
||||
extrema than gradient descent the problem still persists especially
|
||||
with saddle points. \textcite{DBLP:journals/corr/Dauphinpgcgb14}
|
||||
for saddle points with steep .... \textcite{DBLP:journals/corr/Dauphinpgcgb14}
|
||||
|
||||
An approach to the problem of ``getting stuck'' in saddle point or
|
||||
local minima/maxima is the addition of momentum to SDG. Instead of
|
||||
using the actual gradient for the parameter update an average over the
|
||||
past gradients is used. In order to avoid the need to store the past
|
||||
values usually an exponentially decaying average is used resulting in
|
||||
Algorithm~\ref{alg_momentum}. This is comparable of following the path
|
||||
of a marble with mass rolling down the SLOPE of the error
|
||||
function. The decay rate for the average is comparable to the TRÄGHEIT
|
||||
Algorithm~\ref{alg:sgd_m}. This is comparable of following the path
|
||||
of a marble with mass rolling down the slope of the error
|
||||
function. The decay rate for the average is comparable to the inertia
|
||||
of the marble.
|
||||
This results in the algorithm being able to escape ... due to the
|
||||
This results in the algorithm being able to escape some local extrema due to the
|
||||
build up momentum from approaching it.
|
||||
|
||||
% \begin{itemize}
|
||||
@ -539,14 +579,26 @@ build up momentum from approaching it.
|
||||
Apply Update: $x_{t+1} \leftarrow x_t + \Delta x_t$\;
|
||||
}
|
||||
\caption{SGD with momentum}
|
||||
\label{alg:gd}
|
||||
\label{alg:sgd_m}
|
||||
\end{algorithm}
|
||||
|
||||
In an effort to combine the properties of the momentum method and the
|
||||
automatic adapted learning rate of \textsc{AdaDelta} \textcite{ADAM}
|
||||
developed the \textsc{Adam} algorithm. The
|
||||
|
||||
Problems / Improvements ADAM \textcite{rADAM}
|
||||
developed the \textsc{Adam} algorithm, given in
|
||||
Algorithm~\ref{alg:adam}. Here the exponentially decaying
|
||||
root mean square of the gradients is still used for realizing and
|
||||
combined with the momentum method. Both terms are normalized such that
|
||||
the ... are the first and second moment of the gradient. However the term used in
|
||||
\textsc{AdaDelta} to ensure correct units is dropped for a scalar
|
||||
global learning rate. This results in .. hyperparameters, however the
|
||||
algorithms seems to be exceptionally stable with the recommended
|
||||
parameters of ... and is a very reliable algorithm for training
|
||||
neural networks.
|
||||
However the \textsc{Adam} algorithm can have problems with high
|
||||
variance of the adaptive learning rate early in training.
|
||||
\textcite{rADAM} try to address these issues with the Rectified Adam
|
||||
algorithm
|
||||
\todo{will ich das einbauen?}
|
||||
|
||||
|
||||
\begin{algorithm}[H]
|
||||
@ -556,21 +608,27 @@ Problems / Improvements ADAM \textcite{rADAM}
|
||||
Initialize accumulation variables $m_0 = 0$, $v_0 = 0$\;
|
||||
\For{$t \in \left\{1,\dots,T\right\};\, t+1$}{
|
||||
Compute Gradient: $g_t$\;
|
||||
Accumulate first and second Moment of the Gradient:
|
||||
\begin{align*}
|
||||
m_t &\leftarrow \beta_1 m_{t-1} + (1 - \beta_1) g_t \\
|
||||
v_t &\leftarrow \beta_2 v_{t-1} + (1 - \beta_2) g_t^2\;
|
||||
\end{align*}
|
||||
Compute Update: $\Delta x_t \leftarrow -\frac{\sqrt{E[\Delta
|
||||
x^2]_{t-1} + \varepsilon}}{\sqrt{E[g^2]_t + \varepsilon}} g_t$\;
|
||||
Accumulate Updates: $E[\Delta x^2]_t \leftarrow \rho E[\Delta
|
||||
x^2]_{t-1} + (1-\rho)\Delta x_t^2$\;
|
||||
Accumulate first Moment of the Gradient and correct for bias:
|
||||
$m_t \leftarrow \beta_1 m_{t-1} + (1 - \beta_1) g_t;$\hspace{\linewidth}
|
||||
$\hat{m}_t \leftarrow \frac{m_t}{1-\beta_1^t}$\;
|
||||
Accumulate second Moment of the Gradient and correct for bias:
|
||||
$v_t \leftarrow \beta_2 v_{t-1} + (1 - \beta_2)g_t^2;$\hspace{\linewidth}
|
||||
$\hat{v}_t \leftarrow \frac{v_t}{1-\beta_2^t}$\;
|
||||
Compute Update: $\Delta x_t \leftarrow
|
||||
-\frac{\alpha}{\sqrt{\hat{v}_t + \varepsilon}}
|
||||
\hat{m}_t$\;
|
||||
Apply Update: $x_{t+1} \leftarrow x_t + \Delta x_t$\;
|
||||
}
|
||||
\caption{ADAM, \cite{ADAM}}
|
||||
\label{alg:gd}
|
||||
\label{alg:adam}
|
||||
\end{algorithm}
|
||||
|
||||
In order to get an understanding of the performance of the above
|
||||
discussed training algorithms the neural network given in ... has been
|
||||
trained on the ... and the results are given in
|
||||
Figure~\ref{fig:comp_alg}.
|
||||
Here it can be seen that the ADAM algorithm performs far better than
|
||||
the other algorithms, with \textsc{AdaGrad} and \textsc{AdaDelta} following... bla bla
|
||||
|
||||
|
||||
\input{Plots/sdg_comparison.tex}
|
||||
@ -594,15 +652,27 @@ Problems / Improvements ADAM \textcite{rADAM}
|
||||
% \cite{Dropout}
|
||||
|
||||
Similarly to shallow networks overfitting still can impact the quality of
|
||||
convolutional neural networks. A popular way to combat this problem is
|
||||
by introducing noise into the training of the model. This is a
|
||||
successful strategy for ofter models as well, the a conglomerate of
|
||||
descision trees grown on bootstrapped trainig samples benefit greatly
|
||||
of randomizing the features available to use in each training
|
||||
iteration (Hastie, Bachelorarbeit??).
|
||||
There are two approaches to introduce noise to the model during
|
||||
learning, either by manipulating the model it self or by manipulating
|
||||
the input data.
|
||||
convolutional neural networks.
|
||||
Popular ways to combat this problem for a .. of models is averaging
|
||||
over multiple models trained on subsets (bootstrap) or introducing
|
||||
noise directly during the training (for example random forest, where a
|
||||
conglomerate of decision trees benefits greatly from randomizing the
|
||||
features available to use in each training iteration).
|
||||
We explore implementations of these approaches for neural networks
|
||||
being dropout for simulating a conglomerate of networks and
|
||||
introducing noise during training by slightly altering the input
|
||||
pictures.
|
||||
% A popular way to combat this problem is
|
||||
% by introducing noise into the training of the model.
|
||||
% This can be done in a variety
|
||||
% This is a
|
||||
% successful strategy for ofter models as well, the a conglomerate of
|
||||
% descision trees grown on bootstrapped trainig samples benefit greatly
|
||||
% of randomizing the features available to use in each training
|
||||
% iteration (Hastie, Bachelorarbeit??).
|
||||
% There are two approaches to introduce noise to the model during
|
||||
% learning, either by manipulating the model it self or by manipulating
|
||||
% the input data.
|
||||
\subsubsection{Dropout}
|
||||
If a neural network has enough hidden nodes there will be sets of
|
||||
weights that accurately fit the training set (proof for a small
|
||||
@ -690,21 +760,35 @@ mirroring.
|
||||
\includegraphics[width=\textwidth]{Plots/Data/mnist_gen_shift.pdf}
|
||||
\caption{random\\positional shift}
|
||||
\end{subfigure}
|
||||
\caption{Example for the manipuations used in ... As all images are
|
||||
\caption[Image data generation]{Example of the manipulations used in ... As all images are
|
||||
of the same intensity brightness manipulation does not seem
|
||||
... Additionally mirroring is not used for ... reasons.}
|
||||
\end{figure}
|
||||
|
||||
In order to compare the benefits obtained from implementing these
|
||||
measures we have trained the network given in ... on the same problem
|
||||
and implemented different combinations of the measures. The results
|
||||
are given in Figure~\ref{fig:gen_dropout}. Here it can be seen that ...
|
||||
and implemented different combinations of data generation and dropout. The results
|
||||
are given in Figure~\ref{fig:gen_dropout}. For each scenario the
|
||||
model was trained five times and the performance measures were
|
||||
averaged. It can be seen that implementing the measures does indeed
|
||||
increase the performance of the model. Implementing data generation on
|
||||
its own seems to have a larger impact than dropout and applying both
|
||||
increases the accuracy even further.
|
||||
|
||||
The better performance stems most likely from reduced overfitting. The
|
||||
reduction in overfitting can be seen in
|
||||
\ref{fig:gen_dropout}~(\subref{fig:gen_dropout_b}) as the training
|
||||
accuracy decreases with test accuracy increasing. However utilizing
|
||||
data generation as well as dropout with a probability of 0.4 seems to
|
||||
be a too aggressive approach as the training accuracy drops below the
|
||||
test accuracy.
|
||||
|
||||
\input{Plots/gen_dropout.tex}
|
||||
|
||||
\todo{Vergleich verschiedene dropout größen auf MNSIT o.ä., subset als
|
||||
training set?}
|
||||
|
||||
\clearpage
|
||||
\subsubsection{\titlecap{effectiveness for small training sets}}
|
||||
|
||||
For some applications (medical problems with small amount of patients)
|
||||
@ -726,13 +810,141 @@ full dataset: ... per class\\
|
||||
100 per class
|
||||
10 per class
|
||||
|
||||
the results for training .. are given in ... Here can be seen...
|
||||
the results for training .. are given in ... Here can be seen... that
|
||||
for small training sets data generation has a large impact on the accuracy.
|
||||
|
||||
\begin{table}
|
||||
\centering
|
||||
\begin{tabu} to \textwidth {@{}l*4{X[c]}@{}}
|
||||
\Tstrut \Bstrut & \textsc{Adam} & D. 0.2 & Gen & Gen.+D. 0.2 \\
|
||||
\hline
|
||||
&
|
||||
\multicolumn{4}{c}{\titlecap{test accuracy for 1 sample}}\Bstrut \\
|
||||
\cline{2-5}
|
||||
max \Tstrut & 0.5633 & 0.5312 & 0.6704 & 0.6604 \\
|
||||
min & 0.3230 & 0.4224 & 0.4878 & 0.5175 \\
|
||||
mean & 0.4570 & 0.4714 & 0.5862 & 0.6014 \\
|
||||
var & 0.0040 & 0.0012 & 0.0036 & 0.0023 \\
|
||||
\hline
|
||||
&
|
||||
\multicolumn{4}{c}{\titlecap{test accuracy for 10 samples}}\Bstrut \\
|
||||
\cline{2-5}
|
||||
max \Tstrut & 0.8585 & 0.9423 & 0.9310 & 0.9441 \\
|
||||
min & 0.8148 & 0.9081 & 0.9018 & 0.9061 \\
|
||||
mean & 0.8377 & 0.9270 & 0.9185 & 0.9232 \\
|
||||
var & 2.7e-4 & 1.3e-4 & 6e-05 & 1.5e-4 \\
|
||||
\hline
|
||||
&
|
||||
\multicolumn{4}{c}{\titlecap{test accuracy for 100 samples}}\Bstrut \\
|
||||
\cline{2-5}
|
||||
max & 0.9637 & 0.9796 & 0.9810 & 0.9805 \\
|
||||
min & 0.9506 & 0.9719 & 0.9702 & 0.9727 \\
|
||||
mean & 0.9582 & 0.9770 & 0.9769 & 0.9783 \\
|
||||
var & 2e-05 & 1e-05 & 1e-05 & 0 \\
|
||||
\hline
|
||||
\end{tabu}
|
||||
\caption{Values of the test accuracy of the model trained 10 times
|
||||
of random training sets containing 1, 10 and 100 data points per
|
||||
class.}
|
||||
\end{table}
|
||||
|
||||
\begin{figure}[h]
|
||||
\centering
|
||||
\missingfigure{datagen digits}
|
||||
\caption{Sample pictures of the MNIST fashion dataset, one per
|
||||
class.}
|
||||
|
||||
\begin{subfigure}[h]{\textwidth}
|
||||
\begin{tikzpicture}
|
||||
\begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed,
|
||||
/pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth,
|
||||
height = 0.35\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east},
|
||||
ylabel = {Test Accuracy}, cycle
|
||||
list/Dark2, every axis plot/.append style={line width
|
||||
=1.25pt}]
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Plots/Data/adam_1.mean};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Plots/Data/adam_dropout_02_1.mean};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Plots/Data/adam_datagen_1.mean};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Plots/Data/adam_datagen_dropout_02_1.mean};
|
||||
|
||||
|
||||
\addlegendentry{\footnotesize{Default}}
|
||||
\addlegendentry{\footnotesize{D. 0.2}}
|
||||
\addlegendentry{\footnotesize{G.}}
|
||||
\addlegendentry{\footnotesize{G. + D. 0.2}}
|
||||
\addlegendentry{\footnotesize{D. 0.4}}
|
||||
\addlegendentry{\footnotesize{Default}}
|
||||
\end{axis}
|
||||
\end{tikzpicture}
|
||||
\caption{1 sample per class}
|
||||
\vspace{0.25cm}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}[h]{\textwidth}
|
||||
\begin{tikzpicture}
|
||||
\begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed,
|
||||
/pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth,
|
||||
height = 0.35\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east},
|
||||
ylabel = {Test Accuracy}, cycle
|
||||
list/Dark2, every axis plot/.append style={line width
|
||||
=1.25pt}]
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Plots/Data/adam_dropout_00_10.mean};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Plots/Data/adam_dropout_02_10.mean};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Plots/Data/adam_datagen_dropout_00_10.mean};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Plots/Data/adam_datagen_dropout_02_10.mean};
|
||||
|
||||
|
||||
\addlegendentry{\footnotesize{Default.}}
|
||||
\addlegendentry{\footnotesize{D. 0.2}}
|
||||
\addlegendentry{\footnotesize{G.}}
|
||||
\addlegendentry{\footnotesize{G + D. 0.2}}
|
||||
\end{axis}
|
||||
\end{tikzpicture}
|
||||
\caption{10 samples per class}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}[h]{\textwidth}
|
||||
\begin{tikzpicture}
|
||||
\begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed,
|
||||
/pgf/number format/precision=3},tick style = {draw = none}, width = 0.9875\textwidth,
|
||||
height = 0.35\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east},
|
||||
xlabel = {epoch}, ylabel = {Test Accuracy}, cycle
|
||||
list/Dark2, every axis plot/.append style={line width
|
||||
=1.25pt}, ymin = {0.92}]
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Plots/Data/adam_dropout_00_100.mean};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Plots/Data/adam_dropout_02_100.mean};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Plots/Data/adam_datagen_dropout_00_100.mean};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Plots/Data/adam_datagen_dropout_02_100.mean};
|
||||
|
||||
\addlegendentry{\footnotesize{Default.}}
|
||||
\addlegendentry{\footnotesize{D. 0.2}}
|
||||
\addlegendentry{\footnotesize{G.}}
|
||||
\addlegendentry{\footnotesize{G + D. 0.2}}
|
||||
\end{axis}
|
||||
\end{tikzpicture}
|
||||
\caption{100 samples per class}
|
||||
\vspace{.25cm}
|
||||
\end{subfigure}
|
||||
\caption{}
|
||||
\label{mnist fashion}
|
||||
\end{figure}
|
||||
|
||||
@ -752,6 +964,8 @@ the results for training .. are given in ... Here can be seen...
|
||||
\item Transfer learning, use network trained on different task and
|
||||
repurpose it / train it with the training data
|
||||
\end{itemize}
|
||||
\textcite{transfer_learning}
|
||||
\textcite{gan}
|
||||
|
||||
%%% Local Variables:
|
||||
%%% mode: latex
|
||||
|
@ -2,13 +2,18 @@
|
||||
\section{Introduction to Neural Networks}
|
||||
|
||||
Neural Networks (NN) are a mathematical construct inspired by the
|
||||
connection of neurons in nature. It consists of an input and output
|
||||
layer with an arbitrary amount of hidden layers between them. Each
|
||||
layer consits of a numer of neurons (nodes) with the number of nodes
|
||||
in the in-/output layers corresponding to the dimensions of the
|
||||
in-/output.\par
|
||||
Each neuron recieves the output of all layers in the previous layers,
|
||||
except for the input layer, which recieves the components of the input.
|
||||
... of brains in mammals. It consists of an array of neurons that
|
||||
receive inputs and compute an accumulated output. These neurons are
|
||||
arranged in layers, with one input and output layer and an arbitrary
|
||||
amount of hidden layers between them.
|
||||
The amount of neurons in the in- and output layers correspond to the
|
||||
desired dimensions of in- and outputs of the model.
|
||||
In conventional neural networks the information is passed ... from the
|
||||
input layer towards the output layer hence they are often called feed
|
||||
forward networks. Each neuron in a layer has the outputs of all
|
||||
neurons in the preceding layer as input (fully connected). An
|
||||
illustration of an example neural network is given in
|
||||
Figure~\ref{fig:nn} and one of a neuron in Figure~\ref{fig:neuron}
|
||||
|
||||
\tikzset{%
|
||||
every neuron/.style={
|
||||
@ -79,10 +84,11 @@ except for the input layer, which recieves the components of the input.
|
||||
\node[fill=white,scale=1.5,inner xsep=10pt,inner ysep=10mm] at ($(hidden1-1)!.5!(hidden2-2)$) {$\dots$};
|
||||
|
||||
\end{tikzpicture}}%}
|
||||
\caption{Illustration of a neural network with $d_i$ inputs, $l$
|
||||
\caption[Illustration of a neural network]{Illustration of a neural network with $d_i$ inputs, $l$
|
||||
hidden layers with $n_{\cdot}$ nodes in each layer, as well as
|
||||
$d_o$ outputs.
|
||||
}
|
||||
\label{fig:nn}
|
||||
\end{figure}
|
||||
|
||||
\subsection{Nonlinearity of Neural Networks}
|
||||
@ -91,35 +97,39 @@ The arguably most important feature of neural networks that sets them
|
||||
apart from linear models is the activation function implemented in the
|
||||
neurons. As seen in Figure~\ref{fig:neuron} on the weighted sum of the
|
||||
inputs a activation function $\sigma$ is applied in order to obtain
|
||||
the output resulting in the output being given by
|
||||
the output resulting in the output of the $k$-th. neuron in a layer
|
||||
being given by
|
||||
\[
|
||||
o_k = \sigma\left(b_k + \sum_{j=1}^m w_{k,j} i_j\right).
|
||||
o_k = \sigma\left(b_k + \sum_{j=1}^m w_{k,j} i_j\right)
|
||||
\]
|
||||
for weights $w_{k,j}$ and biases $b_k$.
|
||||
The activation function is usually chosen nonlinear (a linear one
|
||||
would result in the entire model collapsing into a linear one) which
|
||||
allows it to better model data (beispiel satz ...).
|
||||
would result in the entire model collapsing into a linear one\todo{beweis?}) which
|
||||
allows it to better model data where the relation of in- and output is
|
||||
of nonlinear nature.
|
||||
There are two types of activation functions, saturating and not
|
||||
saturating ones. Popular examples for the former are sigmoid
|
||||
functions where most commonly the standard logisitc function or tanh are used
|
||||
as they have easy to compute derivatives which is ... for gradient
|
||||
functions where most commonly the standard logistic function or tangens
|
||||
hyperbolicus are used
|
||||
as they have easy to compute derivatives which is desirable for gradient
|
||||
based optimization algorithms. The standard logistic function (often
|
||||
referred to simply as sigmoid function) is given by
|
||||
\[
|
||||
f(x) = \frac{1}{1+e^{-x}}
|
||||
f(x) = \frac{1}{1+e^{-x}}
|
||||
\]
|
||||
and has a range of $[0,1]$. Its usage as an activation function is
|
||||
motivated by modeling neurons which
|
||||
are close to deactive until a certain threshold where they grow in
|
||||
intensity until they are fully
|
||||
active, which is similar to the behavior of neurons in brains
|
||||
\todo{besser schreiben}. The tanh function is given by
|
||||
active, which is similar to the behavior of neurons in
|
||||
brains\todo{besser schreiben}. The tangens hyperbolicus is given by
|
||||
\[
|
||||
tanh(x) = \frac{2}{e^{2x}+1}
|
||||
\tanh(x) = \frac{e^{2x}-1}{e^{2x}+1}
|
||||
\]
|
||||
|
||||
The downside of these saturating activation functions is that given
|
||||
their ... their derivatives are close to zero for large or small
|
||||
input values which can ... the ... of gradient based methods.
|
||||
their saturating nature their derivatives are close to zero for large or small
|
||||
input values which can slow or hinder the progress of gradient based methods.
|
||||
|
||||
The nonsaturating activation functions commonly used are the recified
|
||||
linear unit (ReLU) or the leaky ReLU. The ReLU is given by
|
||||
@ -127,11 +137,12 @@ linear using (ReLU) or the leaky RelU. The ReLU is given by
|
||||
r(x) = \max\left\{0, x\right\}.
|
||||
\]
|
||||
This has the benefit of having a constant derivative for values larger
|
||||
than zero. However the derivative being zero ... . The leaky ReLU is
|
||||
than zero. However the derivative being zero has the same downside for
|
||||
fitting the model with gradient based methods. The leaky ReLU is
|
||||
an attempt to counteract this problem by assigning a small constant
|
||||
derivative to all values smaller than zero and for scalar $\alpha$ is given by
|
||||
\[
|
||||
l(x) = \max\left\{0, x\right\} + \alpha.
|
||||
l(x) = \max\left\{0, x\right\} + \alpha \min \left\{0, x\right\}.
|
||||
\]
|
||||
In order to illustrate these functions plots of them are given in Figure~\ref{fig:activation}.
|
||||
|
||||
@ -144,6 +155,7 @@ In order to illustrate these functions plots of them are given in Figure~\ref{fi
|
||||
\addplot [domain=-5:5, samples=101,unbounded coords=jump]{1/(1+exp(-x)};
|
||||
\end{axis}
|
||||
\end{tikzpicture}
|
||||
\caption{\titlecap{standard logistic function}}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}{.45\linewidth}
|
||||
\centering
|
||||
@ -152,6 +164,7 @@ In order to illustrate these functions plots of them are given in Figure~\ref{fi
|
||||
\addplot[domain=-5:5, samples=100]{tanh(x)};
|
||||
\end{axis}
|
||||
\end{tikzpicture}
|
||||
\caption{\titlecap{tangens hyperbolicus}}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}{.45\linewidth}
|
||||
\centering
|
||||
@ -161,6 +174,7 @@ In order to illustrate these functions plots of them are given in Figure~\ref{fi
|
||||
\addplot[domain=-5:5, samples=100]{max(0,x)};
|
||||
\end{axis}
|
||||
\end{tikzpicture}
|
||||
\caption{ReLU}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}{.45\linewidth}
|
||||
\centering
|
||||
@ -170,8 +184,9 @@ In order to illustrate these functions plots of them are given in Figure~\ref{fi
|
||||
\addplot[domain=-5:5, samples=100]{max(0,x)+ 0.1*min(0,x)};
|
||||
\end{axis}
|
||||
\end{tikzpicture}
|
||||
\caption{Leaky ReLU, $\alpha = 0.1$}
|
||||
\end{subfigure}
|
||||
\caption{Plots of the activation fucntoins...}
|
||||
\caption{Plots of the activation functions}
|
||||
\label{fig:activation}
|
||||
\end{figure}
|
||||
|
||||
@ -266,24 +281,28 @@ In order to illustrate these functions plots of them are given in Figure~\ref{fi
|
||||
\clearpage
|
||||
\subsection{Training Neural Networks}
|
||||
|
||||
After a neural network model is designed, like most statistical models
|
||||
it has to be fit to the data. In the machine learning context this is
|
||||
often called ``training'' as due to the complexity and amount of
|
||||
variables in these models they are fitted iteratively to the data,
|
||||
``learning'' the properties of the data better with each iteration.
|
||||
As neural networks are a parametric model we need to fit them to input
|
||||
data in order to get meaningful output from the network. In order to
|
||||
do this we first need to discuss how we interpret the output of the
|
||||
neural network.
|
||||
|
||||
There are two main categories of machine learning models, being
|
||||
supervised and unsupervised learners. Unsupervised learners learn
|
||||
structure in the data without guidance form outside (as labeling data
|
||||
beforehand for training) popular examples of this are clustering
|
||||
algorithms\todo{quelle}. Supervised learners on the other hand are as
|
||||
the name suggest supervised during learning. This generally amounts to
|
||||
using data with the expected response (label) attached to each
|
||||
data-point in fitting the model, where usually some distance between
|
||||
the model output and the labels is minimized.
|
||||
% After a neural network model is designed, like most statistical models
|
||||
% it has to be fit to the data. In the machine learning context this is
|
||||
% often called ``training'' as due to the complexity and amount of
|
||||
% variables in these models they are fitted iteratively to the data,
|
||||
% ``learing'' the properties of the data better with each iteration.
|
||||
|
||||
\subsubsection{Interpreting the Output / Classification vs Regression
|
||||
/ Nonliniarity in last layer}
|
||||
% There are two main categories of machine learning models, being
|
||||
% supervised and unsupervised learners. Unsupervised learners learn
|
||||
% structure in the data without guidance form outside (as labeling data
|
||||
% beforehand for training) popular examples of this are clustering
|
||||
% algorithms\todo{quelle}. Supervised learners on the other hand are as
|
||||
% the name suggest supervised during learning. This generally amounts to
|
||||
% using data with the expected response (label) attached to each
|
||||
% data-point in fitting the model, where usually some distance between
|
||||
% the model output and the labels is minimized.
|
||||
|
||||
\subsubsection{\titlecap{nonliniarity in last layer}}
|
||||
|
||||
Given the nature of the neural net the output of the last layer are
|
||||
real numbers. For regression tasks this is desirable, for
|
||||
@ -316,6 +335,13 @@ and the individual values sum to one, thus the output can be interpreted as
|
||||
a probability for each class given the input.
|
||||
In addition to being differentiable this allows for evaluating the
|
||||
certainty of a prediction, rather than just whether it is accurate.
|
||||
A similar effect is obtained when for a binary or two class problem the
|
||||
sigmoid function
|
||||
\[
|
||||
f(x) = \frac{1}{1 + e^{-x}}
|
||||
\]
|
||||
is used and the output $f(x)$ is interpreted as the probability for
|
||||
the first class and $1-f(x)$ for the second class.
|
||||
|
||||
\todo{vielleicht additiv invarianz}
|
||||
% Another property that makes softmax attractive is the invariance to addition
|
||||
@ -372,7 +398,7 @@ common in time series models. \todo{komisch}
|
||||
|
||||
As discussed above the output of a neural network for a classification
|
||||
problem can be interpreted as a probability distribution over the classes
|
||||
conditioned on the input. In this case it is \todo{can?} desirable to
|
||||
conditioned on the input. In this case it is desirable to
|
||||
use error functions designed to compare probability distributions. A
|
||||
widespread error function for this use case is the cross entropy (\textcite{PRML}),
|
||||
which for two discrete distributions $p, q$ with the same support $C$ is given by
|
||||
@ -392,15 +418,17 @@ $f$ we get the loss function
|
||||
|
||||
\subsubsection{Gradient Descent Algorithm}
|
||||
|
||||
When trying to fit a neural network it is hard
|
||||
to predict the impact of the single parameters on the accuracy of the
|
||||
output. Thus applying numeric optimization algorithms is the only
|
||||
Trying to find the optimal parameter for fitting the model to the data
|
||||
can be a hard problem. Given the complex nature of a neural network
|
||||
with many layers and neurons it is hard to predict the impact of
|
||||
single parameters on the accuracy of the output.
|
||||
Thus applying numeric optimization algorithms is the only
|
||||
feasible way to fit the model. An attractive algorithm for training
|
||||
neural networks is gradient descent where each parameter $\theta_i$ is
|
||||
iteratively changed according to the gradient of the error
|
||||
measure and a step size $\gamma$. For this all parameters are
|
||||
initialized (often random or close to zero) and then iteratively
|
||||
updated until a certain criteria is hit, mostly either being a fixed
|
||||
updated until a certain stopping criterion is hit, mostly either being a fixed
|
||||
number of iterations or a desired upper limit for the error measure.
|
||||
% For a function $f_\theta$ with parameters $\theta \in \mathbb{R}^n$
|
||||
% and a error function $L(f_\theta)$ the gradient descent algorithm is
|
||||
@ -450,6 +478,7 @@ introduced by \textcite{backprop}.
|
||||
\[
|
||||
\frac{\partial L(...)}{}
|
||||
\]
|
||||
\todo{Backprop richtig aufschreiben}
|
||||
|
||||
%%% Local Variables:
|
||||
%%% mode: latex
|
||||
|
@ -34,7 +34,7 @@
|
||||
\usepackage{todonotes}
|
||||
\usepackage{lipsum}
|
||||
\usepackage[ruled,vlined]{algorithm2e}
|
||||
%\usepackage{showframe}
|
||||
\usepackage{showframe}
|
||||
\usepackage[protrusion=true, expansion=true, kerning=true, letterspace
|
||||
= 150]{microtype}
|
||||
\usepackage{titlecaps}
|
||||
@ -113,6 +113,8 @@
|
||||
\newpage
|
||||
%\setcounter{tocdepth}{4}
|
||||
\tableofcontents
|
||||
\clearpage
|
||||
\listoffigures
|
||||
\listoftodos
|
||||
\newpage
|
||||
\pagenumbering{arabic}
|
||||
|
523
TeX/theo_3_8.tex
523
TeX/theo_3_8.tex
@ -6,14 +6,15 @@
|
||||
%%% End:
|
||||
\section{Shallow Neural Networks}
|
||||
|
||||
In order to get a some understanding of the behavior of neural
|
||||
networks we study a simplified class of networks called shallow neural
|
||||
networks in this chapter. We consider shallow neural networks consist of a single
|
||||
hidden layer and
|
||||
In order to examine some behavior of neural networks in this chapter
|
||||
we consider a simple class of networks, the shallow ones. These
|
||||
networks only contain one hidden layer and have a single output node.
|
||||
|
||||
% In order to get a some understanding of the behavior of neural
|
||||
% networks we study a simplified class of networks called shallow neural
|
||||
% networks in this chapter.
|
||||
% We consider shallow neural networks consist of a single
|
||||
% hidden layer and
|
||||
In order to get some understanding of the behavior of neural networks
|
||||
we examine a simple class of networks in this chapter. We consider
|
||||
networks that contain only one hidden layer and have a single output
|
||||
node. We call these networks shallow neural networks.
|
||||
\begin{Definition}[Shallow neural network]
|
||||
For a input dimension $d$ and a Lipschitz continuous activation function $\sigma:
|
||||
\mathbb{R} \to \mathbb{R}$ we define a shallow neural network with
|
||||
@ -84,15 +85,16 @@ with
|
||||
% \end{figure}
|
||||
|
||||
As neural networks with a large amount of nodes have a large amount of
|
||||
parameters that can be tuned it can often fit the data quite well. If a ReLU
|
||||
parameters that can be tuned it can often fit the data quite well. If
|
||||
a ReLU activation function
|
||||
\[
|
||||
\sigma(x) \coloneqq \max{(0, x)}
|
||||
\]
|
||||
is chosen as activation function one can easily prove that if the
|
||||
is chosen one can easily prove that if the
|
||||
amount of hidden nodes exceeds the
|
||||
amount of data points in the training data a shallow network trained
|
||||
on MSE will perfectly fit the data.
|
||||
\begin{Theorem}[sinnvoller titel]
|
||||
\begin{Theorem}[Shallow neural network can fit data perfectly]
|
||||
For training data of size t
|
||||
\[
|
||||
\left(x_i^{\text{train}}, y_i^{\text{train}}\right) \in \mathbb{R}^d
|
||||
@ -150,17 +152,18 @@ on MSE will perfectly fit the data.
|
||||
\label{theo:overfit}
|
||||
\end{Theorem}
|
||||
|
||||
However this behavior is often not desired as over fit models often
|
||||
However this behavior is often not desired as over fit models generally
|
||||
have bad generalization properties especially if noise is present in
|
||||
the data. This effect can be seen in
|
||||
Figure~\ref{fig:overfit}. Here a network that perfectly fits the
|
||||
training data regarding the MSE is \todo{Formulierung}
|
||||
constructed and compared to a regression spline
|
||||
(Definition~\ref{def:wrs}). While the network
|
||||
fits the data better than the spline, the spline is much closer to the
|
||||
underlying mechanism that was used to generate the data. The better
|
||||
the data. This effect is illustrated in
|
||||
Figure~\ref{fig:overfit}. Here a shallow neural network that perfectly fits the
|
||||
training data regarding MSE is \todo{Formulierung}
|
||||
constructed according to the proof of Theorem~\ref{theo:overfit} and
|
||||
compared to a regression spline
|
||||
(Definition~\ref{def:wrs}). While the neural network
|
||||
fits the data better than the spline, the spline represents the
|
||||
underlying mechanism that was used to generate the data more accurately. The better
|
||||
generalization of the spline compared to the network is further
|
||||
illustrated by the better validation error computed with new generated
|
||||
demonstrated by the better validation error computed on newly generated
|
||||
test data.
|
||||
In order to improve the accuracy of the model we want to reduce
|
||||
overfitting. A possible way to achieve this is by explicitly
|
||||
@ -168,7 +171,7 @@ regularizing the network through the cost function as done with
|
||||
ridge penalized networks
|
||||
(Definition~\ref{def:rpnn}) where large weights $w$ are punished. In
|
||||
Theorem~\ref{theo:main1} we will
|
||||
prove that this will result in the network converging to
|
||||
prove that this will result in the shallow neural network converging to
|
||||
regressions splines as the amount of nodes in the hidden layer is
|
||||
increased.
|
||||
|
||||
@ -205,7 +208,7 @@ plot coordinates {
|
||||
\addlegendentry{\footnotesize{spline}};
|
||||
\end{axis}
|
||||
\end{tikzpicture}
|
||||
\caption{For data of the form $y=\sin(\frac{x+\pi}{2 \pi}) +
|
||||
\caption[Overfitting of shallow neural networks]{For data of the form $y=\sin(\frac{x+\pi}{2 \pi}) +
|
||||
\varepsilon,~ \varepsilon \sim \mathcal{N}(0,0.4)$
|
||||
(\textcolor{blue}{blue dots}) the neural network constructed
|
||||
according to the proof of Theorem~\ref{theo:overfit} (black) and the
|
||||
@ -224,14 +227,24 @@ plot coordinates {
|
||||
Networks}
|
||||
|
||||
|
||||
This section is based on \textcite{heiss2019}. We will analyze the connection of randomized shallow
|
||||
Neural Networks with one dimensional input and regression splines. We
|
||||
will see that the punishment of the size of the weights in training
|
||||
This section is based on \textcite{heiss2019}. We will analyze the
|
||||
connection between randomized shallow
|
||||
Neural Networks with one dimensional input with a ReLU as activation
|
||||
function for all neurons and regression splines.
|
||||
% \[
|
||||
% \sigma(x) = \max\left\{0,x\right\}.
|
||||
% \]
|
||||
We will see that the punishment of the size of the weights in training
|
||||
the randomized shallow
|
||||
Neural Network will result in a function that minimizes the second
|
||||
Neural Network will result in a learned function that minimizes the second
|
||||
derivative as the amount of hidden nodes is grown to infinity. In order
|
||||
to properly formulate this relation we will first need to introduce
|
||||
some definitions.
|
||||
some definitions, all neural networks introduced in the following will
|
||||
use a ReLU as activation at all neurons.
|
||||
|
||||
A randomized shallow network is characterized by only the weight
|
||||
parameter of the output layer being trainable, whereas the other
|
||||
parameters are random numbers.
|
||||
|
||||
\begin{Definition}[Randomized shallow neural network]
|
||||
For an input dimension $d$, let $n \in \mathbb{N}$ be the number of
|
||||
@ -244,11 +257,20 @@ some definitions.
|
||||
\]
|
||||
\label{def:rsnn}
|
||||
\end{Definition}
|
||||
We call a one dimensional randomized shallow neural network were the
|
||||
$L^2$ norm of the trainable weights $w$ are penalized in the loss
|
||||
function ridge penalized neural networks.
|
||||
|
||||
% We call a randomized shallow neural network where the size of the trainable
|
||||
% weights is punished in the error function a ridge penalized
|
||||
% neural network. For a tuning parameter $\tilde{\lambda}$ .. the extent
|
||||
% of penalization we get:
|
||||
\begin{Definition}[Ridge penalized Neural Network]
|
||||
\label{def:rpnn}
|
||||
Let $\mathcal{RN}_{w, \omega}$ be a randomized shallow neural
|
||||
network, as introduced in ???. Then the optimal ridge penalized
|
||||
network, as introduced in Definition~\ref{def:rsnn} and tuning
|
||||
parameter $\tilde{\lambda} \in \mathbb{R}$. Then the optimal ridge
|
||||
penalized
|
||||
network is given by
|
||||
\[
|
||||
\mathcal{RN}^{*, \tilde{\lambda}}_{\omega}(x) \coloneqq
|
||||
@ -263,9 +285,8 @@ some definitions.
|
||||
\tilde{\lambda} \norm{w}_2^2\right\}}_{\eqqcolon F_n^{\tilde{\lambda}}(\mathcal{RN}_{w,\omega})}.
|
||||
\]
|
||||
\end{Definition}
|
||||
In the ridge penalized Neural Network large weights are penalized, the
|
||||
extend of which can be tuned with the parameter $\tilde{\lambda}$. If
|
||||
$n$ is larger than the amount of training samples $N$ then for
|
||||
If the amount of hidden nodes $n$ is larger than the amount of
|
||||
training samples $N$ then for
|
||||
$\tilde{\lambda} \to 0$ the network will interpolate the data while
|
||||
having minimal weights, resulting in the \textit{minimum norm
|
||||
network} $\mathcal{RN}_{w^{\text{min}}, \omega}$.
|
||||
@ -280,15 +301,109 @@ having minimal weights, resulting in the \textit{minimum norm
|
||||
\left\{1,\dots,N\right\}.
|
||||
\]
|
||||
For $\tilde{\lambda} \to \infty$ the learned
|
||||
function will resemble the data less and less with the weights
|
||||
approaching $0$. .\par
|
||||
In order to make the notation more convinient in the follwoing the
|
||||
function will resemble the data less and less, and with the weights
|
||||
approaching $0$ will converge to the constant $0$ function.
|
||||
|
||||
In order to make the notation more convenient in the following the
|
||||
$\omega$ used to express the realised random parameters will no longer
|
||||
be explizitly mentioned.
|
||||
be explicitly mentioned.
|
||||
|
||||
We call a function that minimizes the cubic distance between training points
|
||||
and the function with respect\todo{richtiges wort} to the second
|
||||
derivative of the function a regression spline.
|
||||
|
||||
\begin{Definition}[Regression Spline]
|
||||
Let $x_i^{\text{train}}, y_i^{\text{train}} \in \mathbb{R}, i \in
|
||||
\left\{1,\dots,N\right\}$ be training data. For a given $\lambda \in
|
||||
\mathbb{R}$ the regression spline is given by
|
||||
\[
|
||||
f^{*,\lambda} :\in \argmin_{f \in
|
||||
\mathcal{C}^2}\left\{\sum_{i=1}^N
|
||||
\left(f\left(x_i^{\text{train}}\right) -
|
||||
y_i^{\text{train}}\right)^2 + \lambda \int f^{''}(x)^2dx\right\}.
|
||||
\]
|
||||
\end{Definition}
|
||||
|
||||
We will show that for specific hyper parameters the ridge penalized
|
||||
shallow neural networks converge to a slightly modified variant of the
|
||||
regression spline. We will need to incorporate the densities of the
|
||||
random parameters in the loss function of the spline to ensure
|
||||
convergence. Thus we define
|
||||
the adapted weighted regression spline where the loss for the second
|
||||
derivative is weighted by a function $g$ and the support of the second
|
||||
derivative of $f$ has to be a subset of the support of $g$. The formal
|
||||
definition is given in Definition~\ref{def:wrs}.
|
||||
|
||||
% We will later ... the converging .. of the ridge penalized shallow
|
||||
% neural network, in order to do so we will need a slightly modified
|
||||
% version of the regression
|
||||
% spline that allows for weighting the penalty term for the second
|
||||
% derivative with a weight function $g$. This is needed to ...the
|
||||
% distributions of the random parameters ... We call this the adapted
|
||||
% weighted regression spline.
|
||||
|
||||
% Now we take a look at weighted regression splines. Later we will prove
|
||||
% that the ridge penalized neural network as defined in
|
||||
% Definition~\ref{def:rpnn} converges a weighted regression spline, as
|
||||
% the amount of hidden nodes is grown to inifity.
|
||||
|
||||
\begin{Definition}[Adapted Weighted regression spline]
|
||||
\label{def:wrs}
|
||||
Let $x_i^{\text{train}}, y_i^{\text{train}} \in \mathbb{R}, i \in
|
||||
\left\{1,\dots,N\right\}$ be training data. For a given $\lambda \in \mathbb{R}_{>0}$
|
||||
and a function $g: \mathbb{R} \to \mathbb{R}_{>0}$ the weighted
|
||||
regression spline $f^{*, \lambda}_g$ is given by
|
||||
|
||||
\[
|
||||
f^{*, \lambda}_g :\in \argmin_{\substack{f \in \mathcal{C}^2(\mathbb{R})
|
||||
\\ \supp(f'') \subseteq \supp(g)}} \underbrace{\left\{ \overbrace{\sum_{i =
|
||||
1}^N \left(f(x_i^{\text{train}}) - y_i^{\text{train}}\right)^2}^{L(f)} +
|
||||
\lambda g(0) \int_{\supp(g)}\frac{\left(f''(x)\right)^2}{g(x)}
|
||||
dx\right\}}_{\eqqcolon F^{\lambda, g}(f)}.
|
||||
\]
|
||||
\todo{Anforderung an Ableitung von f, doch nicht?}
|
||||
\end{Definition}
|
||||
|
||||
Similarly to ridge weight penalized neural networks the parameter
|
||||
$\lambda$ controls a trade-off between accuracy on the training data
|
||||
and smoothness or low second derivative. For $g \equiv 1$ and $\lambda \to 0$ the
|
||||
resulting function $f^{*, 0+}$ will interpolate the training data while minimizing
|
||||
the second derivative. Such a function is known as cubic spline
|
||||
interpolation.
|
||||
\todo{cite cubic spline}
|
||||
|
||||
\[
|
||||
f^{*, 0+} \text{ smooth spline interpolation: }
|
||||
\]
|
||||
\[
|
||||
f^{*, 0+} \coloneqq \lim_{\lambda \to 0+} f^{*, \lambda}_1 \in
|
||||
\argmin_{\substack{f \in \mathcal{C}^2(\mathbb{R}), \\ f(x_i^{\text{train}}) =
|
||||
y_i^{\text{train}}}} \left( \int_{\mathbb{R}} (f''(x))^2\,dx\right).
|
||||
\]
|
||||
|
||||
For $\lambda \to \infty$ on the other hand $f_g^{*,\lambda}$ converges
|
||||
to linear regression of the data.
|
||||
|
||||
We use two intermediary functions in order to show the convergence of
|
||||
the ridge penalized shallow neural network to adapted regression splines.
|
||||
% In order to show that ridge penalized shallow neural networks converge
|
||||
% to adapted regression splines for a growing amount of hidden nodes we
|
||||
% define two intermediary functions.
|
||||
One being a smooth approximation of
|
||||
the neural network, and a randomized shallow neural network designed
|
||||
to approximate a spline.
|
||||
In order to properly construct these functions we need to take the points
|
||||
of the network into consideration where the slope changes or
|
||||
their points of discontinuity
|
||||
As we use the ReLU activation the function learned by the
|
||||
network will possess points of discontinuity where a neuron in the hidden
|
||||
layer gets activated (goes from 0 -> x>0). We formalize these points
|
||||
as kinks in Definition~\ref{def:kink}.
|
||||
\begin{Definition}
|
||||
\label{def:kink}
|
||||
Let $\mathcal{RN}_w$ be a randomized shallow Neural
|
||||
Network according to Definition~\ref{def:rsnn}, then kinks depending on the random parameters can
|
||||
Network according to Definition~\ref{def:rsnn}, then kinks depending
|
||||
on the random parameters can
|
||||
be observed.
|
||||
\[
|
||||
\mathcal{RN}_w(x) = \sum_{k = 1}^n w_k \sigma(b_k + v_kx)
|
||||
@ -307,15 +422,14 @@ be explizitly mentioned.
|
||||
\end{enumerate}
|
||||
\end{Definition}
|
||||
|
||||
In order to later prove the connection between randomised shallow
|
||||
Neural Networks and regression splines, we first take a look at a
|
||||
smooth approximation of the RSNN.
|
||||
Using the density of the kinks we construct a kernel and smooth the
|
||||
network by applying the kernel similar to convolution.
|
||||
|
||||
\begin{Definition}[Smooth Approximation of Randomized Shallow Neural
|
||||
Network]
|
||||
\label{def:srsnn}
|
||||
Let $RS_{w}$ be a randomized shallow Neural Network according to
|
||||
Definition~\ref{def:RSNN} with weights $w$ and kinks $\xi_k$ with
|
||||
Definition~\ref{def:rsnn} with weights $w$ and kinks $\xi_k$ with
|
||||
corresponding kink density $g_{\xi}$ as given by
|
||||
Definition~\ref{def:kink}.
|
||||
In order to smooth the RSNN consider following kernel for every $x$:
|
||||
@ -338,53 +452,19 @@ satisfies $\int_{\mathbb{R}}\kappa_x dx = 1$. While $f^w$ looks highly
|
||||
similar to a convolution, it differs slightly as the kernel $\kappa_x(s)$
|
||||
is dependent on $x$. Therefore only $f^w = (\mathcal{RN}_w *
|
||||
\kappa_x)(x)$ is well defined, while $\mathcal{RN}_w * \kappa$ is not.
|
||||
We use $f^{w^{*,\tilde{\lambda}}}$ to describe the spline
|
||||
approximating the ridge penalized network
|
||||
$\mathrm{RN}^{*,\tilde{\lambda}}$.
|
||||
|
||||
Now we take a look at weighted regression splines. Later we will prove
|
||||
that the ridge penalized neural network as defined in
|
||||
Definition~\ref{def:rpnn} converges to a weighted regression spline, as
|
||||
the amount of hidden nodes is grown to infinity.
|
||||
Next we construct a randomized shallow neural network which
|
||||
approximates a spline independent from the realization of the random
|
||||
parameters. In order to achieve this we ...
|
||||
|
||||
\begin{Definition}[Adapted Weighted regression spline]
|
||||
\label{def:wrs}
|
||||
Let $x_i^{\text{train}}, y_i^{\text{train}} \in \mathbb{R}, i \in
|
||||
\left\{1,\dots,N\right\}$ be training data. For a given $\lambda \in \mathbb{R}_{>0}$
|
||||
and a function $g: \mathbb{R} \to \mathbb{R}_{>0}$ the weighted
|
||||
regression spline $f^{*, \lambda}_g$ is given by
|
||||
|
||||
\[
|
||||
f^{*, \lambda}_g :\in \argmin_{\substack{f \in \mathcal{C}^2(\mathbb{R})
|
||||
\\ \supp(f) \subseteq \supp(g)}} \underbrace{\left\{ \overbrace{\sum_{i =
|
||||
1}^N \left(f(x_i^{\text{train}}) - y_i^{\text{train}}\right)^2}^{L(f)} +
|
||||
\lambda g(0) \int_{\supp(g)}\frac{\left(f''(x)\right)^2}{g(x)}
|
||||
dx\right\}}_{\eqqcolon F^{\lambda, g}(f)}.
|
||||
\]
|
||||
\todo{Anforderung an Ableitung von f, doch nicht?}
|
||||
\end{Definition}
|
||||
|
||||
Similarly to ridge weight penalized neural networks the parameter
|
||||
$\lambda$ controls a trade-off between accuracy on the training data
|
||||
and smoothness or low second derivative. For $g \equiv 1$ and $\lambda \to 0$ the
|
||||
resulting function $f^{*, 0+}$ will interpolate the training data while minimizing
|
||||
the second derivative. Such a function is known as cubic spline
|
||||
interpolation.
|
||||
\todo{cite cubic spline}
|
||||
|
||||
\[
|
||||
f^{*, 0+} \text{ smooth spline interpolation: }
|
||||
\]
|
||||
\[
|
||||
f^{*, 0+} \coloneqq \lim_{\lambda \to 0+} f^{*, \lambda}_1 \in
|
||||
\argmin_{\substack{f \in \mathcal{C}^2(\mathbb{R}), \\ f(x_i^{\text{train}}) =
|
||||
y_i^{\text{train}}}} \left( \int_{\mathbb{R}} (f''(x))^2\,dx\right).
|
||||
\]
|
||||
|
||||
For $\lambda \to \infty$ on the other hand $f_g^{*,\lambda}$ converges
|
||||
to linear regression of the data.
|
||||
\begin{Definition}[Spline approximating Randomised Shallow Neural
|
||||
Network]
|
||||
\label{def:sann}
|
||||
Let $\mathcal{RN}$ be a randomised shallow Neural Network according
|
||||
to Definition~\ref{def:RSNN} and $f^{*, \lambda}_g$ be the weighted
|
||||
to Definition~\ref{def:rsnn} and $f^{*, \lambda}_g$ be the weighted
|
||||
regression spline as introduced in Definition~\ref{def:wrs}. Then
|
||||
the randomised shallow neural network approximating $f^{*,
|
||||
\lambda}_g$ is given by
|
||||
@ -399,9 +479,8 @@ to linear regression of the data.
|
||||
\end{Definition}
|
||||
|
||||
The approximating nature of the network in
|
||||
Definition~\ref{def:sann} can be seen by LOOKING \todo{besseres Wort
|
||||
finden} at the first derivative of $\mathcal{RN}_{\tilde{w}}(x)$ which is given
|
||||
by
|
||||
Definition~\ref{def:sann} can be seen by examining the first
|
||||
derivative of $\mathcal{RN}_{\tilde{w}}(x)$ which is given by
|
||||
\begin{align}
|
||||
\frac{\partial \mathcal{RN}_{\tilde{w}}}{\partial x}
|
||||
\Big{|}_{x} &= \sum_k^n \tilde{w}_k \mathds{1}_{\left\{b_k + v_k x >
|
||||
@ -411,16 +490,18 @@ by
|
||||
\xi_k < x}} \frac{v_k^2}{g_{\xi}(\xi_k) \mathbb{E}[v^2 \vert \xi
|
||||
= \xi_k]} (f_g^{*, \lambda})''(\xi_k). \label{eq:derivnn}
|
||||
\end{align}
|
||||
\todo{gescheite Ableitungs Notation}
|
||||
As the expression (\ref{eq:derivnn}) behaves similarly to a
|
||||
Riemann-sum for $n \to \infty$ it will converge to the first
|
||||
derievative of $f^{*,\lambda}_g$. A formal proof of this behaviour
|
||||
Riemann-sum for $n \to \infty$ it will converge in probability to the
|
||||
first derivative of $f^{*,\lambda}_g$. A formal proof of this behaviour
|
||||
is given in Lemma~\ref{lem:s0}.
|
||||
|
||||
In order to ensure the functions used in the proof of the convergence
|
||||
are well defined we need to assume some properties of the random
|
||||
parameters and their densities
|
||||
|
||||
In order to formulate the theorem describing the convergence of $RN_w$
|
||||
we need to make a couple of assumptions.
|
||||
\todo{Bessere Formulierung}
|
||||
% In order to formulate the theorem describing the convergence of $RN_w$
|
||||
% we need to make a couple of assumptions.
|
||||
% \todo{Bessere Formulierung}
|
||||
|
||||
\begin{Assumption}~
|
||||
\label{ass:theo38}
|
||||
@ -440,8 +521,8 @@ we need to make a couple of assumptions.
|
||||
\end{enumerate}
|
||||
\end{Assumption}
|
||||
|
||||
As we will prove the prorpsition in the Sobolev space, we hereby
|
||||
introduce it and its inuced\todo{richtiges wort?} norm.
|
||||
As we will prove the convergence in the Sobolev space, we hereby
|
||||
introduce it and the corresponding induced norm.
|
||||
|
||||
\begin{Definition}[Sobolev Space]
|
||||
For $K \subset \mathbb{R}^n$ open and $1 \leq p \leq \infty$ we
|
||||
@ -473,9 +554,10 @@ introduce it and its inuced\todo{richtiges wort?} norm.
|
||||
\]
|
||||
\end{Definition}
|
||||
|
||||
With these assumption in place we can formulate the main theorem.
|
||||
\todo{Bezug Raum}
|
||||
|
||||
With the important definitions and assumptions in place we can now
|
||||
formulate the main theorem describing the convergence of ridge penalized
|
||||
random neural networks to adapted regression splines when the
|
||||
parameters are chosen accordingly.
|
||||
|
||||
\begin{Theorem}[Ridge weight penalty corresponds to weighted regression spline]
|
||||
\label{theo:main1}
|
||||
@ -498,7 +580,8 @@ With these assumption in place we can formulate the main theorem.
|
||||
\tilde{\lambda} & \coloneqq \lambda n g(0).
|
||||
\end{align*}
|
||||
\end{Theorem}
|
||||
We will proof Theo~\ref{theo:main1} by showing that
|
||||
As mentioned above we will prove Theorem~\ref{theo:main1} utilizing
|
||||
the intermediary functions. We show that
|
||||
\begin{equation}
|
||||
\label{eq:main2}
|
||||
\plimn \norm{\mathcal{RN}^{*, \tilde{\lambda}} - f^{w^*}}_{W^{1,
|
||||
@ -509,10 +592,10 @@ and
|
||||
\label{eq:main3}
|
||||
\plimn \norm{f^{w^*} - f_g^{*, \lambda}}_{W^{1,\infty}(K)} = 0
|
||||
\end{equation}
|
||||
and then using the triangle inequality to follow (\ref{eq:main1}). In
|
||||
and then get (\ref{eq:main1}) using the triangle inequality. In
|
||||
order to prove (\ref{eq:main2}) and (\ref{eq:main3}) we will need to
|
||||
introduce a number of auxiliary lemmmata, proves to these will be
|
||||
provided in the appendix, as they would SPRENGEN DEN RAHMEN.
|
||||
introduce a number of auxiliary lemmata, proofs of these will be
|
||||
provided in the appendix.
|
||||
|
||||
|
||||
|
||||
@ -534,7 +617,7 @@ provided in the appendix, as they would SPRENGEN DEN RAHMEN.
|
||||
\exists C_K^2 \in \mathbb{R}_{>0} : \norm{f}_{W^{1,\infty}(K)} \leq
|
||||
C_K^2 \norm{f''}_{L^2(K)}.
|
||||
\end{equation*}
|
||||
% \proof
|
||||
\proof The proof is given in the appendix...
|
||||
% With the fundamental theorem of calculus, if
|
||||
% \(\norm{f}_{L^{\infty}(K)}<\infty\) we get
|
||||
% \begin{equation}
|
||||
@ -555,17 +638,17 @@ provided in the appendix, as they would SPRENGEN DEN RAHMEN.
|
||||
% get (\ref{eq:pti1}).
|
||||
% By using the Hölder inequality, we can proof the second claim.
|
||||
% \begin{align*}
|
||||
% \norm{f'}_{L^{\infty}(K)} &= \sup_{x \in K} \abs{\int_a^bf''(y)
|
||||
% \mathds{1}_{[a,x]}(y)dy} \leq \sup_{x \in
|
||||
% K}\norm{f''\mathds{1}_{[a,x]}}_{L^1(K)}\\
|
||||
% &\hspace{-6pt} \stackrel{\text{Hölder}}{\leq} sup_{x
|
||||
% \in
|
||||
% K}\norm{f''}_{L^2(K)}\norm{\mathds{1}_{[a,x]}}_{L^2(K)}
|
||||
% = \abs{b-a}\norm{f''}_{L^2(K)}.
|
||||
% \end{align*}
|
||||
% Thus (\ref{eq:pti2}) follows with \(C_K^2 \coloneqq
|
||||
% \abs{b-a}C_K^{\infty}\).
|
||||
% \qed
|
||||
% \norm{f'}_{L^{\infty}(K)} &= \sup_{x \in K} \abs{\int_a^bf''(y)
|
||||
% \mathds{1}_{[a,x]}(y)dy} \leq \sup_{x \in
|
||||
% K}\norm{f''\mathds{1}_{[a,x]}}_{L^1(K)}\\
|
||||
% &\hspace{-6pt} \stackrel{\text{Hölder}}{\leq} sup_{x
|
||||
% \in
|
||||
% K}\norm{f''}_{L^2(K)}\norm{\mathds{1}_{[a,x]}}_{L^2(K)}
|
||||
% = \abs{b-a}\norm{f''}_{L^2(K)}.
|
||||
% \end{align*}
|
||||
% Thus (\ref{eq:pti2}) follows with \(C_K^2 \coloneqq
|
||||
% \abs{b-a}C_K^{\infty}\).
|
||||
% \qed
|
||||
\end{Lemma}
|
||||
|
||||
\begin{Lemma}
|
||||
@ -584,62 +667,62 @@ provided in the appendix, as they would SPRENGEN DEN RAHMEN.
|
||||
\mathbb{E}\left[\varphi(\xi, v) \vert \xi = x \right] dx
|
||||
\]
|
||||
uniformly in \(T \in K\).
|
||||
% \proof
|
||||
% For \(T \leq C_{g_{\xi}}^l\) both sides equal 0, so it is sufficient to
|
||||
% consider \(T > C_{g_{\xi}}^l\). With \(\varphi\) and
|
||||
% \(\nicefrac{1}{g_{\xi}}\) uniformly continous in \(\xi\),
|
||||
% \begin{equation}
|
||||
% \label{eq:psi_stet}
|
||||
% \forall \varepsilon > 0 : \exists \delta(\varepsilon) : \forall
|
||||
% \abs{\xi - \xi'} < \delta(\varepsilon) : \abs{\varphi(\xi, v)
|
||||
% \frac{1}{g_{\xi}(\xi)} - \varphi(\xi', v)
|
||||
% \frac{1}{g_{\xi}(\xi')}} < \varepsilon
|
||||
% \end{equation}
|
||||
% uniformly in \(v\). In order to
|
||||
% save space we use the notation \((a \wedge b) \coloneqq \min\{a,b\}\) for $a$ and $b
|
||||
% \in \mathbb{R}$. W.l.o.g. assume \(\sup(g_{\xi})\) in an
|
||||
% intervall. By splitting the interval in disjoint strips of length \(\delta
|
||||
% \leq \delta(\varepsilon)\) we get:
|
||||
|
||||
% \[
|
||||
% \underbrace{\sum_{k \in \kappa : \xi_k < T} \varphi(\xi_k, v_k)
|
||||
% \frac{\bar{h}_k}{2}}_{\circled{1}} =
|
||||
% \underbrace{\sum_{l \in \mathbb{Z}:
|
||||
% \left[\delta l, \delta (l + 1)\right] \subseteq
|
||||
% \left[C_{g_{\xi}}^l, C_{g_{\xi}}^u \wedge T
|
||||
% \right]}}_{\coloneqq \, l \in I_{\delta}} \left( \, \sum_{\substack{k \in \kappa\\
|
||||
% \xi_k \in \left[\delta l, \delta (l + 1)\right]}}
|
||||
% \varphi\left(\xi_k, v_k\right)\frac{\bar{h}_k}{2} \right)
|
||||
% \]
|
||||
% Using (\ref{eq:psi_stet}) we can approximate $\circled{1}$ by
|
||||
% \begin{align*}
|
||||
% \circled{1} & \approx \sum_{l \in I_{\delta}} \left( \, \sum_{\substack{k \in \kappa\\
|
||||
% \xi_k \in \left[\delta l, \delta (l + 1)\right]}}
|
||||
% \left(\varphi\left(l\delta, v_k\right)\frac{1}{g_{\xi}(l\delta)}
|
||||
% \pm \varepsilon\right)\frac{1}{n} \underbrace{\frac{\abs{\left\{m \in
|
||||
% \kappa : \xi_m \in [\delta l, \delta(l + 1)]\right\}}}{\abs{\left\{m \in
|
||||
% \kappa : \xi_m \in [\delta l, \delta(l + 1)]\right\}}}}_{=
|
||||
% 1}\right) \\
|
||||
% % \intertext{}
|
||||
% &= \sum_{l \in I_{\delta}} \left( \frac{ \sum_{ \substack{k \in \kappa\\
|
||||
% \xi_k \in \left[\delta l, \delta (l + 1)\right]}}
|
||||
% \varphi\left(l\delta, v_k\right)}
|
||||
% {\abs{\left\{m \in
|
||||
% \kappa : \xi_m \in [\delta l, \delta(l + 1)]\right\}}}\frac{\abs{\left\{m \in
|
||||
% \kappa : \xi_m \in [\delta l, \delta(l +
|
||||
% 1)]\right\}}}{ng_{\xi}(l\delta)}\right) \pm \varepsilon .\\
|
||||
% \intertext{We use the mean to approximate the number of kinks in
|
||||
% each $\delta$-strip, as it follows a bonomial distribution this
|
||||
% amounts to
|
||||
% \[
|
||||
% \mathbb{E}\left[\abs{\left\{m \in \kappa : \xi_m \in [\delta l,
|
||||
% \delta(l + 1)]\right\}\right]} = n \int_{[\delta l, \delta (l +
|
||||
% 1)]} g_{\xi}(x)dx \approx n (\delta g_{\xi}(l\delta) \pm
|
||||
% \tilde{\varepsilon}).
|
||||
% \]
|
||||
% Bla Bla Bla $v_k$}
|
||||
% \circled{1} & \approx
|
||||
% \end{align*}
|
||||
\proof The proof is given in appendix...
|
||||
% For \(T \leq C_{g_{\xi}}^l\) both sides equal 0, so it is sufficient to
|
||||
% consider \(T > C_{g_{\xi}}^l\). With \(\varphi\) and
|
||||
% \(\nicefrac{1}{g_{\xi}}\) uniformly continous in \(\xi\),
|
||||
% \begin{equation}
|
||||
% \label{eq:psi_stet}
|
||||
% \forall \varepsilon > 0 : \exists \delta(\varepsilon) : \forall
|
||||
% \abs{\xi - \xi'} < \delta(\varepsilon) : \abs{\varphi(\xi, v)
|
||||
% \frac{1}{g_{\xi}(\xi)} - \varphi(\xi', v)
|
||||
% \frac{1}{g_{\xi}(\xi')}} < \varepsilon
|
||||
% \end{equation}
|
||||
% uniformly in \(v\). In order to
|
||||
% save space we use the notation \((a \wedge b) \coloneqq \min\{a,b\}\) for $a$ and $b
|
||||
% \in \mathbb{R}$. W.l.o.g. assume \(\sup(g_{\xi})\) in an
|
||||
% intervall. By splitting the interval in disjoint strips of length \(\delta
|
||||
% \leq \delta(\varepsilon)\) we get:
|
||||
|
||||
% \[
|
||||
% \underbrace{\sum_{k \in \kappa : \xi_k < T} \varphi(\xi_k, v_k)
|
||||
% \frac{\bar{h}_k}{2}}_{\circled{1}} =
|
||||
% \underbrace{\sum_{l \in \mathbb{Z}:
|
||||
% \left[\delta l, \delta (l + 1)\right] \subseteq
|
||||
% \left[C_{g_{\xi}}^l, C_{g_{\xi}}^u \wedge T
|
||||
% \right]}}_{\coloneqq \, l \in I_{\delta}} \left( \, \sum_{\substack{k \in \kappa\\
|
||||
% \xi_k \in \left[\delta l, \delta (l + 1)\right]}}
|
||||
% \varphi\left(\xi_k, v_k\right)\frac{\bar{h}_k}{2} \right)
|
||||
% \]
|
||||
% Using (\ref{eq:psi_stet}) we can approximate $\circled{1}$ by
|
||||
% \begin{align*}
|
||||
% \circled{1} & \approx \sum_{l \in I_{\delta}} \left( \, \sum_{\substack{k \in \kappa\\
|
||||
% \xi_k \in \left[\delta l, \delta (l + 1)\right]}}
|
||||
% \left(\varphi\left(l\delta, v_k\right)\frac{1}{g_{\xi}(l\delta)}
|
||||
% \pm \varepsilon\right)\frac{1}{n} \underbrace{\frac{\abs{\left\{m \in
|
||||
% \kappa : \xi_m \in [\delta l, \delta(l + 1)]\right\}}}{\abs{\left\{m \in
|
||||
% \kappa : \xi_m \in [\delta l, \delta(l + 1)]\right\}}}}_{=
|
||||
% 1}\right) \\
|
||||
% \intertext{}
|
||||
% &= \sum_{l \in I_{\delta}} \left( \frac{ \sum_{ \substack{k \in \kappa\\
|
||||
% \xi_k \in \left[\delta l, \delta (l + 1)\right]}}
|
||||
% \varphi\left(l\delta, v_k\right)}
|
||||
% {\abs{\left\{m \in
|
||||
% \kappa : \xi_m \in [\delta l, \delta(l + 1)]\right\}}}\frac{\abs{\left\{m \in
|
||||
% \kappa : \xi_m \in [\delta l, \delta(l +
|
||||
% 1)]\right\}}}{ng_{\xi}(l\delta)}\right) \pm \varepsilon .\\
|
||||
% \intertext{We use the mean to approximate the number of kinks in
|
||||
% each $\delta$-strip, as it follows a bonomial distribution this
|
||||
% amounts to
|
||||
% \[
|
||||
% \mathbb{E}\left[\abs{\left\{m \in \kappa : \xi_m \in [\delta l,
|
||||
% \delta(l + 1)]\right\}\right]} = n \int_{[\delta l, \delta (l +
|
||||
% 1)]} g_{\xi}(x)dx \approx n (\delta g_{\xi}(l\delta) \pm
|
||||
% \tilde{\varepsilon}).
|
||||
% \]
|
||||
% Bla Bla Bla $v_k$}
|
||||
% \circled{1} & \approx
|
||||
% \end{align*}
|
||||
\end{Lemma}
|
||||
|
||||
\begin{Lemma}[Step 0]
|
||||
@ -666,18 +749,18 @@ provided in the appendix, as they would SPRENGEN DEN RAHMEN.
|
||||
\begin{align*}
|
||||
\plimn \frac{\partial \mathcal{RN}_{\tilde{w}}}{\partial x}
|
||||
\stackrel{(\ref{eq:derivnn})}{=}
|
||||
& \plimn \sum_{\substack{k \in \mathbb{N} \\
|
||||
& \plimn \sum_{\substack{k \in \mathbb{N} \\
|
||||
\xi_k < x}} \frac{v_k^2}{\mathbb{E}[v^2 \vert \xi
|
||||
= \xi_k]} (f_g^{*, \lambda})''(\xi_k) h_{k,n}
|
||||
\stackrel{\text{Lemma}~\ref{lem:cnvh}}{=} \\
|
||||
\stackrel{\text{Lemma}~\ref{lem:cnvh}}{=} \\
|
||||
\stackrel{\phantom{(\ref{eq:derivnn})}}{=}
|
||||
&
|
||||
&
|
||||
\int_{\min\left\{C_{g_{\xi}}^l,T\right\}}^{\min\left\{C_{g_{\xi}}^u,T\right\}}
|
||||
\mathbb{E}\left[\frac{v^2}{\mathbb{E}[v^2|\xi = z]} (f^{*,
|
||||
\lambda}_w)''(\xi) \vert
|
||||
\xi = x \right] dx \equals^{\text{Tower-}}_{\text{property}} \\
|
||||
\stackrel{\phantom{(\ref{eq:derivnn})}}{=}
|
||||
&
|
||||
&
|
||||
\int_{\min\left\{C_{g_{\xi}}^l,
|
||||
T\right\}}^{\min\left\{C_{g_{\xi}}^u,T\right\}}(f^{*,\lambda}_w)''(x)
|
||||
dx.
|
||||
@ -685,6 +768,7 @@ provided in the appendix, as they would SPRENGEN DEN RAHMEN.
|
||||
By the fundamental theorem of calculus and $\supp(f') \subset
|
||||
\supp(f)$, (\ref{eq:s0}) follows with Lemma~\ref{lem:pieq}.
|
||||
\qed
|
||||
\label{lem:s0}
|
||||
\end{Lemma}
|
||||
|
||||
\begin{Lemma}[Step 2]
|
||||
@ -696,19 +780,22 @@ provided in the appendix, as they would SPRENGEN DEN RAHMEN.
|
||||
F^{\lambda, g}(f^{*, \lambda}_g) = 0.
|
||||
\]
|
||||
\proof
|
||||
This can be prooven by showing
|
||||
The proof is given in the appendix...
|
||||
\label{lem:s2}
|
||||
\end{Lemma}
|
||||
|
||||
\begin{Lemma}[Step 3]
|
||||
For any $\lambda > 0$ and training data $(x_i^{\text{train}},
|
||||
y_i^{\text{train}}) \in \mathbb{R}^2, \, i \in
|
||||
\left\{1,\dots,N\right\}$, with $w^*$ and $\tilde{\lambda}$ as
|
||||
defined in Definition~\ref{def:rpnn} and Theorem~\ref{theo:main1}
|
||||
respectively, it holds
|
||||
\left\{1,\dots,N\right\}$, with $w^*$ as
|
||||
defined in Definition~\ref{def:rpnn} and $\tilde{\lambda}$ as
|
||||
defined in Theorem~\ref{theo:main1}, it holds
|
||||
\[
|
||||
\plimn \norm{\mathcal{RN}^{*,\tilde{\lambda}} -
|
||||
f^{w*, \tilde{\lambda}}}_{W^{1,\infty}(K)} = 0.
|
||||
\]
|
||||
\proof The proof is given in Appendix ..
|
||||
\label{lem:s3}
|
||||
\end{Lemma}
|
||||
|
||||
\begin{Lemma}[Step 4]
|
||||
@ -718,9 +805,11 @@ provided in the appendix, as they would SPRENGEN DEN RAHMEN.
|
||||
defined in Definition~\ref{def:rpnn} and Theorem~\ref{theo:main1}
|
||||
respectively, it holds
|
||||
\[
|
||||
\plimn \abs{F_n^{\lambda}(\mathcal{RN}^{*,\tilde{\lambda}}) -
|
||||
\plimn \abs{F_n^{\tilde{\lambda}}(\mathcal{RN}^{*,\tilde{\lambda}}) -
|
||||
F^{\lambda, g}(f^{w*, \tilde{\lambda}})} = 0.
|
||||
\]
|
||||
\proof The proof is given in appendix...
|
||||
\label{lem:s4}
|
||||
\end{Lemma}
|
||||
|
||||
\begin{Lemma}[Step 7]
|
||||
@ -735,11 +824,81 @@ provided in the appendix, as they would SPRENGEN DEN RAHMEN.
|
||||
\[
|
||||
\plimn \norm{f^n - f^{*, \lambda}} = 0.
|
||||
\]
|
||||
\proof The proof is given in appendix ...
|
||||
\label{lem:s7}
|
||||
\end{Lemma}
|
||||
Using these lemmata we can now prove Theorem~\ref{theo:main1}. We
|
||||
start by showing that the error measure of the smooth approximation of
|
||||
the ridge penalized randomized shallow neural network $F^{\lambda,
|
||||
g}\left(f^{w^{*,\tilde{\lambda}}}\right)$
|
||||
will converge in probability to the error measure of the adapted weighted regression
|
||||
spline $F^{\lambda, g}\left(f^{*,\lambda}\right)$ for the specified
|
||||
parameters.
|
||||
|
||||
\textcite{heiss2019} further show a link between ridge penalized
|
||||
networks and randomized shallow neural networks which are trained with
|
||||
gradient descent which is stopped after a certain amount of iterations.
|
||||
Using Lemma~\ref{lem:s4} we get that for every $P \in (0,1)$ and
|
||||
$\varepsilon > 0$ there exists a $n_1 \in \mathbb{N}$ such that
|
||||
\[
|
||||
\mathbb{P}\left[F^{\lambda, g}\left(f^{w^{*,\tilde{\lambda}}}\right) \in
|
||||
F_n^{\tilde{\lambda}}\left(\mathcal{RN}^{*,\tilde{\lambda}}\right)
|
||||
+[-\varepsilon, \varepsilon]\right] > P, \forall n \in \mathbb{N}_{> n_1}.
|
||||
\]
|
||||
As $\mathcal{RN}^{*,\tilde{\lambda}}$ is the optimal network for
|
||||
$F_n^{\tilde{\lambda}}$ we know that
|
||||
\[
|
||||
F_n^{\tilde{\lambda}}\left(\mathcal{RN}^{*,\tilde{\lambda}}\right)
|
||||
\leq F_n^{\tilde{\lambda}}\left(\mathcal{RN}_{\tilde{w}}\right).
|
||||
\]
|
||||
Using Lemma~\ref{lem:s2} we get that for every $P \in (0,1)$ and
|
||||
$\varepsilon > 0$ there exists a $n_2 \in \mathbb{N}$ such that
|
||||
\[
|
||||
\mathbb{P}\left[F_n^{\tilde{\lambda}}\left(\mathcal{RN}_{\tilde{w}}\right)
|
||||
\in F^{\lambda, g}\left(f^{*,\lambda}_g\right)+[-\varepsilon,
|
||||
\varepsilon]\right] > P, \forall n \in \mathbb{N}_{> n_2}.
|
||||
\]
|
||||
If we combine these results we get that for every $P \in (0,1)$ and
|
||||
$\varepsilon > 0$ and $n_3 \geq
|
||||
\max\left\{n_1,n_2\right\}$
|
||||
\[
|
||||
\mathbb{P}\left[F^{\lambda,
|
||||
g}\left(f^{w^{*,\tilde{\lambda}}}\right) \leq F^{\lambda,
|
||||
g}\left(f^{*,\lambda}_g\right)+2\varepsilon\right] > P, \forall
|
||||
n \in \mathbb{N}_{> n_3}.
|
||||
\]
|
||||
As $f^{w^{*,\tilde{\lambda}}}$ is a feasible function and $f^{*,\lambda}_g$ is optimal we know that
|
||||
\[
|
||||
F^{\lambda, g}\left(f^{*,\lambda}_g\right) \leq F^{\lambda, g}\left(f^{w^{*,\tilde{\lambda}}}\right)
|
||||
\]
|
||||
and thus get with the squeeze theorem
|
||||
\[
|
||||
\plimn F^{\lambda, g}\left(f^{w^{*,\tilde{\lambda}}}\right) = F^{\lambda, g}\left(f^{*,\lambda}_g\right).
|
||||
\]
|
||||
We can now use Lemma~\ref{lem:s7} to follow that
|
||||
\begin{equation}
|
||||
\plimn \norm{f^{w^{*,\tilde{\lambda}}} - f^{*,\lambda}_g}
|
||||
_{W^{1,\infty}} = 0.
|
||||
\label{eq:main2}
|
||||
\end{equation}
|
||||
Now by using the triangle inequality with Lemma~\ref{lem:s3} and
|
||||
(\ref{eq:main2}) we get
|
||||
\begin{align*}
|
||||
\plimn \norm{\mathcal{RN}^{*, \tilde{\lambda}} - f_g^{*,\lambda}}
|
||||
\leq& \plimn \bigg(\norm{\mathcal{RN}^{*, \tilde{\lambda}} -
|
||||
f_g^{w^{*,\tilde{\lambda}}}}_{W^{1,\infty}}\\
|
||||
&+ \norm{f^{w^{*,\tilde{\lambda}}} - f^{*,\lambda}_g}
|
||||
_{W^{1,\infty}}\bigg) = 0
|
||||
\end{align*}
|
||||
and thus have proven Theorem~\ref{theo:main1}.
|
||||
We now know that randomized shallow neural networks behave similarly to
|
||||
spline regression if we regularize the size of the weights during
|
||||
training.
|
||||
\textcite{heiss2019} further explore a connection between ridge penalized
|
||||
networks and randomized shallow neural networks which are trained
|
||||
which are only trained for a certain amount of epochs using gradient
|
||||
descent.
|
||||
They further show that the effect of weight regularization can be achieved by
|
||||
training for a certain amount of iterations, this establishes a link between adapted
|
||||
weighted regression splines and randomized shallow neural networks
|
||||
where training is stopped early.
|
||||
|
||||
\newpage
|
||||
\subsection{Simulations}
|
||||
@ -755,7 +914,7 @@ data have been generated.
|
||||
y_{i, A}^{\text{train}} &\coloneqq \sin( x_{i, A}^{\text{train}}). \phantom{(i - 1),
|
||||
i \in \left\{1, \dots, 6\right\}}
|
||||
\end{align*}
|
||||
\item $\text{data}_b = (x_{i, B}^{\text{train}}, y_{i,
|
||||
\item $\text{data}_B = (x_{i, B}^{\text{train}}, y_{i,
|
||||
B}^{\text{train}})$ with
|
||||
\begin{align*}
|
||||
x_{i, B}^{\text{train}} &\coloneqq \pi\frac{i - 8}{7},
|
||||
@ -785,9 +944,9 @@ been calculated with Matlab's ..... As ... minimizes
|
||||
the smoothing parameter used for fitting is $\bar{\lambda} =
|
||||
\frac{1}{1 + \lambda}$. The parameter $\tilde{\lambda}$ for training
|
||||
the networks is chosen as defined in Theorem~\ref{theo:main1} and each
|
||||
one is trained on the full training data for 5000 iterations using
|
||||
one is trained on the full training data for 5000 epochs using
|
||||
gradient descent. The
|
||||
results are given in Figure~\ref{blblb}, here it can be seen that in
|
||||
results are given in Figure~\ref{fig:rs_vs_rs}, here it can be seen that in
|
||||
the interval of the training data $[-\pi, \pi]$ the neural network and
|
||||
smoothing spline are nearly identical, coinciding with the proposition.
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user