From 2ef7cda1dd4eb360cb87e79a21f496d96f09bd57 Mon Sep 17 00:00:00 2001 From: Tobias Arndt Date: Thu, 24 Sep 2020 10:30:18 +0200 Subject: [PATCH] hoffentlich final --- .gitignore | 1 + TeX/Appendix_code.tex | 460 ++++++++++++++++- TeX/Figures/RN_vs_RS.tex | 22 +- TeX/Figures/SGD_vs_GD.tex | 114 ++-- TeX/Figures/fashion_mnist.tex | 4 +- TeX/Figures/gen_dropout.tex | 19 +- TeX/Figures/mnist.tex | 5 +- TeX/Figures/sdg_comparison.tex | 77 ++- TeX/Figures/sin_conv.tex | 3 +- TeX/appendixA.tex | 500 +++++++++++------- TeX/bibliograpy.bib | 15 + TeX/further_applications_of_nn.tex | 805 ++++++++++++++++------------- TeX/introduction.tex | 86 ++- TeX/introduction_nn.tex | 193 +++---- TeX/main.lot | 6 +- TeX/main.out | 25 + TeX/main.tex | 425 +-------------- TeX/theo_3_8.tex | 444 ++++++++-------- 18 files changed, 1790 insertions(+), 1414 deletions(-) create mode 100644 TeX/main.out diff --git a/.gitignore b/.gitignore index ebbe901..d175d27 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,7 @@ main-blx.bib *.tex~ *#*.tex* *~ +*#* # no pdfs *.pdf diff --git a/TeX/Appendix_code.tex b/TeX/Appendix_code.tex index f639eff..bac1fdb 100644 --- a/TeX/Appendix_code.tex +++ b/TeX/Appendix_code.tex @@ -1,19 +1,16 @@ -\section{Code...} -In this ... the implementations of the models used in ... are -given. The randomized shallow neural network used in CHAPTER... are -implemented in Scala from ground up to ensure the model is exactly to -... of Theorem~\ref{theo:main1}. - -The neural networks used in CHAPTER are implemented in python using -the Keras framework given in Tensorflow. Tensorflow is a library -containing highly efficient GPU implementations of most important +\section{Implementations} +In this section the implementations models used are given. +The randomized shallow neural network used in Section~\ref{sec:conv} are +implemented in Scala. No preexisting frameworks were used to ensure +the implementation was according to the definitions used in Theorem~\ref{theo:main1}. + +The neural networks used in Section~\ref{sec:cnn} are implemented in python using +the Keras framework given in TensorFlow. TensorFlow is a library +containing highly efficient GPU implementations of a wide variety tensor operations, such as convolution as well as efficient algorithms -for training neural networks (computing derivatives, updating parameters). -\begin{itemize} - \item Code for randomized shallow neural network - \item Code for keras -\end{itemize} +for training neural networks.% (computing derivatives, updating parameters). +\vspace*{-0.5cm} \begin{lstfloat} \begin{lstlisting}[language=iPython] import breeze.stats.distributions.Uniform @@ -72,10 +69,11 @@ class RSNN(val n: Int, val gamma: Double = 0.001) { } \end{lstlisting} \caption{Scala code used to build and train the ridge penalized - randomized shallow neural network in .... The parameter \textit{lam} - in the train function represents the $\lambda$ parameter in the error - function. The parameters \textit{n} and \textit{gamma} set the number - of hidden nodes and the stepsize for training.} + randomized shallow neural network in Section~\ref{sec:rsnn_sim}.} + % The parameter \textit{lam} + % in the train function represents the $\lambda$ parameter in the error + % function. 
The parameters \textit{n} and \textit{gamma} set the number + % of hidden nodes and the stepsize for training.} \label{lst:rsnn} \end{lstfloat} \clearpage @@ -126,8 +124,8 @@ validation_data=(x_test, y_test), steps_per_epoch = x_train.shape[0]//50) \end{lstlisting} - \caption{Python code for the model used... the MNIST handwritten digits - dataset.} + \caption{Python code used to build the network modeling the MNIST + handwritten digits data set.} \label{lst:handwriting} \end{lstfloat} \clearpage @@ -163,11 +161,11 @@ model.add(tf.keras.layers.Dense(10, activation='softmax')) model.compile(optimizer=tf.keras.optimizers.Adam(lr = 1e-3), loss="categorical_crossentropy", metrics=["accuracy"]) datagen = ImageDataGenerator( - rotation_range = 15, - zoom_range = 0.1, + rotation_range = 6, + zoom_range = 0.15, width_shift_range=2, height_shift_range=2, - shear_range = 0.5, + shear_range = 0.15, fill_mode = 'constant', cval = 0) @@ -180,8 +178,8 @@ datagen = ImageDataGenerator( shuffle=True) \end{lstlisting} - \caption{Python code for the model used... the fashion MNIST - dataset.} + \caption[Python Code for fashion MNIST]{Python code + used to build the network modeling the fashion MNIST data set.} \label{lst:fashion} \end{lstfloat} \clearpage @@ -205,6 +203,418 @@ def get_random_sample(a, b, number_of_samples=10): \caption{Python code used to generate the datasets containing a certain amount of random datapoints per class.} \end{lstfloat} + +\section{Additional Comparisons} +\label{app:comp} +In this section comparisons of cross entropy loss and training +accuracy for the models trained in Section~\ref{sec:smalldata} are given. +\begin{figure}[h] + \centering + \small + \begin{subfigure}[h]{\textwidth} + \begin{tikzpicture} + \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed, + /pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth, + height = 0.4\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east}, + xlabel = {Epoch},ylabel = {Test Loss}, cycle + list/Dark2, every axis plot/.append style={line width + =1.25pt}] + \addplot table + [x=epoch, y=val_loss, col sep=comma, mark = none] + {Figures/Data/adam_1.mean}; + \addplot table + [x=epoch, y=val_loss, col sep=comma, mark = none] + {Figures/Data/adam_dropout_02_1.mean}; + \addplot table + [x=epoch, y=val_loss, col sep=comma, mark = none] + {Figures/Data/adam_datagen_1.mean}; + \addplot table + [x=epoch, y=val_loss, col sep=comma, mark = none] + {Figures/Data/adam_datagen_dropout_02_1.mean}; + + + \addlegendentry{\footnotesize{Default}} + \addlegendentry{\footnotesize{D. 0.2}} + \addlegendentry{\footnotesize{G.}} + \addlegendentry{\footnotesize{G. + D. 0.2}} + \addlegendentry{\footnotesize{D. 
0.4}} + \addlegendentry{\footnotesize{Default}} + \end{axis} + \end{tikzpicture} + \caption{1 Sample per Class} + \vspace{0.25cm} + \end{subfigure} + \begin{subfigure}[h]{\textwidth} + \begin{tikzpicture} + \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed, + /pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth, + height = 0.4\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east}, + xlabel = {Epoch},ylabel = {Test Loss}, cycle + list/Dark2, every axis plot/.append style={line width + =1.25pt}] + \addplot table + [x=epoch, y=val_loss, col sep=comma, mark = none] + {Figures/Data/adam_dropout_00_10.mean}; + \addplot table + [x=epoch, y=val_loss, col sep=comma, mark = none] + {Figures/Data/adam_dropout_02_10.mean}; + \addplot table + [x=epoch, y=val_loss, col sep=comma, mark = none] + {Figures/Data/adam_datagen_dropout_00_10.mean}; + \addplot table + [x=epoch, y=val_loss, col sep=comma, mark = none] + {Figures/Data/adam_datagen_dropout_02_10.mean}; + + + \addlegendentry{\footnotesize{Default.}} + \addlegendentry{\footnotesize{D. 0.2}} + \addlegendentry{\footnotesize{G.}} + \addlegendentry{\footnotesize{G + D. 0.2}} + \end{axis} + \end{tikzpicture} + \caption{10 Samples per Class} + \end{subfigure} + \begin{subfigure}[h]{\textwidth} + \begin{tikzpicture} + \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed, + /pgf/number format/precision=3},tick style = {draw = none}, width = 0.9875\textwidth, + height = 0.4\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east}, + xlabel = {Epoch}, ylabel = {Test Loss}, cycle + list/Dark2, every axis plot/.append style={line width + =1.25pt}] + \addplot table + [x=epoch, y=val_loss, col sep=comma, mark = none] + {Figures/Data/adam_dropout_00_100.mean}; + \addplot table + [x=epoch, y=val_loss, col sep=comma, mark = none] + {Figures/Data/adam_dropout_02_100.mean}; + \addplot table + [x=epoch, y=val_loss, col sep=comma, mark = none] + {Figures/Data/adam_datagen_dropout_00_100.mean}; + \addplot table + [x=epoch, y=val_loss, col sep=comma, mark = none] + {Figures/Data/adam_datagen_dropout_02_100.mean}; + + \addlegendentry{\footnotesize{Default.}} + \addlegendentry{\footnotesize{D. 0.2}} + \addlegendentry{\footnotesize{G.}} + \addlegendentry{\footnotesize{G + D. 
0.2}} + \end{axis} + \end{tikzpicture} + \caption{100 Samples per Class} + \vspace{.25cm} + \end{subfigure} + \caption[Mean Test Loss for Subsets of MNIST Handwritten + Digits]{Mean test cross entropy loss of the models fitting the + sampled subsets of MNIST + handwritten digits over the 125 epochs of training.} +\end{figure} + +\begin{figure}[h] + \centering + \small + \begin{subfigure}[h]{\textwidth} + \begin{tikzpicture} + \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed, + /pgf/number format/precision=3},tick style = + {draw = none}, width = \textwidth, + height = 0.4\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east}, + xlabel = {Epoch},ylabel = {Test Loss}, cycle + list/Dark2, every axis plot/.append style={line width + =1.25pt}] + \addplot table + [x=epoch, y=val_loss, col sep=comma, mark = none] + {Figures/Data/fashion_dropout_0_1.mean}; + \addplot table + [x=epoch, y=val_loss, col sep=comma, mark = none] + {Figures/Data/fashion_dropout_2_1.mean}; + \addplot table + [x=epoch, y=val_loss, col sep=comma, mark = none] + {Figures/Data/fashion_datagen_dropout_0_1.mean}; + \addplot table + [x=epoch, y=val_loss, col sep=comma, mark = none] + {Figures/Data/fashion_datagen_dropout_2_1.mean}; + + + \addlegendentry{\footnotesize{Default}} + \addlegendentry{\footnotesize{D. 0.2}} + \addlegendentry{\footnotesize{G.}} + \addlegendentry{\footnotesize{G. + D. 0.2}} + \addlegendentry{\footnotesize{D. 0.4}} + \end{axis} + \end{tikzpicture} + \caption{1 Sample per Class} + \vspace{0.25cm} + \end{subfigure} + \begin{subfigure}[h]{\textwidth} + \begin{tikzpicture} + \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed, + /pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth, + height = 0.4\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east}, + xlabel = {Epoch},ylabel = {Test Loss}, cycle + list/Dark2, every axis plot/.append style={line width + =1.25pt}, ymin = {0.62}] + \addplot table + [x=epoch, y=val_loss, col sep=comma, mark = none] + {Figures/Data/fashion_dropout_0_10.mean}; + \addplot table + [x=epoch, y=val_loss, col sep=comma, mark = none] + {Figures/Data/fashion_dropout_2_10.mean}; + \addplot table + [x=epoch, y=val_loss, col sep=comma, mark = none] + {Figures/Data/fashion_datagen_dropout_0_10.mean}; + \addplot table + [x=epoch, y=val_loss, col sep=comma, mark = none] + {Figures/Data/fashion_datagen_dropout_2_10.mean}; + + + \addlegendentry{\footnotesize{Default.}} + \addlegendentry{\footnotesize{D. 0.2}} + \addlegendentry{\footnotesize{G.}} + \addlegendentry{\footnotesize{G + D. 
0.2}} + \end{axis} + \end{tikzpicture} + \caption{10 Samples per Class} + \end{subfigure} + \begin{subfigure}[h]{\textwidth} + \begin{tikzpicture} + \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed, + /pgf/number format/precision=3},tick style = {draw = none}, width = 0.9875\textwidth, + height = 0.4\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east}, + xlabel = {Epoch}, ylabel = {Test Loss}, cycle + list/Dark2, every axis plot/.append style={line width + =1.25pt}] + \addplot table + [x=epoch, y=val_loss, col sep=comma, mark = none] + {Figures/Data/fashion_dropout_0_100.mean}; + \addplot table + [x=epoch, y=val_loss, col sep=comma, mark = none] + {Figures/Data/fashion_dropout_2_100.mean}; + \addplot table + [x=epoch, y=val_loss, col sep=comma, mark = none] + {Figures/Data/fashion_datagen_dropout_0_100.mean}; + \addplot table + [x=epoch, y=val_loss, col sep=comma, mark = none] + {Figures/Data/fashion_datagen_dropout_2_100.mean}; + + \addlegendentry{\footnotesize{Default.}} + \addlegendentry{\footnotesize{D. 0.2}} + \addlegendentry{\footnotesize{G.}} + \addlegendentry{\footnotesize{G + D. 0.2}} + \end{axis} + \end{tikzpicture} + \caption{100 Samples per Class} + \vspace{.25cm} + \end{subfigure} + \caption[Mean Test Accuracies for Subsets of Fashion MNIST]{Mean + test cross entropy loss of the models fitting the sampled subsets + of fashion MNIST + over the 125 epochs of training.} +\end{figure} + +\begin{figure}[h] + \centering + \small + \begin{subfigure}[h]{\textwidth} + \begin{tikzpicture} + \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed, + /pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth, + height = 0.4\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east}, + xlabel = {Epoch},ylabel = {Training Accuracy}, cycle + list/Dark2, every axis plot/.append style={line width + =1.25pt}] + \addplot table + [x=epoch, y=accuracy, col sep=comma, mark = none] + {Figures/Data/adam_1.mean}; + \addplot table + [x=epoch, y=accuracy, col sep=comma, mark = none] + {Figures/Data/adam_dropout_02_1.mean}; + \addplot table + [x=epoch, y=accuracy, col sep=comma, mark = none] + {Figures/Data/adam_datagen_1.mean}; + \addplot table + [x=epoch, y=accuracy, col sep=comma, mark = none] + {Figures/Data/adam_datagen_dropout_02_1.mean}; + + + \addlegendentry{\footnotesize{Default}} + \addlegendentry{\footnotesize{D. 0.2}} + \addlegendentry{\footnotesize{G.}} + \addlegendentry{\footnotesize{G. + D. 0.2}} + \addlegendentry{\footnotesize{D. 
0.4}} + \addlegendentry{\footnotesize{Default}} + \end{axis} + \end{tikzpicture} + \caption{1 Sample per Class} + \vspace{0.25cm} + \end{subfigure} + \begin{subfigure}[h]{\textwidth} + \begin{tikzpicture} + \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed, + /pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth, + height = 0.4\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east}, + xlabel = {Epoch},ylabel = {Test Accuracy}, cycle + list/Dark2, every axis plot/.append style={line width + =1.25pt}] + \addplot table + [x=epoch, y=accuracy, col sep=comma, mark = none] + {Figures/Data/adam_dropout_00_10.mean}; + \addplot table + [x=epoch, y=accuracy, col sep=comma, mark = none] + {Figures/Data/adam_dropout_02_10.mean}; + \addplot table + [x=epoch, y=accuracy, col sep=comma, mark = none] + {Figures/Data/adam_datagen_dropout_00_10.mean}; + \addplot table + [x=epoch, y=accuracy, col sep=comma, mark = none] + {Figures/Data/adam_datagen_dropout_02_10.mean}; + + + \addlegendentry{\footnotesize{Default.}} + \addlegendentry{\footnotesize{D. 0.2}} + \addlegendentry{\footnotesize{G.}} + \addlegendentry{\footnotesize{G + D. 0.2}} + \end{axis} + \end{tikzpicture} + \caption{10 Samples per Class} + \end{subfigure} + \begin{subfigure}[h]{\textwidth} + \begin{tikzpicture} + \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed, + /pgf/number format/precision=3},tick style = {draw = none}, width = 0.9875\textwidth, + height = 0.4\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east}, + xlabel = {Epoch}, ylabel = {Training Accuracy}, cycle + list/Dark2, every axis plot/.append style={line width + =1.25pt}, ymin = {0.92}] + \addplot table + [x=epoch, y=accuracy, col sep=comma, mark = none] + {Figures/Data/adam_dropout_00_100.mean}; + \addplot table + [x=epoch, y=accuracy, col sep=comma, mark = none] + {Figures/Data/adam_dropout_02_100.mean}; + \addplot table + [x=epoch, y=accuracy, col sep=comma, mark = none] + {Figures/Data/adam_datagen_dropout_00_100.mean}; + \addplot table + [x=epoch, y=accuracy, col sep=comma, mark = none] + {Figures/Data/adam_datagen_dropout_02_100.mean}; + + \addlegendentry{\footnotesize{Default.}} + \addlegendentry{\footnotesize{D. 0.2}} + \addlegendentry{\footnotesize{G.}} + \addlegendentry{\footnotesize{G + D. 
0.2}} + \end{axis} + \end{tikzpicture} + \caption{100 Samples per Class} + \vspace{.25cm} + \end{subfigure} + \caption[Mean Training Accuracies for Subsets of MNIST Handwritten + Digits]{Mean training accuracies of the models fitting the sampled + subsets of MNIST + handwritten digits over the 125 epochs of training.} +\end{figure} + +\begin{figure}[h] + \centering + \small + \begin{subfigure}[h]{\textwidth} + \begin{tikzpicture} + \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed, + /pgf/number format/precision=3},tick style = + {draw = none}, width = \textwidth, + height = 0.4\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east}, + xlabel = {Epoch},ylabel = {Training Accuracy}, cycle + list/Dark2, every axis plot/.append style={line width + =1.25pt}] + \addplot table + [x=epoch, y=accuracy, col sep=comma, mark = none] + {Figures/Data/fashion_dropout_0_1.mean}; + \addplot table + [x=epoch, y=accuracy, col sep=comma, mark = none] + {Figures/Data/fashion_dropout_2_1.mean}; + \addplot table + [x=epoch, y=accuracy, col sep=comma, mark = none] + {Figures/Data/fashion_datagen_dropout_0_1.mean}; + \addplot table + [x=epoch, y=accuracy, col sep=comma, mark = none] + {Figures/Data/fashion_datagen_dropout_2_1.mean}; + + + \addlegendentry{\footnotesize{Default}} + \addlegendentry{\footnotesize{D. 0.2}} + \addlegendentry{\footnotesize{G.}} + \addlegendentry{\footnotesize{G. + D. 0.2}} + \addlegendentry{\footnotesize{D. 0.4}} + \end{axis} + \end{tikzpicture} + \caption{1 Sample per Class} + \vspace{0.25cm} + \end{subfigure} + \begin{subfigure}[h]{\textwidth} + \begin{tikzpicture} + \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed, + /pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth, + height = 0.4\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east}, + xlabel = {Epoch},ylabel = {Training Accuracy}, cycle + list/Dark2, every axis plot/.append style={line width + =1.25pt}, ymin = {0.62}] + \addplot table + [x=epoch, y=accuracy, col sep=comma, mark = none] + {Figures/Data/fashion_dropout_0_10.mean}; + \addplot table + [x=epoch, y=accuracy, col sep=comma, mark = none] + {Figures/Data/fashion_dropout_2_10.mean}; + \addplot table + [x=epoch, y=accuracy, col sep=comma, mark = none] + {Figures/Data/fashion_datagen_dropout_0_10.mean}; + \addplot table + [x=epoch, y=accuracy, col sep=comma, mark = none] + {Figures/Data/fashion_datagen_dropout_2_10.mean}; + + + \addlegendentry{\footnotesize{Default.}} + \addlegendentry{\footnotesize{D. 0.2}} + \addlegendentry{\footnotesize{G.}} + \addlegendentry{\footnotesize{G + D. 
0.2}} + \end{axis} + \end{tikzpicture} + \caption{10 Samples per Class} + \end{subfigure} + \begin{subfigure}[h]{\textwidth} + \begin{tikzpicture} + \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed, + /pgf/number format/precision=3},tick style = {draw = none}, width = 0.9875\textwidth, + height = 0.4\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east}, + xlabel = {Epoch}, ylabel = {Training Accuracy}, cycle + list/Dark2, every axis plot/.append style={line width + =1.25pt}] + \addplot table + [x=epoch, y=accuracy, col sep=comma, mark = none] + {Figures/Data/fashion_dropout_0_100.mean}; + \addplot table + [x=epoch, y=accuracy, col sep=comma, mark = none] + {Figures/Data/fashion_dropout_2_100.mean}; + \addplot table + [x=epoch, y=accuracy, col sep=comma, mark = none] + {Figures/Data/fashion_datagen_dropout_0_100.mean}; + \addplot table + [x=epoch, y=accuracy, col sep=comma, mark = none] + {Figures/Data/fashion_datagen_dropout_2_100.mean}; + + \addlegendentry{\footnotesize{Default.}} + \addlegendentry{\footnotesize{D. 0.2}} + \addlegendentry{\footnotesize{G.}} + \addlegendentry{\footnotesize{G + D. 0.2}} + \end{axis} + \end{tikzpicture} + \caption{100 Samples per Class} + \vspace{.25cm} + \end{subfigure} + \caption[Mean Training Accuracies for Subsets of Fashion MNIST]{Mean + training accuracies of the models fitting the sampled subsets of fashion MNIST + over the 125 epochs of training.} +\end{figure} + %%% Local Variables: %%% mode: latex %%% TeX-master: "main" diff --git a/TeX/Figures/RN_vs_RS.tex b/TeX/Figures/RN_vs_RS.tex index a561abf..88e7201 100644 --- a/TeX/Figures/RN_vs_RS.tex +++ b/TeX/Figures/RN_vs_RS.tex @@ -10,13 +10,14 @@ plot coordinates { } } \begin{figure} - \begin{subfigure}[b]{0.5\textwidth} + \begin{subfigure}[b]{0.48\textwidth} \begin{subfigure}[b]{\textwidth} \begin{adjustbox}{width=\textwidth, height=0.25\textheight} \begin{tikzpicture} \begin{axis}[ ytick = {-1, 0, 1, 2}, - yticklabels = {$-1$, $\phantom{-0.}0$, $1$, $2$},] + yticklabels = {$-1$, $\phantom{-0.}0$, $1$, $2$}, + restrict x to domain=-4:4, enlarge x limits = {0.1}] \addplot table [x=x, y=y, col sep=comma, only marks, forget plot] {Figures/Data/sin_6.csv}; \addplot [black, line width=2pt] table [x=x, y=y, col @@ -33,7 +34,7 @@ plot coordinates { \begin{subfigure}[b]{\textwidth} \begin{adjustbox}{width=\textwidth, height=0.25\textheight} \begin{tikzpicture} - \begin{axis} + \begin{axis}[restrict x to domain=-4:4, enlarge x limits = {0.1}] \addplot table [x=x, y=y, col sep=comma, only marks, forget plot] {Figures/Data/sin_6.csv}; \addplot [black, line width=2pt] table [x=x, y=y, col sep=comma, mark=none] {Figures/Data/matlab_1.csv}; @@ -49,7 +50,7 @@ plot coordinates { \begin{subfigure}[b]{\textwidth} \begin{adjustbox}{width=\textwidth, height=0.25\textheight} \begin{tikzpicture} - \begin{axis} + \begin{axis}[restrict x to domain=-4:4, enlarge x limits = {0.1}] \addplot table [x=x, y=y, col sep=comma, only marks, forget plot] {Figures/Data/sin_6.csv}; \addplot [black, line width=2pt] table [x=x, y=y, col sep=comma, mark=none] {Figures/Data/matlab_3.csv}; @@ -63,13 +64,14 @@ plot coordinates { \caption{$\lambda = 3.0$} \end{subfigure} \end{subfigure} - \begin{subfigure}[b]{0.5\textwidth} + \begin{subfigure}[b]{0.48\textwidth} \begin{subfigure}[b]{\textwidth} \begin{adjustbox}{width=\textwidth, height=0.245\textheight} \begin{tikzpicture} \begin{axis}[ ytick = {-2,-1, 0, 1, 2}, - yticklabels = {$-2$,$-1$, $\phantom{-0.}0$, $1$, $2$},] + yticklabels = {$-2$,$-1$, 
$\phantom{-0.}0$, $1$, $2$}, + restrict x to domain=-4:4, enlarge x limits = {0.1}] \addplot table [x=x, y=y, col sep=comma, only marks, forget plot] {Figures/Data/data_sin_d_t.csv}; \addplot [black, line width=2pt] table [x=x, y=y, col sep=comma, mark=none] {Figures/Data/matlab_sin_d_01.csv}; @@ -85,7 +87,7 @@ plot coordinates { \begin{subfigure}[b]{\textwidth} \begin{adjustbox}{width=\textwidth, height=0.25\textheight} \begin{tikzpicture} - \begin{axis} + \begin{axis}[restrict x to domain=-4:4, enlarge x limits = {0.1}] \addplot table [x=x, y=y, col sep=comma, only marks, forget plot] {Figures/Data/data_sin_d_t.csv}; \addplot [black, line width=2pt] table [x=x, y=y, col sep=comma, mark=none] {Figures/Data/matlab_sin_d_1.csv}; @@ -101,7 +103,7 @@ plot coordinates { \begin{subfigure}[b]{\textwidth} \begin{adjustbox}{width=\textwidth, height=0.25\textheight} \begin{tikzpicture} - \begin{axis} + \begin{axis}[restrict x to domain=-4:4, enlarge x limits = {0.1}] \addplot table [x=x, y=y, col sep=comma, only marks, forget plot] {Figures/Data/data_sin_d_t.csv}; \addplot [black, line width=2pt] table [x=x, y=y, col sep=comma, mark=none] {Figures/Data/matlab_sin_d_3.csv}; @@ -115,8 +117,8 @@ plot coordinates { \caption{$\lambda = 3.0$} \end{subfigure} \end{subfigure} - \caption[Comparison of shallow neural networks and regression - splines]{% In these Figures the behaviour stated in ... is + \caption[Comparison of Shallow Neural Networks and Regression + Splines] {% In these Figures the behaviour stated in ... is % visualized % in two exaples. For $(a), (b), (c)$ six values of sinus equidistantly % spaced on $[-\pi, \pi]$ have been used as training data. For diff --git a/TeX/Figures/SGD_vs_GD.tex b/TeX/Figures/SGD_vs_GD.tex index 10318ea..0aa3e89 100644 --- a/TeX/Figures/SGD_vs_GD.tex +++ b/TeX/Figures/SGD_vs_GD.tex @@ -4,28 +4,32 @@ legend image code/.code={ \draw[mark repeat=2,mark phase=2] plot coordinates { (0cm,0cm) -(0.0cm,0cm) %% default is (0.3cm,0cm) -(0.0cm,0cm) %% default is (0.6cm,0cm) +(0.15cm,0cm) %% default is (0.3cm,0cm) +(0.3cm,0cm) %% default is (0.6cm,0cm) };% } } \begin{figure} \begin{subfigure}[h!]{\textwidth} \begin{tikzpicture} - \begin{axis}[tick style = {draw = none}, width = \textwidth, - height = 0.6\textwidth, + \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed, + /pgf/number format/precision=3},tick style = {draw = none}, width = 0.975\textwidth, + height = 0.6\textwidth, legend + style={at={(0.0125,0.7)},anchor=north west}, + xlabel = {Epoch}, ylabel = {Test Accuracy}, cycle + list/Dark2, every axis plot/.append style={line width + =1.25pt, mark = *, mark size=1pt}, xtick = {1, 3, 5,7,9,11,13,15,17,19}, xticklabels = {$2$, $4$, $6$, $8$, - $10$,$12$,$14$,$16$,$18$,$20$}, - xlabel = {training epoch}, ylabel = {classification accuracy}] + $10$,$12$,$14$,$16$,$18$,$20$}] \addplot table [x=epoch, y=val_accuracy, col sep=comma] {Figures/Data/GD_01.log}; \addplot table - [x=epoch, y=val_accuracy, col sep=comma] {Figures/Data/GD_05.log}; + [x=epoch, y=val_accuracy, col sep=comma, mark = *] {Figures/Data/GD_05.log}; \addplot table - [x=epoch, y=val_accuracy, col sep=comma] {Figures/Data/GD_1.log}; + [x=epoch, y=val_accuracy, col sep=comma, mark = *] {Figures/Data/GD_1.log}; \addplot table - [x=epoch, y=val_accuracy, col sep=comma] + [x=epoch, y=val_accuracy, col sep=comma, mark = *] {Figures/Data/SGD_01_b32.log}; \addlegendentry{GD$_{0.01}$} @@ -34,59 +38,65 @@ plot coordinates { \addlegendentry{SGD$_{0.01}$} \end{axis} \end{tikzpicture} - 
%\caption{Classification accuracy} + \caption{Test accuracy during training.} \end{subfigure} - \begin{subfigure}[b]{\textwidth} - \begin{tikzpicture} - \begin{axis}[tick style = {draw = none}, width = \textwidth, - height = 0.6\textwidth, - ytick = {0, 1, 2, 3, 4}, - yticklabels = {$0$, $1$, $\phantom{0.}2$, $3$, $4$}, - xtick = {1, 3, 5,7,9,11,13,15,17,19}, - xticklabels = {$2$, $4$, $6$, $8$, - $10$,$12$,$14$,$16$,$18$,$20$}, - xlabel = {training epoch}, ylabel = {error measure\vphantom{fy}}] - \addplot table - [x=epoch, y=val_loss, col sep=comma] {Figures/Data/GD_01.log}; - \addplot table - [x=epoch, y=val_loss, col sep=comma] {Figures/Data/GD_05.log}; - \addplot table - [x=epoch, y=val_loss, col sep=comma] {Figures/Data/GD_1.log}; - \addplot table - [x=epoch, y=val_loss, col sep=comma] {Figures/Data/SGD_01_b32.log}; + % \begin{subfigure}[b]{\textwidth} + % \begin{tikzpicture} + % \begin{axis}[tick style = {draw = none}, width = \textwidth, + % height = 0.6\textwidth, + % ytick = {0, 1, 2, 3, 4}, + % yticklabels = {$0$, $1$, $\phantom{0.}2$, $3$, $4$}, + % xtick = {1, 3, 5,7,9,11,13,15,17,19}, + % xticklabels = {$2$, $4$, $6$, $8$, + % $10$,$12$,$14$,$16$,$18$,$20$}, + % xlabel = {training epoch}, ylabel = {error measure\vphantom{fy}}] + % \addplot table + % [x=epoch, y=val_loss, col sep=comma] {Figures/Data/GD_01.log}; + % \addplot table + % [x=epoch, y=val_loss, col sep=comma] {Figures/Data/GD_05.log}; + % \addplot table + % [x=epoch, y=val_loss, col sep=comma] {Figures/Data/GD_1.log}; + % \addplot table + % [x=epoch, y=val_loss, col sep=comma] {Figures/Data/SGD_01_b32.log}; - \addlegendentry{GD$_{0.01}$} - \addlegendentry{GD$_{0.05}$} - \addlegendentry{GD$_{0.1}$} - \addlegendentry{SGD$_{0.01}$} + % \addlegendentry{GD$_{0.01}$} + % \addlegendentry{GD$_{0.05}$} + % \addlegendentry{GD$_{0.1}$} + % \addlegendentry{SGD$_{0.01}$} - \end{axis} - \end{tikzpicture} - \caption{Performance metrics during training} - \end{subfigure} + % \end{axis} + % \end{tikzpicture} + % \caption{Performance metrics during training} + % \end{subfigure} % \\~\\ - \caption[Performance comparison of SDG and GD]{The neural network + + \begin{subfigure}[b]{1.0\linewidth} + \begin{tabu} to \textwidth {@{} *4{X[c]}c*4{X[c]} @{}} + \multicolumn{4}{c}{Test Accuracy} + &~&\multicolumn{4}{c}{Test Loss} + \\\cline{1-4}\cline{6-9} + GD$_{0.01}$&GD$_{0.05}$&GD$_{0.1}$&SGD$_{0.01}$&&GD$_{0.01}$&GD$_{0.05}$&GD$_{0.1}$&SGD$_{0.01}$ + \\\cline{1-4}\cline{6-9} + 0.265&0.633&0.203&0.989&&2.267&1.947&3.911&0.032 \\ + \multicolumn{4}{c}{Training Accuracy} + &~&\multicolumn{4}{c}{Training Loss} + \\\cline{1-4}\cline{6-9} + GD$_{0.01}$&GD$_{0.05}$&GD$_{0.1}$&SGD$_{0.01}$&&GD$_{0.01}$&GD$_{0.05}$&GD$_{0.1}$&SGD$_{0.01}$ + \\\cline{1-4}\cline{6-9} + 0.250&0.599&0.685&0.996&&2.271&1.995&1.089&0.012 \\ + \end{tabu} + \caption{Performance metrics after 20 training epochs.} + \label{table:sgd_vs_gd} + \end{subfigure} + + \caption[Performance Comparison of SDG and GD]{The neural network given in Figure~\ref{fig:mnist_architecture} trained with different algorithms on the MNIST handwritten digits data set. For gradient - descent the learning rated 0.01, 0.05 and 0.1 are (GD$_{\cdot}$). For + descent the learning rated 0.01, 0.05, and 0.1 are (GD$_{\cdot}$). 
For stochastic gradient descend a batch size of 32 and learning rate of 0.01 is used (SDG$_{0.01}$).} \label{fig:sgd_vs_gd} \end{figure} - -\begin{table}[h] - \begin{tabu} to \textwidth {@{} *4{X[c]}c*4{X[c]} @{}} - \multicolumn{4}{c}{Classification Accuracy} - &~&\multicolumn{4}{c}{Error Measure} - \\\cline{1-4}\cline{6-9} - GD$_{0.01}$&GD$_{0.05}$&GD$_{0.1}$&SGD$_{0.01}$&&GD$_{0.01}$&GD$_{0.05}$&GD$_{0.1}$&SGD$_{0.01}$ - \\\cline{1-4}\cline{6-9} - 0.265&0.633&0.203&0.989&&2.267&1.947&3.91&0.032 - \end{tabu} - \caption{Performance metrics of the networks trained in - Figure~\ref{fig:sgd_vs_gd} after 20 training epochs.} - \label{table:sgd_vs_gd} -\end{table} %%% Local Variables: %%% mode: latex %%% TeX-master: "../main" diff --git a/TeX/Figures/fashion_mnist.tex b/TeX/Figures/fashion_mnist.tex index 0caced7..c6c69c4 100644 --- a/TeX/Figures/fashion_mnist.tex +++ b/TeX/Figures/fashion_mnist.tex @@ -40,11 +40,11 @@ \includegraphics[width=\textwidth]{Figures/Data/fashion_mnist9.pdf} \caption{Ankle boot} \end{subfigure} - \caption[Fashion MNIST data set]{The fashtion MNIST data set contains 70.000 images of + \caption[Fashion MNIST Data Set]{The fashtion MNIST data set contains 70.000 images of preprocessed product images from Zalando, which are categorized as T-shirt/top, Trouser, Pullover, Dress, Coat, Sandal, Shirt, Sneaker, Bag, Ankle boot. Of these images 60.000 are used as training images, while - the rest are used to validate the models trained.} + the rest is used to validate the models trained.} \label{fig:fashionMNIST} \end{figure} %%% Local Variables: diff --git a/TeX/Figures/gen_dropout.tex b/TeX/Figures/gen_dropout.tex index fc24bac..8479748 100644 --- a/TeX/Figures/gen_dropout.tex +++ b/TeX/Figures/gen_dropout.tex @@ -16,7 +16,7 @@ plot coordinates { \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed, /pgf/number format/precision=3},tick style = {draw = none}, width = 0.975\textwidth, height = 0.6\textwidth, ymin = 0.988, legend style={at={(0.9825,0.0175)},anchor=south east}, - xlabel = {epoch}, ylabel = {Classification Accuracy}, cycle + xlabel = {Epoch}, ylabel = {Test Accuracy}, cycle list/Dark2, every axis plot/.append style={line width =1.25pt}] \addplot table [x=epoch, y=val_accuracy, col sep=comma, mark = none] @@ -45,18 +45,18 @@ plot coordinates { \addlegendentry{\footnotesize{Default}} \end{axis} \end{tikzpicture} - \caption{Classification accuracy} + \caption{Test Accuracy} \vspace{.25cm} \end{subfigure} \begin{subfigure}[h]{1.0\linewidth} \begin{tabu} to \textwidth {@{}lc*5{X[c]}@{}} - \Tstrut \Bstrut & \textsc{\,Adam\,} & D. 0.2 & D. 0.4 & G. &G.+D.\,0.2 & G.+D.\,0.4 \\ + \Tstrut \Bstrut & Default & D. 0.2 & D. 0.4 & G. 
&G.+D.\,0.2 & G.+D.\,0.4 \\ \hline \multicolumn{7}{c}{Test Accuracy}\Bstrut \\ \cline{2-7} - mean \Tstrut & 0.9914 & 0.9923 & 0.9930 & 0.9937 & 0.9938 & 0.9943 \\ - max & 0.9926 & 0.9930 & 0.9934 & 0.9946 & 0.9955 & 0.9956 \\ - min & 0.9887 & 0.9909 & 0.9922 & 0.9929 & 0.9929 & 0.9934 \\ + mean \Tstrut & 0.9914 & 0.9923 & 0.9930 & 0.9937 & 0.9943 & 0.9944 \\ + max & 0.9926 & 0.9930 & 0.9934 & 0.9946 & 0.9957 & 0.9956 \\ + min & 0.9887 & 0.9909 & 0.9922 & 0.9929 & 0.9930 & 0.9934 \\ \hline \multicolumn{7}{c}{Training Accuracy}\Bstrut \\ \cline{2-7} @@ -64,15 +64,16 @@ plot coordinates { max & 0.9996 & 0.9996 & 0.9992 & 0.9979 & 0.9971 & 0.9937 \\ min & 0.9992 & 0.9990 & 0.9984 & 0.9947 & 0.9926 & 0.9908 \\ \end{tabu} - \caption{Mean and maximum accuracy after 48 epochs of training.} + \caption{Mean, maximum and minimum accuracy after 50 epochs of training.} \label{fig:gen_dropout_b} \end{subfigure} - \caption[Performance comparison of overfitting measures]{Accuracy for the net given in ... with Dropout (D.), + \caption[Performance Comparison of Overfitting Measures]{Accuracy + for the net given in Figure~\ref{fig:mnist_architecture} with Dropout (D.), data generation (G.), a combination, or neither (Default) implemented and trained with \textsc{Adam}. For each epoch the 60.000 training samples were used, or for data generation 10.000 steps with each using batches of 60 generated data points. For each configuration the - model was trained 5 times and the average accuracies at each epoch + model was trained five times and the average accuracies at each epoch are given in (a). Mean, maximum and minimum values of accuracy on the test and training set are given in (b).} \label{fig:gen_dropout} diff --git a/TeX/Figures/mnist.tex b/TeX/Figures/mnist.tex index fa053d9..afd2dbe 100644 --- a/TeX/Figures/mnist.tex +++ b/TeX/Figures/mnist.tex @@ -30,9 +30,10 @@ \begin{subfigure}{0.19\textwidth} \includegraphics[width=\textwidth]{Figures/Data/mnist9.pdf} \end{subfigure} - \caption[MNIST data set]{The MNIST data set contains 70.000 images of preprocessed handwritten + \caption[MNIST Database of Handwritten Digits]{The MNIST database of handwritten + digits contains 70.000 images of preprocessed handwritten digits. 
Of these images 60.000 are used as training images, while - the rest are used to validate the models trained.} + the rest is used to validate the models trained.} \label{fig:MNIST} \end{figure} %%% Local Variables: diff --git a/TeX/Figures/sdg_comparison.tex b/TeX/Figures/sdg_comparison.tex index cee21c7..b49c6b3 100644 --- a/TeX/Figures/sdg_comparison.tex +++ b/TeX/Figures/sdg_comparison.tex @@ -4,34 +4,56 @@ legend image code/.code={ \draw[mark repeat=2,mark phase=2] plot coordinates { (0cm,0cm) -(0.0cm,0cm) %% default is (0.3cm,0cm) -(0.0cm,0cm) %% default is (0.6cm,0cm) +(0.15cm,0cm) %% default is (0.3cm,0cm) +(0.3cm,0cm) %% default is (0.6cm,0cm) };% } } \begin{figure} \begin{subfigure}[h]{\textwidth} \begin{tikzpicture} - \begin{axis}[tick style = {draw = none}, width = \textwidth, - height = 0.6\textwidth, ymin = 0.92, legend style={at={(0.9825,0.75)},anchor=north east}, - xlabel = {epoch}, ylabel = {Classification Accuracy}] + \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed, + /pgf/number format/precision=3},tick style = {draw = none}, width = 0.975\textwidth, + height = 0.6\textwidth, ymin = 0.885, legend style={at={(0.9825,0.0175)},anchor=south east}, + xlabel = {Epoch}, ylabel = {Test Accuracy}, cycle + list/Dark2, every axis plot/.append style={line width + =1.25pt}] + % [tick style = {draw = none}, width = \textwidth, + % height = 0.6\textwidth, ymin = 0.905, legend style={at={(0.9825,0.75)},anchor=north east}, + % xlabel = {epoch}, ylabel = {Classification Accuracy}] + % \addplot table + % [x=epoch, y=val_accuracy, col sep=comma, mark = none] + % {Figures/Data/adagrad.log}; + % \addplot table + % [x=epoch, y=val_accuracy, col sep=comma, mark = none] + % {Figures/Data/adadelta.log}; + % \addplot table + % [x=epoch, y=val_accuracy, col sep=comma, mark = none] + % {Figures/Data/adam.log}; \addplot table [x=epoch, y=val_accuracy, col sep=comma, mark = none] - {Figures/Data/adagrad.log}; + {Figures/Data/Adagrad.mean}; \addplot table [x=epoch, y=val_accuracy, col sep=comma, mark = none] - {Figures/Data/adadelta.log}; + {Figures/Data/Adadelta.mean}; \addplot table [x=epoch, y=val_accuracy, col sep=comma, mark = none] - {Figures/Data/adam.log}; + {Figures/Data/Adam.mean}; + \addplot table + [x=epoch, y=val_accuracy, col sep=comma, mark = none] + {Figures/Data/SGD_00.mean}; + \addplot table + [x=epoch, y=val_accuracy, col sep=comma, mark = none] + {Figures/Data/SGD_09.mean}; - \addlegendentry{\footnotesize{ADAGRAD}} - \addlegendentry{\footnotesize{ADADELTA}} - \addlegendentry{\footnotesize{ADAM}} - \addlegendentry{SGD$_{0.01}$} + \addlegendentry{\footnotesize{\textsc{AdaGrad}}} + \addlegendentry{\footnotesize{\textsc{Adadelta}}} + \addlegendentry{\footnotesize{\textsc{Adam}}} + \addlegendentry{\footnotesize{\textsc{Sgd}}} + \addlegendentry{\footnotesize{Momentum}} \end{axis} \end{tikzpicture} - %\caption{Classification accuracy} + \caption{Test accuracies during training} \vspace{.25cm} \end{subfigure} % \begin{subfigure}[b]{\textwidth} @@ -58,18 +80,27 @@ plot coordinates { % \vspace{.25cm} % \end{subfigure} \begin{subfigure}[b]{1.0\linewidth} - \begin{tabu} to \textwidth {@{} *3{X[c]}c*3{X[c]} @{}} - \multicolumn{3}{c}{Classification Accuracy} - &~&\multicolumn{3}{c}{Error Measure} - \\\cline{1-3}\cline{5-7} - \textsc{AdaGad}&\textsc{AdaDelta}&\textsc{Adam}&&\textsc{AdaGrad}&\textsc{AdaDelta}&\textsc{Adam} - \\\cline{1-3}\cline{5-7} - 1&1&1&&1&1&1 + \begin{tabu} to \textwidth {@{}l*5{X[c]}@{}} + \Tstrut \Bstrut &\textsc{AdaGrad}& \textsc{AdaDelta}& + 
\textsc{Adam} & \textsc{Sgd} & Momentum \\ + \hline + \Tstrut Accuracy &0.9870 & 0.9562 & 0.9925 & 0.9866 & 0.9923 \\ + \Tstrut Loss &0.0404 & 0.1447 & 0.0999 & 0.0403 & 0.0246 \\ \end{tabu} - \caption{Performace metrics after 20 epochs} + % \begin{tabu} to \textwidth {@{} *3{X[c]}c*3{X[c]} @{}} + % \multicolumn{3}{c}{Classification Accuracy} + % &~&\multicolumn{3}{c}{Error Measure} + % \\\cline{1-3}\cline{5-7} + % \textsc{AdaGad}&\textsc{AdaDelta}&\textsc{Adam}&&\textsc{AdaGrad}&\textsc{AdaDelta}&\textsc{Adam} + % \\\cline{1-3}\cline{5-7} + % 1&1&1&&1&1&1 + % \end{tabu} + \caption{Performace metrics after 50 epochs} \end{subfigure} - \caption[Performance comparison of training algorithms]{Classification accuracy on the test set and ...Performance metrics of the network given in ... trained - with different optimization algorithms} + \caption[Performance Comparison of Training Algorithms]{ + Average performance metrics of the neural network given in + Figure~\ref{fig:mnist_architecture} trained 5 times for 50 epochs + using different optimization algorithms.} \label{fig:comp_alg} \end{figure} %%% Local Variables: diff --git a/TeX/Figures/sin_conv.tex b/TeX/Figures/sin_conv.tex index 9d918a1..ba037e4 100644 --- a/TeX/Figures/sin_conv.tex +++ b/TeX/Figures/sin_conv.tex @@ -14,6 +14,7 @@ \end{adjustbox} \caption{True position (\textcolor{red}{red}), distorted position data (black)} \end{subfigure} + \hfill \begin{subfigure}[b]{0.49\textwidth} \centering \begin{adjustbox}{width=\textwidth, height=0.25\textheight} @@ -28,7 +29,7 @@ \end{adjustbox} \caption{True position (\textcolor{red}{red}), filtered position data (black)} \end{subfigure} - \caption[Signal smoothing using convolution]{Example for noise reduction using convolution with simulated + \caption[Signal Smoothing Using Convolution]{Example for noise reduction using convolution with simulated positional data. As filter $g(i)=\left(\nicefrac{1}{3},\nicefrac{1}{4},\nicefrac{1}{5},\nicefrac{1}{6},\nicefrac{1}{20}\right)_{(i-1)}$ is chosen and applied to the $x$ and $y$ coordinate diff --git a/TeX/appendixA.tex b/TeX/appendixA.tex index 3854d20..09dccaa 100644 --- a/TeX/appendixA.tex +++ b/TeX/appendixA.tex @@ -2,216 +2,330 @@ \newpage \begin{appendices} \counterwithin{lstfloat}{section} - \section{Proofs for sone Lemmata in ...} - In the following there will be proofs for some important Lemmata in - Section~\ref{sec:theo38}. Further proofs not discussed here can be - found in \textcite{heiss2019} - The proves in this section are based on \textcite{heiss2019}. Slight - alterations have been made to accommodate for not splitting $f$ into - $f_+$ and $f_-$. - \begin{Theorem}[Proof of Lemma~\ref{theo38}] - \end{Theorem} + \section{Notes on Proofs of Lemmata in Section~\ref{sec:conv}} + \label{appendix:proofs} + Contrary to \textcite{heiss2019} we do not make the distinction between $f_+$ and + $f_-$. + This results in some alterations in the proofs being necessary. In + the following the affected proofs and the required changes are given. + % Because of that slight alterations are needed in the proofs of + % .. auxiliary lemmata. + % Alterations that go beyond substituting $F_{+-}^{}$ + % As the proofs are ... for the most part only + % the alterations needed are specified. 
+ - \begin{Lemma}[$\frac{w^{*,\tilde{\lambda}}_k}{v_k}\approx\mathcal{O}(\frac{1}{n})$] - For any $\lambda > 0$ and training data $(x_i^{\text{train}}, - y_i^{\text{train}}) \in \mathbb{R}^2, \, i \in - \left\{1,\dots,N\right\}$, we have - \[ - \max_{k \in \left\{1,\dots,n\right\}} \frac{w^{*, - \tilde{\lambda}}_k}{v_k} = \po_{n\to\infty} - \] + + + % In the following there will be proofs for some important Lemmata in + % Section~\ref{sec:theo38}. Further proofs not discussed here can be + % found in \textcite{heiss2019} + % The proves in this section are based on \textcite{heiss2019}. Slight + % alterations have been made to accommodate for not splitting $f$ into + % $f_+$ and $f_-$. + % \begin{Theorem}[Proof of Lemma~\ref{theo38}] + % \end{Theorem} + + % \begin{Lemma}[$\frac{w^{*,\tilde{\lambda}}_k}{v_k}\approx\mathcal{O}(\frac{1}{n})$] + % For any $\lambda > 0$ and training data $(x_i^{\text{train}}, + % y_i^{\text{train}}) \in \mathbb{R}^2, \, i \in + % \left\{1,\dots,N\right\}$, we have + % \[ + % \max_{k \in \left\{1,\dots,n\right\}} \frac{w^{*, + % \tilde{\lambda}}_k}{v_k} = \po_{n\to\infty} + % \] - \end{Lemma} - - \begin{Proof}[Proof of Lemma~\ref{lem:s3}] - \[ - \sum_{k \in \kappa : \xi_k < T} \varphi(\xi_k, v_k) - h_{k,n} = \sum_{\substack{l \in \mathbb{Z} \\ [\delta l, \delta - (l+1)) \in [C_{g_\xi}^l,\min\{C_{g_\xi}^u, T \}]}} - \left(\sum_{\substack{k \in \kappa \\ \xi_k \in - [\delta l , \delta(l+1))}} \varphi(\xi_k, v_k) - h_{k,n}\right) \approx -\] -\[ - \approx \sum_{\substack{l \in \mathbb{Z} \\ [\delta l, \delta - (l+1)) \in [C_{g_\xi}^l,\min\{C_{g_\xi}^u, T \}]}} - \left(\sum_{\substack{k \in \kappa \\ \xi_k \in - [\delta l , \delta(l+1))}} \left(\varphi(\delta l, v_k) - \frac{1}{n g_\xi (\delta l)} \pm \frac{\varepsilon}{n}\right) - \frac{\abs{\left\{m \in \kappa : \xi_m \in [\delta l, - \delta(l+1))\right\}}}{\abs{\left\{m \in \kappa : \xi_m - \in [\delta l, \delta(l+1))\right\}}}\right) -\] -\[ - \approx \sum_{\substack{l \in \mathbb{Z} \\ [\delta l, \delta - (l+1)) \in [C_{g_\xi}^l,\min\{C_{g_\xi}^u, T \}]}} - \left(\frac{\sum_{\substack{k \in \kappa \\ \xi_k \in - [\delta l , \delta(l+1))}}\varphi(\delta l, - v_k)}{\abs{\left\{m \in \kappa : \xi_m - \in [\delta l, \delta(l+1))\right\}}} - \frac{\abs{\left\{m \in \kappa : \xi_m \in [\delta l, - \delta(l+1))\right\}}}{n g_\xi (\delta l)}\right) \pm \varepsilon -\] -The amount of kinks in a given interval of length $\delta$ follows a -binomial distribution, -\[ - \mathbb{E} \left[\abs{\left\{m \in \kappa : \xi_m \in [\delta l, + % \end{Lemma} + + \begin{Proof}[Heiss, Teichmann, and Wutte (2019, Lemma A.9)]~\\\noindent + \label{proof:lem9} + With $\tilde{\lambda} \coloneqq \lambda n g(0)$ Lemma~\ref{lem:cnvh} follows + analogously when considering $\tilde{w}$, $f_g^{*, \lambda}$, and $h_k$ + instead of $\tilde{w}^+$, $f_{g,+}^{*, \lambda}$, and $\bar{h}_k$. + Consider $\kappa = \left\{1, \dots, n \right\}$ for $n$ nodes + instead of $\kappa^+$. With $h_k = \frac{1}{n g_\xi(\xi_n)}$ + instead of $\bar{h}_k$ + and \[ + \mathbb{E} \left[\abs{\left\{m \in \kappa : \xi_m \in [\delta l, \delta(l+1))\right\}}\right] = n \int_{\delta l}^{\delta(l+1)}g_\xi (x) dx \approx n (\delta g_\xi(\delta l) - \pm \delta \tilde{\varepsilon}), -\] -for any $\delta \leq \delta(\varepsilon, \tilde{\varepsilon})$, since $g_\xi$ is uniformly continuous on its -support by Assumption.. 
-As the distribution of $v$ is continuous as well we get that -$\mathcal{L}(v_k) = \mathcal{L} v| \xi = \delta l) \forall k \in -\kappa : \xi_k \in [\delta l, \delta(l+1))$ for $\delta \leq -\delta(\varepsilon, \tilde{\varepsilon})$. Thus we get with the law of -large numbers -\begin{align*} - &\sum_{k \in \kappa : \xi_k < T} \varphi(\xi_k, v_k) - h_{k,n} \approx\\ - &\approx \sum_{\substack{l \in \mathbb{Z} \\ [\delta l, \delta - (l+1)) \in [C_{g_\xi}^l,\min\{C_{g_\xi}^u, T - \}]}}\left(\mathbb{E}[\phi(\xi, v)|\xi=\delta l] - \stackrel{\mathbb{P}}{\pm}\right) \delta \left(1 \pm - \frac{\tilde{\varepsilon}}{g_\xi(\delta l)}\right) \pm \varepsilon - \\ - &\approx \left(\sum_{\substack{l \in \mathbb{Z} \\ [\delta - l, \delta - (l+1)) \in [C_{g_\xi}^l,\min\{C_{g_\xi}^u, T - \}]}}\mathbb{E}[\phi(\xi, v)|\xi=\delta l] \delta - \stackrel{\mathbb{P}}{\pm}\tilde{\tilde{\varepsilon}} - \abs{C_{g_\xi}^u - C_{g_\xi}^l} - \right)\\ - &\phantom{\approx}\cdot \left(1 \pm - \frac{\tilde{\varepsilon}}{g_\xi(\delta l)}\right) \pm \varepsilon -\end{align*} + \pm \delta \tilde{\varepsilon}). + \] + % \[ +% \sum_{k \in \kappa : \xi_k < T} \varphi(\xi_k, v_k) +% h_{k,n} = \sum_{\substack{l \in \mathbb{Z} \\ [\delta l, \delta +% (l+1)) \in [C_{g_\xi}^l,\min\{C_{g_\xi}^u, T \}]}} +% \left(\sum_{\substack{k \in \kappa \\ \xi_k \in +% [\delta l , \delta(l+1))}} \varphi(\xi_k, v_k) +% h_{k,n}\right) \approx +% \] +% \[ +% \approx \sum_{\substack{l \in \mathbb{Z} \\ [\delta l, \delta +% (l+1)) \in [C_{g_\xi}^l,\min\{C_{g_\xi}^u, T \}]}} +% \left(\sum_{\substack{k \in \kappa \\ \xi_k \in +% [\delta l , \delta(l+1))}} \left(\varphi(\delta l, v_k) +% \frac{1}{n g_\xi (\delta l)} \pm \frac{\varepsilon}{n}\right) +% \frac{\abs{\left\{m \in \kappa : \xi_m \in [\delta l, +% \delta(l+1))\right\}}}{\abs{\left\{m \in \kappa : \xi_m +% \in [\delta l, \delta(l+1))\right\}}}\right) +% \] +% \[ +% \approx \sum_{\substack{l \in \mathbb{Z} \\ [\delta l, \delta +% (l+1)) \in [C_{g_\xi}^l,\min\{C_{g_\xi}^u, T \}]}} +% \left(\frac{\sum_{\substack{k \in \kappa \\ \xi_k \in +% [\delta l , \delta(l+1))}}\varphi(\delta l, +% v_k)}{\abs{\left\{m \in \kappa : \xi_m +% \in [\delta l, \delta(l+1))\right\}}} +% \frac{\abs{\left\{m \in \kappa : \xi_m \in [\delta l, +% \delta(l+1))\right\}}}{n g_\xi (\delta l)}\right) \pm \varepsilon +% \] +% The amount of kinks in a given interval of length $\delta$ follows a +% binomial distribution, +% \[ +% \mathbb{E} \left[\abs{\left\{m \in \kappa : \xi_m \in [\delta l, +% \delta(l+1))\right\}}\right] = n \int_{\delta +% l}^{\delta(l+1)}g_\xi (x) dx \approx n (\delta g_\xi(\delta l) +% \pm \delta \tilde{\varepsilon}), +% \] +% for any $\delta \leq \delta(\varepsilon, \tilde{\varepsilon})$, since $g_\xi$ is uniformly continuous on its +% support by Assumption.. +% As the distribution of $v$ is continuous as well we get that +% $\mathcal{L}(v_k) = \mathcal{L} v| \xi = \delta l) \forall k \in +% \kappa : \xi_k \in [\delta l, \delta(l+1))$ for $\delta \leq +% \delta(\varepsilon, \tilde{\varepsilon})$. 
Thus we get with the law of +% large numbers +% \begin{align*} +% &\sum_{k \in \kappa : \xi_k < T} \varphi(\xi_k, v_k) +% h_{k,n} \approx\\ +% &\approx \sum_{\substack{l \in \mathbb{Z} \\ [\delta l, \delta +% (l+1)) \in [C_{g_\xi}^l,\min\{C_{g_\xi}^u, T +% \}]}}\left(\mathbb{E}[\phi(\xi, v)|\xi=\delta l] +% \stackrel{\mathbb{P}}{\pm}\right) \delta \left(1 \pm +% \frac{\tilde{\varepsilon}}{g_\xi(\delta l)}\right) \pm \varepsilon +% \\ +% &\approx \left(\sum_{\substack{l \in \mathbb{Z} \\ [\delta +% l, \delta +% (l+1)) \in [C_{g_\xi}^l,\min\{C_{g_\xi}^u, T +% \}]}}\mathbb{E}[\phi(\xi, v)|\xi=\delta l] \delta +% \stackrel{\mathbb{P}}{\pm}\tilde{\tilde{\varepsilon}} +% \abs{C_{g_\xi}^u - C_{g_\xi}^l} +% \right)\\ +% &\phantom{\approx}\cdot \left(1 \pm +% \frac{\tilde{\varepsilon}}{g_\xi(\delta l)}\right) \pm \varepsilon +% \end{align*} \end{Proof} -\begin{Lemma}[($L(f_n) \to L(f)$), Heiss, Teichmann, and -Wutte (2019, Lemma A.11)] - For any data $(x_i^{\text{train}}, y_i^{\text{train}}) \in - \mathbb{R}^2, i \in \left\{1,\dots,N\right\}$, let $(f_n)_{n \in - \mathbb{N}}$ be a sequence of functions that converges point-wise - in probability to a function $f : \mathbb{R}\to\mathbb{R}$, then the - loss $L$ of $f_n$ converges is probability to $L(f)$ as $n$ tends to - infinity, +% \begin{Lemma}[($L(f_n) \to L(f)$), Heiss, Teichmann, and +% Wutte (2019, Lemma A.11)] +% For any data $(x_i^{\text{train}}, y_i^{\text{train}}) \in +% \mathbb{R}^2, i \in \left\{1,\dots,N\right\}$, let $(f_n)_{n \in +% \mathbb{N}}$ be a sequence of functions that converges point-wise +% in probability to a function $f : \mathbb{R}\to\mathbb{R}$, then the +% loss $L$ of $f_n$ converges is probability to $L(f)$ as $n$ tends to +% infinity, +% \[ +% \plimn L(f_n) = L(f). +% \] +% \proof Vgl. ... +% \end{Lemma} + +\begin{Proof}[Heiss, Teichmann, and Wutte (2019, Lemma A.12)]~\\\noindent + \label{proof:lem12} + With $\tilde{\lambda} \coloneqq \lambda n g(0)$ Lemma~\ref{lem:s2} follows + analogously when considering $\tilde{w}$, $f_g^{*, \lambda}$, and $h_k$ + instead of $\tilde{w}^+$, $f_{g,+}^{*, \lambda}$, and $\bar{h}_k$. + % We start by showing that + % \[ + % \plimn \tilde{\lambda} \norm{\tilde{w}}_2^2 = \lambda g(0) + % \left(\int \frac{\left(f_g^{*,\lambda''}\right)^2}{g(x)} dx\right) + % \] + % With the definitions of $\tilde{w}$, $\tilde{\lambda}$ and + % $h$ we have + % \begin{align*} + % \tilde{\lambda} \norm{\tilde{w}}_2^2 + % &= \tilde{\lambda} \sum_{k \in + % \kappa}\left(f_g^{*,\lambda''}(\xi_k) \frac{h_k + % v_k}{\mathbb{E}v^2|\xi = \xi_k]}\right)^2\\ + % &= \tilde{\lambda} \sum_{k \in + % \kappa}\left(\left(f_g^{*,\lambda''}\right)^2(\xi_k) \frac{h_k + % v_k^2}{\mathbb{E}v^2|\xi = \xi_k]}\right) h_k\\ + % & = \lambda g(0) \sum_{k \in + % \kappa}\left(\left(f_g^{*,\lambda''}\right)^2(\xi_k)\frac{v_k^2}{g_\xi(\xi_k)\mathbb{E} + % [v^2|\xi=\xi_k]}\right)h_k. 
+ % \end{align*} + % By using Lemma~\ref{lem} with $\phi(x,y) = + % \left(f_g^{*,\lambda''}\right)^2(x)\frac{y^2}{g_\xi(\xi)\mathbb{E}[v^2|\xi=y]}$ + % this converges to + % \begin{align*} + % &\plimn \tilde{\lambda}\norm{\tilde{w}}_2^2 = \\ + % &=\lambda + % g_\xi(0)\mathbb{E}[v^2|\xi=0]\int_{\supp{g_\xi}}\mathbb{E}\left[ + % \left(f_g^{*,\lambda''}\right)^2(\xi)\frac{v^2}{ + % g_\xi(\xi)\mathbb{E}[v^2|\xi=x]^2}\Big{|} \xi = x\right]dx\\ + % &=\lambda g_\xi(0) \mathbb{E}[v^2|\xi=0] \int_{\supp{g_xi}} + % \frac{\left(f_g^{*,\lambda''}\right)^2 (x)}{g_\xi(x) + % \mathbb{E}[v^2|\xi=x]} dx \\ + % &=\lambda g(0) \int_{\supp{g_\xi}} \frac{\left(f_g^{*,\lambda''}\right)^2}{g(x)}dx. + % \end{align*} +\end{Proof} + +\begin{Proof}[Heiss, Teichmann, and Wutte (2019, Lemma A.14)]~\\\noindent + \label{proof:lem14} + Substitute $F_{+-}^{\lambda, g}\left(f_{g,+}^{*,\lambda}, + f_{g,-}^{*,\lambda}\right)$ with $F^{\lambda,g}\left(f_g^{*,\lambda}\right)$. +\end{Proof} +% \begin{Lemma}[Heiss, Teichmann, and +% Wutte (2019, Lemma A.13)] +% Using the notation of Definition .. and ... the following statement +% holds: +% $\forall \varepsilon \in \mathbb{R}_{>0} : \exists \delta \in +% \mathbb{R}_{>0} : \forall \omega \in \Omega : \forall l, l' \in +% \left\{1,\dots,N\right\} : \forall n \in \mathbb{N}$ +% \[ +% \left(\abs{\xi_l(\omega) - \xi_{l'}(\omega)} < \delta \wedge +% \text{sign}(v_l(\omega)) = \text{sign}(v_{l'}(\omega))\right) +% \implies \abs{\frac{w_l^{*, \tilde{\lambda}}(\omega)}{v_l(\omega)} +% - \frac{w_{l'}^{*, \tilde{\lambda}}(\omega)}{v_{l'}(\omega)}} < +% \frac{\varepsilon}{n}, +% \] +% if we assume that $v_k$ is never zero. +% \proof given in .. +% \end{Lemma} + +% \begin{Lemma}[$\frac{w^{*,\tilde{\lambda}}}{v} \approx +% \mathcal{O}(\frac{1}{n})$, Heiss, Teichmann, and +% Wutte (2019, Lemma A.14)] +% For any $\lambda > 0$ and data $(x_i^{\text{train}}, +% y_i^{\text{train}}) \in \mathbb{R}^2, i\in +% \left\{1,\dots,\right\}$, we have +% \[ +% \forall P \in (0,1) : \exists C \in \mathbb{R}_{>0} : \exists +% n_0 \in \mathbb{N} : \forall n > n_0 : \mathbb{P} +% \left[\max_{k\in \left\{1,\dots,n\right\}} +% \frac{w_k^{*,\tilde{\lambda}}}{v_k} < C +% \frac{1}{n}\right] > P +% % \max_{k\in \left\{1,\dots,n\right\}} +% % \frac{w_k^{*,\tilde{\lambda}}}{v_k} = \plimn +% \] +% \proof + + +% Let $k^*_+ \in \argmax_{k\in +% \left\{1,\dots,n\right\}}\frac{w^{*,\tilde{\lambda}}}{v_k} : v_k +% > 0$ and $k^*_- \in \argmax_{k\in +% \left\{1,\dots,n\right\}}\frac{w^{*,\tilde{\lambda}}}{v_k} : v_k +% < 0$. W.l.o.g. assume $\frac{w_{k_+^*}^2}{v_{k_+^*}^2} \geq +% \frac{w_{k_-^*}^2}{v_{k_-^*}^2}$ +% \begin{align*} +% \frac{F^{\lambda, +% g}\left(f^{*,\lambda}_g\right)}{\tilde{\lambda}} +% \makebox[2cm][c]{$\stackrel{\mathbb{P}}{\geq}$} +% & \frac{1}{2 \tilde{\lambda}} +% F_n^{\tilde{\lambda}}\left(\mathcal{RN}^{*,\tilde{\lambda}}\right) +% = \frac{1}{2 \tilde{\lambda}}\left[\sum ... + \tilde{\lambda} \norm{w}_2^2\right] +% \\ +% \makebox[2cm][c]{$\geq$} +% & \frac{1}{2}\left( \sum_{\substack{k: v_k +% > 0 \\\xi_k\in(\xi_{k^*}, \xi_{k^*} +% + \delta)}} \left(w_k^{*,\tilde{\lambda}}\right)^2 + +% \sum_{\substack{k: v_k < 0 \\\xi_k\in(\xi_{k^*}, \xi_{k^*} +% + \delta)}} \left(w_k^{*,\tilde{\lambda}}\right)^2\right) \\ +% \makebox[2cm][c]{$\overset{\text{Lem. 
A.6}}{\underset{\delta \text{ +% small enough}}{\geq}} $} +% & +% \frac{1}{4}\left(\left(\frac{w_{k_+^*}^{*,\tilde{\lambda}}} +% {v_{k_+^*}}\right)^2\sum_{\substack{k: +% v_k > 0 \\\xi_k\in(\xi_{k^*}, \xi_{k^*} + \delta)}}v_k^2 + +% \left(\frac{w_{k_-^*}^{*,\tilde{\lambda}}}{v_{k_-^*}}\right)^2 +% \sum_{\substack{k: +% v_k < 0 \\\xi_k\in(\xi_{k^*}, \xi_{k^*} + +% \delta)}}v_k^2\right)\\ +% \makebox[2cm][c]{$\stackrel{\mathbb{P}}{\geq}$} +% & \frac{1}{8} +% \left(\frac{w_{k_+^*}^{*,\tilde{\lambda}}}{v_{k^*}}\right)^2 +% n \delta g_\xi(\xi_{k_+^*}) \mathbb{P}(v_k +% >0)\mathbb{E}[v_k^2|\xi_k = \xi_{k^*_+}] +% \end{align*} + +% \end{Lemma} + +\begin{Proof}[Heiss, Teichmann, and Wutte (2019, Lemma A.15)]~\\\noindent + \label{proof:lem15} + Consider $\mathcal{RN}^{*,\tilde{\lambda}}$, + $f^{w^{*,\tilde{\lambda}}}$, and $\kappa = \left\{1, \dots, n + \right\}$ instead of $\mathcal{RN}_+^{*,\tilde{\lambda}}$, + $f_+^{w^{*,\tilde{\lambda}}}$, and $\kappa^+$. + Assuming w.l.o.g. $max_{k \in + \kappa^+}\abs{\frac{w_k^{*,\tilde{\lambda}}}{v_k}} \geq max_{k \in + \kappa^-}\abs{\frac{w_k^{*,\tilde{\lambda}}}{v_k}}$ + Lemma~ref{lem:s3} follows analogously by multiplying (58b) with two. +\end{Proof} + +\begin{Proof}[Heiss, Teichmann, and Wutte (2019, Lemma + A.16)]~\\\noindent + \label{proof:lem16} + As we are considering $F^{\lambda,g}$ instead of + $F^{\lambda,g}_{+-}$ we need to substitute $2\lambda g(0)$ with + $\lambda g(0)$ + and thus get \[ - \plimn L(f_n) = L(f). + \left(f^{w^{*,\tilde{\lambda}}}\right)''(x) \approx + \frac{w_{l_x}^{*,\tilde{\lambda}}}{v_{l_x}} n g_\xi(x) + \mathbb{E}\left[v_k^2|\xi_k = x\right] \stackrel{\mathbb{P}}{\pm} \varepsilon_3 \] - \proof Vgl. ... -\end{Lemma} - -\begin{Proof}[Step 2] - We start by showing that + and use this to follow \[ - \plimn \tilde{\lambda} \norm{\tilde{w}}_2^2 = \lambda g(0) - \left(\int \frac{\left(f_g^{*,\lambda''}\right)^2}{g(x)} dx\right) + \lambda g(0) + \int_{\supp(g)}\hspace{-0.15cm}\frac{\left(\left(f^{w^{*,\tilde{\lambda}}}\right)''(x)\right)^2}{g(0)}dx + \approx \tilde{\lambda} n + \int_{\supp(g)}\left(\frac{w_{l_x}^{*,\tilde{\lambda}}}{v_{l_x}}\right)^2 \hspace{-0.1cm} + g_xi(x) \mathbb{E}\left[v_k^2|\xi_k=x\right]dx \] - With the definitions of $\tilde{w}$, $\tilde{\lambda}$ and - $h$ we have - \begin{align*} - \tilde{\lambda} \norm{\tilde{w}}_2^2 - &= \tilde{\lambda} \sum_{k \in - \kappa}\left(f_g^{*,\lambda''}(\xi_k) \frac{h_k - v_k}{\mathbb{E}v^2|\xi = \xi_k]}\right)^2\\ - &= \tilde{\lambda} \sum_{k \in - \kappa}\left(\left(f_g^{*,\lambda''}\right)^2(\xi_k) \frac{h_k - v_k^2}{\mathbb{E}v^2|\xi = \xi_k]}\right) h_k\\ - & = \lambda g(0) \sum_{k \in - \kappa}\left(\left(f_g^{*,\lambda''}\right)^2(\xi_k)\frac{v_k^2}{g_\xi(\xi_k)\mathbb{E} - [v^2|\xi=\xi_k]}\right)h_k. - \end{align*} - By using Lemma~\ref{lem} with $\phi(x,y) = - \left(f_g^{*,\lambda''}\right)^2(x)\frac{y^2}{g_\xi(\xi)\mathbb{E}[v^2|\xi=y]}$ - this converges to + Analogous to the proof of \textcite{heiss2019} we get \begin{align*} - &\plimn \tilde{\lambda}\norm{\tilde{w}}_2^2 = \\ - &=\lambda - g_\xi(0)\mathbb{E}[v^2|\xi=0]\int_{\supp{g_\xi}}\mathbb{E}\left[ - \left(f_g^{*,\lambda''}\right)^2(\xi)\frac{v^2}{ - g_\xi(\xi)\mathbb{E}[v^2|\xi=x]^2}\Big{|} \xi = x\right]dx\\ - &=\lambda g_\xi(0) \mathbb{E}[v^2|\xi=0] \int_{\supp{g_xi}} - \frac{\left(f_g^{*,\lambda''}\right)^2 (x)}{g_\xi(x) - \mathbb{E}[v^2|\xi=x]} dx \\ - &=\lambda g(0) \int_{\supp{g_\xi}} \frac{\left(f_g^{*,\lambda''}\right)^2}{g(x)}dx. 
+ \tilde{\lambda} \sum_{k \in \kappa} + \left(w_k^{*,\tilde{\lambda}}\right)^2 + &= \tilde{\lambda} \sum_{k \in \kappa^+} + \left(w_k^{*,\tilde{\lambda}}\right)^2 + \tilde{\lambda} \sum_{k \in \kappa^-} + \left(w_k^{*,\tilde{\lambda}}\right)^2 \\ + &\approx \left(\mathbb{P}[v_k <0] + \mathbb{P}[v_k >0]\right)\\ + &\phantom{=} + \int_{\supp(g_xi)} + \left(\frac{w_{l_x}^{*,\tilde{\lambda}}}{v_{l_x}}\right)^2 + g_\xi(x) \mathbb{E}\left[v_k^2|\xi_k = x\right] dx + \stackrel{\mathbb{P}}{\pm} \varepsilon_9 \\ + &= \int_{\supp{g_xi}} + \left(\frac{w_{l_x}^{*,\tilde{\lambda}}}{v_{l_x}}\right)^2 + g_\xi(x) \mathbb{E}\left[v_k^2|\xi_k = x\right] dx + \stackrel{\mathbb{P}}{\pm} \varepsilon_9. \end{align*} + With these transformations Lemma~\ref{lem:s4} follows analogously. \end{Proof} -\begin{Lemma}[Heiss, Teichmann, and - Wutte (2019, Lemma A.13)] - Using the notation of Definition .. and ... the following statement - holds: - $\forall \varepsilon \in \mathbb{R}_{>0} : \exists \delta \in - \mathbb{R}_{>0} : \forall \omega \in \Omega : \forall l, l' \in - \left\{1,\dots,N\right\} : \forall n \in \mathbb{N}$ +\begin{Proof}[Heiss, Teichmann, and Wutte (2019, Lemma A.19)]~\\\noindent + \label{proof:lem19} + The proof works analogously if $F_{+-}^{\lambda,g}$ is substituted + by + \begin{align*} + F_{+-}^{\lambda,g '}(f_+, f_-) = + & \sum_{i = + 1}^N \left(f(x_i^{\text{train}}) - + y_i^{\text{train}}\right)^2 \\ + & + \lambda g(0) \left(\int_{\supp(g)}\frac{\left(f_+''(x)\right)^2}{g(x)} + dx + \int_{\supp(g)}\frac{\left(f''_-(x)\right)^2}{g(x)} + dx\right). + \end{align*} + As for $f^n = f_+^n + f_-^n$ such that $\supp(f_+^n) \cap \supp(f_-^n) = + \emptyset$ and $h = h_+ + h_-$ such that $\supp(h_+) \cap \supp(h_-) = + \emptyset$ it holds \[ - \left(\abs{\xi_l(\omega) - \xi_{l'}(\omega)} < \delta \wedge - \text{sign}(v_l(\omega)) = \text{sign}(v_{l'}(\omega))\right) - \implies \abs{\frac{w_l^{*, \tilde{\lambda}}(\omega)}{v_l(\omega)} - - \frac{w_{l'}^{*, \tilde{\lambda}}(\omega)}{v_{l'}(\omega)}} < - \frac{\varepsilon}{n}, + \plimn F^{\lambda, g}(f^n) = F^{\lambda, g}(h) \implies + \plimn F_{+-}^{\lambda,g '}(f_+,f_-) = F_{+-}^{\lambda,g '}(h_+,h_-), \] - if we assume that $v_k$ is never zero. - \proof given in .. -\end{Lemma} - -\begin{Lemma}[$\frac{w^{*,\tilde{\lambda}}}{v} \approx - \mathcal{O}(\frac{1}{n})$, Heiss, Teichmann, and - Wutte (2019, Lemma A.14)] - For any $\lambda > 0$ and data $(x_i^{\text{train}}, - y_i^{\text{train}}) \in \mathbb{R}^2, i\in - \left\{1,\dots,\right\}$, we have - \[ - \forall P \in (0,1) : \exists C \in \mathbb{R}_{>0} : \exists - n_0 \in \mathbb{N} : \forall n > n_0 : \mathbb{P} - \left[\max_{k\in \left\{1,\dots,n\right\}} - \frac{w_k^{*,\tilde{\lambda}}}{v_k} < C - \frac{1}{n}\right] > P - % \max_{k\in \left\{1,\dots,n\right\}} - % \frac{w_k^{*,\tilde{\lambda}}}{v_k} = \plimn - \] - \proof - Let $k^*_+ \in \argmax_{k\in - \left\{1,\dots,n\right\}}\frac{w^{*,\tilde{\lambda}}}{v_k} : v_k - > 0$ and $k^*_- \in \argmax_{k\in - \left\{1,\dots,n\right\}}\frac{w^{*,\tilde{\lambda}}}{v_k} : v_k - < 0$. W.l.o.g. assume $\frac{w_{k_+^*}^2}{v_{k_+^*}^2} \geq - \frac{w_{k_-^*}^2}{v_{k_-^*}^2}$ - \begin{align*} - \frac{F^{\lambda, - g}\left(f^{*,\lambda}_g\right)}{\tilde{\lambda}} - \makebox[2cm][c]{$\stackrel{\mathbb{P}}{\geq}$} - & \frac{1}{2 \tilde{\lambda}} - F_n^{\tilde{\lambda}}\left(\mathcal{RN}^{*,\tilde{\lambda}}\right) - = \frac{1}{2 \tilde{\lambda}}\left[\sum ... 
+ \tilde{\lambda} \norm{w}_2^2\right] - \\ - \makebox[2cm][c]{$\geq$} - & \frac{1}{2}\left( \sum_{\substack{k: v_k - > 0 \\\xi_k\in(\xi_{k^*}, \xi_{k^*} - + \delta)}} \left(w_k^{*,\tilde{\lambda}}\right)^2 + - \sum_{\substack{k: v_k < 0 \\\xi_k\in(\xi_{k^*}, \xi_{k^*} - + \delta)}} \left(w_k^{*,\tilde{\lambda}}\right)^2\right) \\ - \makebox[2cm][c]{$\overset{\text{Lem. A.6}}{\underset{\delta \text{ - small enough}}{\geq}} $} - & - \frac{1}{4}\left(\left(\frac{w_{k_+^*}^{*,\tilde{\lambda}}} - {v_{k_+^*}}\right)^2\sum_{\substack{k: - v_k > 0 \\\xi_k\in(\xi_{k^*}, \xi_{k^*} + \delta)}}v_k^2 + - \left(\frac{w_{k_-^*}^{*,\tilde{\lambda}}}{v_{k_-^*}}\right)^2 - \sum_{\substack{k: - v_k < 0 \\\xi_k\in(\xi_{k^*}, \xi_{k^*} + - \delta)}}v_k^2\right)\\ - \makebox[2cm][c]{$\stackrel{\mathbb{P}}{\geq}$} - & \frac{1}{8} - \left(\frac{w_{k_+^*}^{*,\tilde{\lambda}}}{v_{k^*}}\right)^2 - n \delta g_\xi(\xi_{k_+^*}) \mathbb{P}(v_k - >0)\mathbb{E}[v_k^2|\xi_k = \xi_{k^*_+}] - \end{align*} - -\end{Lemma} - + and all functions can be split in two functions with disjoint support + Lemma~\ref{lem:s7} follows. +\end{Proof} \input{Appendix_code.tex} \end{appendices} diff --git a/TeX/bibliograpy.bib b/TeX/bibliograpy.bib index f2aa34b..4d64d78 100644 --- a/TeX/bibliograpy.bib +++ b/TeX/bibliograpy.bib @@ -296,3 +296,18 @@ year = {2014}, publisher = {Curran Associates, Inc.}, url = {http://papers.nips.cc/paper/5423-generative-adversarial-nets.pdf} } + +@book{hastie01statisticallearning, + added-at = {2008-05-16T16:17:42.000+0200}, + address = {New York, NY, USA}, + author = {Hastie, Trevor and Tibshirani, Robert and Friedman, Jerome}, + biburl = {https://www.bibsonomy.org/bibtex/2f58afc5c9793fcc8ad8389824e57984c/sb3000}, + interhash = {d585aea274f2b9b228fc1629bc273644}, + intrahash = {f58afc5c9793fcc8ad8389824e57984c}, + keywords = {ml statistics}, + publisher = {Springer New York Inc.}, + series = {Springer Series in Statistics}, + timestamp = {2008-05-16T16:17:43.000+0200}, + title = {The Elements of Statistical Learning}, + year = 2001 +} diff --git a/TeX/further_applications_of_nn.tex b/TeX/further_applications_of_nn.tex index 75ef73f..972e9b5 100644 --- a/TeX/further_applications_of_nn.tex +++ b/TeX/further_applications_of_nn.tex @@ -1,15 +1,19 @@ -\section{\titlecap{application of neural networks to higher complexity problems}} +\section{Application of Neural Networks to Higher Complexity Problems} +\label{sec:cnn} +This section is based on \textcite[Chapter~9]{Goodfellow}. -This section is based on \textcite[Chapter~9]{Goodfellow} - -As neural networks are applied to problems of higher complexity often -resulting in higher dimensionality of the input the amount of +As neural networks are applied to problems of higher complexity which often +results in higher dimensionality of the input the number of parameters in the network rises drastically. -For very large inputs such as high resolution image data due to the -fully connected nature of the neural network the amount of parameters -can ... exceed the amount that is feasible for training and storage. -A way to combat this is by using layers which are only sparsely -connected and share parameters between nodes.\todo{Überleitung zu conv?} +For very large inputs such as high-resolution image data due to the +fully connected nature of the neural network the number of parameters +can exceed what is feasible for training and storage. 
+ +The number of parameters for a given network size can be reduced by +using layers which are only sparsely +connected and share parameters between nodes. An effective way to +implement this is by using convolution with filters that are shared +among the nodes of a layer. \subsection{Convolution} @@ -19,18 +23,20 @@ functions is integrated after one has been reversed and shifted. \[ (f * g) (t) \coloneqq \int_{-\infty}^{\infty} f(t-s) g(s) ds. \] - This operation can be described as a filter-function $g$ being applied to $f$, as values $f(t)$ are being replaced by an average of values of $f$ weighted by a filter-function $g$ in position $t$. -The convolution operation allows plentiful manipulation of data, with -a simple example being smoothing of real-time data. Consider a sensor -measuring the location of an object (e.g. via GPS). We expect the -output of the sensor to be noisy as a result of a number of factors -will impact the accuracy of the measurements. In order to get a better estimate of -the actual location we want to smooth -the data to reduce the noise. Using convolution for this task, we +Convolution operation allows plentiful manipulation of data, with +a simple example being smoothing of real-time data. + +Consider a sensor measuring the location of an object (e.g. via +GPS). We expect the output of the sensor to be noisy as a result of +some factors impacting the accuracy of the measurements. In order to +get a better estimate of the actual location, we want to smooth +the data to reduce the noise. + +Using convolution for this task, we can control the significance we want to give each data-point. We might want to give a larger weight to more recent measurements than older ones. If we assume these measurements are taken on a discrete @@ -44,6 +50,7 @@ Applying this on the data with the filter $g$ chosen accordingly we are able to improve the accuracy, which can be seen in Figure~\ref{fig:sin_conv}. +\clearpage \input{Figures/sin_conv.tex} This form of discrete convolution can also be applied to functions with inputs of higher dimensionality. Let $f$, $g: \mathbb{Z}^d \to @@ -54,19 +61,21 @@ with inputs of higher dimensionality. Let $f$, $g: \mathbb{Z}^d \to \dots, x_d - i_d) g(i_1, \dots, i_d) \] This will prove to be a useful framework for image manipulation but -in order to apply convolution to images we need to discuss -representation of image data first. Most often images are represented +to apply convolution to images, we need to discuss the +representation of image data. + +Most often images are represented by each pixel being a mixture of base colors. These base colors define the color-space in which the image is encoded. Often used are color-spaces RGB (red, blue, green) or CMYK (cyan, magenta, yellow, black). An example of an -image decomposed in its red, green and blue channel is given in -Figure~\ref{fig:rgb}. Using this -encoding of the image we can define a corresponding discrete function -describing the image, by mapping the coordinates $(x,y)$ of an pixel -and the -channel (color) $c$ to the respective value $v$ +image decomposed in its red, green, and blue channel is given in +Figure~\ref{fig:rgb}. 
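+
+The smoothing of the noisy sensor data described above can, for
+instance, be sketched in a few lines of Python. The measurements, the
+filter weights, and the use of NumPy are purely illustrative
+assumptions and do not correspond to the exact setup used for
+Figure~\ref{fig:sin_conv}.
+\begin{lstlisting}[language=iPython]
+import numpy as np
+
+# Hypothetical noisy measurements of a smooth trajectory.
+rng = np.random.default_rng(0)
+t = np.linspace(0, 10, 200)
+measurements = np.sin(t) + rng.normal(scale=0.2, size=t.size)
+
+# Filter weights g, normalized to sum to one; in the convolution sum
+# g[0] is applied to the most recent measurement of each window.
+g = np.array([0.5, 0.25, 0.15, 0.1])
+g = g / g.sum()
+
+# Discrete convolution (f*g)(t) = sum_i f(t-i) g(i) smooths the signal.
+smoothed = np.convolve(measurements, g, mode="same")
+\end{lstlisting}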
+Using this encoding of the image we can define a corresponding
+discrete function describing the image, by mapping the coordinates
+$(x,y)$ of a pixel and the channel (color) $c$ to the respective value
+$v$
 \begin{align}
   \begin{split}
     I: \mathbb{N}^3 & \to \mathbb{R}, \\
@@ -90,7 +99,7 @@ channel (color) $c$ to the respective value $v$
     \end{scope}
   \end{tikzpicture}
   \end{adjustbox}
-  \caption[Channel separation of color image]{On the right the red, green and blue chances of the picture
+  \caption[Channel Separation of Color Image]{On the right the red, green, and blue channels of the picture
    are displayed. In order to better visualize the color channels the
    black and white picture of each channel has been colored in the
    respective color. Combining the layers results in the image on the
@@ -100,7 +109,7 @@ channel (color) $c$ to the respective value $v$
 
 With this representation of an image as a function, we can apply
 filters to the image using convolution for multidimensional functions
-as described above. In order to simplify the notation we will write
+as described~above. To simplify the notation, we will write
 the function $I$ given in (\ref{def:I}) as well as the filter-function
 $g$ as a tensor from now on, resulting in the modified notation of
 convolution
@@ -111,17 +120,22 @@ convolution
 
 As images are finite in size for pixels to close to the border the
 convolution is not well defined.
-Thus the output will be of reduced size, with the now size in each
-dimension $d$ being \textit{(size of input in dimension $d$) -
-  (size of kernel in dimension $d$) +1}.
-In order to ensure the output is of the same size as the input the
-image can be padded in each dimension with 0 entries which ensures the
+Thus the output will be of reduced size. With $s_i$ being the size of
+the input in dimension $d$ and $s_k$ being the size of the kernel in
+dimension $d$, the size of the output in dimension $d$ is $s_i - s_k
++ 1$.
+% with the new size in each
+% dimension $d$ being \textit{(size of input in dimension $d$) -
% (size of kernel in dimension $d$) + 1}. \todo{den dims namen geben
% formal in eine zeile}
+In order to receive outputs of the same size as the input, the
+image can be padded in each dimension with 0 entries which ensure the
 convolution is well defined for all pixels of the image.
 
-Simple examples for image manipulation using
+Simple examples of image manipulation using
 convolution are smoothing operations or rudimentary detection of
 edges in grayscale images, meaning they only
-have one channel. A popular filter for smoothing images
+have one channel. A filter often used to smooth or blur images
 is the Gauss-filter which for a given $\sigma \in \mathbb{R}_+$ and
 size $s \in \mathbb{N}$ is
 defined as
@@ -129,10 +143,11 @@ defined as
   G_{x,y} = \frac{1}{2 \pi \sigma^2} e^{-\frac{x^2 + y^2}{2 \sigma^2}},
   ~ x,y \in \left\{1,\dots,s\right\}.
 \]
+\pagebreak[4]
 
-For edge detection purposes the Sobel operator is widespread. Here two
+\noindent An effective filter for edge detection purposes is the Sobel operator. Here two
 filters are applied to the
-image $I$ and then combined. Edges in the $x$ direction are detected
+image $I$ and then the outputs are combined. Edges in the $x$ direction are detected
 by convolution with
 \[
   G =\left[
@@ -144,16 +159,13 @@ by convolution with
 \]
 and edges is the y direction by convolution with $G^T$, the final
 output is given by
-
 \[
   O = \sqrt{(I * G)^2 + (I*G^T)^2}
 \]
-where $\sqrt{\cdot}$ and $\cdot^2$ are applied component
-wise. 
Examples for convolution of an image with both kernels are given -in Figure~\ref{fig:img_conv}. - - -\begin{figure}[h] +where $\sqrt{\cdot}$ and $\cdot^2$ are applied componentwise. Examples +for convolution of an image with both kernels are given +in Figure~\ref{fig:img_conv}. +\begin{figure}[H] \centering \begin{subfigure}{0.27\textwidth} \centering @@ -195,16 +207,16 @@ in Figure~\ref{fig:img_conv}. % \includegraphics[width=\textwidth]{Figures/Data/image_conv6.png} % \caption{test} % \end{subfigure} - \caption[Convolution applied on image]{Convolution of original greyscale Image (a) with different + \vspace{-0.1cm} + \caption[Convolution Applied on Image]{Convolution of original greyscale Image (a) with different kernels. In (b) and (c) Gaussian kernels of size 11 and stated - $\sigma^2$ are used. In (d) - (f) the above defined Sobel Operator + $\sigma^2$ are used. In (d) to (f) the above defined Sobel Operator kernels are used.} \label{fig:img_conv} \end{figure} +\vspace{-0.2cm} \clearpage -\newpage \subsection{Convolutional Neural Networks} -\todo{Eileitung zu CNN amout of parameters} % Conventional neural network as described in chapter .. are made up of % fully connected layers, meaning each node in a layer is influenced by % all nodes of the previous layer. If one wants to extract information @@ -225,15 +237,16 @@ in Figure~\ref{fig:img_conv}. % orders of magnitude smaller than $o\cdot i$ . As seen in the previous section convolution can lend itself to -manipulation of images or other large data which motivates it usage in +manipulation of images or other large data which motivates its usage in neural networks. This is achieved by implementing convolutional layers where several -trainable filters are applied to the input. +trainable filters are applied to the input. + Each node in such a layer corresponds to a pixel of the output of convolution with one of those filters, on which a bias and activation -function are applied. +function is applied. Depending on the sizes this can drastically reduce the amount of -variables in a layer compared to fully connected ones. +variables compared to fully connected layers. As the variables of the filters are shared among all nodes a convolutional layer with input of size $s_i$, output size $s_o$ and $n$ filters of size $f$ will contain $n f + s_o$ parameters whereas a @@ -241,25 +254,24 @@ fully connected layer has $(s_i + 1) s_o$ trainable weights. The usage of multiple filters results in multiple outputs of the same size as the input (or slightly smaller if no padding is used). These -are often called channels. -For convolutional layers that are preceded by convolutional layers the -size of the filters are often chosen to coincide with the amount of channels -of the output of the previous layer and not padded in this -direction. -This results in these channels ``being squashed'' and prevents gaining -additional -dimensions\todo{filter mit ganzer tiefe besser erklären} in the output. -This can also be used to flatten certain less interesting channels of -the input as for example color channels. +are often called (convolution) channels. + +Filters in layers that are preceded by convolutional layers are +often chosen such that the convolution channels of the input are +flattened into a single layer. This prevents gaining additional +dimensions with each convolutional layer. +To accomplish this in the direction of the convolution channels no +padding is used and the size of the filter is chosen to match the +number of these channels. 
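+
+To give a rough feeling for this difference, the following sketch
+evaluates both parameter counts stated above for purely illustrative
+sizes; the numbers do not refer to any of the networks used later on.
+\begin{lstlisting}[language=iPython]
+# Illustrative parameter counts, using the formulas from above.
+s_i = 1024        # size of the input
+s_o = 1024        # size of the output (padding keeps the size)
+n, f = 16, 5      # number of filters and filter size
+
+conv_params = n * f + s_o        # shared filter weights plus biases
+dense_params = (s_i + 1) * s_o   # fully connected weights plus biases
+
+print(conv_params)    # 1104
+print(dense_params)   # 1049600
+\end{lstlisting}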
% Thus filters used in convolutional networks are usually have the same % amount of dimensions as the input or one more. -A way additionally reduce the size using convolution is not applying the +An additional way to reduce the size using convolution is to not apply the convolution on every pixel, but rather specifying a certain ``stride'' $s$ for each direction at which the filter $g$ is moved over the input $I$, \[ - O_{x,\dots,c} = \sum_{i,\dots,l \in \mathbb{Z}} I_{(x \cdot - s_x)-i,\dots,(c \cdot s_c)-l} \cdot g_{i,\dots,l}. + O_{x,\dots,c} = \sum_{i,\dots,l \in \mathbb{Z}} \left(I_{(x \cdot + s_x)-i,\dots,(c \cdot s_c)-l}\right) \left(g_{i,\dots,l}\right). \] The sizes and stride should be the same for all filters in a layer in @@ -295,64 +307,76 @@ order to get a uniform tensor as output. As a means to further reduce the size towards the final layer, convolutional layers are often followed by a pooling layer. -In a pooling layer the input is +In a pooling layer, the input is reduced in size by extracting a single value from a neighborhood of pixels, often by taking the maximum value in the neighborhood (max-pooling). The resulting output size is dependent on the offset (stride) of the neighborhoods used. The combination of convolution and pooling layers allows for -extraction of features from the input in the from of feature maps while +extraction of features from the input in the form of feature maps while using relatively few parameters that need to be trained. -A example of this is given in Figure~\ref{fig:feature_map} where -intermediary outputs of a small convoluninal neural network consisting -of two convolutional and pooling layers each with one filter followed -by two fully connected layers. + +An example of this is given in Figure~\ref{fig:feature_map} where +intermediary outputs of a small convolutional neural network, consisting +of two convolutional and pooling layers, each with one filter followed +by two fully connected layers, are shown. 
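+
+A network of this structure can be sketched in Keras as follows. The
+input shape, the filter size, and the number of nodes in the fully
+connected layer are assumptions made for illustration; the full models
+used in the following sections are given in
+Listing~\ref{lst:handwriting} and Listing~\ref{lst:fashion}.
+\begin{lstlisting}[language=iPython]
+import tensorflow as tf
+
+# Two convolutional layers with a single filter each, both followed by
+# max-pooling, and two fully connected layers on top.
+model = tf.keras.Sequential([
+    tf.keras.layers.Conv2D(1, 5, padding='same', activation='relu',
+                           input_shape=(28, 28, 1)),
+    tf.keras.layers.MaxPooling2D(pool_size=2),
+    tf.keras.layers.Conv2D(1, 5, padding='same', activation='relu'),
+    tf.keras.layers.MaxPooling2D(pool_size=2),
+    tf.keras.layers.Flatten(),
+    tf.keras.layers.Dense(64, activation='relu'),
+    tf.keras.layers.Dense(10, activation='softmax')])
+model.compile(optimizer='adam', loss='categorical_crossentropy',
+              metrics=['accuracy'])
+\end{lstlisting}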
\begin{figure}[h] + \renewcommand{\thesubfigure}{\alph{subfigure}1} \centering \begin{subfigure}{0.19\textwidth} \includegraphics[width=\textwidth]{Figures/Data/mnist0bw.pdf} + %\caption{input} \caption{input} \end{subfigure} + \hfill \begin{subfigure}{0.19\textwidth} - \includegraphics[width=\textwidth]{Figures/Data/conv2d_6.pdf} - \caption{convolution} + \includegraphics[width=\textwidth]{Figures/Data/conv2d_2_5.pdf} + \caption{\hspace{-1pt}convolution} \end{subfigure} + \hfill \begin{subfigure}{0.19\textwidth} - \includegraphics[width=\textwidth]{Figures/Data/max_pooling2d_6.pdf} + \includegraphics[width=\textwidth]{Figures/Data/max_pooling2d_2_5.pdf} \caption{max-pool} \end{subfigure} + \hfill \begin{subfigure}{0.19\textwidth} - \includegraphics[width=\textwidth]{Figures/Data/conv2d_7.pdf} - \caption{convolution} + \includegraphics[width=\textwidth]{Figures/Data/conv2d_3_5.pdf} + \caption{\hspace{-1pt}convolution} \end{subfigure} + \hfill \begin{subfigure}{0.19\textwidth} - \includegraphics[width=\textwidth]{Figures/Data/max_pooling2d_7.pdf} + \includegraphics[width=\textwidth]{Figures/Data/max_pooling2d_3_5.pdf} \caption{max-pool} \end{subfigure} \centering + \setcounter{subfigure}{0} + \renewcommand{\thesubfigure}{\alph{subfigure}2} \begin{subfigure}{0.19\textwidth} - \includegraphics[width=\textwidth]{Figures/Data/mnist0bw.pdf} + \includegraphics[width=\textwidth]{Figures/Data/mnist1bw.pdf} \caption{input} \end{subfigure} + \hfill \begin{subfigure}{0.19\textwidth} - \includegraphics[width=\textwidth]{Figures/Data/conv2d_6.pdf} - \caption{convolution} + \includegraphics[width=\textwidth]{Figures/Data/conv2d_2_0.pdf} + \caption{\hspace{-1pt}convolution} \end{subfigure} + \hfill \begin{subfigure}{0.19\textwidth} - \includegraphics[width=\textwidth]{Figures/Data/max_pooling2d_6.pdf} + \includegraphics[width=\textwidth]{Figures/Data/max_pooling2d_2_0.pdf} \caption{max-pool} \end{subfigure} \begin{subfigure}{0.19\textwidth} - \includegraphics[width=\textwidth]{Figures/Data/conv2d_7.pdf} - \caption{convolution} + \includegraphics[width=\textwidth]{Figures/Data/conv2d_3_0.pdf} + \caption{\hspace{-1pt}convolution} \end{subfigure} + \hfill \begin{subfigure}{0.19\textwidth} - \includegraphics[width=\textwidth]{Figures/Data/max_pooling2d_7.pdf} + \includegraphics[width=\textwidth]{Figures/Data/max_pooling2d_3_0.pdf} \caption{max-pool} \end{subfigure} - \caption[Feature map]{Intermediary outputs of a + \caption[Feature Map]{Intermediary outputs of a convolutional neural network, starting with the input and ending with the corresponding feature map.} \label{fig:feature_map} @@ -370,24 +394,24 @@ by two fully connected layers. % -Problems navigating valleys -> momentum % -Different scale of gradients for vars in different layers -> ADAdelta -\subsection{\titlecap{stochastic training algorithms}} +\subsection{Stochastic Training Algorithms} For many applications in which neural networks are used such as image classification or segmentation, large training data sets become detrimental to capture the nuances of the -data. However as training sets get larger the memory requirement +data. However, as training sets get larger the memory requirement during training grows with it. -In order to update the weights with the gradient descent algorithm -derivatives of the network with respect for each +To update the weights with the gradient descent algorithm, +derivatives of the network with respect to each variable need to be computed for all data points. 
Thus the amount of memory and computing power available limits the size of the training data that can be efficiently used in fitting the network. A class of algorithms that augment the gradient descent -algorithm in order to lessen this problem are stochastic gradient +algorithm to lessen this problem are stochastic gradient descent algorithms. Here the full dataset is split into smaller disjoint subsets. -Then in each iteration a (different) subset of data is chosen to +Then in each iteration, a (different) subset of data is chosen to compute the gradient (Algorithm~\ref{alg:sgd}). The training period until each data point has been considered at least once in @@ -396,7 +420,7 @@ updating the parameters is commonly called an ``epoch''. Using subsets reduces the amount of memory required for storing the necessary values for each update, thus making it possible to use very large training sets to fit the model. -Additionally the noise introduced on the gradient can improve +Additionally, the noise introduced on the gradient can improve the accuracy of the fit as stochastic gradient descent algorithms are less likely to get stuck on local extrema. @@ -405,7 +429,7 @@ gradient can be calculated far quicker which allows for more parameter updates in the same time. If the approximated gradient is close enough to the ``real'' one this can drastically cut down the time required for training the model to a certain degree or improve the accuracy achievable in a given -mount of training time. +amount of training time. \begin{algorithm} \SetAlgoLined @@ -431,58 +455,63 @@ mount of training time. \label{alg:sgd} \end{algorithm} -In order to illustrate this behavior we modeled a convolutional neural +To illustrate this behavior, we modeled a convolutional neural network to classify handwritten digits. The data set used for this is the MNIST database of handwritten digits (\textcite{MNIST}, Figure~\ref{fig:MNIST}). -The network used consists of two convolution and max pooling layers +The network used consists of two convolution and max-pooling layers followed by one fully connected hidden layer and the output layer. -Both covolutional layers utilize square filters of size five which are +Both convolutional layers utilize square filters of size five which are applied with a stride of one. The first layer consists of 32 filters and the second of 64. Both -pooling layers pool a $2\times 2$ area. The fully connected layer +pooling layers pool a $2\times 2$ area with a stride of two in both +directions. The fully connected layer consists of 256 nodes and the output layer of 10, one for each digit. -All layers use a ReLU as activation function, except the output layer +All layers use a ReLU (\ref{eq:relu}) as activation function, except the output layer which uses softmax (\ref{eq:softmax}). As loss function categorical cross entropy (\ref{eq:cross_entropy}) is used. The architecture of the convolutional neural network is summarized in Figure~\ref{fig:mnist_architecture}. +% The network is trained with gradient descent and stochastic gradient +% descent five times for ... epochs. The reluts The results of the network being trained with gradient descent and -stochastic gradient descent for 20 epochs are given in Figure~\ref{fig:sgd_vs_gd} -and Table~\ref{table:sgd_vs_gd}. -Here it can be seen that the network trained with stochstic gradient +stochastic gradient descent for 20 epochs are given in +Figure~\ref{fig:sgd_vs_gd}. 
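+
+For reference, one epoch of the update loop of Algorithm~\ref{alg:sgd}
+can be sketched as follows, with \texttt{grad\_loss} being a
+placeholder for a routine that returns the gradient of the loss on a
+given batch; it is not the implementation used for this comparison.
+\begin{lstlisting}[language=iPython]
+import numpy as np
+
+def sgd_epoch(params, x_train, y_train, grad_loss,
+              gamma=0.01, batch_size=32):
+    # Split the shuffled data into disjoint batches and perform one
+    # gradient step per batch with learning rate gamma.
+    indices = np.random.permutation(len(x_train))
+    for start in range(0, len(x_train), batch_size):
+        batch = indices[start:start + batch_size]
+        g = grad_loss(params, x_train[batch], y_train[batch])
+        params = params - gamma * g
+    return params
+\end{lstlisting}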
+In Figure~\ref{fig:sgd_vs_gd} it can be seen that the network trained with stochastic gradient
 descent is more accurate after the first epoch than the ones trained
 with gradient descent after 20 epochs. This is due to the former using a
 batch size of 32 and thus having made 1.875 updates to the weights
-after the first epoch in comparison to one update. While each of
-these updates only use a approximate
+after the first epoch in comparison to just one update. While each of
+these updates only uses an approximate
 gradient calculated on the subset it performs far better than the
-network using true gradients when training for the same mount of time.
-\todo{vergleich training time}
-
+network using true gradients when training for the same amount of
+time.
+\vfill
 \input{Figures/mnist.tex}
-
-\begin{figure}
+\vfill
+\begin{figure}[h]
   \includegraphics[width=\textwidth]{Figures/Data/convnet_fig.pdf}
-  \caption{Convolutional neural network architecture used to model the
-    MNIST handwritten digits dataset. This figure was created using the
-    draw\textunderscore convnet Python script by \textcite{draw_convnet}.}
+  \caption[CNN Architecture for MNIST Handwritten
+  Digits]{Convolutional neural network architecture used to model the
+    MNIST handwritten digits dataset. This figure was created with
+    the help of the
+    {\sffamily{draw\textunderscore convnet}} Python script by \textcite{draw_convnet}.}
   \label{fig:mnist_architecture}
 \end{figure}
 
 \input{Figures/SGD_vs_GD.tex}
 \clearpage
-\subsection{\titlecap{modified stochastic gradient descent}}
+\subsection{Modified Stochastic Gradient Descent}
 This section is based on \textcite{ruder}, \textcite{ADAGRAD},
-\textcite{ADADELTA} and \textcite{ADAM}.
+\textcite{ADADELTA}, and \textcite{ADAM}.
 
 While stochastic gradient descent can work quite well in fitting models
 its sensitivity to the learning rate $\gamma$ is an inherent problem.
-This results in having to find an appropriate learning rate for each problem
+It is necessary to find an appropriate learning rate for each problem
 which is largely guesswork. The impact of choosing a bad learning rate
 can be seen in Figure~\ref{fig:sgd_vs_gd}.
% There is a inherent problem in the sensitivity of the gradient descent
@@ -490,9 +519,10 @@ can be seen in Figure~\ref{fig:sgd_vs_gd}.
% The difficulty of choosing the learning rate can be seen
% in Figure~\ref{sgd_vs_gd}.
 For small rates the progress in each iteration is small
-but for learning rates to large the algorithm can become unstable with
-updates being larger then the parameters themselves which can result
-in the parameters diverging to infinity.
+but for learning rates too large the algorithm can become unstable.
+This is caused by updates being larger than the parameters themselves
+which can result in the parameters diverging to infinity.
+
 Even for learning rates small enough to ensure the parameters do not
 diverge to infinity, steep valleys in the function to be minimized
 can hinder the progress of
@@ -503,31 +533,48 @@ algorithm ``bouncing between'' the walls of the valley rather then
 following the downwards trend.
 
 A possible way to combat this is to alter the learning
-rate over the course of training, often called leaning rate
+rate over the course of training. This is often called learning rate
 scheduling.
-The most popular implementations of this are time based
-decay
-\[
-  \gamma_{n+1} = \frac{\gamma_n}{1 + d n},
-\]
-where $d$ is the decay parameter and $n$ is the number of epochs. 
-Step based decay where the learning rate is fixed for a span of $r$
-epochs and then decreased according to parameter $d$
-\[
-  \gamma_n = \gamma_0 d^{\text{floor}{\frac{n+1}{r}}}.
-\]
-And exponential decay where the learning rate is decreased after each epoch
+The three most popular implementations of this are:
+\begin{itemize}
+  \item Time-based decay, where $d$ is the decay parameter and $n$ is the number of epochs
+  \[
+    \gamma_{n+1} = \frac{\gamma_n}{1 + d n}.
+  \]
+  \item Step-based decay, where the learning rate is fixed for a span of $r$
+    epochs and then decreased according to parameter $d$
+  \[
+    \gamma_n = \gamma_0 d^{\left\lfloor\frac{n+1}{r}\right\rfloor}.
+  \]
+  \item Exponential decay, where the learning rate is decreased after each epoch
 \[
   \gamma_n = \gamma_o e^{-n d}.
-\]\todo{satz aufteilen}
+\]
+\end{itemize}
+% time-based
+% decay
+% \[
+%   \gamma_{n+1} = \frac{\gamma_n}{1 + d n},
+% \]
+% where $d$ is the decay parameter and $n$ is the number of epochs.
+% Step based decay where the learning rate is fixed for a span of $r$
+% epochs and then decreased according to parameter $d$
+% \[
+%   \gamma_n = \gamma_0 d^{\text{floor}{\frac{n+1}{r}}}.
+% \]
+% And exponential decay where the learning rate is decreased after each epoch
+% \[
+%   \gamma_n = \gamma_o e^{-n d}.
+% \]
 These methods are able to increase the accuracy of models by large
margins as seen in the training of RESnet by \textcite{resnet}, cf. Figure~\ref{fig:resnet}.
 \begin{figure}[h]
   \centering
   \includegraphics[width=\textwidth]{Figures/Data/7780459-fig-4-source-hires.png}
   \caption[Learning Rate Decay]{Error history of convolutional neural
-    network trained with learning rate decay. \textcite[Figure
-    4]{resnet}}
+    network trained with learning rate decay. The drops seen at 15.000 and
+    30.000 iterations correspond to changes of the learning rate. \textcite[Figure
+    4]{resnet}.}
   \label{fig:resnet}
 \end{figure}
 
@@ -535,32 +582,28 @@ margins as seen in the training of RESnet by \textcite{resnet}, cf. Figure~\ref{
 
 However stochastic gradient descent with weight decay is still highly
 sensitive to the choice of the hyperparameters $\gamma_0$ and $d$.
-In order to mitigate this problem a number of algorithms have been
-developed to regularize the learning rate with as minimal
+Several algorithms have been developed to mitigate this problem by
+regularizing the learning rate with as little
 hyperparameter guesswork as possible.
-In the following we will compare three algorithms that use a adaptive
+In the following, we will compare three algorithms that use an adaptive
 learning rate, meaning they scale the updates according to past iterations.
-% We will examine and compare a four algorithms that use a adaptive
-% learning rate.
-% They all scale the gradient for the update depending of past gradients
-% for each weight individually.
-
-The algorithms are build up on each other with the adaptive gradient
+The algorithms are built upon each other with the adaptive gradient
 algorithm (\textsc{AdaGrad}, \textcite{ADAGRAD})
-laying the base work. Here for each parameter update the learning rate
+laying the groundwork.
+Here, for each parameter update, the learning rate
 is given by a constant global rate $\gamma$ divided by the sum of the
 squares of the past partial derivatives in this parameter.
 This results in a monotonous decaying learning rate with faster
-decay for parameters with large updates, where as
+decay for parameters with large updates, whereas
 parameters with small updates experience smaller decay. 
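+
+Stripped of all details, this per-parameter scaling of the updates can
+be sketched as follows, with \texttt{grad\_loss} again being a
+hypothetical routine returning the gradient on the current batch.
+\begin{lstlisting}[language=iPython]
+import numpy as np
+
+def adagrad(params, batches, grad_loss, gamma=0.01, eps=1e-7):
+    # G accumulates the squared gradients and individually shrinks the
+    # effective learning rate of every single parameter over time.
+    G = np.zeros_like(params)
+    for x_batch, y_batch in batches:
+        g = grad_loss(params, x_batch, y_batch)
+        G += g ** 2
+        params = params - gamma / (np.sqrt(G) + eps) * g
+    return params
+\end{lstlisting}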
The \textsc{AdaGrad} algorithm is given in Algorithm~\ref{alg:ADAGRAD}. Note that while this algorithm is still based upon the idea of gradient descent it no longer takes steps in the direction of the gradient while -updating. Due to the individual learning rates for each parameter only -the direction/sign for single parameters remain the same compared to +updating. Due to the individual learning rates for each parameter, only +the direction or sign for single parameters remains the same compared to gradient descent. \begin{algorithm}[H] @@ -579,22 +622,27 @@ gradient descent. \label{alg:ADAGRAD} \end{algorithm} -Building on \textsc{AdaGrad} \textcite{ADADELTA} developed the +Building on \textsc{AdaGrad}, \textcite{ADADELTA} developed the \textsc{AdaDelta} algorithm -in order to improve upon the two main drawbacks of \textsc{AdaGrad}, being the -continual decay of the learning rate and the need for a manually +to improve upon the two main drawbacks of \textsc{AdaGrad}, being the +continuous decay of the learning rate and the need for a manually selected global learning rate $\gamma$. As \textsc{AdaGrad} uses division by the accumulated squared gradients the learning rate will eventually become infinitely small. -In order to ensure that even after a significant of iterations -learning continues to make progress instead of summing the squared gradients a -exponentially decaying average of the past squared gradients is used to for -regularizing the learning rate resulting in +Instead of summing the squared gradients a exponential decaying +average of the past squared gradients is used to regularize the +learning rate +% In order to ensure that even after a significant of iterations +% learning continues to make progress instead of summing the squared gradients a +% exponentially decaying average of the past squared gradients is used to for +% regularizing the learning rate resulting in \begin{align*} E[g^2]_t & = \rho E[g^2]_{t-1} + (1-\rho) g_t^2, \\ \Delta x_t & = -\frac{\gamma}{\sqrt{E[g^2]_t + \varepsilon}} g_t, \end{align*} -for a decay rate $\rho$. +for a decay rate $\rho$. This is done to ensure that even after a +significant amount of iterations learning can make progress. + Additionally the fixed global learning rate $\gamma$ is substituted by a exponentially decaying average of the past parameter updates. The usage of the past parameter updates is motivated by ensuring that @@ -625,7 +673,7 @@ quantities involved in the inverse of the second derivative: x^2}} = \frac{\Delta x}{\frac{\partial f}{\partial x}}. \] As the root mean square of the past gradients is already used in the -denominator of the learning rate a exponentially decaying root mean +denominator of the learning rate an exponentially decaying root mean square of the past updates is used to obtain a $\Delta x$ quantity for the denominator resulting in the correct unit of the update. The full algorithm is given in Algorithm~\ref{alg:adadelta}. @@ -636,30 +684,35 @@ algorithm is given in Algorithm~\ref{alg:adadelta}. 
\KwInput{Initial parameter $x_1$} Initialize accumulation variables $E[g^2]_0 = 0, E[\Delta x^2]_0 =0$\; \For{$t \in \left\{1,\dots,T\right\};\, t+1$}{ + Compute Gradient: $g_t$\; - Accumulate Gradient: $[E[g^2]_t \leftarrow \rho D[g^2]_{t-1} + + Accumulate Gradient: $E[g^2]_t \leftarrow \rho D[g^2]_{t-1} + (1-\rho)g_t^2$\; Compute Update: $\Delta x_t \leftarrow -\frac{\sqrt{E[\Delta x^2]_{t-1} + \varepsilon}}{\sqrt{E[g^2]_t + \varepsilon}} g_t$\; Accumulate Updates: $E[\Delta x^2]_t \leftarrow \rho E[\Delta x^2]_{t-1} + (1+p)\Delta x_t^2$\; Apply Update: $x_{t+1} \leftarrow x_t + \Delta x_t$\; - } + } \caption{\textsc{AdaDelta}, \textcite{ADADELTA}} \label{alg:adadelta} \end{algorithm} While the stochastic gradient algorithm is less susceptible to getting stuck in local -extrema than gradient descent the problem still persists especially +extrema than gradient descent the problem persists especially for saddle points (\textcite{DBLP:journals/corr/Dauphinpgcgb14}). An approach to the problem of ``getting stuck'' in saddle point or local minima/maxima is the addition of momentum to SDG. Instead of using the actual gradient for the parameter update an average over the -past gradients is used. In order to avoid the need to hold the past -values in memory usually a exponentially decaying average is used resulting in -Algorithm~\ref{alg:sgd_m}. This is comparable of following the path +past gradients is used. +Usually, an exponentially decaying average is used to avoid the need to +hold the past values in memory, resulting in Algorithm~\ref{alg:sgd_m}. +% In order to avoid the need to hold the past +% values in memory usually a exponentially decaying average is used resulting in +% Algorithm~\ref{alg:sgd_m}. +This is comparable to following the path of a marble with mass rolling down the slope of the error function. The decay rate for the average is comparable to the inertia of the marble. @@ -695,12 +748,13 @@ Algorithm~\ref{alg:adam}. Here the exponentially decaying root mean square of the gradients is still used for regularizing the learning rate and combined with the momentum method. Both terms are normalized such that -their means are the first and second moment of the gradient. However the term used in +their means are the first and second moments of the gradient. However, +the term used in \textsc{AdaDelta} to ensure correct units is dropped for a scalar -global learning rate. This results in four tunable hyperparameters, -however the -algorithms seems to be exceptionally stable with the recommended -parameters of $\alpha = 0.001, \beta_1 = 0.9, \beta_2 = 0.999, \varepsilon=$1e-7 and is a very reliable algorithm for training +global learning rate. This results in four tunable hyperparameters. +However, the +algorithm seems to be exceptionally stable with the recommended +parameters of $\alpha = 0.001, \beta_1 = 0.9, \beta_2 = 0.999, \varepsilon=10^{-7}$ and is a very reliable algorithm for training neural networks. \begin{algorithm}[H] @@ -725,21 +779,31 @@ neural networks. \label{alg:adam} \end{algorithm} -In order to get an understanding of the performance of the above -discussed training algorithms the neural network given in \ref{fig:mnist_architecture} has been +To get an understanding of the performance of the above +discussed training algorithms the neural network given in +\ref{fig:mnist_architecture} has been trained on the MNIST handwriting dataset with the above described -algorithms. +algorithms. For all algorithms, a global learning rate of $0.001$ is +chosen. 
The parameter preventing divisions by zero is set to +$\varepsilon = 10^{-7}$. For \textsc{AdaDelta} and +Momentum $\rho = 0.95$ is used as decay rate. For \textsc{Adam} the recommended +parameters are chosen. The performance metrics of the resulting learned functions are given in Figure~\ref{fig:comp_alg}. -Here it can be seen that the ADAM algorithm performs far better than -the other algorithms, with AdaGrad and Adelta following... bla bla +Here it can be seen that \textsc{AdaDelta} is the least effective of +the algorithms for the problem. Stochastic gradient descent and +\textsc{AdaGrad} perform similarly with \textsc{AdaGrad} being slightly +faster. \textsc{Adam} and stochastic gradient +descent with momentum achieve similar accuracies. However, the model +trained with \textsc{Adam} learns the fastest and achieves the best +accuracy. Thus we will use \textsc{Adam} for following comparisons. +\newpage \input{Figures/sdg_comparison.tex} -% \subsubsubsection{Stochastic Gradient Descent} \clearpage -\subsection{\titlecap{combating overfitting}} +\subsection{Combating Overfitting} % As in many machine learning applications if the model is overfit in % the data it can drastically reduce the generalization of the model. In @@ -754,14 +818,17 @@ the other algorithms, with AdaGrad and Adelta following... bla bla % strategies exist. A popular approach in regularizing convolutional neural network % is \textit{dropout} which has been first introduced in % \cite{Dropout} -This section is based on .... +This section is based on \textcite{Dropout1} and \textcite{Dropout}. Similarly to shallow networks overfitting still can impact the quality of convolutional neural networks. -Popular ways to combat this problem for a .. of models is averaging +Effective ways to combat this problem for many models is averaging over multiple models trained on subsets (bootstrap) or introducing -noise directly during the training (for example random forest, where a -conglomerate of decision trees benefit greatly of randomizing the -features available to use in each training iteration). +noise directly during the training. +For example decision trees benefit greatly from averaging many trees +trained on slightly different training sets and the +introduction of noise during training by limiting the variables +available at each iteration +(cf. \textcite[Chapter~15]{hastie01statisticallearning}). We explore implementations of these approaches for neural networks being dropout for simulating a conglomerate of networks and introducing noise during training by slightly altering the input @@ -780,16 +847,16 @@ pictures. \subsubsection{Dropout} If a neural network has enough hidden nodes there will be sets of weights that accurately fit the training set (proof for a small -scenario given in ...) this expecially occurs when the relation -between the input and output is highly complex, which requires a large -network to model and the training set is limited in size (vgl cnn -wening bilder). However each of these weights will result in different -predicitons for a test set and all of them will perform worse on the +scenario is given in Theorem~\ref{theo:overfit}) this especially +occurs when the relation between the in- and output is highly complex, +which requires a large network to model and the training set is +limited in size. However, each of these weights will result in different +predictions for a test set and all of them will perform worse on the test data than the training data. 
A way to improve the predictions and -reduce the overfitting would -be to train a large number of networks and average their results (vgl -random forests) however this is often computational not feasible in -training as well as testing. +reduce the overfitting would be to train a large number of networks +and average their results. +However, this is often computational not feasible in +training as well as in testing. % Similarly to decision trees and random forests training multiple % models on the same task and averaging the predictions can improve the % results and combat overfitting. However training a very large @@ -801,14 +868,15 @@ Instead of training different models, for each data point in a batch randomly chosen nodes in the network are disabled (their output is fixed to zero) and the updates for the weights in the remaining smaller network are computed. -After updates have been ... this way for each data point in a batch +After updates have been obtained this way for each data point in a batch, the updates are accumulated and applied to the full network. This can be compared to many small networks which share their weights -for their active neurons being trained simultaniously. -For testing the ``mean network'' with all nodes active but their -output scaled accordingly to compensate for more active nodes is -used. \todo{comparable to averaging dropout networks, beispiel für - besser in kleinem fall} +for their active neurons being trained simultaneously. +For testing the ``mean network'' with all nodes active is used. But the +output of the nodes is scaled accordingly to compensate for more nodes +being active. +%\todo{comparable to averaging dropout networks, beispiel für +% besser in kleinem fall} % Here for each training iteration from a before specified (sub)set of nodes % randomly chosen ones are deactivated (their output is fixed to 0). % During training @@ -827,24 +895,26 @@ used. \todo{comparable to averaging dropout networks, beispiel für % iteration, this practice is called Dropout and was introduced by % \textcite{Dropout}. -\subsubsection{\titlecap{manipulation of input data}} -Another way to combat overfitting is to keep the network form -``memorizing'' -the training data rather then learning the relation between in- and -output is to randomly alter the training inputs for -each iteration of training. -This is commonly used in image based tasks as there are -often ways to maipulate the input while still being sure the labels -remain the same. For example in a image classification task such as -handwritten digits the associated label should remain right when the +\subsubsection{Manipulation of Input Data} +Another way to combat overfitting is to randomly alter the training +inputs for each iteration of training. +% This is done keep the network from +% ``memorizing'' the training data rather than learning the relation +% between in- and output. +This can often be used in image based tasks as there are +often ways to manipulate the input while still being sure the labels +remain the same. For example, in an image classification task such as +handwritten digits, the associated label should remain right when the image is rotated or stretched by a small amount. -When using this one has to be sure that the labels indeed remain the -same or else the network will not learn the desired ... 
In the case of handwritten digits for example a to high rotation angle
-will make the distinction between a nine or six hard and will lessen
+When applying this, one has to ensure that the alterations are
+reasonable in the context of the data, or else the network might make
+false connections between in- and output.
+In the case of handwritten digits for example a too high rotation angle
+will make the distinction between a nine or a six hard and will lessen
 the quality of the learned function.
 The most common transformations are rotation, zoom, shear, brightness,
-mirroring. Examples of this are given in Figure~\ref{fig:datagen}.
+mirroring. Examples of these are given in Figure~\ref{fig:datagen}. In
+the following this practice will be referred to as data generation.
 
 \begin{figure}[h]
   \centering
   \begin{subfigure}{0.19\textwidth}
     \includegraphics[width=\textwidth]{Figures/Data/mnist_gen_zoom.pdf}
     \caption{random\\zoom}
  \end{subfigure}
  \begin{subfigure}{0.19\textwidth}
     \includegraphics[width=\textwidth]{Figures/Data/mnist_gen_shear.pdf}
     \caption{random\\shear}
  \end{subfigure}
  \begin{subfigure}{0.19\textwidth}
     \includegraphics[width=\textwidth]{Figures/Data/mnist_gen_rotation.pdf}
     \caption{random\\rotation}
  \end{subfigure}
  \begin{subfigure}{0.19\textwidth}
    \includegraphics[width=\textwidth]{Figures/Data/mnist_gen_shift.pdf}
    \caption{random\\positional shift}
  \end{subfigure}
-  \caption[Image data generation]{Example for the manipuations used in ... As all images are
-    of the same intensity brightness manipulation does not seem
-    ... Additionally mirroring is not used for ... reasons.}
+  \caption[Image Data Generation]{Examples of the manipulations used in
+    later comparisons. Brightness manipulation and mirroring are not
+    used, as the images are equal in brightness and digits are not
+    invariant to mirroring.}
  \label{fig:datagen}
\end{figure}


-\subsubsection{\titlecap{comparisons}}
+\subsubsection{Comparisons}
 
-In order to compare the benefits obtained from implementing these
+To compare the benefits obtained from implementing these
 measures we have trained the network given in
 \ref{fig:mnist_architecture} on the handwriting recognition problem
 and implemented different combinations of data generation and dropout. The results
-are given in Figure~\ref{fig:gen_dropout}. For each scennario the
+are given in Figure~\ref{fig:gen_dropout}. For each scenario, the
 model was trained five times and the performance measures were
 averaged.
 It can be seen that implementing the measures does indeed increase the
 performance of the model.
 Using data generation to alter the training data seems to have a
-larger impact than dropout, however utilizing both measures yields the
+larger impact than dropout; however, utilizing both measures yields the
 best results.
-\todo{auf zahlen in tabelle verweisen?}
+%\todo{auf zahlen in tabelle verweisen?}
 
% Implementing data generation on
% its own seems to have a larger impact than dropout and applying both
@@ -898,32 +969,32 @@ best results.
 
 The better performance stems most likely from reduced overfitting. The
 reduction in overfitting can be seen in
 \ref{fig:gen_dropout}~(\subref{fig:gen_dropout_b}) as the training
-accuracy decreases with test accuracy increasing. However utlitizing
-data generation as well as dropout with a probability of 0.4 seems to
+accuracy decreases with test accuracy increasing. However, utilizing
+data generation, as well as dropout with a probability of 0.4, seems to
 be a too aggressive approach as the training accuracy drops below the
-test accuracy\todo{kleine begründung}.
+test accuracy. 
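+
+Condensed to its essentials, the combination of both measures in Keras
+amounts to inserting dropout layers into the model and training it on
+augmented images. The snippet below is only a sketch with illustrative
+parameter values; the complete implementations are given in
+Listing~\ref{lst:handwriting} and Listing~\ref{lst:fashion}.
+\begin{lstlisting}[language=iPython]
+import tensorflow as tf
+from tensorflow.keras.preprocessing.image import ImageDataGenerator
+
+# Dropout layer randomly setting 20% of its inputs to zero during
+# training (placed between the layers of a larger model).
+dropout = tf.keras.layers.Dropout(0.2)
+
+# Data generation: randomly rotated, zoomed and shifted variants of
+# the training images are produced on the fly.
+datagen = ImageDataGenerator(rotation_range=6, zoom_range=0.15,
+                             width_shift_range=2, height_shift_range=2)
+# model.fit(datagen.flow(x_train, y_train), ...)
+\end{lstlisting}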
\input{Figures/gen_dropout.tex} +\subsubsection{Effectiveness for Small Training Sets} -\clearpage -\subsubsection{\titlecap{effectivety for small training sets}} +\label{sec:smalldata} -For some applications (medical problems with small amount of patients) +For some applications (medical problems with a small number of patients) the available data can be highly limited. -In these problems the networks are highly prone to overfit the -data. In order to get a understanding of accuracys achievable and the -impact of the methods aimed at mitigating overfitting discussed above we and train -networks with different measures implemented to fit datasets of +In these scenarios, the networks are highly prone to overfit the +data. To get an understanding of accuracies achievable and the +impact of the methods aimed at mitigating overfitting discussed above we fit +networks with different measures implemented to datasets of varying sizes. -For training we use the mnist handwriting dataset as well as the fashion -mnist dataset. The fashion mnist dataset is a benchmark set build by -\textcite{fashionMNIST} in order to provide a harder set, as state of -the art models are able to achive accuracies of 99.88\% +For training, we use the MNIST handwriting dataset as well as the fashion +MNIST dataset. The fashion MNIST dataset is a benchmark set build by +\textcite{fashionMNIST} to provide a more challenging set, as state of +the art models are able to achieve accuracies of 99.88\% (\textcite{10.1145/3206098.3206111}) on the handwriting set. The dataset contains 70.000 preprocessed and labeled images of clothes from -Zalando, a overview is given in Figure~\ref{fig:fashionMNIST}. +Zalando. An overview is given in Figure~\ref{fig:fashionMNIST}. \input{Figures/fashion_mnist.tex} @@ -935,7 +1006,7 @@ Zalando, a overview is given in Figure~\ref{fig:fashionMNIST}. \Tstrut \Bstrut & \textsc{Adam} & D. 0.2 & Gen & Gen.+D. 0.2 \\ \hline & - \multicolumn{4}{c}{\titlecap{test accuracy for 1 sample}}\Bstrut \\ + \multicolumn{4}{c}{Test Accuracy for 1 Sample}\Bstrut \\ \cline{2-5} max \Tstrut & 0.5633 & 0.5312 & \textbf{0.6704} & 0.6604 \\ min & 0.3230 & 0.4224 & 0.4878 & \textbf{0.5175} \\ @@ -943,7 +1014,7 @@ Zalando, a overview is given in Figure~\ref{fig:fashionMNIST}. var \Bstrut & 4.021e-3 & \textbf{1.175e-3} & 3.600e-3 & 2.348e-3 \\ \hline & - \multicolumn{4}{c}{\titlecap{test accuracy for 10 samples}}\Bstrut \\ + \multicolumn{4}{c}{Test Accuracy for 10 Sample}\Bstrut \\ \cline{2-5} max \Tstrut & 0.8585 & 0.9423 & 0.9310 & \textbf{0.9441} \\ min & 0.8148 & \textbf{0.9081} & 0.9018 & 0.9061 \\ @@ -951,7 +1022,7 @@ Zalando, a overview is given in Figure~\ref{fig:fashionMNIST}. var \Bstrut & 2.694e-4 & \textbf{1.278e-4} & 6.419e-5 & 1.504e-4 \\ \hline & - \multicolumn{4}{c}{\titlecap{test accuracy for 100 samples}}\Bstrut \\ + \multicolumn{4}{c}{Test Accuracy for 100 Sample}\Bstrut \\ \cline{2-5} max \Tstrut & 0.9637 & 0.9796 & 0.9810 & \textbf{0.9811} \\ min & 0.9506 & 0.9719 & 0.9702 & \textbf{0.9727} \\ @@ -960,11 +1031,12 @@ Zalando, a overview is given in Figure~\ref{fig:fashionMNIST}. \hline \end{tabu} \normalsize - \captionof{table}{Values of the test accuracy of the model trained - 10 times - on random MNIST handwriting training sets containing 1, 10 and 100 - data points per class after 125 epochs. 
The mean accuracy achieved - for the full set employing both overfitting measures is } + \captionof{table}[Values of Test Accuracies for Models Trained on + Subsets of MNIST Handwritten Digits]{Values of the test accuracy of + the model trained 10 times on random MNIST handwritten digits + training sets containing 1, 10, and 100 data points per class after + 125 epochs. The mean accuracy achieved for the full set employing + both overfitting measures is 99.58\%.} \label{table:digitsOF} \small \centering @@ -972,7 +1044,7 @@ Zalando, a overview is given in Figure~\ref{fig:fashionMNIST}. \Tstrut \Bstrut & \textsc{Adam} & D. 0.2 & Gen & Gen.+D. 0.2 \\ \hline & - \multicolumn{4}{c}{\titlecap{test accuracy for 1 sample}}\Bstrut \\ + \multicolumn{4}{c}{Test Accuracy for 1 Sample}\Bstrut \\ \cline{2-5} max \Tstrut & 0.4885 & \textbf{0.5513} & 0.5488 & 0.5475 \\ min & 0.3710 & \textbf{0.3858} & 0.3736 & 0.3816 \\ @@ -980,7 +1052,7 @@ Zalando, a overview is given in Figure~\ref{fig:fashionMNIST}. var & \textbf{1.999e-3} & 2.945e-3 & 3.375e-3 & 2.976e-3 \\ \hline & - \multicolumn{4}{c}{\titlecap{test accuracy for 10 samples}}\Bstrut \\ + \multicolumn{4}{c}{Test Accuracy for 10 Sample}\Bstrut \\ \cline{2-5} max \Tstrut & 0.7370 & 0.7340 & 0.7236 & \textbf{0.7502} \\ min & \textbf{0.6818} & 0.6673 & 0.6709 & 0.6799 \\ @@ -988,7 +1060,7 @@ Zalando, a overview is given in Figure~\ref{fig:fashionMNIST}. var \Bstrut & \textbf{3.184e-4} & 3.356e-4 & 3.194e-4 & 4.508e-4 \\ \hline & - \multicolumn{4}{c}{\titlecap{test accuracy for 100 samples}}\Bstrut \\ + \multicolumn{4}{c}{Test Accuracy for 100 Sample}\Bstrut \\ \cline{2-5} max \Tstrut & 0.8454 & 0.8385 & 0.8456 & \textbf{0.8459} \\ min & 0.8227 & 0.8200 & \textbf{0.8305} & 0.8274 \\ @@ -997,57 +1069,58 @@ Zalando, a overview is given in Figure~\ref{fig:fashionMNIST}. \hline \end{tabu} \normalsize - \captionof{table}{Values of the test accuracy of the model trained - 10 times - on random fashion MNIST training sets containing 1, 10 and 100 - data points per class after 125 epochs. The mean accuracy achieved - for the full set employing both overfitting measures is } + \captionof{table}[Values of Test Accuracies for Models Trained on + Subsets of Fashion MNIST]{Values of the test accuracy of the model + trained 10 times on random fashion MNIST training sets containing + 1, 10, and 100 data points per class after 125 epochs. The mean + accuracy achieved for the full set employing both overfitting + measures is 93.72\%.} \label{table:fashionOF} -\end{minipage}\todo{check values} +\end{minipage} \clearpage } The models are trained on subsets with a certain amount of randomly -chosen datapoints per class. -The sizes chosen for the comparisons are the full dataset, 100, 10 and 1 +chosen data points per class. +The sizes chosen for the comparisons are the full dataset, 100, 10, and 1 data points per class. For the task of classifying the fashion data a slightly altered model is used. The convolutional layers with filters of size 5 are replaced by two consecutive convolutional layers with filters of size 3. -This is done in order to have more ... in order to better accommodate -for the more complex nature of the data. A diagram of the architecture is given in +\newpage +\begin{figure}[h] + \includegraphics[width=\textwidth]{Figures/Data/cnn_fashion_fig.pdf} + \caption[CNN Architecture for Fashion MNIST]{Convolutional neural + network architecture used to model the + fashion MNIST dataset. 
This figure was created using the + draw\textunderscore convnet Python script by \textcite{draw_convnet}.} + \label{fig:fashion_MNIST} +\end{figure} +This is done in order to better accommodate +the more complex nature of the data by having +more degrees of freedom. A diagram of the architecture is given in Figure~\ref{fig:fashion_MNIST}. -\afterpage{ - \noindent - \begin{figure}[h] - \includegraphics[width=\textwidth]{Figures/Data/cnn_fashion_fig.pdf} - \caption{Convolutional neural network architecture used to model the - fashion MNIST dataset. This figure was created using the - draw\textunderscore convnet Python script by \textcite{draw_convnet}.} - \label{fig:fashion_MNIST} - \end{figure} -} - -For both scenarios the models are trained 10 times on randomly +For both scenarios, the models are trained 10 times on randomly sampled training sets. -For each scenario the models are trained without overfitting measures and combinations -of dropout and datageneration implemented. The Python implementation -of the models and the parameters used for the datageneration are given -in Listing~\ref{lst:handwriting} for the handwriting model and +The models are trained without overfitting measures and combinations +of dropout and data generation implemented. The Python implementation +of the models and the parameters used for data generation are given +in Listing~\ref{lst:handwriting} for the handwriting model and in Listing~\ref{lst:fashion} for the fashion model. -The models are trained for 125 epoch in order +The models are trained for 125s epochs in order to have enough random -augmentations of the input images are present during training -for the networks to fully profit of the additional training data generated. +augmentations of the input images present during training, +for the networks to fully profit from the additional training data generated. The test accuracies of the models after training for 125 epochs are given in Table~\ref{table:digitsOF} for the handwritten digits and in Table~\ref{table:fashionOF} for the fashion datasets. Additionally the average test accuracies over the course of learning are given in -Figure~\ref{fig:plotOF_digits} for the handwriting application and Figure~\ref{fig:plotOF_fashion} for the +Figure~\ref{fig:plotOF_digits} for the handwriting application and +Figure~\ref{fig:plotOF_fashion} for the fashion application. \begin{figure}[h] @@ -1056,11 +1129,13 @@ fashion application. \begin{subfigure}[h]{\textwidth} \begin{tikzpicture} \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed, - /pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth, + /pgf/number format/precision=3},tick style = {draw = none}, width = 0.9875\textwidth, height = 0.4\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east}, - xlabel = {epoch},ylabel = {Test Accuracy}, cycle + xlabel = {Epoch},ylabel = {Test Accuracy}, cycle list/Dark2, every axis plot/.append style={line width - =1.25pt}] + =1.25pt}, + ytick = {0.2,0.4,0.6}, + yticklabels = {$0.2$,$0.4$,$\phantom{0}0.6$}] \addplot table [x=epoch, y=val_accuracy, col sep=comma, mark = none] {Figures/Data/adam_1.mean}; @@ -1083,17 +1158,19 @@ fashion application. 
\addlegendentry{\footnotesize{Default}} \end{axis} \end{tikzpicture} - \caption{1 sample per class} + \caption{1 Sample per Class} \vspace{0.25cm} \end{subfigure} \begin{subfigure}[h]{\textwidth} \begin{tikzpicture} \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed, - /pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth, + /pgf/number format/precision=3},tick style = {draw = none}, width = 0.9875\textwidth, height = 0.4\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east}, - xlabel = {epoch},ylabel = {Test Accuracy}, cycle + xlabel = {Epoch},ylabel = {Test Accuracy}, cycle list/Dark2, every axis plot/.append style={line width - =1.25pt}] + =1.25pt}, + ytick = {0.2,0.6,0.8}, + yticklabels = {$0.2$,$0.6$,$\phantom{0}0.8$}] \addplot table [x=epoch, y=val_accuracy, col sep=comma, mark = none] {Figures/Data/adam_dropout_00_10.mean}; @@ -1114,14 +1191,14 @@ fashion application. \addlegendentry{\footnotesize{G + D. 0.2}} \end{axis} \end{tikzpicture} - \caption{10 samples per class} + \caption{10 Samples per Class} \end{subfigure} \begin{subfigure}[h]{\textwidth} \begin{tikzpicture} \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed, /pgf/number format/precision=3},tick style = {draw = none}, width = 0.9875\textwidth, height = 0.4\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east}, - xlabel = {epoch}, ylabel = {Test Accuracy}, cycle + xlabel = {Epoch}, ylabel = {Test Accuracy}, cycle list/Dark2, every axis plot/.append style={line width =1.25pt}, ymin = {0.92}] \addplot table @@ -1143,10 +1220,11 @@ fashion application. \addlegendentry{\footnotesize{G + D. 0.2}} \end{axis} \end{tikzpicture} - \caption{100 samples per class} + \caption{100 Samples per Class} \vspace{.25cm} \end{subfigure} - \caption{Mean test accuracies of the models fitting the sampled MNIST + \caption[Mean Test Accuracies for Subsets of MNIST Handwritten + Digits]{Mean test accuracies of the models fitting the sampled MNIST handwriting datasets over the 125 epochs of training.} \label{fig:plotOF_digits} \end{figure} @@ -1158,11 +1236,13 @@ fashion application. \begin{tikzpicture} \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed, /pgf/number format/precision=3},tick style = - {draw = none}, width = \textwidth, + {draw = none}, width = 0.9875\textwidth, height = 0.4\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east}, - xlabel = {epoch},ylabel = {Test Accuracy}, cycle + xlabel = {Epoch},ylabel = {Test Accuracy}, cycle list/Dark2, every axis plot/.append style={line width - =1.25pt}] + =1.25pt}, + ytick = {0.2,0.3,0.4,0.5}, + yticklabels = {$0.2$,$0.3$,$0.4$,$\phantom{0}0.5$}] \addplot table [x=epoch, y=val_accuracy, col sep=comma, mark = none] {Figures/Data/fashion_dropout_0_1.mean}; @@ -1190,9 +1270,9 @@ fashion application. \begin{subfigure}[h]{\textwidth} \begin{tikzpicture} \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed, - /pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth, + /pgf/number format/precision=3},tick style = {draw = none}, width = 0.9875\textwidth, height = 0.4\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east}, - xlabel = {epoch},ylabel = {Test Accuracy}, cycle + xlabel = {Epoch},ylabel = {Test Accuracy}, cycle list/Dark2, every axis plot/.append style={line width =1.25pt}, ymin = {0.62}] \addplot table @@ -1215,14 +1295,14 @@ fashion application. \addlegendentry{\footnotesize{G + D. 
0.2}} \end{axis} \end{tikzpicture} - \caption{10 samples per class} + \caption{10 Samples per Class} \end{subfigure} \begin{subfigure}[h]{\textwidth} \begin{tikzpicture} \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed, /pgf/number format/precision=3},tick style = {draw = none}, width = 0.9875\textwidth, height = 0.4\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east}, - xlabel = {epoch}, ylabel = {Test Accuracy}, cycle + xlabel = {Epoch}, ylabel = {Test Accuracy}, cycle list/Dark2, every axis plot/.append style={line width =1.25pt}, ymin = {0.762}] \addplot table @@ -1244,49 +1324,51 @@ fashion application. \addlegendentry{\footnotesize{G + D. 0.2}} \end{axis} \end{tikzpicture} - \caption{100 samples per class} + \caption{100 Samples per Class} \vspace{.25cm} \end{subfigure} - \caption{Mean test accuracies of the models fitting the sampled fashion MNIST + \caption[Mean Test Accuracies for Subsets of Fashion MNIST]{Mean test + accuracies of the models fitting the sampled fashion MNIST over the 125 epochs of training.} \label{fig:plotOF_fashion} \end{figure} -It can be seen in figure ... that for the handwritten digits scenario +It can be seen in Figure~\ref{fig:plotOF_digits} that for the +handwritten digits scenario using data generation greatly improves the accuracy for the smallest training set of one sample per class. While the addition of dropout only seems to have a small effect on the -accuracy of the model, the variance get further reduced than with data +accuracy of the model, the variance gets further reduced than with data generation. This drop in variance translates to the combination of both measures, resulting in the overall best performing model. -In the scenario with 10 and 100 samples per class the measures improve +In the scenario with 10 and 100 samples per class, the measures improve the performance as well, however the difference in performance between overfitting measures is much smaller than in the first scenario with the accuracy gain of dropout being similar to data generation. -While the observation of the variances persist for the scenario with +While the observation of the variances persists for the scenario with 100 samples per class it does not for the one with 10 samples per class. -However in all scenarios the addition of the measures reduces the +In all scenarios, the addition of the measures reduces the variance of the model. The model fit to the fashion MNIST data set benefits less of the measures. -For the smallest scenario of one sample fer class a substantial -increase in accuracy can be observed for the models with the -... measures.... Contrary to the digits data set dropout improves the +For the smallest scenario of one sample per class, a substantial +increase in accuracy can be observed for both measures. +Contrary to the digits data set, dropout improves the model by a similar margin to data generation. -For the larger data sets however the benefits are far smaller. While +For the larger data sets, the benefits are far smaller. While in the scenario with 100 samples per class a performance increase can -be seen for ... of data generation, it performs worse in the 10 -samples per class scenario than the baseline mode. -Dropout does seem to have negligible impact on its own in both the 10 -and 100 sample scenario. However in all scenarios the addition of -dropout to data generation seems to ... 
+be seen for with data generation, in the scenario with 10 samples per +class it performs worse than the baseline model. +Dropout does seem to have a negligible impact on its own in both the 10 +and 100 sample scenario. In all scenarios data generation seems to +benefit from the addition of dropout. Additional Figures and Tables for the same comparisons with different -performance metrics are given in Appendix ... -There it cam be seen that while the measures ... reduce overfitting +performance metrics are given in Appendix~\ref{app:comp} +There it can be seen that while the measures are able reduce overfitting effectively for the handwritten digits data set, the neural networks trained on the fashion data set overfit despite these measures being in place. @@ -1296,91 +1378,96 @@ in place. % measures greatly improves the accuracy for small datasets. However for % the smallest size of one datapoint per class generating more data % ... outperforms dropout with only a ... improvment being seen by the -% implementation of dropout whereas datageneration improves the accuracy +% implementation of dropout whereas data generation improves the accuracy % by... . On the other hand the implementation of dropout seems to % reduce the variance in the model accuracy, as the variance in accuracy % for the dropout model is less than .. while the variance of the -% datagen .. model is nearly the same. The model with datageneration +% datagen .. model is nearly the same. The model with data generation % ... a reduction in variance with the addition of dropout. % For the slightly larger training sets of ten samples per class the % difference between the two measures seems smaller. Here the % improvement in accuracy % seen by dropout is slightly larger than the one of -% datageneration. However for the larger sized training set the variance -% in test accuracies is lower for the model with datageneration than the +% data generation. However for the larger sized training set the variance +% in test accuracies is lower for the model with data generation than the % one with dropout. % The results for the training sets with 100 samples per class resemble % the ones for the sets with 10 per class. -Overall it seems that both measures can increase the performance of -a convolution neural network however the success is dependent on the problem. -For the handwritten digits the great result of data generation likely -stems from the .. As the digits are not rotated the same way or -aligned the same way in all ... using images that are altered in such -a way can help the network learn to recognize digits that are written -at a different slant. +Overall it seems that both measures are able increase the performance of +a convolutional neural network however, the success is dependent on the problem. +For the handwritten digits, the great result of data generation likely +stems from a large portion of the differences between two data points +of the same class being explainable by different positions, sizes or +slants. Which is what data generation emulates. + In the fashion data set however the alignment of all images are very -COHERENT and little to no difference between two data points of the -same class can be ... by rotation, shifts or shear ... +uniform with little to no differences in size or angle between +data points which might explain the worse performance of data generation. 
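To give an impression of the kind of variation data generation introduces,
the following sketch generates a few randomly altered versions of a single
MNIST digit. It assumes the \texttt{ImageDataGenerator} API of \texttt{tf.keras}
as used in Listings~\ref{lst:handwriting} and~\ref{lst:fashion}; the parameter
values are chosen for illustration only and are not the exact values used in
the experiments.
\begin{lstlisting}[language=iPython]
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# load a single digit and bring it into the
# (samples, height, width, channels) shape expected by the generator
(x_train, _), _ = tf.keras.datasets.mnist.load_data()
digit = x_train[:1].reshape(1, 28, 28, 1).astype("float32") / 255.0

# illustrative augmentation parameters: small rotations, shifts and shears
datagen = ImageDataGenerator(rotation_range = 15,
                             width_shift_range = 2,
                             height_shift_range = 2,
                             shear_range = 0.5,
                             fill_mode = 'constant', cval = 0)

# draw nine randomly altered versions of the same digit
flow = datagen.flow(digit, batch_size = 1)
augmented = [next(flow)[0] for _ in range(9)]
\end{lstlisting}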
\clearpage -\section{\titlecap{summary and outlook}} +\section{Summary and Outlook} -In this thesis we have taken a look at neural networks, their +In this thesis, we have taken a look at neural networks, their behavior in small scenarios and their application on image classification with limited datasets. -We have shown that ridge penalized neural networks ... to -slightly altered cubic smoothing splines, giving us an insight about -the behavior of the learned function of neural networks. +We have explored the relation between ridge penalized neural networks +and slightly altered cubic smoothing splines, giving us an insight +about the behavior of the learned function of neural networks. -We have seen that choosing the right training algorithm can have a +When comparing optimization algorithms, we have seen that choosing the +right training algorithm can have a drastic impact on the efficiency of training and quality of a model obtainable in a reasonable time frame. -The \textsc{Adam} algorithm has proven itself as best fit for the task -of classifying images. However there is ... ongoing research in -improving these algorithms, for example \textcite{rADAM} propose an -alteration to the \textsc{Adam} algorithm in order to make the -... term more stable in early phases of training. +The \textsc{Adam} algorithm has performed well in training the +convolutional neural networks. +However, there is ongoing research in further +improving these algorithms. For example, \textcite{rADAM} propose an +alteration to the \textsc{Adam} algorithm in order to reduce variance +of the learning rate in the early phases of training. We have seen that a convolutional network can benefit greatly from measures combating overfitting, especially if the available training sets are of -a small size. However the success of the measures we have examined -seem to be highly dependent on ... -... there is further research being done on the topic of combating -overfitting. +a small size. The success of the measures we have examined +seems to be highly dependent on the use case and further research is +being done on the topic of combating overfitting in neural networks. \textcite{random_erasing} propose randomly erasing parts of the inputs -images during training and are able to achieve high a high accuracy on the fashion MNIST -data set this way (96,35\%). +images during training and are able to achieve a high accuracy of 96,35\% on the fashion MNIST +data set this way. While data generation explored in this thesis is able to rudimentary -generate new training data there is ... in using more elaborate methods -to enlagre the training set. -\textcite{gan} explore the application of generative adversarial -networks in order to ... for medical images with small ... -These networks ... in order to generate completely new images -... (cf. \textcite{goodfellow_gan}). - -Convolutional neural networks are able to achieve remarkable results -and with further improvements and ... will find further applications -and is a staple here to stay. +generate new training data, further research is being done in more +elaborate ways +to enlarge the training set. +\textcite{gan} explore the usage of generative adversarial +networks to generate training images for the task of +classifying liver lesions. +These networks are trained to generate new images from +random noise, ideally resulting in completely new data that can be used +in training (cf. \textcite{goodfellow_gan}). 
+ +Overall, convolutional neural networks are able to achieve remarkable +results in many use cases +and are a staple here to stay. -\begin{itemize} - \item generate more data, GAN etc \textcite{gan} - \item Transfer learning, use network trained on different task and - repurpose it / train it with the training data \textcite{transfer_learning} - \item random erasing fashion mnist 96.35\% accuracy - \textcite{random_erasing} - \item However the \textsc{Adam} algorithm can have problems with high -variance of the adaptive learning rate early in training. -\textcite{rADAM} try to address these issues with the Rectified Adam -algorithm -\end{itemize} +% \begin{itemize} +% \item generate more data, GAN etc \textcite{gan} +% \item Transfer learning, use network trained on different task and +% repurpose it / train it with the training data \textcite{transfer_learning} +% \item random erasing fashion MNIST 96.35\% accuracy +% \textcite{random_erasing} +% \item However the \textsc{Adam} algorithm can have problems with high +% variance of the adaptive learning rate early in training. +% \textcite{rADAM} try to address these issues with the Rectified Adam +% \item error measure: Robust error measure for supervised neural network learning with outliers +% algorithm +% \end{itemize} diff --git a/TeX/introduction.tex b/TeX/introduction.tex index 7b014a7..3a94c08 100644 --- a/TeX/introduction.tex +++ b/TeX/introduction.tex @@ -1,22 +1,74 @@ \section{Introduction} -Neural networks have become a widely used model as they are relatively -easy to build with modern frameworks like tensorflow and are able to -model complex data. -In this thesis we will .. networks .. - -In order to get some understanding about the behavior of the learned -function of neural networks we examine the convergence behavior for -.... - -An interesting application of neural networks is the application to -image classification tasks. We ... impact of ... on the performance of -a neural network in such a task. - -As in some applications such as medical imaging one might be limited -to very small training data we study the impact of two measures in -improving the accuracy in such a case by trying to ... the model from -overfitting the data. +Neural networks have become a widely used model for a plethora of +applications. +They are an attractive choice as they are able to +model complex data with relatively little additional input to the +training data needed. +Additionally, as the price of parallelized computing +power in the form of graphics processing unit has decreased drastically over the last +years, it has become far more accessible to train and use large +neural networks. +Furthermore, highly optimized and parallelized frameworks for tensor +operations have been developed. +With these frameworks, such as TensorFlow and PyTorch, building neural +networks as become a much more straightforward process. +% Furthermore, with the development of highly optimized and +% parallelized implementations of mathematical operations needed for +% neural networks, such as TensorFlow or PyTorch, building neural network +% models has become a much more straightforward process. +% For example the flagship consumer GPU GeForce RTX 3080 of NVIDIA's current +% generation has 5.888 CUDS cores at a ... price of 799 Euro compared +% to the last generations flagship GeForce RTX 2080 Ti with 4352 CUDA +% cores at a ... price of 1259 Euro. 
These CUDA cores are computing +% cores specialized for tensor operations, which are necessary in +% fitting and using neural networks. + +In this thesis we want to get an understanding of the behavior of neural % +networks and +how we can use them for problems with a complex relationship between +in and output. +In Section 2 we introduce the mathematical construct of neural +networks and how to fit them to training data. + +To gain some insight about the learned function, +we examine a simple class of neural networks that only contain one +hidden layer. +In Section~\ref{sec:shallownn} we proof a relation between such networks and +functions that minimize the distance to training data +with respect to its second derivative. + +An interesting application of neural networks is the task of +classifying images. +However, for such complex problems the number of parameters in fully +connected neural networks can exceed what is +feasible for training. +In Section~\ref{sec:cnn} we explore the addition of convolution to neural +networks to reduce the number of parameters. + +As these large networks are commonly trained using gradient decent +algorithms we compare the performance of different algorithms based on +gradient descent in Section~4.4. +% and +% show that it is beneficial to only use small subsets of the training +% data in each iteration rather than using the whole data set to update +% the parameters. +Most statistical models especially these with large amounts of +trainable parameter can struggle with overfitting the data. +In Section 4.5 we examine the impact of two measures designed to combat +overfitting. + +In some applications such as working with medical images the data +available for training can be scarce, which results in the networks +being prone to overfitting. +As these are interesting applications of neural networks we examine +the benefit of the measures to combat overfitting for +scenarios with limited amounts of training data. + +% As in some applications such as medical imaging one might be limited +% to very small training data we study the impact of two measures in +% improving the accuracy in such a case by trying to ... the model from +% overfitting the data. diff --git a/TeX/introduction_nn.tex b/TeX/introduction_nn.tex index a39a58c..86788aa 100644 --- a/TeX/introduction_nn.tex +++ b/TeX/introduction_nn.tex @@ -1,23 +1,26 @@ -\section{\titlecap{Introduction to Neural Networks}} +\section{Introduction to Neural Networks} This chapter is based on \textcite[Chapter~6]{Goodfellow} and \textcite{Haykin}. -Neural Networks (NN) are a mathematical construct inspired by the -structure of brains in mammals. It consists of an array of neurons that +Neural Networks are a mathematical construct inspired by the +structure of brains in mammals. They consist of an array of neurons that receive inputs and compute an accumulated output. These neurons are arranged in layers, with one input and output layer -and a arbirtary -amount of hidden layer between them. -The amount of neurons in the in- and output layers correspond to the +and an arbitrary +amount of hidden layers between them. +The number of neurons in the in- and output layers correspond to the desired dimensions of in- and outputs of the model. -In conventional neural networks the information is fed forward from the -input layer towards the output layer hence they are often called feed -forward networks. 
Each neuron in a layer has the outputs of all -neurons in the preceding layer as input and computes a accumulated -value from these (fully connected). A -illustration of an example neuronal network is given in -Figure~\ref{fig:nn} and one of a neuron in Figure~\ref{fig:neuron}. + +In conventional neural networks, the information is fed forward from the +input layer towards the output layer, hence they are often called +feed forward networks. Each neuron in a layer has the outputs of all +neurons in the preceding layer as input and computes an accumulated +value from these (fully connected). +% An illustration of an example neural network is given in +% Figure~\ref{fig:nn} and one of a neuron in Figure~\ref{fig:neuron}. +Illustrations of a neural network and the structure of a neuron are given +in Figure~\ref{fig:nn} and Figure~\ref{fig:neuron}. \tikzset{% every neuron/.style={ @@ -88,71 +91,71 @@ Figure~\ref{fig:nn} and one of a neuron in Figure~\ref{fig:neuron}. \node[fill=white,scale=1.5,inner xsep=10pt,inner ysep=10mm] at ($(hidden1-1)!.5!(hidden2-2)$) {$\dots$}; \end{tikzpicture}}%} - \caption[Illustration of a neural network]{Illustration of a neural network with $d_i$ inputs, $l$ + \caption[Illustration of a Neural Network]{Illustration of a neural network with $d_i$ inputs, $l$ hidden layers with $n_{\cdot}$ nodes in each layer, as well as $d_o$ outputs. } \label{fig:nn} \end{figure} -\subsection{\titlecap{nonlinearity of neural networks}} +\subsection{Nonlinearity of Neural Networks} -The arguably most important feature of neural networks that sets them +The arguably most important feature of neural networks which sets them apart from linear models is the activation function implemented in the -neurons. As seen in Figure~\ref{fig:neuron} on the weighted sum of the -inputs a activation function $\sigma$ is applied resulting in the +neurons. As illustrated in Figure~\ref{fig:neuron} on the weighted sum of the +inputs an activation function $\sigma$ is applied resulting in the output of the $k$-th neuron in a layer $l$ with $m$ nodes in layer $l-1$ being given by \begin{align*} o_{l,k} = \sigma\left(b_{l,k} + \sum_{j=1}^{m} w_{l,k,j} - o_{l-1,j}\right) + o_{l-1,j}\right), \end{align*} for weights $w_{l,k,j}$ and biases $b_{l,k}$. For a network with $L$ hidden layers and inputs $o_{0}$ the final outputs of the network are thus given by $o_{L+1}$. The activation function is usually chosen nonlinear (a linear one -would result in the entire model collapsing into a linear one\todo{beweis?}) which +would result in the entire network collapsing into a linear model) which allows it to better model data where the relation of in- and output is of nonlinear nature. There are two types of activation functions, saturating and not saturating ones. Popular examples for the former are sigmoid -functions where most commonly the standard logisitc function or tangens +functions where most commonly the standard logistic function or tangens hyperbolicus are used -as they have easy to compute derivatives which is desirable for gradient -based optimization algorithms. The standard logistic function (often -referred to simply as sigmoid function) is given by +as they have easy to compute derivatives which is desirable for +gradient-based optimization algorithms. The standard logistic function +(often simply referred to as sigmoid function) is given by \[ f(x) = \frac{1}{1+e^{-x}} \] -and has a realm of $[0,1]$. 
Its usage as an activation function is -motivated by modeling neurons which -are close to deactive until a certain threshold is hit and then grow in -intensity until they are fully -active. This is similar to the behavior of neurons in -brains\todo{besser schreiben}. The tangens hyperbolicus is given by +and has a realm of $[0,1]$. The tangens hyperbolicus is given by \[ \tanh(x) = \frac{2}{e^{2x}+1} \] -and has a realm of $[-1,1]$. -The downside of these saturating activation functions is that given -their saturating nature their derivatives are close to zero for large or small -input values. This can slow or hinder the progress of gradient based methods. - -The nonsaturating activation functions commonly used are the recified +and has a realm of $[-1,1]$. Both functions result in neurons that are +close to inactive until a certain threshold is reached where they grow +until saturation. +The downside of these saturating activation functions is, that their +derivatives are close to zero on most of their realm, only assuming +larger values in proximity to zero. +This can hinder the progress of gradient-based methods. + +The nonsaturating activation functions commonly used are the rectified linear unit (ReLU) or the leaky ReLU. The ReLU is given by -\[ +\begin{equation} r(x) = \max\left\{0, x\right\}. -\] + \label{eq:relu} +\end{equation} This has the benefit of having a constant derivative for values larger -than zero. However the derivative being zero for negative values has -the same downside for -fitting the model with gradient based methods. The leaky ReLU is +than zero. However, the derivative being zero for negative values has +the same downside for +fitting the model with gradient-based methods. The leaky ReLU is an attempt to counteract this problem by assigning a small constant -derivative to all values smaller than zero and for scalar $\alpha$ is given by +derivative to all values smaller than zero and for a scalar $\alpha$ is given by \[ l(x) = \max\left\{0, x\right\} + \alpha \min \left\{0, x\right\}. \] -In order to illustrate these functions plots of them are given in Figure~\ref{fig:activation}. +In Figure~\ref{fig:activation} visualizations of these functions are given. +%In order to illustrate these functions plots of them are given in Figure~\ref{fig:activation}. 
\begin{figure} @@ -238,7 +241,7 @@ In order to illustrate these functions plots of them are given in Figure~\ref{fi % \draw [->] (hidden-\i) -- (output-\j); \end{tikzpicture} - \caption{Structure of a single neuron} + \caption[Structure of a Single Neuron]{Structure of a single neuron.} \label{fig:neuron} \end{figure} @@ -251,7 +254,7 @@ In order to illustrate these functions plots of them are given in Figure~\ref{fi \addplot [domain=-5:5, samples=101,unbounded coords=jump]{1/(1+exp(-x)}; \end{axis} \end{tikzpicture} - \caption{\titlecap{standard logistic function}} + \caption{Standard Logistic Function} \end{subfigure} \begin{subfigure}{.45\linewidth} \centering @@ -260,7 +263,7 @@ In order to illustrate these functions plots of them are given in Figure~\ref{fi \addplot[domain=-5:5, samples=100]{tanh(x)}; \end{axis} \end{tikzpicture} - \caption{\titlecap{tangens hyperbolicus}} + \caption{Tangens Hyperbolicus} \end{subfigure} \begin{subfigure}{.45\linewidth} \centering @@ -282,7 +285,7 @@ In order to illustrate these functions plots of them are given in Figure~\ref{fi \end{tikzpicture} \caption{Leaky ReLU, $\alpha = 0.1$} \end{subfigure} - \caption{Plots of the activation functions} + \caption[Plots of the Activation Functions]{Plots of the activation functions.} \label{fig:activation} \end{figure} @@ -291,9 +294,9 @@ In order to illustrate these functions plots of them are given in Figure~\ref{fi As neural networks are a parametric model we need to fit the parameters to the input -data in order to get meaningful results from the network. To be able -do this we first need to discuss how we interpret the output of the -neural network. +data to get meaningful predictions from the network. In order +to accomplish this we need to discuss how we interpret the output of the +neural network and assess the quality of predictions. % After a neural network model is designed, like most statistical models % it has to be fit to the data. In the machine learning context this is @@ -311,20 +314,20 @@ neural network. % data-point in fitting the model, where usually some distance between % the model output and the labels is minimized. -\subsubsection{\titlecap{nonliniarity in the last layer}} +\subsubsection{Nonlinearity in the Last Layer} -Given the nature of the neural net the outputs of the last layer are -real numbers. For regression tasks this is desirable, for +Given the nature of the neural net, the outputs of the last layer are +real numbers. For regression tasks, this is desirable, for classification problems however some transformations might be necessary. As the goal in the latter is to predict a certain class or classes for -an object the output needs to be of a form that allows this +an object, the output needs to be of a form that allows this interpretation. Commonly the nodes in the output layer each correspond to a class and the class chosen as prediction is the one with the highest value at the corresponding output node. -This corresponds to a transformation of the output -vector $o$ into a one-hot vector +This can be modeled as a transformation of the output +vector $o \in \mathbb{R}^n$ into a one-hot vector \[ \text{pred}_i = \begin{cases} @@ -332,9 +335,9 @@ vector $o$ into a one-hot vector 0,& \text{else}. \end{cases} \] -This however makes training the model with gradient based methods impossible, as the derivative of +This however makes training the model with gradient-based methods impossible, as the derivative of the transformation is either zero or undefined. 
-A continuous transformation that is close to the argmax one is given by +An continuous transformation that is close to argmax is given by softmax \begin{equation} \text{softmax}(o)_i = \frac{e^{o_i}}{\sum_j e^{o_j}}. @@ -342,10 +345,10 @@ softmax \end{equation} The softmax function transforms the realm of the output to the interval $[0,1]$ and the individual values sum to one, thus the output can be interpreted as -a probability for each class given the input. -Additionally to being differentiable this allows for evaluataing the -cetainiy of a prediction, rather than just whether it is accurate. -A similar effect is obtained when for a binary or two class problem the +a probability for each class conditioned on the input. +Additionally, to being differentiable this allows to evaluate the +certainty of a prediction, rather than just whether it is accurate. +A similar effect is obtained when for a binary or two-class problem the sigmoid function \[ f(x) = \frac{1}{1 + e^{-x}} @@ -353,7 +356,6 @@ sigmoid function is used and the output $f(x)$ is interpreted as the probability for the first class and $1-f(x)$ for the second class. -\todo{vielleicht additiv invarianz} % Another property that makes softmax attractive is the invariance to addition % \[ % \text{sofmax}(o) = \text{softmax}(o + c @@ -389,26 +391,26 @@ the first class and $1-f(x)$ for the second class. % way to circumvent this problem is to normalize the output vector is % such a way that the entries add up to one, this allows for the % interpretation of probabilities assigned to each class. - +\clearpage \subsubsection{Error Measurement} -In order to train the network we need to be able to make an assessment -about the quality of predictions using some error measure. +In order to train the network we need to be able to assess the quality +of predictions using some error measure. The choice of the error -function is highly dependent on the type of the problem. For -regression problems a commonly used error measure is the mean squared +function is highly dependent on the type of problem. For +regression problems, a commonly used error measure is the mean squared error (MSE) -which for a function $f$ and data $(x_i,y_i), i=1,\dots,n$ is given by +which for a function $f$ and data $(x_i,y_i), i \in \left\{1,\dots,n\right\}$ is given by \[ MSE(f) = \frac{1}{n} \sum_i^n \left(f(x_i) - y_i\right)^2. \] -However depending on the problem error measures with different -properties might be needed, for example in some contexts it is +However, depending on the problem error measures with different +properties might be needed. For example in some contexts it is required to consider a proportional rather than absolute error. As discussed above the output of a neural network for a classification problem can be interpreted as a probability distribution over the classes -conditioned on the input. In this case it is desirable to +conditioned on the input. In this case, it is desirable to use error functions designed to compare probability distributions. A widespread error function for this use case is the categorical cross entropy (\textcite{PRML}), which for two discrete distributions $p, q$ with the same realm $C$ is given by @@ -416,33 +418,35 @@ which for two discrete distributions $p, q$ with the same realm $C$ is given by H(p, q) = \sum_{c \in C} p(c) \ln\left(\frac{1}{q(c)}\right), \] comparing $q$ to a target density $p$. 
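As a small numerical illustration of these transformations, the following
sketch computes the one-hot prediction, the softmax probabilities and the
cross entropy between the softmax output and a one-hot target for a single
output vector. The shift by the maximum inside the softmax and the small
constant in the logarithm are common numerical safeguards and not part of the
definitions above.
\begin{lstlisting}[language=iPython]
import numpy as np

def softmax(o):
    # softmax transformation of an output vector o
    e = np.exp(o - np.max(o))      # shift for numerical stability
    return e / np.sum(e)

def cross_entropy(p, q, eps = 1e-12):
    # categorical cross entropy H(p, q) of two discrete distributions
    return np.sum(p * np.log(1.0 / (q + eps)))

o = np.array([2.0, 1.0, -1.0])         # raw outputs for three classes
pred = (o == o.max()).astype(float)    # one-hot argmax prediction
q = softmax(o)                         # predicted class probabilities
p = np.array([1.0, 0.0, 0.0])          # one-hot target

print(pred, q, cross_entropy(p, q))
\end{lstlisting}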
-For a data set $(x_i,y_i), i = 1,\dots,n$ where each $y_{i,c}$ -corresponds to the probability of class $c$ given $x_i$ and predictor +For a data set $(x_i,y_i), i \in \left\{1,\dots,n\right\}$ where each $y_{i,c}$ +corresponds to the probability of class $c$ given $x_i$ and a predictor $f$ we get the loss function \begin{equation} CE(f) = \sum_{i=1}^n H(y_i, f(x_i)). \label{eq:cross_entropy} \end{equation} -\todo{Den satz einbauen} --Maximum Likelihood --Ableitung mit softmax pseudo linear -> fast improvemtns possible +% \todo{Den satz einbauen} +% -Maximum Likelihood +% -Ableitung mit softmax pseudo linear -> fast improvemtns possible \subsubsection{Gradient Descent Algorithm} Trying to find the optimal parameter for fitting the model to the data can be a hard problem. Given the complex nature of a neural network -with many layers and neurons it is hard to predict the impact of +with many layers and neurons, it is hard to predict the impact of single parameters on the accuracy of the output. Thus using numeric optimization algorithms is the only -feasible way to fit the model. A attractive algorithm for training -neural networks is gradient descent where each parameter -$\theta_i$\todo{parameter name?} is -iterative changed according to the gradient regarding the error -measure and a step size $\gamma$. For this all parameters are -initialized (often random or close to zero) and then iteratively -updated until a certain stopping criterion is hit, mostly either being a fixed -number of iterations or a desired upper limit for the error measure. +feasible way to fit the model. + +An attractive algorithm for training +neural networks is gradient descent. Here all parameters are +initialized with certain values (often random or close to zero) and +then iteratively updated. The updates are made in the direction of the +gradient regarding the error with a step size $\gamma$ until a +specified stopping criterion is hit. +% This mostly either being a fixed +% number of iterations or a desired upper limit for the error measure. % For a function $f_\theta$ with parameters $\theta \in \mathbb{R}^n$ % and a error function $L(f_\theta)$ the gradient descent algorithm is % given in \ref{alg:gd}. @@ -465,21 +469,21 @@ number of iterations or a desired upper limit for the error measure. The algorithm for gradient descent is given in Algorithm~\ref{alg:gd}. In the context of fitting a neural network -$f_\theta$ corresponds to a error measurement of a neural network +$f_\theta$ corresponds to an error measurement of a neural network $\mathcal{NN}_{\theta}$ where $\theta$ is a vector containing all the weights and biases of the network. -As can be seen this requires computing the derivative of the network +As can be seen, this requires computing the derivative of the network with regard to each variable. With the number of variables getting large in networks with multiple layers of high neuron count naively computing the derivatives can get quite memory and computational expensive. By using the chain rule and exploiting the layered structure we can -compute the parameter update much more efficiently, this practice is -called backpropagation and was introduced by -\textcite{backprop}\todo{nachsehen ob richtige quelle}. The algorithm +compute the parameter update much more efficiently. This practice is +called backpropagation and was introduced for use in neural networks by +\textcite{backprop}. 
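Before the backpropagation algorithm is stated, the following sketch
illustrates the plain parameter update of Algorithm~\ref{alg:gd} on a model
that is simple enough for the gradient of the mean squared error to be written
down directly; the data and hyperparameters are chosen purely for
illustration.
\begin{lstlisting}[language=iPython]
import numpy as np

rng = np.random.default_rng(0)
x = rng.uniform(0, 1, 50)
y = 2.0 * x + 1.0 + rng.normal(0, 0.1, 50)   # noisy linear data

theta = np.zeros(2)        # initialization close to zero
gamma = 0.1                # step size

for _ in range(1000):      # fixed number of iterations as stopping criterion
    residual = theta[0] + theta[1] * x - y
    gradient = np.array([2 * residual.mean(),
                         2 * (residual * x).mean()])
    theta = theta - gamma * gradient     # gradient descent update

print(theta)               # approximately (1, 2)
\end{lstlisting}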
The algorithm for one data point is given in Algorithm~\ref{alg:backprop}, but for all error functions that are sums of errors for single data points (MSE, cross -entropy) backpropagation works analogous for larger training data. +entropy) backpropagation works analogously for larger training data. % \subsubsection{Backpropagation} @@ -496,8 +500,9 @@ entropy) backpropagation works analogous for larger training data. \begin{algorithm}[H] \SetAlgoLined \KwInput{Inputs $o_0$, neural network - with $L$ hidden layers and weights $w$ and biases $b$ for $n_l$ - nodes and activation function $\sigma_l$ in layer $l$, loss $\tilde{L}$.} + with $L$ hidden layers, weights $w$, and biases $b$ for $n_l$ + nodes as well as an activation function $\sigma_l$ in layer $l$ + and loss function $\tilde{L}$.} Forward Propagation: \For{$l \in \left\{1, \dots, L+1\right\}$}{ Compute values for layer $l$: diff --git a/TeX/main.lot b/TeX/main.lot index 79504e6..9b9092a 100644 --- a/TeX/main.lot +++ b/TeX/main.lot @@ -1,8 +1,6 @@ \boolfalse {citerequest}\boolfalse {citetracker}\boolfalse {pagetracker}\boolfalse {backtracker}\relax \babel@toc {english}{} \defcounter {refsection}{0}\relax -\contentsline {table}{\numberline {4.1}{\ignorespaces Performance metrics of the networks trained in Figure~\ref {fig:sgd_vs_gd} after 20 training epochs.\relax }}{29}{table.caption.32}% +\contentsline {table}{\numberline {4.1}{\ignorespaces Values of Test Accuracies for Models Trained on Subsets of MNIST Handwritten Digits}}{41}% \defcounter {refsection}{0}\relax -\contentsline {table}{\numberline {4.2}{\ignorespaces Values of the test accuracy of the model trained 10 times on random MNIST handwriting training sets containing 1, 10 and 100 data points per class after 125 epochs. The mean accuracy achieved for the full set employing both overfitting measures is \relax }}{42}{table.4.2}% -\defcounter {refsection}{0}\relax -\contentsline {table}{\numberline {4.3}{\ignorespaces Values of the test accuracy of the model trained 10 times on random fashion MNIST training sets containing 1, 10 and 100 data points per class after 125 epochs. 
The mean accuracy achieved for the full set employing both overfitting measures is \relax }}{42}{table.4.3}% +\contentsline {table}{\numberline {4.2}{\ignorespaces Values of Test Accuracies for Models Trained on Subsets of Fashion MNIST}}{41}% diff --git a/TeX/main.out b/TeX/main.out new file mode 100644 index 0000000..f4c1fe5 --- /dev/null +++ b/TeX/main.out @@ -0,0 +1,25 @@ +\BOOKMARK [1][-]{section.1}{Introduction}{}% 1 +\BOOKMARK [1][-]{section.2}{Introduction to Neural Networks}{}% 2 +\BOOKMARK [2][-]{subsection.2.1}{Nonlinearity of Neural Networks}{section.2}% 3 +\BOOKMARK [2][-]{subsection.2.2}{Training Neural Networks}{section.2}% 4 +\BOOKMARK [3][-]{subsubsection.2.2.1}{Nonlinearity in the Last Layer}{subsection.2.2}% 5 +\BOOKMARK [3][-]{subsubsection.2.2.2}{Error Measurement}{subsection.2.2}% 6 +\BOOKMARK [3][-]{subsubsection.2.2.3}{Gradient Descent Algorithm}{subsection.2.2}% 7 +\BOOKMARK [1][-]{section.3}{Shallow Neural Networks}{}% 8 +\BOOKMARK [2][-]{subsection.3.1}{Convergence Behavior of One-Dimensional Randomized Shallow Neural Networks}{section.3}% 9 +\BOOKMARK [2][-]{subsection.3.2}{Simulations}{section.3}% 10 +\BOOKMARK [1][-]{section.4}{Application of Neural Networks to Higher Complexity Problems}{}% 11 +\BOOKMARK [2][-]{subsection.4.1}{Convolution}{section.4}% 12 +\BOOKMARK [2][-]{subsection.4.2}{Convolutional Neural Networks}{section.4}% 13 +\BOOKMARK [2][-]{subsection.4.3}{Stochastic Training Algorithms}{section.4}% 14 +\BOOKMARK [2][-]{subsection.4.4}{Modified Stochastic Gradient Descent}{section.4}% 15 +\BOOKMARK [2][-]{subsection.4.5}{Combating Overfitting}{section.4}% 16 +\BOOKMARK [3][-]{subsubsection.4.5.1}{Dropout}{subsection.4.5}% 17 +\BOOKMARK [3][-]{subsubsection.4.5.2}{Manipulation of Input Data}{subsection.4.5}% 18 +\BOOKMARK [3][-]{subsubsection.4.5.3}{Comparisons}{subsection.4.5}% 19 +\BOOKMARK [3][-]{subsubsection.4.5.4}{Effectiveness for Small Training Sets}{subsection.4.5}% 20 +\BOOKMARK [1][-]{section.5}{Summary and Outlook}{}% 21 +\BOOKMARK [1][-]{section*.28}{Appendices}{}% 22 +\BOOKMARK [1][-]{Appendix.a.A}{Notes on Proofs of Lemmata in Section 3.1}{}% 23 +\BOOKMARK [1][-]{Appendix.a.B}{Implementations}{}% 24 +\BOOKMARK [1][-]{Appendix.a.C}{Additional Comparisons}{}% 25 diff --git a/TeX/main.tex b/TeX/main.tex index 6427570..7512ceb 100644 --- a/TeX/main.tex +++ b/TeX/main.tex @@ -1,4 +1,4 @@ -\documentclass[a4paper, 12pt, draft=true]{article} +\documentclass[a4paper, 12pt]{article} %\usepackage[margin=1in]{geometry} %\geometry{a4paper, left=30mm, right=40mm,top=25mm, bottom=20mm} @@ -34,17 +34,17 @@ \usepackage{todonotes} \usepackage{lipsum} \usepackage[ruled,vlined]{algorithm2e} -\usepackage{showframe} +%\usepackage{showframe} \usepackage[protrusion=true, expansion=true, kerning=true, letterspace = 150]{microtype} -\usepackage{titlecaps} +%\usepackage{titlecaps} \usepackage{afterpage} \usepackage{xcolor} \usepackage{chngcntr} -\usepackage{hyperref} -\hypersetup{ - linktoc=all, %set to all if you want both sections and subsections linked -} +%\usepackage{hyperref} +% \hypersetup{ +% linktoc=all, %set to all if you want both sections and subsections linked +% } \allowdisplaybreaks \captionsetup[sub]{justification=centering} @@ -245,8 +245,7 @@ \begin{center} \vspace{1cm} - \huge \textbf{\titlecap{neural networks and their application on - higher complexity problems}}\\ + \huge \textbf{Neural Networks and their Application on Higher Complexity Problems}\\ \vspace{1cm} \huge \textbf{Tim Tobias Arndt}\\ \vspace{1cm} @@ -260,7 +259,6 @@ \clearpage 
\listoffigures \listoftables -\listoftodos \newpage \pagenumbering{arabic} % Introduction @@ -288,413 +286,6 @@ % Appendix A \input{appendixA.tex} -\section{\titlecap{additional comparisons}} -In this section we show additional comparisons for the neural networks -trained in Section~\ref{...}. In ... the same comparisons given for -the test accuracy are given for the cross entropy loss on the test -set, as well as on the training set. - - -\begin{figure}[h] - \centering - \small - \begin{subfigure}[h]{\textwidth} - \begin{tikzpicture} - \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed, - /pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth, - height = 0.4\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east}, - xlabel = {epoch},ylabel = {Test Accuracy}, cycle - list/Dark2, every axis plot/.append style={line width - =1.25pt}] - \addplot table - [x=epoch, y=val_loss, col sep=comma, mark = none] - {Figures/Data/adam_1.mean}; - \addplot table - [x=epoch, y=val_loss, col sep=comma, mark = none] - {Figures/Data/adam_dropout_02_1.mean}; - \addplot table - [x=epoch, y=val_loss, col sep=comma, mark = none] - {Figures/Data/adam_datagen_1.mean}; - \addplot table - [x=epoch, y=val_loss, col sep=comma, mark = none] - {Figures/Data/adam_datagen_dropout_02_1.mean}; - - - \addlegendentry{\footnotesize{Default}} - \addlegendentry{\footnotesize{D. 0.2}} - \addlegendentry{\footnotesize{G.}} - \addlegendentry{\footnotesize{G. + D. 0.2}} - \addlegendentry{\footnotesize{D. 0.4}} - \addlegendentry{\footnotesize{Default}} - \end{axis} - \end{tikzpicture} - \caption{1 sample per class} - \vspace{0.25cm} - \end{subfigure} - \begin{subfigure}[h]{\textwidth} - \begin{tikzpicture} - \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed, - /pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth, - height = 0.4\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east}, - xlabel = {epoch},ylabel = {Test Accuracy}, cycle - list/Dark2, every axis plot/.append style={line width - =1.25pt}] - \addplot table - [x=epoch, y=val_loss, col sep=comma, mark = none] - {Figures/Data/adam_dropout_00_10.mean}; - \addplot table - [x=epoch, y=val_loss, col sep=comma, mark = none] - {Figures/Data/adam_dropout_02_10.mean}; - \addplot table - [x=epoch, y=val_loss, col sep=comma, mark = none] - {Figures/Data/adam_datagen_dropout_00_10.mean}; - \addplot table - [x=epoch, y=val_loss, col sep=comma, mark = none] - {Figures/Data/adam_datagen_dropout_02_10.mean}; - - - \addlegendentry{\footnotesize{Default.}} - \addlegendentry{\footnotesize{D. 0.2}} - \addlegendentry{\footnotesize{G.}} - \addlegendentry{\footnotesize{G + D. 
0.2}} - \end{axis} - \end{tikzpicture} - \caption{10 samples per class} - \end{subfigure} - \begin{subfigure}[h]{\textwidth} - \begin{tikzpicture} - \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed, - /pgf/number format/precision=3},tick style = {draw = none}, width = 0.9875\textwidth, - height = 0.4\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east}, - xlabel = {epoch}, ylabel = {Test Accuracy}, cycle - list/Dark2, every axis plot/.append style={line width - =1.25pt}] - \addplot table - [x=epoch, y=val_loss, col sep=comma, mark = none] - {Figures/Data/adam_dropout_00_100.mean}; - \addplot table - [x=epoch, y=val_loss, col sep=comma, mark = none] - {Figures/Data/adam_dropout_02_100.mean}; - \addplot table - [x=epoch, y=val_loss, col sep=comma, mark = none] - {Figures/Data/adam_datagen_dropout_00_100.mean}; - \addplot table - [x=epoch, y=val_loss, col sep=comma, mark = none] - {Figures/Data/adam_datagen_dropout_02_100.mean}; - - \addlegendentry{\footnotesize{Default.}} - \addlegendentry{\footnotesize{D. 0.2}} - \addlegendentry{\footnotesize{G.}} - \addlegendentry{\footnotesize{G + D. 0.2}} - \end{axis} - \end{tikzpicture} - \caption{100 samples per class} - \vspace{.25cm} - \end{subfigure} - \caption{Mean test accuracies of the models fitting the sampled MNIST - handwriting datasets over the 125 epochs of training.} -\end{figure} - -\begin{figure}[h] - \centering - \small - \begin{subfigure}[h]{\textwidth} - \begin{tikzpicture} - \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed, - /pgf/number format/precision=3},tick style = - {draw = none}, width = \textwidth, - height = 0.4\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east}, - xlabel = {epoch},ylabel = {Test Accuracy}, cycle - list/Dark2, every axis plot/.append style={line width - =1.25pt}] - \addplot table - [x=epoch, y=val_loss, col sep=comma, mark = none] - {Figures/Data/fashion_dropout_0_1.mean}; - \addplot table - [x=epoch, y=val_loss, col sep=comma, mark = none] - {Figures/Data/fashion_dropout_2_1.mean}; - \addplot table - [x=epoch, y=val_loss, col sep=comma, mark = none] - {Figures/Data/fashion_datagen_dropout_0_1.mean}; - \addplot table - [x=epoch, y=val_loss, col sep=comma, mark = none] - {Figures/Data/fashion_datagen_dropout_2_1.mean}; - - - \addlegendentry{\footnotesize{Default}} - \addlegendentry{\footnotesize{D. 0.2}} - \addlegendentry{\footnotesize{G.}} - \addlegendentry{\footnotesize{G. + D. 0.2}} - \addlegendentry{\footnotesize{D. 
0.4}} - \end{axis} - \end{tikzpicture} - \caption{1 sample per class} - \vspace{0.25cm} - \end{subfigure} - \begin{subfigure}[h]{\textwidth} - \begin{tikzpicture} - \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed, - /pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth, - height = 0.4\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east}, - xlabel = {epoch},ylabel = {Test Accuracy}, cycle - list/Dark2, every axis plot/.append style={line width - =1.25pt}, ymin = {0.62}] - \addplot table - [x=epoch, y=val_loss, col sep=comma, mark = none] - {Figures/Data/fashion_dropout_0_10.mean}; - \addplot table - [x=epoch, y=val_loss, col sep=comma, mark = none] - {Figures/Data/fashion_dropout_2_10.mean}; - \addplot table - [x=epoch, y=val_loss, col sep=comma, mark = none] - {Figures/Data/fashion_datagen_dropout_0_10.mean}; - \addplot table - [x=epoch, y=val_loss, col sep=comma, mark = none] - {Figures/Data/fashion_datagen_dropout_2_10.mean}; - - - \addlegendentry{\footnotesize{Default.}} - \addlegendentry{\footnotesize{D. 0.2}} - \addlegendentry{\footnotesize{G.}} - \addlegendentry{\footnotesize{G + D. 0.2}} - \end{axis} - \end{tikzpicture} - \caption{10 samples per class} - \end{subfigure} - \begin{subfigure}[h]{\textwidth} - \begin{tikzpicture} - \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed, - /pgf/number format/precision=3},tick style = {draw = none}, width = 0.9875\textwidth, - height = 0.4\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east}, - xlabel = {epoch}, ylabel = {Test Accuracy}, cycle - list/Dark2, every axis plot/.append style={line width - =1.25pt}] - \addplot table - [x=epoch, y=val_loss, col sep=comma, mark = none] - {Figures/Data/fashion_dropout_0_100.mean}; - \addplot table - [x=epoch, y=val_loss, col sep=comma, mark = none] - {Figures/Data/fashion_dropout_2_100.mean}; - \addplot table - [x=epoch, y=val_loss, col sep=comma, mark = none] - {Figures/Data/fashion_datagen_dropout_0_100.mean}; - \addplot table - [x=epoch, y=val_loss, col sep=comma, mark = none] - {Figures/Data/fashion_datagen_dropout_2_100.mean}; - - \addlegendentry{\footnotesize{Default.}} - \addlegendentry{\footnotesize{D. 0.2}} - \addlegendentry{\footnotesize{G.}} - \addlegendentry{\footnotesize{G + D. 0.2}} - \end{axis} - \end{tikzpicture} - \caption{100 samples per class} - \vspace{.25cm} - \end{subfigure} - \caption{Mean test accuracies of the models fitting the sampled fashion MNIST - over the 125 epochs of training.} -\end{figure} - -\begin{figure}[h] - \centering - \small - \begin{subfigure}[h]{\textwidth} - \begin{tikzpicture} - \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed, - /pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth, - height = 0.4\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east}, - xlabel = {epoch},ylabel = {Test Accuracy}, cycle - list/Dark2, every axis plot/.append style={line width - =1.25pt}] - \addplot table - [x=epoch, y=accuracy, col sep=comma, mark = none] - {Figures/Data/adam_1.mean}; - \addplot table - [x=epoch, y=accuracy, col sep=comma, mark = none] - {Figures/Data/adam_dropout_02_1.mean}; - \addplot table - [x=epoch, y=accuracy, col sep=comma, mark = none] - {Figures/Data/adam_datagen_1.mean}; - \addplot table - [x=epoch, y=accuracy, col sep=comma, mark = none] - {Figures/Data/adam_datagen_dropout_02_1.mean}; - - - \addlegendentry{\footnotesize{Default}} - \addlegendentry{\footnotesize{D. 
0.2}} - \addlegendentry{\footnotesize{G.}} - \addlegendentry{\footnotesize{G. + D. 0.2}} - \addlegendentry{\footnotesize{D. 0.4}} - \addlegendentry{\footnotesize{Default}} - \end{axis} - \end{tikzpicture} - \caption{1 sample per class} - \vspace{0.25cm} - \end{subfigure} - \begin{subfigure}[h]{\textwidth} - \begin{tikzpicture} - \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed, - /pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth, - height = 0.4\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east}, - xlabel = {epoch},ylabel = {Test Accuracy}, cycle - list/Dark2, every axis plot/.append style={line width - =1.25pt}] - \addplot table - [x=epoch, y=accuracy, col sep=comma, mark = none] - {Figures/Data/adam_dropout_00_10.mean}; - \addplot table - [x=epoch, y=accuracy, col sep=comma, mark = none] - {Figures/Data/adam_dropout_02_10.mean}; - \addplot table - [x=epoch, y=accuracy, col sep=comma, mark = none] - {Figures/Data/adam_datagen_dropout_00_10.mean}; - \addplot table - [x=epoch, y=accuracy, col sep=comma, mark = none] - {Figures/Data/adam_datagen_dropout_02_10.mean}; - - - \addlegendentry{\footnotesize{Default.}} - \addlegendentry{\footnotesize{D. 0.2}} - \addlegendentry{\footnotesize{G.}} - \addlegendentry{\footnotesize{G + D. 0.2}} - \end{axis} - \end{tikzpicture} - \caption{10 samples per class} - \end{subfigure} - \begin{subfigure}[h]{\textwidth} - \begin{tikzpicture} - \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed, - /pgf/number format/precision=3},tick style = {draw = none}, width = 0.9875\textwidth, - height = 0.4\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east}, - xlabel = {epoch}, ylabel = {Test Accuracy}, cycle - list/Dark2, every axis plot/.append style={line width - =1.25pt}, ymin = {0.92}] - \addplot table - [x=epoch, y=accuracy, col sep=comma, mark = none] - {Figures/Data/adam_dropout_00_100.mean}; - \addplot table - [x=epoch, y=accuracy, col sep=comma, mark = none] - {Figures/Data/adam_dropout_02_100.mean}; - \addplot table - [x=epoch, y=accuracy, col sep=comma, mark = none] - {Figures/Data/adam_datagen_dropout_00_100.mean}; - \addplot table - [x=epoch, y=accuracy, col sep=comma, mark = none] - {Figures/Data/adam_datagen_dropout_02_100.mean}; - - \addlegendentry{\footnotesize{Default.}} - \addlegendentry{\footnotesize{D. 0.2}} - \addlegendentry{\footnotesize{G.}} - \addlegendentry{\footnotesize{G + D. 
0.2}} - \end{axis} - \end{tikzpicture} - \caption{100 samples per class} - \vspace{.25cm} - \end{subfigure} - \caption{Mean test accuracies of the models fitting the sampled MNIST - handwriting datasets over the 125 epochs of training.} -\end{figure} - -\begin{figure}[h] - \centering - \small - \begin{subfigure}[h]{\textwidth} - \begin{tikzpicture} - \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed, - /pgf/number format/precision=3},tick style = - {draw = none}, width = \textwidth, - height = 0.4\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east}, - xlabel = {epoch},ylabel = {Test Accuracy}, cycle - list/Dark2, every axis plot/.append style={line width - =1.25pt}] - \addplot table - [x=epoch, y=accuracy, col sep=comma, mark = none] - {Figures/Data/fashion_dropout_0_1.mean}; - \addplot table - [x=epoch, y=accuracy, col sep=comma, mark = none] - {Figures/Data/fashion_dropout_2_1.mean}; - \addplot table - [x=epoch, y=accuracy, col sep=comma, mark = none] - {Figures/Data/fashion_datagen_dropout_0_1.mean}; - \addplot table - [x=epoch, y=accuracy, col sep=comma, mark = none] - {Figures/Data/fashion_datagen_dropout_2_1.mean}; - - - \addlegendentry{\footnotesize{Default}} - \addlegendentry{\footnotesize{D. 0.2}} - \addlegendentry{\footnotesize{G.}} - \addlegendentry{\footnotesize{G. + D. 0.2}} - \addlegendentry{\footnotesize{D. 0.4}} - \end{axis} - \end{tikzpicture} - \caption{1 sample per class} - \vspace{0.25cm} - \end{subfigure} - \begin{subfigure}[h]{\textwidth} - \begin{tikzpicture} - \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed, - /pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth, - height = 0.4\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east}, - xlabel = {epoch},ylabel = {Test Accuracy}, cycle - list/Dark2, every axis plot/.append style={line width - =1.25pt}, ymin = {0.62}] - \addplot table - [x=epoch, y=accuracy, col sep=comma, mark = none] - {Figures/Data/fashion_dropout_0_10.mean}; - \addplot table - [x=epoch, y=accuracy, col sep=comma, mark = none] - {Figures/Data/fashion_dropout_2_10.mean}; - \addplot table - [x=epoch, y=accuracy, col sep=comma, mark = none] - {Figures/Data/fashion_datagen_dropout_0_10.mean}; - \addplot table - [x=epoch, y=accuracy, col sep=comma, mark = none] - {Figures/Data/fashion_datagen_dropout_2_10.mean}; - - - \addlegendentry{\footnotesize{Default.}} - \addlegendentry{\footnotesize{D. 0.2}} - \addlegendentry{\footnotesize{G.}} - \addlegendentry{\footnotesize{G + D. 
0.2}} - \end{axis} - \end{tikzpicture} - \caption{10 samples per class} - \end{subfigure} - \begin{subfigure}[h]{\textwidth} - \begin{tikzpicture} - \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed, - /pgf/number format/precision=3},tick style = {draw = none}, width = 0.9875\textwidth, - height = 0.4\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east}, - xlabel = {epoch}, ylabel = {Test Accuracy}, cycle - list/Dark2, every axis plot/.append style={line width - =1.25pt}] - \addplot table - [x=epoch, y=accuracy, col sep=comma, mark = none] - {Figures/Data/fashion_dropout_0_100.mean}; - \addplot table - [x=epoch, y=accuracy, col sep=comma, mark = none] - {Figures/Data/fashion_dropout_2_100.mean}; - \addplot table - [x=epoch, y=accuracy, col sep=comma, mark = none] - {Figures/Data/fashion_datagen_dropout_0_100.mean}; - \addplot table - [x=epoch, y=accuracy, col sep=comma, mark = none] - {Figures/Data/fashion_datagen_dropout_2_100.mean}; - - \addlegendentry{\footnotesize{Default.}} - \addlegendentry{\footnotesize{D. 0.2}} - \addlegendentry{\footnotesize{G.}} - \addlegendentry{\footnotesize{G + D. 0.2}} - \end{axis} - \end{tikzpicture} - \caption{100 samples per class} - \vspace{.25cm} - \end{subfigure} - \caption{Mean test accuracies of the models fitting the sampled fashion MNIST - over the 125 epochs of training.} -\end{figure} - \end{document} %%% Local Variables: diff --git a/TeX/theo_3_8.tex b/TeX/theo_3_8.tex index d9a79ff..05c2bf0 100644 --- a/TeX/theo_3_8.tex +++ b/TeX/theo_3_8.tex @@ -5,16 +5,16 @@ %%% TeX-master: "main" %%% End: \section{Shallow Neural Networks} - +\label{sec:shallownn} % In order to get a some understanding of the behavior of neural % networks we study a simplified class of networks called shallow neural % networks in this chapter. % We consider shallow neural networks consist of a single % hidden layer and -In order to get some understanding of the behavior of neural networks +To get some understanding of the behavior of neural networks we examine a simple class of networks in this chapter. We consider networks that contain only one hidden layer and have a single output -node. We call these networks shallow neural networks. +node and call these networks shallow neural networks. \begin{Definition}[Shallow neural network, Heiss, Teichmann, and Wutte (2019, Definition 1.4)] For a input dimension $d$ and a Lipschitz continuous activation function $\sigma: @@ -85,8 +85,8 @@ with % \label{fig:shallowNN} % \end{figure} -As neural networks with a large amount of nodes have a large amount of -parameters that can be tuned it can often fit the data quite well. If +As neural networks with a large number of nodes have a large amount of +tunable parameters it can often fit data quite well. If a ReLU activation function \[ \sigma(x) \coloneqq \max{(0, x)} @@ -106,7 +106,7 @@ on MSE will perfectly fit the data. minimizing squared error loss. \proof W.l.o.g. all values $x_{ij}^{\text{train}} \in [0,1],~\forall i \in - \left\{1,\dots\right\}, j \in \left\{1,\dots,d\right\}$. Now we + \left\{1,\dots, t\right\}, j \in \left\{1,\dots,d\right\}$. Now we chose $v^*$ in order to calculate a unique value for all $x_i^{\text{train}}$: \[ @@ -142,30 +142,32 @@ on MSE will perfectly fit the data. 
and $\vartheta^* = (w^*, b^*, v^*, c = 0)$ we get \[ \mathcal{NN}_{\vartheta^*} (x_i^{\text{train}}) = \sum_{k = - 1}^{i-1} w_k\left(\left(v^*\right)^{\mathrm{T}} - x_i^{\text{train}}\right) + w_i\left(\left(v^*\right)^{\mathrm{T}} + 1}^{i-1} w_k\left(b_k^* + \left(v^*\right)^{\mathrm{T}} + x_i^{\text{train}}\right) + w_i\left(b_i^* +\left(v^*\right)^{\mathrm{T}} x_i^{\text{train}}\right) = y_i^{\text{train}}. \] As the squared error of $\mathcal{NN}_{\vartheta^*}$ is zero all squared error loss minimizing shallow networks with at least $t$ hidden - nodes will perfectly fit the data. - \qed + nodes will perfectly fit the data. \qed \label{theo:overfit} \end{Theorem} -However this behavior is often not desired as over fit models generally -have bad generalization properties especially if noise is present in +However, this behavior is often not desired as overfit models tend to +have bad generalization properties, especially if noise is present in the data. This effect is illustrated in -Figure~\ref{fig:overfit}. Here a shallow neural network that perfectly fits the -training data is -constructed according to the proof of Theorem~\ref{theo:overfit} and +Figure~\ref{fig:overfit}. + +Here a shallow neural network is +constructed according to the proof of Theorem~\ref{theo:overfit} to +perfectly fit some data and compared to a cubic smoothing spline (Definition~\ref{def:wrs}). While the neural network fits the data better than the spline, the spline represents the underlying mechanism that was used to generate the data more accurately. The better generalization of the spline compared to the network is further -demonstrated by the better validation error computed on newly generated +demonstrated by the better performance on newly generated test data. + In order to improve the accuracy of the model we want to reduce overfitting. A possible way to achieve this is by explicitly regularizing the network through the cost function as done with @@ -173,48 +175,47 @@ ridge penalized networks (Definition~\ref{def:rpnn}) where large weights $w$ are punished. In Theorem~\ref{theo:main1} we will prove that this will result in the shallow neural network converging to -regressions splines as the amount of nodes in the hidden layer is +a form of splines as the number of nodes in the hidden layer is increased. 
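The following sketch carries out a construction in the spirit of the proof of
Theorem~\ref{theo:overfit} for one-dimensional inputs. One ReLU node is placed
per training point such that the linear system for the output weights becomes
triangular; the exact choice of kink positions is a simplification for
illustration and does not reproduce the parameters used in the proof.
\begin{lstlisting}[language=iPython]
import numpy as np

# training data with distinct inputs, sorted in ascending order
x = np.array([0.1, 0.3, 0.5, 0.7, 0.9])
y = np.array([0.4, 0.1, 0.8, 0.3, 0.6])

# one hidden ReLU node per data point, node k becoming active
# to the right of the kink position xi[k]
xi = np.concatenate(([x[0] - 1.0], x[:-1]))

# design matrix A[i, k] = max(0, x_i - xi_k) is lower triangular
# with positive diagonal, hence invertible
A = np.maximum(0.0, x[:, None] - xi[None, :])
w = np.linalg.solve(A, y)    # output weights with zero squared error

def nn(z):
    # shallow ReLU network interpolating the training data
    return np.maximum(0.0, np.atleast_1d(z)[:, None] - xi[None, :]) @ w

print(np.allclose(nn(x), y))   # True
\end{lstlisting}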
- - - -\begin{figure} - \pgfplotsset{ - compat=1.11, -legend image code/.code={ - \draw[mark repeat=2,mark phase=2] -plot coordinates { - (0cm,0cm) -(0.15cm,0cm) %% default is (0.3cm,0cm) -(0.3cm,0cm) %% default is (0.6cm,0cm) -};% -} -} - \begin{tikzpicture} - \begin{axis}[tick style = {draw = none}, width = \textwidth, - height = 0.6\textwidth] - \addplot table - [x=x, y=y, col sep=comma, only marks,mark options={scale = - 0.7}] {Figures/Data/overfit.csv}; - \addplot [red, line width=0.8pt] table [x=x_n, y=s_n, col - sep=comma, forget plot] {Figures/Data/overfit.csv}; - \addplot [black, line width=0.8pt] table [x=x_n, y=y_n, col - sep=comma] {Figures/Data/overfit.csv}; - \addplot [black, line width=0.8pt, dashed] table [x=x, y=y, col - sep=comma] {Figures/Data/overfit_spline.csv}; - - \addlegendentry{\footnotesize{data}}; - \addlegendentry{\footnotesize{$\mathcal{NN}_{\vartheta^*}$}}; - \addlegendentry{\footnotesize{spline}}; - \end{axis} - \end{tikzpicture} - \caption[Overfitting of shallow neural networks]{For data of the form $y=\sin(\frac{x+\pi}{2 \pi}) + +\vfill + +\begin{figure}[h] + \pgfplotsset{ + compat=1.11, + legend image code/.code={ + \draw[mark repeat=2,mark phase=2] + plot coordinates { + (0cm,0cm) + (0.15cm,0cm) %% default is (0.3cm,0cm) + (0.3cm,0cm) %% default is (0.6cm,0cm) + };% + } + } + \begin{tikzpicture} + \begin{axis}[tick style = {draw = none}, width = \textwidth, + height = 0.6\textwidth] + \addplot table + [x=x, y=y, col sep=comma, only marks,mark options={scale = + 0.7}] {Figures/Data/overfit.csv}; + \addplot [red, line width=0.8pt] table [x=x_n, y=s_n, col + sep=comma, forget plot] {Figures/Data/overfit.csv}; + \addplot [black, line width=0.8pt] table [x=x_n, y=y_n, col + sep=comma] {Figures/Data/overfit.csv}; + \addplot [black, line width=0.8pt, dashed] table [x=x, y=y, col + sep=comma] {Figures/Data/overfit_spline.csv}; + + \addlegendentry{\footnotesize{data}}; + \addlegendentry{\footnotesize{$\mathcal{NN}_{\vartheta^*}$}}; + \addlegendentry{\footnotesize{spline}}; + \end{axis} + \end{tikzpicture} + \caption[Overfitting of Shallow Neural Networks]{For data of the form $y=\sin(\frac{x+\pi}{2 \pi}) + \varepsilon,~ \varepsilon \sim \mathcal{N}(0,0.4)$ - (\textcolor{blue}{blue dots}) the neural network constructed + (\textcolor{blue}{blue}) the neural network constructed according to the proof of Theorem~\ref{theo:overfit} (black) and the underlying signal (\textcolor{red}{red}). While the network has no - bias a cubic smoothing spline (black dashed) fits the data much + bias a cubic smoothing spline (black, dashed) fits the data much better. For a test set of size 20 with uniformly distributed $x$ values and responses of the same fashion as the training data the MSE of the neural network is 0.30, while the MSE of the spline is only 0.14 thus generalizing @@ -223,17 +224,20 @@ plot coordinates { \label{fig:overfit} \end{figure} -\clearpage -\subsection{\titlecap{convergence behaviour of 1-dim. randomized shallow neural - networks}} +\vfill +\clearpage +\subsection{Convergence Behavior of One-Dimensional Randomized Shallow + Neural Networks} +\label{sec:conv} This section is based on \textcite{heiss2019}. - -... shallow neural networks with a one dimensional input where the parameters in the +In this section, we examine the convergence behavior of certain shallow +neural networks. 
+We consider shallow neural networks with a one dimensional input where the parameters in the hidden layer are randomized resulting in only the weights is the output layer being trainable. -Additionally we assume all neurons use a ReLU as activation function +Additionally, we assume all neurons use a ReLU as an activation function and call such networks randomized shallow neural networks. % We will analyze the @@ -271,14 +275,12 @@ and call such networks randomized shallow neural networks. % are penalized in the loss % function ridge penalized neural networks. - -We will prove that ... nodes .. a randomized shallow neural network will +We will prove that if we penalize the amount of the trainable weights +when fitting a randomized shallow neural network it will converge to a function that minimizes the distance to the training -data with .. to its second derivative, -if the $L^2$ norm of the trainable weights $w$ is -penalized in the loss function. +data with respect to its second derivative as the amount of nodes is increased. We call such a network that is fitted according to MSE and a penalty term for -the amount of the weights a ridge penalized neural network. +the $L^2$ norm of the trainable weights $w$ a ridge penalized neural network. % $\lam$ % We call a randomized shallow neural network trained on MSE and % punished for the amount of the weights $w$ according to a @@ -300,7 +302,7 @@ the amount of the weights a ridge penalized neural network. \mathcal{RN}^{*, \tilde{\lambda}}_{\omega}(x) \coloneqq \mathcal{RN}_{w^{*, \tilde{\lambda}}(\omega), \omega} \] - with + with \ \[ w^{*,\tilde{\lambda}}(\omega) :\in \argmin_{w \in \mathbb{R}^n} \underbrace{ \left\{\overbrace{\sum_{i = 1}^N \left(\mathcal{RN}_{w, @@ -316,7 +318,7 @@ having minimal weights, resulting in the \textit{minimum norm network} $\mathcal{RN}_{w^{\text{min}}, \omega}$. \[ \mathcal{RN}_{w^{\text{min}}, \omega} \text{ randomized shallow - Neural network with weights } w^{\text{min}}: + neural network with weights } w^{\text{min}}\colon \] \[ w^{\text{min}} \in \argmin_{w \in \mathbb{R}^n} \norm{w}, \text{ @@ -328,8 +330,8 @@ For $\tilde{\lambda} \to \infty$ the learned function will resemble the data less and with the weights approaching $0$ will converge to the constant $0$ function. -In order to make the notation more convinient in the following the -$\omega$ used to express the realised random parameters will no longer +To make the notation more convenient, in the following the +$\omega$ used to express the realized random parameters will no longer be explicitly mentioned. We call a function that minimizes the cubic distance between training points @@ -348,11 +350,11 @@ derivative of the function a cubic smoothing spline. \] \end{Definition} -We will show that for specific hyper parameters the ridge penalized +We will show that for specific hyperparameters the ridge penalized shallow neural networks converge to a slightly modified variant of the -cubic smoothing spline. We will need to incorporate the densities of the +cubic smoothing spline. We need to incorporate the densities of the random parameters in the loss function of the spline to ensure -convergence. Thus we define +convergence. Thus we define the adapted weighted cubic smoothing spline where the loss for the second derivative is weighted by a function $g$ and the support of the second derivative of $f$ has to be a subset the support of $g$. The formal @@ -371,7 +373,8 @@ definition is given in Definition~\ref{def:wrs}. 
% Definition~\ref{def:rpnn} converges a weighted cubic smoothing spline, as % the amount of hidden nodes is grown to inifity. -\begin{Definition}[Adapted weighted cubic smoothing spline] +\begin{Definition}[Adapted weighted cubic smoothing spline, Heiss, Teichmann, and + Wutte (2019, Definition 3.5)] \label{def:wrs} Let $x_i^{\text{train}}, y_i^{\text{train}} \in \mathbb{R}, i \in \left\{1,\dots,N\right\}$ be trainig data. For a given $\lambda \in \mathbb{R}_{>0}$ @@ -385,16 +388,15 @@ definition is given in Definition~\ref{def:wrs}. \lambda g(0) \int_{\supp(g)}\frac{\left(f''(x)\right)^2}{g(x)} dx\right\}}_{\eqqcolon F^{\lambda, g}(f)}. \] - \todo{Anforderung an Ableitung von f, doch nicht?} +% \todo{Anforderung an Ableitung von f, doch nicht?} \end{Definition} - Similarly to ridge weight penalized neural networks the parameter $\lambda$ controls a trade-off between accuracy on the training data -and smoothness or low second dreivative. For $g \equiv 1$ and $\lambda \to 0$ the +and smoothness or low second derivative. For $g \equiv 1$ and $\lambda \to 0$ the resulting function $f^{*, 0+}$ will interpolate the training data while minimizing the second derivative. Such a function is known as cubic spline interpolation. - +\vspace{-0.2cm} \[ f^{*, 0+} \text{ smooth spline interpolation: } \] @@ -403,7 +405,6 @@ interpolation. \argmin_{\substack{f \in \mathcal{C}^2(\mathbb{R}), \\ f(x_i^{\text{train}}) = y_i^{\text{train}}}} = \left( \int _{\mathbb{R}} (f''(x))^2dx\right). \] - For $\lambda \to \infty$ on the other hand $f_g^{*\lambda}$ converges to linear regression of the data. @@ -412,16 +413,16 @@ the ridge penalized shallow neural network to adapted cubic smoothing splines. % In order to show that ridge penalized shallow neural networks converge % to adapted cubic smoothing splines for a growing amount of hidden nodes we % define two intermediary functions. -One being a smooth approximation of -the neural network, and a randomized shallow neural network designed +One being a smooth approximation of a +neural network and the other being a randomized shallow neural network designed to approximate a spline. -In order to properly BUILD these functions we need to take the points -of the network into consideration where the TRAJECTORY of the learned +In order to properly construct these functions, we need to take the points +of the network into consideration where the trajectory of the learned function changes (or their points of discontinuity). As we use the ReLU activation the function learned by the network will possess points of discontinuity where a neuron in the hidden -layer gets activated (goes from 0 -> x>0). We formalize these points +layer gets activated and their output is no longer zero. We formalize these points as kinks in Definition~\ref{def:kink}. \begin{Definition} \label{def:kink} @@ -439,9 +440,9 @@ as kinks in Definition~\ref{def:kink}. \item Let $\xi_k \coloneqq -\frac{b_k}{v_k}$ be the k-th kink of $\mathcal{RN}_w$. \item Let $g_{\xi}(\xi_k)$ be the density of the kinks $\xi_k = - \frac{b_k}{v_k}$ in accordance to the distributions of $b_k$ and - $v_k$. + $v_k$. With $\supp(g_\xi) = \left[C_{g_\xi}^l, C_{g_\xi}^u\right]$. \item Let $h_{k,n} \coloneqq \frac{1}{n g_{\xi}(\xi_k)}$ be the - average estmated distance from kink $\xi_k$ to the next nearest + average estimated distance from kink $\xi_k$ to the next nearest one. \end{enumerate} \end{Definition} @@ -457,40 +458,36 @@ network by applying the kernel similar to convolution. 
corresponding kink density $g_{\xi}$ as given by Definition~\ref{def:kink}. In order to smooth the RSNN consider following kernel for every $x$: - - \[ - \kappa_x(s) \coloneqq \mathds{1}_{\left\{\abs{s} \leq \frac{1}{2 \sqrt{n} - g_{\xi}(x)}\right\}}(s)\sqrt{n} g_{\xi}(x), \, \forall s \in \mathbb{R} - \] - - Using this kernel we define a smooth approximation of - $\mathcal{RN}_w$ by - - \[ - f^w(x) \coloneqq \int_{\mathds{R}} \mathcal{RN}_w(x-s) \kappa_x(s) ds. - \] + \begin{align*} + \kappa_x(s) &\coloneqq \mathds{1}_{\left\{\abs{s} \leq \frac{1}{2 \sqrt{n} + g_{\xi}(x)}\right\}}(s)\sqrt{n} g_{\xi}(x), \, \forall s \in \mathbb{R}\\ + \intertext{Using this kernel we define a smooth approximation of + $\mathcal{RN}_w$ by} + f^w(x) &\coloneqq \int_{\mathds{R}} \mathcal{RN}_w(x-s) + \kappa_x(s) ds. + \end{align*} \end{Definition} - Note that the kernel introduced in Definition~\ref{def:srsnn} -satisfies $\int_{\mathbb{R}}\kappa_x dx = 1$. While $f^w$ looks highly +satisfies $\int_{\mathbb{R}}\kappa_x dx = 1$. While $f^w$ looks similar to a convolution, it differs slightly as the kernel $\kappa_x(s)$ is dependent on $x$. Therefore only $f^w = (\mathcal{RN}_w * \kappa_x)(x)$ is well defined, while $\mathcal{RN}_w * \kappa$ is not. We use $f^{w^{*,\tilde{\lambda}}}$ to describe the spline approximating the ridge penalized network -$\mathrm{RN}^{*,\tilde{\lambda}}$. +$\mathcal{RN}^{*,\tilde{\lambda}}$. -Next we construct a randomized shallow neural network which -approximates a spline independent from the realization of the random -parameters. In order to achieve this we ... +Next, we construct a randomized shallow neural network that +is designed to be close to a spline, independent from the realization of the random +parameters, by approximating the splines curvature between the +kinks. -\begin{Definition}[Spline approximating Randomised Shallow Neural +\begin{Definition}[Spline approximating Randomized Shallow Neural Network] \label{def:sann} - Let $\mathcal{RN}$ be a randomised shallow Neural Network according + Let $\mathcal{RN}$ be a randomized shallow Neural Network according to Definition~\ref{def:rsnn} and $f^{*, \lambda}_g$ be the weighted cubic smoothing spline as introduced in Definition~\ref{def:wrs}. Then - the randomised shallow neural network approximating $f^{*, + the randomized shallow neural network approximating $f^{*, \lambda}_g$ is given by \[ \mathcal{RN}_{\tilde{w}}(x) = \sum_{k = 1}^n \tilde{w}_k \sigma(b_k + v_k x), @@ -498,7 +495,7 @@ parameters. In order to achieve this we ... with the weights $\tilde{w}_k$ defined as \[ \tilde{w}_k \coloneqq \frac{h_{k,n} v_k}{\mathbb{E}[v^2 \vert \xi - = \xi_k]} (f_g^{*, \lambda})''(\xi_k). + = \xi_k]} \left(f_g^{*, \lambda}\right)''(\xi_k). \] \end{Definition} @@ -512,16 +509,16 @@ derivative of $\mathcal{RN}_{\tilde{w}}(x)$ which is given by x}} \tilde{w}_k v_k \nonumber \\ &= \frac{1}{n} \sum_{\substack{k \in \mathbb{N} \\ \xi_k < x}} \frac{v_k^2}{g_{\xi}(\xi_k) \mathbb{E}[v^2 \vert \xi - = \xi_k]} (f_g^{*, \lambda})''(\xi_k). \label{eq:derivnn} + = \xi_k]} \left(f_g^{*, \lambda}\right)''(\xi_k). \label{eq:derivnn} \end{align} -As the expression (\ref{eq:derivnn}) behaves similary to a +As the expression (\ref{eq:derivnn}) behaves similarly to a Riemann-sum for $n \to \infty$ it will converge in probability to the -first derivative of $f^{*,\lambda}_g$. A formal proof of this behaviour +first derivative of $f^{*,\lambda}_g$. A formal proof of this behavior is given in Lemma~\ref{lem:s0}. 
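+
+The Riemann-sum behavior of (\ref{eq:derivnn}) can also be checked
+numerically. The following sketch is only an illustration and not part of
+the argument: it replaces $(f_g^{*, \lambda})''$ by a fixed smooth function
+with known antiderivative, assumes the kinks to be uniformly distributed
+on $[-2, 2]$ and sets $v_k \equiv 1$, so that the weighted sum should
+approach $\int_{-2}^{x} f''(z) dz$ as $n$ grows.
+\begin{lstlisting}[language=iPython]
+import numpy as np
+
+rng = np.random.default_rng(1)
+
+f_pp = np.cos   # stand-in for the second derivative
+f_p = np.sin    # its antiderivative, used as reference
+
+a, b = -2.0, 2.0     # assumed support of the kink density
+g = 1.0 / (b - a)    # uniform kink density g_xi
+x = 1.0              # point at which the derivative is approximated
+
+for n in [100, 1_000, 10_000, 100_000]:
+    xi = rng.uniform(a, b, n)   # kink positions xi_k
+    h = 1.0 / (n * g)           # h_{k,n} for the uniform density
+    # sum over all kinks left of x, weighted as in (eq:derivnn) with v_k = 1
+    approx = np.sum(f_pp(xi[xi < x]) * h)
+    exact = f_p(x) - f_p(a)     # integral of f'' from a to x
+    print(n, approx, exact)
+\end{lstlisting}
+For growing $n$ the printed approximation settles around the exact
+integral, mirroring the convergence that Lemma~\ref{lem:s0} establishes
+for the network itself.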
 In order to ensure the functions used in the proof of the convergence
-are well defined we need to assume some properties of the random
-parameters and their densities
+are well defined, we need to make some assumptions about the properties of the random
+parameters and their densities.
 
 % In order to formulate the theorem describing the convergence of $RN_w$
 % we need to make a couple of assumptions.
@@ -530,7 +527,7 @@ parameters and their densities
 \begin{Assumption}~
   \label{ass:theo38}
   \begin{enumerate}[label=(\alph*)]
-  \item The probability density fucntion of the kinks $\xi_k$,
+  \item The probability density function of the kinks $\xi_k$,
    namely $g_{\xi}$ as defined in Definition~\ref{def:kink} exists
    and is well defined.
  \item The density function $g_\xi$
@@ -545,7 +542,7 @@ parameters and their densities
  \end{enumerate}
 \end{Assumption}
 
-As we will prove the convergence of in the Sobolev space, we hereby
+As we will prove the convergence in the Sobolev Space, we hereby
 introduce it and the corresponding induced norm.
 
 \begin{Definition}[Sobolev Space]
@@ -563,7 +560,7 @@ introduce it and the corresponding induced norm.
    \norm{u^{(\alpha)}}_{L^p} < \infty.
  \]
  \label{def:sobonorm}
-  The natural norm of the sobolev space is given by
+  The natural norm of the Sobolev Space is given by
  \[
    \norm{f}_{W^{k,p}(K)} =
    \begin{cases}
@@ -577,18 +574,21 @@ introduce it and the corresponding induced norm.
  \]
 \end{Definition}
 
-With the important definitions and assumptions in place we can now
-formulate the main theorem ... the convergence of ridge penalized
-random neural networks to adapted cubic smoothing splines when the
-parameters are chosen accordingly.
+With the important definitions and assumptions in place, we can now
+formulate the main theorem.
+% ... the convergence of ridge penalized
+% random neural networks to adapted cubic smoothing splines when the
+% parameters are chosen accordingly.
 
-\begin{Theorem}[Ridge weight penaltiy corresponds to weighted cubic smoothing spline]
+\begin{Theorem}[Ridge Weight Penalty Corresponds to Weighted Cubic
+  Smoothing Spline]
  \label{theo:main1}
-  For $N \in \mathbb{N}$ arbitrary training data
-  \(\left(x_i^{\text{train}},  y_i^{\text{train}}
-  \right)\) and $\mathcal{RN}^{*, \tilde{\lambda}}, f_g^{*, \lambda}$
+  For $N \in \mathbb{N}$, arbitrary training data
+  $\left(x_i^{\text{train}},  y_i^{\text{train}}
+  \right)~\in~\mathbb{R}^2$, with $i \in \left\{1,\dots,N\right\}$,
+  and $\mathcal{RN}^{*, \tilde{\lambda}}, f_g^{*, \lambda}$
  according to Definition~\ref{def:rpnn} and Definition~\ref{def:wrs}
-  respectively with Assumption~\ref{ass:theo38} it holds
+  respectively with Assumption~\ref{ass:theo38} it holds that
 
  \begin{equation}
    \label{eq:main1}
@@ -604,7 +604,7 @@ parameters are chosen accordingly.
  \end{align*}
 \end{Theorem}
 As mentioned above we will prove Theorem~\ref{theo:main1} utilizing
-the ... functions. We show that
+intermediary functions. We show that
 \begin{equation}
  \label{eq:main2}
  \plimn \norm{\mathcal{RN}^{*, \tilde{\lambda}} - f^{w^*}}_{W^{1,
    \infty}(K)} = 0
 \end{equation}
 and
 \begin{equation}
  \label{eq:main3}
  \plimn \norm{f^{w^*} - f_g^{*, \lambda}}_{W^{1,\infty}(K)} = 0
 \end{equation}
 and then get (\ref{eq:main1}) using the triangle inequality. In
-order to prove (\ref{eq:main2}) and (\ref{eq:main3}) we will need to
-introduce a number of auxiliary lemmmata, proves of these will be
-provided in the appendix. 
+order to prove (\ref{eq:main2}) and (\ref{eq:main3}) we need to
+introduce a number of auxiliary lemmata, proofs of which are
+given in \textcite{heiss2019} and Appendix~\ref{appendix:proofs}.
 
 
-\begin{Lemma}[Poincar\'e typed inequality]
+\begin{Lemma}[Poincar\'e Typed Inequality]
  \label{lem:pieq}
  Let \(f:\mathbb{R} \to \mathbb{R}\) be differentiable with \(f' :
  \mathbb{R} \to \mathbb{R}\) Lebesgue integrable. Then for \(K=[a,b]
@@ -634,13 +634,14 @@ provided in the appendix.
    \norm{f'}_{L^{\infty}(K)}.
  \end{equation*}
  If additionally \(f'\) is differentiable with \(f'': \mathbb{R} \to
-  \mathbb{R}\) Lesbeque integrable then additionally
+  \mathbb{R}\) Lebesgue integrable then
  \begin{equation*}
    \label{eq:pti2}
    \exists C_K^2 \in \mathbb{R}_{>0} : \norm{f}_{W^{1,\infty}(K)}
    \leq C_K^2 \norm{f''}_{L^2(K)}.
  \end{equation*}
-  \proof The proof is given in the appendix...
+  % \proof The proof is given in the appendix...
+  
  % With the fundamental theorem of calculus, if
  % \(\norm{f}_{L^{\infty}(K)}<\infty\) we get
  % \begin{equation}
@@ -682,6 +683,7 @@ provided in the appendix.
    \forall x \in \supp(g_{\xi}) : \mathbb{E}\left[\varphi(\xi, v)
      \frac{1}{n g_{\xi}(\xi)} \vert \xi = x \right] < \infty,
  \]
+  \clearpage
  it holds that
  \[
    \plimn \sum_{k \in \kappa : \xi_k < T} \varphi(\xi_k, v_k)
@@ -690,7 +692,7 @@ provided in the appendix.
    \mathbb{E}\left[\varphi(\xi, v) \vert \xi = x \right] dx
  \]
  uniformly in \(T \in K\).
-  \proof The proof is given in appendix...
+  % \proof The proof is given in appendix...
  % For \(T \leq C_{g_{\xi}}^l\) both sides equal 0, so it is sufficient to
  % consider \(T > C_{g_{\xi}}^l\). With \(\varphi\) and
  % \(\nicefrac{1}{g_{\xi}}\) uniformly continous in \(\xi\),
@@ -735,7 +737,7 @@ provided in the appendix.
  % \kappa : \xi_m \in [\delta l, \delta(l +
  % 1)]\right\}}}{ng_{\xi}(l\delta)}\right) \pm \varepsilon .\\
  % \intertext{We use the mean to approximate the number of kinks in
-  % each $\delta$-strip, as it follows a bonomial distribution this
+  % each $\delta$-strip, as it follows a binomial distribution this
  % amounts to
  % \[
  % \mathbb{E}\left[\abs{\left\{m \in \kappa : \xi_m \in [\delta l,
@@ -745,13 +747,14 @@ provided in the appendix.
  % \]
  % Bla Bla Bla $v_k$}
  % \circled{1} & \approx
-  % \end{align*} 
+  % \end{align*}
+  \proof Notes on the proof are given in Proof~\ref{proof:lem9}.
 \end{Lemma}
 
 \begin{Lemma}
-  For any $\lambda > 0$, training data $(x_i^{\text{train}}
+  For any $\lambda > 0$, $N \in \mathbb{N}$, training data $(x_i^{\text{train}},
  y_i^{\text{train}}) \in \mathbb{R}^2$, with $i \in 
-  \left\{1,\dots,N\right\}$ and subset $K \subset \mathbb{R}$ the spline approximating randomized
+  \left\{1,\dots,N\right\}$, and subset $K \subset \mathbb{R}$ the spline approximating randomized
  shallow neural network $\mathcal{RN}_{\tilde{w}}$ converges to the
  cubic smoothing spline $f^{*, \lambda}_g$ in
  $\norm{.}_{W^{1,\infty}(K)}$ as the node count $n$ increases,
@@ -767,50 +770,63 @@ provided in the appendix. 
\] This can be achieved by using Lemma~\ref{lem:cnvh} with $\varphi(\xi_k, - v_k) = \frac{v_k^2}{\mathbb{E}[v^2|\xi = z]} (f^{*, \lambda}_w)''(\xi_k) $ + v_k) = \frac{v_k^2}{\mathbb{E}[v^2|\xi = z]} (f^{*, \lambda}_g)''(\xi_k) $ thus obtaining \begin{align*} - \plimn \frac{\partial \mathcal{RN}_{\tilde{w}}}{\partial x} - \stackrel{(\ref{eq:derivnn})}{=} - & \plimn \sum_{\substack{k \in \mathbb{N} \\ - \xi_k < x}} \frac{v_k^2}{\mathbb{E}[v^2 \vert \xi - = \xi_k]} (f_g^{*, \lambda})''(\xi_k) h_{k,n} - \stackrel{\text{Lemma}~\ref{lem:cnvh}}{=} \\ - \stackrel{\phantom{(\ref{eq:derivnn})}}{=} - & - \int_{\min\left\{C_{g_{\xi}}^l,T\right\}}^{min\left\{C_{g_{\xi}}^u,T\right\}} + \plimn \frac{\partial \mathcal{RN}_{\tilde{w}}}{\partial x} (x) + \equals^{(\ref{eq:derivnn})}_{\phantom{\text{Lemma 3.1.4}}} + %\stackrel{(\ref{eq:derivnn})}{=} + & + \plimn \sum_{\substack{k \in \mathbb{N} \\ + \xi_k < x}} \frac{v_k^2}{\mathbb{E}[v^2 \vert \xi + = \xi_k]} (f_g^{*, \lambda})''(\xi_k) h_{k,n} \\ + \stackrel{\text{Lemma}~\ref{lem:cnvh}}{=} + %\stackrel{\phantom{(\ref{eq:derivnn})}}{=} + & + \int_{\max\left\{C_{g_{\xi}}^l,x\right\}}^{\min\left\{C_{g_{\xi}}^u,x\right\}} \mathbb{E}\left[\frac{v^2}{\mathbb{E}[v^2|\xi = z]} (f^{*, - \lambda}_w)''(\xi) \vert - \xi = x \right] dx \equals^{\text{Tower-}}_{\text{property}} \\ - \stackrel{\phantom{(\ref{eq:derivnn})}}{=} - & - \int_{\min\left\{C_{g_{\xi}}^l, - T\right\}}^{min\left\{C_{g_{\xi}}^u,T\right\}}(f^{*,\lambda}_w)''(x) - dx. + \lambda}_g)''(\xi) \vert + \xi = z \right] dz\\ + \mathmakebox[\widthof{$\stackrel{\text{Lemma 3.14}}{=}$}][c]{\equals^{\text{Tower-}}_{\text{property}}} + %\stackrel{\phantom{(\ref{eq:derivnn})}}{=} + & + \int_{\max\left\{C_{g_{\xi}}^l, + x\right\}}^{\min\left\{C_{g_{\xi}}^u,x\right\}}(f^{*,\lambda}_g)''(z) + dz. \end{align*} - By the fundamental theorem of calculus and $\supp(f') \subset - \supp(f)$, (\ref{eq:s0}) follows with Lemma~\ref{lem:pieq}. - \todo{ist die 0 wichtig?} + With the fundamental theorem of calculus we get + \[ + \plimn \mathcal{RN}_{\tilde{w}}'(x) = f_g^{*,\lambda + '}(\min\left\{C_{g_{\xi}}^u, x\right\}) - f_g^{*,\lambda + '}(\max\left\{C_{g_{\xi}}^l, x\right\}) + \] + As $f_g^{*,\lambda '}$ is constant on $\left[C_{g_\xi}^l, + C_{g_\xi}^u\right]^C$ because $\supp(f_g^{*,\lambda ''}) \subseteq + \supp(g) \subseteq \supp(g_\xi)$ we get + \[ + \plimn \mathcal{RN}_{\tilde{w}}'(x) = f_g^{*,\lambda + '}, + \] + thus (\ref{eq:s0}) follows with Lemma~\ref{lem:pieq}. \qed \label{lem:s0} \end{Lemma} \begin{Lemma} - For any $\lambda > 0$ and training data $(x_i^{\text{train}}, - y_i^{\text{train}}) \in \mathbb{R}^2, \, i \in + For any $\lambda > 0$, $N \in \mathbb{N}$, and training data $(x_i^{\text{train}}, + y_i^{\text{train}}) \in \mathbb{R}^2$, with $i \in \left\{1,\dots,N\right\}$, we have \[ \plimn F^{\tilde{\lambda}}_n(\mathcal{RN}_{\tilde{w}}) = F^{\lambda, g}(f^{*, \lambda}_g) = 0. \] - \proof - The proof is given in the appendix... + \proof Notes on the proof are given in Proof~\ref{proof:lem14}. \label{lem:s2} \end{Lemma} \begin{Lemma} - For any $\lambda > 0$ and training data $(x_i^{\text{train}}, - y_i^{\text{train}}) \in \mathbb{R}^2, \, i \in + For any $\lambda > 0$, $N \in \mathbb{N}$, and training data $(x_i^{\text{train}}, + y_i^{\text{train}}) \in \mathbb{R}^2$, with $i \in \left\{1,\dots,N\right\}$, with $w^*$ as defined in Definition~\ref{def:rpnn} and $\tilde{\lambda}$ as defined in Theroem~\ref{theo:main1}, it holds @@ -818,13 +834,13 @@ provided in the appendix. 
\plimn \norm{\mathcal{RN}^{*,\tilde{\lambda}} - f^{w*, \tilde{\lambda}}}_{W^{1,\infty}(K)} = 0. \] - \proof The proof is given in Appendix .. + \proof Notes on the proof are given in Proof~\ref{proof:lem15}. \label{lem:s3} \end{Lemma} \begin{Lemma} - For any $\lambda > 0$ and training data $(x_i^{\text{train}}, - y_i^{\text{train}}) \in \mathbb{R}^2, \, i \in + For any $\lambda > 0$, $N \in \mathbb{N}$, and training data $(x_i^{\text{train}}, + y_i^{\text{train}}) \in \mathbb{R}^2$, with $i \in \left\{1,\dots,N\right\}$, with $w^*$ and $\tilde{\lambda}$ as defined in Definition~\ref{def:rpnn} and Theroem~\ref{theo:main1} respectively, it holds @@ -832,13 +848,13 @@ provided in the appendix. \plimn \abs{F_n^{\tilde{\lambda}}(\mathcal{RN}^{*,\tilde{\lambda}}) - F^{\lambda, g}(f^{w*, \tilde{\lambda}})} = 0. \] - \proof The proof is given in appendix... + \proof Notes on the proof are given in Proof~\ref{proof:lem16}. \label{lem:s4} \end{Lemma} \begin{Lemma} - For any $\lambda > 0$ and training data $(x_i^{\text{train}}, - y_i^{\text{train}}) \in \mathbb{R}^2, \, i \in + For any $\lambda > 0$, $N \in \mathbb{N}$, and training data $(x_i^{\text{train}}, + y_i^{\text{train}}) \in \mathbb{R}^2$, with $i \in \left\{1,\dots,N\right\}$, for any sequence of functions $f^n \in W^{2,2}$ with \[ @@ -848,39 +864,45 @@ provided in the appendix. \[ \plimn \norm{f^n - f^{*, \lambda}} = 0. \] - \proof The proof is given in appendix ... + \proof Notes on the proof are given in Proof~\ref{proof:lem19}. \label{lem:s7} \end{Lemma} Using these lemmata we can now proof Theorem~\ref{theo:main1}. We start by showing that the error measure of the smooth approximation of the ridge penalized randomized shallow neural network $F^{\lambda, - g}\left(f^{w^{*,\tilde{\lambda}}}\right)$ + g}(f^{w^{*,\tilde{\lambda}}})$ will converge in probability to the error measure of the adapted weighted regression spline $F^{\lambda, g}\left(f^{*,\lambda}\right)$ for the specified parameters. Using Lemma~\ref{lem:s4} we get that for every $P \in (0,1)$ and $\varepsilon > 0$ there exists a $n_1 \in \mathbb{N}$ such that -\[ +\begin{equation} \mathbb{P}\left[F^{\lambda, g}\left(f^{w^{*,\tilde{\lambda}}}\right) \in F_n^{\tilde{\lambda}}\left(\mathcal{RN}^{*,\tilde{\lambda}}\right) - +[-\varepsilon, \varepsilon]\right] > P, \forall n \in \mathbb{N}_{> n_1}. -\] + +[-\varepsilon, \varepsilon]\right] > P, \forall n \in + \mathbb{N}_{> n_1}. + \label{eq:squeeze_1} +\end{equation} As $\mathcal{RN}^{*,\tilde{\lambda}}$ is the optimal network for $F_n^{\tilde{\lambda}}$ we know that -\[ +\begin{equation} F_n^{\tilde{\lambda}}\left(\mathcal{RN}^{*,\tilde{\lambda}}\right) \leq F_n^{\tilde{\lambda}}\left(\mathcal{RN}_{\tilde{w}}\right). -\] + \label{eq:squeeze_2} +\end{equation} Using Lemma~\ref{lem:s2} we get that for every $P \in (0,1)$ and -$\varepsilon > 0$ there exists a $n_2 \in \mathbb{N}$ such that -\[ +$\varepsilon > 0$ a $n_2 \in \mathbb{N}$ exists such that +\begin{equation} \mathbb{P}\left[F_n^{\tilde{\lambda}}\left(\mathcal{RN}_{\tilde{w}}\right) \in F^{\lambda, g}\left(f^{*,\lambda}_g\right)+[-\varepsilon, \varepsilon]\right] > P, \forall n \in \mathbb{N}_{> n_2}. -\] -If we combine these ... 
we get that for every $P \in (0,1)$ and -$\varepsilon > 0$ and $n_3 \geq + \label{eq:squeeze_3} +\end{equation} +Combining (\ref{eq:squeeze_1}), (\ref{eq:squeeze_2}), and +(\ref{eq:squeeze_3}) we get that for every $P \in (0,1)$ and for \linebreak +every +$\varepsilon > 0$ with $n_3 \geq \max\left\{n_1,n_2\right\}$ \[ \mathbb{P}\left[F^{\lambda, @@ -888,47 +910,51 @@ $\varepsilon > 0$ and $n_3 \geq g}\left(f^{*,\lambda}_g\right)+2\varepsilon\right] > P, \forall n \in \mathbb{N}_{> n_3}. \] -As ... is in ... and ... is optimal we know that +As $\supp(f^{w^{*,\tilde{\lambda}}}) \subseteq \supp(g_\xi)$ and $f^{*,\lambda}_g$ is optimal we know that \[ - F^{\lambda, g}\left(f^{*,\lambda}_g\right) \leq F^{\lambda, g}\left(f^{w^{*,\tilde{\lambda}}}\right) + F^{\lambda, g}\left(f^{*,\lambda}_g\right) \leq F^{\lambda, + g}\left(f^{w^{*,\tilde{\lambda}}}\right) \] and thus get with the squeeze theorem \[ \plimn F^{\lambda, g}\left(f^{w^{*,\tilde{\lambda}}}\right) = F^{\lambda, g}\left(f^{*,\lambda}_g\right). \] -We can now use Lemma~\ref{lem:s7} to follow that +With Lemma~\ref{lem:s7} it follows that \begin{equation} \plimn \norm{f^{w^{*,\tilde{\lambda}}} - f^{*,\lambda}_g} _{W^{1,\infty}} = 0. \label{eq:main4} \end{equation} -Now by using the triangle inequality with Lemma~\ref{lem:s3} and +By using the triangle inequality with Lemma~\ref{lem:s3} and (\ref{eq:main4}) we get -\begin{align*} - \plimn \norm{\mathcal{RN}^{*, \tilde{\lambda}} - f_g^{*,\lambda}} - \leq& \plimn \bigg(\norm{\mathcal{RN}^{*, \tilde{\lambda}} - - f_g^{w^{*,\tilde{\lambda}}}}_{W^{1,\infty}}\\ - &+ \norm{f^{w^{*,\tilde{\lambda}}} - f^{*,\lambda}_g} +\begin{multline} + \plimn \norm{\mathcal{RN}^{*, \tilde{\lambda}} - f_g^{*,\lambda}}\\ + \leq \plimn \bigg(\norm{\mathcal{RN}^{*, \tilde{\lambda}} - + f_g^{w^{*,\tilde{\lambda}}}}_{W^{1,\infty}} + + \norm{f^{w^{*,\tilde{\lambda}}} - f^{*,\lambda}_g} _{W^{1,\infty}}\bigg) = 0 -\end{align*} +\end{multline} and thus have proven Theorem~\ref{theo:main1}. + We now know that randomized shallow neural networks behave similar to spline regression if we regularize the size of the weights during -training. +training. + \textcite{heiss2019} further explore a connection between ridge penalized networks and randomized shallow neural networks trained using gradient descent. -They come to the conclusion that the effect of weight regularization +They infer that the effect of weight regularization can be achieved by stopping the training of the randomized shallow -neural network early, with the amount of epochs being proportional to -the punishment for weight size. -This ... that randomized shallow neural networks trained for a certain -amount of iterations converge for a increasing amount of nodes to -cubic smoothing splines with appropriate weights. -\todo{nochmal nachlesen wie es genau war} +neural network early, with the number of iterations being proportional to +the tuning parameter penalizing the size of the weights. +They use this to further conclude that for a large number of training epochs and number of +neurons shallow neural networks trained with gradient descent are +very close to spline interpolations. Alternatively if the training +is stopped early, they are close to adapted weighted cubic smoothing splines. \newpage \subsection{Simulations} +\label{sec:rsnn_sim} In the following the behaviour described in Theorem~\ref{theo:main1} is visualized in a simulated example. For this two sets of training data have been generated. 
@@ -962,20 +988,26 @@ Theorem~\ref{theo:main1} would equate to
 $g(x) = \frac{\mathbb{E}[v_k^2|\xi_k = x]}{10}$. In order to utilize
 the smoothing spline implemented in Matlab, $g$ has been simplified
 to $g
-\equiv \frac{1}{10}$ instead. For all figures $f_1^{*, \lambda}$ has
-been calculated with Matlab's ``smoothingspline'', as this minimizes
+\equiv \frac{1}{10}$ instead. 
+
+For all figures $f_1^{*, \lambda}$ has
+been calculated with Matlab's {\sffamily{smoothingspline}}, as this minimizes
 \[
  \bar{\lambda} \sum_{i=1}^N(y_i^{train} - f(x_i^{train}))^2 + (1 -
  \bar{\lambda}) \int (f''(x))^2 dx,
 \]
-the smoothing parameter used for fittment is $\bar{\lambda} =
+the smoothing parameter used for fitting is therefore chosen as $\bar{\lambda} =
 \frac{1}{1 + \lambda}$. The parameter $\tilde{\lambda}$ for training
-the networks is chosen as defined in Theorem~\ref{theo:main1} and each
-network is trained on the full training data for 5000 epochs using
+the networks is chosen as defined in Theorem~\ref{theo:main1}.
+
+Each
+network contains 10,000 hidden nodes and is trained on the full
+training data for 100,000 epochs using
 gradient descent. The
-results are given in Figure~\ref{fig:rn_vs_rs}, here it can be seen that in
-the intervall of the traing data $[-\pi, \pi]$ the neural network and
-smoothing spline are nearly identical, coinciding with the proposition.
+results are given in Figure~\ref{fig:rn_vs_rs}, where it can be seen
+that the neural network and
+smoothing spline are nearly identical, coinciding with the
+proposition. 
 
 \input{Figures/RN_vs_RS}
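+
+A reduced version of this experiment can also be run directly in Python.
+The sketch below is not the implementation used to produce the figures; it
+trains a much smaller randomized shallow network with plain gradient
+descent on the penalized squared error, and the node count, epoch count,
+penalty parameter and parameter ranges are chosen only for illustration.
+\begin{lstlisting}[language=iPython]
+import numpy as np
+
+rng = np.random.default_rng(42)
+
+# training data of the same flavor as in the simulation, but smaller
+N = 30
+x_train = rng.uniform(-np.pi, np.pi, N)
+y_train = np.sin(x_train) + rng.normal(0, 0.25, N)
+
+n = 500            # hidden nodes (the experiment above uses 10,000)
+epochs = 20_000    # gradient steps (the experiment above uses 100,000)
+lam = 0.1          # placeholder penalty; the experiment uses the
+                   # lambda-tilde prescribed by the main theorem
+
+# frozen random hidden layer and its ReLU features on the training data
+v = rng.uniform(-5, 5, n)
+b = rng.uniform(-5, 5, n)
+Phi = np.maximum(0.0, np.outer(x_train, v) + b)
+
+# step size from the Lipschitz constant of the gradient, so that plain
+# gradient descent on this convex objective stays stable
+L = 2 * np.linalg.norm(Phi, 2) ** 2 + 2 * lam
+gamma = 1.0 / L
+
+w = np.zeros(n)
+for _ in range(epochs):
+    grad = 2 * Phi.T @ (Phi @ w - y_train) + 2 * lam * w
+    w -= gamma * grad
+
+# evaluate the trained network on a grid; the resulting curve can then be
+# compared with a cubic smoothing spline as in the figure referenced above
+x_grid = np.linspace(-np.pi, np.pi, 400)
+rn_fit = np.maximum(0.0, np.outer(x_grid, v) + b) @ w
+\end{lstlisting}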