Lemma Appendix, cleanup to 4.2
This commit is contained in:
parent e96331d072
commit a498fb1a8c
@@ -1,11 +1,20 @@
\section{Code...}
In this ... the implementations of the models used in ... are
given. The randomized shallow neural network used in CHAPTER... is
implemented in Scala from the ground up to ensure the model conforms
exactly to ... of Theorem~\ref{theo:main1}.

The neural networks used in CHAPTER are implemented in Python using
the Keras framework included in TensorFlow. TensorFlow is a library
containing highly efficient GPU implementations of the most important
tensor operations, such as convolution, as well as efficient
algorithms for training neural networks (computing derivatives,
updating parameters).
\begin{itemize}
\item Code for the randomized shallow neural network
\item Code for Keras
\end{itemize}

\begin{lstfloat}
\begin{lstlisting}[language=iPython]
import breeze.stats.distributions.Uniform
import breeze.stats.distributions.Gaussian
@@ -50,7 +59,7 @@ class RSNN(val n: Int, val gamma: Double = 0.001) {
}

def train(data: Seq[(Double, Double)], iter: Int, lam: Double,
          gamma: Double = gamma): (Seq[Double], Double => Double) = {

  val ws = (1 to iter).foldRight((1 to n).map(
    _ => 0.0): Seq[Double])((i, w) => {
@@ -62,15 +71,15 @@ class RSNN(val n: Int, val gamma: Double = 0.001) {
}
}
\end{lstlisting}
\caption{Scala code used to build and train the ridge-penalized
randomized shallow neural network in .... The parameter \textit{lam}
in the train function represents the $\lambda$ parameter in the error
function. The parameters \textit{n} and \textit{gamma} set the number
of hidden nodes and the step size for training.}
\label{lst:rsnn}
\end{lstfloat}
\clearpage
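For readers more familiar with Python, a rough NumPy sketch of the same
training loop is given below. It is only an illustration of the structure of
Listing~\ref{lst:rsnn}: the distribution of the random parameters and the
exact form of the error function follow the Scala implementation above and
are merely approximated here.
\begin{lstlisting}[language=iPython]
import numpy as np

def train_rsnn(x, y, n=1000, iterations=5000, lam=0.1, gamma=0.001, seed=0):
    """Sketch: ridge-penalized least squares on random ReLU features,
    fitted by plain gradient descent (not the Scala implementation)."""
    rng = np.random.default_rng(seed)
    xi = rng.uniform(x.min(), x.max(), size=n)   # random kink positions
    v = rng.normal(size=n)                       # random hidden weights
    # hidden-layer activations, shape (len(x), n)
    features = np.maximum(0, v * (x[:, None] - xi))

    w = np.zeros(n)
    for _ in range(iterations):
        residual = features @ w - y
        grad = features.T @ residual / len(x) + lam * w  # ridge gradient
        w -= gamma * grad                                # step size gamma
    return w, lambda t: np.maximum(0, v * (t - xi)) @ w
\end{lstlisting}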
\begin{lstfloat}
\begin{lstlisting}[language=iPython]
import tensorflow as tf
import numpy as np
@@ -117,7 +126,12 @@ validation_data=(x_test, y_test),
steps_per_epoch = x_train.shape[0]//50)

\end{lstlisting}
\caption{Python code for the model used... the MNIST handwritten digits
dataset.}
\label{lst:handwriting}
\end{lstfloat}
\clearpage
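The full model definition is abbreviated in this hunk. Purely as an
illustration of how such a Keras model is assembled, a minimal hypothetical
setup could look as follows; the layer sizes are placeholders and do not
correspond to the architecture used in Listing~\ref{lst:handwriting}.
\begin{lstlisting}[language=iPython]
import tensorflow as tf

# Hypothetical layer sizes, for illustration only.
model = tf.keras.Sequential([
    tf.keras.layers.Conv2D(32, 3, activation='relu', input_shape=(28, 28, 1)),
    tf.keras.layers.MaxPooling2D(2),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(10, activation='softmax'),
])
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
\end{lstlisting}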
\begin{lstfloat}
\begin{lstlisting}[language=iPython]
import tensorflow as tf
import numpy as np
@@ -160,13 +174,18 @@ datagen = ImageDataGenerator(
csv_logger = CSVLogger(<Target File>)

history = model.fit(datagen.flow(x_train, y_train, batch_size=30),
                    steps_per_epoch=x_train.shape[0]//30,
                    validation_data=(x_test, y_test),
                    epochs=125, callbacks=[csv_logger],
                    shuffle=True)

\end{lstlisting}
\caption{Python code for the model used... the fashion MNIST
dataset.}
\label{lst:fashion}
\end{lstfloat}
\clearpage
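The arguments of \textit{ImageDataGenerator} are likewise abbreviated above.
A hypothetical configuration using the kinds of augmentation shown earlier
(zoom, shear, rotation and positional shift) might look like this; the
concrete ranges are illustrative only and are not taken from the listing.
\begin{lstlisting}[language=iPython]
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Illustrative augmentation ranges, not the ones used in the listing above.
datagen = ImageDataGenerator(rotation_range=15,
                             zoom_range=0.1,
                             shear_range=0.1,
                             width_shift_range=0.1,
                             height_shift_range=0.1)
# datagen.flow(x_train, y_train, batch_size=30) then yields augmented batches.
\end{lstlisting}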
\begin{lstfloat}
\begin{lstlisting}[language=iPython]
def get_random_sample(a, b, number_of_samples=10):
    x = []
@@ -183,6 +202,9 @@ def get_random_sample(a, b, number_of_samples=10):
    return (np.asarray(x).reshape(-1, 28, 28, 1),
            np.asarray(y).reshape(10*number_of_samples, 1))
\end{lstlisting}
\caption{Python code used to generate the datasets containing a
certain number of random data points per class.}
\end{lstfloat}
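The body of \textit{get\_random\_sample} is abbreviated in this hunk. A
self-contained sketch of such a sampling routine, matching the return
statement shown above but not necessarily the original implementation, is:
\begin{lstlisting}[language=iPython]
import numpy as np

def get_random_sample(a, b, number_of_samples=10):
    """Draw `number_of_samples` random images per class from images `a`
    with integer labels `b` (sketch only)."""
    x, y = [], []
    for digit in range(10):
        index = np.random.choice(np.where(b == digit)[0],
                                 number_of_samples, replace=False)
        x.extend(a[index])
        y.extend(b[index])
    return (np.asarray(x).reshape(-1, 28, 28, 1),
            np.asarray(y).reshape(10 * number_of_samples, 1))
\end{lstlisting}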
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "main"
@@ -65,7 +65,8 @@ plot coordinates {
\caption{Performance metrics during training}
\end{subfigure}
% \\~\\
\caption[Performance comparison of SGD and GD]{The neural network
given in Figure~\ref{fig:mnist_architecture} trained with different
algorithms on the MNIST handwritten digits data set. For gradient
descent the learning rates 0.01, 0.05 and 0.1 are used (GD$_{\cdot}$). For
stochastic gradient descent a batch size of 32 and learning rate
@@ -8,287 +8,364 @@
\usepackage{showframe}
\usepackage{graphicx}
\usepackage{titlecaps}
\usepackage{amssymb}
\usepackage{mathtools}%add-on and patches to amsmath
\usetikzlibrary{calc, 3d}
\usepgfplotslibrary{colorbrewer}

\newcommand\Tstrut{\rule{0pt}{2.6ex}} % = `top' strut
\newcommand\Bstrut{\rule[-0.9ex]{0pt}{0pt}} % = `bottom' strut

\DeclareMathOperator*{\plim}{\mathbb{P}\text{-}\lim}
\DeclareMathOperator{\supp}{supp}
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator*{\po}{\mathbb{P}\text{-}\mathcal{O}}
\DeclareMathOperator*{\equals}{=}
\begin{document}
\newcommand{\plimn}[0]{\plim\limits_{n \to \infty}}
\newcommand{\norm}[1]{\left\lVert#1\right\rVert}
% \pgfplotsset{compat=1.11, legend image code/.code={...}}
% \begin{figure}
%   Subfigures (a) ``1 sample per class'', (b) ``10 samples per class'' and
%   (c) ``100 samples per class'': pgfplots axes of Test Accuracy per epoch
%   for the runs Data/adam_*.mean, legend entries G., G. + D. 0.2,
%   G. + D. 0.4, D. 0.2, D. 0.4 and Default.
%   \caption{Accuracy for the net given in ... with Dropout (D.), data
%     generation (G.), a combination, or neither (Default) implemented and
%     trained with \textsc{Adam}. For each epoch the 60.000 training samples
%     were used, or for data generation 10.000 steps with each using batches
%     of 60 generated data points. For each configuration the model was
%     trained 5 times and the average accuracies at each epoch are given in
%     (a). Mean, maximum and minimum values of accuracy on the test and
%     training set are given in (b).}
% \end{figure}
% \begin{table}
%   \centering
%   \begin{tabu} to \textwidth {@{}l*4{X[c]}@{}}
%     \Tstrut \Bstrut & \textsc{Adam} & D. 0.2 & Gen & Gen.+D. 0.2 \\
%     \hline
%     & \multicolumn{4}{c}{\titlecap{test accuracy for 1 sample}}\Bstrut \\
%     \cline{2-5}
%     max \Tstrut & 0.5633 & 0.5312 & 0.6704 & 0.6604 \\
%     min & 0.3230 & 0.4224 & 0.4878 & 0.5175 \\
%     mean & 0.4570 & 0.4714 & 0.5862 & 0.6014 \\
%     var & 0.0040 & 0.0012 & 0.0036 & 0.0023 \\
%     \hline
%     & \multicolumn{4}{c}{\titlecap{test accuracy for 10 samples}}\Bstrut \\
%     \cline{2-5}
%     max \Tstrut & 0.8585 & 0.9423 & 0.9310 & 0.9441 \\
%     min & 0.8148 & 0.9081 & 0.9018 & 0.9061 \\
%     mean & 0.8377 & 0.9270 & 0.9185 & 0.9232 \\
%     var & 2.7e-4 & 1.3e-4 & 6e-05 & 1.5e-4 \\
%     \hline
%     & \multicolumn{4}{c}{\titlecap{test accuracy for 100 samples}}\Bstrut \\
%     \cline{2-5}
%     max & 0.9637 & 0.9796 & 0.9810 & 0.9805 \\
%     min & 0.9506 & 0.9719 & 0.9702 & 0.9727 \\
%     mean & 0.9582 & 0.9770 & 0.9769 & 0.9783 \\
%     var & 2e-05 & 1e-05 & 1e-05 & 0 \\
%     \hline
%   \end{tabu}
%   \caption{Values of the test accuracy of the model trained 10 times
%     on random training sets containing 1, 10 and 100 data points per
%     class.}
% \end{table}
% \begin{figure}[h]
%   Subfigures: Data/cnn_fashion_fig.pdf (original image),
%   Data/cnn_fashion_fig1.pdf (random zoom), Data/mnist_gen_shear.pdf
%   (random shear), Data/mnist_gen_rotation.pdf (random rotation),
%   Data/mnist_gen_shift.pdf (random positional shift) and
%   Data/mnist5.pdf--mnist9.pdf.
%   \caption{The MNIST data set contains 70.000 images of preprocessed
%     handwritten digits. Of these images 60.000 are used as training
%     images, while the rest are used to validate the models trained.}
% \end{figure}
% \begin{figure}
%   tikzpicture: the red, green and blue channels of Data/klammern_r.jpg,
%   Data/klammern_g.jpg and Data/klammern_b.jpg next to the combined
%   Data/klammern_rgb.jpg.
%   \caption{On the right the red, green and blue channels of the picture
%     are displayed. In order to better visualize the color channels the
%     black and white picture of each channel has been colored in the
%     respective color. Combining the layers results in the image on the
%     left.}
% \end{figure}
% \begin{figure}
%   Subfigures: Data/convnet_fig.pdf and pgfplots of $\tanh(x)$, $\max(0,x)$
%   and $\max(0,x) + 0.1\min(0,x)$.
% \end{figure}
% Standalone tikzpictures: plots of $1/(1+e^{-x})$, $\tanh(x)$ and
% $\max(0,x)$, and a plot of $\cos(x)$.
\newcommand{\abs}[1]{\ensuremath{\left\vert#1\right\vert}}

\newpage

\end{document}

%%% Local Variables:
@@ -6,6 +6,9 @@
In the following there are proofs for some important lemmata of
Section~\ref{sec:theo38}. Further proofs not discussed here can be
found in \textcite{heiss2019}.
The proofs in this section are based on \textcite{heiss2019}. Slight
alterations have been made to accommodate not splitting $f$ into
$f_+$ and $f_-$.
\begin{Theorem}[Proof of Lemma~\ref{theo38}]
\end{Theorem}

@@ -20,9 +23,142 @@


\end{Lemma}

\begin{Proof}[Proof of Lemma~\ref{lem:s3}]
  \[
    \sum_{k \in \kappa : \xi_k < T} \varphi(\xi_k, v_k)
    h_{k,n} = \sum_{\substack{l \in \mathbb{Z} \\ [\delta l, \delta
        (l+1)) \in [C_{g_\xi}^l,\min\{C_{g_\xi}^u, T \}]}}
    \left(\sum_{\substack{k \in \kappa \\ \xi_k \in
        [\delta l , \delta(l+1))}} \varphi(\xi_k, v_k)
    h_{k,n}\right) \approx
  \]
  \[
    \approx \sum_{\substack{l \in \mathbb{Z} \\ [\delta l, \delta
        (l+1)) \in [C_{g_\xi}^l,\min\{C_{g_\xi}^u, T \}]}}
    \left(\sum_{\substack{k \in \kappa \\ \xi_k \in
        [\delta l , \delta(l+1))}} \left(\varphi(\delta l, v_k)
      \frac{1}{n g_\xi (\delta l)} \pm \frac{\varepsilon}{n}\right)
    \frac{\abs{\left\{m \in \kappa : \xi_m \in [\delta l,
          \delta(l+1))\right\}}}{\abs{\left\{m \in \kappa : \xi_m
          \in [\delta l, \delta(l+1))\right\}}}\right)
  \]
  \[
    \approx \sum_{\substack{l \in \mathbb{Z} \\ [\delta l, \delta
        (l+1)) \in [C_{g_\xi}^l,\min\{C_{g_\xi}^u, T \}]}}
    \left(\frac{\sum_{\substack{k \in \kappa \\ \xi_k \in
          [\delta l , \delta(l+1))}}\varphi(\delta l,
      v_k)}{\abs{\left\{m \in \kappa : \xi_m
        \in [\delta l, \delta(l+1))\right\}}}
    \frac{\abs{\left\{m \in \kappa : \xi_m \in [\delta l,
          \delta(l+1))\right\}}}{n g_\xi (\delta l)}\right) \pm \varepsilon
  \]
  The number of kinks in a given interval of length $\delta$ is
  binomially distributed with success probability $\int_{\delta
    l}^{\delta(l+1)} g_\xi(x)\,dx$, hence
  \[
    \mathbb{E} \left[\abs{\left\{m \in \kappa : \xi_m \in [\delta l,
        \delta(l+1))\right\}}\right] = n \int_{\delta
      l}^{\delta(l+1)}g_\xi (x) dx \approx n (\delta g_\xi(\delta l)
    \pm \delta \tilde{\varepsilon}),
  \]
  for any $\delta \leq \delta(\varepsilon, \tilde{\varepsilon})$, since
  $g_\xi$ is uniformly continuous on its
  support by Assumption ...
  As the distribution of $v$ is continuous as well, we get that
  $\mathcal{L}(v_k) = \mathcal{L}(v | \xi = \delta l) \ \forall k \in
  \kappa : \xi_k \in [\delta l, \delta(l+1))$ for $\delta \leq
  \delta(\varepsilon, \tilde{\varepsilon})$. Thus with the law of
  large numbers we get
  \begin{align*}
    &\sum_{k \in \kappa : \xi_k < T} \varphi(\xi_k, v_k)
    h_{k,n} \approx\\
    &\approx \sum_{\substack{l \in \mathbb{Z} \\ [\delta l, \delta
        (l+1)) \in [C_{g_\xi}^l,\min\{C_{g_\xi}^u, T
        \}]}}\left(\mathbb{E}[\phi(\xi, v)|\xi=\delta l]
    \stackrel{\mathbb{P}}{\pm}\right) \delta \left(1 \pm
    \frac{\tilde{\varepsilon}}{g_\xi(\delta l)}\right) \pm \varepsilon
    \\
    &\approx \left(\sum_{\substack{l \in \mathbb{Z} \\ [\delta
        l, \delta
        (l+1)) \in [C_{g_\xi}^l,\min\{C_{g_\xi}^u, T
        \}]}}\mathbb{E}[\phi(\xi, v)|\xi=\delta l] \delta
    \stackrel{\mathbb{P}}{\pm}\tilde{\tilde{\varepsilon}}
    \abs{C_{g_\xi}^u - C_{g_\xi}^l}
    \right)\\
    &\phantom{\approx}\cdot \left(1 \pm
    \frac{\tilde{\varepsilon}}{g_\xi(\delta l)}\right) \pm \varepsilon
  \end{align*}
\end{Proof}

\begin{Lemma}[($L(f_n) \to L(f)$), Heiss, Teichmann, and
  Wutte (2019, Lemma A.11)]
  For any data $(x_i^{\text{train}}, y_i^{\text{train}}) \in
  \mathbb{R}^2, i \in \left\{1,\dots,N\right\}$, let $(f_n)_{n \in
    \mathbb{N}}$ be a sequence of functions that converges point-wise
  in probability to a function $f : \mathbb{R}\to\mathbb{R}$. Then the
  loss $L$ of $f_n$ converges in probability to $L(f)$ as $n$ tends to
  infinity,
  \[
    \plimn L(f_n) = L(f).
  \]
  \proof Cf. ...
\end{Lemma}

\begin{Proof}[Step 2]
  We start by showing that
  \[
    \plimn \tilde{\lambda} \norm{\tilde{w}}_2^2 = \lambda g(0)
    \left(\int \frac{\left(f_g^{*,\lambda''}\right)^2}{g(x)} dx\right).
  \]
  With the definitions of $\tilde{w}$, $\tilde{\lambda}$ and
  $h$ we have
  \begin{align*}
    \tilde{\lambda} \norm{\tilde{w}}_2^2
    &= \tilde{\lambda} \sum_{k \in
      \kappa}\left(f_g^{*,\lambda''}(\xi_k) \frac{h_k
      v_k}{\mathbb{E}[v^2|\xi = \xi_k]}\right)^2\\
    &= \tilde{\lambda} \sum_{k \in
      \kappa}\left(\left(f_g^{*,\lambda''}\right)^2(\xi_k) \frac{h_k
      v_k^2}{\mathbb{E}[v^2|\xi = \xi_k]}\right) h_k\\
    & = \lambda g(0) \sum_{k \in
      \kappa}\left(\left(f_g^{*,\lambda''}\right)^2(\xi_k)\frac{v_k^2}{g_\xi(\xi_k)\mathbb{E}
      [v^2|\xi=\xi_k]}\right)h_k.
  \end{align*}
  By using Lemma~\ref{lem} with $\phi(x,y) =
  \left(f_g^{*,\lambda''}\right)^2(x)\frac{y^2}{g_\xi(x)\mathbb{E}[v^2|\xi=x]}$
  this converges to
  \begin{align*}
    &\plimn \tilde{\lambda}\norm{\tilde{w}}_2^2 = \\
    &=\lambda
    g_\xi(0)\mathbb{E}[v^2|\xi=0]\int_{\supp{g_\xi}}\mathbb{E}\left[
      \left(f_g^{*,\lambda''}\right)^2(\xi)\frac{v^2}{
        g_\xi(\xi)\mathbb{E}[v^2|\xi=x]^2}\Big{|} \xi = x\right]dx\\
    &=\lambda g_\xi(0) \mathbb{E}[v^2|\xi=0] \int_{\supp{g_\xi}}
    \frac{\left(f_g^{*,\lambda''}\right)^2 (x)}{g_\xi(x)
      \mathbb{E}[v^2|\xi=x]} dx \\
    &=\lambda g(0) \int_{\supp{g_\xi}} \frac{\left(f_g^{*,\lambda''}\right)^2}{g(x)}dx.
  \end{align*}
\end{Proof}

\begin{Lemma}[Heiss, Teichmann, and
  Wutte (2019, Lemma A.13)]
  Using the notation of Definition .. and ... the following statement
  holds:
  $\forall \varepsilon \in \mathbb{R}_{>0} : \exists \delta \in
  \mathbb{R}_{>0} : \forall \omega \in \Omega : \forall l, l' \in
  \left\{1,\dots,N\right\} : \forall n \in \mathbb{N}$
  \[
    \left(\abs{\xi_l(\omega) - \xi_{l'}(\omega)} < \delta \wedge
    \text{sign}(v_l(\omega)) = \text{sign}(v_{l'}(\omega))\right)
    \implies \abs{\frac{w_l^{*, \tilde{\lambda}}(\omega)}{v_l(\omega)}
    - \frac{w_{l'}^{*, \tilde{\lambda}}(\omega)}{v_{l'}(\omega)}} <
    \frac{\varepsilon}{n},
  \]
  if we assume that $v_k$ is never zero.
  \proof Given in ...
\end{Lemma}

\input{Appendix_code.tex}

\end{appendices}

@@ -239,11 +239,50 @@ series = {ICISDM '18}
title = {Random Erasing Data Augmentation},
journal = {CoRR},
volume = {abs/1708.04896},
year = 2017,
url = {http://arxiv.org/abs/1708.04896},
archivePrefix = {arXiv},
eprint = {1708.04896},
timestamp = {Mon, 13 Aug 2018 16:47:52 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-1708-04896.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}

@misc{draw_convnet,
title = {Python script for illustrating Convolutional Neural Network (ConvNet)},
howpublished = {\url{https://github.com/gwding/draw_convnet}},
note = {Accessed: 30.08.2020},
author = {Gavin Weiguang Ding},
year = 2018
}

@book{Haykin,
added-at = {2009-06-26T15:25:19.000+0200},
author = {Haykin, Simon},
note = {2nd edition},
publisher = {Prentice Hall},
title = {Neural Networks: {A} Comprehensive Foundation},
year = 1999
}

@book{Goodfellow,
title = {Deep Learning},
author = {Ian Goodfellow and Yoshua Bengio and Aaron Courville},
publisher = {MIT Press},
note = {\url{http://www.deeplearningbook.org}},
year = 2016
}

@article{ruder,
author = {Sebastian Ruder},
title = {An overview of gradient descent optimization algorithms},
journal = {CoRR},
volume = {abs/1609.04747},
year = 2016,
url = {http://arxiv.org/abs/1609.04747},
archivePrefix = {arXiv},
eprint = {1609.04747},
timestamp = {Mon, 13 Aug 2018 16:48:10 +0200},
biburl = {https://dblp.org/rec/journals/corr/Ruder16.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@@ -1,5 +1,7 @@
\section{Application of NN to Higher Complexity Problems}

This section is based on \textcite[Chapter~9]{Goodfellow}.

As neural networks are applied to problems of higher complexity, often
resulting in higher-dimensional input, the number of
parameters in the network rises drastically.
@@ -7,8 +9,7 @@ For very large inputs such as high resolution image data due to the
fully connected nature of the neural network the number of parameters
can ... exceed what is feasible for training and storage.
A way to combat this is by using layers which are only sparsely
connected and share parameters between nodes.\todo{Transition to convolution?}


\subsection{Convolution}

@@ -27,13 +28,13 @@ The convolution operation allows plentiful manipulation of data, with
a simple example being smoothing of real-time data. Consider a sensor
measuring the location of an object (e.g. via GPS). We expect the
output of the sensor to be noisy as a result of a number of factors
that impact the accuracy of the measurements. In order to get a better
estimate of the actual location we want to smooth
the data to reduce the noise. Using convolution for this task, we
can control the significance we want to give each data point. We
might want to give a larger weight to more recent measurements than
to older ones. If we assume these measurements are taken on a discrete
timescale, we need to define convolution for discrete functions. \\Let $f$,
$g: \mathbb{Z} \to \mathbb{R}$, then

\[
@@ -59,7 +60,7 @@ by each pixel being a mixture of base colors. These base colors define
the color-space in which the image is encoded. Often used
color-spaces are RGB (red,
green, blue) or CMYK (cyan, magenta, yellow, black). An example of an
image decomposed into its red, green and blue channels is given in
Figure~\ref{fig:rgb}. Using this
encoding of the image we can define a corresponding discrete function
describing the image, by mapping the coordinates $(x,y)$ of a pixel
@@ -108,13 +109,14 @@ convolution
(I * g)_{x,y,c} = \sum_{i,j,l \in \mathbb{Z}} I_{x-i,y-j,c-l} g_{i,j,l}.
\]

As images are finite in size, the convolution is not well defined for
pixels too close to the border.
Thus the output will be of reduced size, with the new size in each
dimension $d$ being \textit{(size of input in dimension $d$) -
(size of kernel in dimension $d$) + 1}.
In order to ensure the output is of the same size as the input, the
image can be padded in each dimension with 0 entries, which ensures the
convolution is well defined for all pixels of the image.

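As a quick illustration of these two cases, the following snippet (using
\textit{scipy.signal.convolve2d}, which is not used elsewhere in this work)
compares the output sizes of an unpadded and a zero-padded convolution:
\begin{lstlisting}[language=iPython]
import numpy as np
from scipy.signal import convolve2d

image = np.random.rand(28, 28)
kernel = np.ones((3, 3)) / 9.0   # simple smoothing kernel

valid = convolve2d(image, kernel, mode='valid')  # no padding
same = convolve2d(image, kernel, mode='same',
                  boundary='fill', fillvalue=0)  # zero padding

print(valid.shape)  # (26, 26): input size - kernel size + 1 per dimension
print(same.shape)   # (28, 28): padding keeps the input size
\end{lstlisting}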
Simple examples for image manipulation using
convolution are smoothing operations or
@@ -147,8 +149,8 @@ output is given by
O = \sqrt{(I * G)^2 + (I*G^T)^2}
\]
where $\sqrt{\cdot}$ and $\cdot^2$ are applied
component-wise. Examples for convolution of an image with both kernels
are given in Figure~\ref{fig:img_conv}.

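A short sketch of this edge detector is given below; here $G$ is the standard
Sobel kernel, which may differ in sign or orientation convention from the
kernel defined above.
\begin{lstlisting}[language=iPython]
import numpy as np
from scipy.signal import convolve2d

G = np.array([[1, 0, -1],
              [2, 0, -2],
              [1, 0, -1]])   # standard Sobel kernel for one direction

def sobel_edges(image):
    gx = convolve2d(image, G, mode='same', boundary='fill', fillvalue=0)
    gy = convolve2d(image, G.T, mode='same', boundary='fill', fillvalue=0)
    return np.sqrt(gx**2 + gy**2)   # component-wise, as in the formula above
\end{lstlisting}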
\begin{figure}[h]
@@ -222,65 +224,114 @@ As seen in the previous section convolution can lend itself to
manipulation of images or other large data, which motivates its usage in
neural networks.
This is achieved by implementing convolutional layers, where several
trainable filters are applied to the input.
Each node in such a layer corresponds to a pixel of the output of
convolution with one of those filters, on which a bias and activation
function are applied.
Depending on the sizes this can drastically reduce the number of
variables in a layer compared to fully connected ones.
As the variables of the filters are shared among all nodes, a
convolutional layer with input of size $s_i$, output size $s_o$ and
$n$ filters of size $f$ will contain $n f + s_o$ parameters, whereas a
fully connected layer has $(s_i + 1) s_o$ trainable weights.

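To make the difference concrete, the following sketch evaluates both formulas
for a hypothetical layer; the sizes are chosen for illustration only and do
not correspond to any model used in this work.
\begin{lstlisting}[language=iPython]
# Hypothetical sizes, for illustration only.
s_i = 28 * 28       # input size (e.g. a flattened 28x28 image)
s_o = 26 * 26 * 8   # output size for 8 channels of 26x26 pixels
n, f = 8, 3 * 3     # 8 filters of size 3x3

conv_params = n * f + s_o       # shared filter weights plus one bias per output node
dense_params = (s_i + 1) * s_o  # one weight per input (plus bias) for every output node

print(conv_params)   # 5480
print(dense_params)  # 4245280
\end{lstlisting}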
The usage of multiple filters results in multiple outputs of the same
size as the input (or slightly smaller if no padding is used). These
are often called channels.
For convolutional layers that are preceded by convolutional layers the
size of the filter is often chosen to coincide with the number of channels
of the output of the previous layer and is not padded in this
direction.
This results in the channels ``being squashed'' and prevents gaining
additional dimensions\todo{Explain filters with full depth better} in the output.
This can also be used to flatten certain less interesting channels of
the input, as for example color channels.
% Thus filters used in convolutional networks are usually have the same
% amount of dimensions as the input or one more.

A way to additionally reduce the size using convolution is to not apply the
convolution on every pixel, but rather to specify a certain ``stride''
$s$ at which the filter $g$ is moved over the input $I$,
\[
O_{x,y,c} = \sum_{i,j,l \in \mathbb{Z}} I_{sx-i,sy-j,c-l} g_{i,j,l}.
\]

The size and stride for all filters in a layer should be the same in
order to get a uniform tensor as output.
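A small helper illustrating how the stride affects the output size of an
unpadded convolution (this is the general rule stated above, not specific to
any implementation used here):
\begin{lstlisting}[language=iPython]
def conv_output_size(input_size, kernel_size, stride=1):
    """Spatial size of a valid (unpadded) convolution with the given stride."""
    return (input_size - kernel_size) // stride + 1

print(conv_output_size(28, 3, stride=1))  # 26
print(conv_output_size(28, 3, stride=2))  # 13
\end{lstlisting}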
% The size of the filters and the way they are applied can be tuned
% while building the model should be the same for all filters in one
% layer in order for the output being of consistent size in all channels.
% It is common to reduce the d< by not applying the
% filters on each ``pixel'' but rather specify a ``stride'' $s$ at which
% the filter $g$ is moved over the input $I$

% \[
% O_{x,y,c} = \sum_{i,j,l \in \mathbb{Z}} I_{x-i,y-j,c-l} g_{i,j,l}.
% \]

% As seen convolution lends itself for image manipulation. In this
% chapter we will explore how we can incorporate convolution in neural
% networks, and how that might be beneficial.

% Convolutional Neural Networks as described by ... are made up of
% convolutional layers, pooling layers, and fully connected ones. The
% fully connected layers are layers in which each input node is
% connected to each output node which is the structure introduced in
% chapter ...

% In a convolutional layer instead of combining all input nodes for each
% output node, the input nodes are interpreted as a tensor on which a
% kernel is applied via convolution, resulting in the output. Most often
% multiple kernels are used, resulting in multiple output tensors. These
% kernels are the variables, which can be altered in order to fit the
% model to the data. Using multiple kernels it is possible to extract
% different features from the image (e.g. edges -> sobel).

In order to further reduce the size towards the final layer, convolutional
layers are often followed by a pooling layer.
In a pooling layer the input is
reduced in size by extracting a single value from a
neighborhood of pixels, often by taking the maximum value in the
neighborhood (max-pooling). The resulting output size depends on
the offset of the neighborhoods used; this offset is commonly called
the ``stride''\todo{``stride'' used twice}.
The combination of convolution and pooling layers allows for
extraction of features from the input in the form of feature maps while
using relatively few parameters that need to be trained.
An example of this is given in Figure~\ref{fig:feature_map}, which shows
intermediary outputs of a small convolutional neural network consisting
of two convolutional and pooling layers, each with one filter, followed
by two fully connected layers.

|
|
||||||
|
\begin{figure}[h]
\centering
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Figures/Data/mnist0bw.pdf}
\caption{input}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Figures/Data/conv2d_6.pdf}
\caption{convolution}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Figures/Data/max_pooling2d_6.pdf}
\caption{max-pool}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Figures/Data/conv2d_7.pdf}
\caption{convolution}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Figures/Data/max_pooling2d_7.pdf}
\caption{max-pool}
\end{subfigure}
\caption[Feature map]{Intermediary outputs of a
  convolutional neural network, starting with the input and ending
  with the corresponding feature map.}
\label{fig:feature_map}
\end{figure}

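To make the two operations described above concrete, the following sketch applies a single $3\times 3$ kernel and a $2\times 2$ max-pooling step to a grayscale image using plain NumPy. It is only an illustration of the principle and not the implementation used for the models in this thesis; the Sobel-type kernel values and the handling of the image border are assumptions made for the example.
\begin{lstlisting}[language=iPython]
import numpy as np

def conv2d(image, kernel):
    # slide the kernel over the image without padding;
    # as in convolutional layers the kernel is not flipped
    h, w = kernel.shape
    H, W = image.shape
    out = np.zeros((H - h + 1, W - w + 1))
    for i in range(out.shape[0]):
        for j in range(out.shape[1]):
            out[i, j] = np.sum(image[i:i+h, j:j+w] * kernel)
    return out

def max_pool(feature_map, size=2):
    # keep the largest value of each non-overlapping size x size neighborhood
    H, W = feature_map.shape
    H, W = H - H % size, W - W % size
    blocks = feature_map[:H, :W].reshape(H // size, size, W // size, size)
    return blocks.max(axis=(1, 3))

image = np.random.rand(28, 28)        # stand-in for a 28x28 input image
kernel = np.array([[1, 0, -1],
                   [2, 0, -2],
                   [1, 0, -1]])       # edge detecting (Sobel type) kernel
feature_map = max_pool(conv2d(image, kernel))
print(feature_map.shape)              # (13, 13)
\end{lstlisting}
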
\subsubsection{Parallels to the Visual Cortex in Mammals}

@ -295,7 +346,6 @@ arbitrary. ... auge... bla bla
% -Different scale of gradients for vars in different layers -> ADAdelta

\subsection{Stochastic Training Algorithms}

For many applications in which neural networks are used, such as
image classification or segmentation, large training data sets are
essential to capture the nuances of the
@ -303,20 +353,21 @@ data. However as training sets get larger the memory requirement
during training grows with it.
In order to update the weights with the gradient descent algorithm
derivatives of the network with respect to each
variable need to be computed for all data points.
Thus the amount of memory and computing power available limits the
size of the training data that can be efficiently used in fitting the
network. A class of algorithms that augment the gradient descent
algorithm in order to lessen this problem are stochastic gradient
descent algorithms.
Here the full dataset is split into smaller disjoint subsets.
Then in each iteration a (different) subset of data is chosen to
compute the gradient (Algorithm~\ref{alg:sdg}).
The training period until each data point has been considered at least
once in
updating the parameters is commonly called an ``epoch''.
Using subsets reduces the amount of memory required for storing the
necessary values for each update, thus making it possible to use very
large training sets to fit the model.
Additionally the noise introduced on the gradient can improve
the accuracy of the fit as stochastic gradient descent algorithms are
less likely to get stuck on local extrema.
@ -353,7 +404,7 @@ mount of training time.
\end{algorithm}

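The following sketch restates the idea of the algorithm in code: the shuffled training data is split into disjoint subsets and every update only uses the gradient computed on one of them. The function \textit{grad\_loss}, the batch size of 32 and the step size are placeholders assumed for the illustration and do not correspond to a particular implementation.
\begin{lstlisting}[language=iPython]
import numpy as np

def sgd(theta, x_train, y_train, grad_loss,
        epochs=20, batch_size=32, gamma=0.01):
    n = x_train.shape[0]
    for epoch in range(epochs):
        # split the shuffled data into disjoint subsets (mini batches)
        order = np.random.permutation(n)
        for batch in np.array_split(order, max(1, n // batch_size)):
            # approximate gradient computed on the current subset only
            g = grad_loss(theta, x_train[batch], y_train[batch])
            theta = theta - gamma * g
    return theta
\end{lstlisting}
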
In order to illustrate this behavior we modeled a convolutional neural
network to classify handwritten digits. The data set used for this is the
MNIST database of handwritten digits (\textcite{MNIST},
Figure~\ref{fig:MNIST}).
\input{Figures/mnist.tex}
@ -364,15 +415,17 @@ applied with a stride of one.
The first layer consists of 32 filters and the second of 64. Both
pooling layers pool a $2\times 2$ area. The fully connected layer
consists of 256 nodes and the output layer of 10, one for each digit.
All layers except the output layer use ReLU as activation function;
the output layer uses softmax (\ref{def:softmax}).
As loss function categorical crossentropy is used (\ref{eq:cross_entropy}).
The architecture of the convolutional neural network is summarized in
Figure~\ref{fig:mnist_architecture}.

\begin{figure}
\includegraphics[width=\textwidth]{Figures/Data/convnet_fig.pdf}
\caption{Convolutional neural network architecture used to model the
  MNIST handwritten digits dataset. This figure was created using the
  draw\textunderscore convnet Python script by \textcite{draw_convnet}.}
\label{fig:mnist_architecture}
\end{figure}

@ -387,7 +440,7 @@ with gradient descent after 20 epochs.
This is due to the former using a batch size of 32 and thus having
made 1,875 updates to the weights
after the first epoch in comparison to one update. While each of
these updates only uses an approximate
gradient calculated on the subset, it performs far better than the
network using true gradients when training for the same amount of time.
\todo{vergleich training time}
@ -395,6 +448,8 @@ network using true gradients when training for the same mount of time.
\input{Figures/SGD_vs_GD.tex}
\clearpage
\subsection{\titlecap{modified stochastic gradient descent}}
This section is based on \textcite{ruder}.

An inherent problem of the stochastic gradient descent algorithm is
its sensitivity to the learning rate $\gamma$. This results in the
problem of having to find an appropriate learning rate for each problem
@ -606,12 +661,6 @@ global learning rate. This results in .. hyperparameters, however the
algorithm seems to be exceptionally stable with the recommended
parameters of ... and is a very reliable algorithm for training
neural networks.

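The update rule of the \textsc{Adam} algorithm can additionally be sketched in a few lines of Python; the default values $\beta_1 = 0.9$, $\beta_2 = 0.999$ and $\varepsilon = 10^{-8}$ are the ones commonly recommended in the literature and, like the variable names, are assumptions made only for this illustration.
\begin{lstlisting}[language=iPython]
import numpy as np

def adam_update(theta, grad, m, v, t, gamma=0.001,
                beta1=0.9, beta2=0.999, eps=1e-8):
    # exponentially decaying averages of past gradients
    # and past squared gradients
    m = beta1 * m + (1 - beta1) * grad
    v = beta2 * v + (1 - beta2) * grad**2
    # bias correction for the zero initialization of m and v
    m_hat = m / (1 - beta1**t)
    v_hat = v / (1 - beta2**t)
    # update with a per-coordinate adaptive step size
    theta = theta - gamma * m_hat / (np.sqrt(v_hat) + eps)
    return theta, m, v
\end{lstlisting}
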
\begin{algorithm}[H]
\SetAlgoLined
@ -662,7 +711,7 @@ the other algorithms, with AdaGrad and Adelta following... bla bla
% strategies exist. A popular approach in regularizing convolutional neural network
% is \textit{dropout} which has been first introduced in
% \cite{Dropout}
This section is based on ....
Similarly to shallow networks, overfitting can still impact the quality of
convolutional neural networks.
Popular ways to combat this problem for a .. of models are averaging
@ -748,7 +797,7 @@ same or else the network will not learn the desired ...
In the case of handwritten digits for example a too high rotation angle
will ... a nine or six.
The most common transformations are rotation, zoom, shear, brightness,
mirroring. Examples of this are given in Figure~\ref{fig:datagen}.

\begin{figure}[h]
\centering
@ -775,6 +824,7 @@ mirroring.
\caption[Image data generation]{Examples of the manipulations used in ... As all images are
  of the same intensity brightness manipulation does not seem
  ... Additionally mirroring is not used for ... reasons.}
\label{fig:datagen}
\end{figure}

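The transformations themselves are standard image operations. The following sketch shows how a randomly rotated, shifted, brightened and possibly mirrored variant of an image could be produced with NumPy and SciPy; it is not the data generation used for the experiments in this chapter, the parameter ranges are arbitrary choices for the illustration, and (as noted in the caption above) mirroring is not applied in the experiments.
\begin{lstlisting}[language=iPython]
import numpy as np
from scipy import ndimage

def augment(image, rng=np.random.default_rng()):
    # random rotation by up to +/- 15 degrees
    out = ndimage.rotate(image, rng.uniform(-15, 15),
                         reshape=False, mode="nearest")
    # random shift by up to two pixels in each direction
    out = ndimage.shift(out, rng.uniform(-2, 2, size=2), mode="nearest")
    # random brightness change
    out = np.clip(out * rng.uniform(0.8, 1.2), 0.0, 1.0)
    # mirror the image in half of the cases
    if rng.random() < 0.5:
        out = np.fliplr(out)
    return out
\end{lstlisting}
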
In order to compare the benefits obtained from implementing these
@ -824,31 +874,31 @@ zalando, a overview is given in Figure~\ref{fig:fashionMNIST}.
\begin{minipage}{\textwidth}
\small
\begin{tabu} to \textwidth {@{}l*4{X[c]}@{}}
\Tstrut \Bstrut & \textsc{Adam} & D. 0.2 & Gen & Gen.+D. 0.2 \\
\hline
&
\multicolumn{4}{c}{\titlecap{test accuracy for 1 sample}}\Bstrut \\
\cline{2-5}
max \Tstrut & 0.5633 & 0.5312 & \textbf{0.6704} & 0.6604 \\
min & 0.3230 & 0.4224 & 0.4878 & \textbf{0.5175} \\
mean & 0.4570 & 0.4714 & 0.5862 & \textbf{0.6014} \\
var \Bstrut & 0.0040 & \textbf{0.0012} & 0.0036 & 0.0023 \\
\hline
&
\multicolumn{4}{c}{\titlecap{test accuracy for 10 samples}}\Bstrut \\
\cline{2-5}
max \Tstrut & 0.8585 & 0.9423 & 0.9310 & \textbf{0.9441} \\
min & 0.8148 & \textbf{0.9081} & 0.9018 & 0.9061 \\
mean & 0.8377 & \textbf{0.9270} & 0.9185 & 0.9232 \\
var \Bstrut & 2.7e-04 & 1.3e-04 & 6e-05 & 1.5e-04 \\
\hline
&
\multicolumn{4}{c}{\titlecap{test accuracy for 100 samples}}\Bstrut \\
\cline{2-5}
max \Tstrut & 0.9637 & 0.9796 & 0.9810 & \textbf{0.9811} \\
min & 0.9506 & 0.9719 & 0.9702 & \textbf{0.9727} \\
mean & 0.9582 & 0.9770 & 0.9769 & \textbf{0.9783} \\
var \Bstrut & 2e-05 & 1e-05 & 1e-05 & 1e-05 \\
\hline
\end{tabu}
\normalsize
@ -857,40 +907,42 @@ zalando, a overview is given in Figure~\ref{fig:fashionMNIST}.
  on random MNIST handwriting training sets containing 1, 10 and 100
  data points per class after 125 epochs. The mean achieved accuracy
  for the full set employing both overfitting measures is }
\label{table:digitsOF}
\small
\centering
\begin{tabu} to \textwidth {@{}l*4{X[c]}@{}}
\Tstrut \Bstrut & \textsc{Adam} & D. 0.2 & Gen & Gen.+D. 0.2 \\
\hline
&
\multicolumn{4}{c}{\titlecap{test accuracy for 1 sample}}\Bstrut \\
\cline{2-5}
max \Tstrut & 0.4885 & \textbf{0.5613} & 0.5488 & 0.5475 \\
min & 0.3710 & \textbf{0.3858} & 0.3736 & 0.3816 \\
mean \Bstrut & 0.4166 & 0.4838 & 0.4769 & \textbf{0.4957} \\
var & \textbf{0.002} & 0.00294 & 0.00338 & 0.0030 \\
\hline
&
\multicolumn{4}{c}{\titlecap{test accuracy for 10 samples}}\Bstrut \\
\cline{2-5}
max \Tstrut & 0.7370 & 0.7340 & 0.7236 & \textbf{0.7502} \\
min & \textbf{0.6818} & 0.6673 & 0.6709 & 0.6799 \\
mean & 0.7130 & \textbf{0.7156} & 0.7031 & 0.7136 \\
var \Bstrut & 3.2e-04 & 3.4e-04 & 3.2e-04 & 4.5e-04 \\
\hline
&
\multicolumn{4}{c}{\titlecap{test accuracy for 100 samples}}\Bstrut \\
\cline{2-5}
max \Tstrut & 0.8454 & 0.8385 & 0.8456 & \textbf{0.8459} \\
min & 0.8227 & 0.8200 & \textbf{0.8305} & 0.8274 \\
mean & 0.8331 & 0.8289 & 0.8391 & \textbf{0.8409} \\
var \Bstrut & 4e-05 & 4e-05 & 2e-05 & 3e-05 \\
\hline
\end{tabu}
\normalsize
\captionof{table}{Values of the test accuracy of the model trained 10 times
  on random fashion MNIST training sets containing 1, 10 and 100 data points per
  class. The mean achieved accuracy for the full dataset is: ....}
\label{table:fashionOF}
\end{minipage}
\clearpage % if needed/desired
}
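The dropout rate of $0.2$ used in the tables above (column ``D. 0.2'') means that during training each node of the affected layer is dropped, i.e.\ set to zero, with probability $0.2$. A minimal sketch of this mechanism in the commonly used ``inverted'' form, where the surviving activations are rescaled so that nothing has to be changed at test time, is given below; it illustrates the principle only and is not the dropout layer used in the implementation of the models.
\begin{lstlisting}[language=iPython]
import numpy as np

def dropout(activations, rate=0.2, training=True,
            rng=np.random.default_rng()):
    if not training or rate == 0.0:
        return activations        # at test time the layer is the identity
    # each node is kept with probability 1 - rate ...
    mask = rng.random(activations.shape) >= rate
    # ... and the surviving nodes are rescaled (inverted dropout)
    return activations * mask / (1.0 - rate)
\end{lstlisting}
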
@ -908,26 +960,36 @@ This is done in order to have more ... in order to better ... the data
in the model. A diagram of the architecture is given in
Figure~\ref{fig:fashion_MNIST}.

\afterpage{
\noindent
\begin{figure}[h]
\includegraphics[width=\textwidth]{Figures/Data/cnn_fashion_fig.pdf}
\caption{Convolutional neural network architecture used to model the
  fashion MNIST dataset. This figure was created using the
  draw\textunderscore convnet Python script by \textcite{draw_convnet}.}
\label{fig:fashion_MNIST}
\end{figure}
}

For both scenarios the models are trained 10 times on randomly
sampled training sets.
For each scenario the models are trained without overfitting measures
and with combinations
of dropout and datageneration implemented. The Python implementation
of the models and the parameters used for the datageneration are given
in Listing~\ref{lst:handwriting} for the handwriting model and
Listing~\ref{lst:fashion} for the fashion model.

The models are trained for 125 epochs in order
to have enough random
augmentations of the input images present during training
for the networks to fully profit from the additional training data generated.
The test accuracies of the models after
training for 125
epochs are given in Table~\ref{table:digitsOF} for the handwritten digits
and in Table~\ref{table:fashionOF} for the fashion datasets. Additionally the
average test accuracies over the course of learning are given in
Figure~\ref{fig:plotOF_digits} for the handwriting application and
Figure~\ref{fig:plotOF_fashion} for the fashion application.

\begin{figure}[h]
\centering
@ -937,7 +999,7 @@ Figure ... and Figure...
|
|||||||
\begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed,
|
\begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed,
|
||||||
/pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth,
|
/pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth,
|
||||||
height = 0.35\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east},
|
height = 0.35\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east},
|
||||||
ylabel = {Test Accuracy}, cycle
|
xlabel = {epoch},ylabel = {Test Accuracy}, cycle
|
||||||
list/Dark2, every axis plot/.append style={line width
|
list/Dark2, every axis plot/.append style={line width
|
||||||
=1.25pt}]
|
=1.25pt}]
|
||||||
\addplot table
|
\addplot table
|
||||||
@ -970,7 +1032,7 @@ Figure ... and Figure...
|
|||||||
\begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed,
|
\begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed,
|
||||||
/pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth,
|
/pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth,
|
||||||
height = 0.35\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east},
|
height = 0.35\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east},
|
||||||
ylabel = {Test Accuracy}, cycle
|
xlabel = {epoch},ylabel = {Test Accuracy}, cycle
|
||||||
list/Dark2, every axis plot/.append style={line width
|
list/Dark2, every axis plot/.append style={line width
|
||||||
=1.25pt}]
|
=1.25pt}]
|
||||||
\addplot table
|
\addplot table
|
||||||
@ -986,7 +1048,7 @@ Figure ... and Figure...
|
|||||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||||
{Figures/Data/adam_datagen_dropout_02_10.mean};
|
{Figures/Data/adam_datagen_dropout_02_10.mean};
|
||||||
|
|
||||||
|
|
||||||
\addlegendentry{\footnotesize{Default.}}
|
\addlegendentry{\footnotesize{Default.}}
|
||||||
\addlegendentry{\footnotesize{D. 0.2}}
|
\addlegendentry{\footnotesize{D. 0.2}}
|
||||||
\addlegendentry{\footnotesize{G.}}
|
\addlegendentry{\footnotesize{G.}}
|
||||||
@ -1025,18 +1087,143 @@ Figure ... and Figure...
|
|||||||
\caption{100 samples per class}
\vspace{.25cm}
\end{subfigure}
\caption{Mean test accuracies of the models fitting the sampled MNIST
  handwriting datasets over the 125 epochs of training.}
\label{fig:plotOF_digits}
\end{figure}

|
|
||||||
\begin{figure}[h]
|
\begin{figure}[h]
|
||||||
\centering
|
\centering
|
||||||
\missingfigure{datagen fashion}
|
\small
|
||||||
\caption{Sample pictures of the mnist fashion dataset, one per
|
\begin{subfigure}[h]{\textwidth}
|
||||||
class.}
|
\begin{tikzpicture}
|
||||||
\label{mnist fashion}
|
\begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed,
|
||||||
|
/pgf/number format/precision=3},tick style =
|
||||||
|
{draw = none}, width = \textwidth,
|
||||||
|
height = 0.35\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east},
|
||||||
|
xlabel = {epoch},ylabel = {Test Accuracy}, cycle
|
||||||
|
list/Dark2, every axis plot/.append style={line width
|
||||||
|
=1.25pt}]
|
||||||
|
\addplot table
|
||||||
|
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||||
|
{Figures/Data/fashion_dropout_0_1.mean};
|
||||||
|
\addplot table
|
||||||
|
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||||
|
{Figures/Data/fashion_dropout_2_1.mean};
|
||||||
|
\addplot table
|
||||||
|
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||||
|
{Figures/Data/fashion_datagen_dropout_0_1.mean};
|
||||||
|
\addplot table
|
||||||
|
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||||
|
{Figures/Data/fashion_datagen_dropout_2_1.mean};
|
||||||
|
|
||||||
|
|
||||||
|
\addlegendentry{\footnotesize{Default}}
|
||||||
|
\addlegendentry{\footnotesize{D. 0.2}}
|
||||||
|
\addlegendentry{\footnotesize{G.}}
|
||||||
|
\addlegendentry{\footnotesize{G. + D. 0.2}}
|
||||||
|
\addlegendentry{\footnotesize{D. 0.4}}
|
||||||
|
\end{axis}
|
||||||
|
\end{tikzpicture}
|
||||||
|
\caption{1 sample per class}
|
||||||
|
\vspace{0.25cm}
|
||||||
|
\end{subfigure}
|
||||||
|
\begin{subfigure}[h]{\textwidth}
|
||||||
|
\begin{tikzpicture}
|
||||||
|
\begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed,
|
||||||
|
/pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth,
|
||||||
|
height = 0.35\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east},
|
||||||
|
xlabel = {epoch},ylabel = {Test Accuracy}, cycle
|
||||||
|
list/Dark2, every axis plot/.append style={line width
|
||||||
|
=1.25pt}, ymin = {0.62}]
|
||||||
|
\addplot table
|
||||||
|
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||||
|
{Figures/Data/fashion_dropout_0_10.mean};
|
||||||
|
\addplot table
|
||||||
|
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||||
|
{Figures/Data/fashion_dropout_2_10.mean};
|
||||||
|
\addplot table
|
||||||
|
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||||
|
{Figures/Data/fashion_datagen_dropout_0_10.mean};
|
||||||
|
\addplot table
|
||||||
|
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||||
|
{Figures/Data/fashion_datagen_dropout_2_10.mean};
|
||||||
|
|
||||||
|
|
||||||
|
\addlegendentry{\footnotesize{Default.}}
|
||||||
|
\addlegendentry{\footnotesize{D. 0.2}}
|
||||||
|
\addlegendentry{\footnotesize{G.}}
|
||||||
|
\addlegendentry{\footnotesize{G + D. 0.2}}
|
||||||
|
\end{axis}
|
||||||
|
\end{tikzpicture}
|
||||||
|
\caption{10 samples per class}
|
||||||
|
\end{subfigure}
|
||||||
|
\begin{subfigure}[h]{\textwidth}
|
||||||
|
\begin{tikzpicture}
|
||||||
|
\begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed,
|
||||||
|
/pgf/number format/precision=3},tick style = {draw = none}, width = 0.9875\textwidth,
|
||||||
|
height = 0.35\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east},
|
||||||
|
xlabel = {epoch}, ylabel = {Test Accuracy}, cycle
|
||||||
|
list/Dark2, every axis plot/.append style={line width
|
||||||
|
=1.25pt}, ymin = {0.762}]
|
||||||
|
\addplot table
|
||||||
|
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||||
|
{Figures/Data/fashion_dropout_0_100.mean};
|
||||||
|
\addplot table
|
||||||
|
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||||
|
{Figures/Data/fashion_dropout_2_100.mean};
|
||||||
|
\addplot table
|
||||||
|
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||||
|
{Figures/Data/fashion_datagen_dropout_0_100.mean};
|
||||||
|
\addplot table
|
||||||
|
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||||
|
{Figures/Data/fashion_datagen_dropout_2_100.mean};
|
||||||
|
|
||||||
|
\addlegendentry{\footnotesize{Default.}}
|
||||||
|
\addlegendentry{\footnotesize{D. 0.2}}
|
||||||
|
\addlegendentry{\footnotesize{G.}}
|
||||||
|
\addlegendentry{\footnotesize{G + D. 0.2}}
|
||||||
|
\end{axis}
|
||||||
|
\end{tikzpicture}
|
||||||
|
\caption{100 samples per class}
|
||||||
|
\vspace{.25cm}
|
||||||
|
\end{subfigure}
|
||||||
|
\caption{Mean test accuracies of the models fitting the sampled
  fashion MNIST datasets over the 125 epochs of training.}
|
||||||
|
\label{fig:plotOF_fashion}
|
||||||
\end{figure}
|
\end{figure}
|
||||||
|
|
||||||
|
It can be seen in ... and ... that the usage of .. overfitting
measures greatly improves the accuracy for small datasets. However for
the smallest size of one datapoint per class generating more data
... outperforms dropout, with only a ... improvement being seen by the
implementation of dropout whereas datageneration improves the accuracy
by... . On the other hand the implementation of dropout seems to
reduce the variance in the model accuracy, as the variance in accuracy
for the dropout model is less than .. while the variance of the
datagen .. model is nearly the same. The model with datageneration
... a reduction in variance with the addition of dropout.

For the slightly larger training sets of ten samples per class the
difference between the two measures seems smaller. Here the
improvement in accuracy
seen by dropout is slightly larger than the one of
datageneration. However for the larger sized training set the variance
in test accuracies is lower for the model with datageneration than the
one with dropout.

The results for the training sets with 100 samples per class resemble
the ones for the sets with 10 per class.

Overall the models ... both measures to combat overfitting seem to
perform considerably well compared to the ones without. The usage of
these measures has great potential in improving models used for
applications with limited training data. Additional tables and figures
visualizing the effects on the logarithmic crossentropy rather than
the test accuracy are given in the appendix.\todo{figs für appendix}

\clearpage
\section{Conclusion}
@ -1044,7 +1231,12 @@ Figure ... and Figure...
|
|||||||
\item generate more data, GAN etc \textcite{gan}
\item Transfer learning, use network trained on different task and
  repurpose it / train it with the training data \textcite{transfer_learning}
\item random erasing fashion mnist 96.35\% accuracy
  \textcite{random_erasing}
\item However the \textsc{Adam} algorithm can have problems with high
  variance of the adaptive learning rate early in training.
  \textcite{rADAM} try to address these issues with the Rectified Adam
  algorithm
\end{itemize}

|
@ -1,19 +1,23 @@
|
|||||||
|
|
||||||
\section{Introduction to Neural Networks}

This chapter is based on \textcite[Chapter~6]{Goodfellow} and \textcite{Haykin}.

Neural Networks (NN) are a mathematical construct inspired by the
structure of brains in mammals. They consist of an array of neurons that
receive inputs and compute an accumulated output. These neurons are
arranged in layers, with one input and output layer
and an arbitrary
amount of hidden layers between them.
The amount of neurons in the in- and output layers corresponds to the
desired dimensions of in- and outputs of the model.
In conventional neural networks the information is fed forward from the
input layer towards the output layer, hence they are often called feed
forward networks. Each neuron in a layer has the outputs of all
neurons in the preceding layer as input and computes an accumulated
value from these (fully connected). An
illustration of an example neural network is given in
Figure~\ref{fig:nn} and one of a neuron in Figure~\ref{fig:neuron}.

\tikzset{%
  every neuron/.style={
@ -39,17 +43,17 @@ Figure~\ref{fig:nn} and one of a neuron in Figure~\ref{fig:neuron}
|
|||||||
{\arrow[scale=1.5,>=stealth]{>}}},postaction={decorate}}}
|
{\arrow[scale=1.5,>=stealth]{>}}},postaction={decorate}}}
|
||||||
|
|
||||||
\foreach \m/\l [count=\y] in {1,2,3,missing,4}
|
\foreach \m/\l [count=\y] in {1,2,3,missing,4}
|
||||||
\node [every neuron/.try, neuron \m/.try] (input-\m) at (0,2.5-\y) {};
|
\node [every neuron/.try, neuron \m/.try] (input-\m) at (0,2.55-\y*0.85) {};
|
||||||
|
|
||||||
\foreach \m [count=\y] in {1,missing,2}
|
\foreach \m [count=\y] in {1,missing,2}
|
||||||
\node [every neuron/.try, neuron \m/.try ] (hidden1-\m) at (2,2-\y*1.25) {};
|
\node [every neuron/.try, neuron \m/.try ] (hidden1-\m) at (2.5,2.5-\y*1.25) {};
|
||||||
|
|
||||||
\foreach \m [count=\y] in {1,missing,2}
|
\foreach \m [count=\y] in {1,missing,2}
|
||||||
\node [every neuron/.try, neuron \m/.try ] (hidden2-\m) at (5,2-\y*1.25) {};
|
\node [every neuron/.try, neuron \m/.try ] (hidden2-\m) at (5,2.5-\y*1.25) {};
|
||||||
|
|
||||||
\foreach \m [count=\y] in {1,missing,2}
|
\foreach \m [count=\y] in {1,missing,2}
|
||||||
\node [every neuron/.try, neuron \m/.try ] (output-\m) at (7,1.5-\y) {};
|
\node [every neuron/.try, neuron \m/.try ] (output-\m) at (7,1.5-\y*0.75) {};
|
||||||
|
|
||||||
\foreach \l [count=\i] in {1,2,3,d_i}
|
\foreach \l [count=\i] in {1,2,3,d_i}
|
||||||
\draw [myptr] (input-\i)+(-1,0) -- (input-\i)
|
\draw [myptr] (input-\i)+(-1,0) -- (input-\i)
|
||||||
node [above, midway] {$x_{\l}$};
|
node [above, midway] {$x_{\l}$};
|
||||||
@ -96,20 +100,20 @@ Figure~\ref{fig:nn} and one of a neuron in Figure~\ref{fig:neuron}
The arguably most important feature of neural networks that sets them
apart from linear models is the activation function implemented in the
neurons. As seen in Figure~\ref{fig:neuron}, on the weighted sum of the
inputs an activation function $\sigma$ is applied, resulting in the
output of the $k$-th neuron in a layer $l$
being given by
\[
  o_{l,k} = \sigma\left(b_{l,k} + \sum_{j=1}^m w_{l,k,j} o_{l-1,j}\right)
\]
for weights $w_{l,k,j}$ and biases $b_{l,k}$.
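In code the outputs of a whole layer can be computed at once as a matrix-vector product followed by the element-wise application of $\sigma$. The following sketch is a minimal illustration of this formula, where the matrix $W_l$ collects the weights $w_{l,k,j}$ and the vector $b_l$ the biases $b_{l,k}$; the layer sizes, the random initialization and the choice of the tangens hyperbolicus as activation are assumptions made only for the example.
\begin{lstlisting}[language=iPython]
import numpy as np

def layer_output(o_prev, W, b, sigma=np.tanh):
    # o_{l,k} = sigma(b_{l,k} + sum_j w_{l,k,j} * o_{l-1,j}) for all k at once
    return sigma(W @ o_prev + b)

rng = np.random.default_rng(0)
x = rng.normal(size=3)                           # input of dimension 3
W1, b1 = rng.normal(size=(4, 3)), np.zeros(4)    # hidden layer with 4 neurons
W2, b2 = rng.normal(size=(2, 4)), np.zeros(2)    # output layer with 2 neurons
out = layer_output(layer_output(x, W1, b1), W2, b2)
\end{lstlisting}
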
The activation function is usually chosen nonlinear (a linear one
would result in the entire model collapsing into a linear one\todo{beweis?}) which
allows it to better model data where the relation of in- and output is
of nonlinear nature.
There are two types of activation functions, saturating and
nonsaturating ones. Popular examples for the former are sigmoid
functions, where most commonly the standard logistic function or the tangens
hyperbolicus are used,
as they have easy to compute derivatives which is desirable for gradient
based optimization algorithms. The standard logistic function (often
@ -119,25 +123,26 @@ referred to simply as sigmoid function) is given by
\]
and has a range of $[0,1]$. Its usage as an activation function is
motivated by modeling neurons which
are close to inactive until a certain threshold is hit and then grow in
intensity until they are fully
active. This is similar to the behavior of neurons in
brains\todo{besser schreiben}. The tangens hyperbolicus is given by
\[
  \tanh(x) = \frac{e^{2x}-1}{e^{2x}+1}
\]
and has a range of $[-1,1]$.
The downside of these saturating activation functions is that given
their saturating nature their derivatives are close to zero for large or small
input values. This can slow or hinder the progress of gradient based methods.

The nonsaturating activation functions commonly used are the rectified
linear unit (ReLU) or the leaky ReLU. The ReLU is given by
\[
  r(x) = \max\left\{0, x\right\}.
\]
This has the benefit of having a constant derivative for values larger
than zero. However the derivative being zero for negative values has
the same downside for
fitting the model with gradient based methods. The leaky ReLU is
an attempt to counteract this problem by assigning a small constant
derivative to all values smaller than zero and for scalar $\alpha$ is given by
@ -146,50 +151,6 @@ derivative to all values smaller than zero and for scalar $\alpha$ is given by
\]
In order to illustrate these functions plots of them are given in Figure~\ref{fig:activation}.

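For reference, the four activation functions shown in Figure~\ref{fig:activation} translate directly into NumPy expressions; the value $\alpha = 0.1$ for the leaky ReLU is the one used in the plot and otherwise an arbitrary choice.
\begin{lstlisting}[language=iPython]
import numpy as np

def logistic(x):               # standard logistic (sigmoid) function
    return 1.0 / (1.0 + np.exp(-x))

def tanh(x):                   # tangens hyperbolicus
    return np.tanh(x)

def relu(x):                   # rectified linear unit
    return np.maximum(0.0, x)

def leaky_relu(x, alpha=0.1):  # small constant slope for negative inputs
    return np.where(x >= 0.0, x, alpha * x)
\end{lstlisting}
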
\begin{figure}
\begin{tikzpicture}[x=1.5cm, y=1.5cm, >=stealth]
@ -278,11 +239,56 @@ In order to illustrate these functions plots of them are given in Figure~\ref{fi
\label{fig:neuron}
\end{figure}

|
|
||||||
|
\begin{figure}
|
||||||
|
\centering
|
||||||
|
\begin{subfigure}{.45\linewidth}
|
||||||
|
\centering
|
||||||
|
\begin{tikzpicture}
|
||||||
|
\begin{axis}[enlargelimits=false, ymin=0, ymax = 1, width=\textwidth]
|
||||||
|
\addplot [domain=-5:5, samples=101,unbounded coords=jump]{1/(1+exp(-x))};
|
||||||
|
\end{axis}
|
||||||
|
\end{tikzpicture}
|
||||||
|
\caption{\titlecap{standard logistic function}}
|
||||||
|
\end{subfigure}
|
||||||
|
\begin{subfigure}{.45\linewidth}
|
||||||
|
\centering
|
||||||
|
\begin{tikzpicture}
|
||||||
|
\begin{axis}[enlargelimits=false, width=\textwidth]
|
||||||
|
\addplot[domain=-5:5, samples=100]{tanh(x)};
|
||||||
|
\end{axis}
|
||||||
|
\end{tikzpicture}
|
||||||
|
\caption{\titlecap{tangens hyperbolicus}}
|
||||||
|
\end{subfigure}
|
||||||
|
\begin{subfigure}{.45\linewidth}
|
||||||
|
\centering
|
||||||
|
\begin{tikzpicture}
|
||||||
|
\begin{axis}[enlargelimits=false, width=\textwidth,
|
||||||
|
ytick={0,2,4},yticklabels={\hphantom{4.}0,2,4}, ymin=-1]
|
||||||
|
\addplot[domain=-5:5, samples=100]{max(0,x)};
|
||||||
|
\end{axis}
|
||||||
|
\end{tikzpicture}
|
||||||
|
\caption{ReLU}
|
||||||
|
\end{subfigure}
|
||||||
|
\begin{subfigure}{.45\linewidth}
|
||||||
|
\centering
|
||||||
|
\begin{tikzpicture}
|
||||||
|
\begin{axis}[enlargelimits=false, width=\textwidth, ymin=-1,
|
||||||
|
ytick={0,2,4},yticklabels={$\hphantom{-5.}0$,2,4}]
|
||||||
|
\addplot[domain=-5:5, samples=100]{max(0,x)+ 0.1*min(0,x)};
|
||||||
|
\end{axis}
|
||||||
|
\end{tikzpicture}
|
||||||
|
\caption{Leaky ReLU, $\alpha = 0.1$}
|
||||||
|
\end{subfigure}
|
||||||
|
\caption{Plots of the activation functions}
|
||||||
|
\label{fig:activation}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
\clearpage
\subsection{Training Neural Networks}

As neural networks are a parametric model we need to fit the
parameters to the input
data in order to get meaningful results from the network. To be able to
do this we first need to discuss how we interpret the output of the
neural network.

@ -304,7 +310,7 @@ neural network.
\subsubsection{\titlecap{nonlinearity in last layer}}

Given the nature of the neural net the outputs of the last layer are
real numbers. For regression tasks this is desirable, for
classification problems however some transformations might be
necessary.
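One such transformation for multi-class problems is the softmax function (\ref{def:softmax}), which maps the real valued outputs of the last layer to a probability distribution over the classes. A minimal sketch is given below; subtracting the maximum before exponentiating is a common implementation detail for numerical stability and not part of the definition.
\begin{lstlisting}[language=iPython]
import numpy as np

def softmax(z):
    # shift by the maximum for numerical stability, the result is unchanged
    e = np.exp(z - np.max(z))
    # normalize, so the outputs are positive and sum to one
    return e / np.sum(e)
\end{lstlisting}
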
@ -382,8 +388,9 @@ the first class and $1-f(x)$ for the second class.
\subsubsection{Error Measurement}

In order to train the network we need to be able to make an assessment
about the quality of predictions using some error measure.
The choice of the error
function is highly dependent on the type of the problem. For
regression problems a commonly used error measure is the mean squared
error (MSE)
@ -391,10 +398,9 @@ which for a function $f$ and data $(x_i,y_i), i=1,\dots,n$ is given by
\[
  MSE(f) = \frac{1}{n} \sum_{i=1}^n \left(f(x_i) - y_i\right)^2.
\]
However depending on the problem error measures with different
properties might be needed, for example in some contexts it is
required to consider a proportional rather than absolute error.

As discussed above the output of a neural network for a classification
problem can be interpreted as a probability distribution over the classes
@ -405,14 +411,15 @@ which for two discrete distributions $p, q$ with the same realm $C$ is given by
\[
  H(p, q) = \sum_{c \in C} p(c) \ln\left(\frac{1}{q(c)}\right),
\]
comparing $q$ to a target density $p$.
For a data set $(x_i,y_i), i = 1,\dots,n$ where each $y_{i,c}$
corresponds to the probability of class $c$ given $x_i$ and predictor
$f$ we get the loss function
\[
  CE(f) = \sum_{i=1}^n H(y_i, f(x_i)).
\]

\todo{Den satz einbauen}
- Maximum Likelihood
- Derivative with softmax is pseudo linear -> fast improvements possible
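Written out in code the loss $CE(f)$ amounts to the following, where \textit{y} holds the target distributions $y_{i,c}$ (for labeled data usually one-hot vectors) and \textit{p} the predicted probabilities $f(x_i)$; the small constant added inside the logarithm only avoids $\ln(0)$ and is an implementation choice, not part of the definition.
\begin{lstlisting}[language=iPython]
import numpy as np

def cross_entropy(y, p, eps=1e-12):
    # sum over data points and classes: sum_i sum_c y_{i,c} * ln(1 / p_{i,c})
    return np.sum(y * np.log(1.0 / (p + eps)))
\end{lstlisting}
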
@ -422,9 +429,10 @@ Trying to find the optimal parameter for fitting the model to the data
can be a hard problem. Given the complex nature of a neural network
with many layers and neurons it is hard to predict the impact of
single parameters on the accuracy of the output.
Thus using numeric optimization algorithms is the only
feasible way to fit the model. An attractive algorithm for training
neural networks is gradient descent where each parameter
$\theta_i$\todo{parameter name?} is
iteratively changed according to the gradient regarding the error
measure and a step size $\gamma$. For this all parameters are
initialized (often randomly or close to zero) and then iteratively
@ -452,16 +460,18 @@ number of iterations or a desired upper limit for the error measure.
The algorithm for gradient descent is given in
Algorithm~\ref{alg:gd}. In the context of fitting a neural network
$f_\theta$ corresponds to an error measurement of a neural network
$\mathcal{NN}_{\theta}$ where $\theta$ is a vector
containing all the weights and biases of the network.
As can be seen this requires computing the derivative of the network
with regard to each variable. With the number of variables getting
large in networks with multiple layers of high neuron count, naively
computing the derivatives can get quite memory and computationally
expensive.
By using the chain rule and exploiting the layered structure we can
compute the parameter update much more efficiently; this practice is
called backpropagation and was introduced by
\textcite{backprop}\todo{nachsehen ob richtige quelle}.

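To sketch how the chain rule is exploited, the following toy example fits a network with one hidden sigmoid layer and a linear output layer to data by gradient descent on the MSE; the gradients of the weights are obtained by propagating the error backwards through the layers. The data, layer sizes and step size are arbitrary assumptions, and the example is an illustration of the principle rather than the backpropagation routine of any particular library.
\begin{lstlisting}[language=iPython]
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

rng = np.random.default_rng(0)
x = rng.normal(size=(50, 3))                   # 50 data points, 3 inputs
y = np.sin(x.sum(axis=1, keepdims=True))       # some nonlinear target

W1, b1 = rng.normal(size=(3, 8)), np.zeros(8)  # hidden layer with 8 neurons
W2, b2 = rng.normal(size=(8, 1)), np.zeros(1)  # linear output layer
gamma = 0.1                                    # step size

for step in range(2000):
    # forward pass
    h = sigmoid(x @ W1 + b1)
    out = h @ W2 + b2
    err = out - y                              # derivative of the squared error
    # backward pass: chain rule applied layer by layer
    grad_W2 = h.T @ err / len(x)
    grad_b2 = err.mean(axis=0)
    d_h = (err @ W2.T) * h * (1 - h)           # sigmoid'(z) = h * (1 - h)
    grad_W1 = x.T @ d_h / len(x)
    grad_b1 = d_h.mean(axis=0)
    # gradient descent update of all parameters
    for p, g in ((W1, grad_W1), (b1, grad_b1), (W2, grad_W2), (b2, grad_b2)):
        p -= gamma * g
\end{lstlisting}
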
% \subsubsection{Backpropagation}

@ -478,6 +488,7 @@ introduced by \textcite{backprop}.
\[
  \frac{\partial L(...)}{}
\]
Backpropagation still to be written up.
\todo{Backprop richtig aufschreiben}

%%% Local Variables:
|
8
TeX/main.lot
Normal file
8
TeX/main.lot
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
\boolfalse {citerequest}\boolfalse {citetracker}\boolfalse {pagetracker}\boolfalse {backtracker}\relax
|
||||||
|
\babel@toc {english}{}
|
||||||
|
\defcounter {refsection}{0}\relax
|
||||||
|
\contentsline {table}{\numberline {4.1}{\ignorespaces Performance metrics of the networks trained in Figure~\ref {fig:sgd_vs_gd} after 20 training epochs.\relax }}{30}{table.caption.34}%
|
||||||
|
\defcounter {refsection}{0}\relax
|
||||||
|
\contentsline {table}{\numberline {4.2}{\ignorespaces Values of the test accuracy of the model trained 10 times on random MNIST handwriting training sets containing 1, 10 and 100 data points per class after 125 epochs. The mean achieved accuracy for the full set employing both overfitting measures is \relax }}{41}{table.4.2}%
|
||||||
|
\defcounter {refsection}{0}\relax
|
||||||
|
\contentsline {table}{\numberline {4.3}{\ignorespaces Values of the test accuracy of the model trained 10 times on random fashion MNIST training sets containing 1, 10 and 100 data points per class. The mean achieved accuracy for the full dataset is: ....\relax }}{41}{table.4.3}%
|
@ -41,6 +41,10 @@
|
|||||||
\usepackage{afterpage}
|
\usepackage{afterpage}
|
||||||
\usepackage{xcolor}
|
\usepackage{xcolor}
|
||||||
\usepackage{chngcntr}
|
\usepackage{chngcntr}
|
||||||
|
\usepackage{hyperref}
|
||||||
|
\hypersetup{
|
||||||
|
linktoc=all, %set to all if you want both sections and subsections linked
|
||||||
|
}
|
||||||
|
|
||||||
\captionsetup[sub]{justification=centering}
|
\captionsetup[sub]{justification=centering}
|
||||||
|
|
||||||
@ -192,6 +196,7 @@
|
|||||||
\newtheorem{Algorithm}[Theorem]{Algorithm}
|
\newtheorem{Algorithm}[Theorem]{Algorithm}
|
||||||
\newtheorem{Example}[Theorem]{Example}
|
\newtheorem{Example}[Theorem]{Example}
|
||||||
\newtheorem{Assumption}[Theorem]{Assumption}
|
\newtheorem{Assumption}[Theorem]{Assumption}
|
||||||
|
\newtheorem{Proof}[Theorem]{Proof}
|
||||||
|
|
||||||
|
|
||||||
\DeclareMathOperator*{\plim}{\mathbb{P}\text{-}\lim}
|
\DeclareMathOperator*{\plim}{\mathbb{P}\text{-}\lim}
|
||||||
@ -238,7 +243,8 @@
|
|||||||
|
|
||||||
\begin{center}
|
\begin{center}
|
||||||
\vspace{1cm}
|
\vspace{1cm}
|
||||||
\huge \textbf{TITLE Neural Network bla blub langer Titel}\\
|
\huge \textbf{\titlecap{neural networks and their application on
|
||||||
|
higher complexity problems}}\\
|
||||||
\vspace{1cm}
|
\vspace{1cm}
|
||||||
\huge \textbf{Tim Tobias Arndt}\\
|
\huge \textbf{Tim Tobias Arndt}\\
|
||||||
\vspace{1cm}
|
\vspace{1cm}
|
||||||
@ -251,6 +257,7 @@
|
|||||||
\tableofcontents
|
\tableofcontents
|
||||||
\clearpage
|
\clearpage
|
||||||
\listoffigures
|
\listoffigures
|
||||||
|
\listoftables
|
||||||
\listoftodos
|
\listoftodos
|
||||||
\newpage
|
\newpage
|
||||||
\pagenumbering{arabic}
|
\pagenumbering{arabic}
|
||||||
|
157
TeX/theo_3_8.tex
157
TeX/theo_3_8.tex
@ -15,7 +15,8 @@ In order to get some understanding of the behavior of neural networks
|
|||||||
we examine a simple class of networks in this chapter. We consider
|
we examine a simple class of networks in this chapter. We consider
|
||||||
networks that contain only one hidden layer and have a single output
|
networks that contain only one hidden layer and have a single output
|
||||||
node. We call these networks shallow neural networks.
|
node. We call these networks shallow neural networks.
|
||||||
\begin{Definition}[Shallow neural network]
|
\begin{Definition}[Shallow neural network, Heiss, Teichmann, and
|
||||||
|
Wutte (2019, Definition 1.4)]
|
||||||
For an input dimension $d$ and a Lipschitz continuous activation function $\sigma:
|
For an input dimension $d$ and a Lipschitz continuous activation function $\sigma:
|
||||||
\mathbb{R} \to \mathbb{R}$ we define a shallow neural network with
|
\mathbb{R} \to \mathbb{R}$ we define a shallow neural network with
|
||||||
$n$ hidden nodes as
|
$n$ hidden nodes as
|
||||||
@ -156,9 +157,9 @@ However, this behavior is often not desired as overfit models generally
|
|||||||
have bad generalization properties, especially if noise is present in
|
have bad generalization properties, especially if noise is present in
|
||||||
the data. This effect is illustrated in
|
the data. This effect is illustrated in
|
||||||
Figure~\ref{fig:overfit}. Here a shallow neural network that perfectly fits the
|
Figure~\ref{fig:overfit}. Here a shallow neural network that perfectly fits the
|
||||||
training data regarding MSE is \todo{Formulierung}
|
training data is
|
||||||
constructed according to the proof of Theorem~\ref{theo:overfit} and
|
constructed according to the proof of Theorem~\ref{theo:overfit} and
|
||||||
compared to a regression spline
|
compared to a cubic smoothing spline
|
||||||
(Definition~\ref{def:wrs}). While the neural network
|
(Definition~\ref{def:wrs}). While the neural network
|
||||||
fits the data better than the spline, the spline represents the
|
fits the data better than the spline, the spline represents the
|
||||||
underlying mechanism that was used to generate the data more accurately. The better
|
underlying mechanism that was used to generate the data more accurately. The better
|
||||||
@ -213,7 +214,7 @@ plot coordinates {
|
|||||||
(\textcolor{blue}{blue dots}) the neural network constructed
|
(\textcolor{blue}{blue dots}) the neural network constructed
|
||||||
according to the proof of Theorem~\ref{theo:overfit} (black) and the
|
according to the proof of Theorem~\ref{theo:overfit} (black) and the
|
||||||
underlying signal (\textcolor{red}{red}). While the network has no
|
underlying signal (\textcolor{red}{red}). While the network has no
|
||||||
bias a regression spline (black dashed) fits the data much
|
bias, a cubic smoothing spline (black dashed) fits the data much
|
||||||
better. For a test set of size 20 with uniformly distributed $x$
|
better. For a test set of size 20 with uniformly distributed $x$
|
||||||
values and responses of the same fashion as the training data, the MSE of the neural network is
|
values and responses of the same fashion as the training data, the MSE of the neural network is
|
||||||
0.30, while the MSE of the spline is only 0.14, thus generalizing
|
0.30, while the MSE of the spline is only 0.14, thus generalizing
|
||||||
@ -227,26 +228,35 @@ plot coordinates {
|
|||||||
Networks}
|
Networks}
|
||||||
|
|
||||||
|
|
||||||
This section is based on \textcite{heiss2019}. We will analyze the
|
This section is based on \textcite{heiss2019}.
|
||||||
connection between randomized shallow
|
|
||||||
Neural Networks with one dimensional input with a ReLU as activation
|
|
||||||
function for all neurons and regression splines.
|
|
||||||
% \[
|
|
||||||
% \sigma(x) = \max\left\{0,x\right\}.
|
|
||||||
% \]
|
|
||||||
We will see that the punishment of the size of the weights in training
|
|
||||||
the randomized shallow
|
|
||||||
Neural Network will result in a learned function that minimizes the second
|
|
||||||
derivative as the amount of hidden nodes is grown to infinity. In order
|
|
||||||
to properly formulate this relation we will first need to introduce
|
|
||||||
some definitions, all neural networks introduced in the following will
|
|
||||||
use a ReLU as activation at all neurons.
|
|
||||||
|
|
||||||
A randomized shallow network is characterized by only the weight
|
We will analyze shallow neural networks with a one dimensional input where the parameters in the
|
||||||
parameter of the output layer being trainable, whereas the other
|
hidden layer are randomized, resulting in only the weights of the
|
||||||
parameters are random numbers.
|
output layer being trainable.
|
||||||
|
Additionally, we assume all neurons use a ReLU as activation function
|
||||||
|
and call such networks randomized shallow neural networks.
|
||||||
|
|
||||||
\begin{Definition}[Randomized shallow neural network]
|
% We will analyze the
|
||||||
|
% connection between randomized shallow
|
||||||
|
% Neural Networks with one dimensional input with a ReLU as activation
|
||||||
|
% function for all neurons and cubic smoothing splines.
|
||||||
|
% % \[
|
||||||
|
% % \sigma(x) = \max\left\{0,x\right\}.
|
||||||
|
% % \]
|
||||||
|
% We will see that the punishment of the size of the weights in training
|
||||||
|
% the randomized shallow
|
||||||
|
% Neural Network will result in a learned function that minimizes the second
|
||||||
|
% derivative as the amount of hidden nodes is grown to infinity. In order
|
||||||
|
% to properly formulate this relation we will first need to introduce
|
||||||
|
% some definitions, all neural networks introduced in the following will
|
||||||
|
% use a ReLU as activation at all neurons.
|
||||||
|
|
||||||
|
% A randomized shallow network is characterized by only the weight
|
||||||
|
% parameter of the output layer being trainable, whereas the other
|
||||||
|
% parameters are random numbers.
|
||||||
|
|
||||||
|
\begin{Definition}[Randomized shallow neural network, Heiss, Teichmann, and
|
||||||
|
Wutte (2019, Definition 2.1)]
|
||||||
For an input dimension $d$, let $n \in \mathbb{N}$ be the number of
|
For an input dimension $d$, let $n \in \mathbb{N}$ be the number of
|
||||||
hidden nodes and $v(\omega) \in \mathbb{R}^{d \times n}, b(\omega)
|
hidden nodes and $v(\omega) \in \mathbb{R}^{d \times n}, b(\omega)
|
||||||
\in \mathbb{R}^n$ randomly drawn weights. Then for a weight vector
|
\in \mathbb{R}^n$ randomly drawn weights. Then for a weight vector
|
||||||
@ -257,15 +267,29 @@ parameters are random numbers.
|
|||||||
\]
|
\]
|
||||||
\label{def:rsnn}
|
\label{def:rsnn}
|
||||||
\end{Definition}
|
\end{Definition}
|
||||||
We call a one dimensional randomized shallow neural network were the
|
% We call a one dimensional randomized shallow neural network were the
|
||||||
$L^2$ norm of the trainable weights $w$ are penalized in the loss
|
% are penalized in the loss
|
||||||
function ridge penalized neural networks.
|
% function ridge penalized neural networks.
|
||||||
|
|
||||||
|
|
||||||
|
We will prove that, as the number of hidden nodes tends to infinity, a randomized shallow neural network will
|
||||||
|
converge to a function that minimizes the distance to the training
|
||||||
|
data with regard to its second derivative,
|
||||||
|
if the $L^2$ norm of the trainable weights $w$ is
|
||||||
|
penalized in the loss function.
|
||||||
|
We call such a network that is fitted according to the MSE and a penalty term for
|
||||||
|
the size of the weights a ridge penalized neural network.
|
||||||
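As a sketch of this training objective (the precise formulation is given in Definition~\ref{def:rpnn} below; the names used here are illustrative), the ridge penalized loss over the trainable output weights $w$ could be evaluated as:
\begin{lstlisting}[language=iPython]
import numpy as np

def ridge_loss(w, hidden_out, y_train, lam):
    # hidden_out: matrix of hidden node outputs, one row per training point;
    # the network output is linear in the trainable output weights w
    predictions = hidden_out @ w
    mse = np.mean((predictions - y_train) ** 2)
    # ridge penalty on the size of the trainable weights
    return mse + lam * np.sum(w ** 2)
\end{lstlisting}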
|
% $\lam$
|
||||||
|
% We call a randomized shallow neural network trained on MSE and
|
||||||
|
% punished for the amount of the weights $w$ according to a
|
||||||
|
% ... $\lambda$ ridge penalized neural networks.
|
||||||
|
|
||||||
% We call a randomized shallow neural network where the size of the trainable
|
% We call a randomized shallow neural network where the size of the trainable
|
||||||
% weights is punished in the error function a ridge penalized
|
% weights is punished in the error function a ridge penalized
|
||||||
% neural network. For a tuning parameter $\tilde{\lambda}$ .. the extent
|
% neural network. For a tuning parameter $\tilde{\lambda}$ .. the extent
|
||||||
% of penalization we get:
|
% of penalization we get:
|
||||||
\begin{Definition}[Ridge penalized Neural Network]
|
\begin{Definition}[Ridge penalized Neural Network, Heiss, Teichmann, and
|
||||||
|
Wutte (2019, Definition 3.2)]
|
||||||
\label{def:rpnn}
|
\label{def:rpnn}
|
||||||
Let $\mathcal{RN}_{w, \omega}$ be a randomized shallow neural
|
Let $\mathcal{RN}_{w, \omega}$ be a randomized shallow neural
|
||||||
network, as introduced in Definition~\ref{def:rsnn} and tuning
|
network, as introduced in Definition~\ref{def:rsnn} and tuning
|
||||||
@ -309,13 +333,13 @@ $\omega$ used to express the realised random parameters will no longer
|
|||||||
be explicitly mentioned.
|
be explicitly mentioned.
|
||||||
|
|
||||||
We call a function that minimizes the cubic distance between training points
|
We call a function that minimizes the squared distance between training points
|
||||||
and the function with respect\todo{richtiges wort} to the second
|
and the function together with a penalty on the second
|
||||||
derivative of the function a regression spline.
|
derivative of the function, a cubic smoothing spline.
|
||||||
|
|
||||||
\begin{Definition}[Regression Spline]
|
\begin{Definition}[Cubic Smoothing Spline]
|
||||||
Let $x_i^{\text{train}}, y_i^{\text{train}} \in \mathbb{R}, i \in
|
Let $x_i^{\text{train}}, y_i^{\text{train}} \in \mathbb{R}, i \in
|
||||||
\left\{1,\dots,N\right\}$ be training data. For a given $\lambda \in
|
\left\{1,\dots,N\right\}$ be training data. For a given $\lambda \in
|
||||||
\mathbb{R}$ the regression spline is given by
|
\mathbb{R}$ the cubic smoothing spline is given by
|
||||||
\[
|
\[
|
||||||
f^{*,\lambda} :\in \argmin_{f \in
|
f^{*,\lambda} :\in \argmin_{f \in
|
||||||
\mathcal{C}^2}\left\{\sum_{i=1}^N
|
\mathcal{C}^2}\left\{\sum_{i=1}^N
|
||||||
@ -326,10 +350,10 @@ derivative of the function a regression spline.
|
|||||||
|
|
||||||
We will show that for specific hyperparameters the ridge penalized
|
We will show that for specific hyperparameters the ridge penalized
|
||||||
shallow neural networks converge to a slightly modified variant of the
|
shallow neural networks converge to a slightly modified variant of the
|
||||||
regression spline. We will need to incorporate the densities of the
|
cubic smoothing spline. We will need to incorporate the densities of the
|
||||||
random parameters in the loss function of the spline to ensure
|
random parameters in the loss function of the spline to ensure
|
||||||
convergence. Thus we define
|
convergence. Thus we define
|
||||||
the adapted weighted regression spline where the loss for the second
|
the adapted weighted cubic smoothing spline where the loss for the second
|
||||||
derivative is weighted by a function $g$ and the support of the second
|
derivative is weighted by a function $g$ and the support of the second
|
||||||
derivative of $f$ has to be a subset of the support of $g$. The formal
|
derivative of $f$ has to be a subset of the support of $g$. The formal
|
||||||
definition is given in Definition~\ref{def:wrs}.
|
definition is given in Definition~\ref{def:wrs}.
|
||||||
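To make the weighted objective concrete, a discretized version can be evaluated numerically; the following is a rough sketch (finite differences and a grid-based integral, not the exact functional of Definition~\ref{def:wrs}):
\begin{lstlisting}[language=iPython]
import numpy as np

def weighted_spline_loss(f_vals, x_grid, x_train, y_train, lam, g_vals):
    # f_vals, g_vals: values of a candidate function f and of the weight
    # function g on the (sorted) grid x_grid
    f_train = np.interp(x_train, x_grid, f_vals)
    data_term = np.sum((y_train - f_train) ** 2)
    # second derivative of f approximated by finite differences
    f2 = np.gradient(np.gradient(f_vals, x_grid), x_grid)
    # weighted penalty on the second derivative, integrated over the grid
    penalty = np.trapz(g_vals * f2 ** 2, x_grid)
    return data_term + lam * penalty
\end{lstlisting}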
@ -340,19 +364,19 @@ definition is given in Definition~\ref{def:wrs}.
|
|||||||
% spline that allows for weighting the penalty term for the second
|
% spline that allows for weighting the penalty term for the second
|
||||||
% derivative with a weight function $g$. This is needed to ...the
|
% derivative with a weight function $g$. This is needed to ...the
|
||||||
% distributions of the random parameters ... We call this the adapted
|
% distributions of the random parameters ... We call this the adapted
|
||||||
% weighted regression spline.
|
% weighted cubic smoothing spline.
|
||||||
|
|
||||||
% Now we take a look at weighted regression splines. Later we will prove
|
% Now we take a look at weighted cubic smoothing splines. Later we will prove
|
||||||
% that the ridge penalized neural network as defined in
|
% that the ridge penalized neural network as defined in
|
||||||
% Definition~\ref{def:rpnn} converges a weighted regression spline, as
|
% Definition~\ref{def:rpnn} converges a weighted cubic smoothing spline, as
|
||||||
% the amount of hidden nodes is grown to inifity.
|
% the amount of hidden nodes is grown to inifity.
|
||||||
|
|
||||||
\begin{Definition}[Adapted Weighted regression spline]
|
\begin{Definition}[Adapted weighted cubic smoothing spline]
|
||||||
\label{def:wrs}
|
\label{def:wrs}
|
||||||
Let $x_i^{\text{train}}, y_i^{\text{train}} \in \mathbb{R}, i \in
|
Let $x_i^{\text{train}}, y_i^{\text{train}} \in \mathbb{R}, i \in
|
||||||
\left\{1,\dots,N\right\}$ be training data. For a given $\lambda \in \mathbb{R}_{>0}$
|
\left\{1,\dots,N\right\}$ be training data. For a given $\lambda \in \mathbb{R}_{>0}$
|
||||||
and a function $g: \mathbb{R} \to \mathbb{R}_{>0}$ the weighted
|
and a function $g: \mathbb{R} \to \mathbb{R}_{>0}$ the weighted
|
||||||
regression spline $f^{*, \lambda}_g$ is given by
|
cubic smoothing spline $f^{*, \lambda}_g$ is given by
|
||||||
|
|
||||||
\[
|
\[
|
||||||
f^{*, \lambda}_g :\in \argmin_{\substack{f \in \mathcal{C}^2(\mathbb{R})
|
f^{*, \lambda}_g :\in \argmin_{\substack{f \in \mathcal{C}^2(\mathbb{R})
|
||||||
@ -370,14 +394,13 @@ and smoothness or low second derivative. For $g \equiv 1$ and $\lambda \to 0$ th
|
|||||||
resulting function $f^{*, 0+}$ will interpolate the training data while minimizing
|
resulting function $f^{*, 0+}$ will interpolate the training data while minimizing
|
||||||
the second derivative. Such a function is known as cubic spline
|
the second derivative. Such a function is known as cubic spline
|
||||||
interpolation.
|
interpolation.
|
||||||
\todo{cite cubic spline}
|
|
||||||
|
|
||||||
\[
|
\[
|
||||||
f^{*, 0+} \text{ smooth spline interpolation: }
|
f^{*, 0+} \text{ smooth spline interpolation: }
|
||||||
\]
|
\]
|
||||||
\[
|
\[
|
||||||
f^{*, 0+} \coloneqq \lim_{\lambda \to 0+} f^{*, \lambda}_1 \in
|
f^{*, 0+} \coloneqq \lim_{\lambda \to 0+} f^{*, \lambda}_1 \in
|
||||||
\argmin_{\substack{f \in \mathcal{C}^2\mathbb{R}, \\ f(x_i^{\text{train}}) =
|
\argmin_{\substack{f \in \mathcal{C}^2(\mathbb{R}), \\ f(x_i^{\text{train}}) =
|
||||||
y_i^{\text{train}}}} \left( \int_{\mathbb{R}} (f''(x))^2 dx\right).
|
y_i^{\text{train}}}} \left( \int_{\mathbb{R}} (f''(x))^2 dx\right).
|
||||||
\]
|
\]
|
||||||
|
|
||||||
@ -385,16 +408,17 @@ For $\lambda \to \infty$, on the other hand, $f_g^{*,\lambda}$ converges
|
|||||||
to linear regression of the data.
|
to linear regression of the data.
|
||||||
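Both limiting cases can be illustrated numerically; a small sketch using SciPy (the training data here is made up for illustration):
\begin{lstlisting}[language=iPython]
import numpy as np
from scipy.interpolate import CubicSpline

x_train = np.array([-2.0, -1.0, 0.0, 1.0, 2.0])
y_train = np.array([0.5, -0.3, 0.1, 0.8, -0.2])

# lambda -> 0: cubic spline interpolation of the training data
interpolant = CubicSpline(x_train, y_train, bc_type="natural")

# lambda -> infinity: linear regression of the data
slope, intercept = np.polyfit(x_train, y_train, deg=1)

x = np.linspace(-2.5, 2.5, 200)
f_small_lambda = interpolant(x)
f_large_lambda = slope * x + intercept
\end{lstlisting}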
|
|
||||||
We use two intermediary functions in order to show the convergence of
|
We use two intermediary functions in order to show the convergence of
|
||||||
the ridge penalized shallow neural network to adapted regression splines.
|
the ridge penalized shallow neural network to adapted cubic smoothing splines.
|
||||||
% In order to show that ridge penalized shallow neural networks converge
|
% In order to show that ridge penalized shallow neural networks converge
|
||||||
% to adapted regression splines for a growing amount of hidden nodes we
|
% to adapted cubic smoothing splines for a growing amount of hidden nodes we
|
||||||
% define two intermediary functions.
|
% define two intermediary functions.
|
||||||
One is a smooth approximation of
|
One is a smooth approximation of
|
||||||
the neural network, the other a randomized shallow neural network designed
|
the neural network, the other a randomized shallow neural network designed
|
||||||
to approximate a spline.
|
to approximate a spline.
|
||||||
In order to properly construct these functions we need to take the points
|
In order to properly construct these functions we need to take the points
|
||||||
of the network into consideration where the TRAJECTORY changes or
|
of the network into consideration where the trajectory of the learned
|
||||||
their points of discontinuity
|
function changes
|
||||||
|
(or the points where its derivative is discontinuous).
|
||||||
As we use the ReLU activation, the derivative of the function learned by the
|
As we use the ReLU activation, the derivative of the function learned by the
|
||||||
network will possess points of discontinuity where a neuron in the hidden
|
network will possess points of discontinuity where a neuron in the hidden
|
||||||
layer becomes active (its output changes from 0 to a positive value). We formalize these points
|
layer becomes active (its output changes from 0 to a positive value). We formalize these points
|
||||||
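Assuming hidden node $k$ computes $\sigma(v_k x + b_k)$ with the ReLU $\sigma$, it becomes active where $v_k x + b_k$ crosses zero, i.e. at $x = -b_k / v_k$; a quick sketch for randomly drawn parameters (the distributions here are only illustrative):
\begin{lstlisting}[language=iPython]
import numpy as np

rng = np.random.default_rng(0)
n = 10                          # number of hidden nodes, illustrative
v = rng.uniform(-5, 5, size=n)  # hidden weights
b = rng.uniform(-5, 5, size=n)  # hidden biases

# points where the ReLU input v_k * x + b_k changes sign,
# i.e. where the learned function has a kink
kinks = -b / v
print(np.sort(kinks))
\end{lstlisting}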
@ -452,8 +476,8 @@ satisfies $\int_{\mathbb{R}}\kappa_x dx = 1$. While $f^w$ looks highly
|
|||||||
similar to a convolution, it differs slightly as the kernel $\kappa_x(s)$
|
similar to a convolution, it differs slightly as the kernel $\kappa_x(s)$
|
||||||
is dependent on $x$. Therefore only $f^w = (\mathcal{RN}_w *
|
is dependent on $x$. Therefore only $f^w = (\mathcal{RN}_w *
|
||||||
\kappa_x)(x)$ is well defined, while $\mathcal{RN}_w * \kappa$ is not.
|
\kappa_x)(x)$ is well defined, while $\mathcal{RN}_w * \kappa$ is not.
|
||||||
We use $f^{w^{*,\tilde{\lambda}}}$ do describe the spline
|
We use $f^{w^{*,\tilde{\lambda}}}$ to describe the spline
|
||||||
approximating the ... ridge penalized network
|
approximating the ridge penalized network
|
||||||
$\mathrm{RN}^{*,\tilde{\lambda}}$.
|
$\mathrm{RN}^{*,\tilde{\lambda}}$.
|
||||||
|
|
||||||
Next we construct a randomized shallow neural network which
|
Next we construct a randomized shallow neural network which
|
||||||
@ -465,7 +489,7 @@ parameters. In order to achieve this we ...
|
|||||||
\label{def:sann}
|
\label{def:sann}
|
||||||
Let $\mathcal{RN}$ be a randomized shallow neural network according
|
Let $\mathcal{RN}$ be a randomized shallow neural network according
|
||||||
to Definition~\ref{def:rsnn} and $f^{*, \lambda}_g$ be the weighted
|
to Definition~\ref{def:rsnn} and $f^{*, \lambda}_g$ be the weighted
|
||||||
regression spline as introduced in Definition~\ref{def:wrs}. Then
|
cubic smoothing spline as introduced in Definition~\ref{def:wrs}. Then
|
||||||
the randomized shallow neural network approximating $f^{*,
|
the randomized shallow neural network approximating $f^{*,
|
||||||
\lambda}_g$ is given by
|
\lambda}_g$ is given by
|
||||||
\[
|
\[
|
||||||
@ -538,7 +562,6 @@ introduce it and the corresponding induced norm.
|
|||||||
\[
|
\[
|
||||||
\norm{u^{(\alpha)}}_{L^p} < \infty.
|
\norm{u^{(\alpha)}}_{L^p} < \infty.
|
||||||
\]
|
\]
|
||||||
\todo{feritg machen}
|
|
||||||
\label{def:sobonorm}
|
\label{def:sobonorm}
|
||||||
The natural norm of the Sobolev space is given by
|
The natural norm of the Sobolev space is given by
|
||||||
\[
|
\[
|
||||||
@ -556,10 +579,10 @@ introduce it and the corresponding induced norm.
|
|||||||
|
|
||||||
With the important definitions and assumptions in place we can now
|
With the important definitions and assumptions in place we can now
|
||||||
formulate the main theorem concerning the convergence of ridge penalized
|
formulate the main theorem concerning the convergence of ridge penalized
|
||||||
random neural networks to adapted regression splines when the
|
random neural networks to adapted cubic smoothing splines when the
|
||||||
parameters are chosen accordingly.
|
parameters are chosen accordingly.
|
||||||
|
|
||||||
\begin{Theorem}[Ridge weight penaltiy corresponds to weighted regression spline]
|
\begin{Theorem}[Ridge weight penalty corresponds to weighted cubic smoothing spline]
|
||||||
\label{theo:main1}
|
\label{theo:main1}
|
||||||
For $N \in \mathbb{N}$ arbitrary training data
|
For $N \in \mathbb{N}$ arbitrary training data
|
||||||
\(\left(x_i^{\text{train}}, y_i^{\text{train}}
|
\(\left(x_i^{\text{train}}, y_i^{\text{train}}
|
||||||
@ -725,12 +748,12 @@ provided in the appendix.
|
|||||||
% \end{align*}
|
% \end{align*}
|
||||||
\end{Lemma}
|
\end{Lemma}
|
||||||
|
|
||||||
\begin{Lemma}[Step 0]
|
\begin{Lemma}
|
||||||
For any $\lambda > 0$, training data $(x_i^{\text{train}},
|
For any $\lambda > 0$, training data $(x_i^{\text{train}},
|
||||||
y_i^{\text{train}}) \in \mathbb{R}^2$, with $ i \in
|
y_i^{\text{train}}) \in \mathbb{R}^2$, with $ i \in
|
||||||
\left\{1,\dots,N\right\}$ and a subset $K \subset \mathbb{R}$, the spline approximating randomized
|
\left\{1,\dots,N\right\}$ and a subset $K \subset \mathbb{R}$, the spline approximating randomized
|
||||||
shallow neural network $\mathcal{RN}_{\tilde{w}}$ converges to the
|
shallow neural network $\mathcal{RN}_{\tilde{w}}$ converges to the
|
||||||
regression spline $f^{*, \lambda}_g$ in
|
cubic smoothing spline $f^{*, \lambda}_g$ in
|
||||||
$\norm{.}_{W^{1,\infty}(K)}$ as the node count $n$ increases,
|
$\norm{.}_{W^{1,\infty}(K)}$ as the node count $n$ increases,
|
||||||
\begin{equation}
|
\begin{equation}
|
||||||
\label{eq:s0}
|
\label{eq:s0}
|
||||||
@ -767,11 +790,12 @@ provided in the appendix.
|
|||||||
\end{align*}
|
\end{align*}
|
||||||
By the fundamental theorem of calculus and $\supp(f') \subset
|
By the fundamental theorem of calculus and $\supp(f') \subset
|
||||||
\supp(f)$, (\ref{eq:s0}) follows with Lemma~\ref{lem:pieq}.
|
\supp(f)$, (\ref{eq:s0}) follows with Lemma~\ref{lem:pieq}.
|
||||||
|
\todo{is the 0 important?}
|
||||||
\qed
|
\qed
|
||||||
\label{lem:s0}
|
\label{lem:s0}
|
||||||
\end{Lemma}
|
\end{Lemma}
|
||||||
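A statement like (\ref{eq:s0}) can be sanity checked numerically by approximating the $W^{1,\infty}(K)$ distance on a grid; the following sketch is only an illustration and not part of the proof:
\begin{lstlisting}[language=iPython]
import numpy as np

def w1_inf_distance(f, g, K=(-np.pi, np.pi), m=2001):
    # approximate ||f - g||_{W^{1,infty}(K)} as
    # max( sup |f - g|, sup |(f - g)'| ) on an equidistant grid
    x = np.linspace(K[0], K[1], m)
    diff = f(x) - g(x)
    d_diff = np.gradient(diff, x)
    return max(np.max(np.abs(diff)), np.max(np.abs(d_diff)))
\end{lstlisting}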
|
|
||||||
\begin{Lemma}[Step 2]
|
\begin{Lemma}
|
||||||
For any $\lambda > 0$ and training data $(x_i^{\text{train}},
|
For any $\lambda > 0$ and training data $(x_i^{\text{train}},
|
||||||
y_i^{\text{train}}) \in \mathbb{R}^2, \, i \in
|
y_i^{\text{train}}) \in \mathbb{R}^2, \, i \in
|
||||||
\left\{1,\dots,N\right\}$, we have
|
\left\{1,\dots,N\right\}$, we have
|
||||||
@ -784,7 +808,7 @@ provided in the appendix.
|
|||||||
\label{lem:s2}
|
\label{lem:s2}
|
||||||
\end{Lemma}
|
\end{Lemma}
|
||||||
|
|
||||||
\begin{Lemma}[Step 3]
|
\begin{Lemma}
|
||||||
For any $\lambda > 0$ and training data $(x_i^{\text{train}},
|
For any $\lambda > 0$ and training data $(x_i^{\text{train}},
|
||||||
y_i^{\text{train}}) \in \mathbb{R}^2, \, i \in
|
y_i^{\text{train}}) \in \mathbb{R}^2, \, i \in
|
||||||
\left\{1,\dots,N\right\}$, with $w^*$ as
|
\left\{1,\dots,N\right\}$, with $w^*$ as
|
||||||
@ -798,7 +822,7 @@ provided in the appendix.
|
|||||||
\label{lem:s3}
|
\label{lem:s3}
|
||||||
\end{Lemma}
|
\end{Lemma}
|
||||||
|
|
||||||
\begin{Lemma}[Step 4]
|
\begin{Lemma}
|
||||||
For any $\lambda > 0$ and training data $(x_i^{\text{train}},
|
For any $\lambda > 0$ and training data $(x_i^{\text{train}},
|
||||||
y_i^{\text{train}}) \in \mathbb{R}^2, \, i \in
|
y_i^{\text{train}}) \in \mathbb{R}^2, \, i \in
|
||||||
\left\{1,\dots,N\right\}$, with $w^*$ and $\tilde{\lambda}$ as
|
\left\{1,\dots,N\right\}$, with $w^*$ and $\tilde{\lambda}$ as
|
||||||
@ -812,7 +836,7 @@ provided in the appendix.
|
|||||||
\label{lem:s4}
|
\label{lem:s4}
|
||||||
\end{Lemma}
|
\end{Lemma}
|
||||||
|
|
||||||
\begin{Lemma}[Step 7]
|
\begin{Lemma}
|
||||||
For any $\lambda > 0$ and training data $(x_i^{\text{train}},
|
For any $\lambda > 0$ and training data $(x_i^{\text{train}},
|
||||||
y_i^{\text{train}}) \in \mathbb{R}^2, \, i \in
|
y_i^{\text{train}}) \in \mathbb{R}^2, \, i \in
|
||||||
\left\{1,\dots,N\right\}$, for any sequence of functions $f^n \in
|
\left\{1,\dots,N\right\}$, for any sequence of functions $f^n \in
|
||||||
@ -876,10 +900,10 @@ We can now use Lemma~\ref{lem:s7} to conclude that
|
|||||||
\begin{equation}
|
\begin{equation}
|
||||||
\plimn \norm{f^{w^{*,\tilde{\lambda}}} - f^{*,\lambda}_g}
|
\plimn \norm{f^{w^{*,\tilde{\lambda}}} - f^{*,\lambda}_g}
|
||||||
_{W^{1,\infty}} = 0.
|
_{W^{1,\infty}} = 0.
|
||||||
\label{eq:main2}
|
\label{eq:main4}
|
||||||
\end{equation}
|
\end{equation}
|
||||||
Now by using the triangle inequality with Lemma~\ref{lem:s3} and
|
Now by using the triangle inequality with Lemma~\ref{lem:s3} and
|
||||||
(\ref{eq:main2}) we get
|
(\ref{eq:main4}) we get
|
||||||
\begin{align*}
|
\begin{align*}
|
||||||
\plimn \norm{\mathcal{RN}^{*, \tilde{\lambda}} - f_g^{*,\lambda}}
|
\plimn \norm{\mathcal{RN}^{*, \tilde{\lambda}} - f_g^{*,\lambda}}
|
||||||
\leq& \plimn \bigg(\norm{\mathcal{RN}^{*, \tilde{\lambda}} -
|
\leq& \plimn \bigg(\norm{\mathcal{RN}^{*, \tilde{\lambda}} -
|
||||||
@ -892,13 +916,16 @@ We now know that randomized shallow neural networks behave similar to
|
|||||||
spline regression if we regularize the size of the weights during
|
spline regression if we regularize the size of the weights during
|
||||||
training.
|
training.
|
||||||
\textcite{heiss2019} further explore a connection between ridge penalized
|
\textcite{heiss2019} further explore a connection between ridge penalized
|
||||||
networks and randomized shallow neural networks which are trained
|
networks and randomized shallow neural networks trained using gradient
|
||||||
which are only trained for a certain amount of epoch using gradient
|
|
||||||
descent.
|
descent.
|
||||||
And ... that the effect of weight regularization can be achieved by
|
They come to the conclusion that the effect of weight regularization
|
||||||
training for a certain amount of iterations this ... between adapted
|
can be achieved by stopping the training of the randomized shallow
|
||||||
weighted regression splines and randomized shallow neural networks
|
neural network early, with the amount of epochs being proportional to
|
||||||
where training is stopped early.
|
the punishment for weight size.
|
||||||
|
This suggests that randomized shallow neural networks trained for a certain
|
||||||
|
amount of iterations converge for an increasing amount of nodes to
|
||||||
|
cubic smoothing splines with appropriate weights.
|
||||||
|
\todo{look this up again to check how exactly it was stated}
|
||||||
|
|
||||||
\newpage
|
\newpage
|
||||||
\subsection{Simulations}
|
\subsection{Simulations}
|
||||||
@ -936,7 +963,7 @@ would equate to $g(x) = \frac{\mathbb{E}[v_k^2|\xi_k = x]}{10}$. In
|
|||||||
order to utilize the
|
order to utilize the
|
||||||
smoothing spline implemented in Mathlab, $g$ has been simplified to $g
|
smoothing spline implemented in Mathlab, $g$ has been simplified to $g
|
||||||
\equiv \frac{1}{10}$ instead. For all figures $f_1^{*, \lambda}$ has
|
\equiv \frac{1}{10}$ instead. For all figures $f_1^{*, \lambda}$ has
|
||||||
been calculated with Matlab's ..... As ... minimizes
|
been calculated with Matlab's 'smoothingspline', as this minimizes
|
||||||
\[
|
\[
|
||||||
\bar{\lambda} \sum_{i=1}^N(y_i^{train} - f(x_i^{train}))^2 + (1 -
|
\bar{\lambda} \sum_{i=1}^N(y_i^{train} - f(x_i^{train}))^2 + (1 -
|
||||||
\bar{\lambda}) \int (f''(x))^2 dx
|
\bar{\lambda}) \int (f''(x))^2 dx
|
||||||
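The two parametrizations can be related by rescaling the objective, assuming the fit indeed minimizes exactly the convex combination above:
\[
  \bar{\lambda} \sum_{i=1}^N(y_i^{train} - f(x_i^{train}))^2 + (1 -
  \bar{\lambda}) \int (f''(x))^2 dx
  = \bar{\lambda} \left( \sum_{i=1}^N(y_i^{train} - f(x_i^{train}))^2 +
  \frac{1 - \bar{\lambda}}{\bar{\lambda}} \int (f''(x))^2 dx \right),
\]
so both formulations share the same minimizer when $\lambda = \frac{1 - \bar{\lambda}}{\bar{\lambda}}$, or equivalently $\bar{\lambda} = \frac{1}{1 + \lambda}$.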
@ -946,7 +973,7 @@ the smoothing parameter used for fitting is $\bar{\lambda} =
|
|||||||
the networks is chosen as defined in Theorem~\ref{theo:main1} and each
|
the networks is chosen as defined in Theorem~\ref{theo:main1} and each
|
||||||
one is trained on the full training data for 5000 epochs using
|
one is trained on the full training data for 5000 epochs using
|
||||||
gradient descent. The
|
gradient descent. The
|
||||||
results are given in Figure~\ref{fig:rs_vs_rs}, here it can be seen that in
|
results are given in Figure~\ref{fig:rn_vs_rs}; here it can be seen that in
|
||||||
the interval of the training data $[-\pi, \pi]$, the neural network and
|
the interval of the training data $[-\pi, \pi]$, the neural network and
|
||||||
smoothing spline are nearly identical, coinciding with the proposition.
|
smoothing spline are nearly identical, coinciding with the proposition.
|
||||||
|
|
||||||
|