final 2.0

master
Tobias Arndt 4 years ago
parent 2ef7cda1dd
commit 2b10ef56e3

@ -1,13 +1,13 @@
\section{Implementations}
In this section the implementations models used are given.
The randomized shallow neural network used in Section~\ref{sec:conv} are
implemented in Scala. No preexisting frameworks were used to ensure
In this section, the implementations of the models used are given.
The randomized shallow neural network used in Section~\ref{sec:conv} is
implemented in Scala. No pre-existing frameworks were used to ensure
the implementation adhered to the definitions used in Theorem~\ref{theo:main1}.
The neural networks used in Section~\ref{sec:cnn} are implemented in python using
The neural networks used in Section~\ref{sec:cnn} are implemented in Python using
the Keras framework provided by TensorFlow. TensorFlow is a library
containing highly efficient GPU implementations of a wide variety
tensor operations, such as convolution as well as efficient algorithms
containing highly efficient GPU implementations of a wide variety of
tensor operations and algorithms
for training neural networks.% (computing derivatives, updating parameters).
\vspace*{-0.5cm}
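To give an impression of how such a model is assembled with the Keras
interface of TensorFlow, the following is a minimal sketch; the layer
sizes and the architecture are illustrative placeholders, not the
networks defined in Section~\ref{sec:cnn}.
\begin{lstlisting}[language=Python]
# Minimal sketch of a Keras model built on TensorFlow; layer sizes are
# illustrative placeholders, not the architectures used in this thesis.
import tensorflow as tf
from tensorflow.keras import layers

model = tf.keras.Sequential([
    layers.Conv2D(32, kernel_size=5, activation="relu",
                  input_shape=(28, 28, 1)),
    layers.MaxPooling2D(pool_size=2),
    layers.Flatten(),
    layers.Dense(10, activation="softmax"),
])
model.compile(optimizer="adam",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])
\end{lstlisting}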
@ -200,13 +200,13 @@ def get_random_sample(a, b, number_of_samples=10):
return (np.asarray(x).reshape(-1, 28, 28, 1),
np.asarray(y).reshape(10*number_of_samples,1))
\end{lstlisting}
\caption{Python code used to generate the datasets containing a
certain amount of random datapoints per class.}
\caption{Python code used to generate the data sets containing a
certain number of random data points per class.}
\end{lstfloat}
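Only the tail of the listing is visible in this hunk; a self-contained
sketch of such a per-class sampling routine, reconstructed around the
visible return statement and therefore not necessarily identical to the
original listing, could look as follows.
\begin{lstlisting}[language=Python]
# Hedged reconstruction of a per-class sampling routine; a and b are
# assumed to be the MNIST images and their labels.
import numpy as np

def get_random_sample(a, b, number_of_samples=10):
    x, y = [], []
    for label in range(10):
        # indices of all data points belonging to the current class
        idx = np.where(b == label)[0]
        chosen = np.random.choice(idx, number_of_samples, replace=False)
        x.extend(a[chosen])
        y.extend(b[chosen])
    return (np.asarray(x).reshape(-1, 28, 28, 1),
            np.asarray(y).reshape(10*number_of_samples,1))
\end{lstlisting}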
\section{Additional Comparisons}
\label{app:comp}
In this section comparisons of cross entropy loss and training
In this section, comparisons of cross entropy loss and training
accuracy for the models trained in Section~\ref{sec:smalldata} are given.
\begin{figure}[h]
\centering

@ -33,7 +33,7 @@
positional data. The filter
$g(i)=\left(\nicefrac{1}{3},\nicefrac{1}{4},\nicefrac{1}{5},\nicefrac{1}{6},\nicefrac{1}{20}\right)_{(i-1)}$
is chosen and applied to the $x$ and $y$ coordinate
data seperately. The convolution of both signals with $g$
data separately. The convolution of both signals with $g$
improves the MSE of the positions from 0.196 to 0.170 and
visibly smoothes the data.
}
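A minimal sketch of this smoothing step, with synthetic coordinates
standing in for the positional data (which is not part of this hunk):
\begin{lstlisting}[language=Python]
# Sketch of the smoothing described above: both coordinate signals are
# convolved separately with the filter g from the caption. The noisy
# trajectory below is synthetic and only serves as a stand-in.
import numpy as np

rng = np.random.default_rng(0)
t = np.linspace(0, 1, 100)
x_coords = np.cos(2*np.pi*t) + rng.normal(scale=0.1, size=t.size)
y_coords = np.sin(2*np.pi*t) + rng.normal(scale=0.1, size=t.size)

g = np.array([1/3, 1/4, 1/5, 1/6, 1/20])  # weights sum to one
x_smooth = np.convolve(x_coords, g, mode="same")
y_smooth = np.convolve(y_coords, g, mode="same")
\end{lstlisting}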

@ -323,7 +323,7 @@
\plimn F^{\lambda, g}(f^n) = F^{\lambda, g}(h) \implies
\plimn F_{+-}^{\lambda,g '}(f_+,f_-) = F_{+-}^{\lambda,g '}(h_+,h_-),
\]
and all functions can be split in two functions with disjoint support
and all functions can be split into two functions with disjoint support,
Lemma~\ref{lem:s7} follows.
\end{Proof}
\input{Appendix_code.tex}

@ -2,14 +2,12 @@
series = {arXiv},
author = {Heiss, Jakob and Teichmann, Josef and Wutte, Hanna},
publisher = {Cornell University},
year = {2019},
language = {en},
copyright = {In Copyright - Non-Commercial Use Permitted},
year = {2019}, copyright = {In Copyright - Non-Commercial Use Permitted},
keywords = {early stopping; implicit regularization; machine learning; neural networks; spline; regression; gradient descent; artificial intelligence},
size = {53 p.},
DOI = {10.3929/ethz-b-000402003},
title = {How Implicit Regularization of Neural Networks Affects the Learned Function Part I},
PAGES = {1911.02903}
PAGES = {1911.02903},
}
@article{Dropout,
@ -20,7 +18,7 @@
volume = 15,
number = 56,
pages = {1929-1958},
url = {http://jmlr.org/papers/v15/srivastava14a.html}
Comment url = {http://jmlr.org/papers/v15/srivastava14a.html}
}
@article{ADADELTA,
@ -29,12 +27,10 @@
journal = {CoRR},
volume = {abs/1212.5701},
year = 2012,
url = {http://arxiv.org/abs/1212.5701},
Comment url = {http://arxiv.org/abs/1212.5701},
archivePrefix = {arXiv},
eprint = {1212.5701},
timestamp = {Mon, 13 Aug 2018 16:45:57 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-1212-5701.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{backprop,
@ -49,26 +45,21 @@ day={01},
volume={323},
number={6088},
pages={533-536},
abstract={We describe a new learning procedure, back-propagation, for networks of neurone-like units. The procedure repeatedly adjusts the weights of the connections in the network so as to minimize a measure of the difference between the actual output vector of the net and the desired output vector. As a result of the weight adjustments, internal `hidden' units which are not part of the input or output come to represent important features of the task domain, and the regularities in the task are captured by the interactions of these units. The ability to create useful new features distinguishes back-propagation from earlier, simpler methods such as the perceptron-convergence procedure1.},
issn={1476-4687},
doi={10.1038/323533a0},
url={https://doi.org/10.1038/323533a0}
Comment url={https://doi.org/10.1038/323533a0}
}
@article{MNIST,
added-at = {2010-06-28T21:16:30.000+0200},
author = {LeCun, Yann and Cortes, Corinna},
biburl = {https://www.bibsonomy.org/bibtex/2935bad99fa1f65e03c25b315aa3c1032/mhwombat},
groups = {public},
howpublished = {http://yann.lecun.com/exdb/mnist/},
interhash = {21b9d0558bd66279df9452562df6e6f3},
intrahash = {935bad99fa1f65e03c25b315aa3c1032},
keywords = {MSc _checked character_recognition mnist network neural},
lastchecked = {2016-01-14 14:24:11},
timestamp = {2016-07-12T19:25:30.000+0200},
title = {{MNIST} handwritten digit database},
url = {http://yann.lecun.com/exdb/mnist/},
username = {mhwombat},
Comment url = {http://yann.lecun.com/exdb/mnist/},
year = 2010
}
@INPROCEEDINGS{resnet,
@ -127,11 +118,10 @@ journal = {NIPS}
journal = {CoRR},
volume = {abs/1406.2572},
year = {2014},
url = {http://arxiv.org/abs/1406.2572},
Comment url = {http://arxiv.org/abs/1406.2572},
archivePrefix = {arXiv},
eprint = {1406.2572},
timestamp = {Mon, 22 Jul 2019 13:15:46 +0200},
biburl = {https://dblp.org/rec/journals/corr/DauphinPGCGB14.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@ -145,12 +135,10 @@ journal = {NIPS}
journal = {CoRR},
volume = {abs/1207.0580},
year = {2012},
url = {http://arxiv.org/abs/1207.0580},
Comment url = {http://arxiv.org/abs/1207.0580},
archivePrefix = {arXiv},
eprint = {1207.0580},
timestamp = {Mon, 13 Aug 2018 16:46:10 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-1207-0580.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{
@ -159,22 +147,20 @@ title={On the Variance of the Adaptive Learning Rate and Beyond},
author={Liyuan Liu and Haoming Jiang and Pengcheng He and Weizhu Chen and Xiaodong Liu and Jianfeng Gao and Jiawei Han},
booktitle={International Conference on Learning Representations},
year={2020},
url={https://openreview.net/forum?id=rkgz2aEKDr}
Comment url={https://openreview.net/forum?id=rkgz2aEKDr}
}
@inproceedings{ADAM,
author = {Diederik P. Kingma and
Jimmy Ba},
editor = {Yoshua Bengio and
Yann LeCun},
@Comment editor = {Yoshua Bengio and
@Comment Yann LeCun},
title = {Adam: {A} Method for Stochastic Optimization},
booktitle = {3rd International Conference on Learning Representations, {ICLR} 2015,
San Diego, CA, USA, May 7-9, 2015, Conference Track Proceedings},
year = {2015},
url = {http://arxiv.org/abs/1412.6980},
Comment url = {http://arxiv.org/abs/1412.6980},
timestamp = {Thu, 25 Jul 2019 14:25:37 +0200},
biburl = {https://dblp.org/rec/journals/corr/KingmaB14.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{transfer_learning,
@ -186,11 +172,11 @@ url={https://openreview.net/forum?id=rkgz2aEKDr}
pages = {020018},
year = {2017},
doi = {10.1063/1.4992835},
URL = {https://aip.scitation.org/doi/abs/10.1063/1.4992835},
eprint = {https://aip.scitation.org/doi/pdf/10.1063/1.4992835}
}
@article{gan,
author = "Maayan Frid-Adar and Idit Diamant and Eyal Klang and Michal Amitai and Jacob Goldberger and Hayit Greenspan",
title = "GAN-based synthetic medical image augmentation for increased CNN performance in liver lesion classification",
journal = "Neurocomputing",
volume = 321,
@ -198,8 +184,7 @@ url={https://openreview.net/forum?id=rkgz2aEKDr}
year = 2018,
issn = "0925-2312",
doi = "https://doi.org/10.1016/j.neucom.2018.09.013",
url = "http://www.sciencedirect.com/science/article/pii/S0925231218310749",
author = "Maayan Frid-Adar and Idit Diamant and Eyal Klang and Michal Amitai and Jacob Goldberger and Hayit Greenspan"
Comment url = "http://www.sciencedirect.com/science/article/pii/S0925231218310749",
}
@online{fashionMNIST,
@ -219,7 +204,7 @@ year = {2018},
isbn = {9781450363549},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3206098.3206111},
Comment url = {https://doi.org/10.1145/3206098.3206111},
doi = {10.1145/3206098.3206111},
booktitle = {Proceedings of the 2nd International Conference on Information System and Data Mining},
pages = {1928},
@ -239,12 +224,10 @@ series = {ICISDM '18}
journal = {CoRR},
volume = {abs/1708.04896},
year = 2017,
url = {http://arxiv.org/abs/1708.04896},
Comment url = {http://arxiv.org/abs/1708.04896},
archivePrefix = {arXiv},
eprint = {1708.04896},
timestamp = {Mon, 13 Aug 2018 16:47:52 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-1708-04896.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@misc{draw_convnet,
@ -252,7 +235,7 @@ series = {ICISDM '18}
howpublished = {\url{https://github.com/gwding/draw_convnet}},
note = {Accessed: 30.08.2020},
author = {Gavin Weiguang Ding},
year = {2018}
year = 2018
}
@book{Haykin,
@ -290,7 +273,6 @@ series = {ICISDM '18}
title = {Generative Adversarial Nets},
author = {Goodfellow, Ian and Pouget-Abadie, Jean and Mirza, Mehdi and Xu, Bing and Warde-Farley, David and Ozair, Sherjil and Courville, Aaron and Bengio, Yoshua},
booktitle = {Advances in Neural Information Processing Systems 27},
editor = {Z. Ghahramani and M. Welling and C. Cortes and N. D. Lawrence and K. Q. Weinberger},
pages = {2672--2680},
year = {2014},
publisher = {Curran Associates, Inc.},

@ -99,7 +99,7 @@ $v$
\end{scope}
\end{tikzpicture}
\end{adjustbox}
\caption[Channel Separation of Color Image]{On the right the red, green, and blue chances of the picture
\caption[Channel Separation of Color Image]{On the right the red, green, and blue channels of the picture
are displayed. In order to better visualize the color channels the
black and white picture of each channel has been colored in the
respective color. Combining the layers results in the image on the
@ -134,7 +134,7 @@ convolution is well defined for all pixels of the image.
Simple examples of image manipulation using
convolution are smoothing operations or
rudimentary detection of edges in grayscale images, meaning they only
rudimentary detection of edges in gray-scale images, meaning they only
have one channel. A filter often used to smooth or blur images
is the Gaussian filter, which for a given $\sigma \in \mathbb{R}_+$ and
size $s \in \mathbb{N}$ is
@ -162,7 +162,7 @@ output is given by
\[
O = \sqrt{(I * G)^2 + (I*G^T)^2}
\]
where $\sqrt{\cdot}$ and $\cdot^2$ are applied componentwise. Examples
where $\sqrt{\cdot}$ and $\cdot^2$ are applied component-wise. Examples
for convolution of an image with both kernels are given
in Figure~\ref{fig:img_conv}.
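As an illustration of this formula, a short sketch using the standard
Sobel kernel (which may differ in sign and orientation convention from
the kernels defined in the text) and scipy for the two-dimensional
convolutions:
\begin{lstlisting}[language=Python]
# Sketch of the edge-detection step; image is a placeholder gray-scale
# array, G is the standard Sobel kernel.
import numpy as np
from scipy.signal import convolve2d

image = np.random.rand(28, 28)

G = np.array([[1, 0, -1],
              [2, 0, -2],
              [1, 0, -1]])

I_x = convolve2d(image, G,   mode="same", boundary="symm")
I_y = convolve2d(image, G.T, mode="same", boundary="symm")

# combine both responses component-wise as in the formula above
O = np.sqrt(I_x**2 + I_y**2)
\end{lstlisting}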
\begin{figure}[H]
@ -208,7 +208,7 @@ in Figure~\ref{fig:img_conv}.
% \caption{test}
% \end{subfigure}
\vspace{-0.1cm}
\caption[Convolution Applied on Image]{Convolution of original greyscale Image (a) with different
\caption[Convolution Applied on Image]{Convolution of the original gray-scale image (a) with different
kernels. In (b) and (c), Gaussian kernels of size 11 and the stated
$\sigma^2$ are used. In (d) to (f), the Sobel operator kernels defined
above are used.}
@ -410,7 +410,7 @@ network.
A class of algorithms that augment the gradient descent
algorithm to lessen this problem are stochastic gradient
descent algorithms.
Here the full dataset is split into smaller disjoint subsets.
Here the full data set is split into smaller disjoint subsets.
Then in each iteration, a (different) subset of data is chosen to
compute the gradient (Algorithm~\ref{alg:sgd}).
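As a brief illustration of this idea (a sketch only, not the pseudocode
of Algorithm~\ref{alg:sgd}), a mini-batch update loop can be organized
as follows, where grad is a hypothetical function returning the
gradient of the loss on a batch.
\begin{lstlisting}[language=Python]
# Minimal sketch of mini-batch stochastic gradient descent.
import numpy as np

def sgd(theta, X, y, grad, lr=0.01, batch_size=32, epochs=10):
    n = X.shape[0]
    for _ in range(epochs):
        perm = np.random.permutation(n)             # shuffle the data set
        for start in range(0, n, batch_size):
            batch = perm[start:start + batch_size]  # disjoint subset
            theta = theta - lr * grad(theta, X[batch], y[batch])
    return theta
\end{lstlisting}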
The training period until each data point has been considered at least
@ -496,7 +496,7 @@ time.
\includegraphics[width=\textwidth]{Figures/Data/convnet_fig.pdf}
\caption[CNN Architecture for MNIST Handwritten
Digits]{Convolutional neural network architecture used to model the
MNIST handwritten digits dataset. This figure was created with
MNIST handwritten digits data set. This figure was created with
the help of the
{\sffamily{draw\textunderscore convnet}} Python script by \textcite{draw_convnet}.}
\label{fig:mnist_architecture}
@ -546,7 +546,7 @@ The most popular three implementations of this are:
\[
\gamma_n = \gamma_0 d^{\left\lfloor \frac{n+1}{r} \right\rfloor}.
\]
\item Exponential deca,y where the learning rate is decreased after each epoch
\item Exponential decay, where the learning rate is decreased after each epoch
\[
\gamma_n = \gamma_0 e^{-n d}.
\]
@ -782,7 +782,7 @@ neural networks.
To get an understanding of the performance of the above
discussed training algorithms, the neural network given in
Figure~\ref{fig:mnist_architecture} has been
trained on the MNIST handwriting dataset with the above described
trained on the MNIST handwriting data set with the above described
algorithms. For all algorithms, a global learning rate of $0.001$ is
chosen. The parameter preventing divisions by zero is set to
$\varepsilon = 10^{-7}$. For \textsc{AdaDelta} and
@ -938,7 +938,7 @@ to following this practice will be referred to as data generation.
\includegraphics[width=\textwidth]{Figures/Data/mnist_gen_shift.pdf}
\caption{random\\positional shift}
\end{subfigure}
\caption[Image Data Generation]{Example for the manipuations used in
\caption[Image Data Generation]{Example for the manipulations used in
later comparisons. Brightness manipulation and mirroring are not
used, as the images are equal in brightness and digits are not
invariant to mirroring.}
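A sketch of how such random augmentations can be configured with the
Keras ImageDataGenerator; the parameter values here are illustrative
placeholders, not the ones used in this thesis.
\begin{lstlisting}[language=Python]
# Sketch of random image augmentation with Keras; the parameter values
# are illustrative placeholders.
from tensorflow.keras.preprocessing.image import ImageDataGenerator

datagen = ImageDataGenerator(
    rotation_range=15,       # random rotation in degrees
    width_shift_range=0.1,   # random horizontal positional shift
    height_shift_range=0.1,  # random vertical positional shift
    zoom_range=0.1,          # random zoom
)
# datagen.flow(x_train, y_train, batch_size=32) then yields batches of
# randomly augmented images during training.
\end{lstlisting}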
@ -985,15 +985,15 @@ the available data can be highly limited.
In these scenarios, the networks are highly prone to overfit the
data. To get an understanding of accuracies achievable and the
impact of the methods aimed at mitigating overfitting discussed above we fit
networks with different measures implemented to datasets of
networks with different measures implemented to data sets of
varying sizes.
For training, we use the MNIST handwriting dataset as well as the fashion
MNIST dataset. The fashion MNIST dataset is a benchmark set build by
For training, we use the MNIST handwriting data set as well as the fashion
MNIST data set. The fashion MNIST data set is a benchmark set built by
\textcite{fashionMNIST} to provide a more challenging set, as state of
the art models are able to achieve accuracies of 99.88\%
(\textcite{10.1145/3206098.3206111}) on the handwriting set.
The dataset contains 70.000 preprocessed and labeled images of clothes from
The data set contains 70,000 preprocessed and labeled images of clothes from
Zalando. An overview is given in Figure~\ref{fig:fashionMNIST}.
\input{Figures/fashion_mnist.tex}
@ -1082,7 +1082,7 @@ Zalando. An overview is given in Figure~\ref{fig:fashionMNIST}.
The models are trained on subsets with a certain number of randomly
chosen data points per class.
The sizes chosen for the comparisons are the full dataset, 100, 10, and 1
The sizes chosen for the comparisons are the full data set, 100, 10, and 1
data points per class.
For the task of classifying the fashion data a slightly altered model
@ -1093,7 +1093,7 @@ by two consecutive convolutional layers with filters of size 3.
\includegraphics[width=\textwidth]{Figures/Data/cnn_fashion_fig.pdf}
\caption[CNN Architecture for Fashion MNIST]{Convolutional neural
network architecture used to model the
fashion MNIST dataset. This figure was created using the
fashion MNIST data set. This figure was created using the
draw\textunderscore convnet Python script by \textcite{draw_convnet}.}
\label{fig:fashion_MNIST}
\end{figure}
@ -1110,14 +1110,14 @@ of the models and the parameters used for data generation are given
in Listing~\ref{lst:handwriting} for the handwriting model and in
Listing~\ref{lst:fashion} for the fashion model.
The models are trained for 125s epochs in order
The models are trained for 125 epochs in order
to have enough random
augmentations of the input images present during training,
for the networks to fully profit from the additional training data generated.
The test accuracies of the models after
training for 125
epochs are given in Table~\ref{table:digitsOF} for the handwritten digits
and in Table~\ref{table:fashionOF} for the fashion datasets. Additionally the
and in Table~\ref{table:fashionOF} for the fashion data sets. Additionally, the
average test accuracies over the course of learning are given in
Figure~\ref{fig:plotOF_digits} for the handwriting application and
Figure~\ref{fig:plotOF_fashion} for the
@ -1225,7 +1225,7 @@ fashion application.
\end{subfigure}
\caption[Mean Test Accuracies for Subsets of MNIST Handwritten
Digits]{Mean test accuracies of the models fitting the sampled MNIST
handwriting datasets over the 125 epochs of training.}
handwriting data sets over the 125 epochs of training.}
\label{fig:plotOF_digits}
\end{figure}
@ -1352,13 +1352,13 @@ class.
In all scenarios, the addition of the measures reduces the
variance of the model.
The model fit to the fashion MNIST data set benefits less of the
The model fit to the fashion MNIST data set benefits less from these
measures.
For the smallest scenario of one sample per class, a substantial
increase in accuracy can be observed for both measures.
Contrary to the digits data set, dropout improves the
model by a similar margin to data generation.
For the larger data sets, the benefits are far smaller. While
For the larger data sets, the benefits are much smaller. While
in the scenario with 100 samples per class a performance increase can
be seen with data generation, in the scenario with 10 samples per
class it performs worse than the baseline model.
@ -1367,7 +1367,7 @@ and 100 sample scenario. In all scenarios data generation seems to
benefit from the addition of dropout.
Additional Figures and Tables for the same comparisons with different
performance metrics are given in Appendix~\ref{app:comp}
performance metrics are given in Appendix~\ref{app:comp}.
There it can be seen that while the measures are able to reduce overfitting
effectively for the handwritten digits data set, the neural networks
trained on the fashion data set overfit despite these measures being
@ -1416,7 +1416,7 @@ data points which might explain the worse performance of data generation.
In this thesis, we have taken a look at neural networks, their
behavior in small scenarios, and their application to image
classification with limited datasets.
classification with limited data sets.
We have explored the relation between ridge penalized neural networks
and slightly altered cubic smoothing splines, giving us an insight
@ -1424,7 +1424,7 @@ about the behavior of the learned function of neural networks.
When comparing optimization algorithms, we have seen that choosing the
right training algorithm can have a
drastic impact on the efficiency of training and quality of a model
drastic impact on the efficiency of training and the quality of a model
obtainable in a reasonable time frame.
The \textsc{Adam} algorithm has performed well in training the
convolutional neural networks.
@ -1438,7 +1438,7 @@ measures combating overfitting, especially if the available training sets are o
a small size. The success of the measures we have examined
seems to be highly dependent on the use case and further research is
being done on the topic of combating overfitting in neural networks.
\textcite{random_erasing} propose randomly erasing parts of the inputs
\textcite{random_erasing} propose randomly erasing parts of the input
images during training and are able to achieve a high accuracy of 96.35\% on the fashion MNIST
data set this way.
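For concreteness, a hedged sketch of the random-erasing idea (a
simplified stand-in, not the exact procedure of \textcite{random_erasing}):
\begin{lstlisting}[language=Python]
# Simplified sketch of random erasing: a randomly placed rectangle of a
# 2D gray-scale image is overwritten with random values.
import numpy as np

def random_erase(image, max_fraction=0.3, rng=np.random.default_rng()):
    h, w = image.shape
    eh = rng.integers(1, max(2, int(h * max_fraction)))
    ew = rng.integers(1, max(2, int(w * max_fraction)))
    top = rng.integers(0, h - eh + 1)
    left = rng.integers(0, w - ew + 1)
    erased = image.copy()
    erased[top:top + eh, left:left + ew] = rng.random((eh, ew))
    return erased
\end{lstlisting}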
While the data generation explored in this thesis is able to rudimentarily

@ -12,7 +12,7 @@ neural networks.
Furthermore, highly optimized and parallelized frameworks for tensor
operations have been developed.
With these frameworks, such as TensorFlow and PyTorch, building neural
networks as become a much more straightforward process.
networks has become a much more straightforward process.
% Furthermore, with the development of highly optimized and
% parallelized implementations of mathematical operations needed for
% neural networks, such as TensorFlow or PyTorch, building neural network
@ -27,12 +27,12 @@ networks as become a much more straightforward process.
In this thesis we want to get an understanding of the behavior of neural %
networks and
how we can use them for problems with a complex relationship between
in and output.
in- and output.
In Section 2 we introduce the mathematical construct of neural
networks and how to fit them to training data.
To gain some insight about the learned function,
we examine a simple class of neural networks that only contain one
we examine a simple class of neural networks that contain only one
hidden layer.
In Section~\ref{sec:shallownn} we prove a relation between such networks and
functions that minimize the distance to training data
@ -54,7 +54,7 @@ gradient descent in Section~4.4.
% data in each iteration rather than using the whole data set to update
% the parameters.
Most statistical models, especially those with large numbers of
trainable parameter can struggle with overfitting the data.
trainable parameters, can struggle with overfitting the data.
In Section 4.5 we examine the impact of two measures designed to combat
overfitting.

@ -117,8 +117,8 @@ The activation function is usually chosen nonlinear (a linear one
would result in the entire network collapsing into a linear model), which
allows it to better model data where the relation of in- and output is
of nonlinear nature.
There are two types of activation functions, saturating and not
saturating ones. Popular examples for the former are sigmoid
There are two types of activation functions, saturating and
non-saturating ones. Popular examples for the former are sigmoid
functions, where most commonly the standard logistic function or the
hyperbolic tangent is used, as they have easy-to-compute derivatives,
which is desirable for
@ -139,7 +139,7 @@ derivatives are close to zero on most of their realm, only assuming
larger values in proximity to zero.
This can hinder the progress of gradient-based methods.
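A tiny numerical sketch of this saturation effect for the standard
logistic function:
\begin{lstlisting}[language=Python]
# The derivative of the logistic function shrinks rapidly away from zero,
# which illustrates the vanishing gradients described above.
import numpy as np

def logistic(x):
    return 1.0 / (1.0 + np.exp(-x))

def logistic_derivative(x):
    s = logistic(x)
    return s * (1.0 - s)

for x in (0.0, 2.0, 5.0, 10.0):
    print(x, logistic_derivative(x))
# approximately 0.25, 0.105, 0.0066, 0.000045
\end{lstlisting}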
The nonsaturating activation functions commonly used are the rectified
The non-saturating activation functions commonly used are the rectified
linear unit (ReLU) or the leaky ReLU. The ReLU is given by
\begin{equation}
r(x) = \max\left\{0, x\right\}.
@ -292,7 +292,7 @@ In Figure~\ref{fig:activation} visualizations of these functions are given.
\clearpage
\subsection{Training Neural Networks}
As neural networks are a parametric model we need to fit the
As neural networks are parametric models we need to fit the
parameters to the input
data to get meaningful predictions from the network. In order
to accomplish this we need to discuss how we interpret the output of the

File diff suppressed because it is too large.

@ -1,6 +1,6 @@
\boolfalse {citerequest}\boolfalse {citetracker}\boolfalse {pagetracker}\boolfalse {backtracker}\relax
\babel@toc {english}{}
\defcounter {refsection}{0}\relax
\contentsline {table}{\numberline {4.1}{\ignorespaces Values of Test Accuracies for Models Trained on Subsets of MNIST Handwritten Digits}}{41}%
\contentsline {table}{\numberline {4.1}{\ignorespaces Values of Test Accuracies for Models Trained on Subsets of MNIST Handwritten Digits}}{41}{table.4.1}%
\defcounter {refsection}{0}\relax
\contentsline {table}{\numberline {4.2}{\ignorespaces Values of Test Accuracies for Models Trained on Subsets of Fashion MNIST}}{41}%
\contentsline {table}{\numberline {4.2}{\ignorespaces Values of Test Accuracies for Models Trained on Subsets of Fashion MNIST}}{41}{table.4.2}%

@ -19,7 +19,7 @@
\BOOKMARK [3][-]{subsubsection.4.5.3}{Comparisons}{subsection.4.5}% 19
\BOOKMARK [3][-]{subsubsection.4.5.4}{Effectiveness for Small Training Sets}{subsection.4.5}% 20
\BOOKMARK [1][-]{section.5}{Summary and Outlook}{}% 21
\BOOKMARK [1][-]{section*.28}{Appendices}{}% 22
\BOOKMARK [1][-]{Appendix.a.A}{Notes on Proofs of Lemmata in Section 3.1}{}% 23
\BOOKMARK [1][-]{Appendix.a.B}{Implementations}{}% 24
\BOOKMARK [1][-]{Appendix.a.C}{Additional Comparisons}{}% 25
\BOOKMARK [1][-]{section*.27}{Appendices}{}% 22
\BOOKMARK [1][-]{Appendix.1.A}{Notes on Proofs of Lemmata in Section 3.1}{}% 23
\BOOKMARK [1][-]{Appendix.1.B}{Implementations}{}% 24
\BOOKMARK [1][-]{Appendix.1.C}{Additional Comparisons}{}% 25

@ -6,6 +6,7 @@
\usepackage[english]{babel}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{hyperref}
\usepackage{textcomp}
%\usepackage{libertine}
\usepackage{amsmath}
@ -178,7 +179,7 @@
keywordstyle = [2]{\color{ipython_cyan}\ttfamily},
}
\usepackage[style=authoryear, backend=bibtex]{biblatex}
\usepackage[authordate, backend=bibtex, firstinits = true]{biblatex-chicago}
\urlstyle{same}
\bibliography{bibliograpy.bib}
\numberwithin{figure}{section}
@ -280,7 +281,7 @@
\input{further_applications_of_nn}
\newpage
\DeclareNameAlias{sortname}{last-first}
\printbibliography
% Appendix A

@ -107,8 +107,8 @@ on MSE will perfectly fit the data.
\proof
W.l.o.g. all values $x_{ij}^{\text{train}} \in [0,1],~\forall i \in
\left\{1,\dots, t\right\}, j \in \left\{1,\dots,d\right\}$. Now we
chose $v^*$ in order to calculate a unique value for all
$x_i^{\text{train}}$:
choose $v^*$ such that the scalar product with $x_i^{\text{train}}$
results in distinct values for all $i \in \left\{1,\dots,t\right\}$:
\[
v^*_{k,j} = v^*_{j} = 10^{j-1}, ~ \forall k \in \left\{1,\dots,n\right\}.
\]
@ -199,15 +199,16 @@ increased.
[x=x, y=y, col sep=comma, only marks,mark options={scale =
0.7}] {Figures/Data/overfit.csv};
\addplot [red, line width=0.8pt] table [x=x_n, y=s_n, col
sep=comma, forget plot] {Figures/Data/overfit.csv};
sep=comma] {Figures/Data/overfit.csv};
\addplot [black, line width=0.8pt] table [x=x_n, y=y_n, col
sep=comma] {Figures/Data/overfit.csv};
\addplot [black, line width=0.8pt, dashed] table [x=x, y=y, col
sep=comma] {Figures/Data/overfit_spline.csv};
\addlegendentry{\footnotesize{data}};
\addlegendentry{\footnotesize{Data}};
\addlegendentry{\footnotesize{Truth}};
\addlegendentry{\footnotesize{$\mathcal{NN}_{\vartheta^*}$}};
\addlegendentry{\footnotesize{spline}};
\addlegendentry{\footnotesize{Spline}};
\end{axis}
\end{tikzpicture}
\caption[Overfitting of Shallow Neural Networks]{For data of the form $y=\sin(\frac{x+\pi}{2 \pi}) +
@ -340,7 +341,7 @@ derivative of the function a cubic smoothing spline.
\begin{Definition}[Cubic Smoothing Spline]
Let $x_i^{\text{train}}, y_i^{\text{train}} \in \mathbb{R}, i \in
\left\{1,\dots,N\right\}$ be trainig data. for a given $\lambda \in
\left\{1,\dots,N\right\}$ be training data. For a given $\lambda \in
\mathbb{R}$ the cubic smoothing spline is given by
\[
f^{*,\lambda} :\in \argmin_{f \in
@ -377,7 +378,7 @@ definition is given in Definition~\ref{def:wrs}.
Wutte (2019, Definition 3.5)]
\label{def:wrs}
Let $x_i^{\text{train}}, y_i^{\text{train}} \in \mathbb{R}, i \in
\left\{1,\dots,N\right\}$ be trainig data. For a given $\lambda \in \mathbb{R}_{>0}$
\left\{1,\dots,N\right\}$ be training data. For a given $\lambda \in \mathbb{R}_{>0}$
and a function $g: \mathbb{R} \to \mathbb{R}_{>0}$ the weighted
cubic smoothing spline $f^{*, \lambda}_g$ is given by
@ -535,9 +536,9 @@ parameters and their densities.
\item The density function $g_{\xi}$ is uniformly continuous on $\supp(g_{\xi})$.
\item $g_{\xi}(0) \neq 0$.
\item $\frac{1}{g_{\xi}}\Big|_{\supp(g_{\xi})}$ is uniformly
continous on $\supp(g_{\xi})$.
continuous on $\supp(g_{\xi})$.
\item The conditional distribution $\mathcal{L}(v_k|\xi_k = x)$
is uniformly continous on $\supp(g_{\xi})$.
is uniformly continuous on $\supp(g_{\xi})$.
\item $\mathbb{E}\left[v_k^2\right] < \infty$.
\end{enumerate}
\end{Assumption}
@ -550,7 +551,7 @@ introduce it and the corresponding induced norm.
define the Sobolev space $W^{k,p}(K)$ as the space containing all
real valued functions $u \in L^p(K)$ such that for every multi-index
$\alpha \in \mathbb{N}^n$ with $\abs{\alpha} \leq
k$ the mixed parial derivatives
k$ the mixed partial derivatives
\[
u^{(\alpha)} = \frac{\partial^{\abs{\alpha}} u}{\partial
x_1^{\alpha_1} \dots \partial x_n^{\alpha_n}}
@ -625,7 +626,7 @@ given in \textcite{heiss2019} and Appendix~\ref{appendix:proofs}.
\begin{Lemma}[Poincar\'e Typed Inequality]
\label{lem:pieq}
Let \(f:\mathbb{R} \to \mathbb{R}\) be differentiable with \(f' :
\mathbb{R} \to \mathbb{R}\) Lesbeque integrable. Then for \(K=[a,b]
\mathbb{R} \to \mathbb{R}\) Lebesgue integrable. Then for \(K=[a,b]
\subset \mathbb{R}\) with \(f(a)=0\) it holds that
\begin{equation*}
\label{eq:pti1}
@ -633,8 +634,8 @@ given in \textcite{heiss2019} and Appendix~\ref{appendix:proofs}.
\norm{f}_{W^{1,\infty}(K)} \leq C_K^{\infty}
\norm{f'}_{L^{\infty}(K)}.
\end{equation*}
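A one-line heuristic for the first inequality (a sketch only; the exact
constant depends on the norm convention, and the full proofs are
referenced above): since \(f(a) = 0\),
\[
  \abs{f(x)} = \abs{\int_a^x f'(t) \, dt} \leq (b-a) \norm{f'}_{L^{\infty}(K)}
  \quad \text{for all } x \in K,
\]
so \(C_K^{\infty} = 1 + (b-a)\) suffices.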
If additionaly \(f'\) is differentiable with \(f'': \mathbb{R} \to
\mathbb{R}\) Lesbeque integrable then
If additionally \(f'\) is differentiable with \(f'': \mathbb{R} \to
\mathbb{R}\) Lebesgue integrable then
\begin{equation*}
\label{eq:pti2}
\exists C_K^2 \in \mathbb{R}_{>0} : \norm{f}_{W^{1,\infty}(K)} \leq
@ -678,7 +679,7 @@ given in \textcite{heiss2019} and Appendix~\ref{appendix:proofs}.
\begin{Lemma}
\label{lem:cnvh}
Let $\mathcal{RN}$ be a shallow Neural network. For \(\varphi :
\mathbb{R}^2 \to \mathbb{R}\) uniformly continous such that
\mathbb{R}^2 \to \mathbb{R}\) uniformly continuous such that
\[
\forall x \in \supp(g_{\xi}) : \mathbb{E}\left[\varphi(\xi, v)
\frac{1}{n g_{\xi}(\xi)} \vert \xi = x \right] < \infty,
@ -829,7 +830,7 @@ given in \textcite{heiss2019} and Appendix~\ref{appendix:proofs}.
y_i^{\text{train}}) \in \mathbb{R}^2$, with $i \in
\left\{1,\dots,N\right\}$, with $w^*$ as
defined in Definition~\ref{def:rpnn} and $\tilde{\lambda}$ as
defined in Theroem~\ref{theo:main1}, it holds
defined in Theorem~\ref{theo:main1}, it holds
\[
\plimn \norm{\mathcal{RN}^{*,\tilde{\lambda}} -
f^{w*, \tilde{\lambda}}}_{W^{1,\infty}(K)} = 0.
@ -842,7 +843,7 @@ given in \textcite{heiss2019} and Appendix~\ref{appendix:proofs}.
For any $\lambda > 0$, $N \in \mathbb{N}$, and training data $(x_i^{\text{train}},
y_i^{\text{train}}) \in \mathbb{R}^2$, with $i \in
\left\{1,\dots,N\right\}$, with $w^*$ and $\tilde{\lambda}$ as
defined in Definition~\ref{def:rpnn} and Theroem~\ref{theo:main1}
defined in Definition~\ref{def:rpnn} and Theorem~\ref{theo:main1}
respectively, it holds
\[
\plimn \abs{F_n^{\tilde{\lambda}}(\mathcal{RN}^{*,\tilde{\lambda}}) -
@ -955,7 +956,7 @@ is stopped early, they are close to adapted weighted cubic smoothing splines.
\newpage
\subsection{Simulations}
\label{sec:rsnn_sim}
In the following the behaviour described in Theorem~\ref{theo:main1}
In the following the behavior described in Theorem~\ref{theo:main1}
is visualized in a simulated example. For this two sets of training
data have been generated.
\begin{itemize}
