final 2.0
This commit is contained in:
parent
2ef7cda1dd
commit
2b10ef56e3
@ -1,13 +1,13 @@
|
||||
\section{Implementations}
|
||||
In this section the implementations models used are given.
|
||||
The randomized shallow neural network used in Section~\ref{sec:conv} are
|
||||
implemented in Scala. No preexisting frameworks were used to ensure
|
||||
In this section the implementations of the models used are given.
|
||||
The randomized shallow neural network used in Section~\ref{sec:conv} is
|
||||
implemented in Scala. No pre-existing frameworks were used to ensure
|
||||
the implementation was according to the definitions used in Theorem~\ref{theo:main1}.
|
||||
|
||||
The neural networks used in Section~\ref{sec:cnn} are implemented in python using
|
||||
The neural networks used in Section~\ref{sec:cnn} are implemented in Python using
|
||||
the Keras framework given in TensorFlow. TensorFlow is a library
|
||||
containing highly efficient GPU implementations of a wide variety
|
||||
tensor operations, such as convolution as well as efficient algorithms
|
||||
containing highly efficient GPU implementations of a wide variety of
|
||||
tensor operations and algorithms
|
||||
for training neural networks.% (computing derivatives, updating parameters).
|
||||
|
||||
\vspace*{-0.5cm}
|
||||
@ -200,13 +200,13 @@ def get_random_sample(a, b, number_of_samples=10):
|
||||
return (np.asarray(x).reshape(-1, 28, 28, 1),
|
||||
np.asarray(y).reshape(10*number_of_samples,1))
|
||||
\end{lstlisting}
|
||||
\caption{Python code used to generate the datasets containing a
|
||||
certain amount of random datapoints per class.}
|
||||
\caption{Python code used to generate the data sets containing a
|
||||
certain amount of random data points per class.}
|
||||
\end{lstfloat}
|
||||
|
||||
\section{Additional Comparisons}
|
||||
\label{app:comp}
|
||||
In this section comparisons of cross entropy loss and training
|
||||
In this section, comparisons of cross entropy loss and training
|
||||
accuracy for the models trained in Section~\ref{sec:smalldata} are given.
|
||||
\begin{figure}[h]
|
||||
\centering
|
||||
|
@ -33,7 +33,7 @@
|
||||
positional data. As filter
|
||||
$g(i)=\left(\nicefrac{1}{3},\nicefrac{1}{4},\nicefrac{1}{5},\nicefrac{1}{6},\nicefrac{1}{20}\right)_{(i-1)}$
|
||||
is chosen and applied to the $x$ and $y$ coordinate
|
||||
data seperately. The convolution of both signals with $g$
|
||||
data separately. The convolution of both signals with $g$
|
||||
improves the MSE of the positions from 0.196 to 0.170 and
|
||||
visibly smooths the data.
|
||||
}
|
||||
|
@ -323,7 +323,7 @@
|
||||
\plimn F^{\lambda, g}(f^n) = F^{\lambda, g}(h) \implies
|
||||
\plimn F_{+-}^{\lambda,g '}(f_+,f_-) = F_{+-}^{\lambda,g '}(h_+,h_-),
|
||||
\]
|
||||
and all functions can be split in two functions with disjoint support
|
||||
and all functions can be split into two functions with disjoint support,
|
||||
Lemma~\ref{lem:s7} follows.
|
||||
\end{Proof}
|
||||
\input{Appendix_code.tex}
|
||||
|
@ -2,14 +2,12 @@
|
||||
series = {arXiv},
|
||||
author = {Heiss, Jakob and Teichmann, Josef and Wutte, Hanna},
|
||||
publisher = {Cornell University},
|
||||
year = {2019},
|
||||
language = {en},
|
||||
copyright = {In Copyright - Non-Commercial Use Permitted},
|
||||
year = {2019}, copyright = {In Copyright - Non-Commercial Use Permitted},
|
||||
keywords = {early stopping; implicit regularization; machine learning; neural networks; spline; regression; gradient descent; artificial intelligence},
|
||||
size = {53 p.},
|
||||
DOI = {10.3929/ethz-b-000402003},
|
||||
title = {How Implicit Regularization of Neural Networks Affects the Learned Function – Part I},
|
||||
PAGES = {1911.02903}
|
||||
PAGES = {1911.02903},
|
||||
}
|
||||
|
||||
@article{Dropout,
|
||||
@ -20,7 +18,7 @@
|
||||
volume = 15,
|
||||
number = 56,
|
||||
pages = {1929-1958},
|
||||
url = {http://jmlr.org/papers/v15/srivastava14a.html}
|
||||
Comment url = {http://jmlr.org/papers/v15/srivastava14a.html}
|
||||
}
|
||||
|
||||
@article{ADADELTA,
|
||||
@ -29,12 +27,10 @@
|
||||
journal = {CoRR},
|
||||
volume = {abs/1212.5701},
|
||||
year = 2012,
|
||||
url = {http://arxiv.org/abs/1212.5701},
|
||||
Comment url = {http://arxiv.org/abs/1212.5701},
|
||||
archivePrefix = {arXiv},
|
||||
eprint = {1212.5701},
|
||||
timestamp = {Mon, 13 Aug 2018 16:45:57 +0200},
|
||||
biburl = {https://dblp.org/rec/journals/corr/abs-1212-5701.bib},
|
||||
bibsource = {dblp computer science bibliography, https://dblp.org}
|
||||
}
|
||||
|
||||
@article{backprop,
|
||||
@ -49,26 +45,21 @@ day={01},
|
||||
volume={323},
|
||||
number={6088},
|
||||
pages={533-536},
|
||||
abstract={We describe a new learning procedure, back-propagation, for networks of neurone-like units. The procedure repeatedly adjusts the weights of the connections in the network so as to minimize a measure of the difference between the actual output vector of the net and the desired output vector. As a result of the weight adjustments, internal `hidden' units which are not part of the input or output come to represent important features of the task domain, and the regularities in the task are captured by the interactions of these units. The ability to create useful new features distinguishes back-propagation from earlier, simpler methods such as the perceptron-convergence procedure1.},
|
||||
issn={1476-4687},
|
||||
doi={10.1038/323533a0},
|
||||
url={https://doi.org/10.1038/323533a0}
|
||||
Comment url={https://doi.org/10.1038/323533a0}
|
||||
}
|
||||
|
||||
@article{MNIST,
|
||||
added-at = {2010-06-28T21:16:30.000+0200},
|
||||
author = {LeCun, Yann and Cortes, Corinna},
|
||||
biburl = {https://www.bibsonomy.org/bibtex/2935bad99fa1f65e03c25b315aa3c1032/mhwombat},
|
||||
groups = {public},
|
||||
howpublished = {http://yann.lecun.com/exdb/mnist/},
|
||||
interhash = {21b9d0558bd66279df9452562df6e6f3},
|
||||
intrahash = {935bad99fa1f65e03c25b315aa3c1032},
|
||||
keywords = {MSc _checked character_recognition mnist network neural},
|
||||
lastchecked = {2016-01-14 14:24:11},
|
||||
timestamp = {2016-07-12T19:25:30.000+0200},
|
||||
title = {{MNIST} handwritten digit database},
|
||||
url = {http://yann.lecun.com/exdb/mnist/},
|
||||
username = {mhwombat},
|
||||
Comment url = {http://yann.lecun.com/exdb/mnist/},
|
||||
year = 2010
|
||||
}
|
||||
@INPROCEEDINGS{resnet,
|
||||
@ -127,11 +118,10 @@ journal = {NIPS}
|
||||
journal = {CoRR},
|
||||
volume = {abs/1406.2572},
|
||||
year = {2014},
|
||||
url = {http://arxiv.org/abs/1406.2572},
|
||||
Comment url = {http://arxiv.org/abs/1406.2572},
|
||||
archivePrefix = {arXiv},
|
||||
eprint = {1406.2572},
|
||||
timestamp = {Mon, 22 Jul 2019 13:15:46 +0200},
|
||||
biburl = {https://dblp.org/rec/journals/corr/DauphinPGCGB14.bib},
|
||||
bibsource = {dblp computer science bibliography, https://dblp.org}
|
||||
}
|
||||
|
||||
@ -145,12 +135,10 @@ journal = {NIPS}
|
||||
journal = {CoRR},
|
||||
volume = {abs/1207.0580},
|
||||
year = {2012},
|
||||
url = {http://arxiv.org/abs/1207.0580},
|
||||
Comment url = {http://arxiv.org/abs/1207.0580},
|
||||
archivePrefix = {arXiv},
|
||||
eprint = {1207.0580},
|
||||
timestamp = {Mon, 13 Aug 2018 16:46:10 +0200},
|
||||
biburl = {https://dblp.org/rec/journals/corr/abs-1207-0580.bib},
|
||||
bibsource = {dblp computer science bibliography, https://dblp.org}
|
||||
}
|
||||
|
||||
@inproceedings{
|
||||
@ -159,22 +147,20 @@ title={On the Variance of the Adaptive Learning Rate and Beyond},
|
||||
author={Liyuan Liu and Haoming Jiang and Pengcheng He and Weizhu Chen and Xiaodong Liu and Jianfeng Gao and Jiawei Han},
|
||||
booktitle={International Conference on Learning Representations},
|
||||
year={2020},
|
||||
url={https://openreview.net/forum?id=rkgz2aEKDr}
|
||||
Comment url={https://openreview.net/forum?id=rkgz2aEKDr}
|
||||
}
|
||||
|
||||
@inproceedings{ADAM,
|
||||
author = {Diederik P. Kingma and
|
||||
Jimmy Ba},
|
||||
editor = {Yoshua Bengio and
|
||||
Yann LeCun},
|
||||
@Comment editor = {Yoshua Bengio and
|
||||
@Comment Yann LeCun},
|
||||
title = {Adam: {A} Method for Stochastic Optimization},
|
||||
booktitle = {3rd International Conference on Learning Representations, {ICLR} 2015,
|
||||
San Diego, CA, USA, May 7-9, 2015, Conference Track Proceedings},
|
||||
year = {2015},
|
||||
url = {http://arxiv.org/abs/1412.6980},
|
||||
Comment url = {http://arxiv.org/abs/1412.6980},
|
||||
timestamp = {Thu, 25 Jul 2019 14:25:37 +0200},
|
||||
biburl = {https://dblp.org/rec/journals/corr/KingmaB14.bib},
|
||||
bibsource = {dblp computer science bibliography, https://dblp.org}
|
||||
}
|
||||
|
||||
@article{transfer_learning,
|
||||
@ -186,11 +172,11 @@ url={https://openreview.net/forum?id=rkgz2aEKDr}
|
||||
pages = {020018},
|
||||
year = {2017},
|
||||
doi = {10.1063/1.4992835},
|
||||
URL = {https://aip.scitation.org/doi/abs/10.1063/1.4992835},
|
||||
eprint = {https://aip.scitation.org/doi/pdf/10.1063/1.4992835}
|
||||
}
|
||||
|
||||
@article{gan,
|
||||
author = "Maayan Frid-Adar and Idit Diamant and Eyal Klang and Michal Amitai and Jacob Goldberger and Hayit Greenspan",
|
||||
title = "GAN-based synthetic medical image augmentation for increased CNN performance in liver lesion classification",
|
||||
journal = "Neurocomputing",
|
||||
volume = 321,
|
||||
@ -198,8 +184,7 @@ url={https://openreview.net/forum?id=rkgz2aEKDr}
|
||||
year = 2018,
|
||||
issn = "0925-2312",
|
||||
doi = "https://doi.org/10.1016/j.neucom.2018.09.013",
|
||||
url = "http://www.sciencedirect.com/science/article/pii/S0925231218310749",
|
||||
author = "Maayan Frid-Adar and Idit Diamant and Eyal Klang and Michal Amitai and Jacob Goldberger and Hayit Greenspan"
|
||||
Comment url = "http://www.sciencedirect.com/science/article/pii/S0925231218310749",
|
||||
}
|
||||
|
||||
@online{fashionMNIST,
|
||||
@ -219,7 +204,7 @@ year = {2018},
|
||||
isbn = {9781450363549},
|
||||
publisher = {Association for Computing Machinery},
|
||||
address = {New York, NY, USA},
|
||||
url = {https://doi.org/10.1145/3206098.3206111},
|
||||
Comment url = {https://doi.org/10.1145/3206098.3206111},
|
||||
doi = {10.1145/3206098.3206111},
|
||||
booktitle = {Proceedings of the 2nd International Conference on Information System and Data Mining},
|
||||
pages = {19–28},
|
||||
@ -239,12 +224,10 @@ series = {ICISDM '18}
|
||||
journal = {CoRR},
|
||||
volume = {abs/1708.04896},
|
||||
year = 2017,
|
||||
url = {http://arxiv.org/abs/1708.04896},
|
||||
Comment url = {http://arxiv.org/abs/1708.04896},
|
||||
archivePrefix = {arXiv},
|
||||
eprint = {1708.04896},
|
||||
timestamp = {Mon, 13 Aug 2018 16:47:52 +0200},
|
||||
biburl = {https://dblp.org/rec/journals/corr/abs-1708-04896.bib},
|
||||
bibsource = {dblp computer science bibliography, https://dblp.org}
|
||||
}
|
||||
|
||||
@misc{draw_convnet,
|
||||
@ -252,7 +235,7 @@ series = {ICISDM '18}
|
||||
howpublished = {\url{https://github.com/gwding/draw_convnet}},
|
||||
note = {Accessed: 30.08.2020},
|
||||
author = {Gavin Weiguang Ding},
|
||||
year = {2018}
|
||||
year = 2018
|
||||
}
|
||||
|
||||
@book{Haykin,
|
||||
@ -290,7 +273,6 @@ series = {ICISDM '18}
|
||||
title = {Generative Adversarial Nets},
|
||||
author = {Goodfellow, Ian and Pouget-Abadie, Jean and Mirza, Mehdi and Xu, Bing and Warde-Farley, David and Ozair, Sherjil and Courville, Aaron and Bengio, Yoshua},
|
||||
booktitle = {Advances in Neural Information Processing Systems 27},
|
||||
editor = {Z. Ghahramani and M. Welling and C. Cortes and N. D. Lawrence and K. Q. Weinberger},
|
||||
pages = {2672--2680},
|
||||
year = {2014},
|
||||
publisher = {Curran Associates, Inc.},
|
||||
|
@ -99,7 +99,7 @@ $v$
|
||||
\end{scope}
|
||||
\end{tikzpicture}
|
||||
\end{adjustbox}
|
||||
\caption[Channel Separation of Color Image]{On the right the red, green, and blue chances of the picture
|
||||
\caption[Channel Separation of Color Image]{On the right the red, green, and blue channels of the picture
|
||||
are displayed. In order to better visualize the color channels the
|
||||
black and white picture of each channel has been colored in the
|
||||
respective color. Combining the layers results in the image on the
|
||||
@ -134,7 +134,7 @@ convolution is well defined for all pixels of the image.
|
||||
|
||||
Simple examples of image manipulation using
|
||||
convolution are smoothing operations or
|
||||
rudimentary detection of edges in grayscale images, meaning they only
|
||||
rudimentary detection of edges in gray-scale images, meaning they only
|
||||
have one channel. A filter often used to smooth or blur images
|
||||
is the Gauss-filter which for a given $\sigma \in \mathbb{R}_+$ and
|
||||
size $s \in \mathbb{N}$ is
|
||||
@ -162,7 +162,7 @@ output is given by
|
||||
\[
|
||||
O = \sqrt{(I * G)^2 + (I*G^T)^2}
|
||||
\]
|
||||
where $\sqrt{\cdot}$ and $\cdot^2$ are applied componentwise. Examples
|
||||
where $\sqrt{\cdot}$ and $\cdot^2$ are applied component-wise. Examples
|
||||
for convolution of an image with both kernels are given
|
||||
in Figure~\ref{fig:img_conv}.
|
||||
\begin{figure}[H]
|
||||
@ -208,7 +208,7 @@ in Figure~\ref{fig:img_conv}.
|
||||
% \caption{test}
|
||||
% \end{subfigure}
|
||||
\vspace{-0.1cm}
|
||||
\caption[Convolution Applied on Image]{Convolution of original greyscale Image (a) with different
|
||||
\caption[Convolution Applied on Image]{Convolution of original gray-scale image (a) with different
|
||||
kernels. In (b) and (c) Gaussian kernels of size 11 and stated
|
||||
$\sigma^2$ are used. In (d) to (f) the above defined Sobel Operator
|
||||
kernels are used.}
|
||||
@ -410,7 +410,7 @@ network.
|
||||
A class of algorithms that augment the gradient descent
|
||||
algorithm to lessen this problem are stochastic gradient
|
||||
descent algorithms.
|
||||
Here the full dataset is split into smaller disjoint subsets.
|
||||
Here the full data set is split into smaller disjoint subsets.
|
||||
Then in each iteration, a (different) subset of data is chosen to
|
||||
compute the gradient (Algorithm~\ref{alg:sgd}).
|
||||
The training period until each data point has been considered at least
|
||||
@ -496,7 +496,7 @@ time.
|
||||
\includegraphics[width=\textwidth]{Figures/Data/convnet_fig.pdf}
|
||||
\caption[CNN Architecture for MNIST Handwritten
|
||||
Digits]{Convolutional neural network architecture used to model the
|
||||
MNIST handwritten digits dataset. This figure was created with
|
||||
MNIST handwritten digits data set. This figure was created with
|
||||
help of the
|
||||
{\sffamily{draw\textunderscore convnet}} Python script by \textcite{draw_convnet}.}
|
||||
\label{fig:mnist_architecture}
|
||||
@ -546,7 +546,7 @@ The most popular three implementations of this are:
|
||||
\[
|
||||
\gamma_n = \gamma_0 d^{\text{floor}{\frac{n+1}{r}}}.
|
||||
\]
|
||||
\item Exponential deca,y where the learning rate is decreased after each epoch
|
||||
\item Exponential decay, where the learning rate is decreased after each epoch
|
||||
\[
|
||||
\gamma_n = \gamma_0 e^{-n d}.
|
||||
\]
|
||||
@ -782,7 +782,7 @@ neural networks.
|
||||
To get an understanding of the performance of the above
|
||||
discussed training algorithms the neural network given in
|
||||
Figure~\ref{fig:mnist_architecture} has been
|
||||
trained on the MNIST handwriting dataset with the above described
|
||||
trained on the MNIST handwriting data set with the above described
|
||||
algorithms. For all algorithms, a global learning rate of $0.001$ is
|
||||
chosen. The parameter preventing divisions by zero is set to
|
||||
$\varepsilon = 10^{-7}$. For \textsc{AdaDelta} and
|
||||
@ -938,7 +938,7 @@ to following this practice will be referred to as data generation.
|
||||
\includegraphics[width=\textwidth]{Figures/Data/mnist_gen_shift.pdf}
|
||||
\caption{random\\positional shift}
|
||||
\end{subfigure}
|
||||
\caption[Image Data Generation]{Example for the manipuations used in
|
||||
\caption[Image Data Generation]{Example for the manipulations used in
|
||||
later comparisons. Brightness manipulation and mirroring are not
|
||||
used, as the images are equal in brightness and digits are not
|
||||
invariant to mirroring.}
|
||||
@ -985,15 +985,15 @@ the available data can be highly limited.
|
||||
In these scenarios, the networks are highly prone to overfit the
|
||||
data. To get an understanding of accuracies achievable and the
|
||||
impact of the methods aimed at mitigating overfitting discussed above we fit
|
||||
networks with different measures implemented to datasets of
|
||||
networks with different measures implemented to data sets of
|
||||
varying sizes.
|
||||
|
||||
For training, we use the MNIST handwriting dataset as well as the fashion
|
||||
MNIST dataset. The fashion MNIST dataset is a benchmark set build by
|
||||
For training, we use the MNIST handwriting data set as well as the fashion
|
||||
MNIST data set. The fashion MNIST data set is a benchmark set built by
|
||||
\textcite{fashionMNIST} to provide a more challenging set, as state of
|
||||
the art models are able to achieve accuracies of 99.88\%
|
||||
\parencite{10.1145/3206098.3206111} on the handwriting set.
|
||||
The dataset contains 70.000 preprocessed and labeled images of clothes from
|
||||
The data set contains 70,000 preprocessed and labeled images of clothes from
|
||||
Zalando. An overview is given in Figure~\ref{fig:fashionMNIST}.
|
||||
|
||||
\input{Figures/fashion_mnist.tex}
|
||||
@ -1082,7 +1082,7 @@ Zalando. An overview is given in Figure~\ref{fig:fashionMNIST}.
|
||||
|
||||
The models are trained on subsets with a certain amount of randomly
|
||||
chosen data points per class.
|
||||
The sizes chosen for the comparisons are the full dataset, 100, 10, and 1
|
||||
The sizes chosen for the comparisons are the full data set, 100, 10, and 1
|
||||
data points per class.
|
||||
|
||||
For the task of classifying the fashion data a slightly altered model
|
||||
@ -1093,7 +1093,7 @@ by two consecutive convolutional layers with filters of size 3.
|
||||
\includegraphics[width=\textwidth]{Figures/Data/cnn_fashion_fig.pdf}
|
||||
\caption[CNN Architecture for Fashion MNIST]{Convolutional neural
|
||||
network architecture used to model the
|
||||
fashion MNIST dataset. This figure was created using the
|
||||
fashion MNIST data set. This figure was created using the
|
||||
draw\textunderscore convnet Python script by \textcite{draw_convnet}.}
|
||||
\label{fig:fashion_MNIST}
|
||||
\end{figure}
|
||||
@ -1110,14 +1110,14 @@ of the models and the parameters used for data generation are given
|
||||
in Listing~\ref{lst:handwriting} for the handwriting model and in
|
||||
Listing~\ref{lst:fashion} for the fashion model.
|
||||
|
||||
The models are trained for 125s epochs in order
|
||||
The models are trained for 125 epochs in order
|
||||
to have enough random
|
||||
augmentations of the input images present during training,
|
||||
for the networks to fully profit from the additional training data generated.
|
||||
The test accuracies of the models after
|
||||
training for 125
|
||||
epochs are given in Table~\ref{table:digitsOF} for the handwritten digits
|
||||
and in Table~\ref{table:fashionOF} for the fashion datasets. Additionally the
|
||||
and in Table~\ref{table:fashionOF} for the fashion data sets. Additionally the
|
||||
average test accuracies over the course of learning are given in
|
||||
Figure~\ref{fig:plotOF_digits} for the handwriting application and
|
||||
Figure~\ref{fig:plotOF_fashion} for the
|
||||
@ -1225,7 +1225,7 @@ fashion application.
|
||||
\end{subfigure}
|
||||
\caption[Mean Test Accuracies for Subsets of MNIST Handwritten
|
||||
Digits]{Mean test accuracies of the models fitting the sampled MNIST
|
||||
handwriting datasets over the 125 epochs of training.}
|
||||
handwriting data sets over the 125 epochs of training.}
|
||||
\label{fig:plotOF_digits}
|
||||
\end{figure}
|
||||
|
||||
@ -1352,13 +1352,13 @@ class.
|
||||
In all scenarios, the addition of the measures reduces the
|
||||
variance of the model.
|
||||
|
||||
The model fit to the fashion MNIST data set benefits less of the
|
||||
The model fit to the fashion MNIST data set benefits less from these
|
||||
measures.
|
||||
For the smallest scenario of one sample per class, a substantial
|
||||
increase in accuracy can be observed for both measures.
|
||||
Contrary to the digits data set, dropout improves the
|
||||
model by a similar margin to data generation.
|
||||
For the larger data sets, the benefits are far smaller. While
|
||||
For the larger data sets, the benefits are much smaller. While
|
||||
in the scenario with 100 samples per class a performance increase can
|
||||
be seen with data generation, in the scenario with 10 samples per
|
||||
class it performs worse than the baseline model.
|
||||
@ -1367,7 +1367,7 @@ and 100 sample scenario. In all scenarios data generation seems to
|
||||
benefit from the addition of dropout.
|
||||
|
||||
Additional Figures and Tables for the same comparisons with different
|
||||
performance metrics are given in Appendix~\ref{app:comp}
|
||||
performance metrics are given in Appendix~\ref{app:comp}.
|
||||
There it can be seen that while the measures are able to reduce overfitting
|
||||
effectively for the handwritten digits data set, the neural networks
|
||||
trained on the fashion data set overfit despite these measures being
|
||||
@ -1416,7 +1416,7 @@ data points which might explain the worse performance of data generation.
|
||||
|
||||
In this thesis, we have taken a look at neural networks, their
|
||||
behavior in small scenarios and their application on image
|
||||
classification with limited datasets.
|
||||
classification with limited data sets.
|
||||
|
||||
We have explored the relation between ridge penalized neural networks
|
||||
and slightly altered cubic smoothing splines, giving us an insight
|
||||
@ -1424,7 +1424,7 @@ about the behavior of the learned function of neural networks.
|
||||
|
||||
When comparing optimization algorithms, we have seen that choosing the
|
||||
right training algorithm can have a
|
||||
drastic impact on the efficiency of training and quality of a model
|
||||
drastic impact on the efficiency of training and quality of a model
|
||||
obtainable in a reasonable time frame.
|
||||
The \textsc{Adam} algorithm has performed well in training the
|
||||
convolutional neural networks.
|
||||
@ -1438,7 +1438,7 @@ measures combating overfitting, especially if the available training sets are o
|
||||
a small size. The success of the measures we have examined
|
||||
seems to be highly dependent on the use case and further research is
|
||||
being done on the topic of combating overfitting in neural networks.
|
||||
\textcite{random_erasing} propose randomly erasing parts of the inputs
|
||||
\textcite{random_erasing} propose randomly erasing parts of the input
|
||||
images during training and are able to achieve a high accuracy of 96.35\% on the fashion MNIST
|
||||
data set this way.
|
||||
While data generation explored in this thesis is able to rudimentary
|
||||
|
@ -12,7 +12,7 @@ neural networks.
|
||||
Furthermore, highly optimized and parallelized frameworks for tensor
|
||||
operations have been developed.
|
||||
With these frameworks, such as TensorFlow and PyTorch, building neural
|
||||
networks as become a much more straightforward process.
|
||||
networks has become a much more straightforward process.
|
||||
% Furthermore, with the development of highly optimized and
|
||||
% parallelized implementations of mathematical operations needed for
|
||||
% neural networks, such as TensorFlow or PyTorch, building neural network
|
||||
@ -27,12 +27,12 @@ networks as become a much more straightforward process.
|
||||
In this thesis we want to get an understanding of the behavior of neural %
|
||||
networks and
|
||||
how we can use them for problems with a complex relationship between
|
||||
in and output.
|
||||
in- and output.
|
||||
In Section 2 we introduce the mathematical construct of neural
|
||||
networks and how to fit them to training data.
|
||||
|
||||
To gain some insight about the learned function,
|
||||
we examine a simple class of neural networks that only contain one
|
||||
we examine a simple class of neural networks that contain only one
|
||||
hidden layer.
|
||||
In Section~\ref{sec:shallownn} we prove a relation between such networks and
|
||||
functions that minimize the distance to training data
|
||||
@ -54,7 +54,7 @@ gradient descent in Section~4.4.
|
||||
% data in each iteration rather than using the whole data set to update
|
||||
% the parameters.
|
||||
Most statistical models especially those with large amounts of
|
||||
trainable parameter can struggle with overfitting the data.
|
||||
trainable parameters can struggle with overfitting the data.
|
||||
In Section 4.5 we examine the impact of two measures designed to combat
|
||||
overfitting.
|
||||
|
||||
|
@ -117,8 +117,8 @@ The activation function is usually chosen nonlinear (a linear one
|
||||
would result in the entire network collapsing into a linear model) which
|
||||
allows it to better model data where the relation of in- and output is
|
||||
of nonlinear nature.
|
||||
There are two types of activation functions, saturating and not
|
||||
saturating ones. Popular examples for the former are sigmoid
|
||||
There are two types of activation functions, saturating and
|
||||
non-saturating ones. Popular examples for the former are sigmoid
|
||||
functions where most commonly the standard logistic function or tangens
|
||||
hyperbolicus are used
|
||||
as they have easy to compute derivatives which is desirable for
|
||||
@ -139,7 +139,7 @@ derivatives are close to zero on most of their realm, only assuming
|
||||
larger values in proximity to zero.
|
||||
This can hinder the progress of gradient-based methods.
|
||||
|
||||
The nonsaturating activation functions commonly used are the rectified
|
||||
The non-saturating activation functions commonly used are the rectified
|
||||
linear unit (ReLU) or the leaky ReLU. The ReLU is given by
|
||||
\begin{equation}
|
||||
r(x) = \max\left\{0, x\right\}.
|
||||
@ -292,7 +292,7 @@ In Figure~\ref{fig:activation} visualizations of these functions are given.
|
||||
\clearpage
|
||||
\subsection{Training Neural Networks}
|
||||
|
||||
As neural networks are a parametric model we need to fit the
|
||||
As neural networks are parametric models we need to fit the
|
||||
parameters to the input
|
||||
data to get meaningful predictions from the network. In order
|
||||
to accomplish this we need to discuss how we interpret the output of the
|
||||
|
3726
TeX/main.bcf
Normal file
3726
TeX/main.bcf
Normal file
File diff suppressed because it is too large
Load Diff
@ -1,6 +1,6 @@
|
||||
\boolfalse {citerequest}\boolfalse {citetracker}\boolfalse {pagetracker}\boolfalse {backtracker}\relax
|
||||
\babel@toc {english}{}
|
||||
\defcounter {refsection}{0}\relax
|
||||
\contentsline {table}{\numberline {4.1}{\ignorespaces Values of Test Accuracies for Models Trained on Subsets of MNIST Handwritten Digits}}{41}%
|
||||
\contentsline {table}{\numberline {4.1}{\ignorespaces Values of Test Accuracies for Models Trained on Subsets of MNIST Handwritten Digits}}{41}{table.4.1}%
|
||||
\defcounter {refsection}{0}\relax
|
||||
\contentsline {table}{\numberline {4.2}{\ignorespaces Values of Test Accuracies for Models Trained on Subsets of Fashion MNIST}}{41}%
|
||||
\contentsline {table}{\numberline {4.2}{\ignorespaces Values of Test Accuracies for Models Trained on Subsets of Fashion MNIST}}{41}{table.4.2}%
|
||||
|
@ -19,7 +19,7 @@
|
||||
\BOOKMARK [3][-]{subsubsection.4.5.3}{Comparisons}{subsection.4.5}% 19
|
||||
\BOOKMARK [3][-]{subsubsection.4.5.4}{Effectiveness for Small Training Sets}{subsection.4.5}% 20
|
||||
\BOOKMARK [1][-]{section.5}{Summary and Outlook}{}% 21
|
||||
\BOOKMARK [1][-]{section*.28}{Appendices}{}% 22
|
||||
\BOOKMARK [1][-]{Appendix.a.A}{Notes on Proofs of Lemmata in Section 3.1}{}% 23
|
||||
\BOOKMARK [1][-]{Appendix.a.B}{Implementations}{}% 24
|
||||
\BOOKMARK [1][-]{Appendix.a.C}{Additional Comparisons}{}% 25
|
||||
\BOOKMARK [1][-]{section*.27}{Appendices}{}% 22
|
||||
\BOOKMARK [1][-]{Appendix.1.A}{Notes on Proofs of Lemmata in Section 3.1}{}% 23
|
||||
\BOOKMARK [1][-]{Appendix.1.B}{Implementations}{}% 24
|
||||
\BOOKMARK [1][-]{Appendix.1.C}{Additional Comparisons}{}% 25
|
||||
|
@ -6,6 +6,7 @@
|
||||
\usepackage[english]{babel}
|
||||
\usepackage[utf8]{inputenc}
|
||||
\usepackage[T1]{fontenc}
|
||||
\usepackage{hyperref}
|
||||
\usepackage{textcomp}
|
||||
%\usepackage{libertine}
|
||||
\usepackage{amsmath}
|
||||
@ -178,7 +179,7 @@
|
||||
keywordstyle = [2]{\color{ipython_cyan}\ttfamily},
|
||||
}
|
||||
|
||||
\usepackage[style=authoryear, backend=bibtex]{biblatex}
|
||||
\usepackage[authordate, backend=bibtex, firstinits = true]{biblatex-chicago}
|
||||
\urlstyle{same}
|
||||
\bibliography{bibliograpy.bib}
|
||||
\numberwithin{figure}{section}
|
||||
@ -280,7 +281,7 @@
|
||||
\input{further_applications_of_nn}
|
||||
|
||||
\newpage
|
||||
|
||||
\DeclareNameAlias{sortname}{last-first}
|
||||
\printbibliography
|
||||
|
||||
% Appendix A
|
||||
|
@ -107,8 +107,8 @@ on MSE will perfectly fit the data.
|
||||
\proof
|
||||
W.l.o.g. all values $x_{ij}^{\text{train}} \in [0,1],~\forall i \in
|
||||
\left\{1,\dots, t\right\}, j \in \left\{1,\dots,d\right\}$. Now we
|
||||
chose $v^*$ in order to calculate a unique value for all
|
||||
$x_i^{\text{train}}$:
|
||||
choose $v^*$ such that the vector-product with $x_i^{\text{train}}$
|
||||
results in distinct values for all $i \in \left\{1,\dots,t\right\}$:
|
||||
\[
|
||||
v^*_{k,j} = v^*_{j} = 10^{j-1}, ~ \forall k \in \left\{1,\dots,n\right\}.
|
||||
\]
|
||||
@ -199,15 +199,16 @@ increased.
|
||||
[x=x, y=y, col sep=comma, only marks,mark options={scale =
|
||||
0.7}] {Figures/Data/overfit.csv};
|
||||
\addplot [red, line width=0.8pt] table [x=x_n, y=s_n, col
|
||||
sep=comma, forget plot] {Figures/Data/overfit.csv};
|
||||
sep=comma] {Figures/Data/overfit.csv};
|
||||
\addplot [black, line width=0.8pt] table [x=x_n, y=y_n, col
|
||||
sep=comma] {Figures/Data/overfit.csv};
|
||||
\addplot [black, line width=0.8pt, dashed] table [x=x, y=y, col
|
||||
sep=comma] {Figures/Data/overfit_spline.csv};
|
||||
|
||||
\addlegendentry{\footnotesize{data}};
|
||||
\addlegendentry{\footnotesize{Data}};
|
||||
\addlegendentry{\footnotesize{Truth}};
|
||||
\addlegendentry{\footnotesize{$\mathcal{NN}_{\vartheta^*}$}};
|
||||
\addlegendentry{\footnotesize{spline}};
|
||||
\addlegendentry{\footnotesize{Spline}};
|
||||
\end{axis}
|
||||
\end{tikzpicture}
|
||||
\caption[Overfitting of Shallow Neural Networks]{For data of the form $y=\sin(\frac{x+\pi}{2 \pi}) +
|
||||
@ -340,7 +341,7 @@ derivative of the function a cubic smoothing spline.
|
||||
|
||||
\begin{Definition}[Cubic Smoothing Spline]
|
||||
Let $x_i^{\text{train}}, y_i^{\text{train}} \in \mathbb{R}, i \in
|
||||
\left\{1,\dots,N\right\}$ be trainig data. for a given $\lambda \in
|
||||
\left\{1,\dots,N\right\}$ be training data. For a given $\lambda \in
|
||||
\mathbb{R}$ the cubic smoothing spline is given by
|
||||
\[
|
||||
f^{*,\lambda} :\in \argmin_{f \in
|
||||
@ -377,7 +378,7 @@ definition is given in Definition~\ref{def:wrs}.
|
||||
Wutte (2019, Definition 3.5)]
|
||||
\label{def:wrs}
|
||||
Let $x_i^{\text{train}}, y_i^{\text{train}} \in \mathbb{R}, i \in
|
||||
\left\{1,\dots,N\right\}$ be trainig data. For a given $\lambda \in \mathbb{R}_{>0}$
|
||||
\left\{1,\dots,N\right\}$ be training data. For a given $\lambda \in \mathbb{R}_{>0}$
|
||||
and a function $g: \mathbb{R} \to \mathbb{R}_{>0}$ the weighted
|
||||
cubic smoothing spline $f^{*, \lambda}_g$ is given by
|
||||
|
||||
@ -535,9 +536,9 @@ parameters and their densities.
|
||||
\item The density function $g_{\xi}$ is uniformly continuous on $\supp(g_{\xi})$.
|
||||
\item $g_{\xi}(0) \neq 0$.
|
||||
\item $\frac{1}{g_{\xi}}\Big|_{\supp(g_{\xi})}$ is uniformly
|
||||
continous on $\supp(g_{\xi})$.
|
||||
continuous on $\supp(g_{\xi})$.
|
||||
\item The conditional distribution $\mathcal{L}(v_k|\xi_k = x)$
|
||||
is uniformly continous on $\supp(g_{\xi})$.
|
||||
is uniformly continuous on $\supp(g_{\xi})$.
|
||||
\item $\mathbb{E}\left[v_k^2\right] < \infty$.
|
||||
\end{enumerate}
|
||||
\end{Assumption}
|
||||
@ -550,7 +551,7 @@ introduce it and the corresponding induced norm.
|
||||
define the Sobolev space $W^{k,p}(K)$ as the space containing all
|
||||
real-valued functions $u \in L^p(K)$ such that for every multi-index
|
||||
$\alpha \in \mathbb{N}^n$ with $\abs{\alpha} \leq
|
||||
k$ the mixed parial derivatives
|
||||
k$ the mixed partial derivatives
|
||||
\[
|
||||
u^{(\alpha)} = \frac{\partial^{\abs{\alpha}} u}{\partial
|
||||
x_1^{\alpha_1} \dots \partial x_n^{\alpha_n}}
|
||||
@ -625,7 +626,7 @@ given in \textcite{heiss2019} and Appendix~\ref{appendix:proofs}.
|
||||
\begin{Lemma}[Poincar\'e-Type Inequality]
|
||||
\label{lem:pieq}
|
||||
Let \(f:\mathbb{R} \to \mathbb{R}\) be differentiable with \(f' :
|
||||
\mathbb{R} \to \mathbb{R}\) Lesbeque integrable. Then for \(K=[a,b]
|
||||
\mathbb{R} \to \mathbb{R}\) Lebesgue integrable. Then for \(K=[a,b]
|
||||
\subset \mathbb{R}\) with \(f(a)=0\) it holds that
|
||||
\begin{equation*}
|
||||
\label{eq:pti1}
|
||||
@ -633,8 +634,8 @@ given in \textcite{heiss2019} and Appendix~\ref{appendix:proofs}.
|
||||
\norm{f}_{W^{1,\infty}(K)} \leq C_K^{\infty}
|
||||
\norm{f'}_{L^{\infty}(K)}.
|
||||
\end{equation*}
|
||||
If additionaly \(f'\) is differentiable with \(f'': \mathbb{R} \to
|
||||
\mathbb{R}\) Lesbeque integrable then
|
||||
If additionally \(f'\) is differentiable with \(f'': \mathbb{R} \to
|
||||
\mathbb{R}\) Lebesgue integrable then
|
||||
\begin{equation*}
|
||||
\label{eq:pti2}
|
||||
\exists C_K^2 \in \mathbb{R}_{>0} : \norm{f}_{W^{1,\infty}(K)} \leq
|
||||
@ -678,7 +679,7 @@ given in \textcite{heiss2019} and Appendix~\ref{appendix:proofs}.
|
||||
\begin{Lemma}
|
||||
\label{lem:cnvh}
|
||||
Let $\mathcal{RN}$ be a shallow neural network. For \(\varphi :
|
||||
\mathbb{R}^2 \to \mathbb{R}\) uniformly continous such that
|
||||
\mathbb{R}^2 \to \mathbb{R}\) uniformly continuous such that
|
||||
\[
|
||||
\forall x \in \supp(g_{\xi}) : \mathbb{E}\left[\varphi(\xi, v)
|
||||
\frac{1}{n g_{\xi}(\xi)} \vert \xi = x \right] < \infty,
|
||||
@ -829,7 +830,7 @@ given in \textcite{heiss2019} and Appendix~\ref{appendix:proofs}.
|
||||
y_i^{\text{train}}) \in \mathbb{R}^2$, with $i \in
|
||||
\left\{1,\dots,N\right\}$, with $w^*$ as
|
||||
defined in Definition~\ref{def:rpnn} and $\tilde{\lambda}$ as
|
||||
defined in Theroem~\ref{theo:main1}, it holds
|
||||
defined in Theorem~\ref{theo:main1}, it holds
|
||||
\[
|
||||
\plimn \norm{\mathcal{RN}^{*,\tilde{\lambda}} -
|
||||
f^{w^*, \tilde{\lambda}}}_{W^{1,\infty}(K)} = 0.
|
||||
@ -842,7 +843,7 @@ given in \textcite{heiss2019} and Appendix~\ref{appendix:proofs}.
|
||||
For any $\lambda > 0$, $N \in \mathbb{N}$, and training data $(x_i^{\text{train}},
|
||||
y_i^{\text{train}}) \in \mathbb{R}^2$, with $i \in
|
||||
\left\{1,\dots,N\right\}$, with $w^*$ and $\tilde{\lambda}$ as
|
||||
defined in Definition~\ref{def:rpnn} and Theroem~\ref{theo:main1}
|
||||
defined in Definition~\ref{def:rpnn} and Theorem~\ref{theo:main1}
|
||||
respectively, it holds
|
||||
\[
|
||||
\plimn \abs{F_n^{\tilde{\lambda}}(\mathcal{RN}^{*,\tilde{\lambda}}) -
|
||||
@ -955,7 +956,7 @@ is stopped early, they are close to adapted weighted cubic smoothing splines.
|
||||
\newpage
|
||||
\subsection{Simulations}
|
||||
\label{sec:rsnn_sim}
|
||||
In the following the behaviour described in Theorem~\ref{theo:main1}
|
||||
In the following, the behavior described in Theorem~\ref{theo:main1}
|
||||
is visualized in a simulated example. For this purpose, two sets of training
|
||||
data have been generated.
|
||||
\begin{itemize}
|
||||
|
Loading…
Reference in New Issue
Block a user