final 2.0

master
Tobias Arndt 4 years ago
parent 2ef7cda1dd
commit 2b10ef56e3

@ -1,13 +1,13 @@
\section{Implementations}
In this section the implementations models used are given.
The randomized shallow neural network used in Section~\ref{sec:conv} are
implemented in Scala. No preexisting frameworks were used to ensure
In this section, the implementations of the models used are given.
The randomized shallow neural network used in Section~\ref{sec:conv} is
implemented in Scala. No pre-existing frameworks were used to ensure
the implementation adhered to the definitions used in Theorem~\ref{theo:main1}.
The neural networks used in Section~\ref{sec:cnn} are implemented in python using
The neural networks used in Section~\ref{sec:cnn} are implemented in Python using
the Keras framework provided by TensorFlow. TensorFlow is a library
containing highly efficient GPU implementations of a wide variety
tensor operations, such as convolution as well as efficient algorithms
containing highly efficient GPU implementations of a wide variety of
tensor operations and algorithms
for training neural networks.% (computing derivatives, updating parameters).
\vspace*{-0.5cm}
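To give an impression of how such a model is assembled with the Keras
interface of TensorFlow, the following is a minimal sketch; the layer
sizes and the architecture are illustrative placeholders, not the
networks defined in Section~\ref{sec:cnn}.
\begin{lstlisting}[language=Python]
# Minimal sketch of a Keras model built on TensorFlow; layer sizes are
# illustrative placeholders, not the architectures used in this thesis.
import tensorflow as tf
from tensorflow.keras import layers

model = tf.keras.Sequential([
    layers.Conv2D(32, kernel_size=5, activation="relu",
                  input_shape=(28, 28, 1)),
    layers.MaxPooling2D(pool_size=2),
    layers.Flatten(),
    layers.Dense(10, activation="softmax"),
])
model.compile(optimizer="adam",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])
\end{lstlisting}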
@ -200,13 +200,13 @@ def get_random_sample(a, b, number_of_samples=10):
return (np.asarray(x).reshape(-1, 28, 28, 1),
np.asarray(y).reshape(10*number_of_samples,1))
\end{lstlisting}
\caption{Python code used to generate the datasets containing a
certain amount of random datapoints per class.}
\caption{Python code used to generate the data sets containing a
certain number of random data points per class.}
\end{lstfloat}
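Only the tail of the listing is visible in this hunk; a self-contained
sketch of such a per-class sampling routine, reconstructed around the
visible return statement and therefore not necessarily identical to the
original listing, could look as follows.
\begin{lstlisting}[language=Python]
# Hedged reconstruction of a per-class sampling routine; a and b are
# assumed to be the MNIST images and their labels.
import numpy as np

def get_random_sample(a, b, number_of_samples=10):
    x, y = [], []
    for label in range(10):
        # indices of all data points belonging to the current class
        idx = np.where(b == label)[0]
        chosen = np.random.choice(idx, number_of_samples, replace=False)
        x.extend(a[chosen])
        y.extend(b[chosen])
    return (np.asarray(x).reshape(-1, 28, 28, 1),
            np.asarray(y).reshape(10*number_of_samples,1))
\end{lstlisting}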
\section{Additional Comparisons}
\label{app:comp}
In this section comparisons of cross entropy loss and training
In this section, comparisons of cross entropy loss and training
accuracy for the models trained in Section~\ref{sec:smalldata} are given.
\begin{figure}[h]
\centering

@ -33,7 +33,7 @@
positional data. The filter
$g(i)=\left(\nicefrac{1}{3},\nicefrac{1}{4},\nicefrac{1}{5},\nicefrac{1}{6},\nicefrac{1}{20}\right)_{(i-1)}$
is chosen and applied to the $x$ and $y$ coordinate
data seperately. The convolution of both signals with $g$
data separately. The convolution of both signals with $g$
improves the MSE of the positions from 0.196 to 0.170 and
visibly smoothes the data.
}
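A minimal sketch of this smoothing step, with synthetic coordinates
standing in for the positional data (which is not part of this hunk):
\begin{lstlisting}[language=Python]
# Sketch of the smoothing described above: both coordinate signals are
# convolved separately with the filter g from the caption. The noisy
# trajectory below is synthetic and only serves as a stand-in.
import numpy as np

rng = np.random.default_rng(0)
t = np.linspace(0, 1, 100)
x_coords = np.cos(2*np.pi*t) + rng.normal(scale=0.1, size=t.size)
y_coords = np.sin(2*np.pi*t) + rng.normal(scale=0.1, size=t.size)

g = np.array([1/3, 1/4, 1/5, 1/6, 1/20])  # weights sum to one
x_smooth = np.convolve(x_coords, g, mode="same")
y_smooth = np.convolve(y_coords, g, mode="same")
\end{lstlisting}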

@ -323,7 +323,7 @@
\plimn F^{\lambda, g}(f^n) = F^{\lambda, g}(h) \implies
\plimn F_{+-}^{\lambda,g '}(f_+,f_-) = F_{+-}^{\lambda,g '}(h_+,h_-),
\]
and all functions can be split in two functions with disjoint support
and all functions can be split into two functions with disjoint support,
Lemma~\ref{lem:s7} follows.
\end{Proof}
\input{Appendix_code.tex}

@ -2,14 +2,12 @@
series = {arXiv},
author = {Heiss, Jakob and Teichmann, Josef and Wutte, Hanna},
publisher = {Cornell University},
year = {2019},
language = {en},
copyright = {In Copyright - Non-Commercial Use Permitted},
year = {2019}, copyright = {In Copyright - Non-Commercial Use Permitted},
keywords = {early stopping; implicit regularization; machine learning; neural networks; spline; regression; gradient descent; artificial intelligence},
size = {53 p.},
DOI = {10.3929/ethz-b-000402003},
title = {How Implicit Regularization of Neural Networks Affects the Learned Function Part I},
PAGES = {1911.02903}
PAGES = {1911.02903},
}
@article{Dropout,
@ -20,7 +18,7 @@
volume = 15,
number = 56,
pages = {1929-1958},
url = {http://jmlr.org/papers/v15/srivastava14a.html}
Comment url = {http://jmlr.org/papers/v15/srivastava14a.html}
}
@article{ADADELTA,
@ -29,12 +27,10 @@
journal = {CoRR},
volume = {abs/1212.5701},
year = 2012,
url = {http://arxiv.org/abs/1212.5701},
Comment url = {http://arxiv.org/abs/1212.5701},
archivePrefix = {arXiv},
eprint = {1212.5701},
timestamp = {Mon, 13 Aug 2018 16:45:57 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-1212-5701.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{backprop,
@ -49,26 +45,21 @@ day={01},
volume={323},
number={6088},
pages={533-536},
abstract={We describe a new learning procedure, back-propagation, for networks of neurone-like units. The procedure repeatedly adjusts the weights of the connections in the network so as to minimize a measure of the difference between the actual output vector of the net and the desired output vector. As a result of the weight adjustments, internal `hidden' units which are not part of the input or output come to represent important features of the task domain, and the regularities in the task are captured by the interactions of these units. The ability to create useful new features distinguishes back-propagation from earlier, simpler methods such as the perceptron-convergence procedure1.},
issn={1476-4687},
doi={10.1038/323533a0},
url={https://doi.org/10.1038/323533a0}
Comment url={https://doi.org/10.1038/323533a0}
}
@article{MNIST,
added-at = {2010-06-28T21:16:30.000+0200},
author = {LeCun, Yann and Cortes, Corinna},
biburl = {https://www.bibsonomy.org/bibtex/2935bad99fa1f65e03c25b315aa3c1032/mhwombat},
groups = {public},
howpublished = {http://yann.lecun.com/exdb/mnist/},
interhash = {21b9d0558bd66279df9452562df6e6f3},
intrahash = {935bad99fa1f65e03c25b315aa3c1032},
keywords = {MSc _checked character_recognition mnist network neural},
lastchecked = {2016-01-14 14:24:11},
timestamp = {2016-07-12T19:25:30.000+0200},
title = {{MNIST} handwritten digit database},
url = {http://yann.lecun.com/exdb/mnist/},
username = {mhwombat},
Comment url = {http://yann.lecun.com/exdb/mnist/},
year = 2010
}
@INPROCEEDINGS{resnet,
@ -127,11 +118,10 @@ journal = {NIPS}
journal = {CoRR},
volume = {abs/1406.2572},
year = {2014},
url = {http://arxiv.org/abs/1406.2572},
Comment url = {http://arxiv.org/abs/1406.2572},
archivePrefix = {arXiv},
eprint = {1406.2572},
timestamp = {Mon, 22 Jul 2019 13:15:46 +0200},
biburl = {https://dblp.org/rec/journals/corr/DauphinPGCGB14.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@ -145,12 +135,10 @@ journal = {NIPS}
journal = {CoRR},
volume = {abs/1207.0580},
year = {2012},
url = {http://arxiv.org/abs/1207.0580},
Comment url = {http://arxiv.org/abs/1207.0580},
archivePrefix = {arXiv},
eprint = {1207.0580},
timestamp = {Mon, 13 Aug 2018 16:46:10 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-1207-0580.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{
@ -159,22 +147,20 @@ title={On the Variance of the Adaptive Learning Rate and Beyond},
author={Liyuan Liu and Haoming Jiang and Pengcheng He and Weizhu Chen and Xiaodong Liu and Jianfeng Gao and Jiawei Han},
booktitle={International Conference on Learning Representations},
year={2020},
url={https://openreview.net/forum?id=rkgz2aEKDr}
Comment url={https://openreview.net/forum?id=rkgz2aEKDr}
}
@inproceedings{ADAM,
author = {Diederik P. Kingma and
Jimmy Ba},
editor = {Yoshua Bengio and
Yann LeCun},
@Comment editor = {Yoshua Bengio and
@Comment Yann LeCun},
title = {Adam: {A} Method for Stochastic Optimization},
booktitle = {3rd International Conference on Learning Representations, {ICLR} 2015,
San Diego, CA, USA, May 7-9, 2015, Conference Track Proceedings},
year = {2015},
url = {http://arxiv.org/abs/1412.6980},
Comment url = {http://arxiv.org/abs/1412.6980},
timestamp = {Thu, 25 Jul 2019 14:25:37 +0200},
biburl = {https://dblp.org/rec/journals/corr/KingmaB14.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{transfer_learning,
@ -186,11 +172,11 @@ url={https://openreview.net/forum?id=rkgz2aEKDr}
pages = {020018},
year = {2017},
doi = {10.1063/1.4992835},
URL = {https://aip.scitation.org/doi/abs/10.1063/1.4992835},
eprint = {https://aip.scitation.org/doi/pdf/10.1063/1.4992835}
}
@article{gan,
author = "Maayan Frid-Adar and Idit Diamant and Eyal Klang and Michal Amitai and Jacob Goldberger and Hayit Greenspan",
title = "GAN-based synthetic medical image augmentation for increased CNN performance in liver lesion classification",
journal = "Neurocomputing",
volume = 321,
@ -198,8 +184,7 @@ url={https://openreview.net/forum?id=rkgz2aEKDr}
year = 2018,
issn = "0925-2312",
doi = "https://doi.org/10.1016/j.neucom.2018.09.013",
url = "http://www.sciencedirect.com/science/article/pii/S0925231218310749",
author = "Maayan Frid-Adar and Idit Diamant and Eyal Klang and Michal Amitai and Jacob Goldberger and Hayit Greenspan"
Comment url = "http://www.sciencedirect.com/science/article/pii/S0925231218310749",
}
@online{fashionMNIST,
@ -219,7 +204,7 @@ year = {2018},
isbn = {9781450363549},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3206098.3206111},
Comment url = {https://doi.org/10.1145/3206098.3206111},
doi = {10.1145/3206098.3206111},
booktitle = {Proceedings of the 2nd International Conference on Information System and Data Mining},
pages = {1928},
@ -239,12 +224,10 @@ series = {ICISDM '18}
journal = {CoRR},
volume = {abs/1708.04896},
year = 2017,
url = {http://arxiv.org/abs/1708.04896},
Comment url = {http://arxiv.org/abs/1708.04896},
archivePrefix = {arXiv},
eprint = {1708.04896},
timestamp = {Mon, 13 Aug 2018 16:47:52 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-1708-04896.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@misc{draw_convnet,
@ -252,7 +235,7 @@ series = {ICISDM '18}
howpublished = {\url{https://github.com/gwding/draw_convnet}},
note = {Accessed: 30.08.2020},
author = {Gavin Weiguang Ding},
year = {2018}
year = 2018
}
@book{Haykin,
@ -290,7 +273,6 @@ series = {ICISDM '18}
title = {Generative Adversarial Nets},
author = {Goodfellow, Ian and Pouget-Abadie, Jean and Mirza, Mehdi and Xu, Bing and Warde-Farley, David and Ozair, Sherjil and Courville, Aaron and Bengio, Yoshua},
booktitle = {Advances in Neural Information Processing Systems 27},
editor = {Z. Ghahramani and M. Welling and C. Cortes and N. D. Lawrence and K. Q. Weinberger},
pages = {2672--2680},
year = {2014},
publisher = {Curran Associates, Inc.},

@ -99,7 +99,7 @@ $v$
\end{scope}
\end{tikzpicture}
\end{adjustbox}
\caption[Channel Separation of Color Image]{On the right the red, green, and blue chances of the picture
\caption[Channel Separation of Color Image]{On the right the red, green, and blue channels of the picture
are displayed. In order to better visualize the color channels the
black and white picture of each channel has been colored in the
respective color. Combining the layers results in the image on the
@ -134,7 +134,7 @@ convolution is well defined for all pixels of the image.
Simple examples of image manipulation using
convolution are smoothing operations or
rudimentary detection of edges in grayscale images, meaning they only
rudimentary detection of edges in gray-scale images, meaning they only
have one channel. A filter often used to smooth or blur images
is the Gaussian filter, which for a given $\sigma \in \mathbb{R}_+$ and
size $s \in \mathbb{N}$ is
@ -162,7 +162,7 @@ output is given by
\[
O = \sqrt{(I * G)^2 + (I*G^T)^2}
\]
where $\sqrt{\cdot}$ and $\cdot^2$ are applied componentwise. Examples
where $\sqrt{\cdot}$ and $\cdot^2$ are applied component-wise. Examples
for convolution of an image with both kernels are given
in Figure~\ref{fig:img_conv}.
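As an illustration of this formula, a short sketch using the standard
Sobel kernel (which may differ in sign and orientation convention from
the kernels defined in the text) and scipy for the two-dimensional
convolutions:
\begin{lstlisting}[language=Python]
# Sketch of the edge-detection step; image is a placeholder gray-scale
# array, G is the standard Sobel kernel.
import numpy as np
from scipy.signal import convolve2d

image = np.random.rand(28, 28)

G = np.array([[1, 0, -1],
              [2, 0, -2],
              [1, 0, -1]])

I_x = convolve2d(image, G,   mode="same", boundary="symm")
I_y = convolve2d(image, G.T, mode="same", boundary="symm")

# combine both responses component-wise as in the formula above
O = np.sqrt(I_x**2 + I_y**2)
\end{lstlisting}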
\begin{figure}[H]
@ -208,7 +208,7 @@ in Figure~\ref{fig:img_conv}.
% \caption{test}
% \end{subfigure}
\vspace{-0.1cm}
\caption[Convolution Applied on Image]{Convolution of original greyscale Image (a) with different
\caption[Convolution Applied on Image]{Convolution of the original gray-scale image (a) with different
kernels. In (b) and (c), Gaussian kernels of size 11 and the stated
$\sigma^2$ are used. In (d) to (f), the Sobel operator kernels defined
above are used.}
@ -410,7 +410,7 @@ network.
A class of algorithms that augment the gradient descent
algorithm to lessen this problem are stochastic gradient
descent algorithms.
Here the full dataset is split into smaller disjoint subsets.
Here the full data set is split into smaller disjoint subsets.
Then in each iteration, a (different) subset of data is chosen to
compute the gradient (Algorithm~\ref{alg:sgd}).
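As a brief illustration of this idea (a sketch only, not the pseudocode
of Algorithm~\ref{alg:sgd}), a mini-batch update loop can be organized
as follows, where grad is a hypothetical function returning the
gradient of the loss on a batch.
\begin{lstlisting}[language=Python]
# Minimal sketch of mini-batch stochastic gradient descent.
import numpy as np

def sgd(theta, X, y, grad, lr=0.01, batch_size=32, epochs=10):
    n = X.shape[0]
    for _ in range(epochs):
        perm = np.random.permutation(n)             # shuffle the data set
        for start in range(0, n, batch_size):
            batch = perm[start:start + batch_size]  # disjoint subset
            theta = theta - lr * grad(theta, X[batch], y[batch])
    return theta
\end{lstlisting}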
The training period until each data point has been considered at least
@ -496,7 +496,7 @@ time.
\includegraphics[width=\textwidth]{Figures/Data/convnet_fig.pdf}
\caption[CNN Architecture for MNIST Handwritten
Digits]{Convolutional neural network architecture used to model the
MNIST handwritten digits dataset. This figure was created with
MNIST handwritten digits data set. This figure was created with
the help of the
{\sffamily{draw\textunderscore convnet}} Python script by \textcite{draw_convnet}.}
\label{fig:mnist_architecture}
@ -546,7 +546,7 @@ The most popular three implementations of this are:
\[
\gamma_n = \gamma_0 d^{\left\lfloor \frac{n+1}{r} \right\rfloor}.
\]
\item Exponential deca,y where the learning rate is decreased after each epoch
\item Exponential decay, where the learning rate is decreased after each epoch
\[
\gamma_n = \gamma_0 e^{-n d}.
\]
@ -782,7 +782,7 @@ neural networks.
To get an understanding of the performance of the above
discussed training algorithms, the neural network given in
Figure~\ref{fig:mnist_architecture} has been
trained on the MNIST handwriting dataset with the above described
trained on the MNIST handwriting data set with the above described
algorithms. For all algorithms, a global learning rate of $0.001$ is
chosen. The parameter preventing divisions by zero is set to
$\varepsilon = 10^{-7}$. For \textsc{AdaDelta} and
@ -938,7 +938,7 @@ to following this practice will be referred to as data generation.
\includegraphics[width=\textwidth]{Figures/Data/mnist_gen_shift.pdf}
\caption{random\\positional shift}
\end{subfigure}
\caption[Image Data Generation]{Example for the manipuations used in
\caption[Image Data Generation]{Example for the manipulations used in
later comparisons. Brightness manipulation and mirroring are not
used, as the images are equal in brightness and digits are not
invariant to mirroring.}
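A sketch of how such random augmentations can be configured with the
Keras ImageDataGenerator; the parameter values here are illustrative
placeholders, not the ones used in this thesis.
\begin{lstlisting}[language=Python]
# Sketch of random image augmentation with Keras; the parameter values
# are illustrative placeholders.
from tensorflow.keras.preprocessing.image import ImageDataGenerator

datagen = ImageDataGenerator(
    rotation_range=15,       # random rotation in degrees
    width_shift_range=0.1,   # random horizontal positional shift
    height_shift_range=0.1,  # random vertical positional shift
    zoom_range=0.1,          # random zoom
)
# datagen.flow(x_train, y_train, batch_size=32) then yields batches of
# randomly augmented images during training.
\end{lstlisting}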
@ -985,15 +985,15 @@ the available data can be highly limited.
In these scenarios, the networks are highly prone to overfit the
data. To get an understanding of accuracies achievable and the
impact of the methods aimed at mitigating overfitting discussed above we fit
networks with different measures implemented to datasets of
networks with different measures implemented to data sets of
varying sizes.
For training, we use the MNIST handwriting dataset as well as the fashion
MNIST dataset. The fashion MNIST dataset is a benchmark set build by
For training, we use the MNIST handwriting data set as well as the fashion
MNIST data set. The fashion MNIST data set is a benchmark set built by
\textcite{fashionMNIST} to provide a more challenging set, as state of
the art models are able to achieve accuracies of 99.88\%
(\textcite{10.1145/3206098.3206111}) on the handwriting set.
The dataset contains 70.000 preprocessed and labeled images of clothes from
The data set contains 70,000 preprocessed and labeled images of clothes from
Zalando. An overview is given in Figure~\ref{fig:fashionMNIST}.
\input{Figures/fashion_mnist.tex}
@ -1082,7 +1082,7 @@ Zalando. An overview is given in Figure~\ref{fig:fashionMNIST}.
The models are trained on subsets with a certain number of randomly
chosen data points per class.
The sizes chosen for the comparisons are the full dataset, 100, 10, and 1
The sizes chosen for the comparisons are the full data set, 100, 10, and 1
data points per class.
For the task of classifying the fashion data a slightly altered model
@ -1093,7 +1093,7 @@ by two consecutive convolutional layers with filters of size 3.
\includegraphics[width=\textwidth]{Figures/Data/cnn_fashion_fig.pdf}
\caption[CNN Architecture for Fashion MNIST]{Convolutional neural
network architecture used to model the
fashion MNIST dataset. This figure was created using the
fashion MNIST data set. This figure was created using the
draw\textunderscore convnet Python script by \textcite{draw_convnet}.}
\label{fig:fashion_MNIST}
\end{figure}
@ -1110,14 +1110,14 @@ of the models and the parameters used for data generation are given
in Listing~\ref{lst:handwriting} for the handwriting model and in
Listing~\ref{lst:fashion} for the fashion model.
The models are trained for 125s epochs in order
The models are trained for 125 epochs in order
to have enough random
augmentations of the input images present during training,
for the networks to fully profit from the additional training data generated.
The test accuracies of the models after
training for 125
epochs are given in Table~\ref{table:digitsOF} for the handwritten digits
and in Table~\ref{table:fashionOF} for the fashion datasets. Additionally the
and in Table~\ref{table:fashionOF} for the fashion data sets. Additionally, the
average test accuracies over the course of learning are given in
Figure~\ref{fig:plotOF_digits} for the handwriting application and
Figure~\ref{fig:plotOF_fashion} for the
@ -1225,7 +1225,7 @@ fashion application.
\end{subfigure}
\caption[Mean Test Accuracies for Subsets of MNIST Handwritten
Digits]{Mean test accuracies of the models fitting the sampled MNIST
handwriting datasets over the 125 epochs of training.}
handwriting data sets over the 125 epochs of training.}
\label{fig:plotOF_digits}
\end{figure}
@ -1352,13 +1352,13 @@ class.
In all scenarios, the addition of the measures reduces the
variance of the model.
The model fit to the fashion MNIST data set benefits less of the
The model fit to the fashion MNIST data set benefits less from these
measures.
For the smallest scenario of one sample per class, a substantial
increase in accuracy can be observed for both measures.
Contrary to the digits data set, dropout improves the
model by a similar margin to data generation.
For the larger data sets, the benefits are far smaller. While
For the larger data sets, the benefits are much smaller. While
in the scenario with 100 samples per class a performance increase can
be seen with data generation, in the scenario with 10 samples per
class it performs worse than the baseline model.
@ -1367,7 +1367,7 @@ and 100 sample scenario. In all scenarios data generation seems to
benefit from the addition of dropout.
Additional Figures and Tables for the same comparisons with different
performance metrics are given in Appendix~\ref{app:comp}
performance metrics are given in Appendix~\ref{app:comp}.
There it can be seen that while the measures are able to reduce overfitting
effectively for the handwritten digits data set, the neural networks
trained on the fashion data set overfit despite these measures being
@ -1416,7 +1416,7 @@ data points which might explain the worse performance of data generation.
In this thesis, we have taken a look at neural networks, their
behavior in small scenarios, and their application to image
classification with limited datasets.
classification with limited data sets.
We have explored the relation between ridge penalized neural networks
and slightly altered cubic smoothing splines, giving us an insight
@ -1424,7 +1424,7 @@ about the behavior of the learned function of neural networks.
When comparing optimization algorithms, we have seen that choosing the
right training algorithm can have a
drastic impact on the efficiency of training and quality of a model
drastic impact on the efficiency of training and the quality of a model
obtainable in a reasonable time frame.
The \textsc{Adam} algorithm has performed well in training the
convolutional neural networks.
@ -1438,7 +1438,7 @@ measures combating overfitting, especially if the available training sets are o
a small size. The success of the measures we have examined
seems to be highly dependent on the use case and further research is
being done on the topic of combating overfitting in neural networks.
\textcite{random_erasing} propose randomly erasing parts of the inputs
\textcite{random_erasing} propose randomly erasing parts of the input
images during training and are able to achieve a high accuracy of 96.35\% on the fashion MNIST
data set this way.
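For concreteness, a hedged sketch of the random-erasing idea (a
simplified stand-in, not the exact procedure of \textcite{random_erasing}):
\begin{lstlisting}[language=Python]
# Simplified sketch of random erasing: a randomly placed rectangle of a
# 2D gray-scale image is overwritten with random values.
import numpy as np

def random_erase(image, max_fraction=0.3, rng=np.random.default_rng()):
    h, w = image.shape
    eh = rng.integers(1, max(2, int(h * max_fraction)))
    ew = rng.integers(1, max(2, int(w * max_fraction)))
    top = rng.integers(0, h - eh + 1)
    left = rng.integers(0, w - ew + 1)
    erased = image.copy()
    erased[top:top + eh, left:left + ew] = rng.random((eh, ew))
    return erased
\end{lstlisting}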
While the data generation explored in this thesis is able to rudimentarily

@ -12,7 +12,7 @@ neural networks.
Furthermore, highly optimized and parallelized frameworks for tensor
operations have been developed.
With these frameworks, such as TensorFlow and PyTorch, building neural
networks as become a much more straightforward process.
networks has become a much more straightforward process.
% Furthermore, with the development of highly optimized and
% parallelized implementations of mathematical operations needed for
% neural networks, such as TensorFlow or PyTorch, building neural network
@ -27,12 +27,12 @@ networks as become a much more straightforward process.
In this thesis we want to get an understanding of the behavior of neural %
networks and
how we can use them for problems with a complex relationship between
in and output.
in- and output.
In Section 2 we introduce the mathematical construct of neural
networks and how to fit them to training data.
To gain some insight about the learned function,
we examine a simple class of neural networks that only contain one
we examine a simple class of neural networks that contain only one
hidden layer.
In Section~\ref{sec:shallownn} we prove a relation between such networks and
functions that minimize the distance to training data
@ -54,7 +54,7 @@ gradient descent in Section~4.4.
% data in each iteration rather than using the whole data set to update
% the parameters.
Most statistical models, especially those with large numbers of
trainable parameter can struggle with overfitting the data.
trainable parameters, can struggle with overfitting the data.
In Section 4.5 we examine the impact of two measures designed to combat
overfitting.

@ -117,8 +117,8 @@ The activation function is usually chosen nonlinear (a linear one
would result in the entire network collapsing into a linear model), which
allows it to better model data where the relation of in- and output is
of nonlinear nature.
There are two types of activation functions, saturating and not
saturating ones. Popular examples for the former are sigmoid
There are two types of activation functions, saturating and
non-saturating ones. Popular examples for the former are sigmoid
functions, where most commonly the standard logistic function or the
hyperbolic tangent is used, as they have easy-to-compute derivatives,
which is desirable for
@ -139,7 +139,7 @@ derivatives are close to zero on most of their realm, only assuming
larger values in proximity to zero.
This can hinder the progress of gradient-based methods.
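A tiny numerical sketch of this saturation effect for the standard
logistic function:
\begin{lstlisting}[language=Python]
# The derivative of the logistic function shrinks rapidly away from zero,
# which illustrates the vanishing gradients described above.
import numpy as np

def logistic(x):
    return 1.0 / (1.0 + np.exp(-x))

def logistic_derivative(x):
    s = logistic(x)
    return s * (1.0 - s)

for x in (0.0, 2.0, 5.0, 10.0):
    print(x, logistic_derivative(x))
# approximately 0.25, 0.105, 0.0066, 0.000045
\end{lstlisting}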
The nonsaturating activation functions commonly used are the rectified
The non-saturating activation functions commonly used are the rectified
linear unit (ReLU) or the leaky ReLU. The ReLU is given by
\begin{equation}
r(x) = \max\left\{0, x\right\}.
@ -292,7 +292,7 @@ In Figure~\ref{fig:activation} visualizations of these functions are given.
\clearpage
\subsection{Training Neural Networks}
As neural networks are a parametric model we need to fit the
As neural networks are parametric models we need to fit the
parameters to the input
data to get meaningful predictions from the network. In order
to accomplish this we need to discuss how we interpret the output of the

File diff suppressed because it is too large.

@ -1,6 +1,6 @@
\boolfalse {citerequest}\boolfalse {citetracker}\boolfalse {pagetracker}\boolfalse {backtracker}\relax
\babel@toc {english}{}
\defcounter {refsection}{0}\relax
\contentsline {table}{\numberline {4.1}{\ignorespaces Values of Test Accuracies for Models Trained on Subsets of MNIST Handwritten Digits}}{41}%
\contentsline {table}{\numberline {4.1}{\ignorespaces Values of Test Accuracies for Models Trained on Subsets of MNIST Handwritten Digits}}{41}{table.4.1}%
\defcounter {refsection}{0}\relax
\contentsline {table}{\numberline {4.2}{\ignorespaces Values of Test Accuracies for Models Trained on Subsets of Fashion MNIST}}{41}%
\contentsline {table}{\numberline {4.2}{\ignorespaces Values of Test Accuracies for Models Trained on Subsets of Fashion MNIST}}{41}{table.4.2}%

@ -19,7 +19,7 @@
\BOOKMARK [3][-]{subsubsection.4.5.3}{Comparisons}{subsection.4.5}% 19
\BOOKMARK [3][-]{subsubsection.4.5.4}{Effectiveness for Small Training Sets}{subsection.4.5}% 20
\BOOKMARK [1][-]{section.5}{Summary and Outlook}{}% 21
\BOOKMARK [1][-]{section*.28}{Appendices}{}% 22
\BOOKMARK [1][-]{Appendix.a.A}{Notes on Proofs of Lemmata in Section 3.1}{}% 23
\BOOKMARK [1][-]{Appendix.a.B}{Implementations}{}% 24
\BOOKMARK [1][-]{Appendix.a.C}{Additional Comparisons}{}% 25
\BOOKMARK [1][-]{section*.27}{Appendices}{}% 22
\BOOKMARK [1][-]{Appendix.1.A}{Notes on Proofs of Lemmata in Section 3.1}{}% 23
\BOOKMARK [1][-]{Appendix.1.B}{Implementations}{}% 24
\BOOKMARK [1][-]{Appendix.1.C}{Additional Comparisons}{}% 25

@ -6,6 +6,7 @@
\usepackage[english]{babel}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{hyperref}
\usepackage{textcomp}
%\usepackage{libertine}
\usepackage{amsmath}
@ -178,7 +179,7 @@
keywordstyle = [2]{\color{ipython_cyan}\ttfamily},
}
\usepackage[style=authoryear, backend=bibtex]{biblatex}
\usepackage[authordate, backend=bibtex, firstinits = true]{biblatex-chicago}
\urlstyle{same}
\bibliography{bibliograpy.bib}
\numberwithin{figure}{section}
@ -280,7 +281,7 @@
\input{further_applications_of_nn}
\newpage
\DeclareNameAlias{sortname}{last-first}
\printbibliography
% Appendix A

@ -107,8 +107,8 @@ on MSE will perfectly fit the data.
\proof
W.l.o.g. all values $x_{ij}^{\text{train}} \in [0,1],~\forall i \in
\left\{1,\dots, t\right\}, j \in \left\{1,\dots,d\right\}$. Now we
chose $v^*$ in order to calculate a unique value for all
$x_i^{\text{train}}$:
choose $v^*$ such that the scalar product with $x_i^{\text{train}}$
results in distinct values for all $i \in \left\{1,\dots,t\right\}$:
\[
v^*_{k,j} = v^*_{j} = 10^{j-1}, ~ \forall k \in \left\{1,\dots,n\right\}.
\]
@ -199,15 +199,16 @@ increased.
[x=x, y=y, col sep=comma, only marks,mark options={scale =
0.7}] {Figures/Data/overfit.csv};
\addplot [red, line width=0.8pt] table [x=x_n, y=s_n, col
sep=comma, forget plot] {Figures/Data/overfit.csv};
sep=comma] {Figures/Data/overfit.csv};
\addplot [black, line width=0.8pt] table [x=x_n, y=y_n, col
sep=comma] {Figures/Data/overfit.csv};
\addplot [black, line width=0.8pt, dashed] table [x=x, y=y, col
sep=comma] {Figures/Data/overfit_spline.csv};
\addlegendentry{\footnotesize{data}};
\addlegendentry{\footnotesize{Data}};
\addlegendentry{\footnotesize{Truth}};
\addlegendentry{\footnotesize{$\mathcal{NN}_{\vartheta^*}$}};
\addlegendentry{\footnotesize{spline}};
\addlegendentry{\footnotesize{Spline}};
\end{axis}
\end{tikzpicture}
\caption[Overfitting of Shallow Neural Networks]{For data of the form $y=\sin(\frac{x+\pi}{2 \pi}) +
@ -340,7 +341,7 @@ derivative of the function a cubic smoothing spline.
\begin{Definition}[Cubic Smoothing Spline]
Let $x_i^{\text{train}}, y_i^{\text{train}} \in \mathbb{R}, i \in
\left\{1,\dots,N\right\}$ be trainig data. for a given $\lambda \in
\left\{1,\dots,N\right\}$ be training data. For a given $\lambda \in
\mathbb{R}$ the cubic smoothing spline is given by
\[
f^{*,\lambda} :\in \argmin_{f \in
@ -377,7 +378,7 @@ definition is given in Definition~\ref{def:wrs}.
Wutte (2019, Definition 3.5)]
\label{def:wrs}
Let $x_i^{\text{train}}, y_i^{\text{train}} \in \mathbb{R}, i \in
\left\{1,\dots,N\right\}$ be trainig data. For a given $\lambda \in \mathbb{R}_{>0}$
\left\{1,\dots,N\right\}$ be training data. For a given $\lambda \in \mathbb{R}_{>0}$
and a function $g: \mathbb{R} \to \mathbb{R}_{>0}$ the weighted
cubic smoothing spline $f^{*, \lambda}_g$ is given by
@ -535,9 +536,9 @@ parameters and their densities.
\item The density function $g_{\xi}$ is uniformly continuous on $\supp(g_{\xi})$.
\item $g_{\xi}(0) \neq 0$.
\item $\frac{1}{g_{\xi}}\Big|_{\supp(g_{\xi})}$ is uniformly
continous on $\supp(g_{\xi})$.
continuous on $\supp(g_{\xi})$.
\item The conditional distribution $\mathcal{L}(v_k|\xi_k = x)$
is uniformly continous on $\supp(g_{\xi})$.
is uniformly continuous on $\supp(g_{\xi})$.
\item $\mathbb{E}\left[v_k^2\right] < \infty$.
\end{enumerate}
\end{Assumption}
@ -550,7 +551,7 @@ introduce it and the corresponding induced norm.
define the Sobolev space $W^{k,p}(K)$ as the space containing all
real valued functions $u \in L^p(K)$ such that for every multi-index
$\alpha \in \mathbb{N}^n$ with $\abs{\alpha} \leq
k$ the mixed parial derivatives
k$ the mixed partial derivatives
\[
u^{(\alpha)} = \frac{\partial^{\abs{\alpha}} u}{\partial
x_1^{\alpha_1} \dots \partial x_n^{\alpha_n}}
@ -625,7 +626,7 @@ given in \textcite{heiss2019} and Appendix~\ref{appendix:proofs}.
\begin{Lemma}[Poincar\'e Typed Inequality]
\label{lem:pieq}
Let \(f:\mathbb{R} \to \mathbb{R}\) be differentiable with \(f' :
\mathbb{R} \to \mathbb{R}\) Lesbeque integrable. Then for \(K=[a,b]
\mathbb{R} \to \mathbb{R}\) Lebesgue integrable. Then for \(K=[a,b]
\subset \mathbb{R}\) with \(f(a)=0\) it holds that
\begin{equation*}
\label{eq:pti1}
@ -633,8 +634,8 @@ given in \textcite{heiss2019} and Appendix~\ref{appendix:proofs}.
\norm{f}_{W^{1,\infty}(K)} \leq C_K^{\infty}
\norm{f'}_{L^{\infty}(K)}.
\end{equation*}
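A one-line heuristic for the first inequality (a sketch only; the exact
constant depends on the norm convention, and the full proofs are
referenced above): since \(f(a) = 0\),
\[
  \abs{f(x)} = \abs{\int_a^x f'(t) \, dt} \leq (b-a) \norm{f'}_{L^{\infty}(K)}
  \quad \text{for all } x \in K,
\]
so \(C_K^{\infty} = 1 + (b-a)\) suffices.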
If additionaly \(f'\) is differentiable with \(f'': \mathbb{R} \to
\mathbb{R}\) Lesbeque integrable then
If additionally \(f'\) is differentiable with \(f'': \mathbb{R} \to
\mathbb{R}\) Lebesgue integrable then
\begin{equation*}
\label{eq:pti2}
\exists C_K^2 \in \mathbb{R}_{>0} : \norm{f}_{W^{1,\infty}(K)} \leq
@ -678,7 +679,7 @@ given in \textcite{heiss2019} and Appendix~\ref{appendix:proofs}.
\begin{Lemma}
\label{lem:cnvh}
Let $\mathcal{RN}$ be a shallow Neural network. For \(\varphi :
\mathbb{R}^2 \to \mathbb{R}\) uniformly continous such that
\mathbb{R}^2 \to \mathbb{R}\) uniformly continuous such that
\[
\forall x \in \supp(g_{\xi}) : \mathbb{E}\left[\varphi(\xi, v)
\frac{1}{n g_{\xi}(\xi)} \vert \xi = x \right] < \infty,
@ -829,7 +830,7 @@ given in \textcite{heiss2019} and Appendix~\ref{appendix:proofs}.
y_i^{\text{train}}) \in \mathbb{R}^2$, with $i \in
\left\{1,\dots,N\right\}$, with $w^*$ as
defined in Definition~\ref{def:rpnn} and $\tilde{\lambda}$ as
defined in Theroem~\ref{theo:main1}, it holds
defined in Theorem~\ref{theo:main1}, it holds
\[
\plimn \norm{\mathcal{RN}^{*,\tilde{\lambda}} -
f^{w*, \tilde{\lambda}}}_{W^{1,\infty}(K)} = 0.
@ -842,7 +843,7 @@ given in \textcite{heiss2019} and Appendix~\ref{appendix:proofs}.
For any $\lambda > 0$, $N \in \mathbb{N}$, and training data $(x_i^{\text{train}},
y_i^{\text{train}}) \in \mathbb{R}^2$, with $i \in
\left\{1,\dots,N\right\}$, with $w^*$ and $\tilde{\lambda}$ as
defined in Definition~\ref{def:rpnn} and Theroem~\ref{theo:main1}
defined in Definition~\ref{def:rpnn} and Theorem~\ref{theo:main1}
respectively, it holds
\[
\plimn \abs{F_n^{\tilde{\lambda}}(\mathcal{RN}^{*,\tilde{\lambda}}) -
@ -955,7 +956,7 @@ is stopped early, they are close to adapted weighted cubic smoothing splines.
\newpage
\subsection{Simulations}
\label{sec:rsnn_sim}
In the following the behaviour described in Theorem~\ref{theo:main1}
In the following the behavior described in Theorem~\ref{theo:main1}
is visualized in a simulated example. For this two sets of training
data have been generated.
\begin{itemize}
