diff --git a/.gitignore b/.gitignore index cc9be34..ebbe901 100644 --- a/.gitignore +++ b/.gitignore @@ -25,6 +25,7 @@ main-blx.bib *.png *.jpg *.xcf +*.gif # no slurm logs *slurm*.out diff --git a/TeX/Figures/SGD_vs_GD.tex b/TeX/Figures/SGD_vs_GD.tex index 8edf634..10318ea 100644 --- a/TeX/Figures/SGD_vs_GD.tex +++ b/TeX/Figures/SGD_vs_GD.tex @@ -81,7 +81,6 @@ plot coordinates { \\\cline{1-4}\cline{6-9} GD$_{0.01}$&GD$_{0.05}$&GD$_{0.1}$&SGD$_{0.01}$&&GD$_{0.01}$&GD$_{0.05}$&GD$_{0.1}$&SGD$_{0.01}$ \\\cline{1-4}\cline{6-9} - \multicolumn{9}{c}{test}\\ 0.265&0.633&0.203&0.989&&2.267&1.947&3.91&0.032 \end{tabu} \caption{Performance metrics of the networks trained in diff --git a/TeX/Figures/sdg_comparison.tex b/TeX/Figures/sdg_comparison.tex index 8cf7c62..cee21c7 100644 --- a/TeX/Figures/sdg_comparison.tex +++ b/TeX/Figures/sdg_comparison.tex @@ -62,7 +62,7 @@ plot coordinates { \multicolumn{3}{c}{Classification Accuracy} &~&\multicolumn{3}{c}{Error Measure} \\\cline{1-3}\cline{5-7} - ADAGRAD&ADADELTA&ADAM&&ADAGRAD&ADADELTA&ADAM + \textsc{AdaGad}&\textsc{AdaDelta}&\textsc{Adam}&&\textsc{AdaGrad}&\textsc{AdaDelta}&\textsc{Adam} \\\cline{1-3}\cline{5-7} 1&1&1&&1&1&1 \end{tabu} diff --git a/TeX/Figures/test.tex b/TeX/Figures/test.tex index e06ea10..bcc7a2a 100644 --- a/TeX/Figures/test.tex +++ b/TeX/Figures/test.tex @@ -16,288 +16,301 @@ \usepackage{listings} \usepackage{float} -\newfloat{lstfloat}{htbp}{lop} -\floatname{lstfloat}{Listing} -\def\lstfloatautorefname{Listing} - -\lstset{ - breaklines=true, - % - extendedchars=true, - literate= - {á}{{\'a}}1 {é}{{\'e}}1 {í}{{\'i}}1 {ó}{{\'o}}1 {ú}{{\'u}}1 - {Á}{{\'A}}1 {É}{{\'E}}1 {Í}{{\'I}}1 {Ó}{{\'O}}1 {Ú}{{\'U}}1 - {à}{{\`a}}1 {è}{{\`e}}1 {ì}{{\`i}}1 {ò}{{\`o}}1 {ù}{{\`u}}1 - {À}{{\`A}}1 {È}{{\'E}}1 {Ì}{{\`I}}1 {Ò}{{\`O}}1 {Ù}{{\`U}}1 - {ä}{{\"a}}1 {ë}{{\"e}}1 {ï}{{\"i}}1 {ö}{{\"o}}1 {ü}{{\"u}}1 - {Ä}{{\"A}}1 {Ë}{{\"E}}1 {Ï}{{\"I}}1 {Ö}{{\"O}}1 {Ü}{{\"U}}1 - {â}{{\^a}}1 {ê}{{\^e}}1 {î}{{\^i}}1 {ô}{{\^o}}1 {û}{{\^u}}1 - {Â}{{\^A}}1 {Ê}{{\^E}}1 {Î}{{\^I}}1 {Ô}{{\^O}}1 {Û}{{\^U}}1 - {œ}{{\oe}}1 {Œ}{{\OE}}1 {æ}{{\ae}}1 {Æ}{{\AE}}1 {ß}{{\ss}}1 - {ç}{{\c c}}1 {Ç}{{\c C}}1 {ø}{{\o}}1 {å}{{\r a}}1 {Å}{{\r A}}1 - {€}{{\EUR}}1 {£}{{\pounds}}1 -} - -%% -%% Python definition (c) 1998 Michael Weber -%% Additional definitions (2013) Alexis Dimitriadis -%% modified by me (should not have empty lines) -%% -\lstdefinelanguage{iPython}{ - morekeywords={access,and,break,class,continue,def,del,elif,else,except,exec,finally,for,from,global,if,import, - in,is,lambda,not,or,pass,print,raise,return,try,while},% - % - % Built-ins - morekeywords=[2]{abs,all,any,basestring,bin,bool,bytearray,callable,chr,classmethod,cmp,compile,complex,delattr,dict,dir,divmod,enumerate,eval,execfile,file,filter,float,format,frozenset,getattr,globals,hasattr,hash,help,hex,id,input,int,isinstance,issubclass,iter,len,list,locals,long,map,max,memoryview,min,next,object,oct,open,ord,pow,property,range,raw_input,reduce,reload,repr,reversed,round,set,setattr,slice,sorted,staticmethod,str,sum,super,tuple,type,unichr,unicode,vars,xrange,zip,apply,buffer,coerce,intern,val},% - % - sensitive=true,% - morecomment=[l]\#,% - morestring=[b]',% - morestring=[b]",% - % - morestring=[s]{'''}{'''},% used for documentation text (mulitiline strings) - morestring=[s]{"""}{"""},% added by Philipp Matthias Hahn - % - morestring=[s]{r'}{'},% `raw' strings - morestring=[s]{r"}{"},% - morestring=[s]{r'''}{'''},% - morestring=[s]{r"""}{"""},% - morestring=[s]{u'}{'},% unicode strings - morestring=[s]{u"}{"},% - 
morestring=[s]{u'''}{'''},% - morestring=[s]{u"""}{"""},% - % - % {replace}{replacement}{lenght of replace} - % *{-}{-}{1} will not replace in comments and so on - literate= - {á}{{\'a}}1 {é}{{\'e}}1 {í}{{\'i}}1 {ó}{{\'o}}1 {ú}{{\'u}}1 - {Á}{{\'A}}1 {É}{{\'E}}1 {Í}{{\'I}}1 {Ó}{{\'O}}1 {Ú}{{\'U}}1 - {à}{{\`a}}1 {è}{{\`e}}1 {ì}{{\`i}}1 {ò}{{\`o}}1 {ù}{{\`u}}1 - {À}{{\`A}}1 {È}{{\'E}}1 {Ì}{{\`I}}1 {Ò}{{\`O}}1 {Ù}{{\`U}}1 - {ä}{{\"a}}1 {ë}{{\"e}}1 {ï}{{\"i}}1 {ö}{{\"o}}1 {ü}{{\"u}}1 - {Ä}{{\"A}}1 {Ë}{{\"E}}1 {Ï}{{\"I}}1 {Ö}{{\"O}}1 {Ü}{{\"U}}1 - {â}{{\^a}}1 {ê}{{\^e}}1 {î}{{\^i}}1 {ô}{{\^o}}1 {û}{{\^u}}1 - {Â}{{\^A}}1 {Ê}{{\^E}}1 {Î}{{\^I}}1 {Ô}{{\^O}}1 {Û}{{\^U}}1 - {œ}{{\oe}}1 {Œ}{{\OE}}1 {æ}{{\ae}}1 {Æ}{{\AE}}1 {ß}{{\ss}}1 - {ç}{{\c c}}1 {Ç}{{\c C}}1 {ø}{{\o}}1 {å}{{\r a}}1 {Å}{{\r A}}1 - {€}{{\EUR}}1 {£}{{\pounds}}1 - % - {^}{{{\color{ipython_purple}\^{}}}}1 - {=}{{{\color{ipython_purple}=}}}1 - % - {+}{{{\color{ipython_purple}+}}}1 - {*}{{{\color{ipython_purple}$^\ast$}}}1 - {/}{{{\color{ipython_purple}/}}}1 - % - {+=}{{{+=}}}1 - {-=}{{{-=}}}1 - {*=}{{{$^\ast$=}}}1 - {/=}{{{/=}}}1, - literate= - *{-}{{{\color{ipython_purple}-}}}1 - {?}{{{\color{ipython_purple}?}}}1, - % - identifierstyle=\color{black}\ttfamily, - commentstyle=\color{ipython_red}\ttfamily, - stringstyle=\color{ipython_red}\ttfamily, - keepspaces=true, - showspaces=false, - showstringspaces=false, - % - rulecolor=\color{ipython_frame}, - frame=single, - frameround={t}{t}{t}{t}, - framexleftmargin=6mm, - numbers=left, - numberstyle=\tiny\color{halfgray}, - % - % - backgroundcolor=\color{ipython_bg}, - % extendedchars=true, - basicstyle=\scriptsize, - keywordstyle=\color{ipython_green}\ttfamily, - morekeywords = [3]{Int, Double}, - morekeywords = [2]{foldRight, case}, - keywordstyle = [3]{\color{ipython_purple}\ttfamily}, - keywordstyle = [2]{\color{ipython_cyan}\ttfamily}, -} +\usepackage{amsmath,amssymb} +% \newfloat{lstfloat}{htbp}{lop} +% \floatname{lstfloat}{Listing} +% \def\lstfloatautorefname{Listing} + +% \lstset{ +% breaklines=true, +% % +% extendedchars=true, +% literate= +% {á}{{\'a}}1 {é}{{\'e}}1 {í}{{\'i}}1 {ó}{{\'o}}1 {ú}{{\'u}}1 +% {Á}{{\'A}}1 {É}{{\'E}}1 {Í}{{\'I}}1 {Ó}{{\'O}}1 {Ú}{{\'U}}1 +% {à}{{\`a}}1 {è}{{\`e}}1 {ì}{{\`i}}1 {ò}{{\`o}}1 {ù}{{\`u}}1 +% {À}{{\`A}}1 {È}{{\'E}}1 {Ì}{{\`I}}1 {Ò}{{\`O}}1 {Ù}{{\`U}}1 +% {ä}{{\"a}}1 {ë}{{\"e}}1 {ï}{{\"i}}1 {ö}{{\"o}}1 {ü}{{\"u}}1 +% {Ä}{{\"A}}1 {Ë}{{\"E}}1 {Ï}{{\"I}}1 {Ö}{{\"O}}1 {Ü}{{\"U}}1 +% {â}{{\^a}}1 {ê}{{\^e}}1 {î}{{\^i}}1 {ô}{{\^o}}1 {û}{{\^u}}1 +% {Â}{{\^A}}1 {Ê}{{\^E}}1 {Î}{{\^I}}1 {Ô}{{\^O}}1 {Û}{{\^U}}1 +% {œ}{{\oe}}1 {Œ}{{\OE}}1 {æ}{{\ae}}1 {Æ}{{\AE}}1 {ß}{{\ss}}1 +% {ç}{{\c c}}1 {Ç}{{\c C}}1 {ø}{{\o}}1 {å}{{\r a}}1 {Å}{{\r A}}1 +% {€}{{\EUR}}1 {£}{{\pounds}}1 +% } + +% %% +% %% Python definition (c) 1998 Michael Weber +% %% Additional definitions (2013) Alexis Dimitriadis +% %% modified by me (should not have empty lines) +% %% +% \lstdefinelanguage{iPython}{ +% morekeywords={access,and,break,class,continue,def,del,elif,else,except,exec,finally,for,from,global,if,import, +% in,is,lambda,not,or,pass,print,raise,return,try,while},% +% % +% % Built-ins +% 
morekeywords=[2]{abs,all,any,basestring,bin,bool,bytearray,callable,chr,classmethod,cmp,compile,complex,delattr,dict,dir,divmod,enumerate,eval,execfile,file,filter,float,format,frozenset,getattr,globals,hasattr,hash,help,hex,id,input,int,isinstance,issubclass,iter,len,list,locals,long,map,max,memoryview,min,next,object,oct,open,ord,pow,property,range,raw_input,reduce,reload,repr,reversed,round,set,setattr,slice,sorted,staticmethod,str,sum,super,tuple,type,unichr,unicode,vars,xrange,zip,apply,buffer,coerce,intern,val},% +% % +% sensitive=true,% +% morecomment=[l]\#,% +% morestring=[b]',% +% morestring=[b]",% +% % +% morestring=[s]{'''}{'''},% used for documentation text (mulitiline strings) +% morestring=[s]{"""}{"""},% added by Philipp Matthias Hahn +% % +% morestring=[s]{r'}{'},% `raw' strings +% morestring=[s]{r"}{"},% +% morestring=[s]{r'''}{'''},% +% morestring=[s]{r"""}{"""},% +% morestring=[s]{u'}{'},% unicode strings +% morestring=[s]{u"}{"},% +% morestring=[s]{u'''}{'''},% +% morestring=[s]{u"""}{"""},% +% % +% % {replace}{replacement}{lenght of replace} +% % *{-}{-}{1} will not replace in comments and so on +% literate= +% {á}{{\'a}}1 {é}{{\'e}}1 {í}{{\'i}}1 {ó}{{\'o}}1 {ú}{{\'u}}1 +% {Á}{{\'A}}1 {É}{{\'E}}1 {Í}{{\'I}}1 {Ó}{{\'O}}1 {Ú}{{\'U}}1 +% {à}{{\`a}}1 {è}{{\`e}}1 {ì}{{\`i}}1 {ò}{{\`o}}1 {ù}{{\`u}}1 +% {À}{{\`A}}1 {È}{{\'E}}1 {Ì}{{\`I}}1 {Ò}{{\`O}}1 {Ù}{{\`U}}1 +% {ä}{{\"a}}1 {ë}{{\"e}}1 {ï}{{\"i}}1 {ö}{{\"o}}1 {ü}{{\"u}}1 +% {Ä}{{\"A}}1 {Ë}{{\"E}}1 {Ï}{{\"I}}1 {Ö}{{\"O}}1 {Ü}{{\"U}}1 +% {â}{{\^a}}1 {ê}{{\^e}}1 {î}{{\^i}}1 {ô}{{\^o}}1 {û}{{\^u}}1 +% {Â}{{\^A}}1 {Ê}{{\^E}}1 {Î}{{\^I}}1 {Ô}{{\^O}}1 {Û}{{\^U}}1 +% {œ}{{\oe}}1 {Œ}{{\OE}}1 {æ}{{\ae}}1 {Æ}{{\AE}}1 {ß}{{\ss}}1 +% {ç}{{\c c}}1 {Ç}{{\c C}}1 {ø}{{\o}}1 {å}{{\r a}}1 {Å}{{\r A}}1 +% {€}{{\EUR}}1 {£}{{\pounds}}1 +% % +% {^}{{{\color{ipython_purple}\^{}}}}1 +% {=}{{{\color{ipython_purple}=}}}1 +% % +% {+}{{{\color{ipython_purple}+}}}1 +% {*}{{{\color{ipython_purple}$^\ast$}}}1 +% {/}{{{\color{ipython_purple}/}}}1 +% % +% {+=}{{{+=}}}1 +% {-=}{{{-=}}}1 +% {*=}{{{$^\ast$=}}}1 +% {/=}{{{/=}}}1, +% literate= +% *{-}{{{\color{ipython_purple}-}}}1 +% {?}{{{\color{ipython_purple}?}}}1, +% % +% identifierstyle=\color{black}\ttfamily, +% commentstyle=\color{ipython_red}\ttfamily, +% stringstyle=\color{ipython_red}\ttfamily, +% keepspaces=true, +% showspaces=false, +% showstringspaces=false, +% % +% rulecolor=\color{ipython_frame}, +% frame=single, +% frameround={t}{t}{t}{t}, +% framexleftmargin=6mm, +% numbers=left, +% numberstyle=\tiny\color{halfgray}, +% % +% % +% backgroundcolor=\color{ipython_bg}, +% % extendedchars=true, +% basicstyle=\scriptsize, +% keywordstyle=\color{ipython_green}\ttfamily, +% morekeywords = [3]{Int, Double}, +% morekeywords = [2]{foldRight, case}, +% keywordstyle = [3]{\color{ipython_purple}\ttfamily}, +% keywordstyle = [2]{\color{ipython_cyan}\ttfamily}, +% } -\begin{document} -\begin{lstfloat} -\begin{lstlisting}[language=iPython] -import breeze.stats.distributions.Uniform -import breeze.stats.distributions.Gaussian -import scala.language.postfixOps +% \begin{document} -object Activation { - def apply(x: Double): Double = math.max(0, x) +% \begin{lstfloat} +% \begin{lstlisting}[language=iPython] +% import breeze.stats.distributions.Uniform +% import breeze.stats.distributions.Gaussian +% import scala.language.postfixOps - def d(x: Double): Double = if (x > 0) 1 else 0 -} +% object Activation { +% def apply(x: Double): Double = math.max(0, x) -class RSNN(val n: Int, val gamma: Double = 0.001) { - val 
g_unif = Uniform(-10, 10) - val g_gauss = Gaussian(0, 5) +% def d(x: Double): Double = if (x > 0) 1 else 0 +% } - val xis = g_unif.sample(n) - val vs = g_gauss.sample(n) - val bs = xis zip vs map {case(xi, v) => xi * v} +% class RSNN(val n: Int, val gamma: Double = 0.001) { +% val g_unif = Uniform(-10, 10) +% val g_gauss = Gaussian(0, 5) - def computeL1(x: Double) = (bs zip vs) map { - case (b, v) => Activation(b + v * x) } +% val xis = g_unif.sample(n) +% val vs = g_gauss.sample(n) +% val bs = xis zip vs map {case(xi, v) => xi * v} + +% def computeL1(x: Double) = (bs zip vs) map { +% case (b, v) => Activation(b + v * x) } - def computeL2(l1: Seq[Double], ws: Seq[Double]): Double = - (l1 zip ws) map { case (l, w) => w * l } sum +% def computeL2(l1: Seq[Double], ws: Seq[Double]): Double = +% (l1 zip ws) map { case (l, w) => w * l } sum - def output(ws: Seq[Double])(x: Double): Double = - computeL2(computeL1(x), ws) +% def output(ws: Seq[Double])(x: Double): Double = +% computeL2(computeL1(x), ws) - def learn(data: Seq[(Double, Double)], ws: Seq[Double], - lamb: Double, gamma: Double): Seq[Double] = { +% def learn(data: Seq[(Double, Double)], ws: Seq[Double], +% lamb: Double, gamma: Double): Seq[Double] = { - lazy val deltas = data.map { - case (x, y) => - val l1 = computeL1(x) // n - val out = computeL2(l1, ws) // 1 - (l1 zip ws) map {case (l1, w) => (l1 * 2 * (out - y) + - lam * 2 * w) * gamma * -1} - } +% lazy val deltas = data.map { +% case (x, y) => +% val l1 = computeL1(x) // n +% val out = computeL2(l1, ws) // 1 +% (l1 zip ws) map {case (l1, w) => (l1 * 2 * (out - y) + +% lam * 2 * w) * gamma * -1} +% } - deltas.foldRight(ws)( - (delta, ws) => ws zip (delta) map { case (w, d) => w + d }) - } +% deltas.foldRight(ws)( +% (delta, ws) => ws zip (delta) map { case (w, d) => w + d }) +% } - def train(data: Seq[(Double, Double)], iter: Int, lam: Double, - gamma: Double = gamma): (Seq[Double], Double => Double)= { +% def train(data: Seq[(Double, Double)], iter: Int, lam: Double, +% gamma: Double = gamma): (Seq[Double], Double => Double)= { - val ws = (1 to iter).foldRight((1 to n).map( - _ => 0.0) :Seq[Double])((i, w) => { - println(s"Training iteration $i") - println(w.sum/w.length) - learn(data, w, lam, gamma / 10) - }) - (ws, output(ws)) - } -} -\end{lstlisting} -\caption{Scala code used to build and train the ridge penalized - randomized shallow neural network in .... The parameter \textit{lam} -in the train function represents the $\lambda$ parameter in the error -function. 
The parameters \textit{n} and \textit{gamma} set the number -of hidden nodes and the stepsize for training.} -\end{lstfloat} -\clearpage - -\begin{lstlisting}[language=iPython] -import tensorflow as tf -import numpy as np -from tensorflow.keras.callbacks import CSVLogger -from tensorflow.keras.preprocessing.image import ImageDataGenerator - -mnist = tf.keras.datasets.mnist - -(x_train, y_train), (x_test, y_test) = mnist.load_data() -x_train = x_train.reshape(x_train.shape[0], 28, 28, 1) -x_train = x_train / 255.0 -x_test = x_test.reshape(x_test.shape[0], 28, 28, 1) -x_test = x_test / 255.0 - -y_train = tf.keras.utils.to_categorical(y_train) -y_test = tf.keras.utils.to_categorical(y_test) - -model = tf.keras.models.Sequential() -model.add(tf.keras.layers.Conv2D(24,kernel_size=5,padding='same',activation='relu',input_shape=(28,28,1))) -model.add(tf.keras.layers.MaxPool2D()) -model.add(tf.keras.layers.Conv2D(64,kernel_size=5,padding='same',activation='relu')) -model.add(tf.keras.layers.MaxPool2D(padding='same')) -model.add(tf.keras.layers.Flatten()) -model.add(tf.keras.layers.Dense(256, activation='relu')) -model.add(tf.keras.layers.Dropout(0.2)) -model.add(tf.keras.layers.Dense(10, activation='softmax')) -model.compile(optimizer='adam', loss="categorical_crossentropy", - metrics=["accuracy"]) - -datagen = ImageDataGenerator( - rotation_range = 30, - zoom_range = 0.15, - width_shift_range=2, - height_shift_range=2, - shear_range = 1) - -csv_logger = CSVLogger() +% val ws = (1 to iter).foldRight((1 to n).map( +% _ => 0.0) :Seq[Double])((i, w) => { +% println(s"Training iteration $i") +% println(w.sum/w.length) +% learn(data, w, lam, gamma / 10) +% }) +% (ws, output(ws)) +% } +% } +% \end{lstlisting} +% \caption{Scala code used to build and train the ridge penalized +% randomized shallow neural network in .... The parameter \textit{lam} +% in the train function represents the $\lambda$ parameter in the error +% function. 
The parameters \textit{n} and \textit{gamma} set the number +% of hidden nodes and the stepsize for training.} +% \end{lstfloat} +% \clearpage + +% \begin{lstlisting}[language=iPython] +% import tensorflow as tf +% import numpy as np +% from tensorflow.keras.callbacks import CSVLogger +% from tensorflow.keras.preprocessing.image import ImageDataGenerator + +% mnist = tf.keras.datasets.mnist + +% (x_train, y_train), (x_test, y_test) = mnist.load_data() +% x_train = x_train.reshape(x_train.shape[0], 28, 28, 1) +% x_train = x_train / 255.0 +% x_test = x_test.reshape(x_test.shape[0], 28, 28, 1) +% x_test = x_test / 255.0 + +% y_train = tf.keras.utils.to_categorical(y_train) +% y_test = tf.keras.utils.to_categorical(y_test) + +% model = tf.keras.models.Sequential() +% model.add(tf.keras.layers.Conv2D(24,kernel_size=5,padding='same',activation='relu',input_shape=(28,28,1))) +% model.add(tf.keras.layers.MaxPool2D()) +% model.add(tf.keras.layers.Conv2D(64,kernel_size=5,padding='same',activation='relu')) +% model.add(tf.keras.layers.MaxPool2D(padding='same')) +% model.add(tf.keras.layers.Flatten()) +% model.add(tf.keras.layers.Dense(256, activation='relu')) +% model.add(tf.keras.layers.Dropout(0.2)) +% model.add(tf.keras.layers.Dense(10, activation='softmax')) +% model.compile(optimizer='adam', loss="categorical_crossentropy", +% metrics=["accuracy"]) + +% datagen = ImageDataGenerator( +% rotation_range = 30, +% zoom_range = 0.15, +% width_shift_range=2, +% height_shift_range=2, +% shear_range = 1) + +% csv_logger = CSVLogger() -history = model.fit(datagen.flow(x_train, y_train, batch_size=50), -validation_data=(x_test, y_test), - epochs=125, callbacks=[csv_logger], - steps_per_epoch = x_train.shape[0]//50) - -\end{lstlisting} -\clearpage -\begin{lstlisting}[language=iPython] -import tensorflow as tf -import numpy as np -from tensorflow.keras.callbacks import CSVLogger -from tensorflow.keras.preprocessing.image import ImageDataGenerator -mnist = tf.keras.datasets.fashion_mnist - -(x_train, y_train), (x_test, y_test) = mnist.load_data() -x_train = x_train.reshape(x_train.shape[0], 28, 28, 1) -x_test = x_test.reshape(x_test.shape[0], 28, 28, 1) -x_train, x_test = x_train / 255.0, x_test / 255.0 - -y_train = tf.keras.utils.to_categorical(y_train) -y_test = tf.keras.utils.to_categorical(y_test) - -model = tf.keras.Sequential() -model.add(tf.keras.layers.Conv2D(filters = 32, kernel_size = (3, 3), activation='relu', - input_shape = (28, 28, 1), padding='same')) -model.add(tf.keras.layers.Conv2D(filters = 32, kernel_size = (2, 2), activation='relu', padding = 'same')) -model.add(tf.keras.layers.MaxPool2D(strides=(2,2))) -model.add(tf.keras.layers.Conv2D(filters = 64, kernel_size = (3, 3), activation='relu', padding='same')) -model.add(tf.keras.layers.Conv2D(filters = 64, kernel_size = (3, 3), activation='relu', padding='same')) -model.add(tf.keras.layers.MaxPool2D(strides=(2,2))) -model.add(tf.keras.layers.Flatten()) -model.add(tf.keras.layers.Dense(256, activation='relu')) -model.add(tf.keras.layers.Dropout(0.2)) -model.add(tf.keras.layers.Dense(10, activation='softmax')) - -model.compile(optimizer=tf.keras.optimizers.Adam(lr = 1e-3), loss="categorical_crossentropy", metrics=["accuracy"]) - -datagen = ImageDataGenerator( - rotation_range = 15, - zoom_range = 0.1, - width_shift_range=2, - height_shift_range=2, - shear_range = 0.5, - fill_mode = 'constant', - cval = 0) +% history = model.fit(datagen.flow(x_train, y_train, batch_size=50), +% validation_data=(x_test, y_test), +% epochs=125, 
callbacks=[csv_logger], +% steps_per_epoch = x_train.shape[0]//50) + +% \end{lstlisting} +% \clearpage +% \begin{lstlisting}[language=iPython] +% import tensorflow as tf +% import numpy as np +% from tensorflow.keras.callbacks import CSVLogger +% from tensorflow.keras.preprocessing.image import ImageDataGenerator +% mnist = tf.keras.datasets.fashion_mnist + +% (x_train, y_train), (x_test, y_test) = mnist.load_data() +% x_train = x_train.reshape(x_train.shape[0], 28, 28, 1) +% x_test = x_test.reshape(x_test.shape[0], 28, 28, 1) +% x_train, x_test = x_train / 255.0, x_test / 255.0 + +% y_train = tf.keras.utils.to_categorical(y_train) +% y_test = tf.keras.utils.to_categorical(y_test) + +% model = tf.keras.Sequential() +% model.add(tf.keras.layers.Conv2D(filters = 32, kernel_size = (3, 3), activation='relu', +% input_shape = (28, 28, 1), padding='same')) +% model.add(tf.keras.layers.Conv2D(filters = 32, kernel_size = (2, 2), activation='relu', padding = 'same')) +% model.add(tf.keras.layers.MaxPool2D(strides=(2,2))) +% model.add(tf.keras.layers.Conv2D(filters = 64, kernel_size = (3, 3), activation='relu', padding='same')) +% model.add(tf.keras.layers.Conv2D(filters = 64, kernel_size = (3, 3), activation='relu', padding='same')) +% model.add(tf.keras.layers.MaxPool2D(strides=(2,2))) +% model.add(tf.keras.layers.Flatten()) +% model.add(tf.keras.layers.Dense(256, activation='relu')) +% model.add(tf.keras.layers.Dropout(0.2)) +% model.add(tf.keras.layers.Dense(10, activation='softmax')) + +% model.compile(optimizer=tf.keras.optimizers.Adam(lr = 1e-3), loss="categorical_crossentropy", metrics=["accuracy"]) + +% datagen = ImageDataGenerator( +% rotation_range = 15, +% zoom_range = 0.1, +% width_shift_range=2, +% height_shift_range=2, +% shear_range = 0.5, +% fill_mode = 'constant', +% cval = 0) - csv_logger = CSVLogger() +% csv_logger = CSVLogger() - history = model.fit(datagen.flow(x_train, y_train, batch_size=30), - steps_per_epoch=2000, - validation_data=(x_test, y_test), - epochs=125, callbacks=[csv_logger], - shuffle=True) - -\end{lstlisting} -\begin{lstlisting}[language=iPython] -def get_random_sample(a, b, number_of_samples=10): - x = [] - y = [] - for category_number in range(0,10): - # get all samples of a category - train_data_category = a[b==category_number] - # pick a number of random samples from the category - train_data_category = train_data_category[np.random.randint( - train_data_category.shape[0], size=number_of_samples), :] - x.extend(train_data_category) - y.append([category_number]*number_of_samples) - - return (np.asarray(x).reshape(-1, 28, 28, 1), - np.asarray(y).reshape(10*number_of_samples,1)) -\end{lstlisting} - -\end{document} \ No newline at end of file +% history = model.fit(datagen.flow(x_train, y_train, batch_size=30), +% steps_per_epoch=2000, +% validation_data=(x_test, y_test), +% epochs=125, callbacks=[csv_logger], +% shuffle=True) + +% \end{lstlisting} +% \begin{lstlisting}[language=iPython] +% def get_random_sample(a, b, number_of_samples=10): +% x = [] +% y = [] +% for category_number in range(0,10): +% # get all samples of a category +% train_data_category = a[b==category_number] +% # pick a number of random samples from the category +% train_data_category = train_data_category[np.random.randint( +% train_data_category.shape[0], size=number_of_samples), :] +% x.extend(train_data_category) +% y.append([category_number]*number_of_samples) + +% return (np.asarray(x).reshape(-1, 28, 28, 1), +% np.asarray(y).reshape(10*number_of_samples,1)) +% \end{lstlisting} 
+\begin{document} + +\begin{align} + \makebox[2cm][c]{$\overset{\text{Lem. A.6}}{\underset{\delta \text{ + small enough}}{=}} $} +\end{align} + +\end{document} + +%%% Local Variables: +%%% mode: latex +%%% TeX-master: t +%%% End: diff --git a/TeX/appendixA.tex b/TeX/appendixA.tex index b67feb3..3854d20 100644 --- a/TeX/appendixA.tex +++ b/TeX/appendixA.tex @@ -147,7 +147,7 @@ Wutte (2019, Lemma A.11)] \mathbb{R}_{>0} : \forall \omega \in \Omega : \forall l, l' \in \left\{1,\dots,N\right\} : \forall n \in \mathbb{N}$ \[ - \left(\abs{\xi_l(\omega) - \xi_{l'}(\omega)} < \delta \angle + \left(\abs{\xi_l(\omega) - \xi_{l'}(\omega)} < \delta \wedge \text{sign}(v_l(\omega)) = \text{sign}(v_{l'}(\omega))\right) \implies \abs{\frac{w_l^{*, \tilde{\lambda}}(\omega)}{v_l(\omega)} - \frac{w_{l'}^{*, \tilde{\lambda}}(\omega)}{v_{l'}(\omega)}} < @@ -157,6 +157,61 @@ Wutte (2019, Lemma A.11)] \proof given in .. \end{Lemma} +\begin{Lemma}[$\frac{w^{*,\tilde{\lambda}}}{v} \approx + \mathcal{O}(\frac{1}{n})$, Heiss, Teichmann, and + Wutte (2019, Lemma A.14)] + For any $\lambda > 0$ and data $(x_i^{\text{train}}, + y_i^{\text{train}}) \in \mathbb{R}^2, i\in + \left\{1,\dots,\right\}$, we have + \[ + \forall P \in (0,1) : \exists C \in \mathbb{R}_{>0} : \exists + n_0 \in \mathbb{N} : \forall n > n_0 : \mathbb{P} + \left[\max_{k\in \left\{1,\dots,n\right\}} + \frac{w_k^{*,\tilde{\lambda}}}{v_k} < C + \frac{1}{n}\right] > P + % \max_{k\in \left\{1,\dots,n\right\}} + % \frac{w_k^{*,\tilde{\lambda}}}{v_k} = \plimn + \] + \proof + Let $k^*_+ \in \argmax_{k\in + \left\{1,\dots,n\right\}}\frac{w^{*,\tilde{\lambda}}}{v_k} : v_k + > 0$ and $k^*_- \in \argmax_{k\in + \left\{1,\dots,n\right\}}\frac{w^{*,\tilde{\lambda}}}{v_k} : v_k + < 0$. W.l.o.g. assume $\frac{w_{k_+^*}^2}{v_{k_+^*}^2} \geq + \frac{w_{k_-^*}^2}{v_{k_-^*}^2}$ + \begin{align*} + \frac{F^{\lambda, + g}\left(f^{*,\lambda}_g\right)}{\tilde{\lambda}} + \makebox[2cm][c]{$\stackrel{\mathbb{P}}{\geq}$} + & \frac{1}{2 \tilde{\lambda}} + F_n^{\tilde{\lambda}}\left(\mathcal{RN}^{*,\tilde{\lambda}}\right) + = \frac{1}{2 \tilde{\lambda}}\left[\sum ... + \tilde{\lambda} \norm{w}_2^2\right] + \\ + \makebox[2cm][c]{$\geq$} + & \frac{1}{2}\left( \sum_{\substack{k: v_k + > 0 \\\xi_k\in(\xi_{k^*}, \xi_{k^*} + + \delta)}} \left(w_k^{*,\tilde{\lambda}}\right)^2 + + \sum_{\substack{k: v_k < 0 \\\xi_k\in(\xi_{k^*}, \xi_{k^*} + + \delta)}} \left(w_k^{*,\tilde{\lambda}}\right)^2\right) \\ + \makebox[2cm][c]{$\overset{\text{Lem. 
A.6}}{\underset{\delta \text{ + small enough}}{\geq}} $} + & + \frac{1}{4}\left(\left(\frac{w_{k_+^*}^{*,\tilde{\lambda}}} + {v_{k_+^*}}\right)^2\sum_{\substack{k: + v_k > 0 \\\xi_k\in(\xi_{k^*}, \xi_{k^*} + \delta)}}v_k^2 + + \left(\frac{w_{k_-^*}^{*,\tilde{\lambda}}}{v_{k_-^*}}\right)^2 + \sum_{\substack{k: + v_k < 0 \\\xi_k\in(\xi_{k^*}, \xi_{k^*} + + \delta)}}v_k^2\right)\\ + \makebox[2cm][c]{$\stackrel{\mathbb{P}}{\geq}$} + & \frac{1}{8} + \left(\frac{w_{k_+^*}^{*,\tilde{\lambda}}}{v_{k^*}}\right)^2 + n \delta g_\xi(\xi_{k_+^*}) \mathbb{P}(v_k + >0)\mathbb{E}[v_k^2|\xi_k = \xi_{k^*_+}] + \end{align*} + +\end{Lemma} + \input{Appendix_code.tex} \end{appendices} diff --git a/TeX/bibliograpy.bib b/TeX/bibliograpy.bib index f10d111..f2aa34b 100644 --- a/TeX/bibliograpy.bib +++ b/TeX/bibliograpy.bib @@ -7,7 +7,6 @@ copyright = {In Copyright - Non-Commercial Use Permitted}, keywords = {early stopping; implicit regularization; machine learning; neural networks; spline; regression; gradient descent; artificial intelligence}, size = {53 p.}, - abstract = {Today, various forms of neural networks are trained to perform approximation tasks in many fields. However, the solutions obtained are not fully understood. Empirical results suggest that typical training algorithms favor regularized solutions.These observations motivate us to analyze properties of the solutions found by gradient descent initialized close to zero, that is frequently employed to perform the training task. As a starting point, we consider one dimensional (shallow) ReLU neural networks in which weights are chosen randomly and only the terminal layer is trained. We show that the resulting solution converges to the smooth spline interpolation of the training data as the number of hidden nodes tends to infinity. Moreover, we derive a correspondence between the early stopped gradient descent and the smoothing spline regression. 
This might give valuable insight on the properties of the solutions obtained using gradient descent methods in general settings.}, DOI = {10.3929/ethz-b-000402003}, title = {How Implicit Regularization of Neural Networks Affects the Learned Function – Part I}, PAGES = {1911.02903} @@ -72,23 +71,14 @@ url={https://doi.org/10.1038/323533a0} username = {mhwombat}, year = 2010 } - -@article{resnet, - author = {Kaiming He and - Xiangyu Zhang and - Shaoqing Ren and - Jian Sun}, - title = {Deep Residual Learning for Image Recognition}, - journal = {CoRR}, - volume = {abs/1512.03385}, - year = 2015, - url = {http://arxiv.org/abs/1512.03385}, - archivePrefix = {arXiv}, - eprint = {1512.03385}, - timestamp = {Wed, 17 Apr 2019 17:23:45 +0200}, - biburl = {https://dblp.org/rec/journals/corr/HeZRS15.bib}, - bibsource = {dblp computer science bibliography, https://dblp.org} -} +@INPROCEEDINGS{resnet, + author={Kaiming {He} and Xiangyu {Zhang} and Shaoqing {Ren} and Jian {Sun}}, + booktitle={2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + title={Deep Residual Learning for Image Recognition}, + year={2016}, + volume={}, + number={}, + pages={770-778},} @book{PRML, title = {Pattern Recognition and Machine Learning}, @@ -117,6 +107,15 @@ numpages = {39} } @article{DBLP:journals/corr/DauphinPGCGB14, +author = {Dauphin, Yann and Pascanu, Razvan and Gulcehre, Caglar and Cho, Kyunghyun and Ganguli, Surya and Bengio, Y.}, +year = {2014}, +month = {06}, +pages = {}, +title = {Identifying and attacking the saddle point problem in high-dimensional non-convex optimization}, +volume = {27}, +journal = {NIPS} +} +@article{saddle_point, author = {Yann N. Dauphin and Razvan Pascanu and {\c{C}}aglar G{\"{u}}l{\c{c}}ehre and @@ -285,4 +284,15 @@ series = {ICISDM '18} timestamp = {Mon, 13 Aug 2018 16:48:10 +0200}, biburl = {https://dblp.org/rec/journals/corr/Ruder16.bib}, bibsource = {dblp computer science bibliography, https://dblp.org} -} \ No newline at end of file +} + +@incollection{goodfellow_gan, +title = {Generative Adversarial Nets}, +author = {Goodfellow, Ian and Pouget-Abadie, Jean and Mirza, Mehdi and Xu, Bing and Warde-Farley, David and Ozair, Sherjil and Courville, Aaron and Bengio, Yoshua}, +booktitle = {Advances in Neural Information Processing Systems 27}, +editor = {Z. Ghahramani and M. Welling and C. Cortes and N. D. Lawrence and K. Q. Weinberger}, +pages = {2672--2680}, +year = {2014}, +publisher = {Curran Associates, Inc.}, +url = {http://papers.nips.cc/paper/5423-generative-adversarial-nets.pdf} +} diff --git a/TeX/further_applications_of_nn.tex b/TeX/further_applications_of_nn.tex index 5f19172..75ef73f 100644 --- a/TeX/further_applications_of_nn.tex +++ b/TeX/further_applications_of_nn.tex @@ -1,4 +1,4 @@ -\section{Application of NN to higher complexity Problems} +\section{\titlecap{application of neural networks to higher complexity problems}} This section is based on \textcite[Chapter~9]{Goodfellow} @@ -155,36 +155,40 @@ in Figure~\ref{fig:img_conv}. 
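+
+As an illustration, a Gaussian blur and the Sobel operators can be
+applied to an image with a few lines of Python using two-dimensional
+convolution. The following is only a sketch: it assumes SciPy is
+available and a grayscale image given as a NumPy array, and the
+kernel sizes and values are chosen for illustration.
+\begin{lstlisting}[language=iPython]
+import numpy as np
+from scipy.signal import convolve2d
+
+# hypothetical grayscale image with values in [0, 1]
+image = np.random.default_rng(0).random((64, 64))
+
+def gaussian_kernel(size, sigma2):
+    # sample a 2d Gaussian density on a size x size grid and normalize it
+    ax = np.arange(size) - (size - 1) / 2
+    xx, yy = np.meshgrid(ax, ax)
+    kernel = np.exp(-(xx ** 2 + yy ** 2) / (2 * sigma2))
+    return kernel / kernel.sum()
+
+sobel_x = np.array([[1, 0, -1],
+                    [2, 0, -2],
+                    [1, 0, -1]])
+sobel_y = sobel_x.T
+
+blurred = convolve2d(image, gaussian_kernel(5, sigma2=1), mode='same')
+edges_x = convolve2d(image, sobel_x, mode='same')
+edges_y = convolve2d(image, sobel_y, mode='same')
+edges = np.sqrt(edges_x ** 2 + edges_y ** 2)  # combined Sobel operator
+\end{lstlisting}
+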
\begin{figure}[h] \centering - \begin{subfigure}{0.3\textwidth} + \begin{subfigure}{0.27\textwidth} \centering \includegraphics[width=\textwidth]{Figures/Data/klammern.jpg} - \caption{Original Picture} + \caption{\small Original Picture\\~} \label{subf:OrigPicGS} \end{subfigure} - \begin{subfigure}{0.3\textwidth} + \hspace{0.02\textwidth} + \begin{subfigure}{0.27\textwidth} \centering \includegraphics[width=\textwidth]{Figures/Data/image_conv9.png} - \caption{\hspace{-2pt}Gaussian Blur $\sigma^2 = 1$} + \caption{\small Gaussian Blur $\sigma^2 = 1$} \end{subfigure} - \begin{subfigure}{0.3\textwidth} + \hspace{0.02\textwidth} + \begin{subfigure}{0.27\textwidth} \centering \includegraphics[width=\textwidth]{Figures/Data/image_conv10.png} - \caption{Gaussian Blur $\sigma^2 = 4$} + \caption{\small Gaussian Blur $\sigma^2 = 4$} \end{subfigure}\\ - \begin{subfigure}{0.3\textwidth} + \begin{subfigure}{0.27\textwidth} \centering \includegraphics[width=\textwidth]{Figures/Data/image_conv4.png} - \caption{Sobel Operator $x$-direction} + \caption{\small Sobel Operator $x$-direction} \end{subfigure} - \begin{subfigure}{0.3\textwidth} + \hspace{0.02\textwidth} + \begin{subfigure}{0.27\textwidth} \centering \includegraphics[width=\textwidth]{Figures/Data/image_conv5.png} - \caption{Sobel Operator $y$-direction} + \caption{\small Sobel Operator $y$-direction} \end{subfigure} - \begin{subfigure}{0.3\textwidth} + \hspace{0.02\textwidth} + \begin{subfigure}{0.27\textwidth} \centering \includegraphics[width=\textwidth]{Figures/Data/image_conv6.png} - \caption{Sobel Operator combined} + \caption{\small Sobel Operator combined} \end{subfigure} % \begin{subfigure}{0.24\textwidth} % \centering @@ -199,7 +203,7 @@ in Figure~\ref{fig:img_conv}. \end{figure} \clearpage \newpage -\subsection{Convolutional NN} +\subsection{Convolutional Neural Networks} \todo{Eileitung zu CNN amout of parameters} % Conventional neural network as described in chapter .. are made up of % fully connected layers, meaning each node in a layer is influenced by @@ -239,10 +243,10 @@ The usage of multiple filters results in multiple outputs of the same size as the input (or slightly smaller if no padding is used). These are often called channels. For convolutional layers that are preceded by convolutional layers the -size of the filter is often chosen to coincide with the amount of channels +size of the filters are often chosen to coincide with the amount of channels of the output of the previous layer and not padded in this direction. -This results in the channels ``being squashed'' and prevents gaining +This results in these channels ``being squashed'' and prevents gaining additional dimensions\todo{filter mit ganzer tiefe besser erklären} in the output. This can also be used to flatten certain less interesting channels of @@ -252,14 +256,15 @@ the input as for example color channels. A way additionally reduce the size using convolution is not applying the convolution on every pixel, but rather specifying a certain ``stride'' -$s$ at which the filter $g$ is moved over the input $I$, +$s$ for each direction at which the filter $g$ is moved over the input $I$, \[ - O_{x,y,c} = \sum_{i,j,l \in \mathbb{Z}} I_{x-i,y-j,c-l} g_{i,j,l}. + O_{x,\dots,c} = \sum_{i,\dots,l \in \mathbb{Z}} I_{(x \cdot + s_x)-i,\dots,(c \cdot s_c)-l} \cdot g_{i,\dots,l}. \] -The size and stride for all filters in a layer should be the same in +The sizes and stride should be the same for all filters in a layer in order to get a uniform tensor as output. 
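+
+To make the indexing concrete, a strided convolution with filters
+spanning the full channel depth can be sketched in a few lines of
+NumPy. The function name, sizes and values below are arbitrary
+examples, and, as is common in deep learning frameworks, the kernel
+is not flipped, i.e.\ a cross-correlation is computed.
+\begin{lstlisting}[language=iPython]
+import numpy as np
+
+def strided_conv(I, g, s=(1, 1)):
+    # apply one filter g of shape (h, w, c) to an input I of shape
+    # (H, W, c) with stride s; the filter spans the full channel
+    # depth, so each filter yields a two-dimensional output
+    h, w, c = g.shape
+    H, W, _ = I.shape
+    out_h = (H - h) // s[0] + 1
+    out_w = (W - w) // s[1] + 1
+    O = np.zeros((out_h, out_w))
+    for x in range(out_h):
+        for y in range(out_w):
+            patch = I[x * s[0]:x * s[0] + h, y * s[1]:y * s[1] + w, :]
+            O[x, y] = np.sum(patch * g)
+    return O
+
+rng = np.random.default_rng(0)
+image = rng.random((28, 28, 3))     # e.g. a small RGB input
+filters = rng.random((4, 5, 5, 3))  # four 5x5 filters with full depth
+out = np.stack([strided_conv(image, f, s=(2, 2)) for f in filters],
+               axis=-1)
+print(out.shape)                    # (12, 12, 4): one channel per filter
+\end{lstlisting}
+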
-T% he size of the filters and the way they are applied can be tuned +% The size of the filters and the way they are applied can be tuned % while building the model should be the same for all filters in one % layer in order for the output being of consistent size in all channels. % It is common to reduce the d< by not applying the @@ -288,14 +293,13 @@ T% he size of the filters and the way they are applied can be tuned % model to the data. Using multiple kernels it is possible to extract % different features from the image (e.g. edges -> sobel). -In order to further reduce the size towards the final layer, convolutional +As a means to further reduce the size towards the final layer, convolutional layers are often followed by a pooling layer. In a pooling layer the input is reduced in size by extracting a single value from a neighborhood of pixels, often by taking the maximum value in the neighborhood (max-pooling). The resulting output size is dependent on -the offset of the neighborhoods used, this offset is commonly called -``stride''\todo{zwei mal stride}. +the offset (stride) of the neighborhoods used. The combination of convolution and pooling layers allows for extraction of features from the input in the from of feature maps while using relatively few parameters that need to be trained. @@ -306,6 +310,27 @@ by two fully connected layers. \begin{figure}[h] + \centering + \begin{subfigure}{0.19\textwidth} + \includegraphics[width=\textwidth]{Figures/Data/mnist0bw.pdf} + \caption{input} + \end{subfigure} + \begin{subfigure}{0.19\textwidth} + \includegraphics[width=\textwidth]{Figures/Data/conv2d_6.pdf} + \caption{convolution} + \end{subfigure} + \begin{subfigure}{0.19\textwidth} + \includegraphics[width=\textwidth]{Figures/Data/max_pooling2d_6.pdf} + \caption{max-pool} + \end{subfigure} + \begin{subfigure}{0.19\textwidth} + \includegraphics[width=\textwidth]{Figures/Data/conv2d_7.pdf} + \caption{convolution} + \end{subfigure} + \begin{subfigure}{0.19\textwidth} + \includegraphics[width=\textwidth]{Figures/Data/max_pooling2d_7.pdf} + \caption{max-pool} + \end{subfigure} \centering \begin{subfigure}{0.19\textwidth} \includegraphics[width=\textwidth]{Figures/Data/mnist0bw.pdf} @@ -333,10 +358,10 @@ by two fully connected layers. \label{fig:feature_map} \end{figure} -\subsubsection{Parallels to the Visual Cortex in Mammals} +% \subsubsection{Parallels to the Visual Cortex in Mammals} -The choice of convolution for image classification tasks is not -arbitrary. ... auge... bla bla +% The choice of convolution for image classification tasks is not +% arbitrary. ... auge... bla bla % \subsection{Limitations of the Gradient Descent Algorithm} @@ -345,7 +370,7 @@ arbitrary. ... auge... bla bla % -Problems navigating valleys -> momentum % -Different scale of gradients for vars in different layers -> ADAdelta -\subsection{Stochastic Training Algorithms} +\subsection{\titlecap{stochastic training algorithms}} For many applications in which neural networks are used such as image classification or segmentation, large training data sets become detrimental to capture the nuances of the @@ -356,15 +381,18 @@ derivatives of the network with respect for each variable need to be computed for all data points. Thus the amount of memory and computing power available limits the size of the training data that can be efficiently used in fitting the -network. A class of algorithms that augment the gradient descent +network. 
+ +A class of algorithms that augment the gradient descent algorithm in order to lessen this problem are stochastic gradient descent algorithms. Here the full dataset is split into smaller disjoint subsets. Then in each iteration a (different) subset of data is chosen to -compute the gradient (Algorithm~\ref{alg:sdg}). +compute the gradient (Algorithm~\ref{alg:sgd}). The training period until each data point has been considered at least once in updating the parameters is commonly called an ``epoch''. + Using subsets reduces the amount of memory required for storing the necessary values for each update, thus making it possible to use very large training sets to fit the model. @@ -407,7 +435,7 @@ In order to illustrate this behavior we modeled a convolutional neural network to classify handwritten digits. The data set used for this is the MNIST database of handwritten digits (\textcite{MNIST}, Figure~\ref{fig:MNIST}). -\input{Figures/mnist.tex} + The network used consists of two convolution and max pooling layers followed by one fully connected hidden layer and the output layer. Both covolutional layers utilize square filters of size five which are @@ -415,25 +443,15 @@ applied with a stride of one. The first layer consists of 32 filters and the second of 64. Both pooling layers pool a $2\times 2$ area. The fully connected layer consists of 256 nodes and the output layer of 10, one for each digit. -All layers use RELU as activation function, except the output layer -with the output layer which uses softmax (\ref{def:softmax}). -As loss function categorical crossentropy is used (\ref{eq:cross_entropy}). +All layers use a ReLU as activation function, except the output layer +which uses softmax (\ref{eq:softmax}). +As loss function categorical cross entropy (\ref{eq:cross_entropy}) is used. The architecture of the convolutional neural network is summarized in Figure~\ref{fig:mnist_architecture}. -\begin{figure} - \includegraphics[width=\textwidth]{Figures/Data/convnet_fig.pdf} - \caption{Convolutional neural network architecture used to model the - MNIST handwritten digits dataset. This figure was created using the - draw\textunderscore convnet Python script by \textcite{draw_convnet}.} - \label{fig:mnist_architecture} -\end{figure} - The results of the network being trained with gradient descent and stochastic gradient descent for 20 epochs are given in Figure~\ref{fig:sgd_vs_gd} -and Table~\ref{table:sgd_vs_gd} - - +and Table~\ref{table:sgd_vs_gd}. Here it can be seen that the network trained with stochstic gradient descent is more accurate after the first epoch than the ones trained with gradient descent after 20 epochs. @@ -445,58 +463,75 @@ gradient calculated on the subset it performs far better than the network using true gradients when training for the same mount of time. \todo{vergleich training time} +\input{Figures/mnist.tex} + +\begin{figure} + \includegraphics[width=\textwidth]{Figures/Data/convnet_fig.pdf} + \caption{Convolutional neural network architecture used to model the + MNIST handwritten digits dataset. This figure was created using the + draw\textunderscore convnet Python script by \textcite{draw_convnet}.} + \label{fig:mnist_architecture} +\end{figure} + \input{Figures/SGD_vs_GD.tex} \clearpage \subsection{\titlecap{modified stochastic gradient descent}} -This section is based on \textcite{ruder}. - -An inherent problem of the stochastic gradient descent algorithm is -its sensitivity to the learning rate $\gamma$. 
This results in the -problem of having to find a appropriate learning rate for each problem -which is largely guesswork, the impact of choosing a bad learning rate +This section is based on \textcite{ruder}, \textcite{ADAGRAD}, +\textcite{ADADELTA} and \textcite{ADAM}. + +While stochastic gradient descent can work quite well in fitting +models its sensitivity to the learning rate $\gamma$ is an inherent +problem. +This results in having to find an appropriate learning rate for each problem +which is largely guesswork. The impact of choosing a bad learning rate can be seen in Figure~\ref{fig:sgd_vs_gd}. % There is a inherent problem in the sensitivity of the gradient descent % algorithm regarding the learning rate $\gamma$. % The difficulty of choosing the learning rate can be seen % in Figure~\ref{sgd_vs_gd}. For small rates the progress in each iteration is small -but as the rate is enlarged the algorithm can become unstable and the parameters -diverge to infinity. Even for learning rates small enough to ensure the parameters +but for learning rates to large the algorithm can become unstable with +updates being larger then the parameters themselves which can result +in the parameters diverging to infinity. +Even for learning rates small enough to ensure the parameters do not diverge to infinity, steep valleys in the function to be minimized can hinder the progress of -the algorithm as for leaning rates not small enough gradient descent -``bounces between'' the walls of the valley rather then following a -downward trend in the valley. +the algorithm. +If the bottom of the valley slowly slopes towards the minimum +the steep nature of the valley can result in the +algorithm ``bouncing between'' the walls of the valley rather then +following the downwards trend. -% \[ -% w - \gamma \nabla_w ... -% \] -%thus the weights grow to infinity. -\todo{unstable learning rate besser - erklären} - -To combat this problem \todo{quelle} propose to alter the learning +A possible way to combat this is to alter the learning rate over the course of training, often called leaning rate -scheduling in order to decrease the learning rate over the course of -training. The most popular implementations of this are time based +scheduling. +The most popular implementations of this are time based decay \[ \gamma_{n+1} = \frac{\gamma_n}{1 + d n}, \] -where $d$ is the decay parameter and $n$ is the number of epochs, -step based decay where the learning rate is fixed for a span of $r$ +where $d$ is the decay parameter and $n$ is the number of epochs. +Step based decay where the learning rate is fixed for a span of $r$ epochs and then decreased according to parameter $d$ \[ - \gamma_n = \gamma_0 d^{\text{floor}{\frac{n+1}{r}}} + \gamma_n = \gamma_0 d^{\text{floor}{\frac{n+1}{r}}}. \] -and exponential decay where the learning rate is decreased after each epoch +And exponential decay where the learning rate is decreased after each epoch \[ \gamma_n = \gamma_o e^{-n d}. -\] -These methods are able to increase the accuracy of a model by large -margins as seen in the training of RESnet by \textcite{resnet}. -\todo{vielleicht grafik - einbauen} +\]\todo{satz aufteilen} +These methods are able to increase the accuracy of models by large +margins as seen in the training of RESnet by \textcite{resnet}, cf. Figure~\ref{fig:resnet}. 
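+
+Written out in code, the three schedules take only a few lines each.
+The following sketch uses arbitrary example values for $\gamma_0$,
+$d$ and $r$; such a schedule could, for instance, be passed to the
+training loop of the networks above via a callback such as
+\texttt{tf.keras.callbacks.LearningRateScheduler}.
+\begin{lstlisting}[language=iPython]
+import numpy as np
+
+gamma_0, d, r = 0.1, 0.05, 20  # example initial rate, decay and step length
+
+def time_based_decay(n):
+    # gamma_{k+1} = gamma_k / (1 + d k), applied epoch by epoch
+    gamma = gamma_0
+    for k in range(n):
+        gamma = gamma / (1 + d * k)
+    return gamma
+
+def step_based_decay(n):
+    # rate kept fixed for r epochs, then decreased by the factor d
+    return gamma_0 * d ** np.floor((n + 1) / r)
+
+def exponential_decay(n):
+    return gamma_0 * np.exp(-n * d)
+\end{lstlisting}
+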
+\begin{figure}[h] + \centering + \includegraphics[width=\textwidth]{Figures/Data/7780459-fig-4-source-hires.png} + \caption[Learning Rate Decay]{Error history of convolutional neural + network trained with learning rate decay. \textcite[Figure + 4]{resnet}} + \label{fig:resnet} +\end{figure} + + However stochastic gradient descent with weight decay is still highly sensitive to the choice of the hyperparameters $\gamma_0$ and $d$. @@ -504,25 +539,29 @@ In order to mitigate this problem a number of algorithms have been developed to regularize the learning rate with as minimal hyperparameter guesswork as possible. -We will examine and compare a ... algorithms that use a adaptive -learning rate. -They all scale the gradient for the update depending of past gradients -for each weight individually. +In the following we will compare three algorithms that use a adaptive +learning rate, meaning they scale the updates according to past iterations. +% We will examine and compare a four algorithms that use a adaptive +% learning rate. +% They all scale the gradient for the update depending of past gradients +% for each weight individually. The algorithms are build up on each other with the adaptive gradient algorithm (\textsc{AdaGrad}, \textcite{ADAGRAD}) laying the base work. Here for each parameter update the learning rate -is given my a constant -$\gamma$ is divided by the sum of the squares of the past partial +is given by a constant global rate +$\gamma$ divided by the sum of the squares of the past partial derivatives in this parameter. This results in a monotonous decaying learning rate with faster decay for parameters with large updates, where as -parameters with small updates experience smaller decay. The \textsc{AdaGrad} +parameters with small updates experience smaller decay. +The \textsc{AdaGrad} algorithm is given in Algorithm~\ref{alg:ADAGRAD}. Note that while this algorithm is still based upon the idea of gradient descent it no longer takes steps in the direction of the gradient while updating. Due to the individual learning rates for each parameter only -the direction/sign for single parameters remain the same. +the direction/sign for single parameters remain the same compared to +gradient descent. \begin{algorithm}[H] \SetAlgoLined @@ -589,7 +628,7 @@ As the root mean square of the past gradients is already used in the denominator of the learning rate a exponentially decaying root mean square of the past updates is used to obtain a $\Delta x$ quantity for the denominator resulting in the correct unit of the update. The full -algorithm is given by Algorithm~\ref{alg:adadelta}. +algorithm is given in Algorithm~\ref{alg:adadelta}. \begin{algorithm}[H] \SetAlgoLined @@ -613,13 +652,13 @@ algorithm is given by Algorithm~\ref{alg:adadelta}. While the stochastic gradient algorithm is less susceptible to getting stuck in local extrema than gradient descent the problem still persists especially -for saddle points with steep .... \textcite{DBLP:journals/corr/Dauphinpgcgb14} +for saddle points (\textcite{DBLP:journals/corr/Dauphinpgcgb14}). An approach to the problem of ``getting stuck'' in saddle point or local minima/maxima is the addition of momentum to SDG. Instead of using the actual gradient for the parameter update an average over the -past gradients is used. In order to avoid the need to SAVE the past -values usually a exponentially decaying average is used resulting in +past gradients is used. 
In order to avoid the need to hold the past +values in memory usually a exponentially decaying average is used resulting in Algorithm~\ref{alg:sgd_m}. This is comparable of following the path of a marble with mass rolling down the slope of the error function. The decay rate for the average is comparable to the inertia @@ -653,13 +692,15 @@ In an effort to combine the properties of the momentum method and the automatic adapted learning rate of \textsc{AdaDelta} \textcite{ADAM} developed the \textsc{Adam} algorithm, given in Algorithm~\ref{alg:adam}. Here the exponentially decaying -root mean square of the gradients is still used for realizing and +root mean square of the gradients is still used for regularizing the +learning rate and combined with the momentum method. Both terms are normalized such that -the ... are the first and second moment of the gradient. However the term used in +their means are the first and second moment of the gradient. However the term used in \textsc{AdaDelta} to ensure correct units is dropped for a scalar -global learning rate. This results in .. hyperparameters, however the +global learning rate. This results in four tunable hyperparameters, +however the algorithms seems to be exceptionally stable with the recommended -parameters of ... and is a very reliable algorithm for training +parameters of $\alpha = 0.001, \beta_1 = 0.9, \beta_2 = 0.999, \varepsilon=$1e-7 and is a very reliable algorithm for training neural networks. \begin{algorithm}[H] @@ -685,8 +726,10 @@ neural networks. \end{algorithm} In order to get an understanding of the performance of the above -discussed training algorithms the neural network given in ... has been -trained on the ... and the results are given in +discussed training algorithms the neural network given in \ref{fig:mnist_architecture} has been +trained on the MNIST handwriting dataset with the above described +algorithms. +The performance metrics of the resulting learned functions are given in Figure~\ref{fig:comp_alg}. Here it can be seen that the ADAM algorithm performs far better than the other algorithms, with AdaGrad and Adelta following... bla bla @@ -696,7 +739,7 @@ the other algorithms, with AdaGrad and Adelta following... bla bla % \subsubsubsection{Stochastic Gradient Descent} \clearpage -\subsection{Combating Overfitting} +\subsection{\titlecap{combating overfitting}} % As in many machine learning applications if the model is overfit in % the data it can drastically reduce the generalization of the model. In @@ -754,12 +797,12 @@ training as well as testing. %as well as testing. In order to make this approach feasible \textcite{Dropout1} propose random dropout. -Instead of training different models for each data point in a batch +Instead of training different models, for each data point in a batch randomly chosen nodes in the network are disabled (their output is fixed to zero) and the updates for the weights in the remaining -smaller network are comuted. These the updates computed for each data -point in the batch are then accumulated and applied to the full -network. +smaller network are computed. +After updates have been ... this way for each data point in a batch +the updates are accumulated and applied to the full network. This can be compared to many small networks which share their weights for their active neurons being trained simultaniously. For testing the ``mean network'' with all nodes active but their @@ -785,9 +828,12 @@ used. 
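+
+As a toy example, the forward pass of a single fully connected layer
+with this form of dropout might look as follows. The function name
+and the dropout probability are arbitrary choices here; frameworks
+such as Keras provide this functionality as a layer, e.g. the
+\texttt{Dropout(0.2)} layer used in the architecture above.
+\begin{lstlisting}[language=iPython]
+import numpy as np
+
+rng = np.random.default_rng(0)
+
+def dense_with_dropout(x, W, b, p_drop=0.2, training=True):
+    # fully connected layer with ReLU activation followed by dropout
+    a = np.maximum(0, x @ W + b)
+    if training:
+        # disable each node with probability p_drop for this update
+        mask = rng.random(a.shape) >= p_drop
+        return a * mask
+    # 'mean network' used for testing: scale by the retention probability
+    return a * (1 - p_drop)
+\end{lstlisting}
+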
\todo{comparable to averaging dropout networks, beispiel für % \textcite{Dropout}. \subsubsection{\titlecap{manipulation of input data}} -Another way to combat overfitting is to keep the network from learning -the dataset by manipulating the inputs randomly for each iteration of -training. This is commonly used in image based tasks as there are +Another way to combat overfitting is to keep the network form +``memorizing'' +the training data rather then learning the relation between in- and +output is to randomly alter the training inputs for +each iteration of training. +This is commonly used in image based tasks as there are often ways to maipulate the input while still being sure the labels remain the same. For example in a image classification task such as handwritten digits the associated label should remain right when the @@ -795,7 +841,8 @@ image is rotated or stretched by a small amount. When using this one has to be sure that the labels indeed remain the same or else the network will not learn the desired ... In the case of handwritten digits for example a to high rotation angle -will ... a nine or six. +will make the distinction between a nine or six hard and will lessen +the quality of the learned function. The most common transformations are rotation, zoom, shear, brightness, mirroring. Examples of this are given in Figure~\ref{fig:datagen}. @@ -827,15 +874,26 @@ mirroring. Examples of this are given in Figure~\ref{fig:datagen}. \label{fig:datagen} \end{figure} +\subsubsection{\titlecap{comparisons}} + In order to compare the benefits obtained from implementing these -measures we have trained the network given in ... on the same problem +measures we have trained the network given in +\ref{fig:mnist_architecture} on the handwriting recognition problem and implemented different combinations of data generation and dropout. The results are given in Figure~\ref{fig:gen_dropout}. For each scennario the model was trained five times and the performance measures were -averaged. It can be seen that implementing the measures does indeed -increase the performance of the model. Implementing data generation on -its own seems to have a larger impact than dropout and applying both -increases the accuracy even further. +averaged. + +It can be seen that implementing the measures does indeed +increase the performance of the model. +Using data generation to alter the training data seems to have a +larger impact than dropout, however utilizing both measures yields the +best results. +\todo{auf zahlen in tabelle verweisen?} + +% Implementing data generation on +% its own seems to have a larger impact than dropout and applying both +% increases the accuracy even further. The better performance stems most likely from reduced overfitting. The reduction in overfitting can be seen in @@ -843,29 +901,29 @@ reduction in overfitting can be seen in accuracy decreases with test accuracy increasing. However utlitizing data generation as well as dropout with a probability of 0.4 seems to be a too aggressive approach as the training accuracy drops below the -test accuracy\todo{kleine begründung}. +test accuracy\todo{kleine begründung}. \input{Figures/gen_dropout.tex} -\todo{Vergleich verschiedene dropout größen auf MNSIT o.ä., subset als -training set?} \clearpage \subsubsection{\titlecap{effectivety for small training sets}} For some applications (medical problems with small amount of patients) the available data can be highly limited. -In these problems the networks are highly ... 
for overfitting the +In these problems the networks are highly prone to overfit the data. In order to get a understanding of accuracys achievable and the -impact of the measures to prevent overfitting discussed above we and train -the network on datasets of varying sizes with different measures implemented. +impact of the methods aimed at mitigating overfitting discussed above we and train +networks with different measures implemented to fit datasets of +varying sizes. + For training we use the mnist handwriting dataset as well as the fashion mnist dataset. The fashion mnist dataset is a benchmark set build by \textcite{fashionMNIST} in order to provide a harder set, as state of the art models are able to achive accuracies of 99.88\% (\textcite{10.1145/3206098.3206111}) on the handwriting set. -The dataset contains 70.000 preprocessed images of clothes from -zalando, a overview is given in Figure~\ref{fig:fashionMNIST}. +The dataset contains 70.000 preprocessed and labeled images of clothes from +Zalando, a overview is given in Figure~\ref{fig:fashionMNIST}. \input{Figures/fashion_mnist.tex} @@ -874,90 +932,91 @@ zalando, a overview is given in Figure~\ref{fig:fashionMNIST}. \begin{minipage}{\textwidth} \small \begin{tabu} to \textwidth {@{}l*4{X[c]}@{}} - \Tstrut \Bstrut & \textsc{Adam} & D. 0.2 & Gen & Gen.+D. 0.2 \\ + \Tstrut \Bstrut & \textsc{Adam} & D. 0.2 & Gen & Gen.+D. 0.2 \\ \hline & - \multicolumn{4}{c}{\titlecap{test accuracy for 1 sample}}\Bstrut \\ + \multicolumn{4}{c}{\titlecap{test accuracy for 1 sample}}\Bstrut \\ \cline{2-5} - max \Tstrut & 0.5633 & 0.5312 & \textbf{0.6704} & 0.6604 \\ - min & 0.3230 & 0.4224 & 0.4878 & \textbf{0.5175} \\ - mean & 0.4570 & 0.4714 & 0.5862 & \textbf{0.6014} \\ - var \Bstrut & 0.0040 & \textbf{0.0012} & 0.0036 & 0.0023 \\ + max \Tstrut & 0.5633 & 0.5312 & \textbf{0.6704} & 0.6604 \\ + min & 0.3230 & 0.4224 & 0.4878 & \textbf{0.5175} \\ + mean & 0.4570 & 0.4714 & 0.5862 & \textbf{0.6014} \\ + var \Bstrut & 4.021e-3 & \textbf{1.175e-3} & 3.600e-3 & 2.348e-3 \\ \hline & - \multicolumn{4}{c}{\titlecap{test accuracy for 10 samples}}\Bstrut \\ + \multicolumn{4}{c}{\titlecap{test accuracy for 10 samples}}\Bstrut \\ \cline{2-5} - max \Tstrut & 0.8585 & 0.9423 & 0.9310 & \textbf{0.9441} \\ - min & 0.8148 & \textbf{0.9081} & 0.9018 & 0.9061 \\ - mean & 0.8377 & \textbf{0.9270} & 0.9185 & 0.9232 \\ - var \Bstrut & 2.7e-04 & 1.3e-04 & 6e-05 & 1.5e-04 \\ + max \Tstrut & 0.8585 & 0.9423 & 0.9310 & \textbf{0.9441} \\ + min & 0.8148 & \textbf{0.9081} & 0.9018 & 0.9061 \\ + mean & 0.8377 & \textbf{0.9270} & 0.9185 & 0.9232 \\ + var \Bstrut & 2.694e-4 & \textbf{1.278e-4} & 6.419e-5 & 1.504e-4 \\ \hline & - \multicolumn{4}{c}{\titlecap{test accuracy for 100 samples}}\Bstrut \\ + \multicolumn{4}{c}{\titlecap{test accuracy for 100 samples}}\Bstrut \\ \cline{2-5} - max \Tstrut & 0.9637 & 0.9796 & 0.9810 & \textbf{0.9811} \\ - min & 0.9506 & 0.9719 & 0.9702 & \textbf{0.9727} \\ - mean & 0.9582 & 0.9770 & 0.9769 & \textbf{0.9783} \\ - var \Bstrut & 2e-05 & 1e-05 & 1e-05 & 1e-05 \\ + max \Tstrut & 0.9637 & 0.9796 & 0.9810 & \textbf{0.9811} \\ + min & 0.9506 & 0.9719 & 0.9702 & \textbf{0.9727} \\ + mean & 0.9582 & 0.9770 & 0.9769 & \textbf{0.9783} \\ + var \Bstrut & 1.858e-5 & 5.778e-6 & 9.398e-6 & \textbf{4.333e-6} \\ \hline \end{tabu} \normalsize \captionof{table}{Values of the test accuracy of the model trained 10 times on random MNIST handwriting training sets containing 1, 10 and 100 - data points per class after 125 epochs. 
The mean achieved accuracy + data points per class after 125 epochs. The mean accuracy achieved for the full set employing both overfitting measures is } \label{table:digitsOF} \small \centering \begin{tabu} to \textwidth {@{}l*4{X[c]}@{}} - \Tstrut \Bstrut & \textsc{Adam} & D. 0.2 & Gen & Gen.+D. 0.2 \\ + \Tstrut \Bstrut & \textsc{Adam} & D. 0.2 & Gen & Gen.+D. 0.2 \\ \hline & - \multicolumn{4}{c}{\titlecap{test accuracy for 1 sample}}\Bstrut \\ + \multicolumn{4}{c}{\titlecap{test accuracy for 1 sample}}\Bstrut \\ \cline{2-5} - max \Tstrut & 0.4885 & \textbf{0.5613} & 0.5488 & 0.5475 \\ - min & 0.3710 & \textbf{0.3858} & 0.3736 & 0.3816 \\ - mean \Bstrut & 0.4166 & 0.4838 & 0.4769 & \textbf{0.4957} \\ - var & \textbf{0.002} & 0.00294 & 0.00338 & 0.0030 \\ + max \Tstrut & 0.4885 & \textbf{0.5513} & 0.5488 & 0.5475 \\ + min & 0.3710 & \textbf{0.3858} & 0.3736 & 0.3816 \\ + mean \Bstrut & 0.4166 & 0.4838 & 0.4769 & \textbf{0.4957} \\ + var & \textbf{1.999e-3} & 2.945e-3 & 3.375e-3 & 2.976e-3 \\ \hline & - \multicolumn{4}{c}{\titlecap{test accuracy for 10 samples}}\Bstrut \\ + \multicolumn{4}{c}{\titlecap{test accuracy for 10 samples}}\Bstrut \\ \cline{2-5} - max \Tstrut & 0.7370 & 0.7340 & 0.7236 & \textbf{0.7502} \\ - min & 0.6818 & 0.6673 & 0.6709 & \textbf{0.6799} \\ - mean & 0.7130 & \textbf{0.7156} & 0.7031 & 0.7136 \\ - var \Bstrut & 3.2e-04 & 3.4e-04 & 3.2e-04 & 4.5e-04 \\ + max \Tstrut & 0.7370 & 0.7340 & 0.7236 & \textbf{0.7502} \\ + min & \textbf{0.6818} & 0.6673 & 0.6709 & 0.6799 \\ + mean & 0.7130 & \textbf{0.7156} & 0.7031 & 0.7136 \\ + var \Bstrut & \textbf{3.184e-4} & 3.356e-4 & 3.194e-4 & 4.508e-4 \\ \hline & - \multicolumn{4}{c}{\titlecap{test accuracy for 100 samples}}\Bstrut \\ + \multicolumn{4}{c}{\titlecap{test accuracy for 100 samples}}\Bstrut \\ \cline{2-5} - max \Tstrut & 0.8454 & 0.8385 & 0.8456 & \textbf{0.8459} \\ - min & 0.8227 & 0.8200 & \textbf{0.8305} & 0.8274 \\ - mean & 0.8331 & 0.8289 & 0.8391 & \textbf{0.8409} \\ - var \Bstrut & 4e-05 & 4e-05 & 2e-05 & 3e-05 \\ + max \Tstrut & 0.8454 & 0.8385 & 0.8456 & \textbf{0.8459} \\ + min & 0.8227 & 0.8200 & \textbf{0.8305} & 0.8274 \\ + mean & 0.8331 & 0.8289 & 0.8391 & \textbf{0.8409} \\ + var \Bstrut & 3.847e-5 & 4.259e-5 & \textbf{2.315e-5} & 2.769e-5 \\ \hline \end{tabu} \normalsize - \captionof{table}{Values of the test accuracy of the model trained 10 times - on random fashion MNIST training sets containing 1, 10 and 100 data points per - class. The mean achieved accuracy for the full dataset is: ....} + \captionof{table}{Values of the test accuracy of the model trained + 10 times + on random fashion MNIST training sets containing 1, 10 and 100 + data points per class after 125 epochs. The mean accuracy achieved + for the full set employing both overfitting measures is } \label{table:fashionOF} -\end{minipage} -\clearpage % if needed/desired +\end{minipage}\todo{check values} +\clearpage } -The random datasets chosen for training are made up of a certain -number of datapoints for each class, which are chosen at random. The -sizes chosen for the comparisons are the full dataset, 100, 10 and 1 -data points -per class. +The models are trained on subsets with a certain amount of randomly +chosen datapoints per class. +The sizes chosen for the comparisons are the full dataset, 100, 10 and 1 +data points per class. For the task of classifying the fashion data a slightly altered model is used. The convolutional layers with filters of size 5 are replaced by two consecutive convolutional layers with filters of size 3. 
-This is done in order to have more ... in order to better ... the data -in the model. A diagram of the architecture is given in +This is done in order to have more ... in order to better accommodate +for the more complex nature of the data. A diagram of the architecture is given in Figure~\ref{fig:fashion_MNIST}. \afterpage{ @@ -981,7 +1040,7 @@ Listing~\ref{lst:fashion} for the fashion model. The models are trained for 125 epoch in order to have enough random -augmentations of the input images present during training +augmentations of the input images are present during training for the networks to fully profit of the additional training data generated. The test accuracies of the models after training for 125 @@ -998,7 +1057,7 @@ fashion application. \begin{tikzpicture} \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed, /pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth, - height = 0.35\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east}, + height = 0.4\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east}, xlabel = {epoch},ylabel = {Test Accuracy}, cycle list/Dark2, every axis plot/.append style={line width =1.25pt}] @@ -1031,7 +1090,7 @@ fashion application. \begin{tikzpicture} \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed, /pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth, - height = 0.35\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east}, + height = 0.4\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east}, xlabel = {epoch},ylabel = {Test Accuracy}, cycle list/Dark2, every axis plot/.append style={line width =1.25pt}] @@ -1061,7 +1120,7 @@ fashion application. \begin{tikzpicture} \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed, /pgf/number format/precision=3},tick style = {draw = none}, width = 0.9875\textwidth, - height = 0.35\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east}, + height = 0.4\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east}, xlabel = {epoch}, ylabel = {Test Accuracy}, cycle list/Dark2, every axis plot/.append style={line width =1.25pt}, ymin = {0.92}] @@ -1100,7 +1159,7 @@ fashion application. \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed, /pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth, - height = 0.35\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east}, + height = 0.4\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east}, xlabel = {epoch},ylabel = {Test Accuracy}, cycle list/Dark2, every axis plot/.append style={line width =1.25pt}] @@ -1132,7 +1191,7 @@ fashion application. \begin{tikzpicture} \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed, /pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth, - height = 0.35\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east}, + height = 0.4\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east}, xlabel = {epoch},ylabel = {Test Accuracy}, cycle list/Dark2, every axis plot/.append style={line width =1.25pt}, ymin = {0.62}] @@ -1162,7 +1221,7 @@ fashion application. 
\begin{tikzpicture}
    \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed,
        /pgf/number format/precision=3},tick style = {draw = none}, width = 0.9875\textwidth,
-        height = 0.35\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east},
+        height = 0.4\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east},
        xlabel = {epoch}, ylabel = {Test Accuracy}, cycle
        list/Dark2, every axis plot/.append style={line width
          =1.25pt}, ymin = {0.762}]
@@ -1188,45 +1247,129 @@ fashion application.
      \caption{100 samples per class}
      \vspace{.25cm}
  \end{subfigure}
-  \caption{Mean test accuracies of the models fitting the sampled MNIST
-    handwriting datasets over the 125 epochs of training.}
+  \caption{Mean test accuracies of the models fitting the sampled fashion
+    MNIST datasets over the 125 epochs of training.}
  \label{fig:plotOF_fashion}
\end{figure}

-It can be seen in ... and ... that the usage of .. overfitting
-measures greatly improves the accuracy for small datasets. However for
-the smallest size of one datapoint per class generating more data
-... outperforms dropout with only a ... improvment being seen by the
-implementation of dropout whereas datageneration improves the accuracy
-by... . On the other hand the implementation of dropout seems to
-reduce the variance in the model accuracy, as the variance in accuracy
-for the dropout model is less than .. while the variance of the
-datagen .. model is nearly the same. The model with datageneration
-... a reduction in variance with the addition of dropout.
-
-For the slightly larger training sets of ten samples per class the
-difference between the two measures seems smaller. Here the
-improvement in accuracy
-seen by dropout is slightly larger than the one of
-datageneration. However for the larger sized training set the variance
-in test accuracies is lower for the model with datageneration than the
-one with dropout.
+It can be seen in Figure ... that for the handwritten digits scenario
+using data generation greatly improves the accuracy for the smallest
+training set of one sample per class.
+While the addition of dropout only seems to have a small effect on the
+accuracy of the model, the variance is reduced further than with data
+generation. This drop in variance carries over to the combination of
+both measures, resulting in the overall best performing model.
+
+In the scenarios with 10 and 100 samples per class the measures improve
+the performance as well; however, the difference in performance between
+the overfitting measures is much smaller than in the first scenario,
+with the accuracy gain of dropout being similar to that of data
+generation.
+While the observation regarding the variances persists for the scenario
+with 100 samples per class, it does not for the one with 10 samples per
+class.
+Nonetheless, in all scenarios the addition of the measures reduces the
+variance of the model compared to the baseline.
+
+The model fit to the fashion MNIST data set benefits less from the
+measures.
+For the smallest scenario of one sample per class a substantial
+increase in accuracy can be observed for the models with the
+... measures. Contrary to the digits data set, dropout improves the
+model by a margin similar to data generation.
+For the larger data sets however the benefits are far smaller. While
+in the scenario with 100 samples per class a performance increase can
+be seen for ... of data generation, it performs worse in the 10
+samples per class scenario than the baseline model.
+Dropout does seem to have negligible impact on its own in both the 10
+and 100 sample scenarios. However in all scenarios the addition of
+dropout to data generation seems to ...
+
+Additional figures and tables for the same comparisons with different
+performance metrics are given in Appendix ...
+There it can be seen that while the measures ... reduce overfitting
+effectively for the handwritten digits data set, the neural networks
+trained on the fashion data set overfit despite these measures being
+in place.
+
+
+% It can be seen in ... that the usage of .. overfitting
+% measures greatly improves the accuracy for small datasets. However for
+% the smallest size of one datapoint per class generating more data
+% ... outperforms dropout with only a ... improvment being seen by the
+% implementation of dropout whereas datageneration improves the accuracy
+% by... . On the other hand the implementation of dropout seems to
+% reduce the variance in the model accuracy, as the variance in accuracy
+% for the dropout model is less than .. while the variance of the
+% datagen .. model is nearly the same. The model with datageneration
+% ... a reduction in variance with the addition of dropout.
+
+% For the slightly larger training sets of ten samples per class the
+% difference between the two measures seems smaller. Here the
+% improvement in accuracy
+% seen by dropout is slightly larger than the one of
+% datageneration. However for the larger sized training set the variance
+% in test accuracies is lower for the model with datageneration than the
+% one with dropout.
+
+% The results for the training sets with 100 samples per class resemble
+% the ones for the sets with 10 per class.
+
+Overall it seems that both measures can increase the performance of a
+convolutional neural network; however, the success is dependent on the problem.
+For the handwritten digits the strong result of data generation likely
+stems from the .. As the digits are not rotated or
+aligned the same way in all ... using images that are altered in such
+a way can help the network learn to recognize digits that are written
+at a different slant.
+In the fashion data set however the alignment of all images is very
+coherent and little to no difference between two data points of the
+same class can be ... by rotation, shifts or shear ...

-The results for the training sets with 100 samples per class resemble
-the ones for the sets with 10 per class.
-
-Overall the models ... both measures to combat overfitting seem to
-perform considerably well compared to the ones without. The usage of
-these measures has great potential in improving models used for
-applications with limited training data. Additional tables and figures
-visualizing the effects on the logarithmic corssentropy rather than
-loss are given in the appendix\todo{figs für appendix}

\clearpage
-\section{Schluss}
+\section{\titlecap{summary and outlook}}
+
+In this thesis we have taken a look at neural networks, their
+behavior in small scenarios and their application to image
+classification with limited datasets.
+
+We have shown that ridge penalized neural networks ... to
+slightly altered cubic smoothing splines, giving us an insight into
+the behavior of the function learned by neural networks.
+
+We have seen that choosing the right training algorithm can have a
+drastic impact on the efficiency of training and the quality of the
+model obtainable in a reasonable time frame.
+The \textsc{Adam} algorithm has proven itself to be the best fit for the
+task of classifying images. However there is ... ongoing research in
+improving these algorithms; for example \textcite{rADAM} propose an
+alteration to the \textsc{Adam} algorithm in order to make the
+... term more stable in early phases of training.
+
+We have seen that a convolutional network can benefit greatly from
+measures combating overfitting, especially if the available training sets are of
+a small size. However the success of the measures we have examined
+seems to be highly dependent on ...
+... there is further research being done on the topic of combating
+overfitting.
+\textcite{random_erasing} propose randomly erasing parts of the input
+images during training and are able to achieve a high accuracy on the fashion MNIST
+data set this way (96.35\%).
+While the data generation explored in this thesis is only able to
+generate new training data in a rudimentary fashion, there is ... in
+using more elaborate methods to enlarge the training set.
+\textcite{gan} explore the application of generative adversarial
+networks in order to ... for medical images with small ...
+These networks ... in order to generate completely new images
+... (cf. \textcite{goodfellow_gan}).
+
+Convolutional neural networks are able to achieve remarkable results
+and, with further improvements and ..., will find further applications
+and are here to stay.
+
\begin{itemize}
\item generate more data, GAN etc \textcite{gan}
\item Transfer learning, use network trained on different task and
diff --git a/TeX/introduction.tex b/TeX/introduction.tex
index 10ec7de..7b014a7 100644
--- a/TeX/introduction.tex
+++ b/TeX/introduction.tex
@@ -1,6 +1,22 @@
\section{Introduction}

+Neural networks have become a widely used model as they are relatively
+easy to build with modern frameworks like TensorFlow and are able to
+model complex data.
+In this thesis we will .. networks ..
+In order to get some understanding of the behavior of the function
+learned by neural networks we examine the convergence behavior for
+....
+
+An interesting application of neural networks is image classification.
+We ... impact of ... on the performance of
+a neural network in such a task.
+
+As in some applications, such as medical imaging, one might be limited
+to very small training sets, we study the impact of two measures in
+improving the accuracy in such a case by trying to ... the model from
+overfitting the data.
diff --git a/TeX/introduction_nn.tex b/TeX/introduction_nn.tex
index 5382849..a39a58c 100644
--- a/TeX/introduction_nn.tex
+++ b/TeX/introduction_nn.tex
@@ -1,5 +1,5 @@

-\section{Introduction to Neural Networks}
+\section{\titlecap{Introduction to Neural Networks}}

This chapter is based on \textcite[Chapter~6]{Goodfellow} and
\textcite{Haykin}.
@@ -95,18 +95,21 @@ Figure~\ref{fig:nn} and one of a neuron in Figure~\ref{fig:neuron}.
  \label{fig:nn}
\end{figure}

-\subsection{Nonlinearity of Neural Networks}
+\subsection{\titlecap{nonlinearity of neural networks}}

The arguably most important feature of neural networks that sets them
apart from linear models is the activation function implemented in
the neurons.
As seen in Figure~\ref{fig:neuron} on the weighted sum of the inputs a activation function $\sigma$ is applied resulting in the -output of the $k$-th neuron in a layer $l$ +output of the $k$-th neuron in a layer $l$ with $m$ nodes in layer $l-1$ being given by -\[ - o_{l,k} = \sigma\left(b_{l,k} + \sum_{j=1}^m w_{l,k,j} o_{l-1,j}\right) -\] -for weights $w_{l,k,j}$ and biases $b_{l,k}$. +\begin{align*} + o_{l,k} = \sigma\left(b_{l,k} + \sum_{j=1}^{m} w_{l,k,j} + o_{l-1,j}\right) +\end{align*} +for weights $w_{l,k,j}$ and biases $b_{l,k}$. For a network with $L$ +hidden layers and inputs $o_{0}$ the final outputs of the network +are thus given by $o_{L+1}$. The activation function is usually chosen nonlinear (a linear one would result in the entire model collapsing into a linear one\todo{beweis?}) which allows it to better model data where the relation of in- and output is @@ -308,7 +311,7 @@ neural network. % data-point in fitting the model, where usually some distance between % the model output and the labels is minimized. -\subsubsection{\titlecap{nonliniarity in last layer}} +\subsubsection{\titlecap{nonliniarity in the last layer}} Given the nature of the neural net the outputs of the last layer are real numbers. For regression tasks this is desirable, for @@ -333,9 +336,10 @@ This however makes training the model with gradient based methods impossible, as the transformation is either zero or undefined. A continuous transformation that is close to the argmax one is given by softmax -\[ +\begin{equation} \text{softmax}(o)_i = \frac{e^{o_i}}{\sum_j e^{o_j}}. -\] + \label{eq:softmax} +\end{equation} The softmax function transforms the realm of the output to the interval $[0,1]$ and the individual values sum to one, thus the output can be interpreted as a probability for each class given the input. @@ -406,7 +410,7 @@ As discussed above the output of a neural network for a classification problem can be interpreted as a probability distribution over the classes conditioned on the input. In this case it is desirable to use error functions designed to compare probability distributions. A -widespread error function for this use case is the cross entropy (\textcite{PRML}), +widespread error function for this use case is the categorical cross entropy (\textcite{PRML}), which for two discrete distributions $p, q$ with the same realm $C$ is given by \[ H(p, q) = \sum_{c \in C} p(c) \ln\left(\frac{1}{q(c)}\right), @@ -415,9 +419,10 @@ comparing $q$ to a target density $p$. For a data set $(x_i,y_i), i = 1,\dots,n$ where each $y_{i,c}$ corresponds to the probability of class $c$ given $x_i$ and predictor $f$ we get the loss function -\[ +\begin{equation} CE(f) = \sum_{i=1}^n H(y_i, f(x_i)). -\] + \label{eq:cross_entropy} +\end{equation} \todo{Den satz einbauen} -Maximum Likelihood @@ -471,7 +476,10 @@ expensive. By using the chain rule and exploiting the layered structure we can compute the parameter update much more efficiently, this practice is called backpropagation and was introduced by -\textcite{backprop}\todo{nachsehen ob richtige quelle}. +\textcite{backprop}\todo{nachsehen ob richtige quelle}. The algorithm +for one data point is given in Algorithm~\ref{alg:backprop}, but for all error +functions that are sums of errors for single data points (MSE, cross +entropy) backpropagation works analogous for larger training data. 
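+To make the steps of the algorithm below more concrete, the following
+sketch performs one backpropagation pass for a single data point on a
+small dense network in plain NumPy. It is only an illustration and not
+the implementation used for the experiments in this thesis; the squared
+error loss and the sigmoid activation in every layer are assumptions
+made for brevity.
+
+\begin{lstlisting}[language=Python]
+import numpy as np
+
+def sigmoid(z):
+    return 1.0 / (1.0 + np.exp(-z))
+
+def sigmoid_prime(z):
+    s = sigmoid(z)
+    return s * (1.0 - s)
+
+def backprop(weights, biases, x, y):
+    # Forward propagation: store pre-activations z_l and outputs o_l.
+    o, zs = [x], []
+    for W, b in zip(weights, biases):
+        z = W @ o[-1] + b
+        zs.append(z)
+        o.append(sigmoid(z))
+    # Derivative for the output layer, here for the squared error
+    # loss 0.5 * ||o_{L+1} - y||^2.
+    delta = (o[-1] - y) * sigmoid_prime(zs[-1])
+    grads_W = [np.outer(delta, o[-2])]
+    grads_b = [delta]
+    # Back propagate the error through the hidden layers.
+    for l in range(2, len(weights) + 1):
+        delta = (weights[-l + 1].T @ delta) * sigmoid_prime(zs[-l])
+        grads_W.insert(0, np.outer(delta, o[-l - 1]))
+        grads_b.insert(0, delta)
+    return grads_W, grads_b
+
+# Tiny example: 2 inputs, one hidden layer with 3 nodes, 1 output.
+rng = np.random.default_rng(0)
+weights = [rng.normal(size=(3, 2)), rng.normal(size=(1, 3))]
+biases = [np.zeros(3), np.zeros(1)]
+grads_W, grads_b = backprop(weights, biases,
+                            np.array([0.5, -1.0]), np.array([1.0]))
+\end{lstlisting}
+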
% \subsubsection{Backpropagation} @@ -485,11 +493,33 @@ called backpropagation and was introduced by % for each layer from output layer towards the first layer while only % needing to .... -\[ - \frac{\partial L(...)}{} -\] -Backprop noch aufschreiben -\todo{Backprop richtig aufschreiben} +\begin{algorithm}[H] + \SetAlgoLined + \KwInput{Inputs $o_0$, neural network + with $L$ hidden layers and weights $w$ and biases $b$ for $n_l$ + nodes and activation function $\sigma_l$ in layer $l$, loss $\tilde{L}$.} + Forward Propagation: + \For{$l \in \left\{1, \dots, L+1\right\}$}{ + Compute values for layer $l$: + $z_{l,k} \leftarrow b_{l,k} + w_{l,k}^{\mathrm{T}} o_{l-1}, k \in \left\{1,\dots,n_l\right\}$\; + $o_{l,k} \leftarrow \sigma_l(z_{l,k}), k \in \left\{1,\dots,n_l\right\}$ \; + } + Calculate derivative for output layer: $\delta_{L+1, k} \leftarrow + \frac{\partial\tilde{L}(o_{L+1})}{\partial o_{L+1,k}} \sigma_{L+1}'(z_{L+1,k})$\; + Back propagate the error: + \For{$l \in \left\{L,\dots,1\right\}$}{ + $\delta_{l,k} \leftarrow w_{l+1,k}^{\mathrm{T}} \delta_{l+1} + \sigma_{l}'(z_{l,k}), k=1,\dots,n_k$ + } + Calculate gradients: + $\frac{\partial\tilde{L}}{\partial w_{l,k,j}} = + \delta_{l,k}o_{l-1,j}$, + $\frac{\partial\tilde{L}}{\partial b_{l,k}} = + \delta_{l,k}$\; + + \caption{Backpropagation for one data point} + \label{alg:backprop} +\end{algorithm} %%% Local Variables: %%% mode: latex diff --git a/TeX/main.lot b/TeX/main.lot index f41b75e..79504e6 100644 --- a/TeX/main.lot +++ b/TeX/main.lot @@ -1,8 +1,8 @@ \boolfalse {citerequest}\boolfalse {citetracker}\boolfalse {pagetracker}\boolfalse {backtracker}\relax \babel@toc {english}{} \defcounter {refsection}{0}\relax -\contentsline {table}{\numberline {4.1}{\ignorespaces Performance metrics of the networks trained in Figure~\ref {fig:sgd_vs_gd} after 20 training epochs.\relax }}{30}{table.caption.34}% +\contentsline {table}{\numberline {4.1}{\ignorespaces Performance metrics of the networks trained in Figure~\ref {fig:sgd_vs_gd} after 20 training epochs.\relax }}{29}{table.caption.32}% \defcounter {refsection}{0}\relax -\contentsline {table}{\numberline {4.2}{\ignorespaces Values of the test accuracy of the model trained 10 times on random MNIST handwriting training sets containing 1, 10 and 100 data points per class after 125 epochs. The mean achieved accuracy for the full set employing both overfitting measures is \relax }}{41}{table.4.2}% +\contentsline {table}{\numberline {4.2}{\ignorespaces Values of the test accuracy of the model trained 10 times on random MNIST handwriting training sets containing 1, 10 and 100 data points per class after 125 epochs. The mean accuracy achieved for the full set employing both overfitting measures is \relax }}{42}{table.4.2}% \defcounter {refsection}{0}\relax -\contentsline {table}{\numberline {4.3}{\ignorespaces Values of the test accuracy of the model trained 10 times on random fashion MNIST training sets containing 1, 10 and 100 data points per class. The mean achieved accuracy for the full dataset is: ....\relax }}{41}{table.4.3}% +\contentsline {table}{\numberline {4.3}{\ignorespaces Values of the test accuracy of the model trained 10 times on random fashion MNIST training sets containing 1, 10 and 100 data points per class after 125 epochs. 
The mean accuracy achieved for the full set employing both overfitting measures is \relax }}{42}{table.4.3}% diff --git a/TeX/main.tex b/TeX/main.tex index 9aace0a..6427570 100644 --- a/TeX/main.tex +++ b/TeX/main.tex @@ -34,7 +34,7 @@ \usepackage{todonotes} \usepackage{lipsum} \usepackage[ruled,vlined]{algorithm2e} -%\usepackage{showframe} +\usepackage{showframe} \usepackage[protrusion=true, expansion=true, kerning=true, letterspace = 150]{microtype} \usepackage{titlecaps} @@ -43,8 +43,9 @@ \usepackage{chngcntr} \usepackage{hyperref} \hypersetup{ - linktoc=all, %set to all if you want both sections and subsections linked + linktoc=all, %set to all if you want both sections and subsections linked } +\allowdisplaybreaks \captionsetup[sub]{justification=centering} @@ -202,6 +203,7 @@ \DeclareMathOperator*{\plim}{\mathbb{P}\text{-}\lim} \DeclareMathOperator{\supp}{supp} \DeclareMathOperator*{\argmin}{arg\,min} +\DeclareMathOperator*{\argmax}{arg\,max} \DeclareMathOperator*{\po}{\mathbb{P}\text{-}\mathcal{O}} \DeclareMathOperator*{\equals}{=} \begin{document} @@ -286,6 +288,413 @@ % Appendix A \input{appendixA.tex} +\section{\titlecap{additional comparisons}} +In this section we show additional comparisons for the neural networks +trained in Section~\ref{...}. In ... the same comparisons given for +the test accuracy are given for the cross entropy loss on the test +set, as well as on the training set. + + +\begin{figure}[h] + \centering + \small + \begin{subfigure}[h]{\textwidth} + \begin{tikzpicture} + \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed, + /pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth, + height = 0.4\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east}, + xlabel = {epoch},ylabel = {Test Accuracy}, cycle + list/Dark2, every axis plot/.append style={line width + =1.25pt}] + \addplot table + [x=epoch, y=val_loss, col sep=comma, mark = none] + {Figures/Data/adam_1.mean}; + \addplot table + [x=epoch, y=val_loss, col sep=comma, mark = none] + {Figures/Data/adam_dropout_02_1.mean}; + \addplot table + [x=epoch, y=val_loss, col sep=comma, mark = none] + {Figures/Data/adam_datagen_1.mean}; + \addplot table + [x=epoch, y=val_loss, col sep=comma, mark = none] + {Figures/Data/adam_datagen_dropout_02_1.mean}; + + + \addlegendentry{\footnotesize{Default}} + \addlegendentry{\footnotesize{D. 0.2}} + \addlegendentry{\footnotesize{G.}} + \addlegendentry{\footnotesize{G. + D. 0.2}} + \addlegendentry{\footnotesize{D. 
0.4}} + \addlegendentry{\footnotesize{Default}} + \end{axis} + \end{tikzpicture} + \caption{1 sample per class} + \vspace{0.25cm} + \end{subfigure} + \begin{subfigure}[h]{\textwidth} + \begin{tikzpicture} + \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed, + /pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth, + height = 0.4\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east}, + xlabel = {epoch},ylabel = {Test Accuracy}, cycle + list/Dark2, every axis plot/.append style={line width + =1.25pt}] + \addplot table + [x=epoch, y=val_loss, col sep=comma, mark = none] + {Figures/Data/adam_dropout_00_10.mean}; + \addplot table + [x=epoch, y=val_loss, col sep=comma, mark = none] + {Figures/Data/adam_dropout_02_10.mean}; + \addplot table + [x=epoch, y=val_loss, col sep=comma, mark = none] + {Figures/Data/adam_datagen_dropout_00_10.mean}; + \addplot table + [x=epoch, y=val_loss, col sep=comma, mark = none] + {Figures/Data/adam_datagen_dropout_02_10.mean}; + + + \addlegendentry{\footnotesize{Default.}} + \addlegendentry{\footnotesize{D. 0.2}} + \addlegendentry{\footnotesize{G.}} + \addlegendentry{\footnotesize{G + D. 0.2}} + \end{axis} + \end{tikzpicture} + \caption{10 samples per class} + \end{subfigure} + \begin{subfigure}[h]{\textwidth} + \begin{tikzpicture} + \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed, + /pgf/number format/precision=3},tick style = {draw = none}, width = 0.9875\textwidth, + height = 0.4\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east}, + xlabel = {epoch}, ylabel = {Test Accuracy}, cycle + list/Dark2, every axis plot/.append style={line width + =1.25pt}] + \addplot table + [x=epoch, y=val_loss, col sep=comma, mark = none] + {Figures/Data/adam_dropout_00_100.mean}; + \addplot table + [x=epoch, y=val_loss, col sep=comma, mark = none] + {Figures/Data/adam_dropout_02_100.mean}; + \addplot table + [x=epoch, y=val_loss, col sep=comma, mark = none] + {Figures/Data/adam_datagen_dropout_00_100.mean}; + \addplot table + [x=epoch, y=val_loss, col sep=comma, mark = none] + {Figures/Data/adam_datagen_dropout_02_100.mean}; + + \addlegendentry{\footnotesize{Default.}} + \addlegendentry{\footnotesize{D. 0.2}} + \addlegendentry{\footnotesize{G.}} + \addlegendentry{\footnotesize{G + D. 
0.2}} + \end{axis} + \end{tikzpicture} + \caption{100 samples per class} + \vspace{.25cm} + \end{subfigure} + \caption{Mean test accuracies of the models fitting the sampled MNIST + handwriting datasets over the 125 epochs of training.} +\end{figure} + +\begin{figure}[h] + \centering + \small + \begin{subfigure}[h]{\textwidth} + \begin{tikzpicture} + \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed, + /pgf/number format/precision=3},tick style = + {draw = none}, width = \textwidth, + height = 0.4\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east}, + xlabel = {epoch},ylabel = {Test Accuracy}, cycle + list/Dark2, every axis plot/.append style={line width + =1.25pt}] + \addplot table + [x=epoch, y=val_loss, col sep=comma, mark = none] + {Figures/Data/fashion_dropout_0_1.mean}; + \addplot table + [x=epoch, y=val_loss, col sep=comma, mark = none] + {Figures/Data/fashion_dropout_2_1.mean}; + \addplot table + [x=epoch, y=val_loss, col sep=comma, mark = none] + {Figures/Data/fashion_datagen_dropout_0_1.mean}; + \addplot table + [x=epoch, y=val_loss, col sep=comma, mark = none] + {Figures/Data/fashion_datagen_dropout_2_1.mean}; + + + \addlegendentry{\footnotesize{Default}} + \addlegendentry{\footnotesize{D. 0.2}} + \addlegendentry{\footnotesize{G.}} + \addlegendentry{\footnotesize{G. + D. 0.2}} + \addlegendentry{\footnotesize{D. 0.4}} + \end{axis} + \end{tikzpicture} + \caption{1 sample per class} + \vspace{0.25cm} + \end{subfigure} + \begin{subfigure}[h]{\textwidth} + \begin{tikzpicture} + \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed, + /pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth, + height = 0.4\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east}, + xlabel = {epoch},ylabel = {Test Accuracy}, cycle + list/Dark2, every axis plot/.append style={line width + =1.25pt}, ymin = {0.62}] + \addplot table + [x=epoch, y=val_loss, col sep=comma, mark = none] + {Figures/Data/fashion_dropout_0_10.mean}; + \addplot table + [x=epoch, y=val_loss, col sep=comma, mark = none] + {Figures/Data/fashion_dropout_2_10.mean}; + \addplot table + [x=epoch, y=val_loss, col sep=comma, mark = none] + {Figures/Data/fashion_datagen_dropout_0_10.mean}; + \addplot table + [x=epoch, y=val_loss, col sep=comma, mark = none] + {Figures/Data/fashion_datagen_dropout_2_10.mean}; + + + \addlegendentry{\footnotesize{Default.}} + \addlegendentry{\footnotesize{D. 0.2}} + \addlegendentry{\footnotesize{G.}} + \addlegendentry{\footnotesize{G + D. 
0.2}} + \end{axis} + \end{tikzpicture} + \caption{10 samples per class} + \end{subfigure} + \begin{subfigure}[h]{\textwidth} + \begin{tikzpicture} + \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed, + /pgf/number format/precision=3},tick style = {draw = none}, width = 0.9875\textwidth, + height = 0.4\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east}, + xlabel = {epoch}, ylabel = {Test Accuracy}, cycle + list/Dark2, every axis plot/.append style={line width + =1.25pt}] + \addplot table + [x=epoch, y=val_loss, col sep=comma, mark = none] + {Figures/Data/fashion_dropout_0_100.mean}; + \addplot table + [x=epoch, y=val_loss, col sep=comma, mark = none] + {Figures/Data/fashion_dropout_2_100.mean}; + \addplot table + [x=epoch, y=val_loss, col sep=comma, mark = none] + {Figures/Data/fashion_datagen_dropout_0_100.mean}; + \addplot table + [x=epoch, y=val_loss, col sep=comma, mark = none] + {Figures/Data/fashion_datagen_dropout_2_100.mean}; + + \addlegendentry{\footnotesize{Default.}} + \addlegendentry{\footnotesize{D. 0.2}} + \addlegendentry{\footnotesize{G.}} + \addlegendentry{\footnotesize{G + D. 0.2}} + \end{axis} + \end{tikzpicture} + \caption{100 samples per class} + \vspace{.25cm} + \end{subfigure} + \caption{Mean test accuracies of the models fitting the sampled fashion MNIST + over the 125 epochs of training.} +\end{figure} + +\begin{figure}[h] + \centering + \small + \begin{subfigure}[h]{\textwidth} + \begin{tikzpicture} + \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed, + /pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth, + height = 0.4\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east}, + xlabel = {epoch},ylabel = {Test Accuracy}, cycle + list/Dark2, every axis plot/.append style={line width + =1.25pt}] + \addplot table + [x=epoch, y=accuracy, col sep=comma, mark = none] + {Figures/Data/adam_1.mean}; + \addplot table + [x=epoch, y=accuracy, col sep=comma, mark = none] + {Figures/Data/adam_dropout_02_1.mean}; + \addplot table + [x=epoch, y=accuracy, col sep=comma, mark = none] + {Figures/Data/adam_datagen_1.mean}; + \addplot table + [x=epoch, y=accuracy, col sep=comma, mark = none] + {Figures/Data/adam_datagen_dropout_02_1.mean}; + + + \addlegendentry{\footnotesize{Default}} + \addlegendentry{\footnotesize{D. 0.2}} + \addlegendentry{\footnotesize{G.}} + \addlegendentry{\footnotesize{G. + D. 0.2}} + \addlegendentry{\footnotesize{D. 
0.4}} + \addlegendentry{\footnotesize{Default}} + \end{axis} + \end{tikzpicture} + \caption{1 sample per class} + \vspace{0.25cm} + \end{subfigure} + \begin{subfigure}[h]{\textwidth} + \begin{tikzpicture} + \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed, + /pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth, + height = 0.4\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east}, + xlabel = {epoch},ylabel = {Test Accuracy}, cycle + list/Dark2, every axis plot/.append style={line width + =1.25pt}] + \addplot table + [x=epoch, y=accuracy, col sep=comma, mark = none] + {Figures/Data/adam_dropout_00_10.mean}; + \addplot table + [x=epoch, y=accuracy, col sep=comma, mark = none] + {Figures/Data/adam_dropout_02_10.mean}; + \addplot table + [x=epoch, y=accuracy, col sep=comma, mark = none] + {Figures/Data/adam_datagen_dropout_00_10.mean}; + \addplot table + [x=epoch, y=accuracy, col sep=comma, mark = none] + {Figures/Data/adam_datagen_dropout_02_10.mean}; + + + \addlegendentry{\footnotesize{Default.}} + \addlegendentry{\footnotesize{D. 0.2}} + \addlegendentry{\footnotesize{G.}} + \addlegendentry{\footnotesize{G + D. 0.2}} + \end{axis} + \end{tikzpicture} + \caption{10 samples per class} + \end{subfigure} + \begin{subfigure}[h]{\textwidth} + \begin{tikzpicture} + \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed, + /pgf/number format/precision=3},tick style = {draw = none}, width = 0.9875\textwidth, + height = 0.4\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east}, + xlabel = {epoch}, ylabel = {Test Accuracy}, cycle + list/Dark2, every axis plot/.append style={line width + =1.25pt}, ymin = {0.92}] + \addplot table + [x=epoch, y=accuracy, col sep=comma, mark = none] + {Figures/Data/adam_dropout_00_100.mean}; + \addplot table + [x=epoch, y=accuracy, col sep=comma, mark = none] + {Figures/Data/adam_dropout_02_100.mean}; + \addplot table + [x=epoch, y=accuracy, col sep=comma, mark = none] + {Figures/Data/adam_datagen_dropout_00_100.mean}; + \addplot table + [x=epoch, y=accuracy, col sep=comma, mark = none] + {Figures/Data/adam_datagen_dropout_02_100.mean}; + + \addlegendentry{\footnotesize{Default.}} + \addlegendentry{\footnotesize{D. 0.2}} + \addlegendentry{\footnotesize{G.}} + \addlegendentry{\footnotesize{G + D. 
0.2}} + \end{axis} + \end{tikzpicture} + \caption{100 samples per class} + \vspace{.25cm} + \end{subfigure} + \caption{Mean test accuracies of the models fitting the sampled MNIST + handwriting datasets over the 125 epochs of training.} +\end{figure} + +\begin{figure}[h] + \centering + \small + \begin{subfigure}[h]{\textwidth} + \begin{tikzpicture} + \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed, + /pgf/number format/precision=3},tick style = + {draw = none}, width = \textwidth, + height = 0.4\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east}, + xlabel = {epoch},ylabel = {Test Accuracy}, cycle + list/Dark2, every axis plot/.append style={line width + =1.25pt}] + \addplot table + [x=epoch, y=accuracy, col sep=comma, mark = none] + {Figures/Data/fashion_dropout_0_1.mean}; + \addplot table + [x=epoch, y=accuracy, col sep=comma, mark = none] + {Figures/Data/fashion_dropout_2_1.mean}; + \addplot table + [x=epoch, y=accuracy, col sep=comma, mark = none] + {Figures/Data/fashion_datagen_dropout_0_1.mean}; + \addplot table + [x=epoch, y=accuracy, col sep=comma, mark = none] + {Figures/Data/fashion_datagen_dropout_2_1.mean}; + + + \addlegendentry{\footnotesize{Default}} + \addlegendentry{\footnotesize{D. 0.2}} + \addlegendentry{\footnotesize{G.}} + \addlegendentry{\footnotesize{G. + D. 0.2}} + \addlegendentry{\footnotesize{D. 0.4}} + \end{axis} + \end{tikzpicture} + \caption{1 sample per class} + \vspace{0.25cm} + \end{subfigure} + \begin{subfigure}[h]{\textwidth} + \begin{tikzpicture} + \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed, + /pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth, + height = 0.4\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east}, + xlabel = {epoch},ylabel = {Test Accuracy}, cycle + list/Dark2, every axis plot/.append style={line width + =1.25pt}, ymin = {0.62}] + \addplot table + [x=epoch, y=accuracy, col sep=comma, mark = none] + {Figures/Data/fashion_dropout_0_10.mean}; + \addplot table + [x=epoch, y=accuracy, col sep=comma, mark = none] + {Figures/Data/fashion_dropout_2_10.mean}; + \addplot table + [x=epoch, y=accuracy, col sep=comma, mark = none] + {Figures/Data/fashion_datagen_dropout_0_10.mean}; + \addplot table + [x=epoch, y=accuracy, col sep=comma, mark = none] + {Figures/Data/fashion_datagen_dropout_2_10.mean}; + + + \addlegendentry{\footnotesize{Default.}} + \addlegendentry{\footnotesize{D. 0.2}} + \addlegendentry{\footnotesize{G.}} + \addlegendentry{\footnotesize{G + D. 
0.2}} + \end{axis} + \end{tikzpicture} + \caption{10 samples per class} + \end{subfigure} + \begin{subfigure}[h]{\textwidth} + \begin{tikzpicture} + \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed, + /pgf/number format/precision=3},tick style = {draw = none}, width = 0.9875\textwidth, + height = 0.4\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east}, + xlabel = {epoch}, ylabel = {Test Accuracy}, cycle + list/Dark2, every axis plot/.append style={line width + =1.25pt}] + \addplot table + [x=epoch, y=accuracy, col sep=comma, mark = none] + {Figures/Data/fashion_dropout_0_100.mean}; + \addplot table + [x=epoch, y=accuracy, col sep=comma, mark = none] + {Figures/Data/fashion_dropout_2_100.mean}; + \addplot table + [x=epoch, y=accuracy, col sep=comma, mark = none] + {Figures/Data/fashion_datagen_dropout_0_100.mean}; + \addplot table + [x=epoch, y=accuracy, col sep=comma, mark = none] + {Figures/Data/fashion_datagen_dropout_2_100.mean}; + + \addlegendentry{\footnotesize{Default.}} + \addlegendentry{\footnotesize{D. 0.2}} + \addlegendentry{\footnotesize{G.}} + \addlegendentry{\footnotesize{G + D. 0.2}} + \end{axis} + \end{tikzpicture} + \caption{100 samples per class} + \vspace{.25cm} + \end{subfigure} + \caption{Mean test accuracies of the models fitting the sampled fashion MNIST + over the 125 epochs of training.} +\end{figure} + \end{document} %%% Local Variables: diff --git a/TeX/theo_3_8.tex b/TeX/theo_3_8.tex index 66dc28c..d9a79ff 100644 --- a/TeX/theo_3_8.tex +++ b/TeX/theo_3_8.tex @@ -224,8 +224,8 @@ plot coordinates { \end{figure} \clearpage -\subsection{Convergence Behaviour of 1-dim. Randomized Shallow Neural - Networks} +\subsection{\titlecap{convergence behaviour of 1-dim. randomized shallow neural + networks}} This section is based on \textcite{heiss2019}. @@ -963,7 +963,7 @@ would equate to $g(x) = \frac{\mathbb{E}[v_k^2|\xi_k = x]}{10}$. In order to utilize the smoothing spline implemented in Mathlab, $g$ has been simplified to $g \equiv \frac{1}{10}$ instead. For all figures $f_1^{*, \lambda}$ has -been calculated with Matlab's 'smoothingspline', as this minimizes +been calculated with Matlab's ``smoothingspline'', as this minimizes \[ \bar{\lambda} \sum_{i=1}^N(y_i^{train} - f(x_i^{train}))^2 + (1 - \bar{\lambda}) \int (f''(x))^2 dx @@ -971,7 +971,7 @@ been calculated with Matlab's 'smoothingspline', as this minimizes the smoothing parameter used for fittment is $\bar{\lambda} = \frac{1}{1 + \lambda}$. The parameter $\tilde{\lambda}$ for training the networks is chosen as defined in Theorem~\ref{theo:main1} and each -one is trained on the full training data for 5000 epoch using +network is trained on the full training data for 5000 epochs using gradient descent. The results are given in Figure~\ref{fig:rn_vs_rs}, here it can be seen that in the intervall of the traing data $[-\pi, \pi]$ the neural network and