main
Tobias Arndt 4 years ago
parent 1a45e7d596
commit bad8e42630

@@ -80,6 +80,7 @@ plot coordinates {
\\\cline{1-4}\cline{6-9}
GD$_{0.01}$&GD$_{0.05}$&GD$_{0.1}$&SGD$_{0.01}$&&GD$_{0.01}$&GD$_{0.05}$&GD$_{0.1}$&SGD$_{0.01}$
\\\cline{1-4}\cline{6-9}
\multicolumn{9}{c}{test}\\
0.265&0.633&0.203&0.989&&2.267&1.947&3.91&0.032
\end{tabu}
\caption{Performance metrics of the networks trained in

@@ -0,0 +1,53 @@
\begin{figure}[h]
\centering
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/fashion_mnist0.pdf}
\caption{T-shirt/top}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/fashion_mnist1.pdf}
\caption{Trousers}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/fashion_mnist2.pdf}
\caption{Pullover}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/fashion_mnist3.pdf}
\caption{Dress}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/fashion_mnist4.pdf}
\caption{Coat}
\end{subfigure}\\
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/fashion_mnist5.pdf}
\caption{Sandal}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/fashion_mnist6.pdf}
\caption{Shirt}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/fashion_mnist7.pdf}
\caption{Sneaker}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/fashion_mnist8.pdf}
\caption{Bag}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/fashion_mnist9.pdf}
\caption{Ankle boot}
\end{subfigure}
\caption{The Fashion MNIST data set contains 70,000 preprocessed product
images from Zalando, which are categorized as
T-shirt/top, Trouser, Pullover, Dress, Coat, Sandal, Shirt,
Sneaker, Bag, and Ankle boot. Of these images 60,000 are used as training images, while
the remaining 10,000 are used to validate the trained models.}
\label{fig:MNIST}
\end{figure}
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "../main"
%%% End:

@@ -0,0 +1,79 @@
\pgfplotsset{
compat=1.11,
legend image code/.code={
\draw[mark repeat=2,mark phase=2]
plot coordinates {
(0cm,0cm)
(0.3cm,0cm) %% default is (0.3cm,0cm)
(0.6cm,0cm) %% default is (0.6cm,0cm)
};%
}
}
\begin{figure}
\begin{subfigure}[h]{\textwidth}
\begin{tikzpicture}
\begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed,
/pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth,
height = 0.6\textwidth, ymin = 0.988, legend style={at={(0.9825,0.0175)},anchor=south east},
xlabel = {epoch}, ylabel = {Classification Accuracy}, cycle list/Dark2]
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Plots/Data/adam_datagen_full_mean.log};
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Plots/Data/adam_datagen_dropout_02_full_mean.log};
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Plots/Data/adam_datagen_dropout_04_full_mean.log};
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Plots/Data/adam_dropout_02_full_mean.log};
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Plots/Data/adam_dropout_04_full_mean.log};
\addplot [dashed] table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Plots/Data/adam_full_mean.log};
\addlegendentry{\footnotesize{G.}}
\addlegendentry{\footnotesize{G. + D. 0.2}}
\addlegendentry{\footnotesize{G. + D. 0.4}}
\addlegendentry{\footnotesize{D. 0.2}}
\addlegendentry{\footnotesize{D. 0.4}}
\addlegendentry{\footnotesize{Default}}
\end{axis}
\end{tikzpicture}
\caption{Classification accuracy}
\vspace{.25cm}
\end{subfigure}
\begin{subfigure}[h]{1.0\linewidth}
\begin{tabu} to \textwidth {@{} l *6{X[c]} @{}}
\multicolumn{7}{c}{Classification Accuracy}\Bstrut
\\\hline
&\textsc{Adam}&D. 0.2&D. 0.4&G.&G.+D.~0.2&G.+D.~0.4 \Tstrut \Bstrut
\\\hline
mean&0.9914&0.9918&0.9928&0.9937&0.9938&0.9940 \Tstrut \\
max& \\
min& \\
\multicolumn{7}{c}{Training Accuracy}\Bstrut
\\\hline
mean&0.9994&0.9990&0.9989&0.9967&0.9954&0.9926 \Tstrut \\
max& \\
min& \\
\end{tabu}
\caption{Mean, maximum, and minimum accuracy after 48 epochs of training.}
\end{subfigure}
\caption{Accuracy for the net given in ... with Dropout (D.),
data generation (G.), a combination of both, or neither (Default), trained
with \textsc{Adam}. For each epoch either the 60,000 training samples
were used or, in the case of data generation, 10,000 steps, each using a
batch of 60 generated data points. For each configuration the
model was trained 5 times and the average accuracies at each epoch
are given in (a). Mean, maximum, and minimum values of the accuracy on
the test and training sets are given in (b).}
\end{figure}
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "../main"
%%% End:

@@ -7,6 +7,10 @@
\usepackage{tabu}
\usepackage{graphicx}
\usetikzlibrary{calc, 3d}
\usepgfplotslibrary{colorbrewer}
\newcommand\Tstrut{\rule{0pt}{2.6ex}} % = `top' strut
\newcommand\Bstrut{\rule[-0.9ex]{0pt}{0pt}} % = `bottom' strut
\begin{document}
\pgfplotsset{
@@ -15,71 +19,80 @@ legend image code/.code={
\draw[mark repeat=2,mark phase=2]
plot coordinates {
(0cm,0cm)
(0.0cm,0cm) %% default is (0.3cm,0cm)
(0.0cm,0cm) %% default is (0.6cm,0cm)
(0.3cm,0cm) %% default is (0.3cm,0cm)
(0.6cm,0cm) %% default is (0.6cm,0cm)
};%
}
}
\begin{figure}
\begin{subfigure}[b]{\textwidth}
\begin{subfigure}[h]{\textwidth}
\begin{tikzpicture}
\begin{axis}[tick style = {draw = none}, width = \textwidth,
height = 0.7\textwidth, ymin = 0.92, legend style={at={(0.9825,0.75)},anchor=north east},
xlabel = {epoch}, ylabel = {Classification Accuracy}]
\begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed,
/pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth,
height = 0.6\textwidth, ymin = 0.988, legend style={at={(0.9825,0.0175)},anchor=south east},
xlabel = {epoch}, ylabel = {Classification Accuracy}, cycle list/Dark2]
% \addplot [dashed] table
% [x=epoch, y=accuracy, col sep=comma, mark = none]
% {Data/adam_datagen_full.log};
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Data/adagrad.log};
{Data/adam_datagen_full_mean.log};
% \addplot [dashed] table
% [x=epoch, y=accuracy, col sep=comma, mark = none]
% {Data/adam_datagen_dropout_02_full.log};
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Data/adadelta.log};
{Data/adam_datagen_dropout_02_full_mean.log};
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Data/adam.log};
\addlegendentry{\footnotesize{ADAGRAD}}
\addlegendentry{\footnotesize{ADADELTA}}
\addlegendentry{\footnotesize{ADAM}}
\addlegendentry{SGD$_{0.01}$}
\end{axis}
\end{tikzpicture}
%\caption{Classification accuracy}
\end{subfigure}
\begin{subfigure}[b]{\textwidth}
\begin{tikzpicture}
\begin{axis}[tick style = {draw = none}, width = \textwidth,
height = 0.7\textwidth, ymax = 0.5,
xlabel = {epoch}, ylabel = {Error Measure\vphantom{y}},ytick ={0,0.1,0.2,0.3,0.4,0.45,0.5}, yticklabels =
{0,0.1,0.2,0.3,0.4,\phantom{0.94},0.5}]
\addplot table
[x=epoch, y=val_loss, col sep=comma, mark = none] {Data/adagrad.log};
{Data/adam_datagen_dropout_04_full_mean.log};
\addplot table
[x=epoch, y=val_loss, col sep=comma, mark = none] {Data/adadelta.log};
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Data/adam_dropout_02_full_mean.log};
\addplot table
[x=epoch, y=val_loss, col sep=comma, mark = none] {Data/adam.log};
\addlegendentry{\footnotesize{ADAGRAD}}
\addlegendentry{\footnotesize{ADADELTA}}
\addlegendentry{\footnotesize{ADAM}}
\addlegendentry{SGD$_{0.01}$}
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Data/adam_dropout_04_full_mean.log};
\addplot [dashed] table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Data/adam_full_mean.log};
\addlegendentry{\footnotesize{G.}}
\addlegendentry{\footnotesize{G. + D. 0.2}}
\addlegendentry{\footnotesize{G. + D. 0.4}}
\addlegendentry{\footnotesize{D. 0.2}}
\addlegendentry{\footnotesize{D. 0.4}}
\addlegendentry{\footnotesize{Default}}
\end{axis}
\end{tikzpicture}
\caption{Performance metrics during training}
\end{subfigure}
\\~\\
\begin{subfigure}[b]{1.0\linewidth}
\begin{tabu} to \textwidth {@{} *3{X[c]}c*3{X[c]} @{}}
\multicolumn{3}{c}{Classification Accuracy}
&~&\multicolumn{3}{c}{Error Measure}
\\\cline{1-3}\cline{5-7}
ADAGRAD&ADADELTA&ADAM&&ADAGRAD&ADADELTA&ADAM
\\\cline{1-3}\cline{5-7}
1&1&1&&1&1&1
\caption{Classification accuracy}
\vspace{.25cm}
\end{subfigure}
\begin{subfigure}[h]{1.0\linewidth}
\begin{tabu} to \textwidth {@{} l *6{X[c]} @{}}
\multicolumn{7}{c}{Classification Accuracy}\Bstrut
\\\hline
&\textsc{Adam}&D. 0.2&D. 0.4&G.&G.+D.~0.2&G.+D.~0.4 \Tstrut \Bstrut
\\\hline
mean&0.9994&0.9990&0.9989&0.9937&0.9938&0.9940 \Tstrut \\
max& \\
min& \\
\multicolumn{7}{c}{Training Accuracy}\Bstrut
\\\hline
mean&0.9914&0.9918&0.9928&0.9937&0.9938&0.9940 \Tstrut \\
max& \\
min& \\
\end{tabu}
\caption{Performance metrics after 20 epochs}
\end{subfigure}
\caption{Performance metrics of the network given in ... trained
with different optimization algorithms}
\caption{Mean, maximum, and minimum accuracy after 48 epochs of training.}
\end{subfigure}
\caption{Accuracy for the net given in ... with Dropout (D.),
data generation (G.), a combination of both, or neither (Default), trained
with \textsc{Adam}. For each epoch either the 60,000 training samples
were used or, in the case of data generation, 10,000 steps, each using a
batch of 60 generated data points. For each configuration the
model was trained 5 times and the average accuracies at each epoch
are given in (a). Mean, maximum, and minimum values of the accuracy on
the test and training sets are given in (b).}
\end{figure}
\begin{center}
@@ -87,18 +100,23 @@ plot coordinates {
\centering
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Data/mnist0.pdf}
\caption{original\\image}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Data/mnist1.pdf}
\includegraphics[width=\textwidth]{Data/mnist_gen_zoom.pdf}
\caption{random\\zoom}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Data/mnist2.pdf}
\includegraphics[width=\textwidth]{Data/mnist_gen_shear.pdf}
\caption{random\\shear}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Data/mnist3.pdf}
\includegraphics[width=\textwidth]{Data/mnist_gen_rotation.pdf}
\caption{random\\rotation}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Data/mnist4.pdf}
\includegraphics[width=\textwidth]{Data/mnist_gen_shift.pdf}
\caption{random\\positional shift}
\end{subfigure}\\
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Data/mnist5.pdf}

@@ -67,7 +67,7 @@ plot coordinates {
\end{tabu}
\caption{Performance metrics after 20 epochs}
\end{subfigure}
\caption{Performance metrics of the network given in ... trained
\caption{Classification accuracy on the test set and ... Performance metrics of the network given in ... trained
with different optimization algorithms}
\end{figure}
%%% Local Variables:

@@ -450,7 +450,7 @@ $\gamma$ is divided by the sum of the squares of the past partial
derivatives with respect to this parameter. This results in a monotonically
decreasing learning rate for each parameter, which decays faster for
parameters with large updates, whereas
parameters with small updates experience smaller decay. The ADAGRAD
parameters with small updates experience smaller decay. The \textsc{AdaGrad}
algorithm is given in Algorithm~\ref{alg:ADAGRAD}.
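Written out for a single coordinate, this yields the update (a sketch using
$g_{s,i}$ for the $i$-th partial derivative at step $s$ and a small constant
$\varepsilon$ guarding against division by zero; the notation may differ from
the one used in Algorithm~\ref{alg:ADAGRAD})
\[
x_{t+1,i} = x_{t,i} - \frac{\gamma}{\sqrt{\varepsilon + \sum_{s=1}^{t} g_{s,i}^2}}\, g_{t,i},
\qquad i = 1, \dots, p.
\]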
\begin{algorithm}[H]
@@ -465,15 +465,15 @@ algorithm is given in Algorithm~\ref{alg:ADAGRAD}.
1, \dots,p$\;
Apply Update: $x_{t+1} \leftarrow x_t + \Delta x_t$\;
}
\caption{\textls{ADAGRAD}}
\caption{\textls{\textsc{AdaGrad}}}
\label{alg:ADAGRAD}
\end{algorithm}
Building on \textsc{AdaGrad} \textcite{ADADELTA} developed the ... (ADADELTA)
in order to improve upon the two main drawbacks of ADAGRAD, being the
Building on \textsc{AdaGrad} \textcite{ADADELTA} developed the ... (\textsc{AdaDelta})
in order to improve upon the two main drawbacks of \textsc{AdaGrad}, being the
continual decay of the learning rate and the need for a manually
selected global learning rate $\gamma$.
As ADAGRAD accumulates the squared gradients the learning rate will
As \textsc{AdaGrad} uses division by the accumulated squared gradients, the learning rate will
eventually become arbitrarily small.
In order to ensure that learning continues to make progress even after
a significant number of iterations, instead of summing the gradients a
@@ -500,7 +500,7 @@ by these of the parameter update $\Delta x_t$. This proper
x^2]_{t-1} + (1-\rho)\Delta x_t^2$\;
Apply Update: $x_{t+1} \leftarrow x_t + \Delta x_t$\;
}
\caption{ADADELTA, \textcite{ADADELTA}}
\caption{\textsc{AdaDelta}, \textcite{ADADELTA}}
\label{alg:gd}
\end{algorithm}
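In compact form the resulting update can be sketched, following \textcite{ADADELTA}, as
\[
\Delta x_t = -\frac{\operatorname{RMS}[\Delta x]_{t-1}}{\operatorname{RMS}[g]_t}\, g_t,
\qquad x_{t+1} = x_t + \Delta x_t,
\]
where $\operatorname{RMS}[y]_t = \sqrt{E[y^2]_t + \varepsilon}$ and $g_t$ denotes the
gradient at step $t$; this notation is a sketch and may deviate from the one used
in the algorithm above.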
@@ -520,11 +520,11 @@ of the marble.
This results in the algorithm being able to escape ... due to the
momentum built up while approaching it.
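In its simplest form this idea corresponds to the classical momentum update,
sketched here with a velocity term $v_t$ and a decay parameter $\beta \in [0,1)$
(both symbols are chosen for illustration and are not taken from the text):
\[
v_{t+1} = \beta v_t - \gamma \nabla f(x_t), \qquad x_{t+1} = x_t + v_{t+1}.
\]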
\begin{itemize}
\item ADAM
\item momentum
\item ADADELTA \textcite{ADADELTA}
\end{itemize}
% \begin{itemize}
% \item ADAM
% \item momentum
% \item ADADELTA \textcite{ADADELTA}
% \end{itemize}
\begin{algorithm}[H]
@@ -665,7 +665,37 @@ When using this one has to be sure that the labels indeed remain the
same or else the network will not learn the desired ...
In the case of handwritten digits, for example, a too high rotation angle
will ... a nine or six.
The most common transformations are rotation, zoom, shear, brightness, mirroring.
The most common transformations are rotation, zoom, shear, brightness
adjustment, and mirroring.
\begin{figure}[h]
\centering
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/mnist0.pdf}
\caption{original\\image}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/mnist_gen_zoom.pdf}
\caption{random\\zoom}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/mnist_gen_shear.pdf}
\caption{random\\shear}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/mnist_gen_rotation.pdf}
\caption{random\\rotation}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/mnist_gen_shift.pdf}
\caption{random\\positional shift}
\end{subfigure}
\caption{Examples of the manipulations used in ... As all images are
of the same intensity, brightness manipulation does not seem
... Additionally, mirroring is not used for ... reasons.}
\end{figure}
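In practice such transformations are applied on the fly while training. The
following is a minimal sketch of how batches of randomly transformed images
could be generated with the \texttt{ImageDataGenerator} class of Keras; the
concrete parameter values are illustrative and not the ones used for the
experiments described here.
\begin{verbatim}
from tensorflow.keras.datasets import mnist
from tensorflow.keras.preprocessing.image import ImageDataGenerator

(x_train, y_train), _ = mnist.load_data()
# reshape to (samples, height, width, channels) and scale to [0, 1]
x_train = x_train.reshape(-1, 28, 28, 1).astype("float32") / 255.0

# random rotation, zoom, shear and positional shift; no mirroring,
# since flipped digits would no longer carry the correct label
datagen = ImageDataGenerator(
    rotation_range=15,
    zoom_range=0.1,
    shear_range=10,
    width_shift_range=0.1,
    height_shift_range=0.1,
)

# yields batches of 60 freshly transformed images per training step
batches = datagen.flow(x_train, y_train, batch_size=60)
x_batch, y_batch = next(batches)
\end{verbatim}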
\input{Plots/gen_dropout.tex}
\todo{Comparison of different dropout sizes on MNIST or similar, subset as
training set?}
@@ -674,10 +704,41 @@ training set?}
For some applications (e.g.~medical problems with a small number of patients)
the available data can be highly limited.
In order to get an understanding of the achievable accuracy in such a
scenario, in the following we examine the ... and ... with a highly
reduced training set, as well as the impact the above-mentioned strategies for
combating overfitting have.
In these problems the networks are highly ... for overfitting the
data. In order to get an understanding of the achievable accuracies and the
impact of the measures against overfitting discussed above, we train
the network on data sets of varying sizes.
First we use the MNIST handwriting data set and then a slightly harder
problem given by the Fashion MNIST data set, which contains preprocessed
pictures of clothes from 10 different categories.
\input{Plots/fashion_mnist.tex}
For training, a certain number of random data points per class is
chosen as the training set. The sizes chosen are:
\begin{itemize}
\item full data set: ... per class
\item 1000 per class
\item 100 per class
\item 10 per class
\end{itemize}
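Such a reduced training set can be obtained by drawing a fixed number of random
samples for each class from the full training data. The following is a minimal
sketch assuming the data is loaded through the Keras MNIST loader; the function
and variable names are illustrative.
\begin{verbatim}
import numpy as np
from tensorflow.keras.datasets import mnist

(x_train, y_train), (x_test, y_test) = mnist.load_data()

def subsample_per_class(x, y, n_per_class, seed=0):
    """Draw n_per_class random samples for every label occurring in y."""
    rng = np.random.default_rng(seed)
    chosen = []
    for label in np.unique(y):
        idx = np.flatnonzero(y == label)
        chosen.append(rng.choice(idx, size=n_per_class, replace=False))
    chosen = np.concatenate(chosen)
    rng.shuffle(chosen)
    return x[chosen], y[chosen]

# e.g. a training set with 100 samples per class
x_small, y_small = subsample_per_class(x_train, y_train, 100)
\end{verbatim}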
The results for training ... are given in ... Here it can be seen ...
\begin{figure}[h]
\centering
\missingfigure{datagen digits}
\caption{Sample pictures of the MNIST handwriting data set, one per
class.}
\label{mnist digits}
\end{figure}
\begin{figure}[h]
\centering
\missingfigure{datagen fashion}
\caption{Sample pictures of the Fashion MNIST data set, one per
class.}
\label{mnist fashion}
\end{figure}
\clearpage
\section{Bla}

@@ -295,7 +295,7 @@ interpretation.
Commonly the nodes in the output layer each correspond to a class and
the class chosen as prediction is the one with the highest value at
the corresponding output node.
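For example, an output vector $o = (0.1, 2.3, 0.7)^T$ leads to the second class
being predicted, which corresponds to the one-hot vector $(0, 1, 0)^T$.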
The naive transformation to achieve this is transforming the output
This corresponds to a transformation of the output
vector $o$ into a one-hot vector
\[
\text{pred}_i =

@@ -92,6 +92,9 @@
\newcommand{\abs}[1]{\ensuremath{\left\vert#1\right\vert}}
\newcommand\Tstrut{\rule{0pt}{2.6ex}} % = `top' strut
\newcommand\Bstrut{\rule[-0.9ex]{0pt}{0pt}} % = `bottom' strut
\SetKwInput{KwInput}{Input}
%\newcommand{\myrightarrow}[1]{\xrightarrow{\makebox[2em][c]{$\scriptstyle#1$}}}

@@ -6,6 +6,10 @@
%%% End:
\section{Shallow Neural Networks}
In order to get some understanding of the behavior of neural
networks we study a simplified class of networks called shallow neural
networks in this chapter. We consider shallow neural networks consisting of a single
hidden layer and
In order to examine some of the behavior of neural networks, in this chapter
we consider a simple class of networks, the shallow ones. These
networks contain only one hidden layer and have a single output node.
