dataset a (different) subset of the data is chosen to
compute the gradient in each iteration.
The number of iterations needed until each data point has been considered in
updating the parameters is commonly called an ``epoch''.

Using subsets reduces the amount of memory and computing power required for
each iteration. This makes it possible to use very large training
sets to fit the model.
Additionally, the noise introduced into the gradient estimate can improve
the accuracy of the fit, as stochastic gradient descent algorithms are
less likely to get stuck in local extrema.
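
To make a single update step explicit, the following is a sketch in notation
chosen for this paragraph only (the batch $B_t$, the learning rate $\gamma$
and the per-sample loss $L$ are assumed symbols, not definitions from the
preceding sections): for a subset $B_t$ of the $N$ training samples
$d_1, \dots, d_N$ drawn in iteration $t$, the parameters $x$ are updated with
the gradient of the loss averaged over that subset only,
\[
  x_{t+1} = x_t - \gamma \, \frac{1}{\left|B_t\right|} \sum_{i \in B_t}
  \nabla_x L\left(x_t; d_i\right),
\]
so one epoch corresponds to roughly $N / \left|B_t\right|$ such updates.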
\input{Plots/SGD_vs_GD.tex}

Another important benefit of using subsets is that, depending on their size,
the gradient can be calculated far quicker, which allows more update steps to
be made in the same time. If the approximated gradient is close enough to the
``real'' one, this can drastically cut down the time required for training the
model and improve the accuracy achievable in a given amount of training time.

In order to illustrate this behavior we built a convolutional neural
network to classify handwritten digits. The data set used for this is the
MNIST database of handwritten digits (\textcite{MNIST},
Figure~\ref{fig:MNIST}).

The network used consists of two blocks of convolution and max pooling layers,
followed by one fully connected hidden layer and the output layer.
Both convolutional layers utilize square filters of size five which are
applied with a stride of one.
The first layer consists of 32 filters and the second of 64. Both
pooling layers pool a $2\times 2$ area. The fully connected layer
consists of 256 nodes and the output layer of 10, one for each digit.
All layers except the output layer use ReLU as activation function,
with the output layer using softmax (\ref{def:softmax}).
Categorical crossentropy is used as the loss function (\ref{def:...}).
The architecture of the network is summarized in
Figure~\ref{fig:mnist_architecture}.
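
The exact code used for the experiments is not reproduced here; the following
is a minimal sketch of a model matching this description, written with the
Keras API of TensorFlow (the choice of framework, the padding and the plain
SGD optimizer are assumptions).
\begin{verbatim}
# Sketch of a network matching the description above; framework,
# padding and optimizer are assumptions, not the original code.
from tensorflow.keras import layers, models

model = models.Sequential([
    layers.Conv2D(32, kernel_size=5, strides=1, padding="same",
                  activation="relu", input_shape=(28, 28, 1)),
    layers.MaxPooling2D(pool_size=(2, 2)),
    layers.Conv2D(64, kernel_size=5, strides=1, padding="same",
                  activation="relu"),
    layers.MaxPooling2D(pool_size=(2, 2)),
    layers.Flatten(),
    layers.Dense(256, activation="relu"),    # fully connected hidden layer
    layers.Dense(10, activation="softmax"),  # one output node per digit
])

model.compile(optimizer="sgd",
              loss="categorical_crossentropy",
              metrics=["accuracy"])
\end{verbatim}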

Here it can be seen that the network trained with stochastic gradient
descent is more accurate after the first epoch than the ones trained
with gradient descent after 20 epochs.
This is due to the former using a batch size of 32 and thus having
made $60000 / 32 = 1875$ updates to the weights after the first epoch.
While each of these updates uses an approximate gradient calculated on
a subset, the network performs far better than the one using true
gradients when training for the same amount of time.

\todo{compare training time}

The difficulty of choosing the learning rate is also illustrated in
Figure~...

The results of the network being trained with gradient descent and
stochastic gradient descent are given in Figure~\ref{fig:sgd_vs_gd}
and Table~\ref{table:sgd_vs_dg}.

\begin{figure}[h]
  \centering
  \begin{subfigure}{0.19\textwidth}
    \includegraphics[width=\textwidth]{Plots/Data/mnist0.pdf}
  \end{subfigure}
  \begin{subfigure}{0.19\textwidth}
    \includegraphics[width=\textwidth]{Plots/Data/mnist1.pdf}
  \end{subfigure}
  \begin{subfigure}{0.19\textwidth}
    \includegraphics[width=\textwidth]{Plots/Data/mnist2.pdf}
  \end{subfigure}
  \begin{subfigure}{0.19\textwidth}
    \includegraphics[width=\textwidth]{Plots/Data/mnist3.pdf}
  \end{subfigure}
  \begin{subfigure}{0.19\textwidth}
    \includegraphics[width=\textwidth]{Plots/Data/mnist4.pdf}
  \end{subfigure}\\
  \begin{subfigure}{0.19\textwidth}
    \includegraphics[width=\textwidth]{Plots/Data/mnist5.pdf}
  \end{subfigure}
  \begin{subfigure}{0.19\textwidth}
    \includegraphics[width=\textwidth]{Plots/Data/mnist6.pdf}
  \end{subfigure}
  \begin{subfigure}{0.19\textwidth}
    \includegraphics[width=\textwidth]{Plots/Data/mnist7.pdf}
  \end{subfigure}
  \begin{subfigure}{0.19\textwidth}
    \includegraphics[width=\textwidth]{Plots/Data/mnist8.pdf}
  \end{subfigure}
  \begin{subfigure}{0.19\textwidth}
    \includegraphics[width=\textwidth]{Plots/Data/mnist9.pdf}
  \end{subfigure}
  \caption{The MNIST data set contains 70,000 images of preprocessed
    handwritten digits. Of these images 60,000 are used as training images,
    while the rest are used to validate the models trained.}
  \label{fig:MNIST}
\end{figure}
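
The data set in exactly this split is shipped with common deep learning
frameworks; a minimal way to obtain it, assuming \texttt{tensorflow.keras} is
used, is sketched below.
\begin{verbatim}
from tensorflow.keras.datasets import mnist

# 60,000 training and 10,000 validation images, each 28x28 grayscale
(x_train, y_train), (x_test, y_test) = mnist.load_data()
print(x_train.shape, x_test.shape)  # (60000, 28, 28) (10000, 28, 28)
\end{verbatim}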

\begin{itemize}
\item ADAM
\end{itemize}

\begin{algorithm}[H]
  \SetAlgoLined
  \KwInput{Decay Rate $\rho$, Constant $\varepsilon$}
  \KwInput{Initial parameter $x_1$}
  Initialize accumulation variables $E[g^2]_0 = 0, E[\Delta x^2]_0 = 0$\;
  \For{$t \in \left\{1,\dots,T\right\};\, t+1$}{
    Compute Gradient: $g_t$\;
    Accumulate Gradient: $E[g^2]_t \leftarrow \rho E[g^2]_{t-1} +
    (1-\rho) g_t^2$\;
    Compute Update: $\Delta x_t \leftarrow -\frac{\sqrt{E[\Delta
        x^2]_{t-1} + \varepsilon}}{\sqrt{E[g^2]_t + \varepsilon}} g_t$\;
    Accumulate Updates: $E[\Delta x^2]_t \leftarrow \rho E[\Delta
    x^2]_{t-1} + (1-\rho)\Delta x_t^2$\;
    Apply Update: $x_{t+1} \leftarrow x_t + \Delta x_t$\;
  }
  \caption{ADADELTA, \textcite{ADADELTA}}
  \label{alg:gd}
\end{algorithm}
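
The update rule above can also be written compactly in code. The following is
a small Python/NumPy sketch (the function interface and the default values
$\rho = 0.95$ and $\varepsilon = 10^{-6}$ are illustrative assumptions, not
part of the algorithm as stated).
\begin{verbatim}
import numpy as np

def adadelta_step(x, grad, acc_grad, acc_update, rho=0.95, eps=1e-6):
    """One ADADELTA update; acc_grad and acc_update hold the running
    averages E[g^2] and E[dx^2] and have the same shape as x."""
    # accumulate gradient
    acc_grad = rho * acc_grad + (1 - rho) * grad ** 2
    # compute update
    dx = -np.sqrt(acc_update + eps) / np.sqrt(acc_grad + eps) * grad
    # accumulate updates
    acc_update = rho * acc_update + (1 - rho) * dx ** 2
    # apply update
    return x + dx, acc_grad, acc_update
\end{verbatim}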
% \subsubsubsection{Stochastic Gradient Descent}

networks. The nodes are chosen at random and change in every
iteration; this practice is called Dropout and was introduced by
\textcite{Dropout}.
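
As a rough illustration of the mechanism (a sketch only, not the procedure of
\textcite{Dropout} or the implementation used here; the inverted-dropout
rescaling and the parameter \texttt{p\_drop} are assumptions):
\begin{verbatim}
import numpy as np

def dropout(activations, p_drop, rng, training=True):
    """Randomly zero a fraction p_drop of the nodes during training
    and rescale the rest so the expected activation is unchanged."""
    if not training:
        return activations
    mask = rng.random(activations.shape) >= p_drop
    return activations * mask / (1.0 - p_drop)

rng = np.random.default_rng(0)
# e.g. a batch of 32 activations of the 256-node hidden layer
hidden = dropout(rng.standard_normal((32, 256)), p_drop=0.5, rng=rng)
\end{verbatim}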

\todo{Compare different dropout sizes on MNIST or similar; use a subset as the
  training set?}