MNIST and ADADELTA algorithm
This commit is contained in:
parent
46031fcd5d
commit
2c2b053d54
1831 Cluster/gd_10min.out Normal file
File diff suppressed because it is too large
@ -84,6 +84,7 @@ plot coordinates {
\end{tabu}
\caption{Performance metrics of the networks trained in
Figure~\ref{fig:sgd_vs_gd} after 20 training epochs.}
\label{table:sgd_vs_gd}
\end{table}
%%% Local Variables:
%%% mode: latex
@ -30,7 +30,7 @@ plot coordinates {
$10$,$12$,$14$,$16$,$18$,$20$},
xlabel = {epoch}, ylabel = {Classification Accuracy}]
\addplot table
[x=epoch, y=val_accuracy, col sep=comma] {Data/GD_01.log};
[x=epoch, y=val_accuracy, col sep=comma, mark = none] {Data/gd_10min.log};
\addplot table
[x=epoch, y=val_accuracy, col sep=comma] {Data/GD_05.log};
\addplot table
@ -89,29 +89,48 @@ plot coordinates {
\end{subfigure}
\caption{The neural network given in ?? trained with different
algorithms on the MNIST handwritten digits data set. For gradient
descent the learning rates 0.01, 0.05 and 0.1 are used (GD$_{\text{rate}}$). For
descent the learning rates 0.01, 0.05 and 0.1 are used (GD$_{
rate}$). For
stochastic gradient descent a batch size of 32 and a learning rate
of 0.01 is used (SGD$_{0.01}$).}
\end{figure}

\begin{center}
\begin{figure}[h]
\begin{subfigure}{0.49\textwidth}
\includegraphics[width=\textwidth]{Data/klammern.jpg}
\caption{Original Picture}
\begin{figure}[h]
\centering
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Data/mnist0.pdf}
\end{subfigure}
\begin{subfigure}{0.49\textwidth}
\includegraphics[width=\textwidth]{Data/image_conv4.png}
\caption{test}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Data/mnist1.pdf}
\end{subfigure}
\begin{subfigure}{0.49\textwidth}
\includegraphics[width=\textwidth]{Data/image_conv5.png}
\caption{test}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Data/mnist2.pdf}
\end{subfigure}
\begin{subfigure}{0.49\textwidth}
\includegraphics[width=\textwidth]{Data/image_conv6.png}
\caption{test}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Data/mnist3.pdf}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Data/mnist4.pdf}
\end{subfigure}\\
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Data/mnist5.pdf}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Data/mnist6.pdf}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Data/mnist7.pdf}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Data/mnist8.pdf}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Data/mnist9.pdf}
\end{subfigure}
\caption{The MNIST data set contains 70,000 images of preprocessed handwritten
digits. Of these images 60,000 are used as training images, while
the rest are used to validate the models trained.}
\end{figure}
\end{center}


@ -56,3 +56,20 @@ issn={1476-4687},
doi={10.1038/323533a0},
url={https://doi.org/10.1038/323533a0}
}

@article{MNIST,
added-at = {2010-06-28T21:16:30.000+0200},
author = {LeCun, Yann and Cortes, Corinna},
biburl = {https://www.bibsonomy.org/bibtex/2935bad99fa1f65e03c25b315aa3c1032/mhwombat},
groups = {public},
howpublished = {http://yann.lecun.com/exdb/mnist/},
interhash = {21b9d0558bd66279df9452562df6e6f3},
intrahash = {935bad99fa1f65e03c25b315aa3c1032},
keywords = {MSc _checked character_recognition mnist network neural},
lastchecked = {2016-01-14 14:24:11},
timestamp = {2016-07-12T19:25:30.000+0200},
title = {{MNIST} handwritten digit database},
url = {http://yann.lecun.com/exdb/mnist/},
username = {mhwombat},
year = 2010
}

@ -263,20 +263,91 @@ dataset a (different) subset of data is chosen to
compute the gradient in each iteration.
The number of iterations until each data point has been considered in
updating the parameters is commonly called an ``epoch''.
This reduces the amount of memory and computing power required for
each iteration. This allows for the use of very large training
sets. Additionally the noise introduced on the gradient can improve
Using subsets reduces the amount of memory and computing power required for
each iteration. This makes it possible to use very large training
sets to fit the model.
Additionally the noise introduced on the gradient can improve
the accuracy of the fit as stochastic gradient descent algorithms are
less likely to get stuck in local extrema.

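One step of this procedure can be written out as follows (the notation
is only a sketch and not taken from the preceding definitions: $\gamma$
denotes the learning rate, $L_i$ the loss on the $i$-th training sample
and $B_t \subseteq \{1,\dots,N\}$ the subset drawn in iteration $t$):
\[
  x_{t+1} = x_t - \gamma \frac{1}{|B_t|} \sum_{i \in B_t} \nabla_x L_i(x_t).
\]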
\input{Plots/SGD_vs_GD.tex}

Another benefit of using subsets even if enough memory is available to
use the whole dataset is that depending on the size of the subsets the
Another important benefit of using subsets is that depending on their size the
gradient can be calculated far quicker, which allows more steps to be made
in the same time. If the approximated gradient is close enough to the
``real'' one this can drastically cut down the time required for
training the model.
training the model and improve the accuracy achievable in a given
amount of training time.
In order to illustrate this behavior we modeled a convolutional neural
network to ... handwritten digits. The data set used for this is the
MNIST database of handwritten digits (\textcite{MNIST},
Figure~\ref{fig:MNIST}).
The network used consists of two convolution and max pooling layers
followed by one fully connected hidden layer and the output layer.
Both convolutional layers utilize square filters of size five which are
applied with a stride of one.
The first layer consists of 32 filters and the second of 64. Both
pooling layers pool a $2\times 2$ area. The fully connected layer
consists of 256 nodes and the output layer of 10, one for each digit.
All layers except the output layer use ReLU as activation function
with the output layer using softmax (\ref{def:softmax}).
As loss function categorical crossentropy is used (\ref{def:...}).
In Figure~\ref{fig:mnist_architecture} the architecture of the network
is summarized.
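To make the layer sizes above concrete, the following listing is a
minimal sketch of how such a network could be specified in Keras. The
$28\times 28\times 1$ input shape, the padding behavior and the plain
SGD optimizer with learning rate 0.01 are assumptions for illustration
and not necessarily the exact setup used for the experiments.
\begin{verbatim}
# Sketch of the described architecture in Keras (illustrative only).
import tensorflow as tf
from tensorflow.keras import layers

model = tf.keras.Sequential([
    tf.keras.Input(shape=(28, 28, 1)),                    # MNIST-sized input
    layers.Conv2D(32, 5, strides=1, activation="relu"),   # 32 filters, 5x5
    layers.MaxPooling2D(2),                               # 2x2 max pooling
    layers.Conv2D(64, 5, strides=1, activation="relu"),   # 64 filters, 5x5
    layers.MaxPooling2D(2),
    layers.Flatten(),
    layers.Dense(256, activation="relu"),                 # fully connected layer
    layers.Dense(10, activation="softmax"),               # one node per digit
])
model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=0.01),
              loss="categorical_crossentropy",
              metrics=["accuracy"])
\end{verbatim}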
Here it can be seen that the network trained with stochastic gradient
descent is more accurate after the first epoch than the ones trained
with gradient descent after 20 epochs.
This is due to the former using a batch size of 32 and thus having
made 1,875 updates to the weights
after the first epoch. While each of these updates uses an approximate
gradient calculated on the subset, it performs far better than the
network using true gradients when training for the same amount of time.
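With the 60,000 training images and a batch size of 32 this corresponds to
\[
  60000 / 32 = 1875
\]
parameter updates within the first epoch, whereas plain gradient
descent, which uses the full training set for every gradient, performs
only a single update per epoch.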
\todo{compare training time}
The difficulty of choosing the learning rate is also illustrated in
Figure ...


The results of the network being trained with gradient descent and
stochastic gradient descent are given in Figure~\ref{fig:sgd_vs_gd}
and Table~\ref{table:sgd_vs_gd}.

\begin{figure}[h]
\centering
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/mnist0.pdf}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/mnist1.pdf}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/mnist2.pdf}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/mnist3.pdf}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/mnist4.pdf}
\end{subfigure}\\
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/mnist5.pdf}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/mnist6.pdf}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/mnist7.pdf}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/mnist8.pdf}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/mnist9.pdf}
\end{subfigure}
\caption{The MNIST data set contains 70,000 images of preprocessed handwritten
digits. Of these images 60,000 are used as training images, while
the rest are used to validate the models trained.}
\label{fig:MNIST}
\end{figure}

\begin{itemize}
\item ADAM
@ -285,7 +356,24 @@ training the model.


\end{itemize}

\begin{algorithm}[H]
\SetAlgoLined
\KwInput{Decay Rate $\rho$, Constant $\varepsilon$}
\KwInput{Initial parameter $x_1$}
Initialize accumulation variables $E[g^2]_0 = 0, E[\Delta x^2]_0 = 0$\;
\For{$t \in \left\{1,\dots,T\right\};\, t+1$}{
Compute Gradient: $g_t$\;
Accumulate Gradient: $E[g^2]_t \leftarrow \rho E[g^2]_{t-1} + (1-\rho) g_t^2$\;
Compute Update: $\Delta x_t \leftarrow -\frac{\sqrt{E[\Delta x^2]_{t-1} + \varepsilon}}{\sqrt{E[g^2]_t + \varepsilon}} g_t$\;
Accumulate Updates: $E[\Delta x^2]_t \leftarrow \rho E[\Delta x^2]_{t-1} + (1-\rho) \Delta x_t^2$\;
Apply Update: $x_{t+1} \leftarrow x_t + \Delta x_t$\;
}
\caption{ADADELTA, \textcite{ADADELTA}}
\label{alg:adadelta}
\end{algorithm}
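To complement the pseudocode, the following is a small NumPy sketch of
the same update rule. The function signature, the quadratic example and
the default values $\rho = 0.95$, $\varepsilon = 10^{-6}$ are
illustrative choices and not prescribed by the algorithm above.
\begin{verbatim}
# Minimal NumPy sketch of the ADADELTA update rule.
import numpy as np

def adadelta(grad, x0, rho=0.95, eps=1e-6, steps=1000):
    x = np.asarray(x0, dtype=float)
    Eg2 = np.zeros_like(x)    # accumulation variable E[g^2]
    Edx2 = np.zeros_like(x)   # accumulation variable E[dx^2]
    for _ in range(steps):
        g = grad(x)                                          # compute gradient
        Eg2 = rho * Eg2 + (1 - rho) * g**2                   # accumulate gradient
        dx = -np.sqrt(Edx2 + eps) / np.sqrt(Eg2 + eps) * g   # compute update
        Edx2 = rho * Edx2 + (1 - rho) * dx**2                # accumulate updates
        x = x + dx                                           # apply update
    return x

# Example: a few steps on f(x) = x^2, whose gradient is 2x.
print(adadelta(lambda x: 2 * x, np.array([3.0])))
\end{verbatim}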

% \subsubsubsection{Stochastic Gradient Descent}
@ -319,7 +407,8 @@ networks. The nodes are chosen at random and change in every
iteration, this practice is called Dropout and was introduced by
\textcite{Dropout}.

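As a purely illustrative sketch, dropping out nodes of a layer can be
implemented with a random mask that is redrawn in every iteration. The
rescaling by $1/(1-p)$ (so-called inverted dropout) is a common
implementation choice and not part of the original formulation.
\begin{verbatim}
# Illustrative sketch: randomly drop nodes of a layer during training.
import numpy as np

def dropout(activations, p_drop, rng, training=True):
    if not training:                 # at test time every node is kept
        return activations
    keep = rng.random(activations.shape) >= p_drop  # new mask each iteration
    # rescale surviving activations so their expected value is unchanged
    return activations * keep / (1.0 - p_drop)

rng = np.random.default_rng(0)
layer = np.ones((4, 5))              # toy activations of one layer
print(dropout(layer, p_drop=0.5, rng=rng))
\end{verbatim}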
\todo{Compare different dropout sizes on MNIST or similar}
\todo{Compare different dropout sizes on MNIST or similar; subset as
training set?}

