progress
This commit is contained in:
parent 2c2b053d54
commit b0afc88091
@ -73,18 +73,18 @@ plot coordinates {
\label{fig:sgd_vs_gd}
\end{figure}

\begin{table}[h]
\begin{tabu} to \textwidth {@{} *4{X[c]}c*4{X[c]} @{}}
\multicolumn{4}{c}{Classification Accuracy}
&~&\multicolumn{4}{c}{Error Measure}
\\\cline{1-4}\cline{6-9}
GD$_{0.01}$&GD$_{0.05}$&GD$_{0.1}$&SGD$_{0.01}$&&GD$_{0.01}$&GD$_{0.05}$&GD$_{0.1}$&SGD$_{0.01}$
\\\cline{1-4}\cline{6-9}
1&1&1&1&&1&1&1&1
\\
0.265&0.633&0.203&0.989&&2.267&1.947&3.91&0.032
\end{tabu}
\caption{Performance metrics of the networks trained in
Figure~\ref{fig:sgd_vs_gd} after 20 training epochs.}
\label{table:sgd_vs_gd}
\end{table}
%%% Local Variables:
%%% mode: latex
41
TeX/Plots/mnist.tex
Normal file
@ -0,0 +1,41 @@
\begin{figure}[h]
\centering
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/mnist0.pdf}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/mnist1.pdf}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/mnist2.pdf}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/mnist3.pdf}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/mnist4.pdf}
\end{subfigure}\\
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/mnist5.pdf}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/mnist6.pdf}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/mnist7.pdf}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/mnist8.pdf}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/mnist9.pdf}
\end{subfigure}
\caption{The MNIST data set contains 70,000 images of preprocessed handwritten
digits. Of these images 60,000 are used as training images, while
the rest are used to validate the trained models.}
\label{fig:MNIST}
\end{figure}
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "../main"
%%% End:
@ -73,3 +73,20 @@ url={https://doi.org/10.1038/323533a0}
username = {mhwombat},
year = 2010
}

@article{resnet,
author = {Kaiming He and
Xiangyu Zhang and
Shaoqing Ren and
Jian Sun},
title = {Deep Residual Learning for Image Recognition},
journal = {CoRR},
volume = {abs/1512.03385},
year = 2015,
url = {http://arxiv.org/abs/1512.03385},
archivePrefix = {arXiv},
eprint = {1512.03385},
timestamp = {Wed, 17 Apr 2019 17:23:45 +0200},
biburl = {https://dblp.org/rec/journals/corr/HeZRS15.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@ -136,7 +136,7 @@ output is given by
\]
where $\sqrt{\cdot}$ and $\cdot^2$ are applied componentwise. Examples
of convolution with both kernels are given in Figure~\ref{fig:img_conv}.

\todo{padding}


\begin{figure}[h]
@ -186,20 +186,59 @@ wise. Examples of convolution with both kernels are given in Figure~\ref{fig:img
\clearpage
\newpage
\subsection{Convolutional NN}
\todo{Introduction to CNNs}
% Conventional neural network as described in chapter .. are made up of
% fully connected layers, meaning each node in a layer is influenced by
% all nodes of the previous layer. If one wants to extract information
% out of high dimensional input such as images this results in a very
% large amount of variables in the model. This limits the

In conventional neural networks as described in chapter ... all layers
are fully connected, meaning each output node in a layer is influenced
by all inputs. For $i$ inputs and $o$ output nodes this results in $i
+ 1$ variables at each node (weights and bias) and a total of $o(i + 1)$
variables. For large inputs like image data the number of variables
that have to be trained in order to fit the model can get excessive
and hinder the ability to train the model due to memory and
computational restrictions. By using convolution we can extract
meaningful information, such as edges in an image, with a kernel of a
small size $k$ in the tens or hundreds, independent of the size of the
original image. Thus for a large image $k \cdot i$ can be several
orders of magnitude smaller than $o \cdot i$.
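
To make this concrete, a small back-of-the-envelope comparison can be
written down directly; the input and output sizes used here are purely
illustrative assumptions and are not taken from a specific model.

\begin{verbatim}
# Parameter count of a fully connected layer vs. a single convolution
# kernel, using the quantities i, o and k from the text above.
i = 256 * 256        # inputs, e.g. a 256 x 256 grayscale image (assumed)
o = 1000             # output nodes of a dense layer (assumed)
k = 5 * 5            # entries of a small convolution kernel (assumed)

dense_params = o * (i + 1)   # o(i + 1) weights and biases
conv_params  = k + 1         # one kernel plus a bias, independent of i

print(dense_params)          # 65537000
print(conv_params)           # 26
\end{verbatim}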

As seen in the previous section convolution lends itself to the
manipulation of images or other large data, which motivates its usage in
neural networks.
This is achieved by implementing convolutional layers, where several
filters are applied to the input and the values of the filters are
trainable parameters of the model.
Each node in such a layer corresponds to a pixel of the output of
convolution with one of those filters, on which a bias and activation
function are applied.
The usage of multiple filters results in multiple outputs of the same
size as the input. These are often called channels. Depending on the
size of the filters this can result in the dimension of the output
being one larger than that of the input.
However, for convolutional layers following a convolutional layer the
size of the filter is often chosen to coincide with the number of channels
of the output of the previous layer, without using padding in this
direction, in order to prevent gaining additional
dimensions\todo{odd phrasing} in the output.
This can also be used to flatten certain less interesting channels of
the input, such as the color channels.
Thus filters used in convolutional networks usually have the same
number of dimensions as the input or one more.

The size of the filters and the way they are applied can be tuned
while building the model, but should be the same for all filters in one
layer in order for the output to be of consistent size in all channels.
It is common to reduce the size of the output by not applying the
filters to each ``pixel'' but rather specifying a ``stride'' $s$ at which
the filter $g$ is moved over the input $I$,

\[
O_{x,y,c} = \sum_{i,j,l \in \mathbb{Z}} I_{sx-i,sy-j,c-l} g_{i,j,l}.
\]
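
The following short sketch illustrates this formula for a single
channel; the restriction to two dimensions and the handling of the
border (only fully overlapping positions, no padding) are
simplifications made purely for illustration.

\begin{verbatim}
import numpy as np

def conv2d_strided(I, g, s=1):
    # Valid-range convolution with stride s for one channel,
    # mirroring O_{x,y} = sum_{i,j} I_{s*x - i, s*y - j} * g_{i,j};
    # the kernel is flipped and the output origin is shifted so that
    # only fully overlapping positions are kept.
    k = g.shape[0]
    H, W = I.shape
    O = np.zeros(((H - k) // s + 1, (W - k) // s + 1))
    for x in range(O.shape[0]):
        for y in range(O.shape[1]):
            patch = I[x * s : x * s + k, y * s : y * s + k]
            O[x, y] = np.sum(patch * g[::-1, ::-1])
    return O
\end{verbatim}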

As seen convolution lends itself to image manipulation. In this
chapter we will explore how we can incorporate convolution into neural
@ -260,9 +299,9 @@ network. A class of algorithms that augment the gradient descent
algorithm in order to lessen this problem are stochastic gradient
descent algorithms. Here the premise is that instead of using the whole
dataset a (different) subset of data is chosen to
compute the gradient in each iteration (Algorithm~\ref{alg:sgd}).
The training period until each data point has been considered in
updating the parameters is commonly called an ``epoch''.
Using subsets reduces the amount of memory and computing power required for
each iteration. This makes it possible to use very large training
sets to fit the model.
@ -270,18 +309,42 @@ Additionally the noise introduced on the gradient can improve
the accuracy of the fit as stochastic gradient descent algorithms are
less likely to get stuck in local extrema.

\input{Plots/SGD_vs_GD.tex}

Another important benefit of using subsets is that depending on their size the
gradient can be calculated far quicker, which allows for more parameter updates
in the same time. If the approximated gradient is close enough to the
``real'' one this can drastically cut down the time required for
training the model to a certain degree or improve the accuracy achievable in a given
amount of training time.

\begin{algorithm}
\SetAlgoLined
\KwInput{Function $f$, Weights $w$, Learning Rate $\gamma$, Batch Size $B$, Loss Function $L$,
Training Data $D$, Epochs $E$.}
\For{$i \in \left\{1:E\right\}$}{
Initialize: $S \leftarrow D$\;
\While{$\abs{S} \geq B$}{
Draw $\tilde{D}$ from $S$ with $\vert\tilde{D}\vert = B$\;
Update $S$: $S \leftarrow S \setminus \tilde{D}$\;
Compute Gradient: $g \leftarrow \frac{\mathrm{d} L(f_w,
\tilde{D})}{\mathrm{d} w}$\;
Update: $w \leftarrow w - \gamma g$\;
}
\If{$S \neq \emptyset$}{
Compute Gradient: $g \leftarrow \frac{\mathrm{d} L(f_w,
S)}{\mathrm{d} w}$\;
Update: $w \leftarrow w - \gamma g$\;
}
Increment: $i \leftarrow i+1$\;
}
\caption{Stochastic gradient descent.}
\label{alg:sgd}
\end{algorithm}
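
The following short Python sketch mirrors Algorithm~\ref{alg:sgd}; the
function \texttt{grad\_fn}, which stands in for
$\frac{\mathrm{d} L(f_w,\tilde{D})}{\mathrm{d} w}$, is a placeholder
that a concrete model would have to supply.

\begin{verbatim}
import numpy as np

def sgd(w, grad_fn, data, gamma=0.01, batch_size=32, epochs=20):
    # data is assumed to be a numpy array of training samples;
    # grad_fn(w, batch) is assumed to return dL(f_w, batch)/dw.
    for _ in range(epochs):
        S = np.random.permutation(len(data))          # S <- D (shuffled indices)
        for start in range(0, len(S), batch_size):
            batch = data[S[start:start + batch_size]] # draw D~ from S
            g = grad_fn(w, batch)                     # approximate gradient
            w = w - gamma * g                         # update step
        # the final, possibly smaller batch is covered by the last slice
    return w
\end{verbatim}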

In order to illustrate this behavior we modeled a convolutional neural
network to classify handwritten digits. The data set used for this is the
MNIST database of handwritten digits (\textcite{MNIST},
Figure~\ref{fig:MNIST}).
\input{Plots/mnist.tex}
The network used consists of two convolution and max pooling layers
followed by one fully connected hidden layer and the output layer.
Both convolutional layers utilize square filters of size five which are
@ -292,62 +355,78 @@ consists of 256 nodes and the output layer of 10, one for each digit.
All layers except the output layer use ReLU as activation function,
with the output layer using softmax (\ref{def:softmax}).
Categorical crossentropy is used as the loss function (\ref{def:...}).
The architecture of the convolutional neural network is summarized in
Figure~\ref{fig:mnist_architecture}.

\begin{figure}
\missingfigure{network architecture}
\caption{architecture}
\label{fig:mnist_architecture}
\end{figure}
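
As an illustration, such a network could be specified with the Keras
API as sketched below. The number of filters per convolutional layer
(16 and 32) and the use of ``same'' padding are assumptions, as they
are not stated in this excerpt; filter size 5, the 256-node dense
layer, the 10-node softmax output and SGD with learning rate 0.01
follow the text.

\begin{verbatim}
import tensorflow as tf
from tensorflow.keras import layers

model = tf.keras.Sequential([
    layers.Conv2D(16, kernel_size=5, activation="relu",
                  padding="same", input_shape=(28, 28, 1)),
    layers.MaxPooling2D(pool_size=2),
    layers.Conv2D(32, kernel_size=5, activation="relu", padding="same"),
    layers.MaxPooling2D(pool_size=2),
    layers.Flatten(),
    layers.Dense(256, activation="relu"),
    layers.Dense(10, activation="softmax"),
])
model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=0.01),
              loss="categorical_crossentropy",
              metrics=["accuracy"])
\end{verbatim}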

The results of the network being trained with gradient descent and
stochastic gradient descent for 20 epochs are given in Figure~\ref{fig:sgd_vs_gd}
and Table~\ref{table:sgd_vs_gd}.

\input{Plots/SGD_vs_GD.tex}

Here it can be seen that the network trained with stochastic gradient
descent is more accurate after the first epoch than the ones trained
with gradient descent after 20 epochs.
This is due to the former using a batch size of 32 and thus having
made 1,875 updates to the weights
after the first epoch, in comparison to one update. While each of
these updates uses an approximate
gradient calculated on the subset, it performs far better than the
network using true gradients when training for the same amount of time.
\todo{compare training times}
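
With the 60,000 images in the MNIST training set and a batch size of
32 this amounts to
\[
\frac{60000}{32} = 1875
\]
parameter updates per epoch, compared to a single update per epoch for
plain gradient descent.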

\clearpage
\subsection{Modified Stochastic Gradient Descent}
There is an inherent problem in the sensitivity of the gradient descent
algorithm to the learning rate $\gamma$.
The difficulty of choosing the learning rate is illustrated
in Figure~\ref{fig:sgd_vs_gd}. For small rates the progress in each iteration is small,
but as the rate is enlarged the algorithm can become unstable and
diverge. Even for learning rates small enough to ensure the parameters
do not diverge to infinity, steep valleys can hinder the progress of
the algorithm, as with too large learning rates gradient descent
``bounces between'' the walls of the valley rather than following ...

% \[
% w - \gamma \nabla_w ...
% \]
thus the weights grow to infinity.
\todo{explain unstable learning rates better}

To combat this problem it has been proposed\todo{source} to alter the learning
rate over the course of training, often called learning rate
scheduling. The most popular implementations of this are time-based
decay
\[
\gamma_{n+1} = \frac{\gamma_n}{1 + d n},
\]
where $d$ is the decay parameter and $n$ is the number of epochs,
step-based decay, where the learning rate is fixed for a span of $r$
epochs and then decreased according to the parameter $d$,
\[
\gamma_n = \gamma_0 d^{\left\lfloor \frac{n+1}{r} \right\rfloor},
\]
and exponential decay, where the learning rate is decreased after each epoch,
\[
\gamma_n = \gamma_0 e^{-n d}.
\]
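
The three schedules can be written down in a few lines; the following
sketch is purely illustrative and follows the formulas above.

\begin{verbatim}
import math

def time_based(gamma0, d, n):
    # gamma_{m+1} = gamma_m / (1 + d*m), iterated for n epochs
    gamma = gamma0
    for m in range(n):
        gamma = gamma / (1 + d * m)
    return gamma

def step_based(gamma0, d, r, n):
    # gamma_n = gamma_0 * d ** floor((n + 1) / r)
    return gamma0 * d ** math.floor((n + 1) / r)

def exponential(gamma0, d, n):
    # gamma_n = gamma_0 * exp(-n * d)
    return gamma0 * math.exp(-n * d)
\end{verbatim}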

These methods are able to increase the accuracy of a model by a large
margin, as seen in the training of ResNet by \textcite{resnet}
\todo{maybe include a figure}. However, stochastic gradient descent
with learning rate decay is
still highly sensitive to the choice of the hyperparameters $\gamma$
and $d$.
In order to mitigate this problem a number of algorithms have been
developed to regularize the learning rate with as little
hyperparameter guesswork as possible.
One of these algorithms is the ADADELTA algorithm developed by \textcite{ADADELTA}.
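
Its update rule keeps running averages of the squared gradients and of
the squared updates (cf. the algorithm excerpt below); a minimal
sketch, with $\rho$ and $\varepsilon$ chosen only as illustrative
defaults, could look as follows.

\begin{verbatim}
import numpy as np

def adadelta_step(w, g, Eg2, Edx2, rho=0.95, eps=1e-6):
    # E[g^2]_t   = rho * E[g^2]_{t-1}   + (1 - rho) * g_t^2
    Eg2 = rho * Eg2 + (1 - rho) * g ** 2
    # dx_t = - sqrt(E[dx^2]_{t-1} + eps) / sqrt(E[g^2]_t + eps) * g_t
    dx = -np.sqrt(Edx2 + eps) / np.sqrt(Eg2 + eps) * g
    # E[dx^2]_t  = rho * E[dx^2]_{t-1}  + (1 - rho) * dx_t^2
    Edx2 = rho * Edx2 + (1 - rho) * dx ** 2
    return w + dx, Eg2, Edx2
\end{verbatim}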

\clearpage

\begin{itemize}
\item ADAM
@ -363,8 +442,8 @@ and Table~\ref{table:sgd_vs_dg}
Initialize accumulation variables $E[g^2]_0 = 0, E[\Delta x^2]_0 =0$\;
\For{$t \in \left\{1,\dots,T\right\};\, t+1$}{
Compute Gradient: $g_t$\;
Accumulate Gradient: $E[g^2]_t \leftarrow \rho E[g^2]_{t-1} +
(1-\rho)g_t^2$\;
Compute Update: $\Delta x_t \leftarrow -\frac{\sqrt{E[\Delta
x^2]_{t-1} + \varepsilon}}{\sqrt{E[g^2]_t + \varepsilon}} g_t$\;
Accumulate Updates: $E[\Delta x^2]_t \leftarrow \rho E[\Delta
@ -78,7 +78,7 @@
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator*{\po}{\mathbb{P}\text{-}\mathcal{O}}
\DeclareMathOperator*{\equals}{=}
\begin{document}


@ -172,7 +172,6 @@ increased.


\begin{figure}
\begin{adjustbox}{width = \textwidth}
\pgfplotsset{
compat=1.11,
legend image code/.code={
@ -202,7 +201,6 @@ plot coordinates {
\addlegendentry{\footnotesize{spline}};
\end{axis}
\end{tikzpicture}
\end{adjustbox}
\caption{For data of the form $y=\sin(\frac{x+\pi}{2 \pi}) +
\varepsilon,~ \varepsilon \sim \mathcal{N}(0,0.4)$
(\textcolor{blue}{blue dots}) the neural network constructed