progress
commit 7ffa8e63f2, parent b716f7688a
@@ -13,7 +13,7 @@ plot coordinates {
\begin{subfigure}[h!]{\textwidth}
\begin{tikzpicture}
\begin{axis}[tick style = {draw = none}, width = \textwidth,
height = 0.65\textwidth,
height = 0.6\textwidth,
xtick = {1, 3, 5,7,9,11,13,15,17,19},
xticklabels = {$2$, $4$, $6$, $8$,
$10$,$12$,$14$,$16$,$18$,$20$},
@@ -39,13 +39,13 @@ plot coordinates {
\begin{subfigure}[b]{\textwidth}
\begin{tikzpicture}
\begin{axis}[tick style = {draw = none}, width = \textwidth,
height = 0.65\textwidth,
height = 0.6\textwidth,
ytick = {0, 1, 2, 3, 4},
yticklabels = {$0$, $1$, $\phantom{0.}2$, $3$, $4$},
xtick = {1, 3, 5,7,9,11,13,15,17,19},
xticklabels = {$2$, $4$, $6$, $8$,
$10$,$12$,$14$,$16$,$18$,$20$},
xlabel = {training epoch}, ylabel = {error measure}]
xlabel = {training epoch}, ylabel = {error measure\vphantom{fy}}]
\addplot table
[x=epoch, y=val_loss, col sep=comma] {Plots/Data/GD_01.log};
\addplot table
@@ -134,4 +134,46 @@ numpages = {39}
  timestamp = {Mon, 22 Jul 2019 13:15:46 +0200},
  biburl = {https://dblp.org/rec/journals/corr/DauphinPGCGB14.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@article{Dropout1,
  author = {Geoffrey E. Hinton and
            Nitish Srivastava and
            Alex Krizhevsky and
            Ilya Sutskever and
            Ruslan Salakhutdinov},
  title = {Improving neural networks by preventing co-adaptation of feature detectors},
  journal = {CoRR},
  volume = {abs/1207.0580},
  year = {2012},
  url = {http://arxiv.org/abs/1207.0580},
  archivePrefix = {arXiv},
  eprint = {1207.0580},
  timestamp = {Mon, 13 Aug 2018 16:46:10 +0200},
  biburl = {https://dblp.org/rec/journals/corr/abs-1207-0580.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{rADAM,
  title = {On the Variance of the Adaptive Learning Rate and Beyond},
  author = {Liyuan Liu and Haoming Jiang and Pengcheng He and Weizhu Chen and Xiaodong Liu and Jianfeng Gao and Jiawei Han},
  booktitle = {International Conference on Learning Representations},
  year = {2020},
  url = {https://openreview.net/forum?id=rkgz2aEKDr}
}

@inproceedings{ADAM,
  author = {Diederik P. Kingma and
            Jimmy Ba},
  editor = {Yoshua Bengio and
            Yann LeCun},
  title = {Adam: {A} Method for Stochastic Optimization},
  booktitle = {3rd International Conference on Learning Representations, {ICLR} 2015,
               San Diego, CA, USA, May 7-9, 2015, Conference Track Proceedings},
  year = {2015},
  url = {http://arxiv.org/abs/1412.6980},
  timestamp = {Thu, 25 Jul 2019 14:25:37 +0200},
  biburl = {https://dblp.org/rec/journals/corr/KingmaB14.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
@@ -276,11 +276,11 @@ The choice of convolution for image classification tasks is not
arbitrary. ... auge... bla bla


\subsection{Limitations of the Gradient Descent Algorithm}
% \subsection{Limitations of the Gradient Descent Algorithm}

-Hyperparameter guesswork
-Problems navigating valleys -> momentum
-Different scale of gradients for vars in different layers -> ADAdelta
% -Hyperparameter guesswork
% -Problems navigating valleys -> momentum
% -Different scale of gradients for vars in different layers -> ADAdelta

\subsection{Stochastic Training Algorithms}
@@ -368,20 +368,21 @@ The results of the network being trained with gradient descent and
stochastic gradient descent for 20 epochs are given in Figure~\ref{fig:sgd_vs_gd}
and Table~\ref{table:sgd_vs_gd}.

\input{Plots/SGD_vs_GD.tex}

Here it can be seen that the network trained with stochastic gradient
descent is more accurate after the first epoch than the ones trained
with gradient descent after 20 epochs.
This is due to the former using a batch size of 32 and thus having
made 1875 updates to the weights
after the first epoch in comparison to one update . While each of
after the first epoch in comparison to one update. While each of
these updates uses an approximate
gradient calculated on the subset, it performs far better than the
network using true gradients when training for the same amount of time.
\todo{compare training time}

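For reference, the update count follows directly from the batch size: with a training set of $60\,000$ samples (the size implied by the numbers above, e.g. MNIST), one epoch amounts to
\[
\frac{60\,000}{32} = 1875
\]
weight updates for stochastic gradient descent with batch size 32, compared to a single update per epoch for full-batch gradient descent.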
\input{Plots/SGD_vs_GD.tex}
\clearpage
\subsection{Modified Stochastic Gradient Descent}
\subsection{\titlecap{modified stochastic gradient descent}}
There is an inherent problem in the sensitivity of the gradient descent
algorithm regarding the learning rate $\gamma$.
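For reference, the plain gradient descent step in the notation of the algorithms below is
\[
x_{t+1} = x_t - \gamma\, g_t,
\]
where $g_t$ is the gradient of the error measure at $x_t$, so every component of the gradient is scaled by the same fixed $\gamma$.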
The difficulty of choosing the learning rate can be seen
@@ -434,7 +435,7 @@ They all scale the gradient for the update depending of past gradients
for each weight individually.

The algorithms are built upon each other, with the adaptive gradient
algorithm (ADAGRAD, \textcite{ADAGRAD})
algorithm (\textsc{AdaGrad}, \textcite{ADAGRAD})
laying the groundwork. Here for each parameter update the learning rate
is given by a constant
$\gamma$ divided by the sum of the squares of the past partial
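Written out per parameter, the rule just described takes the form (a compact restatement; $g_{t,i}$ denotes the partial derivative for parameter $i$ at step $t$ and $\varepsilon$ a small constant)
\[
\Delta x_{t,i} = -\frac{\gamma}{\sqrt{\varepsilon + \sum_{s=1}^{t} g_{s,i}^2}}\, g_{t,i},
\qquad i = 1, \dots, p.
\]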
@@ -456,11 +457,11 @@ algorithm is given in Algorithm~\ref{alg:ADAGRAD}.
1, \dots,p$\;
Apply Update: $x_{t+1} \leftarrow x_t + \Delta x_t$\;
}
\caption{ADAGRAD}
\caption{\textls{ADAGRAD}}
\label{alg:ADAGRAD}
\end{algorithm}

Building on ADAGRAD \textcite{ADADELTA} developed the ... (ADADELTA)
Building on \textsc{AdaGrad}, \textcite{ADADELTA} developed the ... (ADADELTA)
in order to improve upon the two main drawbacks of ADAGRAD, being the
continual decay of the learning rate and the need for a manually
selected global learning rate $\gamma$.
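ADADELTA replaces the growing sum of AdaGrad by exponentially decaying averages of the squared gradients and squared updates; a compact sketch of the update in \textcite{ADADELTA} (with decay rate $\rho$ and small constant $\varepsilon$) is
\begin{align*}
E[g^2]_t &= \rho\, E[g^2]_{t-1} + (1-\rho)\, g_t^2,\\
\Delta x_t &= -\frac{\sqrt{E[\Delta x^2]_{t-1} + \varepsilon}}{\sqrt{E[g^2]_t + \varepsilon}}\, g_t,\\
E[\Delta x^2]_t &= \rho\, E[\Delta x^2]_{t-1} + (1-\rho)\, \Delta x_t^2.
\end{align*}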
@@ -476,18 +477,6 @@ if the parameter vector had some hypothetical units they would be matched
by those of the parameter update $\Delta x_t$. This proper
\todo{explain unit}


While the stochastic gradient algorithm is less susceptible to local
extrema than gradient descent, the problem still persists, especially
with saddle points. \textcite{DBLP:journals/corr/Dauphinpgcgb14}

\begin{itemize}
\item ADAM
\item momentum
\item ADADELTA \textcite{ADADELTA}


\end{itemize}
\begin{algorithm}[H]
\SetAlgoLined
\KwInput{Decay Rate $\rho$, Constant $\varepsilon$}
@@ -507,6 +496,68 @@ with saddle points. \textcite{DBLP:journals/corr/Dauphinpgcgb14}
\label{alg:gd}
\end{algorithm}

While the stochastic gradient algorithm is less susceptible to local
extrema than gradient descent, the problem still persists, especially
with saddle points. \textcite{DBLP:journals/corr/Dauphinpgcgb14}

An approach to the problem of ``getting stuck'' in saddle points or
local minima/maxima is the addition of momentum to SGD. Instead of
using the actual gradient for the parameter update, an average over the
past gradients is used. In order to avoid the need to save the past
values, usually an exponentially decaying average is used, resulting in
Algorithm~\ref{alg:momentum}. This is comparable to following the path
of a marble with mass rolling down the slope of the error
function. The decay rate for the average is comparable to the inertia
of the marble.
This results in the algorithm being able to escape ... due to the
built-up momentum from approaching it.

\begin{itemize}
\item ADAM
\item momentum
\item ADADELTA \textcite{ADADELTA}
\end{itemize}


\begin{algorithm}[H]
\SetAlgoLined
\KwInput{Learning Rate $\gamma$, Decay Rate $\rho$}
\KwInput{Initial parameter $x_1$}
Initialize accumulation variables $m_0 = 0$\;
\For{$t \in \left\{1,\dots,T\right\};\, t+1$}{
Compute Gradient: $g_t$\;
Accumulate Gradient: $m_t \leftarrow \rho m_{t-1} + (1-\rho) g_t$\;
Compute Update: $\Delta x_t \leftarrow -\gamma m_t$\;
Apply Update: $x_{t+1} \leftarrow x_t + \Delta x_t$\;
}
\caption{SGD with momentum}
\label{alg:momentum}
\end{algorithm}

Problems / Improvements ADAM \textcite{rADAM}


\begin{algorithm}[H]
\SetAlgoLined
\KwInput{Stepsize $\alpha$, Constant $\varepsilon$}
\KwInput{Decay Parameters $\beta_1$, $\beta_2$}
Initialize accumulation variables $m_0 = 0$, $v_0 = 0$\;
\For{$t \in \left\{1,\dots,T\right\};\, t+1$}{
Compute Gradient: $g_t$\;
Accumulate first Moment: $m_t \leftarrow \beta_1 m_{t-1} + (1-\beta_1) g_t$\;
Accumulate second Moment: $v_t \leftarrow \beta_2 v_{t-1} + (1-\beta_2) g_t^2$\;
Correct Bias: $\hat{m}_t \leftarrow m_t / (1-\beta_1^t)$, $\hat{v}_t \leftarrow v_t / (1-\beta_2^t)$\;
Compute Update: $\Delta x_t \leftarrow -\alpha\, \hat{m}_t / (\sqrt{\hat{v}_t} + \varepsilon)$\;
Apply Update: $x_{t+1} \leftarrow x_t + \Delta x_t$\;
}
\caption{ADAM, \cite{ADAM}}
\label{alg:adam}
\end{algorithm}


\input{Plots/sdg_comparison.tex}

% \subsubsubsection{Stochastic Gradient Descent}
@@ -533,7 +584,31 @@ by introducing noise into the training of the model. This is a
successful strategy for other models as well, as a conglomerate of
decision trees grown on bootstrapped training samples benefits greatly
from randomizing the features available to use in each training
iteration (Hastie, bachelor's thesis??). The way noise is introduced into
iteration (Hastie, bachelor's thesis??).
There are two approaches to introducing noise to the model during
learning, either by manipulating the model itself or by manipulating
the input data.
\subsubsection{Dropout}
If a neural network has enough hidden nodes to model a training set
accurately
Similarly to decision trees and random forests, training multiple
models on the same task and averaging the predictions can improve the
results and combat overfitting. However, training a very large
number of neural networks is computationally expensive in training
as well as testing. In order to make this approach feasible,
\textcite{Dropout1} introduced random dropout.
Here, for each training iteration, from a previously specified (sub)set of nodes
randomly chosen ones are deactivated (their output is fixed to 0).
During training
Instead of using different models and averaging them, randomly
deactivated nodes are used to simulate different networks which all
share the same weights for present nodes.

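In formulas, one common way to state this (a sketch following \textcite{Dropout1}, with retention probability $p$ and $o_j$ the output of node $j$) is
\[
r_j \sim \mathrm{Bernoulli}(p), \qquad \tilde{o}_j = r_j\, o_j \quad \text{during training},
\]
while at test time all nodes are kept and their outgoing weights are scaled by $p$ to match the expected training-time output.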
A simple but effective way to introduce noise to the model is by
deactivating randomly chosen nodes in a layer.
The way noise is introduced into
the model is by deactivating certain nodes (setting the output of the
node to 0) in the fully connected layers of the convolutional neural
networks. The nodes are chosen at random and change in every
@@ -543,7 +618,12 @@ iteration, this practice is called Dropout and was introduced by
\todo{Comparison of different dropout sizes on MNIST or similar,
subset as training set?}

\subsubsection{Effectively for small training sets}
\subsubsection{Effectiveness for small training sets}

For some applications (medical problems with a small number of patients)
the available data can be highly limited. In the following the impact
of highly reduced training sets has been ... for ... and the results
are given in Figure ...

%%% Local Variables:
%%% mode: latex
@@ -7,7 +7,7 @@
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{textcomp}
\usepackage{libertine}
%\usepackage{libertine}
\usepackage{amsmath}
%\usepackage{amssymb}
\usepackage{amsthm}
@@ -35,7 +35,9 @@
\usepackage{lipsum}
\usepackage[ruled,vlined]{algorithm2e}
%\usepackage{showframe}
\usepackage[protrusion=true, expansion=true, kerning=true]{microtype}
\usepackage[protrusion=true, expansion=true, kerning=true, letterspace
= 150]{microtype}
\usepackage{titlecaps}

\captionsetup[sub]{justification=centering}