main
Tobias Arndt 4 years ago
parent b716f7688a
commit 7ffa8e63f2

@ -13,7 +13,7 @@ plot coordinates {
\begin{subfigure}[h!]{\textwidth}
\begin{tikzpicture}
\begin{axis}[tick style = {draw = none}, width = \textwidth,
height = 0.65\textwidth,
height = 0.6\textwidth,
xtick = {1, 3, 5,7,9,11,13,15,17,19},
xticklabels = {$2$, $4$, $6$, $8$,
$10$,$12$,$14$,$16$,$18$,$20$},
@ -39,13 +39,13 @@ plot coordinates {
\begin{subfigure}[b]{\textwidth}
\begin{tikzpicture}
\begin{axis}[tick style = {draw = none}, width = \textwidth,
height = 0.65\textwidth,
height = 0.6\textwidth,
ytick = {0, 1, 2, 3, 4},
yticklabels = {$0$, $1$, $\phantom{0.}2$, $3$, $4$},
xtick = {1, 3, 5,7,9,11,13,15,17,19},
xticklabels = {$2$, $4$, $6$, $8$,
$10$,$12$,$14$,$16$,$18$,$20$},
xlabel = {training epoch}, ylabel = {error measure}]
xlabel = {training epoch}, ylabel = {error measure\vphantom{fy}}]
\addplot table
[x=epoch, y=val_loss, col sep=comma] {Plots/Data/GD_01.log};
\addplot table

@ -135,3 +135,45 @@ numpages = {39}
biburl = {https://dblp.org/rec/journals/corr/DauphinPGCGB14.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{Dropout1,
author = {Geoffrey E. Hinton and
Nitish Srivastava and
Alex Krizhevsky and
Ilya Sutskever and
Ruslan Salakhutdinov},
title = {Improving neural networks by preventing co-adaptation of feature detectors},
journal = {CoRR},
volume = {abs/1207.0580},
year = {2012},
url = {http://arxiv.org/abs/1207.0580},
archivePrefix = {arXiv},
eprint = {1207.0580},
timestamp = {Mon, 13 Aug 2018 16:46:10 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-1207-0580.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{rADAM,
title={On the Variance of the Adaptive Learning Rate and Beyond},
author={Liyuan Liu and Haoming Jiang and Pengcheng He and Weizhu Chen and Xiaodong Liu and Jianfeng Gao and Jiawei Han},
booktitle={International Conference on Learning Representations},
year={2020},
url={https://openreview.net/forum?id=rkgz2aEKDr}
}
@inproceedings{ADAM,
author = {Diederik P. Kingma and
Jimmy Ba},
editor = {Yoshua Bengio and
Yann LeCun},
title = {Adam: {A} Method for Stochastic Optimization},
booktitle = {3rd International Conference on Learning Representations, {ICLR} 2015,
San Diego, CA, USA, May 7-9, 2015, Conference Track Proceedings},
year = {2015},
url = {http://arxiv.org/abs/1412.6980},
timestamp = {Thu, 25 Jul 2019 14:25:37 +0200},
biburl = {https://dblp.org/rec/journals/corr/KingmaB14.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}

@ -276,11 +276,11 @@ The choice of convolution for image classification tasks is not
arbitrary. ... auge... bla bla
\subsection{Limitations of the Gradient Descent Algorithm}
% \subsection{Limitations of the Gradient Descent Algorithm}
-Hyperparameter guesswork
-Problems navigating valleys -> momentum
-Different scale of gradients for vars in different layers -> ADAdelta
% -Hyperparameter guesswork
% -Problems navigating valleys -> momentum
% -Different scale of gradients for vars in different layers -> ADAdelta
\subsection{Stochastic Training Algorithms}
@ -368,20 +368,21 @@ The results of the network being trained with gradient descent and
stochastic gradient descent for 20 epochs are given in Figure~\ref{fig:sgd_vs_gd}
and Table~\ref{table:sgd_vs_gd}.
\input{Plots/SGD_vs_GD.tex}
Here it can be seen that the network trained with stochastic gradient
descent is more accurate after the first epoch than the one trained
with gradient descent after 20 epochs.
This is due to the former using a batch size of 32 and thus having
made 1875 updates to the weights
after the first epoch in comparison to one update . While each of
after the first epoch in comparison to one update. While each of
these updates uses an approximate
gradient calculated on a subset of the training data, the resulting
network performs far better than the one using true gradients when
training for the same amount of time.
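As a rough sanity check for this number (assuming the full MNIST
training set of $60\,000$ samples is used, which is an assumption here
rather than a stated fact):
\[
  \frac{60\,000 \text{ samples}}{32 \text{ samples per batch}}
  = 1875 \text{ weight updates per epoch},
\]
compared to a single update per epoch for gradient descent on the full
training set.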
\todo{compare training time}
\input{Plots/SGD_vs_GD.tex}
\clearpage
\subsection{Modified Stochastic Gradient Descent}
\subsection{\titlecap{modified stochastic gradient descent}}
There is an inherent problem in the sensitivity of the gradient descent
algorithm regarding the learning rate $\gamma$.
The difficulty of choosing the learning rate can be seen
@ -434,7 +435,7 @@ They all scale the gradient for the update depending of past gradients
for each weight individually.
The algorithms build upon each other, with the adaptive gradient
algorithm (ADAGRAD, \textcite{ADAGRAD})
algorithm (\textsc{AdaGrad}, \textcite{ADAGRAD})
laying the groundwork. Here, for each parameter update, a constant
global learning rate
$\gamma$ is divided by the sum of the squares of the past partial
@ -456,11 +457,11 @@ algorithm is given in Algorithm~\ref{alg:ADAGRAD}.
1, \dots,p$\;
Apply Update: $x_{t+1} \leftarrow x_t + \Delta x_t$\;
}
\caption{ADAGRAD}
\caption{\textls{ADAGRAD}}
\label{alg:ADAGRAD}
\end{algorithm}
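Written out for a single parameter, the update computed in
Algorithm~\ref{alg:ADAGRAD} takes the following form; the small
constant $\varepsilon$ for numerical stability is assumed here
analogously to the algorithms below, and its exact placement may
differ between formulations:
\[
  \Delta x_{t,i} = -\frac{\gamma}{\sqrt{\sum_{\tau=1}^{t}
    g_{\tau,i}^2} + \varepsilon}\, g_{t,i},
  \qquad i = 1, \dots, p.
\]
This makes explicit that parameters with a history of large partial
derivatives receive a smaller effective learning rate, while rarely
updated parameters keep a larger one.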
Building on ADAGRAD \textcite{ADADELTA} developed the ... (ADADELTA)
Building on \textsc{AdaGrad}, \textcite{ADADELTA} developed the ... (ADADELTA)
in order to improve upon the two main drawbacks of \textsc{AdaGrad}:
the continual decay of the learning rate and the need for a manually
selected global learning rate $\gamma$.
@ -476,22 +477,70 @@ if the parameter vector had some hypothetical units they would be matched
by those of the parameter update $\Delta x_t$. This proper
\todo{explain units}
\begin{algorithm}[H]
\SetAlgoLined
\KwInput{Decay Rate $\rho$, Constant $\varepsilon$}
\KwInput{Initial parameter $x_1$}
Initialize accumulation variables $E[g^2]_0 = 0, E[\Delta x^2]_0 =0$\;
\For{$t \in \left\{1,\dots,T\right\};\, t+1$}{
Compute Gradient: $g_t$\;
Accumulate Gradient: $E[g^2]_t \leftarrow \rho E[g^2]_{t-1} +
(1-\rho)g_t^2$\;
Compute Update: $\Delta x_t \leftarrow -\frac{\sqrt{E[\Delta
x^2]_{t-1} + \varepsilon}}{\sqrt{E[g^2]_t + \varepsilon}} g_t$\;
Accumulate Updates: $E[\Delta x^2]_t \leftarrow \rho E[\Delta
x^2]_{t-1} + (1-\rho)\Delta x_t^2$\;
Apply Update: $x_{t+1} \leftarrow x_t + \Delta x_t$\;
}
\caption{ADADELTA, \textcite{ADADELTA}}
\label{alg:adadelta}
\end{algorithm}
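Using the shorthand $\text{RMS}[y]_t = \sqrt{E[y^2]_t + \varepsilon}$
from \textcite{ADADELTA}, the update in Algorithm~\ref{alg:adadelta}
can be written compactly as
\[
  \Delta x_t = -\frac{\text{RMS}[\Delta x]_{t-1}}{\text{RMS}[g]_t}\, g_t,
\]
which also makes the unit argument visible: the ratio carries the
(hypothetical) units of the parameters rather than those of the
gradient.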
While the stochastic gradient descent algorithm is less susceptible to local
extrema than gradient descent, the problem still persists, especially
with saddle points, as discussed by \textcite{DBLP:journals/corr/Dauphinpgcgb14}.
An approach to the problem of ``getting stuck'' in saddle points or
local minima/maxima is the addition of momentum to SGD. Instead of
using the actual gradient for the parameter update, an average over the
past gradients is used. In order to avoid the need to store all past
values, an exponentially decaying average is usually used, resulting in
Algorithm~\ref{alg:momentum}. This is comparable to following the path
of a marble with mass rolling down the slope of the error
function. The decay rate of the average is comparable to the inertia
of the marble.
This results in the algorithm being able to escape ... due to the
momentum built up while approaching it.
\begin{itemize}
\item ADAM
\item momentum
\item ADADELTA \textcite{ADADELTA}
\end{itemize}
\begin{algorithm}[H]
\SetAlgoLined
\KwInput{Decay Rate $\rho$, Constant $\varepsilon$}
\KwInput{Learning Rate $\gamma$, Decay Rate $\rho$}
\KwInput{Initial parameter $x_1$}
Initialize accumulation variable $m_0 = 0$\;
\For{$t \in \left\{1,\dots,T\right\};\, t+1$}{
Compute Gradient: $g_t$\;
Accumulate Gradient: $m_t \leftarrow \rho m_{t-1} + (1-\rho) g_t$\;
Compute Update: $\Delta x_t \leftarrow -\gamma m_t$\;
Apply Update: $x_{t+1} \leftarrow x_t + \Delta x_t$\;
}
\caption{SGD with momentum}
\label{alg:momentum}
\end{algorithm}
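To make the notion of an exponentially decaying average precise, the
recursion in Algorithm~\ref{alg:momentum} can be unrolled (using
$m_0 = 0$):
\[
  m_t = (1-\rho) \sum_{k=1}^{t} \rho^{\,t-k} g_k,
\]
so every past gradient enters the update with a weight that decays
geometrically with its age, and the decay rate $\rho$ plays the role
of the inertia described above.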
Problems of and improvements to ADAM are discussed by \textcite{rADAM}.
\begin{algorithm}[H]
\SetAlgoLined
\KwInput{Stepsize $\alpha$}
\KwInput{Decay Parameters $\beta_1$, $\beta_2$}
Initialize accumulation variables $E[g^2]_0 = 0, E[\Delta x^2]_0 =0$\;
\For{$t \in \left\{1,\dots,T\right\};\, t+1$}{
Compute Gradient: $g_t$\;
@ -503,10 +552,12 @@ with saddle points. \textcite{DBLP:journals/corr/Dauphinpgcgb14}
x^2]_{t-1} + (1-\rho)\Delta x_t^2$\;
Apply Update: $x_{t+1} \leftarrow x_t + \Delta x_t$\;
}
\caption{ADADELTA, \textcite{ADADELTA}}
\caption{ADAM, \cite{ADAM}}
\label{alg:adam}
\end{algorithm}
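For reference, the update rules of ADAM as given by \textcite{ADAM}
can be summarized as follows, using first and second moment estimates
$m_t$ and $v_t$ and a small constant $\varepsilon$ (the notation is
adapted here to the one used above):
\begin{align*}
  m_t &= \beta_1 m_{t-1} + (1-\beta_1) g_t, &
  v_t &= \beta_2 v_{t-1} + (1-\beta_2) g_t^2,\\
  \hat{m}_t &= \frac{m_t}{1-\beta_1^t}, &
  \hat{v}_t &= \frac{v_t}{1-\beta_2^t},\\
  x_{t+1} &= x_t - \alpha\, \frac{\hat{m}_t}{\sqrt{\hat{v}_t} + \varepsilon}.
\end{align*}
The bias-corrected estimates $\hat{m}_t$ and $\hat{v}_t$ compensate
for the moment estimates being initialized at zero.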
\input{Plots/sdg_comparison.tex}
% \subsubsubsection{Stochastic Gradient Descent}
@ -533,7 +584,31 @@ by introducing noise into the training of the model. This is a
successful strategy for other models as well; a conglomerate of
decision trees grown on bootstrapped training samples benefits greatly
from randomizing the features available to use in each training
iteration (Hastie, Bachelorarbeit??). The way noise is introduced into
iteration (Hastie, Bachelorarbeit??).
There are two approaches to introducing noise to the model during
learning: either by manipulating the model itself or by manipulating
the input data.
\subsubsection{Dropout}
If a neural network has enough hidden nodes to model a training set
accurately, it is also prone to overfitting that training set.
Similarly to decision trees and random forests, training multiple
models on the same task and averaging the predictions can improve the
results and combat overfitting. However, training a very large
number of neural networks is computationally expensive in training
as well as testing. In order to make this approach feasible,
\textcite{Dropout1} introduced random dropout.
Here, for each training iteration, randomly chosen nodes from a
previously specified (sub)set are deactivated (their output is fixed to 0).
During training, instead of using different models and averaging them,
the randomly deactivated nodes are used to simulate different networks
which all share the same weights for the active nodes.
A simple but effective way to introduce noise to the model is thus to
deactivate certain nodes (setting the output of the
node to 0) in the fully connected layers of the convolutional neural
networks. The nodes are chosen at random and change in every
@ -543,7 +618,12 @@ iteration, this practice is called Dropout and was introduced by
\todo{compare different dropout rates on MNIST or similar, subset as
training set?}
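The dropout mechanism can be sketched formally as follows; the
notation is chosen here for illustration and is not taken verbatim
from \textcite{Dropout1}. Let $o_i$ denote the output of node $i$ in a
fully connected layer and $q$ the probability of keeping a node
active. During training each output is replaced by
\[
  \tilde{o}_i = d_i\, o_i, \qquad d_i \sim \text{Bernoulli}(q)
  \text{ i.i.d.},
\]
and at test time all nodes are kept active while the outgoing weights
are multiplied by $q$, so that the expected input to the following
layer matches the one seen during training.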
\subsubsection{Effectively for small training sets}
\subsubsection{Effectiveness for small training sets}
For some applications (e.g.\ medical problems with a small number of
patients) the available data can be highly limited. In the following,
the impact of highly reduced training sets has been ... for ... and
the results are given in Figure ...
%%% Local Variables:
%%% mode: latex

@ -7,7 +7,7 @@
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{textcomp}
\usepackage{libertine}
%\usepackage{libertine}
\usepackage{amsmath}
%\usepackage{amssymb}
\usepackage{amsthm}
@ -35,7 +35,9 @@
\usepackage{lipsum}
\usepackage[ruled,vlined]{algorithm2e}
%\usepackage{showframe}
\usepackage[protrusion=true, expansion=true, kerning=true]{microtype}
\usepackage[protrusion=true, expansion=true, kerning=true, letterspace
= 150]{microtype}
\usepackage{titlecaps}
\captionsetup[sub]{justification=centering}
