From 7ffa8e63f260f776c57e1e3d67a64588612c2a71 Mon Sep 17 00:00:00 2001 From: Tobias Arndt Date: Tue, 4 Aug 2020 19:21:46 +0200 Subject: [PATCH] progress --- TeX/Plots/SGD_vs_GD.tex | 6 +- TeX/bibliograpy.bib | 42 +++++++++++ TeX/further_applications_of_nn.tex | 114 ++++++++++++++++++++++++----- TeX/main.tex | 6 +- 4 files changed, 146 insertions(+), 22 deletions(-) diff --git a/TeX/Plots/SGD_vs_GD.tex b/TeX/Plots/SGD_vs_GD.tex index c5c74b6..b6b6e26 100644 --- a/TeX/Plots/SGD_vs_GD.tex +++ b/TeX/Plots/SGD_vs_GD.tex @@ -13,7 +13,7 @@ plot coordinates { \begin{subfigure}[h!]{\textwidth} \begin{tikzpicture} \begin{axis}[tick style = {draw = none}, width = \textwidth, - height = 0.65\textwidth, + height = 0.6\textwidth, xtick = {1, 3, 5,7,9,11,13,15,17,19}, xticklabels = {$2$, $4$, $6$, $8$, $10$,$12$,$14$,$16$,$18$,$20$}, @@ -39,13 +39,13 @@ plot coordinates { \begin{subfigure}[b]{\textwidth} \begin{tikzpicture} \begin{axis}[tick style = {draw = none}, width = \textwidth, - height = 0.65\textwidth, + height = 0.6\textwidth, ytick = {0, 1, 2, 3, 4}, yticklabels = {$0$, $1$, $\phantom{0.}2$, $3$, $4$}, xtick = {1, 3, 5,7,9,11,13,15,17,19}, xticklabels = {$2$, $4$, $6$, $8$, $10$,$12$,$14$,$16$,$18$,$20$}, - xlabel = {training epoch}, ylabel = {error measure}] + xlabel = {training epoch}, ylabel = {error measure\vphantom{fy}}] \addplot table [x=epoch, y=val_loss, col sep=comma] {Plots/Data/GD_01.log}; \addplot table diff --git a/TeX/bibliograpy.bib b/TeX/bibliograpy.bib index 920c253..d839eb3 100644 --- a/TeX/bibliograpy.bib +++ b/TeX/bibliograpy.bib @@ -134,4 +134,46 @@ numpages = {39} timestamp = {Mon, 22 Jul 2019 13:15:46 +0200}, biburl = {https://dblp.org/rec/journals/corr/DauphinPGCGB14.bib}, bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@article{Dropout1, + author = {Geoffrey E. Hinton and + Nitish Srivastava and + Alex Krizhevsky and + Ilya Sutskever and + Ruslan Salakhutdinov}, + title = {Improving neural networks by preventing co-adaptation of feature detectors}, + journal = {CoRR}, + volume = {abs/1207.0580}, + year = {2012}, + url = {http://arxiv.org/abs/1207.0580}, + archivePrefix = {arXiv}, + eprint = {1207.0580}, + timestamp = {Mon, 13 Aug 2018 16:46:10 +0200}, + biburl = {https://dblp.org/rec/journals/corr/abs-1207-0580.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@inproceedings{ +rADAM, +title={On the Variance of the Adaptive Learning Rate and Beyond}, +author={Liyuan Liu and Haoming Jiang and Pengcheng He and Weizhu Chen and Xiaodong Liu and Jianfeng Gao and Jiawei Han}, +booktitle={International Conference on Learning Representations}, +year={2020}, +url={https://openreview.net/forum?id=rkgz2aEKDr} +} + +@inproceedings{ADAM, + author = {Diederik P. 
Kingma and
+                  Jimmy Ba},
+  editor       = {Yoshua Bengio and
+                  Yann LeCun},
+  title        = {Adam: {A} Method for Stochastic Optimization},
+  booktitle    = {3rd International Conference on Learning Representations, {ICLR} 2015,
+                  San Diego, CA, USA, May 7-9, 2015, Conference Track Proceedings},
+  year         = {2015},
+  url          = {http://arxiv.org/abs/1412.6980},
+  timestamp    = {Thu, 25 Jul 2019 14:25:37 +0200},
+  biburl       = {https://dblp.org/rec/journals/corr/KingmaB14.bib},
+  bibsource    = {dblp computer science bibliography, https://dblp.org}
+}
\ No newline at end of file
diff --git a/TeX/further_applications_of_nn.tex b/TeX/further_applications_of_nn.tex
index 5af0c3e..0eb1bca 100644
--- a/TeX/further_applications_of_nn.tex
+++ b/TeX/further_applications_of_nn.tex
@@ -276,11 +276,11 @@ The choice of convolution for image classification tasks is not
 arbitrary. ...
 auge... bla bla
-\subsection{Limitations of the Gradient Descent Algorithm}
+% \subsection{Limitations of the Gradient Descent Algorithm}
--Hyperparameter guesswork
--Problems navigating valleys -> momentum
--Different scale of gradients for vars in different layers -> ADAdelta
+% -Hyperparameter guesswork
+% -Problems navigating valleys -> momentum
+% -Different scale of gradients for vars in different layers -> ADAdelta
 
 \subsection{Stochastic Training Algorithms}
 
@@ -368,20 +368,21 @@ The results of the network being trained with gradient descent and
 stochastic gradient descent for 20 epochs are given in
 Figure~\ref{fig:sgd_vs_gd} and Table~\ref{table:sgd_vs_gd}.
-\input{Plots/SGD_vs_GD.tex}
 Here it can be seen that the network trained with stochastic gradient
 descent is more accurate after the first epoch than the ones trained
 with gradient descent after 20 epochs. This is due to the former using
 a batch size of 32 and thus having made 1,875 updates to the weights
-after the first epoch in comparison to one update .
+after the first epoch in comparison to one update. While each of
 these updates only uses an approximate gradient calculated on a subset
 of the data, the network performs far better than the one using true
 gradients when training for the same amount of time.
 \todo{compare training times}
+
+\input{Plots/SGD_vs_GD.tex}
 \clearpage
-\subsection{Modified Stochastic Gradient Descent}
+\subsection{\titlecap{modified stochastic gradient descent}}
 There is an inherent problem in the sensitivity of the gradient
 descent algorithm regarding the learning rate $\gamma$.
 The difficulty of choosing the learning rate can be seen
@@ -434,7 +435,7 @@ They all scale the gradient for the update depending on past
 gradients for each weight individually.
 The algorithms are built upon each other, with the adaptive gradient
-algorithm (ADAGRAD, \textcite{ADAGRAD})
+algorithm (\textsc{AdaGrad}, \textcite{ADAGRAD})
 laying the groundwork. Here, for each parameter update, the learning
 rate is given by a constant $\gamma$ divided by the sum of the
 squares of the past partial
@@ -456,11 +457,11 @@ algorithm is given in Algorithm~\ref{alg:ADAGRAD}.
     1, \dots,p$\;
     Apply Update: $x_{t+1} \leftarrow x_t + \Delta x_t$\;
   }
-  \caption{ADAGRAD}
+  \caption{\textls{ADAGRAD}}
   \label{alg:ADAGRAD}
 \end{algorithm}
 
-Building on ADAGRAD \textcite{ADADELTA} developed the ... (ADADELTA)
+Building on \textsc{AdaGrad}, \textcite{ADADELTA} developed the ADADELTA algorithm
 in order to improve upon the two main drawbacks of ADAGRAD, namely the
 continual decay of the learning rate and the need for a manually
 selected global learning rate $\gamma$.
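+
+To see where these drawbacks come from, the per-parameter update of
+Algorithm~\ref{alg:ADAGRAD} can be sketched as
+% sketch only: the exact placement of the constant $\varepsilon$, which
+% guards against division by zero, may differ from \textcite{ADAGRAD}
+\begin{equation*}
+  \Delta x_{t,i} = -\frac{\gamma}{\sqrt{\sum_{s=1}^{t} g_{s,i}^2} + \varepsilon}\,
+  g_{t,i}, \qquad i = 1, \dots, p.
+\end{equation*}
+The accumulated sum in the denominator can only grow, so the effective
+learning rate decays continually, and the global $\gamma$ still has to
+be chosen by hand.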
@@ -476,22 +477,70 @@ if the parameter vector had some hypothetical units they would be
 matched by those of the parameter update $\Delta x_t$. This
 property \todo{explanation of units}
+\begin{algorithm}[H]
+  \SetAlgoLined
+  \KwInput{Decay Rate $\rho$, Constant $\varepsilon$}
+  \KwInput{Initial parameter $x_1$}
+  Initialize accumulation variables $E[g^2]_0 = 0, E[\Delta x^2]_0 =0$\;
+  \For{$t \in \left\{1,\dots,T\right\};\, t+1$}{
+    Compute Gradient: $g_t$\;
+    Accumulate Gradient: $E[g^2]_t \leftarrow \rho E[g^2]_{t-1} +
+    (1-\rho)g_t^2$\;
+    Compute Update: $\Delta x_t \leftarrow -\frac{\sqrt{E[\Delta
+        x^2]_{t-1} + \varepsilon}}{\sqrt{E[g^2]_t + \varepsilon}} g_t$\;
+    Accumulate Updates: $E[\Delta x^2]_t \leftarrow \rho E[\Delta
+    x^2]_{t-1} + (1-\rho)\Delta x_t^2$\;
+    Apply Update: $x_{t+1} \leftarrow x_t + \Delta x_t$\;
+  }
+  \caption{ADADELTA, \textcite{ADADELTA}}
+  \label{alg:adadelta}
+\end{algorithm}
 
 While the stochastic gradient algorithm is less susceptible to local
 extrema than gradient descent, the problem still persists, especially
 with saddle points, as investigated by \textcite{DBLP:journals/corr/Dauphinpgcgb14}.
 
+An approach to the problem of ``getting stuck'' in saddle points or
+local minima/maxima is the addition of momentum to SGD. Instead of
+using the actual gradient for the parameter update, an average over the
+past gradients is used. In order to avoid having to store all past
+values, usually an exponentially decaying average is used, resulting in
+Algorithm~\ref{alg:momentum}. This is comparable to following the path
+of a marble with mass rolling down the slope of the error
+function. The decay rate of the average is comparable to the inertia
+of the marble.
+This results in the algorithm being able to escape such points due to
+the momentum built up while approaching them.
+
 \begin{itemize}
   \item ADAM
   \item momentum
-  \item ADADETLA \textcite{ADADELTA}
-
-
+  \item ADADELTA \textcite{ADADELTA}
 \end{itemize}
+
+
 \begin{algorithm}[H]
   \SetAlgoLined
-  \KwInput{Decay Rate $\rho$, Constant $\varepsilon$}
+  \KwInput{Learning Rate $\gamma$, Decay Rate $\rho$}
   \KwInput{Initial parameter $x_1$}
+  Initialize accumulation variables $m_0 = 0$\;
+  \For{$t \in \left\{1,\dots,T\right\};\, t+1$}{
+    Compute Gradient: $g_t$\;
+    Accumulate Gradient: $m_t \leftarrow \rho m_{t-1} + (1-\rho) g_t$\;
+    Compute Update: $\Delta x_t \leftarrow -\gamma m_t$\;
+    Apply Update: $x_{t+1} \leftarrow x_t + \Delta x_t$\;
+  }
+  \caption{SGD with momentum}
+  \label{alg:momentum}
+\end{algorithm}
+
+Problems of and improvements to ADAM are discussed in \textcite{rADAM}.
+
+
+\begin{algorithm}[H]
+  \SetAlgoLined
+  \KwInput{Stepsize $\alpha$}
+  \KwInput{Decay Parameters $\beta_1$, $\beta_2$}
+  \KwInput{Constant $\varepsilon$}
+  \KwInput{Initial parameter $x_1$}
-  Initialize accumulation variables $E[g^2]_0 = 0, E[\Delta x^2]_0 =0$\;
+  Initialize accumulation variables $m_0 = 0$, $v_0 = 0$\;
   \For{$t \in \left\{1,\dots,T\right\};\, t+1$}{
     Compute Gradient: $g_t$\;
@@ -503,10 +552,12 @@ with saddle points. \textcite{DBLP:journals/corr/Dauphinpgcgb14}
-    x^2]_{t-1} + (1+p)\Delta x_t^2$\;
+    Accumulate first moment: $m_t \leftarrow \beta_1 m_{t-1} + (1-\beta_1) g_t$\;
+    Accumulate second moment: $v_t \leftarrow \beta_2 v_{t-1} + (1-\beta_2) g_t^2$\;
+    Correct bias: $\hat{m}_t \leftarrow m_t / (1-\beta_1^t)$,
+    $\hat{v}_t \leftarrow v_t / (1-\beta_2^t)$\;
+    Compute Update: $\Delta x_t \leftarrow -\alpha \hat{m}_t / (\sqrt{\hat{v}_t} + \varepsilon)$\;
     Apply Update: $x_{t+1} \leftarrow x_t + \Delta x_t$\;
   }
-  \caption{ADADELTA, \textcite{ADADELTA}}
+  \caption{ADAM, \cite{ADAM}}
-  \label{alg:gd}
+  \label{alg:adam}
 \end{algorithm}
+
+
 \input{Plots/sdg_comparison.tex}
 
 % \subsubsubsection{Stochastic Gradient Descent}
 
@@ -533,7 +584,31 @@ by introducing noise into the training of the model. This is a
 successful strategy for other models as well: a conglomerate of
 decision trees grown on bootstrapped training samples benefits
 greatly from randomizing the features available in each training
-iteration (Hastie, Bachelorarbeit??). 
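+
+Why such averaging helps can be sketched with a standard variance
+argument (not specific to the models used here): if the predictions of
+$B$ identically distributed models each have variance $\sigma^2$ and
+pairwise correlation $r$, the variance of their average is
+\begin{equation*}
+  r \sigma^2 + \frac{1-r}{B} \sigma^2,
+\end{equation*}
+so combining many models only pays off if their errors are not too
+strongly correlated; randomizing the features available in each
+training iteration is one way of reducing $r$.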
+There are two approaches to introducing noise into the model during
+learning: either by manipulating the model itself or by manipulating
+the input data.
+\subsubsection{Dropout}
+If a neural network has enough hidden nodes to model a training set
+accurately, it is prone to overfitting.
+Similarly to decision trees and random forests, training multiple
+models on the same task and averaging their predictions can improve the
+results and combat overfitting. However, using a very large
+number of neural networks is computationally expensive in training
+as well as testing. In order to make this approach feasible,
+\textcite{Dropout1} introduced random dropout.
+Here, for each training iteration, randomly chosen nodes from a
+previously specified (sub)set are deactivated (their output is fixed to 0).
+Instead of training different models and averaging them, the randomly
+deactivated nodes are used to simulate different networks during
+training, which all share the same weights for the active nodes.
+
+
+For the convolutional neural networks considered here, dropout is
+applied to the nodes in the fully connected layers: the output of a
 deactivated node is set to 0, and the set of deactivated nodes is
 chosen at random and changes in every training iteration
 \cite{Dropout1}.
 \todo{Compare different dropout rates on MNIST or similar, subset as
 training set?}
 
-\subsubsection{Effectively for small training sets}
+\subsubsection{Effectiveness for small training sets}
+
+For some applications (medical problems with a small number of
+patients) the available data can be severely limited. In the following,
+the impact of highly reduced training sets has been ... for ... and the
+results are given in Figure ...
 
 %%% Local Variables:
 %%% mode: latex
diff --git a/TeX/main.tex b/TeX/main.tex
index a413bc8..4b3ae6b 100644
--- a/TeX/main.tex
+++ b/TeX/main.tex
@@ -7,7 +7,7 @@
 \usepackage[utf8]{inputenc}
 \usepackage[T1]{fontenc}
 \usepackage{textcomp}
-\usepackage{libertine}
+%\usepackage{libertine}
 \usepackage{amsmath}
 %\usepackage{amssymb}
 \usepackage{amsthm}
@@ -35,7 +35,9 @@
 \usepackage{lipsum}
 \usepackage[ruled,vlined]{algorithm2e}
 %\usepackage{showframe}
-\usepackage[protrusion=true, expansion=true, kerning=true]{microtype}
+\usepackage[protrusion=true, expansion=true, kerning=true, letterspace
+= 150]{microtype}
+\usepackage{titlecaps}
 
 \captionsetup[sub]{justification=centering}