This commit is contained in:
Tobias Arndt 2020-08-03 18:54:35 +02:00
parent b0afc88091
commit b716f7688a
6 changed files with 314 additions and 80 deletions

View File

@ -24,24 +24,21 @@ plot coordinates {
\begin{subfigure}[b]{\textwidth}
\begin{tikzpicture}
\begin{axis}[tick style = {draw = none}, width = \textwidth,
height = 0.7\textwidth,
xtick = {1, 3, 5,7,9,11,13,15,17,19},
xticklabels = {$2$, $4$, $6$, $8$,
$10$,$12$,$14$,$16$,$18$,$20$},
height = 0.7\textwidth, ymin = 0.92, legend style={at={(0.9825,0.75)},anchor=north east},
xlabel = {epoch}, ylabel = {Classification Accuracy}]
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none] {Data/gd_10min.log};
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Data/adagrad.log};
\addplot table
[x=epoch, y=val_accuracy, col sep=comma] {Data/GD_05.log};
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Data/adadelta.log};
\addplot table
[x=epoch, y=val_accuracy, col sep=comma] {Data/GD_1.log};
\addplot table
[x=epoch, y=val_accuracy, col sep=comma]
{Data/SGD_01_b32.log};
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Data/adam.log};
\addlegendentry{GD$_{0.01}$}
\addlegendentry{GD$_{0.05}$}
\addlegendentry{GD$_{0.1}$}
\addlegendentry{\footnotesize{ADAGRAD}}
\addlegendentry{\footnotesize{ADADELTA}}
\addlegendentry{\footnotesize{ADAM}}
\addlegendentry{SGD$_{0.01}$}
\end{axis}
\end{tikzpicture}
@ -50,25 +47,19 @@ plot coordinates {
\begin{subfigure}[b]{\textwidth}
\begin{tikzpicture}
\begin{axis}[tick style = {draw = none}, width = \textwidth,
height = 0.7\textwidth,
ytick = {0, 1, 2, 3, 4},
yticklabels = {$0$, $1$, $\phantom{0.}2$, $3$, $4$},
xtick = {1, 3, 5,7,9,11,13,15,17,19},
xticklabels = {$2$, $4$, $6$, $8$,
$10$,$12$,$14$,$16$,$18$,$20$},
xlabel = {epoch}, ylabel = {Error Measure}]
height = 0.7\textwidth, ymax = 0.5,
xlabel = {epoch}, ylabel = {Error Measure\vphantom{y}},ytick ={0,0.1,0.2,0.3,0.4,0.45,0.5}, yticklabels =
{0,0.1,0.2,0.3,0.4,\phantom{0.94},0.5}]
\addplot table
[x=epoch, y=val_loss, col sep=comma] {Data/GD_01.log};
[x=epoch, y=val_loss, col sep=comma, mark = none] {Data/adagrad.log};
\addplot table
[x=epoch, y=val_loss, col sep=comma] {Data/GD_05.log};
[x=epoch, y=val_loss, col sep=comma, mark = none] {Data/adadelta.log};
\addplot table
[x=epoch, y=val_loss, col sep=comma] {Data/GD_1.log};
\addplot table
[x=epoch, y=val_loss, col sep=comma] {Data/SGD_01_b32.log};
[x=epoch, y=val_loss, col sep=comma, mark = none] {Data/adam.log};
\addlegendentry{GD$_{0.01}$}
\addlegendentry{GD$_{0.05}$}
\addlegendentry{GD$_{0.1}$}
\addlegendentry{\footnotesize{ADAGRAD}}
\addlegendentry{\footnotesize{ADADELTA}}
\addlegendentry{\footnotesize{ADAM}}
\addlegendentry{SGD$_{0.01}$}
\end{axis}
@ -77,13 +68,13 @@ plot coordinates {
\end{subfigure}
\\~\\
\begin{subfigure}[b]{1.0\linewidth}
\begin{tabu} to \textwidth {@{} *4{X[c]}c*4{X[c]} @{}}
\multicolumn{4}{c}{Classification Accuracy}
&~&\multicolumn{4}{c}{Error Measure}
\\\cline{1-4}\cline{6-9}
GD$_{0.01}$&GD$_{0.05}$&GD$_{0.1}$&SGD$_{0.01}$&&GD$_{0.01}$&GD$_{0.05}$&GD$_{0.1}$&SGD$_{0.01}$
\\\cline{1-4}\cline{6-9}
1&1&1&1&&1&1&1&1
\begin{tabu} to \textwidth {@{} *3{X[c]}c*3{X[c]} @{}}
\multicolumn{3}{c}{Classification Accuracy}
&~&\multicolumn{3}{c}{Error Measure}
\\\cline{1-3}\cline{5-7}
ADAGRAD&ADADELTA&ADAM&&ADAGRAD&ADADELTA&ADAM
\\\cline{1-3}\cline{5-7}
1&1&1&&1&1&1
\end{tabu}
\caption{Performance metrics after 20 epochs}
\end{subfigure}

View File

@ -0,0 +1,76 @@
\pgfplotsset{
compat=1.11,
legend image code/.code={
\draw[mark repeat=2,mark phase=2]
plot coordinates {
(0cm,0cm)
(0.0cm,0cm) %% default is (0.3cm,0cm)
(0.0cm,0cm) %% default is (0.6cm,0cm)
};%
}
}
\begin{figure}
\begin{subfigure}[b]{\textwidth}
\begin{tikzpicture}
\begin{axis}[tick style = {draw = none}, width = \textwidth,
height = 0.6\textwidth, ymin = 0.92, legend style={at={(0.9825,0.75)},anchor=north east},
xlabel = {epoch}, ylabel = {Classification Accuracy}]
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Plots/Data/adagrad.log};
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Plots/Data/adadelta.log};
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Plots/Data/adam.log};
\addlegendentry{\footnotesize{ADAGRAD}}
\addlegendentry{\footnotesize{ADADELTA}}
\addlegendentry{\footnotesize{ADAM}}
\addlegendentry{SGD$_{0.01}$}
\end{axis}
\end{tikzpicture}
%\caption{Classification accuracy}
\end{subfigure}
\begin{subfigure}[b]{\textwidth}
\begin{tikzpicture}
\begin{axis}[tick style = {draw = none}, width = \textwidth,
height = 0.6\textwidth, ymax = 0.5,
xlabel = {epoch}, ylabel = {Error Measure\vphantom{y}},ytick ={0,0.1,0.2,0.3,0.4,0.45,0.5}, yticklabels =
{0,0.1,0.2,0.3,0.4,\phantom{0.94},0.5}]
\addplot table
[x=epoch, y=val_loss, col sep=comma, mark = none] {Plots/Data/adagrad.log};
\addplot table
[x=epoch, y=val_loss, col sep=comma, mark = none] {Plots/Data/adadelta.log};
\addplot table
[x=epoch, y=val_loss, col sep=comma, mark = none] {Plots/Data/adam.log};
\addlegendentry{\footnotesize{ADAGRAD}}
\addlegendentry{\footnotesize{ADADELTA}}
\addlegendentry{\footnotesize{ADAM}}
\addlegendentry{SGD$_{0.01}$}
\end{axis}
\end{tikzpicture}
\caption{Performance metrics during training}
\end{subfigure}
\\~\\
\begin{subfigure}[b]{1.0\linewidth}
\begin{tabu} to \textwidth {@{} *3{X[c]}c*3{X[c]} @{}}
\multicolumn{3}{c}{Classification Accuracy}
&~&\multicolumn{3}{c}{Error Measure}
\\\cline{1-3}\cline{5-7}
ADAGRAD&ADADELTA&ADAM&&ADAGRAD&ADADELTA&ADAM
\\\cline{1-3}\cline{5-7}
1&1&1&&1&1&1
\end{tabu}
\caption{Performance metrics after 20 epochs}
\end{subfigure}
\caption{Performance metrics of the network given in ... trained
with different optimization algorithms}
\end{figure}
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "../main"
%%% End:

View File

@ -7,7 +7,6 @@
copyright = {In Copyright - Non-Commercial Use Permitted},
keywords = {early stopping; implicit regularization; machine learning; neural networks; spline; regression; gradient descent; artificial intelligence},
size = {53 p.},
address = {Ithaca, NY},
abstract = {Today, various forms of neural networks are trained to perform approximation tasks in many fields. However, the solutions obtained are not fully understood. Empirical results suggest that typical training algorithms favor regularized solutions.These observations motivate us to analyze properties of the solutions found by gradient descent initialized close to zero, that is frequently employed to perform the training task. As a starting point, we consider one dimensional (shallow) ReLU neural networks in which weights are chosen randomly and only the terminal layer is trained. We show that the resulting solution converges to the smooth spline interpolation of the training data as the number of hidden nodes tends to infinity. Moreover, we derive a correspondence between the early stopped gradient descent and the smoothing spline regression. This might give valuable insight on the properties of the solutions obtained using gradient descent methods in general settings.},
DOI = {10.3929/ethz-b-000402003},
title = {How Implicit Regularization of Neural Networks Affects the Learned Function Part I},
@ -89,4 +88,50 @@ url={https://doi.org/10.1038/323533a0}
timestamp = {Wed, 17 Apr 2019 17:23:45 +0200},
biburl = {https://dblp.org/rec/journals/corr/HeZRS15.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@book{PRML,
title = {Pattern Recognition and Machine Learning},
author = {Christopher M. Bishop},
publisher = {Springer},
isbn = {9780387310732,0387310738},
year = 2006,
series = {Information science and statistics},
edition = {1st ed. 2006. Corr. 2nd printing},
pages = {209}
}
@article{ADAGRAD,
author = {Duchi, John and Hazan, Elad and Singer, Yoram},
title = {Adaptive Subgradient Methods for Online Learning and Stochastic Optimization},
year = {2011},
issue_date = {2/1/2011},
publisher = {JMLR.org},
volume = {12},
number = {null},
issn = {1532-4435},
journal = {J. Mach. Learn. Res.},
month = jul,
pages = {2121--2159},
numpages = {39}
}
@article{DBLP:journals/corr/DauphinPGCGB14,
author = {Yann N. Dauphin and
Razvan Pascanu and
{\c{C}}aglar G{\"{u}}l{\c{c}}ehre and
Kyunghyun Cho and
Surya Ganguli and
Yoshua Bengio},
title = {Identifying and attacking the saddle point problem in high-dimensional
non-convex optimization},
journal = {CoRR},
volume = {abs/1406.2572},
year = {2014},
url = {http://arxiv.org/abs/1406.2572},
archivePrefix = {arXiv},
eprint = {1406.2572},
timestamp = {Mon, 22 Jul 2019 13:15:46 +0200},
biburl = {https://dblp.org/rec/journals/corr/DauphinPGCGB14.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}

View File

@ -384,22 +384,23 @@ network using true gradients when training for the same mount of time.
\subsection{Modified Stochastic Gradient Descent}
There is an inherent problem in the sensitivity of the gradient descent
algorithm regarding the learning rate $\gamma$.
The difficulty of choosing the learning rate is
in the Figure~\ref{sgd_vs_gd}. For small rates the progress in each iteration is small
The difficulty of choosing the learning rate can be seen
in Figure~\ref{sgd_vs_gd}. For small rates the progress in each iteration is small
but as the rate is enlarged the algorithm can become unstable and
diverge. Even for learning rates small enough to ensure the parameters
do not diverge to infinity steep valleys can hinder the progress of
the algorithm as with too large learning rates gradient descent
``bounces between'' the walls of the valley rather than follow ...
``bounces between'' the walls of the valley rather than follow a
downward trend in the valley.
% \[
% w - \gamma \nabla_w ...
% \]
thus the weights grow to infinity.
%thus the weights grow to infinity.
\todo{unstable learning rate besser
erklären}
To combat this problem it is proposed \todo{quelle} alter the learning
To combat this problem \todo{quelle} propose to alter the learning
rate over the course of training, often called learning rate
scheduling. The most popular implementations of this are time based
decay
@ -417,16 +418,68 @@ and exponential decay, where the learning rate is decreased after each epoch,
\gamma_n = \gamma_o e^{-n d}.
\]
These methods are able to increase the accuracy of a model by a large
margin as seen in the training of RESnet by \textcite{resnet}
\todo{vielleicht grafik
einbauen}. However stochastic gradient descent with weight decay is
still highly sensitive to the choice of the hyperparameters $\gamma$
margin as seen in the training of RESnet by \textcite{resnet}.
\todo{vielleicht grafik
einbauen}
However stochastic gradient descent with weight decay is
still highly sensitive to the choice of the hyperparameters $\gamma_0$
and $d$.
In order to mitigate this problem a number of algorithms have been
developed to regularize the learning rate with as minimal
hyperparameter guesswork as possible.
One of these algorithms is the ADADELTA algorithm developed by \textcite{ADADELTA}
\clearpage
We will examine and compare a ... algorithms that use an adaptive
learning rate.
They all scale the gradient for the update depending on past gradients
for each weight individually.
The algorithms are built upon each other with the adaptive gradient
algorithm (ADAGRAD, \textcite{ADAGRAD})
laying the base work. Here for each parameter update the learning rate
is given by a constant
$\gamma$ divided by the square root of the sum of the squares of the past partial
derivatives in this parameter. This results in a monotonically
decreasing learning rate for each parameter. This results in a faster
decaying learning rate for parameters with large updates, whereas
parameters with small updates experience smaller decay. The ADAGRAD
algorithm is given in Algorithm~\ref{alg:ADAGRAD}.
\begin{algorithm}[H]
\SetAlgoLined
\KwInput{Global learning rate $\gamma$}
\KwInput{Constant $\varepsilon$}
\KwInput{Initial parameter vector $x_1 \in \mathbb{R}^p$}
\For{$t \in \left\{1,\dots,T\right\};\, t+1$}{
Compute Gradient: $g_t$\;
Compute Update: $\Delta x_{t,i} \leftarrow
-\frac{\gamma}{\norm{g_{1:t,i}}_2 + \varepsilon} g_{t,i}, \forall i =
1, \dots,p$\;
Apply Update: $x_{t+1} \leftarrow x_t + \Delta x_t$\;
}
\caption{ADAGRAD}
\label{alg:ADAGRAD}
\end{algorithm}
Building on ADAGRAD \textcite{ADADELTA} developed the ... (ADADELTA)
in order to improve upon the two main drawbacks of ADAGRAD, being the
continual decay of the learning rate and the need for a manually
selected global learning rate $\gamma$.
As ADAGRAD accumulates the squared gradients the learning rate will
eventually become infinitely small.
In order to ensure that even after a significant number of iterations
learning continues to make progress instead of summing the gradients an
exponentially decaying average of the past gradients is used to ....
Additionally the fixed global learning rate $\gamma$ is substituted by
an exponentially decaying average of the past parameter updates.
The usage of the past parameter updates is motivated by ensuring that
if the parameter vector had some hypothetical units they would be matched
by those of the parameter update $\Delta x_t$. This proper
\todo{erklärung unit}
While the stochastic gradient algorithm is less susceptible to local
extrema than gradient descent the problem still persists especially
with saddle points. \textcite{DBLP:journals/corr/DauphinPGCGB14}
\begin{itemize}
\item ADAM
@ -454,9 +507,10 @@ One of these algorithms is the ADADELTA algorithm developed by \textcite{ADADELT
\label{alg:gd}
\end{algorithm}
\input{Plots/sdg_comparison.tex}
% \subsubsubsection{Stochastic Gradient Descent}
\clearpage
\subsection{Combating Overfitting}
% As in many machine learning applications if the model is overfit in
@ -489,7 +543,7 @@ iteration, this practice is called Dropout and was introduced by
\todo{Vergleich verschiedene dropout größen auf MNSIT o.ä., subset als
training set?}
\subsubsection{Effectively for small training sets}
%%% Local Variables:
%%% mode: latex

View File

@ -194,45 +194,113 @@ using data with the expected response (label) attached to each
data-point in fitting the model, where usually some distance between
the model output and the labels is minimized.
\subsubsection{Interpreting the Output}
In order to properly interpret the output of a neural network and
training it, depending on the problem it might be advantageous to
transform the output form the last layer. Given the nature of the
neural network the value at each output node is a real number. This is
desirable for applications where the desired output is a real numbered
vector (e.g. steering inputs for a autonomous car), however for
classification problems it is desirable to transform this
output. Often classification problems are modeled in such a way that
each output node corresponds to a class. Then the output vector needs
to be normalized in order to give a prediction. The naive approach is
to transform the output vector $o$ into a one-hot vector $p$
corresponding to a $0$
entry for all classes except one, which is the predicted class.
\subsubsection{Interpreting the Output / Classification vs Regression
/ Nonlinearity in last layer}
Given the nature of the neural net the outputs of the last layer are
real numbers. For regression tasks this is desirable, for
classification problems however some transformations might be
necessary.
As the goal in the latter is to predict a certain class or classes for
an object the output needs to be of a form that allows this
interpretation.
Commonly the nodes in the output layer each correspond to a class and
the class chosen as prediction is the one with the highest value at
the corresponding output node.
The naive transformation to achieve this is transforming the output
vector $o$ into a one-hot vector
\[
p_i =
\text{pred}_i =
\begin{cases}
1,& i < j, \forall i,j \in \text{arg}\max o_i, \\
0,& \text{else.}
1,& \text{if } o_i = \max_j o_j \\
0,& \text{else}.
\end{cases}
\]\todo{besser formulieren}
\]
This however makes training the model with gradient based methods impossible, as the derivative of
the transformation is either zero or undefined.
A continuous transformation that is close to the argmax one is given by
softmax
\[
\text{softmax}(o)_i = \frac{e^{o_i}}{\sum_j e^{o_j}}.
\]
The softmax function transforms the range of the output to the interval $[0,1]$
and the individual values sum to one, thus the output can be interpreted as
a probability for each class given the input.
Additionally to being differentiable this allows for evaluating the
certainty of a prediction, rather than just whether it is accurate.
However this imposes difficulties in training the network as with this
addition the model is no longer differentiable which imitates the
ways the model can be trained. Additionally information about the
``certainty'' for each class in the prediction gets lost. A popular
way to circumvent this problem is to normalize the output vector is
such a way that the entries add up to one, this allows for the
interpretation of probabilities assigned to each class.
\todo{vielleicht additiv invarianz}
% Another property that makes softmax attractive is the invariance to addition
% \[
% \text{sofmax}(o) = \text{softmax}(o + c
% \]
% In order to properly interpret the output of a neural network and
% training it, depending on the problem it might be advantageous to
% transform the output form the last layer. Given the nature of the
% neural network the value at each output node is a real number. This is
% desirable for applications where the desired output is a real numbered
% vector (e.g. steering inputs for a autonomous car), however for
% classification problems it is desirable to transform this
% output. Often classification problems are modeled in such a way that
% each output node corresponds to a class. Then the output vector needs
% to be normalized in order to give a prediction. The naive approach is
% to transform the output vector $o$ into a one-hot vector $p$
% corresponding to a $0$
% entry for all classes except one, which is the predicted class.
% \[
% p_i =
% \begin{cases}
% 1,& i < j, \forall i,j \in \text{arg}\max o_i, \\
% 0,& \text{else.}
% \end{cases}
% \]\todo{besser formulieren}
% However this imposes difficulties in training the network as with this
% addition the model is no longer differentiable which imitates the
% ways the model can be trained. Additionally information about the
% ``certainty'' for each class in the prediction gets lost. A popular
% way to circumvent this problem is to normalize the output vector is
% such a way that the entries add up to one, this allows for the
% interpretation of probabilities assigned to each class.
\subsubsection{Error Measurement}
In order to make assessment about the quality of a network $\mathcal{NN}$ and train
it we need to discuss how we measure error. As for regression problems
the output is continuous in contrast to the class predictions in a
classification problem, we need to discuss these problems separately.
\paragraph{Regression Problems}
it we need to discuss how we measure error. The choice of the error
function is highly dependent on the type of the problem. For
regression problems a commonly used error measure is the mean squared
error (MSE)
which for a function $f$ and data $(x_i,y_i), i=1,\dots,n$ is given by
\[
MSE(f) = \frac{1}{n} \sum_{i=1}^n \left(f(x_i) - y_i\right)^2.
\]
However depending on the problem error measures with different
properties might be needed, for example in some contexts it is
required to consider a proportional rather than absolute error as is
common in time series models. \todo{komisch}
As discussed above the output of a neural network for a classification
problem can be interpreted as a probability distribution over the classes
conditioned on the input. In this case it is \todo{can?} desirable to
use error functions designed to compare probability distributions. A
widespread error function for this use case is the cross entropy (\textcite{PRML}),
which for two discrete distributions $p, q$ with the same realm $C$ is given by
\[
H(p, q) = \sum_{c \in C} p(c) \ln\left(\frac{1}{q(c)}\right),
\]
which compares a distribution $q$ to a true underlying distribution $p$.
For a data set $(x_i,y_i), i = 1,\dots,n$ where each $y_{i,c}$
corresponds to the probability of class $c$ given $x_i$ and predictor
$f$ we get the loss function
\[
Bla = \sum_{i=1}^n H(y_i, f(x_i)).
\]
-Maximum Likelihood
-Ableitung mit softmax pseudo linear -> fast improvemtns possible
\subsubsection{Gradient Descent Algorithm}

View File

@ -34,7 +34,7 @@
\usepackage{todonotes}
\usepackage{lipsum}
\usepackage[ruled,vlined]{algorithm2e}
\usepackage{showframe}
%\usepackage{showframe}
\usepackage[protrusion=true, expansion=true, kerning=true]{microtype}
\captionsetup[sub]{justification=centering}