main
Tobias Arndt 4 years ago
parent b0afc88091
commit b716f7688a

@@ -24,24 +24,21 @@ plot coordinates {
\begin{subfigure}[b]{\textwidth}
\begin{tikzpicture}
\begin{axis}[tick style = {draw = none}, width = \textwidth,
height = 0.7\textwidth, ymin = 0.92, legend style={at={(0.9825,0.75)},anchor=north east},
xlabel = {epoch}, ylabel = {Classification Accuracy}]
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Data/adagrad.log};
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Data/adadelta.log};
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Data/adam.log};
\addlegendentry{\footnotesize{ADAGRAD}}
\addlegendentry{\footnotesize{ADADELTA}}
\addlegendentry{\footnotesize{ADAM}}
\addlegendentry{SGD$_{0.01}$}
\end{axis}
\end{tikzpicture}
@@ -50,25 +47,19 @@ plot coordinates {
\begin{subfigure}[b]{\textwidth}
\begin{tikzpicture}
\begin{axis}[tick style = {draw = none}, width = \textwidth,
height = 0.7\textwidth, ymax = 0.5,
xlabel = {epoch}, ylabel = {Error Measure\vphantom{y}},
ytick = {0,0.1,0.2,0.3,0.4,0.45,0.5},
yticklabels = {0,0.1,0.2,0.3,0.4,\phantom{0.94},0.5}]
\addplot table
[x=epoch, y=val_loss, col sep=comma, mark = none] {Data/adagrad.log};
\addplot table
[x=epoch, y=val_loss, col sep=comma, mark = none] {Data/adadelta.log};
\addplot table
[x=epoch, y=val_loss, col sep=comma, mark = none] {Data/adam.log};
\addlegendentry{\footnotesize{ADAGRAD}}
\addlegendentry{\footnotesize{ADADELTA}}
\addlegendentry{\footnotesize{ADAM}}
\addlegendentry{SGD$_{0.01}$}
\end{axis}
@@ -77,13 +68,13 @@ plot coordinates {
\end{subfigure}
\\~\\
\begin{subfigure}[b]{1.0\linewidth}
\begin{tabu} to \textwidth {@{} *3{X[c]}c*3{X[c]} @{}}
\multicolumn{3}{c}{Classification Accuracy}
&~&\multicolumn{3}{c}{Error Measure}
\\\cline{1-3}\cline{5-7}
ADAGRAD&ADADELTA&ADAM&&ADAGRAD&ADADELTA&ADAM
\\\cline{1-3}\cline{5-7}
1&1&1&&1&1&1
\end{tabu}
\caption{Performance metrics after 20 epochs}
\end{subfigure}

@@ -0,0 +1,76 @@
\pgfplotsset{
compat=1.11,
legend image code/.code={
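% all three sample coordinates below coincide at the origin, so the default
% 0.6cm legend line collapses to a point and only the entry text is shown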
\draw[mark repeat=2,mark phase=2]
plot coordinates {
(0cm,0cm)
(0.0cm,0cm) %% default is (0.3cm,0cm)
(0.0cm,0cm) %% default is (0.6cm,0cm)
};%
}
}
\begin{figure}
\begin{subfigure}[b]{\textwidth}
\begin{tikzpicture}
\begin{axis}[tick style = {draw = none}, width = \textwidth,
height = 0.6\textwidth, ymin = 0.92, legend style={at={(0.9825,0.75)},anchor=north east},
xlabel = {epoch}, ylabel = {Classification Accuracy}]
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Plots/Data/adagrad.log};
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Plots/Data/adadelta.log};
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Plots/Data/adam.log};
\addlegendentry{\footnotesize{ADAGRAD}}
\addlegendentry{\footnotesize{ADADELTA}}
\addlegendentry{\footnotesize{ADAM}}
\addlegendentry{SGD$_{0.01}$}
\end{axis}
\end{tikzpicture}
%\caption{Classification accuracy}
\end{subfigure}
\begin{subfigure}[b]{\textwidth}
\begin{tikzpicture}
\begin{axis}[tick style = {draw = none}, width = \textwidth,
height = 0.6\textwidth, ymax = 0.5,
xlabel = {epoch}, ylabel = {Error Measure\vphantom{y}},ytick ={0,0.1,0.2,0.3,0.4,0.45,0.5}, yticklabels =
{0,0.1,0.2,0.3,0.4,\phantom{0.94},0.5}]
\addplot table
[x=epoch, y=val_loss, col sep=comma, mark = none] {Plots/Data/adagrad.log};
\addplot table
[x=epoch, y=val_loss, col sep=comma, mark = none] {Plots/Data/adadelta.log};
\addplot table
[x=epoch, y=val_loss, col sep=comma, mark = none] {Plots/Data/adam.log};
\addlegendentry{\footnotesize{ADAGRAD}}
\addlegendentry{\footnotesize{ADADELTA}}
\addlegendentry{\footnotesize{ADAM}}
\addlegendentry{SGD$_{0.01}$}
\end{axis}
\end{tikzpicture}
\caption{Performance metrics during training}
\end{subfigure}
\\~\\
\begin{subfigure}[b]{1.0\linewidth}
\begin{tabu} to \textwidth {@{} *3{X[c]}c*3{X[c]} @{}}
\multicolumn{3}{c}{Classification Accuracy}
&~&\multicolumn{3}{c}{Error Measure}
\\\cline{1-3}\cline{5-7}
ADAGRAD&ADADELTA&ADAM&&ADAGRAD&ADADELTA&ADAM
\\\cline{1-3}\cline{5-7}
1&1&1&&1&1&1
\end{tabu}
\caption{Performance metrics after 20 epochs}
\end{subfigure}
\caption{Performance metrics of the network given in ... trained
with different optimization algorithms}
\end{figure}
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "../main"
%%% End:

@@ -7,7 +7,6 @@
copyright = {In Copyright - Non-Commercial Use Permitted},
keywords = {early stopping; implicit regularization; machine learning; neural networks; spline; regression; gradient descent; artificial intelligence},
size = {53 p.},
abstract = {Today, various forms of neural networks are trained to perform approximation tasks in many fields. However, the solutions obtained are not fully understood. Empirical results suggest that typical training algorithms favor regularized solutions. These observations motivate us to analyze properties of the solutions found by gradient descent initialized close to zero, that is frequently employed to perform the training task. As a starting point, we consider one dimensional (shallow) ReLU neural networks in which weights are chosen randomly and only the terminal layer is trained. We show that the resulting solution converges to the smooth spline interpolation of the training data as the number of hidden nodes tends to infinity. Moreover, we derive a correspondence between the early stopped gradient descent and the smoothing spline regression. This might give valuable insight on the properties of the solutions obtained using gradient descent methods in general settings.},
DOI = {10.3929/ethz-b-000402003},
title = {How Implicit Regularization of Neural Networks Affects the Learned Function Part I},
@@ -89,4 +88,50 @@ url={https://doi.org/10.1038/323533a0}
timestamp = {Wed, 17 Apr 2019 17:23:45 +0200},
biburl = {https://dblp.org/rec/journals/corr/HeZRS15.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@book{PRML,
title = {Pattern Recognition and Machine Learning},
author = {Christopher M. Bishop},
publisher = {Springer},
isbn = {9780387310732,0387310738},
year = 2006,
series = {Information science and statistics},
edition = {1st ed. 2006. Corr. 2nd printing},
pages = {209}
}
@article{ADAGRAD,
author = {Duchi, John and Hazan, Elad and Singer, Yoram},
title = {Adaptive Subgradient Methods for Online Learning and Stochastic Optimization},
year = {2011},
issue_date = {2/1/2011},
publisher = {JMLR.org},
volume = {12},
issn = {1532-4435},
journal = {J. Mach. Learn. Res.},
month = jul,
pages = {2121--2159},
numpages = {39}
}
@article{DBLP:journals/corr/DauphinPGCGB14,
author = {Yann N. Dauphin and
Razvan Pascanu and
{\c{C}}aglar G{\"{u}}l{\c{c}}ehre and
Kyunghyun Cho and
Surya Ganguli and
Yoshua Bengio},
title = {Identifying and attacking the saddle point problem in high-dimensional
non-convex optimization},
journal = {CoRR},
volume = {abs/1406.2572},
year = {2014},
url = {http://arxiv.org/abs/1406.2572},
archivePrefix = {arXiv},
eprint = {1406.2572},
timestamp = {Mon, 22 Jul 2019 13:15:46 +0200},
biburl = {https://dblp.org/rec/journals/corr/DauphinPGCGB14.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}

@@ -384,22 +384,23 @@ network using true gradients when training for the same amount of time.
\subsection{Modified Stochastic Gradient Descent}
There is an inherent problem in the sensitivity of the gradient descent
algorithm regarding the learning rate $\gamma$.
The difficulty of choosing the learning rate can be seen
in Figure~\ref{sgd_vs_gd}. For small rates the progress in each iteration is small,
but as the rate is increased the algorithm can become unstable and
diverge. Even for learning rates small enough to ensure the parameters
do not diverge to infinity, steep valleys can hinder the progress of
the algorithm, as with too large learning rates gradient descent
``bounces between'' the walls of the valley rather than following a
downward trend in the valley.
% \[
% w - \gamma \nabla_w ...
% \]
%thus the weights grow to infinity.
\todo{explain unstable learning rate better}
To combat this problem \todo{source} propose to alter the learning
rate over the course of training, often called learning rate
scheduling. The most popular implementations of this are time-based
decay
@@ -417,16 +418,68 @@ and exponential decay, where the learning rate is decreased after each epoch,
\gamma_n = \gamma_0 e^{-n d}.
\]
These methods are able to increase the accuracy of a model by a large
margin, as seen in the training of ResNet by \textcite{resnet}.
\todo{maybe include a figure}
However stochastic gradient descent with learning rate decay is
still highly sensitive to the choice of the hyperparameters $\gamma_0$
and $d$.
In order to mitigate this problem a number of algorithms have been
developed to adapt the learning rate during training with as little
hyperparameter guesswork as possible.
We will examine and compare a ... algorithms that use an adaptive
learning rate.
They all scale the gradient for the update depending on past gradients
for each weight individually.
The algorithms build on each other, with the adaptive gradient
algorithm (ADAGRAD, \textcite{ADAGRAD})
laying the groundwork. Here for each parameter update the learning rate
is given by a constant $\gamma$ divided by the norm of all past
partial derivatives with respect to this parameter. This results in a
monotonically decreasing learning rate for each parameter: the
learning rate decays faster for parameters with large partial
derivatives, whereas parameters with small ones experience a slower
decay. The ADAGRAD algorithm is given in Algorithm~\ref{alg:ADAGRAD}.
\begin{algorithm}[H]
\SetAlgoLined
\KwInput{Global learning rate $\gamma$}
\KwInput{Constant $\varepsilon$}
\KwInput{Initial parameter vector $x_1 \in \mathbb{R}^p$}
\For{$t \in \left\{1,\dots,T\right\};\, t+1$}{
Compute Gradient: $g_t$\;
Compute Update: $\Delta x_{t,i} \leftarrow
-\frac{\gamma}{\norm{g_{1:t,i}}_2 + \varepsilon} g_{t,i}, \forall i =
1, \dots,p$\;
Apply Update: $x_{t+1} \leftarrow x_t + \Delta x_t$\;
}
\caption{ADAGRAD}
\label{alg:ADAGRAD}
\end{algorithm}
Building on ADAGRAD \textcite{ADADELTA} developed the ... (ADADELTA)
in order to improve upon the two main drawbacks of ADAGRAD, being the
continual decay of the learning rate and the need for a manually
selected global learning rate $\gamma$.
As ADAGRAD accumulates the squared gradients, the learning rate will
eventually become arbitrarily small.
In order to ensure that learning continues to make progress even
after a significant number of iterations, instead of summing the
squared gradients an exponentially decaying average of the past
squared gradients is used to ...
Additionally the fixed global learning rate $\gamma$ is substituted by
an exponentially decaying average of the past squared parameter updates.
The usage of the past parameter updates is motivated by ensuring that,
if the parameter vector had some hypothetical units, they would be matched
by those of the parameter update $\Delta x_t$.
\todo{explain units}
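Written in the style of Algorithm~\ref{alg:ADAGRAD}, a sketch of the
resulting update rule reads as follows; this is our rendering of the
method described by \textcite{ADADELTA}, all operations are
component-wise, $\rho$ denotes the decay rate, $\varepsilon$ the
conditioning constant, and the label \texttt{alg:ADADELTA} is ours.

\begin{algorithm}[H]
\SetAlgoLined
\KwInput{Decay rate $\rho$, constant $\varepsilon$}
\KwInput{Initial parameter vector $x_1 \in \mathbb{R}^p$}
Initialize accumulators: $E[g^2]_0 \leftarrow 0$, $E[\Delta x^2]_0 \leftarrow 0$\;
\For{$t \in \left\{1,\dots,T\right\};\, t+1$}{
Compute Gradient: $g_t$\;
Accumulate Gradient: $E[g^2]_t \leftarrow \rho E[g^2]_{t-1} + (1-\rho) g_t^2$\;
Compute Update: $\Delta x_t \leftarrow -\frac{\sqrt{E[\Delta x^2]_{t-1}
+ \varepsilon}}{\sqrt{E[g^2]_t + \varepsilon}} g_t$\;
Accumulate Updates: $E[\Delta x^2]_t \leftarrow \rho E[\Delta x^2]_{t-1} + (1-\rho) \Delta x_t^2$\;
Apply Update: $x_{t+1} \leftarrow x_t + \Delta x_t$\;
}
\caption{ADADELTA}
\label{alg:ADADELTA}
\end{algorithm}
Note that the numerator uses the accumulator of the previous step,
since the current update only becomes available afterwards; this is
what allows the method to work without a global learning rate.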
While the stochastic gradient algorithm is less susceptible to local
extrema than gradient descent, the problem still persists, especially
with saddle points, as observed by \textcite{DBLP:journals/corr/DauphinPGCGB14}.
\begin{itemize}
\item ADAM
@@ -454,9 +507,10 @@ One of these algorithms is the ADADELTA algorithm developed by \textcite{ADADELTA}
\label{alg:gd}
\end{algorithm}
\input{Plots/sdg_comparison.tex}
% \subsubsubsection{Stochastic Gradient Descent}
\clearpage
\subsection{Combating Overfitting}
% As in many machine learning applications if the model is overfit in
@@ -489,7 +543,7 @@ iteration, this practice is called Dropout and was introduced by
\todo{compare different dropout sizes on MNIST or similar, subset as
training set?}

%%% Local Variables:
%%% mode: latex

@@ -194,45 +194,113 @@ using data with the expected response (label) attached to each
data-point in fitting the model, where usually some distance between
the model output and the labels is minimized.
\subsubsection{Interpreting the Output / Classification vs Regression
/ Nonlinearity in last layer}

Given the nature of the neural net, the outputs of the last layer are
real numbers. For regression tasks this is desirable; for
classification problems however some transformation might be
necessary.
As the goal in the latter is to predict a certain class or classes for
an object, the output needs to be of a form that allows this
interpretation.
Commonly the nodes in the output layer each correspond to a class, and
the class chosen as prediction is the one with the highest value at
the corresponding output node.
The naive transformation to achieve this is turning the output
vector $o$ into a one-hot vector
\[
\text{pred}_i =
\begin{cases}
1,& \text{if } o_i = \max_j o_j, \\
0,& \text{else}.
\end{cases}
\]
This however makes training the model with gradient based methods
impossible, as the derivative of the transformation is either zero or
undefined.
A continuous transformation that is close to the argmax one is given
by softmax
\[
\text{softmax}(o)_i = \frac{e^{o_i}}{\sum_j e^{o_j}}.
\]
The softmax function maps the output to the interval $[0,1]$,
and the individual values sum to one; thus the output can be interpreted as
a probability for each class given the input.
In addition to being differentiable, this allows for evaluating the
certainty of a prediction, rather than just whether it is accurate.
\todo{maybe additive invariance}
% Another property that makes softmax attractive is the invariance to addition
% \[
% \text{softmax}(o) = \text{softmax}(o + c)
% \]
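The additive invariance sketched in the comment above is easy to
verify; writing $o + c$ for the vector with $c \in \mathbb{R}$ added
to every component, the following short derivation (our addition)
shows it:
\[
\text{softmax}(o + c)_i = \frac{e^{o_i + c}}{\sum_j e^{o_j + c}}
= \frac{e^c \, e^{o_i}}{e^c \sum_j e^{o_j}}
= \text{softmax}(o)_i.
\]
This identity is also why implementations commonly subtract
$\max_j o_j$ from every component before exponentiating: the result is
unchanged, but the exponentials can no longer overflow.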
% In order to properly interpret the output of a neural network and
% training it, depending on the problem it might be advantageous to
% transform the output form the last layer. Given the nature of the
% neural network the value at each output node is a real number. This is
% desirable for applications where the desired output is a real numbered
% vector (e.g. steering inputs for a autonomous car), however for
% classification problems it is desirable to transform this
% output. Often classification problems are modeled in such a way that
% each output node corresponds to a class. Then the output vector needs
% to be normalized in order to give a prediction. The naive approach is
% to transform the output vector $o$ into a one-hot vector $p$
% corresponding to a $0$
% entry for all classes except one, which is the predicted class.
% \[
% p_i =
% \begin{cases}
% 1,& i < j, \forall i,j \in \text{arg}\max o_i, \\
% 0,& \text{else.}
% \end{cases}
% \]\todo{besser formulieren}
% However this imposes difficulties in training the network as with this
% addition the model is no longer differentiable which imitates the
% ways the model can be trained. Additionally information about the
% ``certainty'' for each class in the prediction gets lost. A popular
% way to circumvent this problem is to normalize the output vector is
% such a way that the entries add up to one, this allows for the
% interpretation of probabilities assigned to each class.
\subsubsection{Error Measurement}
In order to make assessments about the quality of a network $\mathcal{NN}$ and train
it, we need to discuss how we measure error. The choice of the error
function is highly dependent on the type of the problem. For
regression problems a commonly used error measure is the mean squared
error (MSE),
which for a function $f$ and data $(x_i,y_i), i=1,\dots,n$ is given by
\[
\text{MSE}(f) = \frac{1}{n} \sum_{i=1}^n \left(f(x_i) - y_i\right)^2.
\]
However depending on the problem, error measures with different
properties might be needed; for example in some contexts it is
required to consider a proportional rather than absolute error, as is
common in time series models. \todo{odd}
As discussed above, the output of a neural network for a classification
problem can be interpreted as a probability distribution over the classes
conditioned on the input. In this case it is \todo{can?} desirable to
use error functions designed to compare probability distributions. A
widespread error function for this use case is the cross entropy (\textcite{PRML}),
which for two discrete distributions $p, q$ with the same support $C$ is given by
\[
H(p, q) = \sum_{c \in C} p(c) \ln\left(\frac{1}{q(c)}\right),
\]
which compares a distribution $q$ to a true underlying distribution $p$.
For a data set $(x_i,y_i), i = 1,\dots,n$, where each $y_{i,c}$
corresponds to the probability of class $c$ given $x_i$, and a predictor
$f$, we get the loss function
\[
L(f) = \sum_{i=1}^n H(y_i, f(x_i)).
\]
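If the labels $y_i$ are one-hot, i.e. $y_{i,c} = 1$ exactly for the
true class $c_i$ of $x_i$ and zero otherwise, only one term of each
inner sum survives. Writing $f(x_i)_c$ for the probability the model
assigns to class $c$, the loss reduces to a negative log-likelihood
(this check is ours, anticipating the maximum likelihood note below):
\[
L(f) = \sum_{i=1}^n H(y_i, f(x_i))
= -\sum_{i=1}^n \ln\left(f(x_i)_{c_i}\right)
= -\ln\left(\prod_{i=1}^n f(x_i)_{c_i}\right),
\]
so minimizing the cross entropy over the training data amounts to
maximizing the likelihood the model assigns to the observed classes.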
- Maximum likelihood
- Derivative with softmax is pseudo-linear -> fast improvements possible
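The second note presumably refers to the following standard identity,
stated here for completeness: for a label distribution $y$ (e.g.
one-hot), the derivative of the cross entropy with respect to the
pre-softmax outputs $o$ is
\[
\frac{\partial}{\partial o_k} H\left(y, \text{softmax}(o)\right)
= \text{softmax}(o)_k - y_k,
\]
i.e. simply prediction minus target. This gradient is cheap to compute
and does not vanish for confident wrong predictions, which allows for
the fast improvements alluded to above.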
\subsubsection{Gradient Descent Algorithm}

@@ -34,7 +34,7 @@
\usepackage{todonotes}
\usepackage{lipsum}
\usepackage[ruled,vlined]{algorithm2e}
%\usepackage{showframe}
\usepackage[protrusion=true, expansion=true, kerning=true]{microtype}
\captionsetup[sub]{justification=centering}
