progress
parent
2c2b053d54
commit
b0afc88091
@ -73,18 +73,18 @@ plot coordinates {
\label{fig:sgd_vs_gd}
\end{figure}

\begin{table}[h]
\begin{tabu} to \textwidth {@{} *4{X[c]}c*4{X[c]} @{}}
\multicolumn{4}{c}{Classification Accuracy}
&~&\multicolumn{4}{c}{Error Measure}
\\\cline{1-4}\cline{6-9}
GD$_{0.01}$&GD$_{0.05}$&GD$_{0.1}$&SGD$_{0.01}$&&GD$_{0.01}$&GD$_{0.05}$&GD$_{0.1}$&SGD$_{0.01}$
\\\cline{1-4}\cline{6-9}
0.265&0.633&0.203&0.989&&2.267&1.947&3.91&0.032
\end{tabu}
\caption{Performance metrics of the networks trained in
Figure~\ref{fig:sgd_vs_gd} after 20 training epochs.}
\label{table:sgd_vs_gd}
\end{table}
%%% Local Variables:
%%% mode: latex
TeX/Plots/mnist.tex
Normal file
@ -0,0 +1,41 @@
\begin{figure}[h]
\centering
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/mnist0.pdf}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/mnist1.pdf}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/mnist2.pdf}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/mnist3.pdf}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/mnist4.pdf}
\end{subfigure}\\
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/mnist5.pdf}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/mnist6.pdf}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/mnist7.pdf}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/mnist8.pdf}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/mnist9.pdf}
\end{subfigure}
\caption{The MNIST data set contains 70,000 images of preprocessed handwritten
digits. Of these images 60,000 are used as training images, while
the rest are used to validate the trained models.}
\label{fig:MNIST}
\end{figure}
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "../main"
%%% End:
@ -73,3 +73,20 @@ url={https://doi.org/10.1038/323533a0}
username = {mhwombat},
year = 2010
}

@article{resnet,
  author = {Kaiming He and
            Xiangyu Zhang and
            Shaoqing Ren and
            Jian Sun},
  title = {Deep Residual Learning for Image Recognition},
  journal = {CoRR},
  volume = {abs/1512.03385},
  year = 2015,
  url = {http://arxiv.org/abs/1512.03385},
  archivePrefix = {arXiv},
  eprint = {1512.03385},
  timestamp = {Wed, 17 Apr 2019 17:23:45 +0200},
  biburl = {https://dblp.org/rec/journals/corr/HeZRS15.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
@ -136,7 +136,7 @@ output is given by
\]
where $\sqrt{\cdot}$ and $\cdot^2$ are applied component-wise.
Examples of convolution with both kernels are given in Figure~\ref{fig:img_conv}.
\todo{padding}

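As an illustration only (not part of this commit), the component-wise combination described above can be sketched in a few lines of Python; the two $3 \times 3$ kernels below are hypothetical stand-ins, since the document's kernels are not visible in this excerpt, and the combination $\sqrt{(I * g_1)^2 + (I * g_2)^2}$ is what the component-wise $\sqrt{\cdot}$ and $\cdot^2$ suggest:
\begin{verbatim}
import numpy as np
from scipy.signal import convolve2d

# Hypothetical pair of edge kernels (horizontal and vertical changes);
# the kernels used in the text are not shown in this excerpt.
g1 = np.array([[1, 0, -1], [2, 0, -2], [1, 0, -1]])
g2 = g1.T

def edge_magnitude(image):
    """sqrt(conv(I, g1)^2 + conv(I, g2)^2), applied component-wise."""
    c1 = convolve2d(image, g1, mode="same", boundary="symm")
    c2 = convolve2d(image, g2, mode="same", boundary="symm")
    return np.sqrt(c1**2 + c2**2)

print(edge_magnitude(np.random.rand(28, 28)).shape)   # (28, 28)
\end{verbatim}
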

\begin{figure}[h]
@ -186,20 +186,59 @@ wise. Examples of convolution with both kernels are given in Figure~\ref{fig:img_conv}.
\clearpage
\newpage
\subsection{Convolutional NN}
\todo{Introduction to CNNs}
% Conventional neural network as described in chapter .. are made up of
% fully connected layers, meaning each node in a layer is influenced by
% all nodes of the previous layer. If one wants to extract information
% out of high dimensional input such as images this results in a very
% large amount of variables in the model. This limits the

% In conventional neural networks as described in chapter ... all layers
% are fully connected, meaning each output node in a layer is influenced
% by all inputs. For $i$ inputs and $o$ output nodes this results in $i
% + 1$ variables at each node (weights and bias) and a total $o(i + 1)$
% variables. For large inputs like image data the amount of variables
% that have to be trained in order to fit the model can get excessive
% and hinder the ability to train the model due to memory and
% computational restrictions. By using convolution we can extract
% meaningful information such as edges in an image with a kernel of a
% small size $k$ in the tens or hundreds independent of the size of the
% original image. Thus for a large image $k \cdot i$ can be several
% orders of magnitude smaller than $o\cdot i$ .

As seen in the previous section, convolution lends itself to the
manipulation of images and other large data, which motivates its usage
in neural networks.
This is achieved by implementing convolutional layers in which several
filters are applied to the input; the values of these filters are
trainable parameters of the model.
Each node in such a layer corresponds to a pixel of the output of the
convolution with one of these filters, to which a bias and an
activation function are applied.
The usage of multiple filters results in multiple outputs of the same
size as the input; these are often called channels. Depending on the
size of the filters this can result in the output having one dimension
more than the input.
However, for a convolutional layer following another convolutional
layer, the filter size along the channel direction is often chosen to
coincide with the number of channels of the previous layer's output,
without using padding in this direction, in order to prevent gaining
additional dimensions\todo{odd phrasing} in the output.
This can also be used to flatten certain less interesting channels of
the input, such as the color channels.
Thus the filters used in convolutional networks usually have the same
number of dimensions as the input, or one more.

The size of the filters and the way they are applied can be tuned
while building the model, but should be the same for all filters in one
layer in order for the output to be of consistent size in all channels.
It is common to reduce the size of the output by not applying the
filters to each ``pixel'', but rather specifying a ``stride'' $s$ at which
the filter $g$ is moved over the input $I$

\[
O_{x,y,c} = \sum_{i,j,l \in \mathbb{Z}} I_{x-i,y-j,c-l} g_{i,j,l}.
\]

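As a minimal sketch of the layer mechanics described above (not part of this commit; bias, activation and padding are omitted, and the cross-correlation convention common in deep learning libraries is used instead of the flipped-kernel convention of the formula), applying a bank of filters to a multi-channel input at a given stride could look as follows in NumPy:
\begin{verbatim}
import numpy as np

def conv_layer(image, filters, stride=1):
    """Apply a bank of filters to a multi-channel image (valid region only).

    image:   (H, W, C_in) array
    filters: (K, K, C_in, C_out) array, one K x K x C_in kernel per output channel
    stride:  step s at which each filter is moved over the input
    """
    H, W, _ = image.shape
    K, _, _, C_out = filters.shape
    out_h = (H - K) // stride + 1
    out_w = (W - K) // stride + 1
    out = np.zeros((out_h, out_w, C_out))
    for x in range(out_h):
        for y in range(out_w):
            patch = image[x * stride:x * stride + K,
                          y * stride:y * stride + K, :]
            for c in range(C_out):
                # weighted sum over the K x K window and all input channels
                out[x, y, c] = np.sum(patch * filters[:, :, :, c])
    return out

# example: 28 x 28 grayscale image, four 5 x 5 filters, stride 2
img = np.random.rand(28, 28, 1)
w = np.random.rand(5, 5, 1, 4)
print(conv_layer(img, w, stride=2).shape)   # (12, 12, 4)
\end{verbatim}
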
As seen, convolution lends itself to image manipulation. In this
chapter we will explore how we can incorporate convolution in neural
@ -260,9 +299,9 @@ network. A class of algorithms that augment the gradient descent
algorithm in order to lessen this problem are stochastic gradient
descent algorithms. Here the premise is that instead of using the whole
dataset a (different) subset of the data is chosen to
compute the gradient in each iteration (Algorithm~\ref{alg:sgd}).
The training period until each data point has been considered in
updating the parameters is commonly called an ``epoch''.
Using subsets reduces the amount of memory and computing power required for
each iteration. This makes it possible to use very large training
sets to fit the model.
@ -270,18 +309,42 @@ Additionally the noise introduced on the gradient can improve
the accuracy of the fit as stochastic gradient descent algorithms are
less likely to get stuck on local extrema.


Another important benefit in using subsets is that depending on their size the
gradient can be calculated far quicker, which allows for more parameter updates
in the same time. If the approximated gradient is close enough to the
``real'' one this can drastically cut down the time required for
training the model to a certain degree, or improve the accuracy achievable in a given
amount of training time.

\begin{algorithm}
\SetAlgoLined
\KwInput{Function $f$, Weights $w$, Learning Rate $\gamma$, Batch Size $B$, Loss Function $L$,
Training Data $D$, Epochs $E$.}
\For{$i \in \left\{1,\dots,E\right\}$}{
	$S \leftarrow D$\;
	\While{$\abs{S} \geq B$}{
		Draw $\tilde{D}$ from $S$ with $\vert\tilde{D}\vert = B$\;
		Update $S$: $S \leftarrow S \setminus \tilde{D}$\;
		Compute Gradient: $g \leftarrow \frac{\mathrm{d} L(f_w, \tilde{D})}{\mathrm{d} w}$\;
		Update: $w \leftarrow w - \gamma g$\;
	}
	\If{$S \neq \emptyset$}{
		Compute Gradient: $g \leftarrow \frac{\mathrm{d} L(f_w, S)}{\mathrm{d} w}$\;
		Update: $w \leftarrow w - \gamma g$\;
	}
	Increment: $i \leftarrow i+1$\;
}
\caption{Stochastic gradient descent.}
\label{alg:sgd}
\end{algorithm}

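For illustration only, a minimal Python sketch in the spirit of Algorithm~\ref{alg:sgd} is given below; it is not the code used for the experiments, and \texttt{grad\_loss} and \texttt{data} are placeholders for the loss gradient of the model and the training set:
\begin{verbatim}
import numpy as np

def sgd(w, grad_loss, data, lr=0.01, batch_size=32, epochs=20):
    """Minibatch stochastic gradient descent, mirroring the algorithm above.

    w:         initial weights (NumPy array)
    grad_loss: function (w, batch) -> gradient of the loss on that batch
    data:      NumPy array of training samples
    """
    n = len(data)
    for _ in range(epochs):
        order = np.random.permutation(n)   # draw batches without replacement
        for start in range(0, n, batch_size):
            batch = data[order[start:start + batch_size]]  # last batch may be smaller
            g = grad_loss(w, batch)         # gradient approximated on the subset
            w = w - lr * g                  # step with learning rate gamma
    return w
\end{verbatim}
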
In order to illustrate this behavior we modeled a convolutional neural
network to ... handwritten digits. The data set used for this is the
MNIST database of handwritten digits (\textcite{MNIST},
Figure~\ref{fig:MNIST}).
\input{Plots/mnist.tex}
The network used consists of two convolution and max pooling layers
followed by one fully connected hidden layer and the output layer.
Both convolutional layers utilize square filters of size five which are
|
|||||||
All layers except the output layer use RELU as activation function
|
All layers except the output layer use RELU as activation function
|
||||||
with the output layer using softmax (\ref{def:softmax}).
|
with the output layer using softmax (\ref{def:softmax}).
|
||||||
As loss function categorical crossentropy is used (\ref{def:...}).
|
As loss function categorical crossentropy is used (\ref{def:...}).
|
||||||
In Figure~\ref{fig:mnist_architecture} the architecture of the network
|
The architecture of the convolutional neural network is summarized in
|
||||||
is summarized.
|
Figure~\ref{fig:mnist_architecture}.
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\missingfigure{network architecture}
|
||||||
|
\caption{architecture}
|
||||||
|
\label{fig:mnist_architecture}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
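For illustration only, a Keras sketch consistent with this description is given below; it is not the code used for the experiments, and the filter counts of the two convolutional layers are assumptions, since the text only fixes the $5 \times 5$ kernels, the 256-node hidden layer and the 10-node softmax output:
\begin{verbatim}
import tensorflow as tf
from tensorflow.keras import layers

# Filter counts (32, 64) are placeholders; only the 5 x 5 kernels, the
# 256-node dense layer and the 10-way softmax output are fixed by the text.
model = tf.keras.Sequential([
    layers.Conv2D(32, kernel_size=5, activation="relu", input_shape=(28, 28, 1)),
    layers.MaxPooling2D(pool_size=2),
    layers.Conv2D(64, kernel_size=5, activation="relu"),
    layers.MaxPooling2D(pool_size=2),
    layers.Flatten(),
    layers.Dense(256, activation="relu"),
    layers.Dense(10, activation="softmax"),
])
model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=0.01),
              loss="categorical_crossentropy",
              metrics=["accuracy"])
\end{verbatim}
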
The results of the network being trained with gradient descent and
stochastic gradient descent for 20 epochs are given in
Figure~\ref{fig:sgd_vs_gd} and Table~\ref{table:sgd_vs_gd}.

\input{Plots/SGD_vs_GD.tex}

Here it can be seen that the network trained with stochastic gradient
descent is more accurate after the first epoch than the ones trained
with gradient descent after 20 epochs.
This is due to the former using a batch size of 32 and thus having
made 1,875 updates to the weights after the first epoch (the 60,000
training images split into batches of 32), in comparison to a single
update per epoch for gradient descent. While each of these updates
uses an approximate gradient calculated on the respective subset, it
performs far better than the network using true gradients when
training for the same amount of time.
\todo{compare training time}
\clearpage
\subsection{Modified Stochastic Gradient Descent}
There is an inherent problem in the sensitivity of the gradient descent
algorithm regarding the learning rate $\gamma$.
The difficulty of choosing the learning rate is illustrated
in Figure~\ref{fig:sgd_vs_gd}. For small rates the progress in each iteration is small,
but as the rate is enlarged the algorithm can become unstable and
diverge. Even for learning rates small enough to ensure the parameters
do not diverge to infinity, steep valleys can hinder the progress of
the algorithm, as with too large learning rates gradient descent
``bounces between'' the walls of the valley rather than following ...

% \[
% w - \gamma \nabla_w ...
% \]
thus the weights grow to infinity.
\todo{explain unstable learning rate better}

To combat this problem it is proposed\todo{citation needed} to alter the learning
rate over the course of training, often called learning rate
scheduling. The most popular implementations of this are time based
decay
\[
\gamma_{n+1} = \frac{\gamma_n}{1 + d n},
\]
where $d$ is the decay parameter and $n$ is the number of epochs,
step based decay, where the learning rate is fixed for a span of $r$
epochs and then decreased according to the parameter $d$,
\[
\gamma_n = \gamma_0 d^{\lfloor \frac{n+1}{r} \rfloor},
\]
and exponential decay, where the learning rate is decreased after each epoch,
\[
\gamma_n = \gamma_0 e^{-n d}.
\]
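The three schedules can be sketched in a few lines of Python (illustrative only, not part of this commit; the parameter values in the example call are arbitrary):
\begin{verbatim}
import math

def time_based(lr0, d, epochs):
    # gamma_{n+1} = gamma_n / (1 + d * n)
    lr, out = lr0, [lr0]
    for n in range(epochs - 1):
        lr = lr / (1 + d * n)
        out.append(lr)
    return out

def step_based(lr0, d, r, epochs):
    # gamma_n = gamma_0 * d ** floor((n + 1) / r)
    return [lr0 * d ** math.floor((n + 1) / r) for n in range(epochs)]

def exponential(lr0, d, epochs):
    # gamma_n = gamma_0 * exp(-n * d)
    return [lr0 * math.exp(-n * d) for n in range(epochs)]

print(step_based(0.1, 0.5, 10, 30)[::10])   # [0.1, 0.05, 0.025]
\end{verbatim}
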
These methods are able to increase the accuracy of a model by a large
margin, as seen in the training of ResNet by \textcite{resnet}.
\todo{maybe include a figure}
However stochastic gradient descent with learning rate decay is
still highly sensitive to the choice of the hyperparameters $\gamma$
and $d$.
In order to mitigate this problem a number of algorithms have been
developed to regularize the learning rate with as little
hyperparameter guesswork as possible.
One of these algorithms is the ADADELTA algorithm developed by \textcite{ADADELTA}.
\clearpage


\begin{itemize}
\item ADAM
@ -363,8 +442,8 @@ and Table~\ref{table:sgd_vs_dg}
Initialize accumulation variables $E[g^2]_0 = 0, E[\Delta x^2]_0 =0$\;
\For{$t \in \left\{1,\dots,T\right\};\, t+1$}{
	Compute Gradient: $g_t$\;
	Accumulate Gradient: $E[g^2]_t \leftarrow \rho E[g^2]_{t-1} +
	(1-\rho)g_t^2$\;
	Compute Update: $\Delta x_t \leftarrow -\frac{\sqrt{E[\Delta
	x^2]_{t-1} + \varepsilon}}{\sqrt{E[g^2]_t + \varepsilon}} g_t$\;
	Accumulate Updates: $E[\Delta x^2]_t \leftarrow \rho E[\Delta
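For illustration, a NumPy sketch of one such update step is given below; it is not part of this commit, the values of $\rho$ and $\varepsilon$ are commonly used choices rather than taken from the text, and the final application of $\Delta x_t$ to the parameters is the standard ADADELTA step not shown in the excerpt above:
\begin{verbatim}
import numpy as np

def adadelta_step(x, g, Eg2, Edx2, rho=0.95, eps=1e-6):
    """One ADADELTA update; rho and eps are common defaults, not from the text.

    x: parameters, g: gradient at x,
    Eg2 / Edx2: running averages of squared gradients / squared updates.
    """
    Eg2 = rho * Eg2 + (1 - rho) * g**2                   # accumulate gradient
    dx = -np.sqrt(Edx2 + eps) / np.sqrt(Eg2 + eps) * g   # compute update
    Edx2 = rho * Edx2 + (1 - rho) * dx**2                # accumulate updates
    return x + dx, Eg2, Edx2                             # apply update
\end{verbatim}
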
@ -78,7 +78,7 @@
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator*{\po}{\mathbb{P}\text{-}\mathcal{O}}
\DeclareMathOperator*{\equals}{=}
\begin{document}


@ -172,7 +172,6 @@ increased.


\begin{figure}
\pgfplotsset{
compat=1.11,
legend image code/.code={
@ -202,7 +201,6 @@ plot coordinates {
\addlegendentry{\footnotesize{spline}};
\end{axis}
\end{tikzpicture}
\caption{For data of the form $y=\sin(\frac{x+\pi}{2 \pi}) +
\varepsilon,~ \varepsilon \sim \mathcal{N}(0,0.4)$
(\textcolor{blue}{blue dots}) the neural network constructed