From 1a45e7d59608a42728773c771e8cec8f9f44e845 Mon Sep 17 00:00:00 2001
From: Tobias Arndt <tobias@arndts-online.de>
Date: Sat, 8 Aug 2020 12:39:03 +0200
Subject: [PATCH] progress

---
 TeX/Plots/pfg_test.tex             |  62 +++++++++--
 TeX/further_applications_of_nn.tex | 158 ++++++++++++++++++++---------
 TeX/introduction_nn.tex            | 114 ++++++++++++++++++---
 TeX/theo_3_8.tex                   |   2 +-
 4 files changed, 267 insertions(+), 69 deletions(-)

diff --git a/TeX/Plots/pfg_test.tex b/TeX/Plots/pfg_test.tex
index 92a2917..a3ba8e0 100644
--- a/TeX/Plots/pfg_test.tex
+++ b/TeX/Plots/pfg_test.tex
@@ -1,4 +1,4 @@
-\documentclass{article}
+\documentclass[a4paper, 12pt, draft=true]{article}
 \usepackage{pgfplots}
 \usepackage{filecontents}
 \usepackage{subcaption}
@@ -78,12 +78,8 @@ plot coordinates {
     \end{tabu}
     \caption{Performace metrics after 20 epochs}
   \end{subfigure}
-  \caption{The neural network given in ?? trained with different
-    algorithms on the MNIST handwritten digits data set. For gradient
-    descent the learning rated 0.01, 0.05 and 0.1 are (GD$_{
-        rate}$). For
-    stochastic gradient descend a batch size of 32 and learning rate
-    of 0.01 is used (SDG$_{0.01}$)}
+  \caption{Performance metrics of the network given in ... trained
+    with different optimization algorithms}
 \end{figure}
 
 \begin{center}
@@ -147,6 +143,58 @@ plot coordinates {
     left}
 \end{figure}
 
+\begin{figure}
+  \centering
+  \begin{subfigure}{.45\linewidth}
+    \centering
+    \begin{tikzpicture}
+      \begin{axis}[enlargelimits=false, ymin=0, ymax = 1, width=\textwidth]
+        \addplot [domain=-5:5, samples=101,unbounded coords=jump]{1/(1+exp(-x))};
+      \end{axis}
+    \end{tikzpicture}
+  \end{subfigure}
+  \begin{subfigure}{.45\linewidth}
+    \centering
+    \begin{tikzpicture}
+      \begin{axis}[enlargelimits=false, width=\textwidth]
+        \addplot[domain=-5:5, samples=100]{tanh(x)};
+      \end{axis}
+    \end{tikzpicture}
+  \end{subfigure}
+  \begin{subfigure}{.45\linewidth}
+    \centering
+    \begin{tikzpicture}
+      \begin{axis}[enlargelimits=false, width=\textwidth,
+        ytick={0,2,4},yticklabels={\hphantom{4.}0,2,4}, ymin=-1]
+        \addplot[domain=-5:5, samples=100]{max(0,x)};
+      \end{axis}
+    \end{tikzpicture}
+  \end{subfigure}
+  \begin{subfigure}{.45\linewidth}
+    \centering
+    \begin{tikzpicture}
+      \begin{axis}[enlargelimits=false, width=\textwidth, ymin=-1,
+        ytick={0,2,4},yticklabels={$\hphantom{-5.}0$,2,4}]
+        \addplot[domain=-5:5, samples=100]{max(0,x)+ 0.1*min(0,x)};
+      \end{axis}
+    \end{tikzpicture}
+  \end{subfigure}
+\end{figure}
+
+
+\begin{tikzpicture}
+\begin{axis}[enlargelimits=false]
+\addplot [domain=-5:5, samples=101,unbounded coords=jump]{1/(1+exp(-x))};
+\addplot[domain=-5:5, samples=100]{tanh(x)};
+\addplot[domain=-5:5, samples=100]{max(0,x)};
+\end{axis}
+\end{tikzpicture}
+
+\begin{tikzpicture}
+\begin{axis}[enlargelimits=false]
+\addplot[domain=-2*pi:2*pi, samples=100]{cos(deg(x))};
+\end{axis}
+\end{tikzpicture}
 
 \end{document}
 
diff --git a/TeX/further_applications_of_nn.tex b/TeX/further_applications_of_nn.tex
index 0eb1bca..08cf424 100644
--- a/TeX/further_applications_of_nn.tex
+++ b/TeX/further_applications_of_nn.tex
@@ -150,7 +150,7 @@ wise. Examples of convolution with both kernels are given in Figure~\ref{fig:img
   \begin{subfigure}{0.3\textwidth}
     \centering
     \includegraphics[width=\textwidth]{Plots/Data/image_conv9.png}
-    \caption{Gaussian Blur $\sigma^2 = 1$}
+    \caption{\hspace{-2pt}Gaussian Blur $\sigma^2 = 1$}
   \end{subfigure}
   \begin{subfigure}{0.3\textwidth}
     \centering
@@ -383,15 +383,22 @@ network using true gradients when training for the same mount of time.
 \input{Plots/SGD_vs_GD.tex}
 \clearpage
 \subsection{\titlecap{modified stochastic gradient descent}}
-There is a inherent problem in the sensitivity of the gradient descent
-algorithm regarding the learning rate $\gamma$.
-The difficulty of choosing the learning rate can be seen
-in Figure~\ref{sgd_vs_gd}. For small rates the progress in each iteration is small
-but as the rate is enlarged the algorithm can become unstable and
-diverge. Even for learning rates small enough to ensure the parameters
-do not diverge to infinity steep valleys can hinder the progress of
-the algorithm as with to large leaning rates gradient descent
-``bounces between'' the walls of the valley rather then follow a
+An inherent problem of the stochastic gradient descent algorithm is
+its sensitivity to the learning rate $\gamma$. This results in the
+problem of having to find an appropriate learning rate for each problem,
+which is largely guesswork. The impact of choosing a bad learning rate
+can be seen in Figure~\ref{fig:sgd_vs_gd}.
+% There is a inherent problem in the sensitivity of the gradient descent
+% algorithm regarding the learning rate $\gamma$.
+% The difficulty of choosing the learning rate can be seen
+% in Figure~\ref{sgd_vs_gd}.
+For small rates the progress in each iteration is small
+but as the rate is enlarged the algorithm can become unstable and the parameters
+diverge to infinity. Even for learning rates small enough to ensure the parameters
+do not diverge to infinity, steep valleys in the function to be
+minimized can hinder the progress of
+the algorithm, as for learning rates that are not small enough gradient descent
+``bounces between'' the walls of the valley rather than following a
 downward trend in the valley.
 
 % \[
@@ -403,7 +410,8 @@ downward trend in the valley.
 
 To combat this problem \todo{quelle} propose to alter the learning
 rate over the course of training, often called leaning rate
-scheduling. The most popular implementations of this are time based
+scheduling in order to decrease the learning rate over the course of
+training. The most popular implementations of this are time based
 decay
 \[
   \gamma_{n+1} = \frac{\gamma_n}{1 + d n},
@@ -414,12 +422,12 @@ epochs and then decreased according to parameter $d$
 \[
   \gamma_n = \gamma_0 d^{\text{floor}{\frac{n+1}{r}}}
 \]
-and exponential decay, where the learning rate is decreased after each epoch,
+and exponential decay where the learning rate is decreased after each epoch
 \[
   \gamma_n = \gamma_o e^{-n d}.
 \]
-These methods are able to increase the accuracy of a model by a large
-margin as seen in the training of RESnet by \textcite{resnet}.
+These methods are able to increase the accuracy of a model by large
+margins as seen in the training of RESnet by \textcite{resnet}.
 \todo{vielleicht grafik
   einbauen}
 However stochastic gradient descent with weight decay is
@@ -500,9 +508,9 @@ While the stochastic gradient algorithm is less susceptible to local
 extrema than gradient descent the problem still persists especially
 with saddle points. \textcite{DBLP:journals/corr/Dauphinpgcgb14}
 
-A approach to the problem of ``getting stuck'' in saddle point or
+An approach to the problem of ``getting stuck'' in saddle point or
 local minima/maxima is the addition of momentum to SDG. Instead of
-using the actual gradient for the parameter update a average over the
+using the actual gradient for the parameter update an average over the
 past gradients is used. In order to avoid the need to SAVE the past
 values usually a exponentially decaying average is used resulting in
 Algorithm~\ref{alg_momentum}. This is comparable of following the path
@@ -534,6 +542,10 @@ build up momentum from approaching it.
   \label{alg:gd}
 \end{algorithm}
 
+In an effort to combine the properties of the momentum method and the
+automatically adapted learning rate of \textsc{AdaDelta}, \textcite{ADAM}
+developed the \textsc{Adam} algorithm. The 
+
 Problems / Improvements ADAM \textcite{rADAM}
 
 
@@ -541,11 +553,14 @@ Problems / Improvements ADAM \textcite{rADAM}
   \SetAlgoLined
   \KwInput{Stepsize $\alpha$}
   \KwInput{Decay Parameters $\beta_1$, $\beta_2$}
-  Initialize accumulation variables $E[g^2]_0 = 0, E[\Delta x^2]_0 =0$\;
+  Initialize accumulation variables $m_0 = 0$, $v_0 = 0$\;
   \For{$t \in \left\{1,\dots,T\right\};\, t+1$}{
     Compute Gradient: $g_t$\;
-    Accumulate Gradient: $[E[g^2]_t \leftarrow \rho D[g^2]_{t-1} +
-    (1-\rho)g_t^2$\;
+    Accumulate first and second Moment of the Gradient:
+    \begin{align*}
+      m_t &\leftarrow \beta_1 m_{t-1} + (1 - \beta_1) g_t \\
+      v_t &\leftarrow \beta_2 v_{t-1} + (1 - \beta_2) g_t^2\;
+    \end{align*}
     Compute Update: $\Delta x_t \leftarrow -\frac{\sqrt{E[\Delta
         x^2]_{t-1} + \varepsilon}}{\sqrt{E[g^2]_t + \varepsilon}} g_t$\;
     Accumulate Updates: $E[\Delta x^2]_t \leftarrow \rho E[\Delta
@@ -589,41 +604,88 @@ There are two approaches to introduce noise to the model during
 learning, either by manipulating the model it self or by manipulating
 the input data.
 \subsubsection{Dropout}
-If a neural network has enough hidden nodes to model a training set
-accuratly 
-Similarly to decision trees and random forests training multiple
-models on the same task and averaging the predictions can improve the
-results and combat overfitting. However training a very large
-number of neural networks is computationally expensive in training
-as well as testing. In order to make this approach feasible
-\textcite{Dropout1} introduced random dropout.
-Here for each training iteration from a before specified (sub)set of nodes
-randomly chosen ones are deactivated (their output is fixed to 0).
-During training 
-Instead of using different models and averaging them randomly
-deactivated nodes are used to simulate different networks which all
-share the same weights for present nodes.
-
-
-
-A simple but effective way to introduce noise to the model is by
-deactivating randomly chosen nodes in a layer 
-The way noise is introduced into
-the model is by deactivating certain nodes (setting the output of the
-node to 0) in the fully connected layers of the convolutional neural
-networks. The nodes are chosen at random and change in every
-iteration, this practice is called Dropout and was introduced by
-\textcite{Dropout}.
+If a neural network has enough hidden nodes there will be sets of
+weights that accurately fit the training set (proof for a small
+scenario given in ...). This especially occurs when the relation
+between the input and output is highly complex, which requires a large
+network to model, and the training set is limited in size (vgl cnn
+wenig bilder). However, each of these sets of weights will result in different
+predictions for a test set and all of them will perform worse on the
+test data than the training data. A way to improve the predictions and
+reduce the overfitting would
+be to train a large number of networks and average their results (vgl
+random forests); however, this is often computationally not feasible in
+training as well as testing.
+% Similarly to decision trees and random forests training multiple
+% models on the same task and averaging the predictions can improve the
+% results and combat overfitting. However training a very large
+% number of neural networks is computationally expensive in training
+%as well as testing.
+In order to make this approach feasible
+\textcite{Dropout1} propose random dropout.
+Instead of training different models, for each data point in a batch
+randomly chosen nodes in the network are disabled (their output is
+fixed to zero) and the updates for the weights in the remaining
+smaller network are computed. The updates computed for each data
+point in the batch are then accumulated and applied to the full
+network.
+This can be compared to many small networks, which share their weights
+for their active neurons, being trained simultaneously.
+For testing the ``mean network'' with all nodes active but their
+output scaled accordingly to compensate for more active nodes is
+used. \todo{comparable to averaging dropout networks, beispiel für
+  besser in kleinem fall}
+% Here for each training iteration from a before specified (sub)set of nodes
+% randomly chosen ones are deactivated (their output is fixed to 0).
+% During training 
+% Instead of using different models and averaging them randomly
+% deactivated nodes are used to simulate different networks which all
+% share the same weights for present nodes.
+
+
+
+% A simple but effective way to introduce noise to the model is by
+% deactivating randomly chosen nodes in a layer 
+% The way noise is introduced into
+% the model is by deactivating certain nodes (setting the output of the
+% node to 0) in the fully connected layers of the convolutional neural
+% networks. The nodes are chosen at random and change in every
+% iteration, this practice is called Dropout and was introduced by
+% \textcite{Dropout}.
+
+\subsubsection{\titlecap{manipulation of input data}}
+Another way to combat overfitting is to keep the network from learning
+the dataset by manipulating the inputs randomly for each iteration of
+training. This is commonly used in image based tasks as there are
+often ways to manipulate the input while still being sure the labels
+remain the same. For example in an image classification task such as
+handwritten digits the associated label should remain right when the
+image is rotated or stretched by a small amount.
+When using this one has to be sure that the labels indeed remain the
+same or else the network will not learn the desired ...
+In the case of handwritten digits, for example, a too high rotation angle
+will ... a nine or six.
+The most common transformations are rotation, zoom, shear, brightness, mirroring.
 
 \todo{Vergleich verschiedene dropout größen auf MNSIT o.ä., subset als
 training set?}
 
-\subsubsection{Effectivety for small training sets}
+\subsubsection{\titlecap{effectiveness for small training sets}}
 
 For some applications (medical problems with small amount of patients)
-the available data can be highly limited. In the following the impact
-on highly reduced training sets has been ... for ... and the results
-are given in Figure ...
+the available data can be highly limited.
+In order to get an understanding of the achievable accuracy for such a
+scenario in the following we examine the ... and  .. with a highly
+reduced training set and the impact the above mentioned strategies on
+combating overfitting have.
+
+\clearpage
+\section{Bla}
+\begin{itemize}
+  \item generate more data, GAN etc
+  \item Transfer learning, use network trained on different task and
+  repurpose it / train it with the training data
+\end{itemize}
 
 %%% Local Variables:
 %%% mode: latex
diff --git a/TeX/introduction_nn.tex b/TeX/introduction_nn.tex
index 19d3f76..fbddced 100644
--- a/TeX/introduction_nn.tex
+++ b/TeX/introduction_nn.tex
@@ -87,6 +87,93 @@ except for the input layer, which recieves the components of the input.
 
 \subsection{Nonlinearity of Neural Networks}
 
+The arguably most important feature of neural networks that sets them
+apart from linear models is the activation function implemented in the
+neurons. As seen in Figure~\ref{fig:neuron} on the weighted sum of the
+inputs an activation function $\sigma$ is applied in order to obtain
+the output resulting in the output being given by
+\[
+  o_k = \sigma\left(b_k + \sum_{j=1}^m w_{k,j} i_j\right).
+\]
+The activation function is usually chosen nonlinear (a linear one
+would result in the entire model collapsing into a linear one) which
+allows it to better model data (beispiel satz ...).
+There are two types of activation functions, saturating and not
+saturating ones. Popular examples for the former are sigmoid
+functions where most commonly the standard logistic function or tanh are used
+as they have easy to compute derivatives which is ... for gradient
+based optimization algorithms. The standard logistic function (often
+referred to simply as sigmoid function) is given by
+\[
+  f(x) = \frac{1}{1+e^{-x}}
+\]
+and has a range of $(0,1)$. Its usage as an activation function is
+motivated by modeling neurons which
+are close to deactive until a certain threshold where they grow in
+intensity until they are fully
+active, which is similar to the behavior of neurons in brains
+\todo{besser schreiben}. The tanh function is given by
+\[
+  \tanh(x) = \frac{e^{2x} - 1}{e^{2x} + 1}.
+\]
+
+The downside of these saturating activation functions is that given
+their ... their derivatives are close to zero for large or small
+input values which can ... the ... of gradient based methods.
+
+The nonsaturating activation functions commonly used are the rectified
+linear unit (ReLU) or the leaky ReLU. The ReLU is given by
+\[
+  r(x) = \max\left\{0, x\right\}.
+\]
+This has the benefit of having a constant derivative for values larger
+than zero. However the derivative being zero ... . The leaky ReLU is
+an attempt to counteract this problem by assigning a small constant
+derivative to all values smaller than zero and for scalar $\alpha$ is given by
+\[
+  l(x) = \max\left\{0, x\right\} + \alpha \min\left\{0, x\right\}.
+\]
+In order to illustrate these functions plots of them are given in Figure~\ref{fig:activation}.
+
+\begin{figure}
+  \centering
+  \begin{subfigure}{.45\linewidth}
+    \centering
+    \begin{tikzpicture}
+      \begin{axis}[enlargelimits=false, ymin=0, ymax = 1, width=\textwidth]
+        \addplot [domain=-5:5, samples=101,unbounded coords=jump]{1/(1+exp(-x))};
+      \end{axis}
+    \end{tikzpicture}
+  \end{subfigure}
+  \begin{subfigure}{.45\linewidth}
+    \centering
+    \begin{tikzpicture}
+      \begin{axis}[enlargelimits=false, width=\textwidth]
+        \addplot[domain=-5:5, samples=100]{tanh(x)};
+      \end{axis}
+    \end{tikzpicture}
+  \end{subfigure}
+  \begin{subfigure}{.45\linewidth}
+    \centering
+    \begin{tikzpicture}
+      \begin{axis}[enlargelimits=false, width=\textwidth,
+        ytick={0,2,4},yticklabels={\hphantom{4.}0,2,4}, ymin=-1]
+        \addplot[domain=-5:5, samples=100]{max(0,x)};
+      \end{axis}
+    \end{tikzpicture}
+  \end{subfigure}
+  \begin{subfigure}{.45\linewidth}
+    \centering
+    \begin{tikzpicture}
+      \begin{axis}[enlargelimits=false, width=\textwidth, ymin=-1,
+        ytick={0,2,4},yticklabels={$\hphantom{-5.}0$,2,4}]
+        \addplot[domain=-5:5, samples=100]{max(0,x)+ 0.1*min(0,x)};
+      \end{axis}
+    \end{tikzpicture}
+  \end{subfigure}
+  \caption{Plots of the activation functions...}
+  \label{fig:activation}
+\end{figure}
 
 
 \begin{figure}
@@ -173,6 +260,7 @@ except for the input layer, which recieves the components of the input.
 
   \end{tikzpicture}
   \caption{Structure of a single neuron}
+  \label{fig:neuron}
 \end{figure}
 
 \clearpage
@@ -345,19 +433,19 @@ large in networks with multiple layers of high neuron count naively
 computing these can get quite memory and computational expensive. But
 by using the chain rule and exploiting the layered structure we can
 compute the gradient much more efficiently by using backpropagation
-first introduced by \textcite{backprop}.
-
-\subsubsection{Backpropagation}
-
-As with an increasing amount of layers the derivative of a loss
-function with respect to a certain variable becomes more intensive to
-compute there have been efforts in increasing the efficiency of
-computing these derivatives. Today the BACKPROPAGATION algorithm is
-widely used to compute the derivatives needed for the optimization
-algorithms. Here instead of naively calculating the derivative for
-each variable, the chain rule is used in order to compute derivatives
-for each layer from output layer towards the first layer while only
-needing to ....
+introduced by \textcite{backprop}. 
+
+% \subsubsection{Backpropagation}
+
+% As with an increasing amount of layers the derivative of a loss
+% function with respect to a certain variable becomes more intensive to
+% compute there have been efforts in increasing the efficiency of
+% computing these derivatives. Today the BACKPROPAGATION algorithm is
+% widely used to compute the derivatives needed for the optimization
+% algorithms. Here instead of naively calculating the derivative for
+% each variable, the chain rule is used in order to compute derivatives
+% for each layer from output layer towards the first layer while only
+% needing to ....
 
 \[
   \frac{\partial L(...)}{}
diff --git a/TeX/theo_3_8.tex b/TeX/theo_3_8.tex
index d91878f..c35df25 100644
--- a/TeX/theo_3_8.tex
+++ b/TeX/theo_3_8.tex
@@ -220,7 +220,7 @@ plot coordinates {
   Networks} 
 
 
-In this section we will analyze the connection of randomized shallow
+This section is based on \textcite{heiss2019}. We will analyze the connection of randomized shallow
 Neural Networks with one dimensional input and regression splines. We
 will see that the punishment of the size of the weights in training
 the randomized shallow