From bad8e4263029dc53c82ce0993f4074aa1634cd73 Mon Sep 17 00:00:00 2001
From: Tobias Arndt
Date: Mon, 10 Aug 2020 20:54:02 +0200
Subject: [PATCH] progress

---
 TeX/Plots/SGD_vs_GD.tex            |   1 +
 TeX/Plots/fashion_mnist.tex        |  53 +++++++++++++
 TeX/Plots/gen_dropout.tex          |  79 +++++++++++++++++++
 TeX/Plots/pfg_test.tex             | 122 +++++++++++++++++------
 TeX/Plots/sdg_comparison.tex       |   2 +-
 TeX/further_applications_of_nn.tex |  93 ++++++++++++++++++----
 TeX/introduction_nn.tex            |   2 +-
 TeX/main.tex                       |   3 +
 TeX/theo_3_8.tex                   |   4 +
 9 files changed, 289 insertions(+), 70 deletions(-)
 create mode 100644 TeX/Plots/fashion_mnist.tex
 create mode 100644 TeX/Plots/gen_dropout.tex

diff --git a/TeX/Plots/SGD_vs_GD.tex b/TeX/Plots/SGD_vs_GD.tex
index b6b6e26..d359b19 100644
--- a/TeX/Plots/SGD_vs_GD.tex
+++ b/TeX/Plots/SGD_vs_GD.tex
@@ -80,6 +80,7 @@ plot coordinates {
     \\\cline{1-4}\cline{6-9}
     GD$_{0.01}$&GD$_{0.05}$&GD$_{0.1}$&SGD$_{0.01}$&&GD$_{0.01}$&GD$_{0.05}$&GD$_{0.1}$&SGD$_{0.01}$
     \\\cline{1-4}\cline{6-9}
+    \multicolumn{9}{c}{test}\\
     0.265&0.633&0.203&0.989&&2.267&1.947&3.91&0.032
   \end{tabu}
   \caption{Performance metrics of the networks trained in
diff --git a/TeX/Plots/fashion_mnist.tex b/TeX/Plots/fashion_mnist.tex
new file mode 100644
index 0000000..919ba1a
--- /dev/null
+++ b/TeX/Plots/fashion_mnist.tex
@@ -0,0 +1,53 @@
+\begin{figure}[h]
+  \centering
+  \begin{subfigure}{0.19\textwidth}
+    \includegraphics[width=\textwidth]{Plots/Data/fashion_mnist0.pdf}
+    \caption{T-shirt/top}
+  \end{subfigure}
+  \begin{subfigure}{0.19\textwidth}
+    \includegraphics[width=\textwidth]{Plots/Data/fashion_mnist1.pdf}
+    \caption{Trouser}
+  \end{subfigure}
+  \begin{subfigure}{0.19\textwidth}
+    \includegraphics[width=\textwidth]{Plots/Data/fashion_mnist2.pdf}
+    \caption{Pullover}
+  \end{subfigure}
+  \begin{subfigure}{0.19\textwidth}
+    \includegraphics[width=\textwidth]{Plots/Data/fashion_mnist3.pdf}
+    \caption{Dress}
+  \end{subfigure}
+  \begin{subfigure}{0.19\textwidth}
+    \includegraphics[width=\textwidth]{Plots/Data/fashion_mnist4.pdf}
+    \caption{Coat}
+  \end{subfigure}\\
+  \begin{subfigure}{0.19\textwidth}
+    \includegraphics[width=\textwidth]{Plots/Data/fashion_mnist5.pdf}
+    \caption{Sandal}
+  \end{subfigure}
+  \begin{subfigure}{0.19\textwidth}
+    \includegraphics[width=\textwidth]{Plots/Data/fashion_mnist6.pdf}
+    \caption{Shirt}
+  \end{subfigure}
+  \begin{subfigure}{0.19\textwidth}
+    \includegraphics[width=\textwidth]{Plots/Data/fashion_mnist7.pdf}
+    \caption{Sneaker}
+  \end{subfigure}
+  \begin{subfigure}{0.19\textwidth}
+    \includegraphics[width=\textwidth]{Plots/Data/fashion_mnist8.pdf}
+    \caption{Bag}
+  \end{subfigure}
+  \begin{subfigure}{0.19\textwidth}
+    \includegraphics[width=\textwidth]{Plots/Data/fashion_mnist9.pdf}
+    \caption{Ankle boot}
+  \end{subfigure}
+  \caption{The fashion MNIST data set contains 70.000 preprocessed
+    product images from Zalando, which are categorized as
+    T-shirt/top, Trouser, Pullover, Dress, Coat, Sandal, Shirt,
+    Sneaker, Bag and Ankle boot.
+    Of these images 60.000 are used as training images, while
+    the remaining 10.000 are used to validate the trained models.}
+  \label{fig:MNIST}
+\end{figure}
+%%% Local Variables:
+%%% mode: latex
+%%% TeX-master: "../main"
+%%% End:
diff --git a/TeX/Plots/gen_dropout.tex b/TeX/Plots/gen_dropout.tex
new file mode 100644
index 0000000..d29536d
--- /dev/null
+++ b/TeX/Plots/gen_dropout.tex
@@ -0,0 +1,79 @@
+\pgfplotsset{
+compat=1.11,
+legend image code/.code={
+\draw[mark repeat=2,mark phase=2]
+plot coordinates {
+(0cm,0cm)
+(0.3cm,0cm)         %% default is (0.3cm,0cm)
+(0.6cm,0cm)         %% default is (0.6cm,0cm)
+};%
+}
+}
+\begin{figure}
+  \begin{subfigure}[h]{\textwidth}
+    \begin{tikzpicture}
+      \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed,
+        /pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth,
+        height = 0.6\textwidth, ymin = 0.988, legend style={at={(0.9825,0.0175)},anchor=south east},
+        xlabel = {epoch}, ylabel = {Classification Accuracy}, cycle list/Dark2]
+        \addplot table
+        [x=epoch, y=val_accuracy, col sep=comma, mark = none]
+        {Plots/Data/adam_datagen_full_mean.log};
+        \addplot table
+        [x=epoch, y=val_accuracy, col sep=comma, mark = none]
+        {Plots/Data/adam_datagen_dropout_02_full_mean.log};
+        \addplot table
+        [x=epoch, y=val_accuracy, col sep=comma, mark = none]
+        {Plots/Data/adam_datagen_dropout_04_full_mean.log};
+        \addplot table
+        [x=epoch, y=val_accuracy, col sep=comma, mark = none]
+        {Plots/Data/adam_dropout_02_full_mean.log};
+        \addplot table
+        [x=epoch, y=val_accuracy, col sep=comma, mark = none]
+        {Plots/Data/adam_dropout_04_full_mean.log};
+        \addplot [dashed] table
+        [x=epoch, y=val_accuracy, col sep=comma, mark = none]
+        {Plots/Data/adam_full_mean.log};
+
+        \addlegendentry{\footnotesize{G.}}
+        \addlegendentry{\footnotesize{G. + D. 0.2}}
+        \addlegendentry{\footnotesize{G. + D. 0.4}}
+        \addlegendentry{\footnotesize{D. 0.2}}
+        \addlegendentry{\footnotesize{D. 0.4}}
+        \addlegendentry{\footnotesize{Default}}
+      \end{axis}
+    \end{tikzpicture}
+    \caption{Classification accuracy}
+    \vspace{.25cm}
+  \end{subfigure}
+  \begin{subfigure}[h]{1.0\linewidth}
+    \begin{tabu} to \textwidth {@{} l *6{X[c]} @{}}
+      \multicolumn{7}{c}{Classification Accuracy}\Bstrut
+      \\\hline
+      &\textsc{Adam}&D. 0.2&D. 0.4&G.&G.+D.~0.2&G.+D.~0.4 \Tstrut \Bstrut
+      \\\hline
+      mean&0.9914&0.9918&0.9928&0.9937&0.9938&0.9940 \Tstrut \\
+      max& \\
+      min& \\
+      \multicolumn{7}{c}{Training Accuracy}\Bstrut
+      \\\hline
+      mean&0.9994&0.9990&0.9989&0.9967&0.9954&0.9926 \Tstrut \\
+      max& \\
+      min& \\
+
+    \end{tabu}
+    \caption{Mean, maximum and minimum accuracy after 48 epochs of training.}
+  \end{subfigure}
+  \caption{Accuracy of the network given in ... trained with
+    \textsc{Adam} and with dropout (D.), data generation (G.), a
+    combination of both, or neither (Default). Without data generation
+    the 60.000 training samples were used in each epoch; with data
+    generation each epoch consisted of 10.000 steps, each using a
+    batch of 60 generated data points. For each configuration the
+    model was trained 5 times and the average accuracies at each
+    epoch are given in (a).
Mean, maximum and minimum values of accuracy on + the test and training set are given in (b).} +\end{figure} +%%% Local Variables: +%%% mode: latex +%%% TeX-master: "../main" +%%% End: diff --git a/TeX/Plots/pfg_test.tex b/TeX/Plots/pfg_test.tex index a3ba8e0..d75e7fb 100644 --- a/TeX/Plots/pfg_test.tex +++ b/TeX/Plots/pfg_test.tex @@ -7,6 +7,10 @@ \usepackage{tabu} \usepackage{graphicx} \usetikzlibrary{calc, 3d} +\usepgfplotslibrary{colorbrewer} + +\newcommand\Tstrut{\rule{0pt}{2.6ex}} % = `top' strut +\newcommand\Bstrut{\rule[-0.9ex]{0pt}{0pt}} % = `bottom' strut \begin{document} \pgfplotsset{ @@ -15,71 +19,80 @@ legend image code/.code={ \draw[mark repeat=2,mark phase=2] plot coordinates { (0cm,0cm) -(0.0cm,0cm) %% default is (0.3cm,0cm) -(0.0cm,0cm) %% default is (0.6cm,0cm) +(0.3cm,0cm) %% default is (0.3cm,0cm) +(0.6cm,0cm) %% default is (0.6cm,0cm) };% } } \begin{figure} - \begin{subfigure}[b]{\textwidth} + \begin{subfigure}[h]{\textwidth} \begin{tikzpicture} - \begin{axis}[tick style = {draw = none}, width = \textwidth, - height = 0.7\textwidth, ymin = 0.92, legend style={at={(0.9825,0.75)},anchor=north east}, - xlabel = {epoch}, ylabel = {Classification Accuracy}] + \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed, + /pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth, + height = 0.6\textwidth, ymin = 0.988, legend style={at={(0.9825,0.0175)},anchor=south east}, + xlabel = {epoch}, ylabel = {Classification Accuracy}, cycle list/Dark2] + % \addplot [dashed] table + % [x=epoch, y=accuracy, col sep=comma, mark = none] + % {Data/adam_datagen_full.log}; \addplot table [x=epoch, y=val_accuracy, col sep=comma, mark = none] - {Data/adagrad.log}; + {Data/adam_datagen_full_mean.log}; + % \addplot [dashed] table + % [x=epoch, y=accuracy, col sep=comma, mark = none] + % {Data/adam_datagen_dropout_02_full.log}; \addplot table [x=epoch, y=val_accuracy, col sep=comma, mark = none] - {Data/adadelta.log}; + {Data/adam_datagen_dropout_02_full_mean.log}; \addplot table [x=epoch, y=val_accuracy, col sep=comma, mark = none] - {Data/adam.log}; - - \addlegendentry{\footnotesize{ADAGRAD}} - \addlegendentry{\footnotesize{ADADELTA}} - \addlegendentry{\footnotesize{ADAM}} - \addlegendentry{SGD$_{0.01}$} - \end{axis} - \end{tikzpicture} - %\caption{Classification accuracy} - \end{subfigure} - \begin{subfigure}[b]{\textwidth} - \begin{tikzpicture} - \begin{axis}[tick style = {draw = none}, width = \textwidth, - height = 0.7\textwidth, ymax = 0.5, - xlabel = {epoch}, ylabel = {Error Measure\vphantom{y}},ytick ={0,0.1,0.2,0.3,0.4,0.45,0.5}, yticklabels = - {0,0.1,0.2,0.3,0.4,\phantom{0.94},0.5}] - \addplot table - [x=epoch, y=val_loss, col sep=comma, mark = none] {Data/adagrad.log}; + {Data/adam_datagen_dropout_04_full_mean.log}; \addplot table - [x=epoch, y=val_loss, col sep=comma, mark = none] {Data/adadelta.log}; + [x=epoch, y=val_accuracy, col sep=comma, mark = none] + {Data/adam_dropout_02_full_mean.log}; \addplot table - [x=epoch, y=val_loss, col sep=comma, mark = none] {Data/adam.log}; - - \addlegendentry{\footnotesize{ADAGRAD}} - \addlegendentry{\footnotesize{ADADELTA}} - \addlegendentry{\footnotesize{ADAM}} - \addlegendentry{SGD$_{0.01}$} + [x=epoch, y=val_accuracy, col sep=comma, mark = none] + {Data/adam_dropout_04_full_mean.log}; + \addplot [dashed] table + [x=epoch, y=val_accuracy, col sep=comma, mark = none] + {Data/adam_full_mean.log}; + \addlegendentry{\footnotesize{G.}} + \addlegendentry{\footnotesize{G. + D. 
0.2}}
+    \addlegendentry{\footnotesize{G. + D. 0.4}}
+    \addlegendentry{\footnotesize{D. 0.2}}
+    \addlegendentry{\footnotesize{D. 0.4}}
+    \addlegendentry{\footnotesize{Default}}
     \end{axis}
   \end{tikzpicture}
-  \caption{Performance metrics during training}
-  \end{subfigure}
-  \\~\\
-  \begin{subfigure}[b]{1.0\linewidth}
-  \begin{tabu} to \textwidth {@{} *3{X[c]}c*3{X[c]} @{}}
-    \multicolumn{3}{c}{Classification Accuracy}
-    &~&\multicolumn{3}{c}{Error Measure}
-    \\\cline{1-3}\cline{5-7}
-    ADAGRAD&ADADELTA&ADAM&&ADAGRAD&ADADELTA&ADAM
-    \\\cline{1-3}\cline{5-7}
-    1&1&1&&1&1&1
+    \caption{Classification accuracy}
+    \vspace{.25cm}
+  \end{subfigure}
+  \begin{subfigure}[h]{1.0\linewidth}
+    \begin{tabu} to \textwidth {@{} l *6{X[c]} @{}}
+      \multicolumn{7}{c}{Classification Accuracy}\Bstrut
+      \\\hline
+      &\textsc{Adam}&D. 0.2&D. 0.4&G.&G.+D.~0.2&G.+D.~0.4 \Tstrut \Bstrut
+      \\\hline
+      mean&0.9914&0.9918&0.9928&0.9937&0.9938&0.9940 \Tstrut \\
+      max& \\
+      min& \\
+      \multicolumn{7}{c}{Training Accuracy}\Bstrut
+      \\\hline
+      mean&0.9994&0.9990&0.9989&0.9967&0.9954&0.9926 \Tstrut \\
+      max& \\
+      min& \\
+
     \end{tabu}
-    \caption{Performace metrics after 20 epochs}
-  \end{subfigure}
-  \caption{Performance metrics of the network given in ... trained
-    with different optimization algorithms}
+    \caption{Mean, maximum and minimum accuracy after 48 epochs of training.}
+  \end{subfigure}
+  \caption{Accuracy of the network given in ... trained with
+    \textsc{Adam} and with dropout (D.), data generation (G.), a
+    combination of both, or neither (Default). Without data generation
+    the 60.000 training samples were used in each epoch; with data
+    generation each epoch consisted of 10.000 steps, each using a
+    batch of 60 generated data points. For each configuration the
+    model was trained 5 times and the average accuracies at each
+    epoch are given in (a). Mean, maximum and minimum values of
+    accuracy on the test and training set are given in (b).}
 \end{figure}
 
 \begin{center}
@@ -87,18 +100,23 @@ plot coordinates {
     \centering
     \begin{subfigure}{0.19\textwidth}
       \includegraphics[width=\textwidth]{Data/mnist0.pdf}
+      \caption{original\\image}
     \end{subfigure}
     \begin{subfigure}{0.19\textwidth}
-      \includegraphics[width=\textwidth]{Data/mnist1.pdf}
+      \includegraphics[width=\textwidth]{Data/mnist_gen_zoom.pdf}
+      \caption{random\\zoom}
     \end{subfigure}
     \begin{subfigure}{0.19\textwidth}
-      \includegraphics[width=\textwidth]{Data/mnist2.pdf}
+      \includegraphics[width=\textwidth]{Data/mnist_gen_shear.pdf}
+      \caption{random\\shear}
     \end{subfigure}
     \begin{subfigure}{0.19\textwidth}
-      \includegraphics[width=\textwidth]{Data/mnist3.pdf}
+      \includegraphics[width=\textwidth]{Data/mnist_gen_rotation.pdf}
+      \caption{random\\rotation}
     \end{subfigure}
     \begin{subfigure}{0.19\textwidth}
-      \includegraphics[width=\textwidth]{Data/mnist4.pdf}
+      \includegraphics[width=\textwidth]{Data/mnist_gen_shift.pdf}
+      \caption{random\\positional shift}
     \end{subfigure}\\
     \begin{subfigure}{0.19\textwidth}
       \includegraphics[width=\textwidth]{Data/mnist5.pdf}
diff --git a/TeX/Plots/sdg_comparison.tex b/TeX/Plots/sdg_comparison.tex
index c42ffc4..7c0877f 100644
--- a/TeX/Plots/sdg_comparison.tex
+++ b/TeX/Plots/sdg_comparison.tex
@@ -67,7 +67,7 @@ plot coordinates {
     \end{tabu}
     \caption{Performance metrics after 20 epochs}
   \end{subfigure}
-  \caption{Performance metrics of the network given in ... trained
+  \caption{Classification accuracy on the test set and ... Performance metrics of the network given in ... trained
     with different optimization algorithms}
 \end{figure}
 %%% Local Variables:
diff --git a/TeX/further_applications_of_nn.tex b/TeX/further_applications_of_nn.tex
index 08cf424..422df1b 100644
--- a/TeX/further_applications_of_nn.tex
+++ b/TeX/further_applications_of_nn.tex
@@ -450,7 +450,7 @@ $\gamma$ is divided by the sum of the squares of the past partial
 derivatives in this parameter. This results in a monotonically
 decreasing learning rate for each parameter. The decay is faster
 for parameters with large updates, whereas
-parameters with small updates experience smaller decay. The ADAGRAD
+parameters with small updates experience smaller decay. The \textsc{AdaGrad}
 algorithm is given in Algorithm~\ref{alg:ADAGRAD}.
 
 \begin{algorithm}[H]
@@ -465,15 +465,15 @@ algorithm is given in Algorithm~\ref{alg:ADAGRAD}.
   1, \dots,p$\;
   Apply Update: $x_{t+1} \leftarrow x_t + \Delta x_t$\;
  }
-  \caption{\textls{ADAGRAD}}
+  \caption{\textls{\textsc{AdaGrad}}}
   \label{alg:ADAGRAD}
 \end{algorithm}
 
-Building on \textsc{AdaGrad} \textcite{ADADELTA} developed the ... (ADADELTA)
-in order to improve upon the two main drawbacks of ADAGRAD, being the
+Building on \textsc{AdaGrad}, \textcite{ADADELTA} developed the ... (\textsc{AdaDelta})
+in order to improve upon the two main drawbacks of \textsc{AdaGrad}, being the
 continual decay of the learning rate and the need for a manually
 selected global learning rate $\gamma$.
-As ADAGRAD accumulates the squared gradients the learning rate will
+As \textsc{AdaGrad} divides by the accumulated squared gradients, the learning rate will
 eventually become vanishingly small.
 In order to ensure that learning continues to make progress even
 after a significant number of iterations, instead of summing the gradients a
@@ -500,7 +500,7 @@ by these of the parameter update $\Delta x_t$. This proper
   x^2]_{t-1} + (1-p)\Delta x_t^2$\;
   Apply Update: $x_{t+1} \leftarrow x_t + \Delta x_t$\;
  }
-  \caption{ADADELTA, \textcite{ADADELTA}}
+  \caption{\textsc{AdaDelta}, \textcite{ADADELTA}}
   \label{alg:gd}
 \end{algorithm}
 
@@ -520,11 +520,11 @@ of the marble.
 This results in the algorithm being able to escape ... due to the
 built-up momentum from approaching it.
 
-\begin{itemize}
-  \item ADAM
-  \item momentum
-  \item ADADETLA \textcite{ADADELTA}
-\end{itemize}
+% \begin{itemize}
+%   \item ADAM
+%   \item momentum
+%   \item ADADETLA \textcite{ADADELTA}
+% \end{itemize}
 
 
 \begin{algorithm}[H]
@@ -665,7 +665,37 @@ When using this one has to be sure that the labels indeed remain the
 same or else the network will not learn the desired ...
 In the case of handwritten digits for example a too high rotation angle
 will ... a nine or six.
-The most common transformations are rotation, zoom, shear, brightness, mirroring.
+The most common transformations are rotation, zoom, shear, brightness,
+mirroring.
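+
+As an illustration, such transformations can be generated on the fly
+during training. The following sketch assumes the Keras
+\texttt{ImageDataGenerator} API and uses illustrative parameter
+values, not the ones used for the experiments below.
+\begin{verbatim}
+from tensorflow.keras.preprocessing.image import ImageDataGenerator
+
+# each parameter enables one of the random transformations above
+datagen = ImageDataGenerator(
+    rotation_range=15,       # random rotation (degrees)
+    zoom_range=0.1,          # random zoom
+    shear_range=0.2,         # random shear
+    width_shift_range=0.1,   # random positional shift (horizontal)
+    height_shift_range=0.1)  # random positional shift (vertical)
+
+# model.fit(datagen.flow(x_train, y_train, batch_size=60),
+#           steps_per_epoch=10000, epochs=...)
+\end{verbatim}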
+
+\begin{figure}[h]
+  \centering
+  \begin{subfigure}{0.19\textwidth}
+    \includegraphics[width=\textwidth]{Plots/Data/mnist0.pdf}
+    \caption{original\\image}
+  \end{subfigure}
+  \begin{subfigure}{0.19\textwidth}
+    \includegraphics[width=\textwidth]{Plots/Data/mnist_gen_zoom.pdf}
+    \caption{random\\zoom}
+  \end{subfigure}
+  \begin{subfigure}{0.19\textwidth}
+    \includegraphics[width=\textwidth]{Plots/Data/mnist_gen_shear.pdf}
+    \caption{random\\shear}
+  \end{subfigure}
+  \begin{subfigure}{0.19\textwidth}
+    \includegraphics[width=\textwidth]{Plots/Data/mnist_gen_rotation.pdf}
+    \caption{random\\rotation}
+  \end{subfigure}
+  \begin{subfigure}{0.19\textwidth}
+    \includegraphics[width=\textwidth]{Plots/Data/mnist_gen_shift.pdf}
+    \caption{random\\positional shift}
+  \end{subfigure}
+  \caption{Examples of the manipulations used in ... As all images
+    are of the same intensity, brightness manipulation does not seem
+    ... Additionally mirroring is not used for ... reasons.}
+\end{figure}
+
+\input{Plots/gen_dropout.tex}
 
 \todo{Comparison of different dropout sizes on MNIST or similar, subset as
   training set?}
@@ -674,10 +704,41 @@ training set?}
 
 For some applications (medical problems with a small number of patients)
 the available data can be highly limited.
-In order to get a understanding for the achievable accuracy for such a
-scenario in the following we examine the ... and .. with a highly
-reduced training set and the impact the above mentioned strategies on
-combating overfitting have.
+In these problems the networks are highly ... for overfitting the
+data. In order to get an understanding of the achievable accuracies
+and the impact of the measures to prevent overfitting discussed
+above, we train the network on datasets of varying sizes.
+First we use the MNIST handwriting dataset and then a slightly harder
+problem given by the fashion MNIST dataset, which contains preprocessed
+pictures of clothes from 10 different categories.
+
+\input{Plots/fashion_mnist.tex}
+
+For training, a certain number of random datapoints per class is
+chosen to train the network. The sizes chosen are:
+\begin{itemize}
+  \item the full dataset (... per class),
+  \item 1000 per class,
+  \item 100 per class,
+  \item 10 per class.
+\end{itemize}
+
+The results for training ... are given in ... Here it can be seen
+that ...
+
+\begin{figure}[h]
+  \centering
+  \missingfigure{datagen digits}
+  \caption{Sample pictures of the MNIST handwriting dataset, one per
+    class.}
+  \label{mnist digits}
+\end{figure}
+
+\begin{figure}[h]
+  \centering
+  \missingfigure{datagen fashion}
+  \caption{Sample pictures of the fashion MNIST dataset, one per
+    class.}
+  \label{mnist fashion}
+\end{figure}
+
 \clearpage
 \section{Bla}
diff --git a/TeX/introduction_nn.tex b/TeX/introduction_nn.tex
index fbddced..f44ddd2 100644
--- a/TeX/introduction_nn.tex
+++ b/TeX/introduction_nn.tex
@@ -295,7 +295,7 @@ interpretation. Commonly the nodes in the output layer each correspond
 to a class and the class chosen as prediction is the one with the
 highest value at the corresponding output node.
-The naive transformation to achieve this is transforming the output
+This corresponds to a transformation of the output
 vector $o$ into a one-hot vector
 \[
 \text{pred}_i =
diff --git a/TeX/main.tex b/TeX/main.tex
index 4b3ae6b..4123b97 100644
--- a/TeX/main.tex
+++ b/TeX/main.tex
@@ -92,6 +92,9 @@
 
 \newcommand{\abs}[1]{\ensuremath{\left\vert#1\right\vert}}
 
+\newcommand\Tstrut{\rule{0pt}{2.6ex}}         % = `top' strut
+\newcommand\Bstrut{\rule[-0.9ex]{0pt}{0pt}}   % = `bottom' strut
+
 \SetKwInput{KwInput}{Input}
 
 %\newcommand{\myrightarrow}[1]{\xrightarrow{\makebox[2em][c]{$\scriptstyle#1$}}}
diff --git a/TeX/theo_3_8.tex b/TeX/theo_3_8.tex
index c35df25..caadc46 100644
--- a/TeX/theo_3_8.tex
+++ b/TeX/theo_3_8.tex
@@ -6,6 +6,10 @@
 %%% End:
 
 \section{Shallow Neural Networks}
+In order to get some understanding of the behavior of neural
+networks, we study a simplified class of networks in this chapter,
+the so-called shallow neural networks. These networks consist of a
+single hidden layer and have a single output node.
 In order to examine some behavior of neural networks in this chapter
 we consider a simple class of networks, the shallow ones. These
 networks only contain one hidden layer and have a single output node.
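+As a sketch of the notation (the exact definitions introduced later
+in this chapter may differ), such a network with input dimension $d$,
+hidden layer size $n$ and activation function $\sigma$ can be written
+as
+\[
+  \mathcal{N}(x) = c + \sum_{k=1}^{n} w_k \,\sigma\big(\langle v_k, x\rangle + b_k\big),
+\]
+with parameters $v_k \in \mathbb{R}^d$ and $b_k, w_k, c \in \mathbb{R}$.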