From bad8e4263029dc53c82ce0993f4074aa1634cd73 Mon Sep 17 00:00:00 2001
From: Tobias Arndt
Date: Mon, 10 Aug 2020 20:54:02 +0200
Subject: [PATCH] progress

---
 TeX/Plots/SGD_vs_GD.tex            |   1 +
 TeX/Plots/fashion_mnist.tex        |  53 +++++++++++++
 TeX/Plots/gen_dropout.tex          |  79 +++++++++++++++++++
 TeX/Plots/pfg_test.tex             | 122 +++++++++++++++++------
 TeX/Plots/sdg_comparison.tex       |   2 +-
 TeX/further_applications_of_nn.tex |  93 ++++++++++++++++++----
 TeX/introduction_nn.tex            |   2 +-
 TeX/main.tex                       |   3 +
 TeX/theo_3_8.tex                   |   4 +
 9 files changed, 289 insertions(+), 70 deletions(-)
 create mode 100644 TeX/Plots/fashion_mnist.tex
 create mode 100644 TeX/Plots/gen_dropout.tex

diff --git a/TeX/Plots/SGD_vs_GD.tex b/TeX/Plots/SGD_vs_GD.tex
index b6b6e26..d359b19 100644
--- a/TeX/Plots/SGD_vs_GD.tex
+++ b/TeX/Plots/SGD_vs_GD.tex
@@ -80,6 +80,7 @@ plot coordinates {
     \\\cline{1-4}\cline{6-9}
     GD$_{0.01}$&GD$_{0.05}$&GD$_{0.1}$&SGD$_{0.01}$&&GD$_{0.01}$&GD$_{0.05}$&GD$_{0.1}$&SGD$_{0.01}$
     \\\cline{1-4}\cline{6-9}
+    \multicolumn{9}{c}{test}\\
     0.265&0.633&0.203&0.989&&2.267&1.947&3.91&0.032
   \end{tabu}
   \caption{Performance metrics of the networks trained in
diff --git a/TeX/Plots/fashion_mnist.tex b/TeX/Plots/fashion_mnist.tex
new file mode 100644
index 0000000..919ba1a
--- /dev/null
+++ b/TeX/Plots/fashion_mnist.tex
@@ -0,0 +1,53 @@
+\begin{figure}[h]
+  \centering
+  \begin{subfigure}{0.19\textwidth}
+    \includegraphics[width=\textwidth]{Plots/Data/fashion_mnist0.pdf}
+    \caption{T-shirt/top}
+  \end{subfigure}
+  \begin{subfigure}{0.19\textwidth}
+    \includegraphics[width=\textwidth]{Plots/Data/fashion_mnist1.pdf}
+    \caption{Trouser}
+  \end{subfigure}
+  \begin{subfigure}{0.19\textwidth}
+    \includegraphics[width=\textwidth]{Plots/Data/fashion_mnist2.pdf}
+    \caption{Pullover}
+  \end{subfigure}
+  \begin{subfigure}{0.19\textwidth}
+    \includegraphics[width=\textwidth]{Plots/Data/fashion_mnist3.pdf}
+    \caption{Dress}
+  \end{subfigure}
+  \begin{subfigure}{0.19\textwidth}
+    \includegraphics[width=\textwidth]{Plots/Data/fashion_mnist4.pdf}
+    \caption{Coat}
+  \end{subfigure}\\
+  \begin{subfigure}{0.19\textwidth}
+    \includegraphics[width=\textwidth]{Plots/Data/fashion_mnist5.pdf}
+    \caption{Sandal}
+  \end{subfigure}
+  \begin{subfigure}{0.19\textwidth}
+    \includegraphics[width=\textwidth]{Plots/Data/fashion_mnist6.pdf}
+    \caption{Shirt}
+  \end{subfigure}
+  \begin{subfigure}{0.19\textwidth}
+    \includegraphics[width=\textwidth]{Plots/Data/fashion_mnist7.pdf}
+    \caption{Sneaker}
+  \end{subfigure}
+  \begin{subfigure}{0.19\textwidth}
+    \includegraphics[width=\textwidth]{Plots/Data/fashion_mnist8.pdf}
+    \caption{Bag}
+  \end{subfigure}
+  \begin{subfigure}{0.19\textwidth}
+    \includegraphics[width=\textwidth]{Plots/Data/fashion_mnist9.pdf}
+    \caption{Ankle boot}
+  \end{subfigure}
+  \caption{The fashion MNIST data set contains 70.000 preprocessed
+    product images from Zalando, which are categorized as
+    T-shirt/top, Trouser, Pullover, Dress, Coat, Sandal, Shirt,
+    Sneaker, Bag and Ankle boot.
+    Of these images 60.000 are used as training images, while
+    the remaining 10.000 are used to validate the trained models.}
+  \label{fig:MNIST}
+\end{figure}
+%%% Local Variables:
+%%% mode: latex
+%%% TeX-master: "../main"
+%%% End:
diff --git a/TeX/Plots/gen_dropout.tex b/TeX/Plots/gen_dropout.tex
new file mode 100644
index 0000000..d29536d
--- /dev/null
+++ b/TeX/Plots/gen_dropout.tex
@@ -0,0 +1,79 @@
+\pgfplotsset{
+compat=1.11,
+legend image code/.code={
+\draw[mark repeat=2,mark phase=2]
+plot coordinates {
+(0cm,0cm)
+(0.3cm,0cm)         %% default is (0.3cm,0cm)
+(0.6cm,0cm)         %% default is (0.6cm,0cm)
+};%
+}
+}
+\begin{figure}
+  \begin{subfigure}[h]{\textwidth}
+    \begin{tikzpicture}
+      \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed,
+        /pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth,
+        height = 0.6\textwidth, ymin = 0.988, legend style={at={(0.9825,0.0175)},anchor=south east},
+        xlabel = {epoch}, ylabel = {Classification Accuracy}, cycle list/Dark2]
+        \addplot table
+        [x=epoch, y=val_accuracy, col sep=comma, mark = none]
+        {Plots/Data/adam_datagen_full_mean.log};
+        \addplot table
+        [x=epoch, y=val_accuracy, col sep=comma, mark = none]
+        {Plots/Data/adam_datagen_dropout_02_full_mean.log};
+        \addplot table
+        [x=epoch, y=val_accuracy, col sep=comma, mark = none]
+        {Plots/Data/adam_datagen_dropout_04_full_mean.log};
+        \addplot table
+        [x=epoch, y=val_accuracy, col sep=comma, mark = none]
+        {Plots/Data/adam_dropout_02_full_mean.log};
+        \addplot table
+        [x=epoch, y=val_accuracy, col sep=comma, mark = none]
+        {Plots/Data/adam_dropout_04_full_mean.log};
+        \addplot [dashed] table
+        [x=epoch, y=val_accuracy, col sep=comma, mark = none]
+        {Plots/Data/adam_full_mean.log};
+
+        \addlegendentry{\footnotesize{G.}}
+        \addlegendentry{\footnotesize{G. + D. 0.2}}
+        \addlegendentry{\footnotesize{G. + D. 0.4}}
+        \addlegendentry{\footnotesize{D. 0.2}}
+        \addlegendentry{\footnotesize{D. 0.4}}
+        \addlegendentry{\footnotesize{Default}}
+      \end{axis}
+    \end{tikzpicture}
+    \caption{Classification accuracy}
+    \vspace{.25cm}
+  \end{subfigure}
+  \begin{subfigure}[h]{1.0\linewidth}
+    \begin{tabu} to \textwidth {@{} l *6{X[c]} @{}}
+      \multicolumn{7}{c}{Classification Accuracy}\Bstrut
+      \\\hline
+      &\textsc{Adam}&D. 0.2&D. 0.4&G.&G.+D.~0.2&G.+D.~0.4 \Tstrut \Bstrut
+      \\\hline
+      mean&0.9914&0.9918&0.9928&0.9937&0.9938&0.9940 \Tstrut \\
+      max& \\
+      min& \\
+      \multicolumn{7}{c}{Training Accuracy}\Bstrut
+      \\\hline
+      mean&0.9994&0.9990&0.9989&0.9967&0.9954&0.9926 \Tstrut \\
+      max& \\
+      min& \\
+
+    \end{tabu}
+    \caption{Mean, maximum and minimum accuracy after 48 epochs of training.}
+  \end{subfigure}
+  \caption{Accuracy of the network given in ... trained with
+    \textsc{Adam} and with dropout (D.), data generation (G.), a
+    combination of both, or neither (Default). Without data generation
+    the 60.000 training samples were used in each epoch; with data
+    generation each epoch consisted of 10.000 steps, each using a
+    batch of 60 generated data points. For each configuration the
+    model was trained 5 times and the average accuracies at each
+    epoch are given in (a).
Mean, maximum and minimum values of accuracy on + the test and training set are given in (b).} +\end{figure} +%%% Local Variables: +%%% mode: latex +%%% TeX-master: "../main" +%%% End: diff --git a/TeX/Plots/pfg_test.tex b/TeX/Plots/pfg_test.tex index a3ba8e0..d75e7fb 100644 --- a/TeX/Plots/pfg_test.tex +++ b/TeX/Plots/pfg_test.tex @@ -7,6 +7,10 @@ \usepackage{tabu} \usepackage{graphicx} \usetikzlibrary{calc, 3d} +\usepgfplotslibrary{colorbrewer} + +\newcommand\Tstrut{\rule{0pt}{2.6ex}} % = `top' strut +\newcommand\Bstrut{\rule[-0.9ex]{0pt}{0pt}} % = `bottom' strut \begin{document} \pgfplotsset{ @@ -15,71 +19,80 @@ legend image code/.code={ \draw[mark repeat=2,mark phase=2] plot coordinates { (0cm,0cm) -(0.0cm,0cm) %% default is (0.3cm,0cm) -(0.0cm,0cm) %% default is (0.6cm,0cm) +(0.3cm,0cm) %% default is (0.3cm,0cm) +(0.6cm,0cm) %% default is (0.6cm,0cm) };% } } \begin{figure} - \begin{subfigure}[b]{\textwidth} + \begin{subfigure}[h]{\textwidth} \begin{tikzpicture} - \begin{axis}[tick style = {draw = none}, width = \textwidth, - height = 0.7\textwidth, ymin = 0.92, legend style={at={(0.9825,0.75)},anchor=north east}, - xlabel = {epoch}, ylabel = {Classification Accuracy}] + \begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed, + /pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth, + height = 0.6\textwidth, ymin = 0.988, legend style={at={(0.9825,0.0175)},anchor=south east}, + xlabel = {epoch}, ylabel = {Classification Accuracy}, cycle list/Dark2] + % \addplot [dashed] table + % [x=epoch, y=accuracy, col sep=comma, mark = none] + % {Data/adam_datagen_full.log}; \addplot table [x=epoch, y=val_accuracy, col sep=comma, mark = none] - {Data/adagrad.log}; + {Data/adam_datagen_full_mean.log}; + % \addplot [dashed] table + % [x=epoch, y=accuracy, col sep=comma, mark = none] + % {Data/adam_datagen_dropout_02_full.log}; \addplot table [x=epoch, y=val_accuracy, col sep=comma, mark = none] - {Data/adadelta.log}; + {Data/adam_datagen_dropout_02_full_mean.log}; \addplot table [x=epoch, y=val_accuracy, col sep=comma, mark = none] - {Data/adam.log}; - - \addlegendentry{\footnotesize{ADAGRAD}} - \addlegendentry{\footnotesize{ADADELTA}} - \addlegendentry{\footnotesize{ADAM}} - \addlegendentry{SGD$_{0.01}$} - \end{axis} - \end{tikzpicture} - %\caption{Classification accuracy} - \end{subfigure} - \begin{subfigure}[b]{\textwidth} - \begin{tikzpicture} - \begin{axis}[tick style = {draw = none}, width = \textwidth, - height = 0.7\textwidth, ymax = 0.5, - xlabel = {epoch}, ylabel = {Error Measure\vphantom{y}},ytick ={0,0.1,0.2,0.3,0.4,0.45,0.5}, yticklabels = - {0,0.1,0.2,0.3,0.4,\phantom{0.94},0.5}] - \addplot table - [x=epoch, y=val_loss, col sep=comma, mark = none] {Data/adagrad.log}; + {Data/adam_datagen_dropout_04_full_mean.log}; \addplot table - [x=epoch, y=val_loss, col sep=comma, mark = none] {Data/adadelta.log}; + [x=epoch, y=val_accuracy, col sep=comma, mark = none] + {Data/adam_dropout_02_full_mean.log}; \addplot table - [x=epoch, y=val_loss, col sep=comma, mark = none] {Data/adam.log}; - - \addlegendentry{\footnotesize{ADAGRAD}} - \addlegendentry{\footnotesize{ADADELTA}} - \addlegendentry{\footnotesize{ADAM}} - \addlegendentry{SGD$_{0.01}$} + [x=epoch, y=val_accuracy, col sep=comma, mark = none] + {Data/adam_dropout_04_full_mean.log}; + \addplot [dashed] table + [x=epoch, y=val_accuracy, col sep=comma, mark = none] + {Data/adam_full_mean.log}; + \addlegendentry{\footnotesize{G.}} + \addlegendentry{\footnotesize{G. + D. 
0.2}}
+    \addlegendentry{\footnotesize{G. + D. 0.4}}
+    \addlegendentry{\footnotesize{D. 0.2}}
+    \addlegendentry{\footnotesize{D. 0.4}}
+    \addlegendentry{\footnotesize{Default}}
     \end{axis}
   \end{tikzpicture}
-  \caption{Performance metrics during training}
-  \end{subfigure}
-  \\~\\
-  \begin{subfigure}[b]{1.0\linewidth}
-  \begin{tabu} to \textwidth {@{} *3{X[c]}c*3{X[c]} @{}}
-    \multicolumn{3}{c}{Classification Accuracy}
-    &~&\multicolumn{3}{c}{Error Measure}
-    \\\cline{1-3}\cline{5-7}
-    ADAGRAD&ADADELTA&ADAM&&ADAGRAD&ADADELTA&ADAM
-    \\\cline{1-3}\cline{5-7}
-    1&1&1&&1&1&1
+    \caption{Classification accuracy}
+    \vspace{.25cm}
+  \end{subfigure}
+  \begin{subfigure}[h]{1.0\linewidth}
+    \begin{tabu} to \textwidth {@{} l *6{X[c]} @{}}
+      \multicolumn{7}{c}{Classification Accuracy}\Bstrut
+      \\\hline
+      &\textsc{Adam}&D. 0.2&D. 0.4&G.&G.+D.~0.2&G.+D.~0.4 \Tstrut \Bstrut
+      \\\hline
+      mean&0.9914&0.9918&0.9928&0.9937&0.9938&0.9940 \Tstrut \\
+      max& \\
+      min& \\
+      \multicolumn{7}{c}{Training Accuracy}\Bstrut
+      \\\hline
+      mean&0.9994&0.9990&0.9989&0.9967&0.9954&0.9926 \Tstrut \\
+      max& \\
+      min& \\
+
     \end{tabu}
-    \caption{Performace metrics after 20 epochs}
-  \end{subfigure}
-  \caption{Performance metrics of the network given in ... trained
-    with different optimization algorithms}
+    \caption{Mean, maximum and minimum accuracy after 48 epochs of training.}
+  \end{subfigure}
+  \caption{Accuracy of the network given in ... trained with
+    \textsc{Adam} and with dropout (D.), data generation (G.), a
+    combination of both, or neither (Default). Without data generation
+    the 60.000 training samples were used in each epoch; with data
+    generation each epoch consisted of 10.000 steps, each using a
+    batch of 60 generated data points. For each configuration the
+    model was trained 5 times and the average accuracies at each
+    epoch are given in (a). Mean, maximum and minimum values of
+    accuracy on the test and training set are given in (b).}
 \end{figure}
 
 \begin{center}
@@ -87,18 +100,23 @@ plot coordinates {
     \centering
     \begin{subfigure}{0.19\textwidth}
       \includegraphics[width=\textwidth]{Data/mnist0.pdf}
+      \caption{original\\image}
     \end{subfigure}
     \begin{subfigure}{0.19\textwidth}
-      \includegraphics[width=\textwidth]{Data/mnist1.pdf}
+      \includegraphics[width=\textwidth]{Data/mnist_gen_zoom.pdf}
+      \caption{random\\zoom}
     \end{subfigure}
     \begin{subfigure}{0.19\textwidth}
-      \includegraphics[width=\textwidth]{Data/mnist2.pdf}
+      \includegraphics[width=\textwidth]{Data/mnist_gen_shear.pdf}
+      \caption{random\\shear}
     \end{subfigure}
     \begin{subfigure}{0.19\textwidth}
-      \includegraphics[width=\textwidth]{Data/mnist3.pdf}
+      \includegraphics[width=\textwidth]{Data/mnist_gen_rotation.pdf}
+      \caption{random\\rotation}
     \end{subfigure}
     \begin{subfigure}{0.19\textwidth}
-      \includegraphics[width=\textwidth]{Data/mnist4.pdf}
+      \includegraphics[width=\textwidth]{Data/mnist_gen_shift.pdf}
+      \caption{random\\positional shift}
     \end{subfigure}\\
     \begin{subfigure}{0.19\textwidth}
       \includegraphics[width=\textwidth]{Data/mnist5.pdf}
diff --git a/TeX/Plots/sdg_comparison.tex b/TeX/Plots/sdg_comparison.tex
index c42ffc4..7c0877f 100644
--- a/TeX/Plots/sdg_comparison.tex
+++ b/TeX/Plots/sdg_comparison.tex
@@ -67,7 +67,7 @@ plot coordinates {
     \end{tabu}
     \caption{Performance metrics after 20 epochs}
   \end{subfigure}
-  \caption{Performance metrics of the network given in ... trained
+  \caption{Classification accuracy on the test set and ... Performance metrics of the network given in ... trained
     with different optimization algorithms}
 \end{figure}
 %%% Local Variables:
diff --git a/TeX/further_applications_of_nn.tex b/TeX/further_applications_of_nn.tex
index 08cf424..422df1b 100644
--- a/TeX/further_applications_of_nn.tex
+++ b/TeX/further_applications_of_nn.tex
@@ -450,7 +450,7 @@ $\gamma$ is divided by the sum of the squares of the past partial
 derivatives in this parameter. This results in a monotonically
 decreasing learning rate for each parameter. The decay is faster
 for parameters with large updates, whereas
-parameters with small updates experience smaller decay. The ADAGRAD
+parameters with small updates experience smaller decay. The \textsc{AdaGrad}
 algorithm is given in Algorithm~\ref{alg:ADAGRAD}.
 
 \begin{algorithm}[H]
@@ -465,15 +465,15 @@ algorithm is given in Algorithm~\ref{alg:ADAGRAD}.
   1, \dots,p$\;
   Apply Update: $x_{t+1} \leftarrow x_t + \Delta x_t$\;
  }
-  \caption{\textls{ADAGRAD}}
+  \caption{\textls{\textsc{AdaGrad}}}
   \label{alg:ADAGRAD}
 \end{algorithm}
 
-Building on \textsc{AdaGrad} \textcite{ADADELTA} developed the ... (ADADELTA)
-in order to improve upon the two main drawbacks of ADAGRAD, being the
+Building on \textsc{AdaGrad}, \textcite{ADADELTA} developed the ... (\textsc{AdaDelta})
+in order to improve upon the two main drawbacks of \textsc{AdaGrad}, being the
 continual decay of the learning rate and the need for a manually
 selected global learning rate $\gamma$.
-As ADAGRAD accumulates the squared gradients the learning rate will
+As \textsc{AdaGrad} divides by the accumulated squared gradients, the learning rate will
 eventually become vanishingly small.
 In order to ensure that learning continues to make progress even
 after a significant number of iterations, instead of summing the gradients a
@@ -500,7 +500,7 @@ by these of the parameter update $\Delta x_t$. This proper
   x^2]_{t-1} + (1-p)\Delta x_t^2$\;
   Apply Update: $x_{t+1} \leftarrow x_t + \Delta x_t$\;
  }
-  \caption{ADADELTA, \textcite{ADADELTA}}
+  \caption{\textsc{AdaDelta}, \textcite{ADADELTA}}
   \label{alg:gd}
 \end{algorithm}
 
@@ -520,11 +520,11 @@ of the marble.
 This results in the algorithm being able to escape ... due to the
 built-up momentum from approaching it.
 
-\begin{itemize}
-  \item ADAM
-  \item momentum
-  \item ADADETLA \textcite{ADADELTA}
-\end{itemize}
+% \begin{itemize}
+%   \item ADAM
+%   \item momentum
+%   \item ADADETLA \textcite{ADADELTA}
+% \end{itemize}
 
 
 \begin{algorithm}[H]
@@ -665,7 +665,37 @@ When using this one has to be sure that the labels indeed remain the
 same or else the network will not learn the desired ...
 In the case of handwritten digits for example a too high rotation angle
 will ... a nine or six.
-The most common transformations are rotation, zoom, shear, brightness, mirroring.
+The most common transformations are rotation, zoom, shear, brightness,
+mirroring.
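+
+As an illustration, such transformations can be generated on the fly
+during training. The following sketch assumes the Keras
+\texttt{ImageDataGenerator} API and uses illustrative parameter
+values, not the ones used for the experiments below.
+\begin{verbatim}
+from tensorflow.keras.preprocessing.image import ImageDataGenerator
+
+# each parameter enables one of the random transformations above
+datagen = ImageDataGenerator(
+    rotation_range=15,       # random rotation (degrees)
+    zoom_range=0.1,          # random zoom
+    shear_range=0.2,         # random shear
+    width_shift_range=0.1,   # random positional shift (horizontal)
+    height_shift_range=0.1)  # random positional shift (vertical)
+
+# model.fit(datagen.flow(x_train, y_train, batch_size=60),
+#           steps_per_epoch=10000, epochs=...)
+\end{verbatim}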
+
+\begin{figure}[h]
+  \centering
+  \begin{subfigure}{0.19\textwidth}
+    \includegraphics[width=\textwidth]{Plots/Data/mnist0.pdf}
+    \caption{original\\image}
+  \end{subfigure}
+  \begin{subfigure}{0.19\textwidth}
+    \includegraphics[width=\textwidth]{Plots/Data/mnist_gen_zoom.pdf}
+    \caption{random\\zoom}
+  \end{subfigure}
+  \begin{subfigure}{0.19\textwidth}
+    \includegraphics[width=\textwidth]{Plots/Data/mnist_gen_shear.pdf}
+    \caption{random\\shear}
+  \end{subfigure}
+  \begin{subfigure}{0.19\textwidth}
+    \includegraphics[width=\textwidth]{Plots/Data/mnist_gen_rotation.pdf}
+    \caption{random\\rotation}
+  \end{subfigure}
+  \begin{subfigure}{0.19\textwidth}
+    \includegraphics[width=\textwidth]{Plots/Data/mnist_gen_shift.pdf}
+    \caption{random\\positional shift}
+  \end{subfigure}
+  \caption{Examples of the manipulations used in ... As all images
+    are of the same intensity, brightness manipulation does not seem
+    ... Additionally mirroring is not used for ... reasons.}
+\end{figure}
+
+\input{Plots/gen_dropout.tex}
 
 \todo{Comparison of different dropout sizes on MNIST or similar, subset as
   training set?}
@@ -674,10 +704,41 @@ training set?}
 
 For some applications (medical problems with a small number of patients)
 the available data can be highly limited.
-In order to get a understanding for the achievable accuracy for such a
-scenario in the following we examine the ... and .. with a highly
-reduced training set and the impact the above mentioned strategies on
-combating overfitting have.
+In these problems the networks are highly ... for overfitting the
+data. In order to get an understanding of the achievable accuracies
+and the impact of the measures to prevent overfitting discussed
+above, we train the network on datasets of varying sizes.
+First we use the MNIST handwriting dataset and then a slightly harder
+problem given by the fashion MNIST dataset, which contains preprocessed
+pictures of clothes from 10 different categories.
+
+\input{Plots/fashion_mnist.tex}
+
+For training, a certain number of random datapoints per class is
+chosen to train the network. The sizes chosen are:
+\begin{itemize}
+  \item the full dataset (... per class),
+  \item 1000 per class,
+  \item 100 per class,
+  \item 10 per class.
+\end{itemize}
+
+The results for training ... are given in ... Here it can be seen
+that ...
+
+\begin{figure}[h]
+  \centering
+  \missingfigure{datagen digits}
+  \caption{Sample pictures of the MNIST handwriting dataset, one per
+    class.}
+  \label{mnist digits}
+\end{figure}
+
+\begin{figure}[h]
+  \centering
+  \missingfigure{datagen fashion}
+  \caption{Sample pictures of the fashion MNIST dataset, one per
+    class.}
+  \label{mnist fashion}
+\end{figure}
+
 \clearpage
 \section{Bla}
diff --git a/TeX/introduction_nn.tex b/TeX/introduction_nn.tex
index fbddced..f44ddd2 100644
--- a/TeX/introduction_nn.tex
+++ b/TeX/introduction_nn.tex
@@ -295,7 +295,7 @@ interpretation. Commonly the nodes in the output layer each correspond
 to a class and the class chosen as prediction is the one with the
 highest value at the corresponding output node.
-The naive transformation to achieve this is transforming the output
+This corresponds to a transformation of the output
 vector $o$ into a one-hot vector
 \[
 \text{pred}_i =
diff --git a/TeX/main.tex b/TeX/main.tex
index 4b3ae6b..4123b97 100644
--- a/TeX/main.tex
+++ b/TeX/main.tex
@@ -92,6 +92,9 @@
 
 \newcommand{\abs}[1]{\ensuremath{\left\vert#1\right\vert}}
 
+\newcommand\Tstrut{\rule{0pt}{2.6ex}}         % = `top' strut
+\newcommand\Bstrut{\rule[-0.9ex]{0pt}{0pt}}   % = `bottom' strut
+
 \SetKwInput{KwInput}{Input}
 
 %\newcommand{\myrightarrow}[1]{\xrightarrow{\makebox[2em][c]{$\scriptstyle#1$}}}
diff --git a/TeX/theo_3_8.tex b/TeX/theo_3_8.tex
index c35df25..caadc46 100644
--- a/TeX/theo_3_8.tex
+++ b/TeX/theo_3_8.tex
@@ -6,6 +6,10 @@
 %%% End:
 
 \section{Shallow Neural Networks}
+In order to get some understanding of the behavior of neural
+networks, we study a simplified class of networks in this chapter,
+the so-called shallow neural networks. These networks consist of a
+single hidden layer and have a single output node.
 In order to examine some behavior of neural networks in this chapter
 we consider a simple class of networks, the shallow ones. These
 networks only contain one hidden layer and have a single output node.
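+As a sketch of the notation (the exact definitions introduced later
+in this chapter may differ), such a network with input dimension $d$,
+hidden layer size $n$ and activation function $\sigma$ can be written
+as
+\[
+  \mathcal{N}(x) = c + \sum_{k=1}^{n} w_k \,\sigma\big(\langle v_k, x\rangle + b_k\big),
+\]
+with parameters $v_k \in \mathbb{R}^d$ and $b_k, w_k, c \in \mathbb{R}$.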