main
Tobias Arndt 4 years ago
parent 1a45e7d596
commit bad8e42630

@ -80,6 +80,7 @@ plot coordinates {
\\\cline{1-4}\cline{6-9}
GD$_{0.01}$&GD$_{0.05}$&GD$_{0.1}$&SGD$_{0.01}$&&GD$_{0.01}$&GD$_{0.05}$&GD$_{0.1}$&SGD$_{0.01}$
\\\cline{1-4}\cline{6-9}
\multicolumn{9}{c}{test}\\
0.265&0.633&0.203&0.989&&2.267&1.947&3.91&0.032
\end{tabu}
\caption{Performance metrics of the networks trained in

@ -0,0 +1,53 @@
\begin{figure}[h]
\centering
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/fashion_mnist0.pdf}
\caption{T-shirt/top}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/fashion_mnist1.pdf}
\caption{Trousers}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/fashion_mnist2.pdf}
\caption{Pullover}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/fashion_mnist3.pdf}
\caption{Dress}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/fashion_mnist4.pdf}
\caption{Coat}
\end{subfigure}\\
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/fashion_mnist5.pdf}
\caption{Sandal}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/fashion_mnist6.pdf}
\caption{Shirt}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/fashion_mnist7.pdf}
\caption{Sneaker}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/fashion_mnist8.pdf}
\caption{Bag}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/fashion_mnist9.pdf}
\caption{Ankle boot}
\end{subfigure}
\caption{The fashion MNIST data set contains 70,000 preprocessed
product images from Zalando, categorized as T-shirt/top, Trouser,
Pullover, Dress, Coat, Sandal, Shirt, Sneaker, Bag and Ankle
boot. Of these images 60,000 are used for training, while the rest
are used to validate the trained models.}
\label{fig:MNIST}
\end{figure}
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "../main"
%%% End:

@ -0,0 +1,79 @@
\pgfplotsset{
compat=1.11,
legend image code/.code={
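% redefine the legend image: the coordinates in the plot below set
% the length of the line sample drawn next to each legend entry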
\draw[mark repeat=2,mark phase=2]
plot coordinates {
(0cm,0cm)
(0.3cm,0cm) %% default is (0.3cm,0cm)
(0.6cm,0cm) %% default is (0.6cm,0cm)
};%
}
}
\begin{figure}
\begin{subfigure}[h]{\textwidth}
\begin{tikzpicture}
\begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed,
/pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth,
height = 0.6\textwidth, ymin = 0.988, legend style={at={(0.9825,0.0175)},anchor=south east},
xlabel = {epoch}, ylabel = {Classification Accuracy}, cycle list/Dark2]
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Plots/Data/adam_datagen_full_mean.log};
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Plots/Data/adam_datagen_dropout_02_full_mean.log};
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Plots/Data/adam_datagen_dropout_04_full_mean.log};
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Plots/Data/adam_dropout_02_full_mean.log};
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Plots/Data/adam_dropout_04_full_mean.log};
\addplot [dashed] table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Plots/Data/adam_full_mean.log};
\addlegendentry{\footnotesize{G.}}
\addlegendentry{\footnotesize{G. + D. 0.2}}
\addlegendentry{\footnotesize{G. + D. 0.4}}
\addlegendentry{\footnotesize{D. 0.2}}
\addlegendentry{\footnotesize{D. 0.4}}
\addlegendentry{\footnotesize{Default}}
\end{axis}
\end{tikzpicture}
\caption{Classification accuracy}
\vspace{.25cm}
\end{subfigure}
\begin{subfigure}[h]{1.0\linewidth}
\begin{tabu} to \textwidth {@{} l *6{X[c]} @{}}
\multicolumn{7}{c}{Classification Accuracy}\Bstrut
\\\hline
&\textsc{Adam}&D. 0.2&D. 0.4&G.&G.+D.~0.2&G.+D.~0.4 \Tstrut \Bstrut
\\\hline
mean&0.9914&0.9918&0.9928&0.9937&0.9938&0.9940 \Tstrut \\
max& \\
min& \\
\multicolumn{7}{c}{Training Accuracy}\Bstrut
\\\hline
mean&0.9994&0.9990&0.9989&0.9967&0.9954&0.9926 \Tstrut \\
max& \\
min& \\
\end{tabu}
\caption{Mean, maximum and minimum accuracy after 48 epochs of training.}
\end{subfigure}
\caption{Accuracy for the net given in ... with dropout (D.),
data generation (G.), a combination of both, or neither (Default)
implemented, trained with \textsc{Adam}. In each epoch the 60,000
training samples were used once, or, with data generation, 10,000
steps were taken, each using a batch of 60 generated data
points. For each configuration the model was trained 5 times and
the average accuracies per epoch are given in (a). Mean, maximum
and minimum accuracy on the test and training sets are given in (b).}
\end{figure}
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "../main"
%%% End:

@ -7,6 +7,10 @@
\usepackage{tabu}
\usepackage{graphicx}
\usetikzlibrary{calc, 3d}
\usepgfplotslibrary{colorbrewer}
\newcommand\Tstrut{\rule{0pt}{2.6ex}} % = `top' strut
\newcommand\Bstrut{\rule[-0.9ex]{0pt}{0pt}} % = `bottom' strut
\begin{document}
\pgfplotsset{
@ -15,71 +19,80 @@ legend image code/.code={
\draw[mark repeat=2,mark phase=2]
plot coordinates {
(0cm,0cm)
(0.3cm,0cm)         %% default is (0.3cm,0cm)
(0.6cm,0cm)         %% default is (0.6cm,0cm)
};%
}
}
\begin{figure}
\begin{subfigure}[h]{\textwidth}
\begin{tikzpicture}
\begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed,
/pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth,
height = 0.6\textwidth, ymin = 0.988, legend style={at={(0.9825,0.0175)},anchor=south east},
xlabel = {epoch}, ylabel = {Classification Accuracy}, cycle list/Dark2]
% \addplot [dashed] table
% [x=epoch, y=accuracy, col sep=comma, mark = none]
% {Data/adam_datagen_full.log};
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Data/adam_datagen_full_mean.log};
% \addplot [dashed] table
% [x=epoch, y=accuracy, col sep=comma, mark = none]
% {Data/adam_datagen_dropout_02_full.log};
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Data/adam_datagen_dropout_02_full_mean.log};
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Data/adam_datagen_dropout_04_full_mean.log};
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Data/adam_dropout_02_full_mean.log};
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Data/adam_dropout_04_full_mean.log};
\addplot [dashed] table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Data/adam_full_mean.log};
\addlegendentry{\footnotesize{G.}}
\addlegendentry{\footnotesize{G. + D. 0.2}}
\addlegendentry{\footnotesize{G. + D. 0.4}}
\addlegendentry{\footnotesize{D. 0.2}}
\addlegendentry{\footnotesize{D. 0.4}}
\addlegendentry{\footnotesize{Default}}
\end{axis}
\end{tikzpicture}
\caption{Classification accuracy}
\vspace{.25cm}
\end{subfigure}
\begin{subfigure}[h]{1.0\linewidth}
\begin{tabu} to \textwidth {@{} l *6{X[c]} @{}}
\multicolumn{7}{c}{Classification Accuracy}\Bstrut
\\\hline
&\textsc{Adam}&D. 0.2&D. 0.4&G.&G.+D.~0.2&G.+D.~0.4 \Tstrut \Bstrut
\\\hline
mean&0.9994&0.9990&0.9989&0.9937&0.9938&0.9940 \Tstrut \\
max& \\
min& \\
\multicolumn{7}{c}{Training Accuracy}\Bstrut
\\\hline
mean&0.9914&0.9918&0.9928&0.9937&0.9938&0.9940 \Tstrut \\
max& \\
min& \\
\end{tabu}
\caption{Mean, maximum and minimum accuracy after 48 epochs of training.}
\end{subfigure}
\caption{Accuracy for the net given in ... with dropout (D.),
data generation (G.), a combination of both, or neither (Default)
implemented, trained with \textsc{Adam}. In each epoch the 60,000
training samples were used once, or, with data generation, 10,000
steps were taken, each using a batch of 60 generated data
points. For each configuration the model was trained 5 times and
the average accuracies per epoch are given in (a). Mean, maximum
and minimum accuracy on the test and training sets are given in (b).}
\end{figure}
\begin{center}
@ -87,18 +100,23 @@ plot coordinates {
\centering
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Data/mnist0.pdf}
\caption{original\\image}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Data/mnist_gen_zoom.pdf}
\caption{random\\zoom}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Data/mnist_gen_shear.pdf}
\caption{random\\shear}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Data/mnist_gen_rotation.pdf}
\caption{random\\rotation}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Data/mnist_gen_shift.pdf}
\caption{random\\positional shift}
\end{subfigure}\\
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Data/mnist5.pdf}

@ -67,7 +67,7 @@ plot coordinates {
\end{tabu}
\caption{Performance metrics after 20 epochs}
\end{subfigure}
\caption{Classification accuracy on the test set and ... Performance metrics of the network given in ... trained
with different optimization algorithms}
\end{figure}
%%% Local Variables:

@ -450,7 +450,7 @@ $\gamma$ is divided by the sum of the squares of the past partial
derivatives in this parameter. This results in a monotonically
decreasing learning rate for each parameter, with a faster decay
for parameters with large updates, whereas parameters with small
updates experience a smaller decay. The \textsc{AdaGrad}
algorithm is given in Algorithm~\ref{alg:ADAGRAD}.
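Written out for a single parameter, this scaling takes the following
form (a sketch of the usual formulation; the small constant
$\varepsilon$ guarding against division by zero is an assumption here
and is not taken from the text above):
\[
x_{t+1,i} = x_{t,i} - \frac{\gamma}{\sqrt{\sum_{s=1}^{t} g_{s,i}^2} + \varepsilon}\, g_{t,i},
\]
where $g_{s,i}$ denotes the partial derivative of the objective with
respect to the $i$-th parameter in iteration $s$.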
\begin{algorithm}[H]
@ -465,15 +465,15 @@ algorithm is given in Algorithm~\ref{alg:ADAGRAD}.
1, \dots,p$\;
Apply Update: $x_{t+1} \leftarrow x_t + \Delta x_t$\;
}
\caption{\textls{\textsc{AdaGrad}}}
\label{alg:ADAGRAD}
\end{algorithm}
Building on \textsc{AdaGrad}, \textcite{ADADELTA} developed the ... (\textsc{AdaDelta})
in order to improve upon the two main drawbacks of \textsc{AdaGrad},
namely the continual decay of the learning rate and the need for a
manually selected global learning rate $\gamma$.
As \textsc{AdaGrad} divides by the accumulated squared gradients, the
learning rate will eventually become vanishingly small.
In order to ensure that learning continues to make progress even
after a significant number of iterations, instead of summing the gradients a
@ -500,7 +500,7 @@ by these of the parameter update $\Delta x_t$. This proper
x^2]_{t-1} + (1-p)\Delta x_t^2$\;
Apply Update: $x_{t+1} \leftarrow x_t + \Delta x_t$\;
}
\caption{\textsc{AdaDelta}, \textcite{ADADELTA}}
\label{alg:gd}
\end{algorithm}
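In compact form, the two exponentially decaying accumulators at the
heart of \textsc{AdaDelta} can be sketched as (using $p$ as the decay
rate and $\varepsilon$ as a small constant, to match the notation above)
\[
E[g^2]_t = p\, E[g^2]_{t-1} + (1-p)\, g_t^2, \qquad
E[\Delta x^2]_t = p\, E[\Delta x^2]_{t-1} + (1-p)\, \Delta x_t^2,
\]
with the update $\Delta x_t = -\frac{\sqrt{E[\Delta x^2]_{t-1} + \varepsilon}}{\sqrt{E[g^2]_t + \varepsilon}}\, g_t$.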
@ -520,11 +520,11 @@ of the marble.
This results in the algorithm being able to escape ... due to the
built-up momentum from approaching it.
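As a sketch of this idea, the classical momentum update can be
written as (with $\beta$ denoting the momentum coefficient; the
symbols are chosen here only for illustration)
\[
v_{t+1} = \beta v_t - \gamma \nabla f(x_t), \qquad
x_{t+1} = x_t + v_{t+1},
\]
so that a fraction of the previous step is carried over into the
current one.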
% \begin{itemize}
% \item ADAM
% \item momentum
% \item ADADELTA \textcite{ADADELTA}
% \end{itemize}
\begin{algorithm}[H]
@ -665,7 +665,37 @@ When using this one has to be sure that the labels indeed remain the
same or else the network will not learn the desired ...
In the case of handwritten digits for example a too high rotation angle
will ... a nine or six.
The most common transformations are rotation, zoom, shear, brightness,
and mirroring.
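Most of these manipulations can be viewed as a random affine map
applied to the pixel coordinates of an image; as a sketch (the
symbols are chosen here only for illustration),
\[
\begin{pmatrix} x' \\ y' \end{pmatrix}
= s\, R_\theta\, S_\lambda
\begin{pmatrix} x \\ y \end{pmatrix}
+ \begin{pmatrix} \delta_x \\ \delta_y \end{pmatrix},
\]
where $R_\theta$ is a rotation by a random angle $\theta$,
$S_\lambda$ a shear, $s$ a zoom factor and $(\delta_x, \delta_y)$ a
positional shift, while brightness changes act on the pixel values
rather than on the coordinates.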
\begin{figure}[h]
\centering
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/mnist0.pdf}
\caption{original\\image}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/mnist_gen_zoom.pdf}
\caption{random\\zoom}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/mnist_gen_shear.pdf}
\caption{random\\shear}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/mnist_gen_rotation.pdf}
\caption{random\\rotation}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/mnist_gen_shift.pdf}
\caption{random\\positional shift}
\end{subfigure}
\caption{Examples of the manipulations used in ... As all images are
of the same intensity, brightness manipulation does not seem
... Additionally, mirroring is not used for ... reasons.}
\end{figure}
\input{Plots/gen_dropout.tex}
\todo{Compare different dropout sizes on MNIST or similar, subset as
training set?}
@ -674,10 +704,41 @@ training set?}
For some applications (medical problems with a small number of patients)
the available data can be highly limited.
In these problems the networks are highly ... for overfitting the
data. In order to get an understanding of the achievable accuracy and the
impact of the measures to prevent overfitting discussed above, we train
the network on datasets of varying sizes.
First we use the MNIST handwriting dataset and then a slightly harder
problem given by the fashion MNIST dataset, which contains preprocessed
pictures of clothes from 10 different categories.
\input{Plots/fashion_mnist.tex}
For training, a certain number of random data points per class is
chosen to train the network. The sizes chosen are:
\begin{itemize}
  \item full dataset: ... per class
  \item 1000 per class
  \item 100 per class
  \item 10 per class
\end{itemize}
The results for training ... are given in ... Here it can be seen that ...
\begin{figure}[h]
\centering
\missingfigure{datagen digits}
\caption{Sample pictures of the MNIST handwriting dataset, one per
class.}
\label{mnist digits}
\end{figure}
\begin{figure}[h]
\centering
\missingfigure{datagen fashion}
\caption{Sample pictures of the fashion MNIST dataset, one per
class.}
\label{mnist fashion}
\end{figure}
\clearpage
\section{Bla}

@ -295,7 +295,7 @@ interpretation.
Commonly the nodes in the output layer each correspond to a class and
the class chosen as prediction is the one with the highest value at
the corresponding output node.
This corresponds to a transformation of the output
vector $o$ into a one-hot vector
\[
\text{pred}_i =

@ -92,6 +92,9 @@
\newcommand{\abs}[1]{\ensuremath{\left\vert#1\right\vert}}
\newcommand\Tstrut{\rule{0pt}{2.6ex}} % = `top' strut
\newcommand\Bstrut{\rule[-0.9ex]{0pt}{0pt}} % = `bottom' strut
\SetKwInput{KwInput}{Input}
%\newcommand{\myrightarrow}[1]{\xrightarrow{\makebox[2em][c]{$\scriptstyle#1$}}}

@ -6,6 +6,10 @@
%%% End:
\section{Shallow Neural Networks}
In order to get some understanding of the behavior of neural
networks, in this chapter we study a simplified class of networks
called shallow neural networks. These networks contain only one
hidden layer and have a single output node.
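One way to write such a network with $n$ hidden nodes, using $\sigma$
for the activation function and $w$, $v$, $b$, $c$ for the weights and
biases (the symbols are chosen here only as an illustration), is
\[
f(x) = c + \sum_{k=1}^{n} w_k\, \sigma\Big(b_k + \sum_{j} v_{k,j} x_j\Big).
\]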
