progress
This commit is contained in:
parent
1a45e7d596
commit
bad8e42630
@ -80,6 +80,7 @@ plot coordinates {
|
|||||||
\\\cline{1-4}\cline{6-9}
|
\\\cline{1-4}\cline{6-9}
|
||||||
GD$_{0.01}$&GD$_{0.05}$&GD$_{0.1}$&SGD$_{0.01}$&&GD$_{0.01}$&GD$_{0.05}$&GD$_{0.1}$&SGD$_{0.01}$
|
GD$_{0.01}$&GD$_{0.05}$&GD$_{0.1}$&SGD$_{0.01}$&&GD$_{0.01}$&GD$_{0.05}$&GD$_{0.1}$&SGD$_{0.01}$
|
||||||
\\\cline{1-4}\cline{6-9}
|
\\\cline{1-4}\cline{6-9}
|
||||||
|
\multicolumn{9}{c}{test}\\
|
||||||
0.265&0.633&0.203&0.989&&2.267&1.947&3.91&0.032
|
0.265&0.633&0.203&0.989&&2.267&1.947&3.91&0.032
|
||||||
\end{tabu}
|
\end{tabu}
|
||||||
\caption{Performance metrics of the networks trained in
|
\caption{Performance metrics of the networks trained in
|
||||||
|
53
TeX/Plots/fashion_mnist.tex
Normal file
53
TeX/Plots/fashion_mnist.tex
Normal file
@ -0,0 +1,53 @@
|
|||||||
|
\begin{figure}[h]
|
||||||
|
\centering
|
||||||
|
\begin{subfigure}{0.19\textwidth}
|
||||||
|
\includegraphics[width=\textwidth]{Plots/Data/fashion_mnist0.pdf}
|
||||||
|
\caption{T-shirt/top}
|
||||||
|
\end{subfigure}
|
||||||
|
\begin{subfigure}{0.19\textwidth}
|
||||||
|
\includegraphics[width=\textwidth]{Plots/Data/fashion_mnist1.pdf}
|
||||||
|
\caption{Trousers}
|
||||||
|
\end{subfigure}
|
||||||
|
\begin{subfigure}{0.19\textwidth}
|
||||||
|
\includegraphics[width=\textwidth]{Plots/Data/fashion_mnist2.pdf}
|
||||||
|
\caption{Pullover}
|
||||||
|
\end{subfigure}
|
||||||
|
\begin{subfigure}{0.19\textwidth}
|
||||||
|
\includegraphics[width=\textwidth]{Plots/Data/fashion_mnist3.pdf}
|
||||||
|
\caption{Dress}
|
||||||
|
\end{subfigure}
|
||||||
|
\begin{subfigure}{0.19\textwidth}
|
||||||
|
\includegraphics[width=\textwidth]{Plots/Data/fashion_mnist4.pdf}
|
||||||
|
\caption{Coat}
|
||||||
|
\end{subfigure}\\
|
||||||
|
\begin{subfigure}{0.19\textwidth}
|
||||||
|
\includegraphics[width=\textwidth]{Plots/Data/fashion_mnist5.pdf}
|
||||||
|
\caption{Sandal}
|
||||||
|
\end{subfigure}
|
||||||
|
\begin{subfigure}{0.19\textwidth}
|
||||||
|
\includegraphics[width=\textwidth]{Plots/Data/fashion_mnist6.pdf}
|
||||||
|
\caption{Shirt}
|
||||||
|
\end{subfigure}
|
||||||
|
\begin{subfigure}{0.19\textwidth}
|
||||||
|
\includegraphics[width=\textwidth]{Plots/Data/fashion_mnist7.pdf}
|
||||||
|
\caption{Sneaker}
|
||||||
|
\end{subfigure}
|
||||||
|
\begin{subfigure}{0.19\textwidth}
|
||||||
|
\includegraphics[width=\textwidth]{Plots/Data/fashion_mnist8.pdf}
|
||||||
|
\caption{Bag}
|
||||||
|
\end{subfigure}
|
||||||
|
\begin{subfigure}{0.19\textwidth}
|
||||||
|
\includegraphics[width=\textwidth]{Plots/Data/fashion_mnist9.pdf}
|
||||||
|
\caption{Ankle boot}
|
||||||
|
\end{subfigure}
|
||||||
|
\caption{The fashion MNIST data set contains 70.000 images of
|
||||||
|
preprocessed product images from Zalando, which are categorized as
|
||||||
|
T-shirt/top, Trouser, Pullover, Dress, Coat, Sandal, Shirt,
|
||||||
|
Sneaker, Bag, Ankle boot. Of these images 60.000 are used as training images, while
|
||||||
|
the rest are used to validate the models trained.}
|
||||||
|
\label{fig:MNIST}
|
||||||
|
\end{figure}
|
||||||
|
%%% Local Variables:
|
||||||
|
%%% mode: latex
|
||||||
|
%%% TeX-master: "../main"
|
||||||
|
%%% End:
|
79
TeX/Plots/gen_dropout.tex
Normal file
79
TeX/Plots/gen_dropout.tex
Normal file
@ -0,0 +1,79 @@
|
|||||||
|
\pgfplotsset{
|
||||||
|
compat=1.11,
|
||||||
|
legend image code/.code={
|
||||||
|
\draw[mark repeat=2,mark phase=2]
|
||||||
|
plot coordinates {
|
||||||
|
(0cm,0cm)
|
||||||
|
(0.3cm,0cm) %% default is (0.3cm,0cm)
|
||||||
|
(0.6cm,0cm) %% default is (0.6cm,0cm)
|
||||||
|
};%
|
||||||
|
}
|
||||||
|
}
|
||||||
|
\begin{figure}
|
||||||
|
\begin{subfigure}[h]{\textwidth}
|
||||||
|
\begin{tikzpicture}
|
||||||
|
\begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed,
|
||||||
|
/pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth,
|
||||||
|
height = 0.6\textwidth, ymin = 0.988, legend style={at={(0.9825,0.0175)},anchor=south east},
|
||||||
|
xlabel = {epoch}, ylabel = {Classification Accuracy}, cycle list/Dark2]
|
||||||
|
\addplot table
|
||||||
|
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||||
|
{Plots/Data/adam_datagen_full_mean.log};
|
||||||
|
\addplot table
|
||||||
|
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||||
|
{Plots/Data/adam_datagen_dropout_02_full_mean.log};
|
||||||
|
\addplot table
|
||||||
|
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||||
|
{Plots/Data/adam_datagen_dropout_04_full_mean.log};
|
||||||
|
\addplot table
|
||||||
|
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||||
|
{Plots/Data/adam_dropout_02_full_mean.log};
|
||||||
|
\addplot table
|
||||||
|
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||||
|
{Plots/Data/adam_dropout_04_full_mean.log};
|
||||||
|
\addplot [dashed] table
|
||||||
|
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||||
|
{Plots/Data/adam_full_mean.log};
|
||||||
|
|
||||||
|
\addlegendentry{\footnotesize{G.}}
|
||||||
|
\addlegendentry{\footnotesize{G. + D. 0.2}}
|
||||||
|
\addlegendentry{\footnotesize{G. + D. 0.4}}
|
||||||
|
\addlegendentry{\footnotesize{D. 0.2}}
|
||||||
|
\addlegendentry{\footnotesize{D. 0.4}}
|
||||||
|
\addlegendentry{\footnotesize{Default}}
|
||||||
|
\end{axis}
|
||||||
|
\end{tikzpicture}
|
||||||
|
\caption{Classification accuracy}
|
||||||
|
\vspace{.25cm}
|
||||||
|
\end{subfigure}
|
||||||
|
\begin{subfigure}[h]{1.0\linewidth}
|
||||||
|
\begin{tabu} to \textwidth {@{} l *6{X[c]} @{}}
|
||||||
|
\multicolumn{7}{c}{Classification Accuracy}\Bstrut
|
||||||
|
\\\hline
|
||||||
|
&\textsc{Adam}&D. 0.2&D. 0.4&G.&G.+D.~0.2&G.+D.~0.4 \Tstrut \Bstrut
|
||||||
|
\\\hline
|
||||||
|
mean&0.9914&0.9918&0.9928&0.9937&0.9938&0.9940 \Tstrut \\
|
||||||
|
max& \\
|
||||||
|
min& \\
|
||||||
|
\multicolumn{7}{c}{Training Accuracy}\Bstrut
|
||||||
|
\\\hline
|
||||||
|
mean&0.9994&0.9990&0.9989&0.9967&0.9954&0.9926 \Tstrut \\
|
||||||
|
max& \\
|
||||||
|
min& \\
|
||||||
|
|
||||||
|
\end{tabu}
|
||||||
|
\caption{Mean and maximum accuracy after 48 epochs of training.}
|
||||||
|
\end{subfigure}
|
||||||
|
\caption{Accuracy for the net given in ... with Dropout (D.),
|
||||||
|
data generation (G.), a combination, or neither (Default) implemented and trained
|
||||||
|
with \textsc{Adam}. For each epoch the 60.000 training samples
|
||||||
|
were used, or for data generation 10.000 steps with each using
|
||||||
|
batches of 60 generated data points. For each configuration the
|
||||||
|
model was trained 5 times and the average accuracies at each epoch
|
||||||
|
are given in (a). Mean, maximum and minimum values of accuracy on
|
||||||
|
the test and training set are given in (b).}
|
||||||
|
\end{figure}
|
||||||
|
%%% Local Variables:
|
||||||
|
%%% mode: latex
|
||||||
|
%%% TeX-master: "../main"
|
||||||
|
%%% End:
|
@ -7,6 +7,10 @@
|
|||||||
\usepackage{tabu}
|
\usepackage{tabu}
|
||||||
\usepackage{graphicx}
|
\usepackage{graphicx}
|
||||||
\usetikzlibrary{calc, 3d}
|
\usetikzlibrary{calc, 3d}
|
||||||
|
\usepgfplotslibrary{colorbrewer}
|
||||||
|
|
||||||
|
\newcommand\Tstrut{\rule{0pt}{2.6ex}} % = `top' strut
|
||||||
|
\newcommand\Bstrut{\rule[-0.9ex]{0pt}{0pt}} % = `bottom' strut
|
||||||
|
|
||||||
\begin{document}
|
\begin{document}
|
||||||
\pgfplotsset{
|
\pgfplotsset{
|
||||||
@ -15,71 +19,80 @@ legend image code/.code={
|
|||||||
\draw[mark repeat=2,mark phase=2]
|
\draw[mark repeat=2,mark phase=2]
|
||||||
plot coordinates {
|
plot coordinates {
|
||||||
(0cm,0cm)
|
(0cm,0cm)
|
||||||
(0.0cm,0cm) %% default is (0.3cm,0cm)
|
(0.3cm,0cm) %% default is (0.3cm,0cm)
|
||||||
(0.0cm,0cm) %% default is (0.6cm,0cm)
|
(0.6cm,0cm) %% default is (0.6cm,0cm)
|
||||||
};%
|
};%
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
\begin{figure}
|
\begin{figure}
|
||||||
\begin{subfigure}[b]{\textwidth}
|
\begin{subfigure}[h]{\textwidth}
|
||||||
\begin{tikzpicture}
|
\begin{tikzpicture}
|
||||||
\begin{axis}[tick style = {draw = none}, width = \textwidth,
|
\begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed,
|
||||||
height = 0.7\textwidth, ymin = 0.92, legend style={at={(0.9825,0.75)},anchor=north east},
|
/pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth,
|
||||||
xlabel = {epoch}, ylabel = {Classification Accuracy}]
|
height = 0.6\textwidth, ymin = 0.988, legend style={at={(0.9825,0.0175)},anchor=south east},
|
||||||
|
xlabel = {epoch}, ylabel = {Classification Accuracy}, cycle list/Dark2]
|
||||||
|
% \addplot [dashed] table
|
||||||
|
% [x=epoch, y=accuracy, col sep=comma, mark = none]
|
||||||
|
% {Data/adam_datagen_full.log};
|
||||||
\addplot table
|
\addplot table
|
||||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||||
{Data/adagrad.log};
|
{Data/adam_datagen_full_mean.log};
|
||||||
|
% \addplot [dashed] table
|
||||||
|
% [x=epoch, y=accuracy, col sep=comma, mark = none]
|
||||||
|
% {Data/adam_datagen_dropout_02_full.log};
|
||||||
\addplot table
|
\addplot table
|
||||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||||
{Data/adadelta.log};
|
{Data/adam_datagen_dropout_02_full_mean.log};
|
||||||
\addplot table
|
\addplot table
|
||||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||||
{Data/adam.log};
|
{Data/adam_datagen_dropout_04_full_mean.log};
|
||||||
|
\addplot table
|
||||||
|
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||||
|
{Data/adam_dropout_02_full_mean.log};
|
||||||
|
\addplot table
|
||||||
|
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||||
|
{Data/adam_dropout_04_full_mean.log};
|
||||||
|
\addplot [dashed] table
|
||||||
|
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||||
|
{Data/adam_full_mean.log};
|
||||||
|
|
||||||
\addlegendentry{\footnotesize{ADAGRAD}}
|
\addlegendentry{\footnotesize{G.}}
|
||||||
\addlegendentry{\footnotesize{ADADELTA}}
|
\addlegendentry{\footnotesize{G. + D. 0.2}}
|
||||||
\addlegendentry{\footnotesize{ADAM}}
|
\addlegendentry{\footnotesize{G. + D. 0.4}}
|
||||||
\addlegendentry{SGD$_{0.01}$}
|
\addlegendentry{\footnotesize{D. 0.2}}
|
||||||
|
\addlegendentry{\footnotesize{D. 0.4}}
|
||||||
|
\addlegendentry{\footnotesize{Default}}
|
||||||
\end{axis}
|
\end{axis}
|
||||||
\end{tikzpicture}
|
\end{tikzpicture}
|
||||||
%\caption{Classification accuracy}
|
\caption{Classification accuracy}
|
||||||
|
\vspace{.25cm}
|
||||||
\end{subfigure}
|
\end{subfigure}
|
||||||
\begin{subfigure}[b]{\textwidth}
|
\begin{subfigure}[h]{1.0\linewidth}
|
||||||
\begin{tikzpicture}
|
\begin{tabu} to \textwidth {@{} l *6{X[c]} @{}}
|
||||||
\begin{axis}[tick style = {draw = none}, width = \textwidth,
|
\multicolumn{7}{c}{Classification Accuracy}\Bstrut
|
||||||
height = 0.7\textwidth, ymax = 0.5,
|
\\\hline
|
||||||
xlabel = {epoch}, ylabel = {Error Measure\vphantom{y}},ytick ={0,0.1,0.2,0.3,0.4,0.45,0.5}, yticklabels =
|
&\textsc{Adam}&D. 0.2&D. 0.4&G.&G.+D.~0.2&G.~,D.~0.4 \Tstrut \Bstrut
|
||||||
{0,0.1,0.2,0.3,0.4,\phantom{0.94},0.5}]
|
\\\hline
|
||||||
\addplot table
|
mean&0.9994&0.9990&0.9989&0.9937&0.9938&0.9940 \Tstrut \\
|
||||||
[x=epoch, y=val_loss, col sep=comma, mark = none] {Data/adagrad.log};
|
max& \\
|
||||||
\addplot table
|
min& \\
|
||||||
[x=epoch, y=val_loss, col sep=comma, mark = none] {Data/adadelta.log};
|
\multicolumn{7}{c}{Training Accuracy}\Bstrut
|
||||||
\addplot table
|
\\\hline
|
||||||
[x=epoch, y=val_loss, col sep=comma, mark = none] {Data/adam.log};
|
mean&0.9914&0.9918&0.9928&0.9937&0.9938&0.9940 \Tstrut \\
|
||||||
|
max& \\
|
||||||
|
min& \\
|
||||||
|
|
||||||
\addlegendentry{\footnotesize{ADAGRAD}}
|
|
||||||
\addlegendentry{\footnotesize{ADADELTA}}
|
|
||||||
\addlegendentry{\footnotesize{ADAM}}
|
|
||||||
\addlegendentry{SGD$_{0.01}$}
|
|
||||||
|
|
||||||
\end{axis}
|
|
||||||
\end{tikzpicture}
|
|
||||||
\caption{Performance metrics during training}
|
|
||||||
\end{subfigure}
|
|
||||||
\\~\\
|
|
||||||
\begin{subfigure}[b]{1.0\linewidth}
|
|
||||||
\begin{tabu} to \textwidth {@{} *3{X[c]}c*3{X[c]} @{}}
|
|
||||||
\multicolumn{3}{c}{Classification Accuracy}
|
|
||||||
&~&\multicolumn{3}{c}{Error Measure}
|
|
||||||
\\\cline{1-3}\cline{5-7}
|
|
||||||
ADAGRAD&ADADELTA&ADAM&&ADAGRAD&ADADELTA&ADAM
|
|
||||||
\\\cline{1-3}\cline{5-7}
|
|
||||||
1&1&1&&1&1&1
|
|
||||||
\end{tabu}
|
\end{tabu}
|
||||||
\caption{Performace metrics after 20 epochs}
|
\caption{Mean and maximum accuracy after 48 epochs of training.}
|
||||||
\end{subfigure}
|
\end{subfigure}
|
||||||
\caption{Performance metrics of the network given in ... trained
|
\caption{Accuracy for the net given in ... with Dropout (D.),
|
||||||
with different optimization algorithms}
|
data generation (G.), a combination, or neither (Default) implemented and trained
|
||||||
|
with \textsc{Adam}. For each epoch the 60.000 training samples
|
||||||
|
were used, or for data generation 10.000 steps with each using
|
||||||
|
batches of 60 generated data points. For each configuration the
|
||||||
|
model was trained 5 times and the average accuracies at each epoch
|
||||||
|
are given in (a). Mean, maximum and minimum values of accuracy on
|
||||||
|
the test and training set are given in (b).}
|
||||||
\end{figure}
|
\end{figure}
|
||||||
|
|
||||||
\begin{center}
|
\begin{center}
|
||||||
@ -87,18 +100,23 @@ plot coordinates {
|
|||||||
\centering
|
\centering
|
||||||
\begin{subfigure}{0.19\textwidth}
|
\begin{subfigure}{0.19\textwidth}
|
||||||
\includegraphics[width=\textwidth]{Data/mnist0.pdf}
|
\includegraphics[width=\textwidth]{Data/mnist0.pdf}
|
||||||
|
\caption{original\\image}
|
||||||
\end{subfigure}
|
\end{subfigure}
|
||||||
\begin{subfigure}{0.19\textwidth}
|
\begin{subfigure}{0.19\textwidth}
|
||||||
\includegraphics[width=\textwidth]{Data/mnist1.pdf}
|
\includegraphics[width=\textwidth]{Data/mnist_gen_zoom.pdf}
|
||||||
|
\caption{random\\zoom}
|
||||||
\end{subfigure}
|
\end{subfigure}
|
||||||
\begin{subfigure}{0.19\textwidth}
|
\begin{subfigure}{0.19\textwidth}
|
||||||
\includegraphics[width=\textwidth]{Data/mnist2.pdf}
|
\includegraphics[width=\textwidth]{Data/mnist_gen_shear.pdf}
|
||||||
|
\caption{random\\shear}
|
||||||
\end{subfigure}
|
\end{subfigure}
|
||||||
\begin{subfigure}{0.19\textwidth}
|
\begin{subfigure}{0.19\textwidth}
|
||||||
\includegraphics[width=\textwidth]{Data/mnist3.pdf}
|
\includegraphics[width=\textwidth]{Data/mnist_gen_rotation.pdf}
|
||||||
|
\caption{random\\rotation}
|
||||||
\end{subfigure}
|
\end{subfigure}
|
||||||
\begin{subfigure}{0.19\textwidth}
|
\begin{subfigure}{0.19\textwidth}
|
||||||
\includegraphics[width=\textwidth]{Data/mnist4.pdf}
|
\includegraphics[width=\textwidth]{Data/mnist_gen_shift.pdf}
|
||||||
|
\caption{random\\positional shift}
|
||||||
\end{subfigure}\\
|
\end{subfigure}\\
|
||||||
\begin{subfigure}{0.19\textwidth}
|
\begin{subfigure}{0.19\textwidth}
|
||||||
\includegraphics[width=\textwidth]{Data/mnist5.pdf}
|
\includegraphics[width=\textwidth]{Data/mnist5.pdf}
|
||||||
|
@ -67,7 +67,7 @@ plot coordinates {
|
|||||||
\end{tabu}
|
\end{tabu}
|
||||||
\caption{Performace metrics after 20 epochs}
|
\caption{Performace metrics after 20 epochs}
|
||||||
\end{subfigure}
|
\end{subfigure}
|
||||||
\caption{Performance metrics of the network given in ... trained
|
\caption{Classification accuracy on the test set and ...Performance metrics of the network given in ... trained
|
||||||
with different optimization algorithms}
|
with different optimization algorithms}
|
||||||
\end{figure}
|
\end{figure}
|
||||||
%%% Local Variables:
|
%%% Local Variables:
|
||||||
|
@ -450,7 +450,7 @@ $\gamma$ is divided by the sum of the squares of the past partial
|
|||||||
derivatives in this parameter. This results in a monotonously
|
derivatives in this parameter. This results in a monotonously
|
||||||
decreasing learning rate for each parameter. This results in a faster
|
decreasing learning rate for each parameter. This results in a faster
|
||||||
decaying learning rate for parameters with large updates, where as
|
decaying learning rate for parameters with large updates, where as
|
||||||
parameters with small updates experience smaller decay. The ADAGRAD
|
parameters with small updates experience smaller decay. The \textsc{AdaGrad}
|
||||||
algorithm is given in Algorithm~\ref{alg:ADAGRAD}.
|
algorithm is given in Algorithm~\ref{alg:ADAGRAD}.
|
||||||
|
|
||||||
\begin{algorithm}[H]
|
\begin{algorithm}[H]
|
||||||
@ -465,15 +465,15 @@ algorithm is given in Algorithm~\ref{alg:ADAGRAD}.
|
|||||||
1, \dots,p$\;
|
1, \dots,p$\;
|
||||||
Apply Update: $x_{t+1} \leftarrow x_t + \Delta x_t$\;
|
Apply Update: $x_{t+1} \leftarrow x_t + \Delta x_t$\;
|
||||||
}
|
}
|
||||||
\caption{\textls{ADAGRAD}}
|
\caption{\textls{\textsc{AdaGrad}}}
|
||||||
\label{alg:ADAGRAD}
|
\label{alg:ADAGRAD}
|
||||||
\end{algorithm}
|
\end{algorithm}
|
||||||
|
|
||||||
Building on \textsc{AdaGrad} \textcite{ADADELTA} developed the ... (ADADELTA)
|
Building on \textsc{AdaGrad} \textcite{ADADELTA} developed the ... (\textsc{AdaDelta})
|
||||||
in order to improve upon the two main drawbacks of ADAGRAD, being the
|
in order to improve upon the two main drawbacks of \textsc{AdaGrad}, being the
|
||||||
continual decay of the learning rate and the need for a manually
|
continual decay of the learning rate and the need for a manually
|
||||||
selected global learning rate $\gamma$.
|
selected global learning rate $\gamma$.
|
||||||
As ADAGRAD accumulates the squared gradients the learning rate will
|
As \textsc{AdaGrad} uses division by the accumulated squared gradients the learning rate will
|
||||||
eventually become infinitely small.
|
eventually become infinitely small.
|
||||||
In order to ensure that even after a significant of iterations
|
In order to ensure that even after a significant of iterations
|
||||||
learning continues to make progress instead of summing the gradients a
|
learning continues to make progress instead of summing the gradients a
|
||||||
@ -500,7 +500,7 @@ by these of the parameter update $\Delta x_t$. This proper
|
|||||||
x^2]_{t-1} + (1+p)\Delta x_t^2$\;
|
x^2]_{t-1} + (1+p)\Delta x_t^2$\;
|
||||||
Apply Update: $x_{t+1} \leftarrow x_t + \Delta x_t$\;
|
Apply Update: $x_{t+1} \leftarrow x_t + \Delta x_t$\;
|
||||||
}
|
}
|
||||||
\caption{ADADELTA, \textcite{ADADELTA}}
|
\caption{\textsc{AdaDelta}, \textcite{ADADELTA}}
|
||||||
\label{alg:gd}
|
\label{alg:gd}
|
||||||
\end{algorithm}
|
\end{algorithm}
|
||||||
|
|
||||||
@ -520,11 +520,11 @@ of the marble.
|
|||||||
This results in the algorithm being able to escape ... due to the
|
This results in the algorithm being able to escape ... due to the
|
||||||
build up momentum from approaching it.
|
build up momentum from approaching it.
|
||||||
|
|
||||||
\begin{itemize}
|
% \begin{itemize}
|
||||||
\item ADAM
|
% \item ADAM
|
||||||
\item momentum
|
% \item momentum
|
||||||
\item ADADETLA \textcite{ADADELTA}
|
% \item ADADETLA \textcite{ADADELTA}
|
||||||
\end{itemize}
|
% \end{itemize}
|
||||||
|
|
||||||
|
|
||||||
\begin{algorithm}[H]
|
\begin{algorithm}[H]
|
||||||
@ -665,7 +665,37 @@ When using this one has to be sure that the labels indeed remain the
|
|||||||
same or else the network will not learn the desired ...
|
same or else the network will not learn the desired ...
|
||||||
In the case of handwritten digits for example a to high rotation angle
|
In the case of handwritten digits for example a to high rotation angle
|
||||||
will ... a nine or six.
|
will ... a nine or six.
|
||||||
The most common transformations are rotation, zoom, shear, brightness, mirroring.
|
The most common transformations are rotation, zoom, shear, brightness,
|
||||||
|
mirroring.
|
||||||
|
|
||||||
|
\begin{figure}[h]
|
||||||
|
\centering
|
||||||
|
\begin{subfigure}{0.19\textwidth}
|
||||||
|
\includegraphics[width=\textwidth]{Plots/Data/mnist0.pdf}
|
||||||
|
\caption{original\\image}
|
||||||
|
\end{subfigure}
|
||||||
|
\begin{subfigure}{0.19\textwidth}
|
||||||
|
\includegraphics[width=\textwidth]{Plots/Data/mnist_gen_zoom.pdf}
|
||||||
|
\caption{random\\zoom}
|
||||||
|
\end{subfigure}
|
||||||
|
\begin{subfigure}{0.19\textwidth}
|
||||||
|
\includegraphics[width=\textwidth]{Plots/Data/mnist_gen_shear.pdf}
|
||||||
|
\caption{random\\shear}
|
||||||
|
\end{subfigure}
|
||||||
|
\begin{subfigure}{0.19\textwidth}
|
||||||
|
\includegraphics[width=\textwidth]{Plots/Data/mnist_gen_rotation.pdf}
|
||||||
|
\caption{random\\rotation}
|
||||||
|
\end{subfigure}
|
||||||
|
\begin{subfigure}{0.19\textwidth}
|
||||||
|
\includegraphics[width=\textwidth]{Plots/Data/mnist_gen_shift.pdf}
|
||||||
|
\caption{random\\positional shift}
|
||||||
|
\end{subfigure}
|
||||||
|
\caption{Example for the manipulations used in ... As all images are
|
||||||
|
of the same intensity brightness manipulation does not seem
|
||||||
|
... Additionally mirroring is not used for ... reasons.}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
\input{Plots/gen_dropout.tex}
|
||||||
|
|
||||||
\todo{Vergleich verschiedene dropout größen auf MNSIT o.ä., subset als
|
\todo{Vergleich verschiedene dropout größen auf MNSIT o.ä., subset als
|
||||||
training set?}
|
training set?}
|
||||||
@ -674,10 +704,41 @@ training set?}
|
|||||||
|
|
||||||
For some applications (medical problems with small amount of patients)
|
For some applications (medical problems with small amount of patients)
|
||||||
the available data can be highly limited.
|
the available data can be highly limited.
|
||||||
In order to get a understanding for the achievable accuracy for such a
|
In these problems the networks are highly ... for overfitting the
|
||||||
scenario in the following we examine the ... and .. with a highly
|
data. In order to get an understanding of the accuracies achievable and the
|
||||||
reduced training set and the impact the above mentioned strategies on
|
impact of the measures to prevent overfitting discussed above we train
|
||||||
combating overfitting have.
|
the network on datasets of varying sizes.
|
||||||
|
First we use the mnist handwriting dataset and then a slightly harder
|
||||||
|
problem given by the mnist fashion dataset which contains PREEDITED
|
||||||
|
pictures of clothes from 10 different categories.
|
||||||
|
|
||||||
|
\input{Plots/fashion_mnist.tex}
|
||||||
|
|
||||||
|
For training for each class a certain number of random datapoints are
|
||||||
|
chosen for training the network. The sizes chosen are:
|
||||||
|
full dataset: ... per class\\
|
||||||
|
1000 per class
|
||||||
|
100 per class
|
||||||
|
10 per class
|
||||||
|
|
||||||
|
the results for training .. are given in ... Here can be seen...
|
||||||
|
|
||||||
|
\begin{figure}[h]
|
||||||
|
\centering
|
||||||
|
\missingfigure{datagen digits}
|
||||||
|
\caption{Sample pictures of the mnist fashion dataset, one per
|
||||||
|
class.}
|
||||||
|
\label{mnist fashion}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
\begin{figure}[h]
|
||||||
|
\centering
|
||||||
|
\missingfigure{datagen fashion}
|
||||||
|
\caption{Sample pictures of the mnist fashion dataset, one per
|
||||||
|
class.}
|
||||||
|
\label{mnist fashion}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
|
||||||
\clearpage
|
\clearpage
|
||||||
\section{Bla}
|
\section{Bla}
|
||||||
|
@ -295,7 +295,7 @@ interpretation.
|
|||||||
Commonly the nodes in the output layer each correspond to a class and
|
Commonly the nodes in the output layer each correspond to a class and
|
||||||
the class chosen as prediction is the one with the highest value at
|
the class chosen as prediction is the one with the highest value at
|
||||||
the corresponding output node.
|
the corresponding output node.
|
||||||
The naive transformation to achieve this is transforming the output
|
This corresponds to a transformation of the output
|
||||||
vector $o$ into a one-hot vector
|
vector $o$ into a one-hot vector
|
||||||
\[
|
\[
|
||||||
\text{pred}_i =
|
\text{pred}_i =
|
||||||
|
@ -92,6 +92,9 @@
|
|||||||
|
|
||||||
\newcommand{\abs}[1]{\ensuremath{\left\vert#1\right\vert}}
|
\newcommand{\abs}[1]{\ensuremath{\left\vert#1\right\vert}}
|
||||||
|
|
||||||
|
\newcommand\Tstrut{\rule{0pt}{2.6ex}} % = `top' strut
|
||||||
|
\newcommand\Bstrut{\rule[-0.9ex]{0pt}{0pt}} % = `bottom' strut
|
||||||
|
|
||||||
\SetKwInput{KwInput}{Input}
|
\SetKwInput{KwInput}{Input}
|
||||||
|
|
||||||
%\newcommand{\myrightarrow}[1]{\xrightarrow{\makebox[2em][c]{$\scriptstyle#1$}}}
|
%\newcommand{\myrightarrow}[1]{\xrightarrow{\makebox[2em][c]{$\scriptstyle#1$}}}
|
||||||
|
@ -6,6 +6,10 @@
|
|||||||
%%% End:
|
%%% End:
|
||||||
\section{Shallow Neural Networks}
|
\section{Shallow Neural Networks}
|
||||||
|
|
||||||
|
In order to get some understanding of the behavior of neural
|
||||||
|
networks we study a simplified class of networks called shallow neural
|
||||||
|
networks in this chapter. We consider shallow neural networks consisting of a single
|
||||||
|
hidden layer and
|
||||||
In order to examine some behavior of neural networks in this chapter
|
In order to examine some behavior of neural networks in this chapter
|
||||||
we consider a simple class of networks, the shallow ones. These
|
we consider a simple class of networks, the shallow ones. These
|
||||||
networks only contain one hidden layer and have a single output node.
|
networks only contain one hidden layer and have a single output node.
|
||||||
|
Loading…
Reference in New Issue
Block a user