diff --git a/TeX/Plots/pfg_test.tex b/TeX/Plots/pfg_test.tex index 66d52b7..92a2917 100644 --- a/TeX/Plots/pfg_test.tex +++ b/TeX/Plots/pfg_test.tex @@ -24,24 +24,21 @@ plot coordinates { \begin{subfigure}[b]{\textwidth} \begin{tikzpicture} \begin{axis}[tick style = {draw = none}, width = \textwidth, - height = 0.7\textwidth, - xtick = {1, 3, 5,7,9,11,13,15,17,19}, - xticklabels = {$2$, $4$, $6$, $8$, - $10$,$12$,$14$,$16$,$18$,$20$}, + height = 0.7\textwidth, ymin = 0.92, legend style={at={(0.9825,0.75)},anchor=north east}, xlabel = {epoch}, ylabel = {Classification Accuracy}] \addplot table - [x=epoch, y=val_accuracy, col sep=comma, mark = none] {Data/gd_10min.log}; + [x=epoch, y=val_accuracy, col sep=comma, mark = none] + {Data/adagrad.log}; \addplot table - [x=epoch, y=val_accuracy, col sep=comma] {Data/GD_05.log}; + [x=epoch, y=val_accuracy, col sep=comma, mark = none] + {Data/adadelta.log}; \addplot table - [x=epoch, y=val_accuracy, col sep=comma] {Data/GD_1.log}; - \addplot table - [x=epoch, y=val_accuracy, col sep=comma] - {Data/SGD_01_b32.log}; + [x=epoch, y=val_accuracy, col sep=comma, mark = none] + {Data/adam.log}; - \addlegendentry{GD$_{0.01}$} - \addlegendentry{GD$_{0.05}$} - \addlegendentry{GD$_{0.1}$} + \addlegendentry{\footnotesize{ADAGRAD}} + \addlegendentry{\footnotesize{ADADELTA}} + \addlegendentry{\footnotesize{ADAM}} \addlegendentry{SGD$_{0.01}$} \end{axis} \end{tikzpicture} @@ -50,25 +47,19 @@ plot coordinates { \begin{subfigure}[b]{\textwidth} \begin{tikzpicture} \begin{axis}[tick style = {draw = none}, width = \textwidth, - height = 0.7\textwidth, - ytick = {0, 1, 2, 3, 4}, - yticklabels = {$0$, $1$, $\phantom{0.}2$, $3$, $4$}, - xtick = {1, 3, 5,7,9,11,13,15,17,19}, - xticklabels = {$2$, $4$, $6$, $8$, - $10$,$12$,$14$,$16$,$18$,$20$}, - xlabel = {epoch}, ylabel = {Error Measure}] - \addplot table - [x=epoch, y=val_loss, col sep=comma] {Data/GD_01.log}; + height = 0.7\textwidth, ymax = 0.5, + xlabel = {epoch}, ylabel = {Error 
Measure\vphantom{y}},ytick ={0,0.1,0.2,0.3,0.4,0.45,0.5}, yticklabels = + {0,0.1,0.2,0.3,0.4,\phantom{0.94},0.5}] \addplot table - [x=epoch, y=val_loss, col sep=comma] {Data/GD_05.log}; + [x=epoch, y=val_loss, col sep=comma, mark = none] {Data/adagrad.log}; \addplot table - [x=epoch, y=val_loss, col sep=comma] {Data/GD_1.log}; + [x=epoch, y=val_loss, col sep=comma, mark = none] {Data/adadelta.log}; \addplot table - [x=epoch, y=val_loss, col sep=comma] {Data/SGD_01_b32.log}; + [x=epoch, y=val_loss, col sep=comma, mark = none] {Data/adam.log}; - \addlegendentry{GD$_{0.01}$} - \addlegendentry{GD$_{0.05}$} - \addlegendentry{GD$_{0.1}$} + \addlegendentry{\footnotesize{ADAGRAD}} + \addlegendentry{\footnotesize{ADADELTA}} + \addlegendentry{\footnotesize{ADAM}} \addlegendentry{SGD$_{0.01}$} \end{axis} @@ -77,13 +68,13 @@ plot coordinates { \end{subfigure} \\~\\ \begin{subfigure}[b]{1.0\linewidth} - \begin{tabu} to \textwidth {@{} *4{X[c]}c*4{X[c]} @{}} - \multicolumn{4}{c}{Classification Accuracy} - &~&\multicolumn{4}{c}{Error Measure} - \\\cline{1-4}\cline{6-9} - GD$_{0.01}$&GD$_{0.05}$&GD$_{0.1}$&SGD$_{0.01}$&&GD$_{0.01}$&GD$_{0.05}$&GD$_{0.1}$&SGD$_{0.01}$ - \\\cline{1-4}\cline{6-9} - 1&1&1&1&&1&1&1&1 + \begin{tabu} to \textwidth {@{} *3{X[c]}c*3{X[c]} @{}} + \multicolumn{3}{c}{Classification Accuracy} + &~&\multicolumn{3}{c}{Error Measure} + \\\cline{1-3}\cline{5-7} + ADAGRAD&ADADELTA&ADAM&&ADAGRAD&ADADELTA&ADAM + \\\cline{1-3}\cline{5-7} + 1&1&1&&1&1&1 \end{tabu} \caption{Performace metrics after 20 epochs} \end{subfigure} diff --git a/TeX/Plots/sdg_comparison.tex b/TeX/Plots/sdg_comparison.tex new file mode 100644 index 0000000..c42ffc4 --- /dev/null +++ b/TeX/Plots/sdg_comparison.tex @@ -0,0 +1,76 @@ +\pgfplotsset{ +compat=1.11, +legend image code/.code={ +\draw[mark repeat=2,mark phase=2] +plot coordinates { +(0cm,0cm) +(0.0cm,0cm) %% default is (0.3cm,0cm) +(0.0cm,0cm) %% default is (0.6cm,0cm) +};% +} +} +\begin{figure} + \begin{subfigure}[b]{\textwidth} + 
\begin{tikzpicture} + \begin{axis}[tick style = {draw = none}, width = \textwidth, + height = 0.6\textwidth, ymin = 0.92, legend style={at={(0.9825,0.75)},anchor=north east}, + xlabel = {epoch}, ylabel = {Classification Accuracy}] + \addplot table + [x=epoch, y=val_accuracy, col sep=comma, mark = none] + {Plots/Data/adagrad.log}; + \addplot table + [x=epoch, y=val_accuracy, col sep=comma, mark = none] + {Plots/Data/adadelta.log}; + \addplot table + [x=epoch, y=val_accuracy, col sep=comma, mark = none] + {Plots/Data/adam.log}; + + \addlegendentry{\footnotesize{ADAGRAD}} + \addlegendentry{\footnotesize{ADADELTA}} + \addlegendentry{\footnotesize{ADAM}} + \addlegendentry{SGD$_{0.01}$} + \end{axis} + \end{tikzpicture} + %\caption{Classification accuracy} + \end{subfigure} + \begin{subfigure}[b]{\textwidth} + \begin{tikzpicture} + \begin{axis}[tick style = {draw = none}, width = \textwidth, + height = 0.6\textwidth, ymax = 0.5, + xlabel = {epoch}, ylabel = {Error Measure\vphantom{y}},ytick ={0,0.1,0.2,0.3,0.4,0.45,0.5}, yticklabels = + {0,0.1,0.2,0.3,0.4,\phantom{0.94},0.5}] + \addplot table + [x=epoch, y=val_loss, col sep=comma, mark = none] {Plots/Data/adagrad.log}; + \addplot table + [x=epoch, y=val_loss, col sep=comma, mark = none] {Plots/Data/adadelta.log}; + \addplot table + [x=epoch, y=val_loss, col sep=comma, mark = none] {Plots/Data/adam.log}; + + \addlegendentry{\footnotesize{ADAGRAD}} + \addlegendentry{\footnotesize{ADADELTA}} + \addlegendentry{\footnotesize{ADAM}} + \addlegendentry{SGD$_{0.01}$} + + \end{axis} + \end{tikzpicture} + \caption{Performance metrics during training} + \end{subfigure} + \\~\\ + \begin{subfigure}[b]{1.0\linewidth} + \begin{tabu} to \textwidth {@{} *3{X[c]}c*3{X[c]} @{}} + \multicolumn{3}{c}{Classification Accuracy} + &~&\multicolumn{3}{c}{Error Measure} + \\\cline{1-3}\cline{5-7} + ADAGRAD&ADADELTA&ADAM&&ADAGRAD&ADADELTA&ADAM + \\\cline{1-3}\cline{5-7} + 1&1&1&&1&1&1 + \end{tabu} + \caption{Performace metrics after 20 epochs} + 
\end{subfigure} + \caption{Performance metrics of the network given in ... trained + with different optimization algorithms} +\end{figure} +%%% Local Variables: +%%% mode: latex +%%% TeX-master: "../main" +%%% End: diff --git a/TeX/bibliograpy.bib b/TeX/bibliograpy.bib index 430f468..920c253 100644 --- a/TeX/bibliograpy.bib +++ b/TeX/bibliograpy.bib @@ -7,7 +7,6 @@ copyright = {In Copyright - Non-Commercial Use Permitted}, keywords = {early stopping; implicit regularization; machine learning; neural networks; spline; regression; gradient descent; artificial intelligence}, size = {53 p.}, - address = {Ithaca, NY}, abstract = {Today, various forms of neural networks are trained to perform approximation tasks in many fields. However, the solutions obtained are not fully understood. Empirical results suggest that typical training algorithms favor regularized solutions.These observations motivate us to analyze properties of the solutions found by gradient descent initialized close to zero, that is frequently employed to perform the training task. As a starting point, we consider one dimensional (shallow) ReLU neural networks in which weights are chosen randomly and only the terminal layer is trained. We show that the resulting solution converges to the smooth spline interpolation of the training data as the number of hidden nodes tends to infinity. Moreover, we derive a correspondence between the early stopped gradient descent and the smoothing spline regression. 
This might give valuable insight on the properties of the solutions obtained using gradient descent methods in general settings.}, DOI = {10.3929/ethz-b-000402003}, title = {How Implicit Regularization of Neural Networks Affects the Learned Function – Part I}, @@ -89,4 +88,50 @@ url={https://doi.org/10.1038/323533a0} timestamp = {Wed, 17 Apr 2019 17:23:45 +0200}, biburl = {https://dblp.org/rec/journals/corr/HeZRS15.bib}, bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@book{PRML, + title = {Pattern Recognition and Machine Learning}, + author = {Christopher M. Bishop}, + publisher = {Springer}, + isbn = {9780387310732,0387310738}, + year = 2006, + series = {Information science and statistics}, + edition = {1st ed. 2006. Corr. 2nd printing}, + pages = {209} +} + +@article{ADAGRAD, +author = {Duchi, John and Hazan, Elad and Singer, Yoram}, +title = {Adaptive Subgradient Methods for Online Learning and Stochastic Optimization}, +year = {2011}, +issue_date = {2/1/2011}, +publisher = {JMLR.org}, +volume = {12}, +number = {null}, +issn = {1532-4435}, +journal = {J. Mach. Learn. Res.}, +month = jul, +pages = {2121–2159}, +numpages = {39} +} + +@article{DBLP:journals/corr/DauphinPGCGB14, + author = {Yann N. 
Dauphin and + Razvan Pascanu and + {\c{C}}aglar G{\"{u}}l{\c{c}}ehre and + Kyunghyun Cho and + Surya Ganguli and + Yoshua Bengio}, + title = {Identifying and attacking the saddle point problem in high-dimensional + non-convex optimization}, + journal = {CoRR}, + volume = {abs/1406.2572}, + year = {2014}, + url = {http://arxiv.org/abs/1406.2572}, + archivePrefix = {arXiv}, + eprint = {1406.2572}, + timestamp = {Mon, 22 Jul 2019 13:15:46 +0200}, + biburl = {https://dblp.org/rec/journals/corr/DauphinPGCGB14.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} } \ No newline at end of file diff --git a/TeX/further_applications_of_nn.tex b/TeX/further_applications_of_nn.tex index 1e3302e..5af0c3e 100644 --- a/TeX/further_applications_of_nn.tex +++ b/TeX/further_applications_of_nn.tex @@ -384,22 +384,23 @@ network using true gradients when training for the same mount of time. \subsection{Modified Stochastic Gradient Descent} There is a inherent problem in the sensitivity of the gradient descent algorithm regarding the learning rate $\gamma$. -The difficulty of choosing the learning rate is -in the Figure~\ref{sgd_vs_gd}. For small rates the progress in each iteration is small +The difficulty of choosing the learning rate can be seen +in Figure~\ref{sgd_vs_gd}. For small rates the progress in each iteration is small but as the rate is enlarged the algorithm can become unstable and diverge. Even for learning rates small enough to ensure the parameters do not diverge to infinity steep valleys can hinder the progress of the algorithm as with to large leaning rates gradient descent -``bounces between'' the walls of the valley rather then follow ... +``bounces between'' the walls of the valley rather then follow a +downward trend in the valley. % \[ % w - \gamma \nabla_w ... % \] -thus the weights grow to infinity. +%thus the weights grow to infinity. 
\todo{unstable learning rate besser erklären} -To combat this problem it is proposed \todo{quelle} alter the learning +To combat this problem \todo{quelle} propose to alter the learning rate over the course of training, often called leaning rate scheduling. The most popular implementations of this are time based decay @@ -417,16 +418,68 @@ and exponential decay, where the learning rate is decreased after each epoch, \gamma_n = \gamma_o e^{-n d}. \] These methods are able to increase the accuracy of a model by a large -margin as seen in the training of RESnet by \textcite{resnet} - \todo{vielleicht grafik - einbauen}. However stochastic gradient descent with weight decay is -still highly sensitive to the choice of the hyperparameters $\gamma$ +margin as seen in the training of RESnet by \textcite{resnet}. +\todo{vielleicht grafik + einbauen} +However stochastic gradient descent with learning rate decay is +still highly sensitive to the choice of the hyperparameters $\gamma_0$ and $d$. In order to mitigate this problem a number of algorithms have been developed to regularize the learning rate with as minimal hyperparameter guesswork as possible. -One of these algorithms is the ADADELTA algorithm developed by \textcite{ADADELTA} -\clearpage + +We will examine and compare a ... algorithms that use an adaptive +learning rate. +They all scale the gradient for the update depending on past gradients +for each weight individually. + +The algorithms build upon each other with the adaptive gradient +algorithm (ADAGRAD, \textcite{ADAGRAD}) +laying the groundwork. Here for each parameter update the learning rate +is given by a constant +$\gamma$ divided by the square root of the sum of the squares of the past partial +derivatives in this parameter. This results in a monotonically +decreasing learning rate for each parameter. This results in a faster +decaying learning rate for parameters with large updates, whereas +parameters with small updates experience smaller decay. 
The ADAGRAD +algorithm is given in Algorithm~\ref{alg:ADAGRAD}. + +\begin{algorithm}[H] + \SetAlgoLined + \KwInput{Global learning rate $\gamma$} + \KwInput{Constant $\varepsilon$} + \KwInput{Initial parameter vector $x_1 \in \mathbb{R}^p$} + \For{$t \in \left\{1,\dots,T\right\};\, t+1$}{ + Compute Gradient: $g_t$\; + Compute Update: $\Delta x_{t,i} \leftarrow + -\frac{\gamma}{\norm{g_{1:t,i}}_2 + \varepsilon} g_{t,i}, \forall i = + 1, \dots,p$\; + Apply Update: $x_{t+1} \leftarrow x_t + \Delta x_t$\; + } + \caption{ADAGRAD} + \label{alg:ADAGRAD} +\end{algorithm} + +Building on ADAGRAD \textcite{ADADELTA} developed the ... (ADADELTA) +in order to improve upon the two main drawbacks of ADAGRAD, being the +continual decay of the learning rate and the need for a manually +selected global learning rate $\gamma$. +As ADAGRAD accumulates the squared gradients the learning rate will +eventually become infinitesimally small. +In order to ensure that even after a significant number of iterations +learning continues to make progress instead of summing the squared gradients an +exponentially decaying average of the past squared gradients is used to .... +Additionally the fixed global learning rate $\gamma$ is substituted by +an exponentially decaying average of the past parameter updates. +The usage of the past parameter updates is motivated by ensuring that +if the parameter vector had some hypothetical units they would be matched +by those of the parameter update $\Delta x_t$. This proper +\todo{erklärung unit} + + +While the stochastic gradient algorithm is less susceptible to local +extrema than gradient descent the problem still persists especially +with saddle points. 
\textcite{DBLP:journals/corr/DauphinPGCGB14} \begin{itemize} \item ADAM @@ -454,9 +507,10 @@ One of these algorithms is the ADADELTA algorithm developed by \textcite{ADADELT \label{alg:gd} \end{algorithm} +\input{Plots/sdg_comparison.tex} % \subsubsubsection{Stochastic Gradient Descent} - +\clearpage \subsection{Combating Overfitting} % As in many machine learning applications if the model is overfit in @@ -489,7 +543,7 @@ iteration, this practice is called Dropout and was introduced by \todo{Vergleich verschiedene dropout größen auf MNSIT o.ä., subset als training set?} - +\subsubsection{Effectiveness for small training sets} %%% Local Variables: %%% mode: latex diff --git a/TeX/introduction_nn.tex b/TeX/introduction_nn.tex index c6a6df5..19d3f76 100644 --- a/TeX/introduction_nn.tex +++ b/TeX/introduction_nn.tex @@ -194,45 +194,113 @@ using data with the expected response (label) attached to each data-point in fitting the model, where usually some distance between the model output and the labels is minimized. -\subsubsection{Interpreting the Output} - -In order to properly interpret the output of a neural network and -training it, depending on the problem it might be advantageous to -transform the output form the last layer. Given the nature of the -neural network the value at each output node is a real number. This is -desirable for applications where the desired output is a real numbered -vector (e.g. steering inputs for a autonomous car), however for -classification problems it is desirable to transform this -output. Often classification problems are modeled in such a way that -each output node corresponds to a class. Then the output vector needs -to be normalized in order to give a prediction. The naive approach is -to transform the output vector $o$ into a one-hot vector $p$ -corresponding to a $0$ -entry for all classes except one, which is the predicted class. 
- +\subsubsection{Interpreting the Output / Classification vs Regression + / Nonlinearity in last layer} + +Given the nature of the neural net the outputs of the last layer are +real numbers. For regression tasks this is desirable, for +classification problems however some transformations might be +necessary. +As the goal in the latter is to predict a certain class or classes for +an object the output needs to be of a form that allows this +interpretation. +Commonly the nodes in the output layer each correspond to a class and +the class chosen as prediction is the one with the highest value at +the corresponding output node. +The naive transformation to achieve this is transforming the output +vector $o$ into a one-hot vector \[ - p_i = + \text{pred}_i = \begin{cases} - 1,& i < j, \forall i,j \in \text{arg}\max o_i, \\ - 0,& \text{else.} + 1,& \text{if } o_i = \max_j o_j \\ + 0,& \text{else}. \end{cases} -\]\todo{besser formulieren} - -However this imposes difficulties in training the network as with this -addition the model is no longer differentiable which imitates the -ways the model can be trained. Additionally information about the -``certainty'' for each class in the prediction gets lost. A popular -way to circumvent this problem is to normalize the output vector is -such a way that the entries add up to one, this allows for the -interpretation of probabilities assigned to each class. +\] +This however makes training the model with gradient-based methods impossible, as the derivative of +the transformation is either zero or undefined. +A continuous transformation that is close to the argmax one is given by +softmax +\[ + \text{softmax}(o)_i = \frac{e^{o_i}}{\sum_j e^{o_j}}. +\] +The softmax function transforms the range of the output to the interval $[0,1]$ +and the individual values sum to one, thus the output can be interpreted as +a probability for each class given the input. 
+In addition to being differentiable this allows for evaluating the +certainty of a prediction, rather than just whether it is accurate. + +\todo{vielleicht additiv invarianz} +% Another property that makes softmax attractive is the invariance to addition +% \[ +% \text{softmax}(o) = \text{softmax}(o + c) +% \] + + +% In order to properly interpret the output of a neural network and +% training it, depending on the problem it might be advantageous to +% transform the output from the last layer. Given the nature of the +% neural network the value at each output node is a real number. This is +% desirable for applications where the desired output is a real numbered +% vector (e.g. steering inputs for an autonomous car), however for +% classification problems it is desirable to transform this +% output. Often classification problems are modeled in such a way that +% each output node corresponds to a class. Then the output vector needs +% to be normalized in order to give a prediction. The naive approach is +% to transform the output vector $o$ into a one-hot vector $p$ +% corresponding to a $0$ +% entry for all classes except one, which is the predicted class. + +% \[ +% p_i = +% \begin{cases} +% 1,& i < j, \forall i,j \in \text{arg}\max o_i, \\ +% 0,& \text{else.} +% \end{cases} +% \]\todo{besser formulieren} + +% However this imposes difficulties in training the network as with this +% addition the model is no longer differentiable which limits the +% ways the model can be trained. Additionally information about the +% ``certainty'' for each class in the prediction gets lost. A popular +% way to circumvent this problem is to normalize the output vector in +% such a way that the entries add up to one, this allows for the +% interpretation of probabilities assigned to each class. \subsubsection{Error Measurement} In order to make assessment about the quality of a network $\mathcal{NN}$ and train -it we need to discuss how we measure error. 
As for regression problems -the output is continuous in contrast to the class predictions in a -classification problem, we need to discuss these problems separately. -\paragraph{Regression Problems} +it we need to discuss how we measure error. The choice of the error +function is highly dependent on the type of the problem. For +regression problems a commonly used error measure is the mean squared +error (MSE) +which for a function $f$ and data $(x_i,y_i), i=1,\dots,n$ is given by +\[ + \mathrm{MSE}(f) = \frac{1}{n} \sum_{i=1}^n \left(f(x_i) - y_i\right)^2. +\] +However depending on the problem error measures with different +properties might be needed, for example in some contexts it is +required to consider a proportional rather than absolute error as is +common in time series models. \todo{komisch} + +As discussed above the output of a neural network for a classification +problem can be interpreted as a probability distribution over the classes +conditioned on the input. In this case it is \todo{can?} desirable to +use error functions designed to compare probability distributions. A +widespread error function for this use case is the cross entropy (\textcite{PRML}), +which for two discrete distributions $p, q$ with the same support $C$ is given by +\[ + H(p, q) = \sum_{c \in C} p(c) \ln\left(\frac{1}{q(c)}\right), +\] +which compares a distribution $q$ to a true underlying distribution $p$. +For a data set $(x_i,y_i), i = 1,\dots,n$ where each $y_{i,c}$ +corresponds to the probability of class $c$ given $x_i$ and predictor +$f$ we get the loss function +\[ + \mathrm{CE}(f) = \sum_{i=1}^n H(y_i, f(x_i)). 
+\] + +-Maximum Likelihood +-Derivative with softmax pseudo linear -> fast improvements possible \subsubsection{Gradient Descent Algorithm} diff --git a/TeX/main.tex b/TeX/main.tex index e200aa0..a413bc8 100644 --- a/TeX/main.tex +++ b/TeX/main.tex @@ -34,7 +34,7 @@ \usepackage{todonotes} \usepackage{lipsum} \usepackage[ruled,vlined]{algorithm2e} -\usepackage{showframe} +%\usepackage{showframe} \usepackage[protrusion=true, expansion=true, kerning=true]{microtype} \captionsetup[sub]{justification=centering}