main
Tobias Arndt 4 years ago
parent b0afc88091
commit b716f7688a

@@ -24,24 +24,21 @@ plot coordinates {
\begin{subfigure}[b]{\textwidth}
\begin{tikzpicture}
\begin{axis}[tick style = {draw = none}, width = \textwidth,
height = 0.7\textwidth, ymin = 0.92, legend style={at={(0.9825,0.75)},anchor=north east},
xlabel = {epoch}, ylabel = {Classification Accuracy}]
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Data/adagrad.log};
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Data/adadelta.log};
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Data/adam.log};
\addlegendentry{\footnotesize{ADAGRAD}}
\addlegendentry{\footnotesize{ADADELTA}}
\addlegendentry{\footnotesize{ADAM}}
\addlegendentry{SGD$_{0.01}$}
\end{axis}
\end{tikzpicture}
@@ -50,25 +47,19 @@ plot coordinates {
\begin{subfigure}[b]{\textwidth}
\begin{tikzpicture}
\begin{axis}[tick style = {draw = none}, width = \textwidth,
height = 0.7\textwidth, ymax = 0.5,
xlabel = {epoch}, ylabel = {Error Measure\vphantom{y}},
ytick = {0,0.1,0.2,0.3,0.4,0.45,0.5},
yticklabels = {0,0.1,0.2,0.3,0.4,\phantom{0.94},0.5}]
\addplot table
[x=epoch, y=val_loss, col sep=comma, mark = none] {Data/adagrad.log};
\addplot table
[x=epoch, y=val_loss, col sep=comma, mark = none] {Data/adadelta.log};
\addplot table
[x=epoch, y=val_loss, col sep=comma, mark = none] {Data/adam.log};
\addlegendentry{\footnotesize{ADAGRAD}}
\addlegendentry{\footnotesize{ADADELTA}}
\addlegendentry{\footnotesize{ADAM}}
\addlegendentry{SGD$_{0.01}$}
\end{axis}
@@ -77,13 +68,13 @@ plot coordinates {
\end{subfigure}
\\~\\
\begin{subfigure}[b]{1.0\linewidth}
\begin{tabu} to \textwidth {@{} *3{X[c]}c*3{X[c]} @{}}
\multicolumn{3}{c}{Classification Accuracy}
&~&\multicolumn{3}{c}{Error Measure}
\\\cline{1-3}\cline{5-7}
ADAGRAD&ADADELTA&ADAM&&ADAGRAD&ADADELTA&ADAM
\\\cline{1-3}\cline{5-7}
1&1&1&&1&1&1
\end{tabu}
\caption{Performance metrics after 20 epochs}
\end{subfigure}

@@ -0,0 +1,76 @@
\pgfplotsset{
compat=1.11,
legend image code/.code={
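% all three sample coordinates below coincide at the origin, so the default
% 0.6cm legend line collapses to a point and only the entry text is shown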
\draw[mark repeat=2,mark phase=2]
plot coordinates {
(0cm,0cm)
(0.0cm,0cm) %% default is (0.3cm,0cm)
(0.0cm,0cm) %% default is (0.6cm,0cm)
};%
}
}
\begin{figure}
\begin{subfigure}[b]{\textwidth}
\begin{tikzpicture}
\begin{axis}[tick style = {draw = none}, width = \textwidth,
height = 0.6\textwidth, ymin = 0.92, legend style={at={(0.9825,0.75)},anchor=north east},
xlabel = {epoch}, ylabel = {Classification Accuracy}]
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Plots/Data/adagrad.log};
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Plots/Data/adadelta.log};
\addplot table
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
{Plots/Data/adam.log};
\addlegendentry{\footnotesize{ADAGRAD}}
\addlegendentry{\footnotesize{ADADELTA}}
\addlegendentry{\footnotesize{ADAM}}
\addlegendentry{SGD$_{0.01}$}
\end{axis}
\end{tikzpicture}
%\caption{Classification accuracy}
\end{subfigure}
\begin{subfigure}[b]{\textwidth}
\begin{tikzpicture}
\begin{axis}[tick style = {draw = none}, width = \textwidth,
height = 0.6\textwidth, ymax = 0.5,
xlabel = {epoch}, ylabel = {Error Measure\vphantom{y}},ytick ={0,0.1,0.2,0.3,0.4,0.45,0.5}, yticklabels =
{0,0.1,0.2,0.3,0.4,\phantom{0.94},0.5}]
\addplot table
[x=epoch, y=val_loss, col sep=comma, mark = none] {Plots/Data/adagrad.log};
\addplot table
[x=epoch, y=val_loss, col sep=comma, mark = none] {Plots/Data/adadelta.log};
\addplot table
[x=epoch, y=val_loss, col sep=comma, mark = none] {Plots/Data/adam.log};
\addlegendentry{\footnotesize{ADAGRAD}}
\addlegendentry{\footnotesize{ADADELTA}}
\addlegendentry{\footnotesize{ADAM}}
\addlegendentry{SGD$_{0.01}$}
\end{axis}
\end{tikzpicture}
\caption{Performance metrics during training}
\end{subfigure}
\\~\\
\begin{subfigure}[b]{1.0\linewidth}
\begin{tabu} to \textwidth {@{} *3{X[c]}c*3{X[c]} @{}}
\multicolumn{3}{c}{Classification Accuracy}
&~&\multicolumn{3}{c}{Error Measure}
\\\cline{1-3}\cline{5-7}
ADAGRAD&ADADELTA&ADAM&&ADAGRAD&ADADELTA&ADAM
\\\cline{1-3}\cline{5-7}
1&1&1&&1&1&1
\end{tabu}
\caption{Performance metrics after 20 epochs}
\end{subfigure}
\caption{Performance metrics of the network given in ... trained
with different optimization algorithms}
\end{figure}
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "../main"
%%% End:

@@ -7,7 +7,6 @@
copyright = {In Copyright - Non-Commercial Use Permitted},
keywords = {early stopping; implicit regularization; machine learning; neural networks; spline; regression; gradient descent; artificial intelligence},
size = {53 p.},
abstract = {Today, various forms of neural networks are trained to perform approximation tasks in many fields. However, the solutions obtained are not fully understood. Empirical results suggest that typical training algorithms favor regularized solutions. These observations motivate us to analyze properties of the solutions found by gradient descent initialized close to zero, that is frequently employed to perform the training task. As a starting point, we consider one dimensional (shallow) ReLU neural networks in which weights are chosen randomly and only the terminal layer is trained. We show that the resulting solution converges to the smooth spline interpolation of the training data as the number of hidden nodes tends to infinity. Moreover, we derive a correspondence between the early stopped gradient descent and the smoothing spline regression. This might give valuable insight on the properties of the solutions obtained using gradient descent methods in general settings.},
DOI = {10.3929/ethz-b-000402003},
title = {How Implicit Regularization of Neural Networks Affects the Learned Function Part I},
@@ -89,4 +88,50 @@ url={https://doi.org/10.1038/323533a0}
timestamp = {Wed, 17 Apr 2019 17:23:45 +0200},
biburl = {https://dblp.org/rec/journals/corr/HeZRS15.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@book{PRML,
title = {Pattern Recognition and Machine Learning},
author = {Christopher M. Bishop},
publisher = {Springer},
isbn = {9780387310732,0387310738},
year = 2006,
series = {Information science and statistics},
edition = {1st ed. 2006. Corr. 2nd printing},
pages = {209}
}
@article{ADAGRAD,
author = {Duchi, John and Hazan, Elad and Singer, Yoram},
title = {Adaptive Subgradient Methods for Online Learning and Stochastic Optimization},
year = {2011},
issue_date = {2/1/2011},
publisher = {JMLR.org},
volume = {12},
issn = {1532-4435},
journal = {J. Mach. Learn. Res.},
month = jul,
pages = {2121--2159},
numpages = {39}
}
@article{DBLP:journals/corr/DauphinPGCGB14,
author = {Yann N. Dauphin and
Razvan Pascanu and
{\c{C}}aglar G{\"{u}}l{\c{c}}ehre and
Kyunghyun Cho and
Surya Ganguli and
Yoshua Bengio},
title = {Identifying and attacking the saddle point problem in high-dimensional
non-convex optimization},
journal = {CoRR},
volume = {abs/1406.2572},
year = {2014},
url = {http://arxiv.org/abs/1406.2572},
archivePrefix = {arXiv},
eprint = {1406.2572},
timestamp = {Mon, 22 Jul 2019 13:15:46 +0200},
biburl = {https://dblp.org/rec/journals/corr/DauphinPGCGB14.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}

@@ -384,22 +384,23 @@ network using true gradients when training for the same amount of time.
\subsection{Modified Stochastic Gradient Descent}
There is an inherent problem in the sensitivity of the gradient descent
algorithm regarding the learning rate $\gamma$.
The difficulty of choosing the learning rate can be seen
in Figure~\ref{sgd_vs_gd}. For small rates the progress in each iteration is small,
but as the rate is increased the algorithm can become unstable and
diverge. Even for learning rates small enough to ensure the parameters
do not diverge to infinity, steep valleys can hinder the progress of
the algorithm, as with too large learning rates gradient descent
``bounces between'' the walls of the valley rather than following a
downward trend in the valley.
% \[
% w - \gamma \nabla_w ...
% \]
%thus the weights grow to infinity.
\todo{explain unstable learning rate better}
To combat this problem \todo{source} propose to alter the learning
rate over the course of training, often called learning rate
scheduling. The most popular implementations of this are time-based
decay
@@ -417,16 +418,68 @@ and exponential decay, where the learning rate is decreased after each epoch,
\gamma_n = \gamma_0 e^{-n d}.
\]
These methods are able to increase the accuracy of a model by a large
margin, as seen in the training of ResNet by \textcite{resnet}.
\todo{maybe include a figure}
However stochastic gradient descent with learning rate decay is
still highly sensitive to the choice of the hyperparameters $\gamma_0$
and $d$.
In order to mitigate this problem a number of algorithms have been
developed to adapt the learning rate during training with as little
hyperparameter guesswork as possible.
We will examine and compare a ... algorithms that use an adaptive
learning rate.
They all scale the gradient for the update depending on past gradients
for each weight individually.
The algorithms build on each other, with the adaptive gradient
algorithm (ADAGRAD, \textcite{ADAGRAD})
laying the groundwork. Here for each parameter update the learning rate
is given by a constant $\gamma$ divided by the norm of all past
partial derivatives with respect to this parameter. This results in a
monotonically decreasing learning rate for each parameter: the
learning rate decays faster for parameters with large partial
derivatives, whereas parameters with small ones experience a slower
decay. The ADAGRAD algorithm is given in Algorithm~\ref{alg:ADAGRAD}.
\begin{algorithm}[H]
\SetAlgoLined
\KwInput{Global learning rate $\gamma$}
\KwInput{Constant $\varepsilon$}
\KwInput{Initial parameter vector $x_1 \in \mathbb{R}^p$}
\For{$t \in \left\{1,\dots,T\right\};\, t+1$}{
Compute Gradient: $g_t$\;
Compute Update: $\Delta x_{t,i} \leftarrow
-\frac{\gamma}{\norm{g_{1:t,i}}_2 + \varepsilon} g_{t,i}, \forall i =
1, \dots,p$\;
Apply Update: $x_{t+1} \leftarrow x_t + \Delta x_t$\;
}
\caption{ADAGRAD}
\label{alg:ADAGRAD}
\end{algorithm}
Building on ADAGRAD \textcite{ADADELTA} developed the ... (ADADELTA)
in order to improve upon the two main drawbacks of ADAGRAD, being the
continual decay of the learning rate and the need for a manually
selected global learning rate $\gamma$.
As ADAGRAD accumulates the squared gradients, the learning rate will
eventually become arbitrarily small.
In order to ensure that learning continues to make progress even
after a significant number of iterations, instead of summing the
squared gradients an exponentially decaying average of the past
squared gradients is used to ...
Additionally the fixed global learning rate $\gamma$ is substituted by
an exponentially decaying average of the past squared parameter updates.
The usage of the past parameter updates is motivated by ensuring that,
if the parameter vector had some hypothetical units, they would be matched
by those of the parameter update $\Delta x_t$.
\todo{explain units}
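Written in the style of Algorithm~\ref{alg:ADAGRAD}, a sketch of the
resulting update rule reads as follows; this is our rendering of the
method described by \textcite{ADADELTA}, all operations are
component-wise, $\rho$ denotes the decay rate, $\varepsilon$ the
conditioning constant, and the label \texttt{alg:ADADELTA} is ours.

\begin{algorithm}[H]
\SetAlgoLined
\KwInput{Decay rate $\rho$, constant $\varepsilon$}
\KwInput{Initial parameter vector $x_1 \in \mathbb{R}^p$}
Initialize accumulators: $E[g^2]_0 \leftarrow 0$, $E[\Delta x^2]_0 \leftarrow 0$\;
\For{$t \in \left\{1,\dots,T\right\};\, t+1$}{
Compute Gradient: $g_t$\;
Accumulate Gradient: $E[g^2]_t \leftarrow \rho E[g^2]_{t-1} + (1-\rho) g_t^2$\;
Compute Update: $\Delta x_t \leftarrow -\frac{\sqrt{E[\Delta x^2]_{t-1}
+ \varepsilon}}{\sqrt{E[g^2]_t + \varepsilon}} g_t$\;
Accumulate Updates: $E[\Delta x^2]_t \leftarrow \rho E[\Delta x^2]_{t-1} + (1-\rho) \Delta x_t^2$\;
Apply Update: $x_{t+1} \leftarrow x_t + \Delta x_t$\;
}
\caption{ADADELTA}
\label{alg:ADADELTA}
\end{algorithm}
Note that the numerator uses the accumulator of the previous step,
since the current update only becomes available afterwards; this is
what allows the method to work without a global learning rate.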
While the stochastic gradient algorithm is less susceptible to local
extrema than gradient descent, the problem still persists, especially
with saddle points, as observed by \textcite{DBLP:journals/corr/DauphinPGCGB14}.
\begin{itemize}
\item ADAM
@@ -454,9 +507,10 @@ One of these algorithms is the ADADELTA algorithm developed by \textcite{ADADELTA}
\label{alg:gd}
\end{algorithm}
\input{Plots/sdg_comparison.tex}
% \subsubsubsection{Stochastic Gradient Descent}
\clearpage
\subsection{Combating Overfitting}
% As in many machine learning applications if the model is overfit in
@@ -489,7 +543,7 @@ iteration, this practice is called Dropout and was introduced by
\todo{compare different dropout sizes on MNIST or similar, subset as
training set?}

%%% Local Variables:
%%% mode: latex

@@ -194,45 +194,113 @@ using data with the expected response (label) attached to each
data-point in fitting the model, where usually some distance between
the model output and the labels is minimized.
\subsubsection{Interpreting the Output / Classification vs Regression
/ Nonlinearity in last layer}

Given the nature of the neural net, the outputs of the last layer are
real numbers. For regression tasks this is desirable; for
classification problems however some transformation might be
necessary.
As the goal in the latter is to predict a certain class or classes for
an object, the output needs to be of a form that allows this
interpretation.
Commonly the nodes in the output layer each correspond to a class, and
the class chosen as prediction is the one with the highest value at
the corresponding output node.
The naive transformation to achieve this is turning the output
vector $o$ into a one-hot vector
\[
\text{pred}_i =
\begin{cases}
1,& \text{if } o_i = \max_j o_j, \\
0,& \text{else}.
\end{cases}
\]
This however makes training the model with gradient based methods
impossible, as the derivative of the transformation is either zero or
undefined.
A continuous transformation that is close to the argmax one is given
by softmax
\[
\text{softmax}(o)_i = \frac{e^{o_i}}{\sum_j e^{o_j}}.
\]
The softmax function maps the output to the interval $[0,1]$,
and the individual values sum to one; thus the output can be interpreted as
a probability for each class given the input.
In addition to being differentiable, this allows for evaluating the
certainty of a prediction, rather than just whether it is accurate.
\todo{maybe additive invariance}
% Another property that makes softmax attractive is the invariance to addition
% \[
% \text{softmax}(o) = \text{softmax}(o + c)
% \]
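The additive invariance sketched in the comment above is easy to
verify; writing $o + c$ for the vector with $c \in \mathbb{R}$ added
to every component, the following short derivation (our addition)
shows it:
\[
\text{softmax}(o + c)_i = \frac{e^{o_i + c}}{\sum_j e^{o_j + c}}
= \frac{e^c \, e^{o_i}}{e^c \sum_j e^{o_j}}
= \text{softmax}(o)_i.
\]
This identity is also why implementations commonly subtract
$\max_j o_j$ from every component before exponentiating: the result is
unchanged, but the exponentials can no longer overflow.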
% In order to properly interpret the output of a neural network and
% training it, depending on the problem it might be advantageous to
% transform the output form the last layer. Given the nature of the
% neural network the value at each output node is a real number. This is
% desirable for applications where the desired output is a real numbered
% vector (e.g. steering inputs for a autonomous car), however for
% classification problems it is desirable to transform this
% output. Often classification problems are modeled in such a way that
% each output node corresponds to a class. Then the output vector needs
% to be normalized in order to give a prediction. The naive approach is
% to transform the output vector $o$ into a one-hot vector $p$
% corresponding to a $0$
% entry for all classes except one, which is the predicted class.
% \[
% p_i =
% \begin{cases}
% 1,& i < j, \forall i,j \in \text{arg}\max o_i, \\
% 0,& \text{else.}
% \end{cases}
% \]\todo{besser formulieren}
% However this imposes difficulties in training the network as with this
% addition the model is no longer differentiable which imitates the
% ways the model can be trained. Additionally information about the
% ``certainty'' for each class in the prediction gets lost. A popular
% way to circumvent this problem is to normalize the output vector is
% such a way that the entries add up to one, this allows for the
% interpretation of probabilities assigned to each class.
\subsubsection{Error Measurement}
In order to make assessments about the quality of a network $\mathcal{NN}$ and train
it, we need to discuss how we measure error. The choice of the error
function is highly dependent on the type of the problem. For
regression problems a commonly used error measure is the mean squared
error (MSE),
which for a function $f$ and data $(x_i,y_i), i=1,\dots,n$ is given by
\[
\text{MSE}(f) = \frac{1}{n} \sum_{i=1}^n \left(f(x_i) - y_i\right)^2.
\]
However depending on the problem, error measures with different
properties might be needed; for example in some contexts it is
required to consider a proportional rather than absolute error, as is
common in time series models. \todo{odd}
As discussed above, the output of a neural network for a classification
problem can be interpreted as a probability distribution over the classes
conditioned on the input. In this case it is \todo{can?} desirable to
use error functions designed to compare probability distributions. A
widespread error function for this use case is the cross entropy (\textcite{PRML}),
which for two discrete distributions $p, q$ with the same support $C$ is given by
\[
H(p, q) = \sum_{c \in C} p(c) \ln\left(\frac{1}{q(c)}\right),
\]
which compares a distribution $q$ to a true underlying distribution $p$.
For a data set $(x_i,y_i), i = 1,\dots,n$, where each $y_{i,c}$
corresponds to the probability of class $c$ given $x_i$, and a predictor
$f$, we get the loss function
\[
L(f) = \sum_{i=1}^n H(y_i, f(x_i)).
\]
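If the labels $y_i$ are one-hot, i.e. $y_{i,c} = 1$ exactly for the
true class $c_i$ of $x_i$ and zero otherwise, only one term of each
inner sum survives. Writing $f(x_i)_c$ for the probability the model
assigns to class $c$, the loss reduces to a negative log-likelihood
(this check is ours, anticipating the maximum likelihood note below):
\[
L(f) = \sum_{i=1}^n H(y_i, f(x_i))
= -\sum_{i=1}^n \ln\left(f(x_i)_{c_i}\right)
= -\ln\left(\prod_{i=1}^n f(x_i)_{c_i}\right),
\]
so minimizing the cross entropy over the training data amounts to
maximizing the likelihood the model assigns to the observed classes.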
- Maximum likelihood
- Derivative with softmax is pseudo-linear -> fast improvements possible
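The second note presumably refers to the following standard identity,
stated here for completeness: for a label distribution $y$ (e.g.
one-hot), the derivative of the cross entropy with respect to the
pre-softmax outputs $o$ is
\[
\frac{\partial}{\partial o_k} H\left(y, \text{softmax}(o)\right)
= \text{softmax}(o)_k - y_k,
\]
i.e. simply prediction minus target. This gradient is cheap to compute
and does not vanish for confident wrong predictions, which allows for
the fast improvements alluded to above.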
\subsubsection{Gradient Descent Algorithm}

@@ -34,7 +34,7 @@
\usepackage{todonotes}
\usepackage{lipsum}
\usepackage[ruled,vlined]{algorithm2e}
%\usepackage{showframe}
\usepackage[protrusion=true, expansion=true, kerning=true]{microtype}
\captionsetup[sub]{justification=centering}
