progress
parent
2c2b053d54
commit
b0afc88091
@ -73,18 +73,18 @@ plot coordinates {
\label{fig:sgd_vs_gd}
\end{figure}

\begin{table}[h]
\begin{tabu} to \textwidth {@{} *4{X[c]}c*4{X[c]} @{}}
\multicolumn{4}{c}{Classification Accuracy}
&~&\multicolumn{4}{c}{Error Measure}
\\\cline{1-4}\cline{6-9}
GD$_{0.01}$&GD$_{0.05}$&GD$_{0.1}$&SGD$_{0.01}$&&GD$_{0.01}$&GD$_{0.05}$&GD$_{0.1}$&SGD$_{0.01}$
\\\cline{1-4}\cline{6-9}
0.265&0.633&0.203&0.989&&2.267&1.947&3.91&0.032
\end{tabu}
\caption{Performance metrics of the networks trained in
Figure~\ref{fig:sgd_vs_gd} after 20 training epochs.}
\label{table:sgd_vs_gd}
\end{table}
%%% Local Variables:
%%% mode: latex
TeX/Plots/mnist.tex
Normal file
@ -0,0 +1,41 @@
\begin{figure}[h]
\centering
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/mnist0.pdf}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/mnist1.pdf}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/mnist2.pdf}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/mnist3.pdf}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/mnist4.pdf}
\end{subfigure}\\
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/mnist5.pdf}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/mnist6.pdf}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/mnist7.pdf}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/mnist8.pdf}
\end{subfigure}
\begin{subfigure}{0.19\textwidth}
\includegraphics[width=\textwidth]{Plots/Data/mnist9.pdf}
\end{subfigure}
\caption{The MNIST data set contains 70,000 images of preprocessed handwritten
digits. Of these images 60,000 are used as training images, while
the rest are used to validate the trained models.}
\label{fig:MNIST}
\end{figure}
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "../main"
%%% End:
@ -73,3 +73,20 @@ url={https://doi.org/10.1038/323533a0}
username = {mhwombat},
year = 2010
}

@article{resnet,
  author = {Kaiming He and
            Xiangyu Zhang and
            Shaoqing Ren and
            Jian Sun},
  title = {Deep Residual Learning for Image Recognition},
  journal = {CoRR},
  volume = {abs/1512.03385},
  year = 2015,
  url = {http://arxiv.org/abs/1512.03385},
  archivePrefix = {arXiv},
  eprint = {1512.03385},
  timestamp = {Wed, 17 Apr 2019 17:23:45 +0200},
  biburl = {https://dblp.org/rec/journals/corr/HeZRS15.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
@ -136,7 +136,7 @@ output is given by
\]
where $\sqrt{\cdot}$ and $\cdot^2$ are applied component-wise.
Examples of convolution with both kernels are given in Figure~\ref{fig:img_conv}.
\todo{padding}

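As an illustration only (not part of this commit), the component-wise combination described above can be sketched in a few lines of Python; the two $3 \times 3$ kernels below are hypothetical stand-ins, since the document's kernels are not visible in this excerpt, and the combination $\sqrt{(I * g_1)^2 + (I * g_2)^2}$ is what the component-wise $\sqrt{\cdot}$ and $\cdot^2$ suggest:
\begin{verbatim}
import numpy as np
from scipy.signal import convolve2d

# Hypothetical pair of edge kernels (horizontal and vertical changes);
# the kernels used in the text are not shown in this excerpt.
g1 = np.array([[1, 0, -1], [2, 0, -2], [1, 0, -1]])
g2 = g1.T

def edge_magnitude(image):
    """sqrt(conv(I, g1)^2 + conv(I, g2)^2), applied component-wise."""
    c1 = convolve2d(image, g1, mode="same", boundary="symm")
    c2 = convolve2d(image, g2, mode="same", boundary="symm")
    return np.sqrt(c1**2 + c2**2)

print(edge_magnitude(np.random.rand(28, 28)).shape)   # (28, 28)
\end{verbatim}
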

\begin{figure}[h]
@ -186,20 +186,59 @@ wise. Examples of convolution with both kernels are given in Figure~\ref{fig:img_conv}.
\clearpage
\newpage
\subsection{Convolutional NN}
\todo{Introduction to CNNs}
% Conventional neural network as described in chapter .. are made up of
% fully connected layers, meaning each node in a layer is influenced by
% all nodes of the previous layer. If one wants to extract information
% out of high dimensional input such as images this results in a very
% large amount of variables in the model. This limits the

% In conventional neural networks as described in chapter ... all layers
% are fully connected, meaning each output node in a layer is influenced
% by all inputs. For $i$ inputs and $o$ output nodes this results in $i
% + 1$ variables at each node (weights and bias) and a total $o(i + 1)$
% variables. For large inputs like image data the amount of variables
% that have to be trained in order to fit the model can get excessive
% and hinder the ability to train the model due to memory and
% computational restrictions. By using convolution we can extract
% meaningful information such as edges in an image with a kernel of a
% small size $k$ in the tens or hundreds independent of the size of the
% original image. Thus for a large image $k \cdot i$ can be several
% orders of magnitude smaller than $o\cdot i$ .

As seen in the previous section, convolution lends itself to the
manipulation of images and other large data, which motivates its usage
in neural networks.
This is achieved by implementing convolutional layers in which several
filters are applied to the input; the values of these filters are
trainable parameters of the model.
Each node in such a layer corresponds to a pixel of the output of the
convolution with one of these filters, to which a bias and an
activation function are applied.
The usage of multiple filters results in multiple outputs of the same
size as the input; these are often called channels. Depending on the
size of the filters this can result in the output having one dimension
more than the input.
However, for a convolutional layer following another convolutional
layer, the filter size along the channel direction is often chosen to
coincide with the number of channels of the previous layer's output,
without using padding in this direction, in order to prevent gaining
additional dimensions\todo{odd phrasing} in the output.
This can also be used to flatten certain less interesting channels of
the input, such as the color channels.
Thus the filters used in convolutional networks usually have the same
number of dimensions as the input, or one more.

The size of the filters and the way they are applied can be tuned
while building the model, but should be the same for all filters in one
layer in order for the output to be of consistent size in all channels.
It is common to reduce the size of the output by not applying the
filters to each ``pixel'', but rather specifying a ``stride'' $s$ at which
the filter $g$ is moved over the input $I$

\[
O_{x,y,c} = \sum_{i,j,l \in \mathbb{Z}} I_{x-i,y-j,c-l} g_{i,j,l}.
\]

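As a minimal sketch of the layer mechanics described above (not part of this commit; bias, activation and padding are omitted, and the cross-correlation convention common in deep learning libraries is used instead of the flipped-kernel convention of the formula), applying a bank of filters to a multi-channel input at a given stride could look as follows in NumPy:
\begin{verbatim}
import numpy as np

def conv_layer(image, filters, stride=1):
    """Apply a bank of filters to a multi-channel image (valid region only).

    image:   (H, W, C_in) array
    filters: (K, K, C_in, C_out) array, one K x K x C_in kernel per output channel
    stride:  step s at which each filter is moved over the input
    """
    H, W, _ = image.shape
    K, _, _, C_out = filters.shape
    out_h = (H - K) // stride + 1
    out_w = (W - K) // stride + 1
    out = np.zeros((out_h, out_w, C_out))
    for x in range(out_h):
        for y in range(out_w):
            patch = image[x * stride:x * stride + K,
                          y * stride:y * stride + K, :]
            for c in range(C_out):
                # weighted sum over the K x K window and all input channels
                out[x, y, c] = np.sum(patch * filters[:, :, :, c])
    return out

# example: 28 x 28 grayscale image, four 5 x 5 filters, stride 2
img = np.random.rand(28, 28, 1)
w = np.random.rand(5, 5, 1, 4)
print(conv_layer(img, w, stride=2).shape)   # (12, 12, 4)
\end{verbatim}
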
As seen, convolution lends itself to image manipulation. In this
chapter we will explore how we can incorporate convolution in neural
@ -260,9 +299,9 @@ network. A class of algorithms that augment the gradient descent
algorithm in order to lessen this problem are stochastic gradient
descent algorithms. Here the premise is that instead of using the whole
dataset a (different) subset of the data is chosen to
compute the gradient in each iteration (Algorithm~\ref{alg:sgd}).
The training period until each data point has been considered in
updating the parameters is commonly called an ``epoch''.
Using subsets reduces the amount of memory and computing power required for
each iteration. This makes it possible to use very large training
sets to fit the model.
@ -270,18 +309,42 @@ Additionally the noise introduced on the gradient can improve
the accuracy of the fit as stochastic gradient descent algorithms are
less likely to get stuck on local extrema.


Another important benefit in using subsets is that depending on their size the
gradient can be calculated far quicker, which allows for more parameter updates
in the same time. If the approximated gradient is close enough to the
``real'' one this can drastically cut down the time required for
training the model to a certain degree, or improve the accuracy achievable in a given
amount of training time.

\begin{algorithm}
\SetAlgoLined
\KwInput{Function $f$, Weights $w$, Learning Rate $\gamma$, Batch Size $B$, Loss Function $L$,
Training Data $D$, Epochs $E$.}
\For{$i \in \left\{1,\dots,E\right\}$}{
	$S \leftarrow D$\;
	\While{$\abs{S} \geq B$}{
		Draw $\tilde{D}$ from $S$ with $\vert\tilde{D}\vert = B$\;
		Update $S$: $S \leftarrow S \setminus \tilde{D}$\;
		Compute Gradient: $g \leftarrow \frac{\mathrm{d} L(f_w, \tilde{D})}{\mathrm{d} w}$\;
		Update: $w \leftarrow w - \gamma g$\;
	}
	\If{$S \neq \emptyset$}{
		Compute Gradient: $g \leftarrow \frac{\mathrm{d} L(f_w, S)}{\mathrm{d} w}$\;
		Update: $w \leftarrow w - \gamma g$\;
	}
	Increment: $i \leftarrow i+1$\;
}
\caption{Stochastic gradient descent.}
\label{alg:sgd}
\end{algorithm}

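For illustration only, a minimal Python sketch in the spirit of Algorithm~\ref{alg:sgd} is given below; it is not the code used for the experiments, and \texttt{grad\_loss} and \texttt{data} are placeholders for the loss gradient of the model and the training set:
\begin{verbatim}
import numpy as np

def sgd(w, grad_loss, data, lr=0.01, batch_size=32, epochs=20):
    """Minibatch stochastic gradient descent, mirroring the algorithm above.

    w:         initial weights (NumPy array)
    grad_loss: function (w, batch) -> gradient of the loss on that batch
    data:      NumPy array of training samples
    """
    n = len(data)
    for _ in range(epochs):
        order = np.random.permutation(n)   # draw batches without replacement
        for start in range(0, n, batch_size):
            batch = data[order[start:start + batch_size]]  # last batch may be smaller
            g = grad_loss(w, batch)         # gradient approximated on the subset
            w = w - lr * g                  # step with learning rate gamma
    return w
\end{verbatim}
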
In order to illustrate this behavior we modeled a convolutional neural
network to ... handwritten digits. The data set used for this is the
MNIST database of handwritten digits (\textcite{MNIST},
Figure~\ref{fig:MNIST}).
\input{Plots/mnist.tex}
The network used consists of two convolution and max pooling layers
followed by one fully connected hidden layer and the output layer.
Both convolutional layers utilize square filters of size five which are
|
|||||||
All layers except the output layer use RELU as activation function
|
All layers except the output layer use RELU as activation function
|
||||||
with the output layer using softmax (\ref{def:softmax}).
|
with the output layer using softmax (\ref{def:softmax}).
|
||||||
As loss function categorical crossentropy is used (\ref{def:...}).
|
As loss function categorical crossentropy is used (\ref{def:...}).
|
||||||
In Figure~\ref{fig:mnist_architecture} the architecture of the network
|
The architecture of the convolutional neural network is summarized in
|
||||||
is summarized.
|
Figure~\ref{fig:mnist_architecture}.
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\missingfigure{network architecture}
|
||||||
|
\caption{architecture}
|
||||||
|
\label{fig:mnist_architecture}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
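For illustration only, a Keras sketch consistent with this description is given below; it is not the code used for the experiments, and the filter counts of the two convolutional layers are assumptions, since the text only fixes the $5 \times 5$ kernels, the 256-node hidden layer and the 10-node softmax output:
\begin{verbatim}
import tensorflow as tf
from tensorflow.keras import layers

# Filter counts (32, 64) are placeholders; only the 5 x 5 kernels, the
# 256-node dense layer and the 10-way softmax output are fixed by the text.
model = tf.keras.Sequential([
    layers.Conv2D(32, kernel_size=5, activation="relu", input_shape=(28, 28, 1)),
    layers.MaxPooling2D(pool_size=2),
    layers.Conv2D(64, kernel_size=5, activation="relu"),
    layers.MaxPooling2D(pool_size=2),
    layers.Flatten(),
    layers.Dense(256, activation="relu"),
    layers.Dense(10, activation="softmax"),
])
model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=0.01),
              loss="categorical_crossentropy",
              metrics=["accuracy"])
\end{verbatim}
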
The results of the network being trained with gradient descent and
stochastic gradient descent for 20 epochs are given in
Figure~\ref{fig:sgd_vs_gd} and Table~\ref{table:sgd_vs_gd}.

\input{Plots/SGD_vs_GD.tex}

Here it can be seen that the network trained with stochastic gradient
descent is more accurate after the first epoch than the ones trained
with gradient descent after 20 epochs.
This is due to the former using a batch size of 32 and thus having
made 1,875 updates to the weights after the first epoch (the 60,000
training images split into batches of 32), in comparison to a single
update per epoch for gradient descent. While each of these updates
uses an approximate gradient calculated on the respective subset, it
performs far better than the network using true gradients when
training for the same amount of time.
\todo{compare training time}
\clearpage
\subsection{Modified Stochastic Gradient Descent}
There is an inherent problem in the sensitivity of the gradient descent
algorithm regarding the learning rate $\gamma$.
The difficulty of choosing the learning rate is illustrated
in Figure~\ref{fig:sgd_vs_gd}. For small rates the progress in each iteration is small,
but as the rate is enlarged the algorithm can become unstable and
diverge. Even for learning rates small enough to ensure the parameters
do not diverge to infinity, steep valleys can hinder the progress of
the algorithm, as with too large learning rates gradient descent
``bounces between'' the walls of the valley rather than following ...

% \[
% w - \gamma \nabla_w ...
% \]
thus the weights grow to infinity.
\todo{explain unstable learning rate better}

To combat this problem it is proposed\todo{citation needed} to alter the learning
rate over the course of training, often called learning rate
scheduling. The most popular implementations of this are time based
decay
\[
\gamma_{n+1} = \frac{\gamma_n}{1 + d n},
\]
where $d$ is the decay parameter and $n$ is the number of epochs,
step based decay, where the learning rate is fixed for a span of $r$
epochs and then decreased according to the parameter $d$,
\[
\gamma_n = \gamma_0 d^{\lfloor \frac{n+1}{r} \rfloor},
\]
and exponential decay, where the learning rate is decreased after each epoch,
\[
\gamma_n = \gamma_0 e^{-n d}.
\]
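The three schedules can be sketched in a few lines of Python (illustrative only, not part of this commit; the parameter values in the example call are arbitrary):
\begin{verbatim}
import math

def time_based(lr0, d, epochs):
    # gamma_{n+1} = gamma_n / (1 + d * n)
    lr, out = lr0, [lr0]
    for n in range(epochs - 1):
        lr = lr / (1 + d * n)
        out.append(lr)
    return out

def step_based(lr0, d, r, epochs):
    # gamma_n = gamma_0 * d ** floor((n + 1) / r)
    return [lr0 * d ** math.floor((n + 1) / r) for n in range(epochs)]

def exponential(lr0, d, epochs):
    # gamma_n = gamma_0 * exp(-n * d)
    return [lr0 * math.exp(-n * d) for n in range(epochs)]

print(step_based(0.1, 0.5, 10, 30)[::10])   # [0.1, 0.05, 0.025]
\end{verbatim}
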
These methods are able to increase the accuracy of a model by a large
margin, as seen in the training of ResNet by \textcite{resnet}.
\todo{maybe include a figure}
However stochastic gradient descent with learning rate decay is
still highly sensitive to the choice of the hyperparameters $\gamma$
and $d$.
In order to mitigate this problem a number of algorithms have been
developed to regularize the learning rate with as little
hyperparameter guesswork as possible.
One of these algorithms is the ADADELTA algorithm developed by \textcite{ADADELTA}.
\clearpage


\begin{itemize}
\item ADAM
@ -363,8 +442,8 @@ and Table~\ref{table:sgd_vs_dg}
Initialize accumulation variables $E[g^2]_0 = 0, E[\Delta x^2]_0 =0$\;
\For{$t \in \left\{1,\dots,T\right\};\, t+1$}{
	Compute Gradient: $g_t$\;
	Accumulate Gradient: $E[g^2]_t \leftarrow \rho E[g^2]_{t-1} +
	(1-\rho)g_t^2$\;
	Compute Update: $\Delta x_t \leftarrow -\frac{\sqrt{E[\Delta
	x^2]_{t-1} + \varepsilon}}{\sqrt{E[g^2]_t + \varepsilon}} g_t$\;
	Accumulate Updates: $E[\Delta x^2]_t \leftarrow \rho E[\Delta
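For illustration, a NumPy sketch of one such update step is given below; it is not part of this commit, the values of $\rho$ and $\varepsilon$ are commonly used choices rather than taken from the text, and the final application of $\Delta x_t$ to the parameters is the standard ADADELTA step not shown in the excerpt above:
\begin{verbatim}
import numpy as np

def adadelta_step(x, g, Eg2, Edx2, rho=0.95, eps=1e-6):
    """One ADADELTA update; rho and eps are common defaults, not from the text.

    x: parameters, g: gradient at x,
    Eg2 / Edx2: running averages of squared gradients / squared updates.
    """
    Eg2 = rho * Eg2 + (1 - rho) * g**2                   # accumulate gradient
    dx = -np.sqrt(Edx2 + eps) / np.sqrt(Eg2 + eps) * g   # compute update
    Edx2 = rho * Edx2 + (1 - rho) * dx**2                # accumulate updates
    return x + dx, Eg2, Edx2                             # apply update
\end{verbatim}
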
@ -78,7 +78,7 @@
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator*{\po}{\mathbb{P}\text{-}\mathcal{O}}
\DeclareMathOperator*{\equals}{=}
\begin{document}


@ -172,7 +172,6 @@ increased.


\begin{figure}
\pgfplotsset{
compat=1.11,
legend image code/.code={
@ -202,7 +201,6 @@ plot coordinates {
\addlegendentry{\footnotesize{spline}};
\end{axis}
\end{tikzpicture}
\caption{For data of the form $y=\sin(\frac{x+\pi}{2 \pi}) +
\varepsilon,~ \varepsilon \sim \mathcal{N}(0,0.4)$
(\textcolor{blue}{blue dots}) the neural network constructed