\section{Shallow Neural Networks}

In order to examine the behavior of neural networks in this chapter,
we consider a simple class of networks: shallow ones. These
networks contain only one hidden layer and have a single output node.

\begin{Definition}[Shallow neural network]
  For an input dimension $d$ and a Lipschitz continuous activation function $\sigma:
  \mathbb{R} \to \mathbb{R}$ we define a shallow neural network with
  $n$ hidden nodes as the function
  $\mathcal{NN}_\vartheta : \mathbb{R}^d \to \mathbb{R}$ given by
  \[
    \mathcal{NN}_\vartheta(x) \coloneqq \sum_{k=1}^n w_k \sigma\left(b_k +
      \sum_{j=1}^d v_{k,j} x_j\right) + c ~~ \forall x \in \mathbb{R}^d
  \]
  with
  \begin{itemize}
  \item weights $w_k \in \mathbb{R},~k \in \left\{1,\dots,n\right\}$,
  \item biases $b_k \in \mathbb{R},~k \in \left\{1, \dots,n\right\}$,
  \item weights $v_k \in \mathbb{R}^d,~k\in\left\{1,\dots,n\right\}$,
  \item bias $c \in \mathbb{R}$,
  \item all weights and biases collected in
    \[
      \vartheta \coloneqq (w, b, v, c) \in \Theta \coloneqq
      \mathbb{R}^{n} \times \mathbb{R}^{n} \times \mathbb{R}^{n \times d} \times \mathbb{R}.
    \]
  \end{itemize}
\end{Definition}
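For illustration, for input dimension $d = 1$ and $n = 2$ hidden nodes this
definition reduces to
\[
  \mathcal{NN}_\vartheta(x) = w_1 \sigma(b_1 + v_1 x) + w_2 \sigma(b_2 + v_2 x) + c,
\]
with the seven parameters collected in $\vartheta = (w, b, v, c)$.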
% \begin{figure}
% \begin{tikzpicture}[x=1.5cm, y=1.5cm]
% \tikzset{myptr/.style={decoration={markings,mark=at position 1 with %
% {\arrow[scale=1.5,>=stealth]{>}}},postaction={decorate}}}

% \foreach \m/\l [count=\y] in {1}
% \node [every neuron/.try, neuron \m/.try] (input-\m) at (0,0.5-\y) {};

% \foreach \m [count=\y] in {1,2,missing,3,4}
% \node [every neuron/.try, neuron \m/.try ] (hidden-\m) at (1.25,3.25-\y*1.25) {};

% \foreach \m [count=\y] in {1}
% \node [every neuron/.try, neuron \m/.try ] (output-\m) at (2.5,0.5-\y) {};

% \foreach \l [count=\i] in {1}
% \draw [myptr] (input-\i)+(-1,0) -- (input-\i)
% node [above, midway] {$x$};

% \foreach \l [count=\i] in {1,2,n-1,n}
% \node [above] at (hidden-\i.north) {$\mathcal{N}_{\l}$};

% \foreach \l [count=\i] in {1,n_l}
% \node [above] at (output-\i.north) {};

% \foreach \l [count=\i] in {1}
% \draw [myptr, >=stealth] (output-\i) -- ++(1,0)
% node [above, midway] {$y$};

% \foreach \i in {1}
% \foreach \j in {1,2,...,3,4}
% \draw [myptr, >=stealth] (input-\i) -- (hidden-\j);

% \foreach \i in {1,2,...,3,4}
% \foreach \j in {1}
% \draw [myptr, >=stealth] (hidden-\i) -- (output-\j);

% \node [align=center, above] at (0,1) {Input \\layer};
% \node [align=center, above] at (1.25,3) {Hidden layer};
% \node [align=center, above] at (2.5,1) {Output \\layer};

% \end{tikzpicture}
% \caption{Shallow Neural Network with input- and output-dimension of \(d
% = 1\)}
% \label{fig:shallowNN}
% \end{figure}

As neural networks with a large number of nodes have a large number of
tunable parameters, they can often fit the data quite well. If the ReLU
\[
  \sigma(x) \coloneqq \max{(0, x)}
\]
is chosen as activation function, one can easily prove that if the
number of hidden nodes exceeds the
number of data points in the training data, a shallow network trained
on MSE will fit the data perfectly.
\begin{Theorem}[Shallow networks can interpolate the training data]
  For training data of size $t$,
  \[
    \left(x_i^{\text{train}}, y_i^{\text{train}}\right) \in \mathbb{R}^d
    \times \mathbb{R},~i\in\left\{1,\dots,t\right\},
  \]
  a shallow neural network $\mathcal{NN}_\vartheta$ with $n \geq t$
  hidden nodes will fit the data perfectly when
  minimizing squared error loss.
  \proof
  W.l.o.g. all values $x_{i,j}^{\text{train}} \in [0,1],~\forall i \in
  \left\{1,\dots,t\right\}, j \in \left\{1,\dots,d\right\}$. Now we
  choose $v^*$ in order to calculate a unique value for all
  $x_i^{\text{train}}$:
  \[
    v^*_{k,j} = v^*_{j} = 10^{j-1}, ~ \forall k \in \left\{1,\dots,n\right\}.
  \]
  Assuming $x_i^{\text{train}} \neq x_j^{\text{train}},~\forall i\neq
  j$ we get
  \[
    \left(v_k^*\right)^{\mathrm{T}} x_i^{\text{train}} \neq
    \left(v_k^*\right)^{\mathrm{T}} x_j^{\text{train}}, ~ \forall i
    \neq j.
  \]
  W.l.o.g. assume the $x_i^{\text{train}}$ are ordered such that
  $\left(v_k^*\right)^{\mathrm{T}} x_i^{\text{train}} <
  \left(v_k^*\right)^{\mathrm{T}} x_j^{\text{train}}, ~\forall i<j$.
  Then we can choose $b^*_k$ such that neuron $k$ is only active for all
  $x_i^{\text{train}}$ with $i \geq k$:
  \begin{align*}
    b^*_1 &> -\left(v^*\right)^{\mathrm{T}} x_1^{\text{train}},\\
    b^*_k &= -\left(v^*\right)^{\mathrm{T}}
            x_{k-1}^{\text{train}},~\forall k \in \left\{2, \dots,
            t\right\}, \\
    b_k^* &\leq -\left(v^*\right)^{\mathrm{T}}
            x_{t}^{\text{train}},~\forall k > t.
  \end{align*}
  With
  \begin{align*}
    w_k^* &= \frac{y_k^{\text{train}} - \sum_{j =1}^{k-1} w^*_j\left(b^*_j +
            \left(v^*\right)^{\mathrm{T}} x_k^{\text{train}}\right)}{b_k^* + \left(v^*\right)^{\mathrm{T}}
            x_k^{\text{train}}},~\forall k \in \left\{1,\dots,t\right\},\\
    w_k^* &\in \mathbb{R} \text{ arbitrary, } \forall k > t,
  \end{align*}
  and $\vartheta^* = (w^*, b^*, v^*, c = 0)$ we get
  \[
    \mathcal{NN}_{\vartheta^*} (x_i^{\text{train}}) = \sum_{k =
      1}^{i} w_k^*\left(b_k^* + \left(v^*\right)^{\mathrm{T}}
      x_i^{\text{train}}\right) = y_i^{\text{train}}.
  \]
  As the squared error of $\mathcal{NN}_{\vartheta^*}$ is zero, every
  shallow network with at least $t$ hidden nodes that minimizes the
  squared error loss will fit the data perfectly.
  \qed
  \label{theo:overfit}
\end{Theorem}
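The construction used in this proof is easy to carry out numerically. The
following is a minimal Python sketch (our own illustration, not part of the
code used for the figures), assuming a one-dimensional input and illustrative
toy data; all names are ours.
\begin{verbatim}
import numpy as np

# Sketch: build the interpolating shallow ReLU network from the proof
# of the theorem above for d = 1 (so v*_k = 1 suffices).
rng = np.random.default_rng(0)
x_train = np.sort(rng.uniform(0.0, 1.0, size=6))   # distinct, ordered inputs
y_train = np.sin(2.0 * np.pi * x_train)            # illustrative responses

t = len(x_train)
v = np.ones(t)
b = np.empty(t)
b[0] = -x_train[0] + 1e-3        # neuron 1 is active for every sample
b[1:] = -x_train[:-1]            # neuron k activates only for x_i with i >= k

w = np.zeros(t)
for k in range(t):               # solve for w*_k exactly as in the proof
    act = np.maximum(b[:k + 1] + v[:k + 1] * x_train[k], 0.0)
    w[k] = (y_train[k] - np.dot(w[:k], act[:k])) / act[k]

def nn(x):
    """Shallow network with c = 0 evaluated at a scalar x."""
    return np.dot(w, np.maximum(b + v * x, 0.0))

# maximal training error is numerically zero
print(max(abs(nn(xi) - yi) for xi, yi in zip(x_train, y_train)))
\end{verbatim}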

However, this behavior is often not desired, as overfitted models often
have bad generalization properties, especially if noise is present in
the data. This effect can be seen in
Figure~\ref{fig:overfit}. Here a network that perfectly fits the
training data with respect to the MSE is
constructed and compared to a regression spline
(Definition~\ref{def:wrs}). While the network
fits the data better than the spline, the spline is much closer to the
underlying mechanism that was used to generate the data. The better
generalization of the spline compared to the network is further
illustrated by its lower validation error, computed on newly generated
test data.
In order to improve the accuracy of the model we want to reduce
overfitting. A possible way to achieve this is to explicitly
regularize the network through the cost function, as done with
ridge penalized networks
(Definition~\ref{def:rpnn}), where large weights $w$ are penalized. In
Theorem~\ref{theo:main1} we will
prove that this results in the network converging to a
regression spline as the number of nodes in the hidden layer is
increased.

\begin{figure}
\begin{adjustbox}{width = \textwidth}
\pgfplotsset{
compat=1.11,
legend image code/.code={
\draw[mark repeat=2,mark phase=2]
plot coordinates {
(0cm,0cm)
(0.15cm,0cm) %% default is (0.3cm,0cm)
(0.3cm,0cm) %% default is (0.6cm,0cm)
};%
}
}
\begin{tikzpicture}
\begin{axis}[tick style = {draw = none}, width = \textwidth,
height = 0.6\textwidth]
\addplot table
[x=x, y=y, col sep=comma, only marks,mark options={scale =
0.7}] {Plots/Data/overfit.csv};
\addplot [red, line width=0.8pt] table [x=x_n, y=s_n, col
sep=comma, forget plot] {Plots/Data/overfit.csv};
\addplot [black, line width=0.8pt] table [x=x_n, y=y_n, col
sep=comma] {Plots/Data/overfit.csv};
\addplot [black, line width=0.8pt, dashed] table [x=x, y=y, col
sep=comma] {Plots/Data/overfit_spline.csv};

\addlegendentry{\footnotesize{data}};
\addlegendentry{\footnotesize{$\mathcal{NN}_{\vartheta^*}$}};
\addlegendentry{\footnotesize{spline}};
\end{axis}
\end{tikzpicture}
\end{adjustbox}
\caption{Data of the form $y=\sin(\frac{x+\pi}{2 \pi}) +
  \varepsilon,~ \varepsilon \sim \mathcal{N}(0,0.4)$
  (\textcolor{blue}{blue dots}), the neural network constructed
  according to the proof of Theorem~\ref{theo:overfit} (black) and the
  underlying signal (\textcolor{red}{red}). While the network incurs no
  error on the training data, a regression spline (black dashed) recovers the
  underlying signal much better. For a test set of size 20 with uniformly distributed $x$
  values and responses generated in the same fashion as the training data, the MSE of the neural network is
  0.30, while the MSE of the spline is only 0.14; the spline thus generalizes
  much better.
}
\label{fig:overfit}
\end{figure}

\clearpage
\subsection{Convergence Behaviour of 1-dim. Randomized Shallow Neural
  Networks}

In this section we will analyze the connection between randomized shallow
neural networks with one-dimensional input and regression splines. We
will see that penalizing the size of the weights when training the
randomized shallow
neural network results in a function that minimizes the second
derivative as the number of hidden nodes grows to infinity. In order
to properly formulate this relation we first need to introduce
some definitions.

\begin{Definition}[Randomized shallow neural network]
  For an input dimension $d$, let $n \in \mathbb{N}$ be the number of
  hidden nodes and $v(\omega) \in \mathbb{R}^{n \times d}, b(\omega)
  \in \mathbb{R}^n$ randomly drawn weights and biases. Then for a weight vector
  $w$ the corresponding randomized shallow neural network is given by
  \[
    \mathcal{RN}_{w, \omega} (x) = \sum_{k=1}^n w_k
    \sigma\left(b_k(\omega) + \sum_{j=1}^d v_{k, j}(\omega) x_j\right).
  \]
  \label{def:rsnn}
\end{Definition}

\begin{Definition}[Ridge penalized Neural Network]
  \label{def:rpnn}
  Let $\mathcal{RN}_{w, \omega}$ be a randomized shallow neural
  network, as introduced in Definition~\ref{def:rsnn}. Then the optimal ridge penalized
  network is given by
  \[
    \mathcal{RN}^{*, \tilde{\lambda}}_{\omega}(x) \coloneqq
    \mathcal{RN}_{w^{*, \tilde{\lambda}}(\omega), \omega}(x)
  \]
  with
  \[
    w^{*,\tilde{\lambda}}(\omega) :\in \argmin_{w \in
      \mathbb{R}^n} \underbrace{ \left\{\overbrace{\sum_{i = 1}^N \left(\mathcal{RN}_{w,
            \omega}(x_i^{\text{train}}) -
          y_i^{\text{train}}\right)^2}^{L(\mathcal{RN}_{w, \omega})} +
      \tilde{\lambda} \norm{w}_2^2\right\}}_{\eqqcolon F_n^{\tilde{\lambda}}(\mathcal{RN}_{w,\omega})}.
  \]
\end{Definition}
In the ridge penalized neural network large weights are penalized; the
extent of this penalty can be tuned with the parameter $\tilde{\lambda}$. If
$n$ is larger than the number of training samples $N$, then for
$\tilde{\lambda} \to 0$ the network will interpolate the data while
having minimal weights, resulting in the \textit{minimum norm
  network} $\mathcal{RN}_{w^{\text{min}}, \omega}$, the randomized shallow
neural network with weights
\[
  w^{\text{min}} \in \argmin_{w \in \mathbb{R}^n} \norm{w}, \text{
    s.t. }
  \mathcal{RN}_{w,\omega}(x_i^{\text{train}}) = y_i^{\text{train}}, \, \forall i \in
  \left\{1,\dots,N\right\}.
\]
For $\tilde{\lambda} \to \infty$ the learned
function will resemble the data less and less, with the weights
approaching $0$.\par
In order to make the notation more convenient, in the following the
$\omega$ used to express the realised random parameters will no longer
be explicitly mentioned.
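Since only the outer weights $w$ are optimized while the random inner
parameters $(v, b)$ stay fixed, computing $w^{*, \tilde{\lambda}}$ is a
standard ridge regression on the hidden-layer features
$\sigma(b_k + v_k x)$ and therefore has a closed form. The following Python
sketch is our own illustration of this observation for $d = 1$ (the thesis
trains the networks by gradient descent in the simulations below); all names
and the data are ours.
\begin{verbatim}
import numpy as np

# Sketch: ridge penalized fit of the outer weights w for d = 1.
rng = np.random.default_rng(1)
n, N, lam_tilde = 200, 15, 0.1            # nodes, samples, ridge parameter
x_train = np.linspace(-np.pi, np.pi, N)
y_train = np.sin(x_train)

xi = rng.uniform(-5.0, 5.0, n)            # kink positions
v = rng.normal(0.0, 1.0, n)               # inner weights (illustrative choice)
b = -xi * v                               # biases so that node k kinks at xi_k

Phi = np.maximum(b + np.outer(x_train, v), 0.0)   # N x n feature matrix
# closed-form ridge solution: w = (Phi^T Phi + lam I)^{-1} Phi^T y
w = np.linalg.solve(Phi.T @ Phi + lam_tilde * np.eye(n), Phi.T @ y_train)

def rn(x):
    return np.maximum(b + v * x, 0.0) @ w

print(rn(0.5), np.sin(0.5))               # network vs. target at a test point
\end{verbatim}
For $\tilde{\lambda} \to 0$ (and $n > N$) this solution tends to the minimum
norm weights described above, while large $\tilde{\lambda}$ shrinks the
weights towards zero.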
\begin{Definition}
  \label{def:kink}
  Let $\mathcal{RN}_w$ be a randomized shallow neural
  network according to Definition~\ref{def:rsnn},
  \[
    \mathcal{RN}_w(x) = \sum_{k = 1}^n w_k \sigma(b_k + v_kx).
  \]
  Then kinks depending on the random parameters can be observed:
  because we specified $\sigma(y) \coloneqq \max\left\{0, y\right\}$, a
  kink in $\sigma$ occurs at $\sigma(0) = 0$. As $b_k + v_kx = 0$ for $x
  = -\frac{b_k}{v_k}$ we define the following:
  \begin{enumerate}[label=(\alph*)]
  \item Let $\xi_k \coloneqq -\frac{b_k}{v_k}$ be the $k$-th kink of $\mathcal{RN}_w$.
  \item Let $g_{\xi}(\xi_k)$ be the density of the kinks $\xi_k =
    - \frac{b_k}{v_k}$ in accordance with the distributions of $b_k$ and
    $v_k$.
  \item Let $h_{k,n} \coloneqq \frac{1}{n g_{\xi}(\xi_k)}$ be the
    estimated average distance from kink $\xi_k$ to the nearest
    neighbouring kink.
  \end{enumerate}
\end{Definition}
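For example, if the kinks are uniformly distributed on $[-5,5]$ (as will be
the case in the simulations below), then $g_{\xi} \equiv \frac{1}{10}$ on
$[-5,5]$ and the estimated average distance between neighbouring kinks is
\[
  h_{k,n} = \frac{1}{n g_{\xi}(\xi_k)} = \frac{10}{n}, \quad \forall k \in \left\{1, \dots, n\right\}.
\]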

In order to later prove the connection between randomized shallow
neural networks and regression splines, we first take a look at a
smooth approximation of the RSNN.

\begin{Definition}[Smooth Approximation of Randomized Shallow Neural
  Network]
  \label{def:srsnn}
  Let $\mathcal{RN}_{w}$ be a randomized shallow neural network according to
  Definition~\ref{def:rsnn} with weights $w$ and kinks $\xi_k$ with
  corresponding kink density $g_{\xi}$ as given by
  Definition~\ref{def:kink}.
  In order to smooth the RSNN consider the following kernel for every $x$:
  \[
    \kappa_x(s) \coloneqq \mathds{1}_{\left\{\abs{s} \leq \frac{1}{2 \sqrt{n}
          g_{\xi}(x)}\right\}}(s)\sqrt{n} g_{\xi}(x), \, \forall s \in \mathbb{R}.
  \]
  Using this kernel we define a smooth approximation of
  $\mathcal{RN}_w$ by
  \[
    f^w(x) \coloneqq \int_{\mathbb{R}} \mathcal{RN}_w(x-s) \kappa_x(s) ds.
  \]
\end{Definition}

Note that the kernel introduced in Definition~\ref{def:srsnn}
satisfies $\int_{\mathbb{R}}\kappa_x(s) ds = 1$. While $f^w$ looks highly
similar to a convolution, it differs slightly, as the kernel $\kappa_x(s)$
depends on $x$. Therefore only $f^w = (\mathcal{RN}_w *
\kappa_x)(x)$ is well defined, while $\mathcal{RN}_w * \kappa$ is not.
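The normalization of the kernel can be verified directly: for every $x$ with
$g_{\xi}(x) > 0$,
\[
  \int_{\mathbb{R}} \kappa_x(s) ds
  = \sqrt{n} g_{\xi}(x) \int_{-\frac{1}{2 \sqrt{n} g_{\xi}(x)}}^{\frac{1}{2 \sqrt{n} g_{\xi}(x)}} ds
  = \sqrt{n} g_{\xi}(x) \, \frac{1}{\sqrt{n} g_{\xi}(x)} = 1.
\]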

Now we take a look at weighted regression splines. Later we will prove
that the ridge penalized neural network as defined in
Definition~\ref{def:rpnn} converges to a weighted regression spline as
the number of hidden nodes grows to infinity.

\begin{Definition}[Adapted weighted regression spline]
  \label{def:wrs}
  Let $x_i^{\text{train}}, y_i^{\text{train}} \in \mathbb{R}, i \in
  \left\{1,\dots,N\right\}$ be training data. For a given $\lambda \in \mathbb{R}_{>0}$
  and a function $g: \mathbb{R} \to \mathbb{R}_{>0}$ the weighted
  regression spline $f^{*, \lambda}_g$ is given by
  \[
    f^{*, \lambda}_g :\in \argmin_{\substack{f \in \mathcal{C}^2(\mathbb{R})
        \\ \supp(f) \subseteq \supp(g)}} \underbrace{\left\{ \overbrace{\sum_{i =
            1}^N \left(f(x_i^{\text{train}}) - y_i^{\text{train}}\right)^2}^{L(f)} +
      \lambda g(0) \int_{\supp(g)}\frac{\left(f''(x)\right)^2}{g(x)}
      dx\right\}}_{\eqqcolon F^{\lambda, g}(f)}.
  \]
  \todo{Is a further requirement on the derivative of $f$ needed after all?}
\end{Definition}

Similarly to ridge weight penalized neural networks, the parameter
$\lambda$ controls a trade-off between accuracy on the training data
and smoothness, i.e. a small second derivative. For $g \equiv 1$ and $\lambda \to 0$ the
resulting function $f^{*, 0+}$ will interpolate the training data while minimizing
the second derivative. Such a function is known as a cubic spline
interpolation.
\todo{cite cubic spline}

\[
  f^{*, 0+} \coloneqq \lim_{\lambda \to 0+} f^{*, \lambda}_1 \in
  \argmin_{\substack{f \in \mathcal{C}^2(\mathbb{R}), \\ f(x_i^{\text{train}}) =
      y_i^{\text{train}}}} \left( \int _{\mathbb{R}} \left(f''(x)\right)^2dx\right)
  \quad \text{(smooth spline interpolation)}.
\]

For $\lambda \to \infty$, on the other hand, $f_g^{*, \lambda}$ converges
to the linear regression of the data.
\begin{Definition}[Spline approximating Randomised Shallow Neural
  Network]
  \label{def:sann}
  Let $\mathcal{RN}$ be a randomised shallow neural network according
  to Definition~\ref{def:rsnn} and $f^{*, \lambda}_g$ be the weighted
  regression spline as introduced in Definition~\ref{def:wrs}. Then
  the randomised shallow neural network approximating $f^{*,
    \lambda}_g$ is given by
  \[
    \mathcal{RN}_{\tilde{w}}(x) = \sum_{k = 1}^n \tilde{w}_k \sigma(b_k + v_k x),
  \]
  with the weights $\tilde{w}_k$ defined as
  \[
    \tilde{w}_k \coloneqq \frac{h_{k,n} v_k}{\mathbb{E}[v^2 \vert \xi
      = \xi_k]} (f_g^{*, \lambda})''(\xi_k).
  \]
\end{Definition}

The approximating nature of the network in
Definition~\ref{def:sann} can be seen by examining the first derivative of
$\mathcal{RN}_{\tilde{w}}(x)$, which is given by
\begin{align}
  \frac{\partial \mathcal{RN}_{\tilde{w}}}{\partial x}
  \Big{|}_{x} &= \sum_{k=1}^n \tilde{w}_k v_k \mathds{1}_{\left\{b_k + v_k x >
                0\right\}} = \sum_{\substack{k \in \mathbb{N} \\ \xi_k <
                x}} \tilde{w}_k v_k \nonumber \\
              &= \frac{1}{n} \sum_{\substack{k \in \mathbb{N} \\
                \xi_k < x}} \frac{v_k^2}{g_{\xi}(\xi_k) \mathbb{E}[v^2 \vert \xi
                = \xi_k]} (f_g^{*, \lambda})''(\xi_k). \label{eq:derivnn}
\end{align}
As the expression (\ref{eq:derivnn}) behaves similarly to a
Riemann sum, for $n \to \infty$ it will converge to the first
derivative of $f^{*,\lambda}_g$. A formal proof of this behaviour
is given in Lemma~\ref{lem:s0}.

In order to formulate the theorem describing the convergence of
$\mathcal{RN}^{*, \tilde{\lambda}}$ we need to make a couple of assumptions.
\begin{Assumption}~
  \label{ass:theo38}
  \begin{enumerate}[label=(\alph*)]
  \item The probability density function of the kinks $\xi_k$,
    namely $g_{\xi}$ as defined in Definition~\ref{def:kink}, exists
    and is well defined.
  \item The density function $g_\xi$ has compact support.
  \item The density function $g_{\xi}$ is uniformly continuous on $\supp(g_{\xi})$.
  \item $g_{\xi}(0) \neq 0$.
  \item $\frac{1}{g_{\xi}}\Big|_{\supp(g_{\xi})}$ is uniformly
    continuous on $\supp(g_{\xi})$.
  \item The conditional distribution $\mathcal{L}(v_k|\xi_k = x)$
    is uniformly continuous in $x$ on $\supp(g_{\xi})$.
  \item $\mathbb{E}\left[v_k^2\right] < \infty$.
  \end{enumerate}
\end{Assumption}

As we will prove the proposition in a Sobolev space, we now
introduce it and its natural norm.
\begin{Definition}[Sobolev Space]
  For $K \subset \mathbb{R}^n$ open, $k \in \mathbb{N}$ and $1 \leq p \leq \infty$ we
  define the Sobolev space $W^{k,p}(K)$ as the space containing all
  real valued functions $u \in L^p(K)$ such that for every multi-index
  $\alpha \in \mathbb{N}^n$ with $\abs{\alpha} \leq
  k$ the mixed partial derivatives
  \[
    u^{(\alpha)} = \frac{\partial^{\abs{\alpha}} u}{\partial
      x_1^{\alpha_1} \dots \partial x_n^{\alpha_n}}
  \]
  exist in the weak sense and satisfy
  \[
    \norm{u^{(\alpha)}}_{L^p} < \infty.
  \]
  \label{def:sobonorm}
  The natural norm of the Sobolev space is given by
  \[
    \norm{f}_{W^{k,p}(K)} =
    \begin{cases}
      \left(\sum_{\abs{\alpha} \leq k}
        \norm{f^{(\alpha)}}^p_{L^p}\right)^{\nicefrac{1}{p}},&
      \text{for } 1 \leq p < \infty, \\
      \max_{\abs{\alpha} \leq k}\left\{\norm{f^{(\alpha)}}_{L^{\infty}}\right\},& \text{for
      } p = \infty.
    \end{cases}
  \]
\end{Definition}
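In particular, for $k = 1$, $p = \infty$ and an interval $K = [a,b]$, the
norm used in the results below is
\[
  \norm{f}_{W^{1,\infty}(K)} = \max\left\{\norm{f}_{L^{\infty}(K)},
    \norm{f'}_{L^{\infty}(K)}\right\},
\]
so convergence in $\norm{\cdot}_{W^{1,\infty}(K)}$ means uniform convergence
of both the function and its first derivative on $K$.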

With these assumptions in place we can formulate the main theorem.
\todo{Specify the set $K$ on which the convergence is stated}
\begin{Theorem}[Ridge weight penalty corresponds to weighted regression spline]
  \label{theo:main1}
  For $N \in \mathbb{N}$ arbitrary training data
  \(\left(x_i^{\text{train}}, y_i^{\text{train}}
  \right)\), $i \in \left\{1,\dots,N\right\}$, and $\mathcal{RN}^{*, \tilde{\lambda}}, f_g^{*, \lambda}$
  according to Definition~\ref{def:rpnn} and Definition~\ref{def:wrs}
  respectively, under Assumption~\ref{ass:theo38} it holds that
  \begin{equation}
    \label{eq:main1}
    \plimn \norm{\mathcal{RN}^{*, \tilde{\lambda}} - f^{*,
        \lambda}_{g}}_{W^{1,\infty}(K)} = 0,
  \end{equation}
  with
  \begin{align*}
    g(x) & \coloneqq g_{\xi}(x)\mathbb{E}\left[ v_k^2 \vert \xi_k = x
           \right], \forall x \in \mathbb{R}, \\
    \tilde{\lambda} & \coloneqq \lambda n g(0).
  \end{align*}
\end{Theorem}
We will prove Theorem~\ref{theo:main1} by showing that
\begin{equation}
  \label{eq:main2}
  \plimn \norm{\mathcal{RN}^{*, \tilde{\lambda}} - f^{w^*}}_{W^{1,
      \infty}(K)} = 0
\end{equation}
and
\begin{equation}
  \label{eq:main3}
  \plimn \norm{f^{w^*} - f_g^{*, \lambda}}_{W^{1,\infty}(K)} = 0
\end{equation}
and then using the triangle inequality to obtain (\ref{eq:main1}). In
order to prove (\ref{eq:main2}) and (\ref{eq:main3}) we need to
introduce a number of auxiliary lemmata; their proofs are
provided in the appendix, as they would exceed the scope of this section.
\begin{Lemma}[Poincar\'e type inequality]
  \label{lem:pieq}
  Let \(f:\mathbb{R} \to \mathbb{R}\) be differentiable with \(f' :
  \mathbb{R} \to \mathbb{R}\) Lebesgue integrable. Then for \(K=[a,b]
  \subset \mathbb{R}\) with \(f(a)=0\) it holds that
  \begin{equation*}
    \label{eq:pti1}
    \exists C_K^{\infty} \in \mathbb{R}_{>0} :
    \norm{f}_{W^{1,\infty}(K)} \leq C_K^{\infty}
    \norm{f'}_{L^{\infty}(K)}.
  \end{equation*}
  If additionally \(f'\) is differentiable with \(f'': \mathbb{R} \to
  \mathbb{R}\) Lebesgue integrable, then
  \begin{equation*}
    \label{eq:pti2}
    \exists C_K^2 \in \mathbb{R}_{>0} : \norm{f}_{W^{1,\infty}(K)} \leq
    C_K^2 \norm{f''}_{L^2(K)}.
  \end{equation*}
% \proof
% With the fundamental theorem of calculus, if
% \(\norm{f}_{L^{\infty}(K)}<\infty\) we get
% \begin{equation}
% \label{eq:f_f'}
% \norm{f}_{L^{\infty}(K)} = \sup_{x \in K}\abs{\int_a^x f'(s) ds} \leq
% \sup_{x \in K}\abs{\int_a^x \sup_{y \in K} \abs{f'(y)} ds} \leq \abs{b-a}
% \sup_{y \in K}\abs{f'(y)}.
% \end{equation}
% Using this we can bound \(\norm{f}_{w^{1,\infty}(K)}\) by
% \[
% \norm{f}_{w^{1,\infty}(K)} \stackrel{\text{Def~\ref{def:sobonorm}}}{=}
% \max\left\{\norm{f}_{L^{\infty}(K)},
% \norm{f'}_{L^{\infty}(K)}\right\}
% \stackrel{(\ref{eq:f_f'})}{\leq} max\left\{\abs{b-a},
% 1\right\}\norm{f'}_{L^{\infty}(K)}.
% \]
% With \(C_k^{\infty} \coloneqq max\left\{\abs{b-a}, 1\right\}\) we
% get (\ref{eq:pti1}).
% By using the Hölder inequality, we can proof the second claim.
% \begin{align*}
% \norm{f'}_{L^{\infty}(K)} &= \sup_{x \in K} \abs{\int_a^bf''(y)
% \mathds{1}_{[a,x]}(y)dy} \leq \sup_{x \in
% K}\norm{f''\mathds{1}_{[a,x]}}_{L^1(K)}\\
% &\hspace{-6pt} \stackrel{\text{Hölder}}{\leq} sup_{x
% \in
% K}\norm{f''}_{L^2(K)}\norm{\mathds{1}_{[a,x]}}_{L^2(K)}
% = \abs{b-a}\norm{f''}_{L^2(K)}.
% \end{align*}
% Thus (\ref{eq:pti2}) follows with \(C_K^2 \coloneqq
% \abs{b-a}C_K^{\infty}\).
% \qed
\end{Lemma}

\begin{Lemma}
  \label{lem:cnvh}
  Let $\mathcal{RN}$ be a randomized shallow neural network. For \(\varphi :
  \mathbb{R}^2 \to \mathbb{R}\) uniformly continuous such that
  \[
    \forall x \in \supp(g_{\xi}) : \mathbb{E}\left[\varphi(\xi, v)
      \frac{1}{n g_{\xi}(\xi)} \vert \xi = x \right] < \infty,
  \]
  it holds that
  \[
    \plimn \sum_{k \in \kappa : \xi_k < T} \varphi(\xi_k, v_k)
    h_{k,n}
    =\int_{\min\left\{C_{g_{\xi}}^l, T\right\}}^{\min\left\{C_{g_{\xi}}^u,T\right\}}
    \mathbb{E}\left[\varphi(\xi, v) \vert \xi = x \right] dx
  \]
  uniformly in \(T \in K\).
% \proof
% For \(T \leq C_{g_{\xi}}^l\) both sides equal 0, so it is sufficient to
% consider \(T > C_{g_{\xi}}^l\). With \(\varphi\) and
% \(\nicefrac{1}{g_{\xi}}\) uniformly continous in \(\xi\),
% \begin{equation}
% \label{eq:psi_stet}
% \forall \varepsilon > 0 : \exists \delta(\varepsilon) : \forall
% \abs{\xi - \xi'} < \delta(\varepsilon) : \abs{\varphi(\xi, v)
% \frac{1}{g_{\xi}(\xi)} - \varphi(\xi', v)
% \frac{1}{g_{\xi}(\xi')}} < \varepsilon
% \end{equation}
% uniformly in \(v\). In order to
% save space we use the notation \((a \wedge b) \coloneqq \min\{a,b\}\) for $a$ and $b
% \in \mathbb{R}$. W.l.o.g. assume \(\sup(g_{\xi})\) in an
% intervall. By splitting the interval in disjoint strips of length \(\delta
% \leq \delta(\varepsilon)\) we get:

% \[
% \underbrace{\sum_{k \in \kappa : \xi_k < T} \varphi(\xi_k, v_k)
% \frac{\bar{h}_k}{2}}_{\circled{1}} =
% \underbrace{\sum_{l \in \mathbb{Z}:
% \left[\delta l, \delta (l + 1)\right] \subseteq
% \left[C_{g_{\xi}}^l, C_{g_{\xi}}^u \wedge T
% \right]}}_{\coloneqq \, l \in I_{\delta}} \left( \, \sum_{\substack{k \in \kappa\\
% \xi_k \in \left[\delta l, \delta (l + 1)\right]}}
% \varphi\left(\xi_k, v_k\right)\frac{\bar{h}_k}{2} \right)
% \]
% Using (\ref{eq:psi_stet}) we can approximate $\circled{1}$ by
% \begin{align*}
% \circled{1} & \approx \sum_{l \in I_{\delta}} \left( \, \sum_{\substack{k \in \kappa\\
% \xi_k \in \left[\delta l, \delta (l + 1)\right]}}
% \left(\varphi\left(l\delta, v_k\right)\frac{1}{g_{\xi}(l\delta)}
% \pm \varepsilon\right)\frac{1}{n} \underbrace{\frac{\abs{\left\{m \in
% \kappa : \xi_m \in [\delta l, \delta(l + 1)]\right\}}}{\abs{\left\{m \in
% \kappa : \xi_m \in [\delta l, \delta(l + 1)]\right\}}}}_{=
% 1}\right) \\
% % \intertext{}
% &= \sum_{l \in I_{\delta}} \left( \frac{ \sum_{ \substack{k \in \kappa\\
% \xi_k \in \left[\delta l, \delta (l + 1)\right]}}
% \varphi\left(l\delta, v_k\right)}
% {\abs{\left\{m \in
% \kappa : \xi_m \in [\delta l, \delta(l + 1)]\right\}}}\frac{\abs{\left\{m \in
% \kappa : \xi_m \in [\delta l, \delta(l +
% 1)]\right\}}}{ng_{\xi}(l\delta)}\right) \pm \varepsilon .\\
% \intertext{We use the mean to approximate the number of kinks in
% each $\delta$-strip, as it follows a bonomial distribution this
% amounts to
% \[
% \mathbb{E}\left[\abs{\left\{m \in \kappa : \xi_m \in [\delta l,
% \delta(l + 1)]\right\}\right]} = n \int_{[\delta l, \delta (l +
% 1)]} g_{\xi}(x)dx \approx n (\delta g_{\xi}(l\delta) \pm
% \tilde{\varepsilon}).
% \]
% Bla Bla Bla $v_k$}
% \circled{1} & \approx
% \end{align*}
\end{Lemma}
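Taking $\varphi \equiv 1$ in Lemma~\ref{lem:cnvh} illustrates the role of
$h_{k,n}$ as the width of a Riemann-sum cell: in that case the statement
reads
\[
  \plimn \sum_{k \in \kappa : \xi_k < T} h_{k,n}
  = \min\left\{C_{g_{\xi}}^u, T\right\} - \min\left\{C_{g_{\xi}}^l, T\right\},
\]
i.e. the summed cell widths converge to the length of the integration
interval.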
\begin{Lemma}[Step 0]
  \label{lem:s0}
  For any $\lambda > 0$, training data $(x_i^{\text{train}},
  y_i^{\text{train}}) \in \mathbb{R}^2$, with $ i \in
  \left\{1,\dots,N\right\}$, and subset $K \subset \mathbb{R}$, the spline approximating randomized
  shallow neural network $\mathcal{RN}_{\tilde{w}}$ converges to the
  regression spline $f^{*, \lambda}_g$ in
  $\norm{.}_{W^{1,\infty}(K)}$ as the node count $n$ increases,
  \begin{equation}
    \label{eq:s0}
    \plimn \norm{\mathcal{RN}_{\tilde{w}} - f^{*, \lambda}_g}_{W^{1,
        \infty}(K)} = 0.
  \end{equation}
  \proof
  Using Lemma~\ref{lem:pieq} it is sufficient to show
  \[
    \plimn \norm{\mathcal{RN}_{\tilde{w}}' - (f^{*,
        \lambda}_g)'}_{L^{\infty}(K)} = 0.
  \]
  This can be achieved by using Lemma~\ref{lem:cnvh} with $\varphi(\xi_k,
  v_k) = \frac{v_k^2}{\mathbb{E}[v^2|\xi = \xi_k]} (f^{*, \lambda}_g)''(\xi_k)$,
  thus obtaining
  \begin{align*}
    \plimn \frac{\partial \mathcal{RN}_{\tilde{w}}}{\partial x}\Big{|}_{x}
    &\stackrel{(\ref{eq:derivnn})}{=}
      \plimn \sum_{\substack{k \in \mathbb{N} \\
      \xi_k < x}} \frac{v_k^2}{\mathbb{E}[v^2 \vert \xi
      = \xi_k]} (f_g^{*, \lambda})''(\xi_k) h_{k,n} \\
    &\stackrel{\text{Lemma}~\ref{lem:cnvh}}{=}
      \int_{\min\left\{C_{g_{\xi}}^l,x\right\}}^{\min\left\{C_{g_{\xi}}^u,x\right\}}
      \mathbb{E}\left[\frac{v^2}{\mathbb{E}[v^2|\xi = u]} (f^{*,
      \lambda}_g)''(\xi) \,\Big\vert\,
      \xi = u \right] du \\
    &\stackrel{\text{tower property}}{=}
      \int_{\min\left\{C_{g_{\xi}}^l,
      x\right\}}^{\min\left\{C_{g_{\xi}}^u,x\right\}}(f^{*,\lambda}_g)''(u)
      \, du.
  \end{align*}
  By the fundamental theorem of calculus and $\supp(f') \subset
  \supp(f)$, (\ref{eq:s0}) follows with Lemma~\ref{lem:pieq}.
  \qed
\end{Lemma}

\begin{Lemma}[Step 2]
  For any $\lambda > 0$ and training data $(x_i^{\text{train}},
  y_i^{\text{train}}) \in \mathbb{R}^2, \, i \in
  \left\{1,\dots,N\right\}$, we have
  \[
    \plimn F^{\tilde{\lambda}}_n(\mathcal{RN}_{\tilde{w}}) =
    F^{\lambda, g}(f^{*, \lambda}_g).
  \]
  \proof
  This can be proven by showing
\end{Lemma}

\begin{Lemma}[Step 3]
  For any $\lambda > 0$ and training data $(x_i^{\text{train}},
  y_i^{\text{train}}) \in \mathbb{R}^2, \, i \in
  \left\{1,\dots,N\right\}$, with $w^*$ and $\tilde{\lambda}$ as
  defined in Definition~\ref{def:rpnn} and Theorem~\ref{theo:main1}
  respectively, it holds that
  \[
    \plimn \norm{\mathcal{RN}^{*,\tilde{\lambda}} -
    f^{w^*, \tilde{\lambda}}}_{W^{1,\infty}(K)} = 0.
  \]
\end{Lemma}

\begin{Lemma}[Step 4]
  For any $\lambda > 0$ and training data $(x_i^{\text{train}},
  y_i^{\text{train}}) \in \mathbb{R}^2, \, i \in
  \left\{1,\dots,N\right\}$, with $w^*$ and $\tilde{\lambda}$ as
  defined in Definition~\ref{def:rpnn} and Theorem~\ref{theo:main1}
  respectively, it holds that
  \[
    \plimn \abs{F_n^{\tilde{\lambda}}(\mathcal{RN}^{*,\tilde{\lambda}}) -
    F^{\lambda, g}(f^{w^*, \tilde{\lambda}})} = 0.
  \]
\end{Lemma}

\begin{Lemma}[Step 7]
  For any $\lambda > 0$ and training data $(x_i^{\text{train}},
  y_i^{\text{train}}) \in \mathbb{R}^2, \, i \in
  \left\{1,\dots,N\right\}$, and for any sequence of functions $f^n \in
  W^{2,2}$ with
  \[
    \plimn F^{\lambda, g} (f^n) = F^{\lambda, g}(f^{*, \lambda}_g),
  \]
  it follows that
  \[
    \plimn \norm{f^n - f^{*, \lambda}_g} = 0.
  \]
\end{Lemma}

\textcite{heiss2019} further show a link between ridge penalized
networks and randomized shallow neural networks that are trained with
gradient descent which is stopped after a certain number of iterations.

\newpage
\subsection{Simulations}
In the following, the behaviour described in Theorem~\ref{theo:main1}
is visualized in a simulated example. For this, two sets of training
data have been generated:
\begin{itemize}
\item $\text{data}_A = (x_{i, A}^{\text{train}},
  y_{i,A}^{\text{train}})$ with
  \begin{align*}
    x_{i, A}^{\text{train}} &\coloneqq -\pi + \frac{2 \pi}{5} (i - 1),
                              i \in \left\{1, \dots, 6\right\}, \\
    y_{i, A}^{\text{train}} &\coloneqq \sin( x_{i, A}^{\text{train}}). \phantom{(i - 1),
                              i \in \left\{1, \dots, 6\right\}}
  \end{align*}
\item $\text{data}_B = (x_{i, B}^{\text{train}}, y_{i,
  B}^{\text{train}})$ with
  \begin{align*}
    x_{i, B}^{\text{train}} &\coloneqq \pi\frac{i - 8}{7},
                              i \in \left\{1, \dots, 15\right\}, \\
    y_{i, B}^{\text{train}} &\coloneqq \sin( x_{i, B}^{\text{train}}). \phantom{(i - 1),
                              i \in \left\{1, \dots, 6\right\}}
  \end{align*}
\end{itemize}
For the $\mathcal{RN}$ the random parameters are distributed
as follows:
\begin{align*}
  \xi_i &\stackrel{i.i.d.}{\sim} \text{Unif}(-5,5), \\
  v_i &\stackrel{i.i.d.}{\sim} \mathcal{N}(0, 5), \\
  b_i &\stackrel{\phantom{i.i.d.}}{=} -\xi_i v_i.
\end{align*}
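A minimal Python sketch of this sampling scheme and of $\text{data}_A$ (our
own illustration, not the code used for the figures; we treat the second
parameter of $\mathcal{N}(0,5)$ as the variance) reads:
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(42)
n = 500                                      # number of hidden nodes

# training data A: six equidistant points of a sine wave
i = np.arange(1, 7)
x_train = -np.pi + 2.0 * np.pi / 5.0 * (i - 1)
y_train = np.sin(x_train)

# random inner parameters as specified above
xi = rng.uniform(-5.0, 5.0, n)               # kink positions, g_xi = 1/10
v = rng.normal(0.0, np.sqrt(5.0), n)         # inner weights (variance 5 assumed)
b = -xi * v                                  # biases, so node k kinks at xi_k
\end{verbatim}
The outer weights $w$ are then obtained by minimizing
$F_n^{\tilde{\lambda}}$ with $\tilde{\lambda} = \lambda n g(0)$, e.g. by
gradient descent as described below.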
Note that for these choices of distributions, $g$ as defined in
Theorem~\ref{theo:main1}
equates to $g(x) = \frac{\mathbb{E}[v_k^2|\xi_k = x]}{10}$. In
order to utilize the
smoothing spline implemented in Matlab, $g$ has been simplified to $g
\equiv \frac{1}{10}$ instead. For all figures $f_1^{*, \lambda}$ has
been calculated with Matlab's smoothing spline implementation. As this implementation minimizes
\[
  \bar{\lambda} \sum_{i=1}^N(y_i^{\text{train}} - f(x_i^{\text{train}}))^2 + (1 -
  \bar{\lambda}) \int (f''(x))^2 dx,
\]
the smoothing parameter used for fitting is $\bar{\lambda} =
\frac{1}{1 + \lambda}$; dividing the above objective by $\bar{\lambda}$
shows that it then has the same minimizer as $F^{\lambda, g}$ with
$g \equiv \frac{1}{10}$. The parameter $\tilde{\lambda}$ for training
the networks is chosen as defined in Theorem~\ref{theo:main1} and each
one is trained on the full training data for 5000 iterations using
gradient descent. The
results are given in Figure~\ref{blblb}; here it can be seen that on
the interval of the training data $[-\pi, \pi]$ the neural network and
smoothing spline are nearly identical, coinciding with the proposition.

\input{Plots/RN_vs_RS}

%%% Local Variables:
%%% mode: latex
%%% TeX-master: "main"
%%% End: