In this section we will analyze the connection between shallow Neural
Networks and regression splines. We will see that penalizing the weight size
when training the shallow Neural Network results in a function that minimizes
the second derivative, as the number of hidden nodes grows to infinity. In
order to properly formulate this relation we first need to introduce some
definitions.

\begin{Definition}[Ridge penalized Neural Network]
  \label{def:rpnn}
  Let $\mathcal{RN}_{w, \omega}$ be a randomized shallow Neural Network, as
  introduced in Definition~\ref{def:rsnn}. Then the optimal ridge penalized
  network is given by
  \[
    \mathcal{RN}^{*, \tilde{\lambda}}_{\omega}(x) \coloneqq
    \mathcal{RN}_{w^{*, \tilde{\lambda}}(\omega), \omega}(x)
  \]
  with
  \[
    w^{*,\tilde{\lambda}}(\omega) :\in \argmin_{w \in \mathbb{R}^n}
    \underbrace{\left\{\overbrace{\sum_{i = 1}^N \left(\mathcal{RN}_{w,
              \omega}(x_i^{\text{train}}) -
            y_i^{\text{train}}\right)^2}^{L(\mathcal{RN}_{w, \omega})} +
        \tilde{\lambda} \norm{w}_2^2\right\}}_{\eqqcolon
      F_n^{\tilde{\lambda}}(\mathcal{RN}_{w,\omega})}.
  \]
\end{Definition}
In the ridge penalized Neural Network large weights are penalized, the extent
of which can be tuned with the parameter $\tilde{\lambda}$. If $n$ is larger
than the number of training samples $N$, then for $\tilde{\lambda} \to 0$ the
network will interpolate the data while having minimal weights, resulting in
the \textit{minimum norm network} $\mathcal{RN}_{w^{\text{min}}, \omega}$:
\[
  \mathcal{RN}_{w^{\text{min}}, \omega} \text{ randomized shallow Neural
    Network with weights } w^{\text{min}}:
\]
\[
  w^{\text{min}} \in \argmin_{w \in \mathbb{R}^n} \norm{w}, \text{ s.t. }
  \mathcal{RN}_{w,\omega}(x_i^{\text{train}}) = y_i^{\text{train}}, \,
  \forall i \in \left\{1,\dots,N\right\}.
\]
For $\tilde{\lambda} \to \infty$ the learned function will resemble the data
less and less, with the weights approaching $0$. Usually $\tilde{\lambda}$
lies between 0 and 1, as for larger values the focus on weight reduction
outweighs fitting the data.\par
In order to make the notation more convenient, in the following the $\omega$
used to express the realized random parameters will no longer be explicitly
mentioned.

\begin{Definition}
  \label{def:kink}
  Let $\mathcal{RN}_w$ be a randomized shallow Neural Network according to
  Definition~\ref{def:rsnn}, then kinks depending on the random parameters
  can be observed:
  \[
    \mathcal{RN}_w(x) = \sum_{k = 1}^n w_k \gamma(b_k + v_kx).
  \]
  Because we specified $\gamma(y) \coloneqq \max\left\{0, y\right\}$, a kink
  in $\gamma$ can be observed at $\gamma(0) = 0$. As $b_k + v_kx = 0$ for $x
  = -\frac{b_k}{v_k}$, we define the following:
  \begin{enumerate}[label=(\alph*)]
  \item Let $\xi_k \coloneqq -\frac{b_k}{v_k}$ be the $k$-th kink of
    $\mathcal{RN}_w$.
  \item Let $g_{\xi}(\xi_k)$ be the density of the kinks $\xi_k =
    -\frac{b_k}{v_k}$ in accordance with the distributions of $b_k$ and $v_k$.
  \item Let $h_{k,n} \coloneqq \frac{1}{n g_{\xi}(\xi_k)}$ be the estimated
    average distance from kink $\xi_k$ to the nearest neighboring one.
  \end{enumerate}
\end{Definition}
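The optimal weights of Definition~\ref{def:rpnn} can be computed in closed
form, since only the outer weights $w$ are trained: with the feature matrix
$\Phi_{ik} = \max\{0, b_k + v_k x_i^{\text{train}}\}$ one has
$w^{*,\tilde{\lambda}} = (\Phi^\top \Phi + \tilde{\lambda} I)^{-1} \Phi^\top
y^{\text{train}}$. The following Python/NumPy sketch illustrates this
together with the kinks $\xi_k$ of Definition~\ref{def:kink}. The
distributions of $(b_k, v_k)$ and the training data are placeholders chosen
for convenience here and are not the choices made in
Definition~\ref{def:rsnn}.
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)

# randomized shallow network: only the outer weights w are trained
n = 2000                                 # number of hidden nodes
v = rng.choice([-1.0, 1.0], size=n)      # inner weights (placeholder distribution)
b = rng.uniform(-5.0, 5.0, size=n)       # biases        (placeholder distribution)
xi = -b / v                              # kinks xi_k = -b_k / v_k, one per node

def features(x):
    """ReLU feature matrix Phi with Phi[i, k] = max(0, b_k + v_k * x_i)."""
    x = np.asarray(x, dtype=float)
    return np.maximum(0.0, b[None, :] + v[None, :] * x[:, None])

# toy training data
x_train = np.linspace(-2.0, 2.0, 20)
y_train = np.sin(3.0 * x_train) + 0.1 * rng.standard_normal(20)
Phi = features(x_train)

def fit_ridge(lam_tilde):
    """Outer weights minimizing ||Phi w - y||^2 + lam_tilde * ||w||^2."""
    return np.linalg.solve(Phi.T @ Phi + lam_tilde * np.eye(n), Phi.T @ y_train)

def RN(x, w):
    """Evaluate RN_w(x) = sum_k w_k * max(0, b_k + v_k * x)."""
    return features(x) @ w

w_star = fit_ridge(1e-3)                 # the ridge penalized network RN^{*,lam~}
\end{verbatim}
For $\tilde{\lambda} \to 0$ the solution of this linear system approaches the
minimum norm interpolant whenever $\Phi$ has full row rank, which matches the
minimum norm network $\mathcal{RN}_{w^{\text{min}}, \omega}$ described above.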
In order to later prove the connection between randomized shallow Neural
Networks and regression splines, we first take a look at a smooth
approximation of the RSNN.

\begin{Definition}[Smooth Approximation of Randomized Shallow Neural Network]
  \label{def:srsnn}
  Let $\mathcal{RN}_w$ be a randomized shallow Neural Network according to
  Definition~\ref{def:rsnn} with weights $w$ and kinks $\xi_k$ with
  corresponding kink density $g_{\xi}$ as given by Definition~\ref{def:kink}.
  In order to smooth the RSNN consider the following kernel for every $x$:
  \[
    \kappa_x(s) \coloneqq \mathds{1}_{\left\{\abs{s} \leq \frac{1}{2 \sqrt{n}
          g_{\xi}(x)}\right\}}(s)\sqrt{n} g_{\xi}(x), \, \forall s \in \mathbb{R}.
  \]
  Using this kernel we define a smooth approximation of $\mathcal{RN}_w$ by
  \[
    f^w(x) \coloneqq \int_{\mathbb{R}} \mathcal{RN}_w(x-s) \kappa_x(s) ds.
  \]
\end{Definition}
Note that the kernel introduced in Definition~\ref{def:srsnn} satisfies
$\int_{\mathbb{R}}\kappa_x(s) ds = 1$. While $f^w$ looks highly similar to a
convolution, it differs slightly as the kernel $\kappa_x(s)$ depends on $x$.
Therefore only $f^w(x) = (\mathcal{RN}_w * \kappa_x)(x)$ is well defined,
while $\mathcal{RN}_w * \kappa$ is not.

Now we take a look at weighted regression splines. Later we will prove that
the ridge penalized neural network as defined in Definition~\ref{def:rpnn}
converges to a weighted regression spline as the number of hidden nodes grows
to infinity.

\begin{Definition}[Weighted regression spline]
  \label{def:wrs}
  Let $x_i^{\text{train}}, y_i^{\text{train}} \in \mathbb{R}, i \in
  \left\{1,\dots,N\right\}$, be training data. For a given $\lambda \in
  \mathbb{R}_{>0}$ and a function $g: \mathbb{R} \to \mathbb{R}_{>0}$ the
  weighted regression spline $f^{*, \lambda}_g$ is given by
  \[
    f^{*, \lambda}_g :\in \argmin_{\substack{f \in \mathcal{C}^2(\mathbb{R})
        \\ \supp(f) \subseteq \supp(g)}} \underbrace{\left\{
        \overbrace{\sum_{i = 1}^N \left(f(x_i^{\text{train}}) -
            y_i^{\text{train}}\right)^2}^{L(f)} + \lambda g(0)
        \int_{\supp(g)}\frac{\left(f''(x)\right)^2}{g(x)}
        dx\right\}}_{\eqqcolon F^{\lambda, g}(f)}.
  \]
\end{Definition}
Similarly to the ridge weight penalized neural networks, the parameter
$\lambda$ controls a trade-off between accuracy on the training data and
smoothness, i.e.\ a low second derivative. For $g \equiv 1$ and $\lambda \to
0$ the resulting function $f^{*, 0+}$ will interpolate the training data
while minimizing the second derivative. Such a function is known as a smooth
spline interpolation or (cubic) smoothing spline.
\[
  f^{*, 0+} \text{ smooth spline interpolation: }
\]
\[
  f^{*, 0+} \coloneqq \lim_{\lambda \to 0+} f^{*, \lambda}_1 \in
  \argmin_{\substack{f \in \mathcal{C}^2(\mathbb{R}), \\
      f(x_i^{\text{train}}) = y_i^{\text{train}}}} \left( \int_{\mathbb{R}}
    \left(f''(x)\right)^2 dx\right).
\]
\begin{Assumption}~
  \begin{enumerate}[label=(\alph*)]
  \item The probability density function of the kinks $\xi_k$, namely
    $g_{\xi}$, has compact support $\supp(g_{\xi})$.
  \item The density $g_{\xi}$ is uniformly continuous on $\supp(g_{\xi})$.
  \item $g_{\xi}(0) \neq 0$.
  \end{enumerate}
\end{Assumption}

\begin{Theorem}[Ridge weight penalty corresponds to adapted spline]
  \label{theo:main1}
  For arbitrary training data \(\left(x_i^{\text{train}},
    y_i^{\text{train}}\right)\) it holds that
  \[
    \plimn \norm{\mathcal{RN}^{*, \tilde{\lambda}} - f^{*,
        \lambda}_{g}}_{W^{1,\infty}(K)} = 0,
  \]
  with
  \begin{align*}
    \tilde{\lambda} &\coloneqq \lambda n g(0), \\
    g(x) &\coloneqq g_{\xi}(x)\mathbb{E}\left[ v_k^2 \vert \xi_k = x \right],
    \, \forall x \in \mathbb{R},
  \end{align*}
  and \(\mathcal{RN}^{*, \tilde{\lambda}}\), \(f^{*,\lambda}_{g}\) as defined
  in Definition~\ref{def:rpnn} and Definition~\ref{def:wrs} respectively.
\end{Theorem}
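The functional $F^{\lambda, g}$ of Definition~\ref{def:wrs} can be minimized
approximately by discretizing $f$ on a fine grid and replacing $f''$ by
second differences. Combined with the sketch above this gives a rough
numerical illustration of Theorem~\ref{theo:main1}. It is only a
finite-difference sketch under the placeholder assumptions from before
(nearest-grid-point evaluation of $f$ at the training points, and $v_k = \pm
1$, $b_k \sim \mathcal{U}(-5,5)$, for which $g \equiv \frac{1}{10}$ is
constant), not the exact $\mathcal{C}^2$ minimizer.
\begin{verbatim}
def weighted_regression_spline(x_train, y_train, g, lam, grid):
    """Finite-difference approximation of the weighted regression spline:
    minimize sum_i (f(x_i)-y_i)^2 + lam*g(0)*int (f'')^2/g over f on grid."""
    m = len(grid)
    dt = grid[1] - grid[0]

    # evaluation matrix: f on the grid -> f at the training points
    # (crudely, via the nearest grid point)
    idx = np.argmin(np.abs(grid[None, :] - x_train[:, None]), axis=1)
    E = np.zeros((len(x_train), m))
    E[np.arange(len(x_train)), idx] = 1.0

    # second-difference operator approximating f'' at interior grid points
    D2 = (np.eye(m, k=1) - 2.0 * np.eye(m) + np.eye(m, k=-1))[1:-1] / dt**2
    W = np.diag(1.0 / g(grid[1:-1]))     # curvature weight 1/g(x)

    # normal equations of the discretized functional F^{lam,g}
    A = E.T @ E + lam * g(0.0) * dt * (D2.T @ W @ D2)
    return np.linalg.solve(A, E.T @ y_train)

# with v_k = +-1 and b_k ~ U(-5,5) the kinks are uniform on [-5,5], hence
# g(x) = g_xi(x) * E[v_k^2 | xi_k = x] = 1/10 and lambda~ = lambda * n * g(0)
lam = 0.01
w_star = fit_ridge(lam * n * 0.1)        # network trained with matching lambda~

grid = np.linspace(-2.0, 2.0, 401)
g = lambda x: np.full_like(np.asarray(x, dtype=float), 0.1)
f_spline = weighted_regression_spline(x_train, y_train, g, lam, grid)

# sup-distance on the grid; it should shrink as n is increased
print(np.max(np.abs(RN(grid, w_star) - f_spline)))
\end{verbatim}
A proper implementation would use natural cubic splines instead of the
uniform grid, but the qualitative agreement with
$\mathcal{RN}^{*,\tilde{\lambda}}$ should already improve as $n$ grows, in
line with Theorem~\ref{theo:main1}.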
In order to prove Theorem~\ref{theo:main1} we first need to prove a number of
auxiliary lemmata.

\begin{Definition}[Sobolev Norm]
  \label{def:sobonorm}
  The natural norm of the Sobolev space is given by
  \[
    \norm{f}_{W^{k,p}(K)} =
    \begin{cases}
      \left(\sum_{\abs{\alpha} \leq k}
        \norm{f^{(\alpha)}}^p_{L^p}\right)^{\nicefrac{1}{p}},& \text{for } 1
      \leq p < \infty, \\
      \max_{\abs{\alpha} \leq
        k}\left\{\norm{f^{(\alpha)}}_{L^{\infty}}\right\},& \text{for } p =
      \infty.
    \end{cases}
  \]
\end{Definition}

\begin{Lemma}[Poincar\'e type inequality]
  Let \(f:\mathbb{R} \to \mathbb{R}\) be differentiable with \(f' :
  \mathbb{R} \to \mathbb{R}\) Lebesgue integrable. Then for \(K=[a,b]
  \subset \mathbb{R}\) with \(f(a)=0\) it holds that
  \begin{equation*}
    \label{eq:pti1}
    \exists C_K^{\infty} \in \mathbb{R}_{>0} : \norm{f}_{W^{1,\infty}(K)}
    \leq C_K^{\infty} \norm{f'}_{L^{\infty}(K)}.
  \end{equation*}
  If additionally \(f'\) is differentiable with \(f'': \mathbb{R} \to
  \mathbb{R}\) Lebesgue integrable, then
  \begin{equation*}
    \label{eq:pti2}
    \exists C_K^2 \in \mathbb{R}_{>0} : \norm{f}_{W^{1,\infty}(K)} \leq
    C_K^2 \norm{f''}_{L^2(K)}.
  \end{equation*}
  % \proof
  % With the fundamental theorem of calculus, if
  % \(\norm{f}_{L^{\infty}(K)}<\infty\) we get
  % \begin{equation}
  %   \label{eq:f_f'}
  %   \norm{f}_{L^{\infty}(K)} = \sup_{x \in K}\abs{\int_a^x f'(s) ds} \leq
  %   \sup_{x \in K}\abs{\int_a^x \sup_{y \in K} \abs{f'(y)} ds} \leq \abs{b-a}
  %   \sup_{y \in K}\abs{f'(y)}.
  % \end{equation}
  % Using this we can bound \(\norm{f}_{w^{1,\infty}(K)}\) by
  % \[
  %   \norm{f}_{w^{1,\infty}(K)} \stackrel{\text{Def~\ref{def:sobonorm}}}{=}
  %   \max\left\{\norm{f}_{L^{\infty}(K)},
  %     \norm{f'}_{L^{\infty}(K)}\right\}
  %   \stackrel{(\ref{eq:f_f'})}{\leq} max\left\{\abs{b-a},
  %     1\right\}\norm{f'}_{L^{\infty}(K)}.
  % \]
  % With \(C_k^{\infty} \coloneqq max\left\{\abs{b-a}, 1\right\}\) we
  % get (\ref{eq:pti1}).
  % By using the Hölder inequality, we can proof the second claim.
  % \begin{align*}
  %   \norm{f'}_{L^{\infty}(K)} &= \sup_{x \in K} \abs{\int_a^bf''(y)
  %   \mathds{1}_{[a,x]}(y)dy} \leq \sup_{x \in
  %   K}\norm{f''\mathds{1}_{[a,x]}}_{L^1(K)}\\
  %   &\hspace{-6pt} \stackrel{\text{Hölder}}{\leq} sup_{x
  %   \in
  %   K}\norm{f''}_{L^2(K)}\norm{\mathds{1}_{[a,x]}}_{L^2(K)}
  %   = \abs{b-a}\norm{f''}_{L^2(K)}.
  % \end{align*}
  % Thus (\ref{eq:pti2}) follows with \(C_K^2 \coloneqq
  % \abs{b-a}C_K^{\infty}\).
  % \qed
\end{Lemma}
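Both Theorem~\ref{theo:main1} and the preceding lemma are phrased in the
$W^{1,\infty}(K)$ norm of Definition~\ref{def:sobonorm}, so the numerical
sketches above should compare first derivatives as well. The helper below is
a hypothetical grid-based approximation of this norm via central finite
differences, intended only for such experiments.
\begin{verbatim}
def dist_W1inf(f_vals, g_vals, grid):
    """Grid estimate of ||f - g||_{W^{1,infty}(K)}: the larger of the sup-norm
    of the difference and the sup-norm of its first derivative."""
    diff = np.asarray(f_vals) - np.asarray(g_vals)
    d_diff = np.gradient(diff, grid)     # central finite differences
    return max(np.max(np.abs(diff)), np.max(np.abs(d_diff)))

# e.g. dist_W1inf(RN(grid, w_star), f_spline, grid)
\end{verbatim}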
\begin{Lemma}
  Let $\mathcal{RN}_w$ be a randomized shallow Neural Network. For \(\varphi
  : \mathbb{R}^2 \to \mathbb{R}\) uniformly continuous such that
  \[
    \forall x \in \supp(g_{\xi}) : \mathbb{E}\left[\varphi(\xi, v)
      \frac{1}{n g_{\xi}(\xi)} \vert \xi = x \right] < \infty,
  \]
  it holds that
  \[
    \plimn \sum_{k \in \kappa : \xi_k < T} \varphi(\xi_k, v_k)
    \frac{\bar{h}_k}{2}
    = \int_{\min\left\{C_{g_{\xi}}^l,T\right\}}^{\min\left\{C_{g_{\xi}}^u,T\right\}}
    \mathbb{E}\left[\varphi(\xi, v) \vert \xi = x \right] dx
  \]
  uniformly in \(T \in K\).
  % \proof
  % For \(T \leq C_{g_{\xi}}^l\) both sides equal 0, so it is sufficient to
  % consider \(T > C_{g_{\xi}}^l\). With \(\varphi\) and
  % \(\nicefrac{1}{g_{\xi}}\) uniformly continous in \(\xi\),
  % \begin{equation}
  %   \label{eq:psi_stet}
  %   \forall \varepsilon > 0 : \exists \delta(\varepsilon) : \forall
  %   \abs{\xi - \xi'} < \delta(\varepsilon) : \abs{\varphi(\xi, v)
  %   \frac{1}{g_{\xi}(\xi)} - \varphi(\xi', v)
  %   \frac{1}{g_{\xi}(\xi')}} < \varepsilon
  % \end{equation}
  % uniformly in \(v\). In order to save space we use the notation
  % \((a \wedge b) \coloneqq \min\{a,b\}\) for $a$ and $b \in \mathbb{R}$.
  % W.l.o.g. assume \(\sup(g_{\xi})\) in an intervall.
  % By splitting the interval in disjoint strips of length \(\delta
  % \leq \delta(\varepsilon)\) we get:
  % \[
  %   \underbrace{\sum_{k \in \kappa : \xi_k < T} \varphi(\xi_k, v_k)
  %     \frac{\bar{h}_k}{2}}_{\circled{1}} =
  %   \underbrace{\sum_{l \in \mathbb{Z}:
  %       \left[\delta l, \delta (l + 1)\right] \subseteq
  %       \left[C_{g_{\xi}}^l, C_{g_{\xi}}^u \wedge T
  %       \right]}}_{\coloneqq \, l \in I_{\delta}} \left( \, \sum_{\substack{k \in \kappa\\
  %       \xi_k \in \left[\delta l, \delta (l + 1)\right]}}
  %     \varphi\left(\xi_k, v_k\right)\frac{\bar{h}_k}{2} \right)
  % \]
  % Using (\ref{eq:psi_stet}) we can approximate $\circled{1}$ by
  % \begin{align*}
  %   \circled{1} & \approx \sum_{l \in I_{\delta}} \left( \, \sum_{\substack{k \in \kappa\\
  %       \xi_k \in \left[\delta l, \delta (l + 1)\right]}}
  %     \left(\varphi\left(l\delta, v_k\right)\frac{1}{g_{\xi}(l\delta)}
  %       \pm \varepsilon\right)\frac{1}{n} \underbrace{\frac{\abs{\left\{m \in
  %           \kappa : \xi_m \in [\delta l, \delta(l + 1)]\right\}}}{\abs{\left\{m \in
  %           \kappa : \xi_m \in [\delta l, \delta(l + 1)]\right\}}}}_{=
  %       1}\right) \\
  %   % \intertext{}
  %   &= \sum_{l \in I_{\delta}} \left( \frac{ \sum_{ \substack{k \in \kappa\\
  %         \xi_k \in \left[\delta l, \delta (l + 1)\right]}}
  %       \varphi\left(l\delta, v_k\right)}
  %     {\abs{\left\{m \in
  %         \kappa : \xi_m \in [\delta l, \delta(l + 1)]\right\}}}\frac{\abs{\left\{m \in
  %         \kappa : \xi_m \in [\delta l, \delta(l +
  %         1)]\right\}}}{ng_{\xi}(l\delta)}\right) \pm \varepsilon .\\
  %   \intertext{We use the mean to approximate the number of kinks in
  %     each $\delta$-strip, as it follows a bonomial distribution this
  %     amounts to
  %     \[
  %       \mathbb{E}\left[\abs{\left\{m \in \kappa : \xi_m \in [\delta l,
  %           \delta(l + 1)]\right\}}\right] = n \int_{[\delta l, \delta (l +
  %         1)]} g_{\xi}(x)dx \approx n (\delta g_{\xi}(l\delta) \pm
  %       \tilde{\varepsilon}).
  %     \]
  %     Bla Bla Bla $v_k$}
  %   \circled{1} & \approx
  % \end{align*}
\end{Lemma}

%%% Local Variables:
%%% mode: latex
%%% TeX-master: "main"
%%% End: