Start of the central thread in theo 3.8
This commit is contained in:
parent
5a33ed3c8e
commit
74113d5060
.gitignore (vendored)
@@ -9,6 +9,7 @@ main-blx.bib

# emacs autosaves
*.tex~
*#*.tex*

# no pdfs
*.pdf
@@ -147,7 +147,7 @@ except for the input layer, which receives the components of the input.
1.5mm] (i_4) at (0, -1.25) {};
\node [align=left, left] at (-0.125, -1.25) {\(i_m\)};
\draw[decoration={calligraphic brace,amplitude=5pt, mirror}, decorate, line width=1.25pt]
(-0.6,2.7) -- (-0.6,-1.45) node [black, midway, xshift=-0.6cm, left] {Inputs};

\node [align = center, above] at (1.25, 3) {Synaptic\\weights};
\node [every neuron] (w_1) at (1.25, 2.5) {\(w_{k, 1}\)};
@@ -27,6 +27,7 @@
\usepackage{dsfont}
\usepackage{tikz}
\usepackage{nicefrac}
\usepackage{enumitem}

\usetikzlibrary{matrix,chains,positioning,decorations.pathreplacing,arrows}
\usetikzlibrary{positioning,calc,calligraphy}
@@ -59,14 +60,19 @@
\newtheorem{Lemma}[Theorem]{Lemma}
\newtheorem{Algorithm}[Theorem]{Algorithm}
\newtheorem{Example}[Theorem]{Example}
\newtheorem{Assumption}[Theorem]{Assumption}


\DeclareMathOperator*{\plim}{\mathbb{P}\text{-}\lim}
\DeclareMathOperator{\supp}{supp}
\DeclareMathOperator*{\argmin}{arg\,min}
\begin{document}


\newcommand{\plimn}[0]{\plim\limits_{n \to \infty}}
\newcommand{\norm}[1]{\left\lVert#1\right\rVert}
\newcommand*\circled[1]{\tikz[baseline=(char.base)]{
\node[shape=circle,draw,inner sep=2pt] (char) {#1};}}


\newcommand{\abs}[1]{\ensuremath{\left\vert#1\right\vert}}
TeX/theo_3_8.tex
@@ -5,8 +5,153 @@
%%% TeX-master: "main"
%%% End:

With the following theorem we will have an explicit description of the
limit of the RN as the number of nodes is increased.
In this section we will analyze the connection between shallow Neural
Networks and regression splines. We will see that penalizing the
weight size in training the shallow Neural Network will result in a
function that minimizes the second derivative as the number of hidden
nodes grows to infinity. In order to properly formulate this relation we will
first need to introduce some definitions.

\begin{Definition}[Ridge penalized Neural Network]
  \label{def:rpnn}
  Let $\mathcal{RN}_{w, \omega}$ be a randomized shallow neural
  network, as introduced in ???. Then the optimal ridge penalized
  network is given by
  \[
    \mathcal{RN}^{*, \tilde{\lambda}}_{\omega}(x) \coloneqq
    \mathcal{RN}_{w^{*, \tilde{\lambda}}(\omega), \omega}(x)
  \]
  with
  \[
    w^{*,\tilde{\lambda}}(\omega) :\in \argmin_{w \in
      \mathbb{R}^n} \underbrace{\left\{\overbrace{\sum_{i = 1}^N \left(\mathcal{RN}_{w,
            \omega}(x_i^{\text{train}}) -
          y_i^{\text{train}}\right)^2}^{L(\mathcal{RN}_{w, \omega})} +
      \tilde{\lambda} \norm{w}_2^2\right\}}_{\eqqcolon F_n^{\tilde{\lambda}}(\mathcal{RN}_{w,\omega})}.
  \]
\end{Definition}
In the ridge penalized Neural Network large weights are penalized; the
extent of this penalty can be tuned with the parameter $\tilde{\lambda}$. If
$n$ is larger than the number of training samples $N$, then for
$\tilde{\lambda} \to 0$ the network will interpolate the data while
having minimal weights, resulting in the \textit{minimum norm
  network} $\mathcal{RN}_{w^{\text{min}}, \omega}$.
\[
  \mathcal{RN}_{w^{\text{min}}, \omega} \text{ randomized shallow
    Neural Network with weights } w^{\text{min}}:
\]
\[
  w^{\text{min}} \in \argmin_{w \in \mathbb{R}^n} \norm{w}, \text{
    s.t. }
  \mathcal{RN}_{w,\omega}(x_i^{\text{train}}) = y_i^{\text{train}}, \, \forall i \in
  \left\{1,\dots,N\right\}.
\]
For $\tilde{\lambda} \to \infty$ the learned
function will resemble the data less and less, with the weights
approaching $0$. Usually $\tilde{\lambda}$ lies between 0 and 1, as
for larger values the focus on weight reduction outweighs fitting
the data.\par
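As a minimal illustration of this trade-off (the setting $n = N = 1$
and the shorthand $a$ are chosen here purely for exposition and are
not part of the preceding definitions), write
$a \coloneqq \gamma(b_1 + v_1 x_1^{\text{train}})$. Then
\[
  F_1^{\tilde{\lambda}}(\mathcal{RN}_{w, \omega}) =
  \left(w_1 a - y_1^{\text{train}}\right)^2 + \tilde{\lambda} w_1^2
\]
is minimized by $w_1^{*, \tilde{\lambda}} = \frac{a\, y_1^{\text{train}}}{a^2 + \tilde{\lambda}}$,
so for $a \neq 0$ the limit $\tilde{\lambda} \to 0$ recovers the
interpolating weight $\nicefrac{y_1^{\text{train}}}{a}$, while
$\tilde{\lambda} \to \infty$ drives the weight to $0$.\par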
In order to make the notation more convenient, in the following the
$\omega$ used to express the realised random parameters will no longer
be explicitly mentioned.
\begin{Definition}
  \label{def:kink}
  Let $\mathcal{RN}_w$ be a randomized shallow Neural
  Network according to Definition~\ref{def:rsnn}. Then kinks depending on the random parameters can
  be observed:
  \[
    \mathcal{RN}_w(x) = \sum_{k = 1}^n w_k \gamma(b_k + v_kx).
  \]
  Because we specified $\gamma(y) \coloneqq \max\left\{0, y\right\}$, a
  kink in $\gamma$ can be observed at $\gamma(0) = 0$. As $b_k + v_kx = 0$ for $x
  = -\frac{b_k}{v_k}$, we define the following:
  \begin{enumerate}[label=(\alph*)]
  \item Let $\xi_k \coloneqq -\frac{b_k}{v_k}$ be the $k$-th kink of $\mathcal{RN}_w$.
  \item Let $g_{\xi}(\xi_k)$ be the density of the kinks $\xi_k =
    - \frac{b_k}{v_k}$ in accordance with the distributions of $b_k$ and
    $v_k$.
  \item Let $h_{k,n} \coloneqq \frac{1}{n g_{\xi}(\xi_k)}$ be the
    average estimated distance from kink $\xi_k$ to the nearest
    neighbouring kink.
  \end{enumerate}
\end{Definition}
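As a small illustration of Definition~\ref{def:kink} (the concrete
values below are chosen arbitrarily and are not part of the
definition): for $n = 2$ with $w = (1, -1)$, $b = (1, 1)$ and
$v = (1, 2)$ we get
\[
  \mathcal{RN}_w(x) = \max\left\{0, 1 + x\right\} - \max\left\{0, 1 + 2x\right\},
\]
a piecewise linear function with kinks at $\xi_1 = -1$ and
$\xi_2 = -\nicefrac{1}{2}$. If, for instance, the kinks were
distributed with the constant density $g_{\xi} \equiv \nicefrac{1}{2}$
on $[-1, 1]$, then $h_{k,n} = \nicefrac{2}{n}$, which matches the
expected spacing of $n$ kinks spread over an interval of length $2$.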

In order to later prove the connection between randomised shallow
Neural Networks and regression splines, we first take a look at a
smooth approximation of the RSNN.

\begin{Definition}[Smooth Approximation of Randomized Shallow Neural
  Network]
  \label{def:srsnn}
  Let $\mathcal{RN}_{w}$ be a randomized shallow Neural Network according to
  Definition~\ref{def:RSNN} with weights $w$ and kinks $\xi_k$ with
  corresponding kink density $g_{\xi}$ as given by
  Definition~\ref{def:kink}.
  In order to smooth the RSNN consider the following kernel for every $x$:

  \[
    \kappa_x(s) \coloneqq \mathds{1}_{\left\{\abs{s} \leq \frac{1}{2 \sqrt{n}
          g_{\xi}(x)}\right\}}(s)\sqrt{n} g_{\xi}(x), \, \forall s \in \mathbb{R}.
  \]

  Using this kernel we define a smooth approximation of
  $\mathcal{RN}_w$ by

  \[
    f^w(x) \coloneqq \int_{\mathbb{R}} \mathcal{RN}_w(x-s) \kappa_x(s) \, ds.
  \]
\end{Definition}

Note that the kernel introduced in Definition~\ref{def:srsnn}
satisfies $\int_{\mathbb{R}}\kappa_x(s) \, ds = 1$. While $f^w$ looks highly
similar to a convolution, it differs slightly as the kernel $\kappa_x(s)$
depends on $x$. Therefore only $f^w = (\mathcal{RN}_w *
\kappa_x)(x)$ is well defined, while $\mathcal{RN}_w * \kappa$ is not.
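The normalisation can be checked directly from Definition~\ref{def:srsnn},
since $\kappa_x$ is constant on a symmetric window of half-width
$\frac{1}{2 \sqrt{n} g_{\xi}(x)}$ around $0$:
\[
  \int_{\mathbb{R}} \kappa_x(s) \, ds = \sqrt{n} g_{\xi}(x) \cdot
  \frac{1}{\sqrt{n} g_{\xi}(x)} = 1.
\]
In particular, if no kink of $\mathcal{RN}_w$ lies within this window
around $x$, then $\mathcal{RN}_w$ is affine there and the symmetry of
$\kappa_x$ gives $f^w(x) = \mathcal{RN}_w(x)$; the smoothing therefore
only acts locally around the kinks.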

Now we take a look at weighted regression splines. Later we will prove
that the ridge penalized neural network as defined in
Definition~\ref{def:rpnn} converges to a weighted regression spline as
the number of hidden nodes grows to infinity.

\begin{Definition}[Weighted regression spline]
  Let $x_i^{\text{train}}, y_i^{\text{train}} \in \mathbb{R}, i \in
  \left\{1,\dots,N\right\}$ be training data. For a given $\lambda \in \mathbb{R}_{>0}$
  and a function $g: \mathbb{R} \to \mathbb{R}_{>0}$ the weighted
  regression spline $f^{*, \lambda}_g$ is given by

  \[
    f^{*, \lambda}_g :\in \argmin_{\substack{f \in \mathcal{C}^2(\mathbb{R})
        \\ \supp(f) \subseteq \supp(g)}} \underbrace{\left\{ \overbrace{\sum_{i =
            1}^N \left(f(x_i^{\text{train}}) - y_i^{\text{train}}\right)^2}^{L(f)} +
      \lambda g(0) \int_{\supp(g)}\frac{\left(f''(x)\right)^2}{g(x)}
      \, dx\right\}}_{\eqqcolon F^{\lambda, g}(f)}.
  \]
\end{Definition}
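For orientation (this is only the definition restated in a special
case): for the constant weight function $g \equiv 1$ we have
$\supp(g) = \mathbb{R}$ and $g(0) = 1$, so the functional reduces to
the classical smoothing spline objective
\[
  F^{\lambda, 1}(f) = \sum_{i = 1}^N \left(f(x_i^{\text{train}}) -
    y_i^{\text{train}}\right)^2 + \lambda \int_{\mathbb{R}}
  \left(f''(x)\right)^2 \, dx.
\]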

Similarly to ridge weight penalized neural networks, the parameter
$\lambda$ controls a trade-off between accuracy on the training data
and smoothness, i.e. a low second derivative. For $g \equiv 1$ and $\lambda \to 0$ the
resulting function $f^{*, 0+}$ will interpolate the training data while minimizing
the second derivative. Such a function is known as a smooth spline
interpolation or (cubic) smoothing spline.

\[
  f^{*, 0+} \text{ smooth spline interpolation: }
\]
\[
  f^{*, 0+} \coloneqq \lim_{\lambda \to 0+} f^{*, \lambda}_1 \in
  \argmin_{\substack{f \in \mathcal{C}^2(\mathbb{R}), \\ f(x_i^{\text{train}}) =
      y_i^{\text{train}}}} \left( \int_{\mathbb{R}} \left(f''(x)\right)^2 dx\right).
\]

\begin{Assumption}~
  \begin{enumerate}[label=(\alph*)]
  \item The probability density function of the kinks $\xi_k$,
    namely $g_{\xi}$, has compact support on $\supp(g_{\xi})$.
  \item The density $g_{\xi}$ is uniformly continuous on $\supp(g_{\xi})$.
  \item $g_{\xi}(0) \neq 0$.
  \end{enumerate}
\end{Assumption}
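Purely as an illustration (it is not required anywhere in the text),
the constant density $g_{\xi} \equiv \nicefrac{1}{2}$ on $[-1, 1]$
from the example after Definition~\ref{def:kink} fulfils (a)--(c):
its support $[-1, 1]$ is compact, the constant density is uniformly
continuous on it, and $g_{\xi}(0) = \nicefrac{1}{2} \neq 0$.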

\begin{Theorem}[Ridge weight penalty corresponds to adapted spline]
  \label{theo:main1}
@@ -28,7 +173,7 @@ limit of the RN as the number of nodes is increased.
as defined in ??? and ??? respectively.
\end{Theorem}
In order to prove Theorem~\ref{theo:main1} we need to prove a number of
auxiliary Lemmata first.

\begin{Definition}[Sobolev Norm]
  \label{def:sobonorm}
@@ -50,49 +195,126 @@ auxiliary Lemmata first.
Let \(f:\mathbb{R} \to \mathbb{R}\) be differentiable with \(f' :
\mathbb{R} \to \mathbb{R}\) Lebesgue integrable. Then for \(K=[a,b]
\subset \mathbb{R}\) with \(f(a)=0\) it holds that
\begin{equation*}
  \label{eq:pti1}
  \exists C_K^{\infty} \in \mathbb{R}_{>0} :
  \norm{f}_{W^{1,\infty}(K)} \leq C_K^{\infty}
  \norm{f'}_{L^{\infty}(K)}.
\end{equation*}
If additionally \(f'\) is differentiable with \(f'': \mathbb{R} \to
\mathbb{R}\) Lebesgue integrable, then
\begin{equation*}
  \label{eq:pti2}
  \exists C_K^2 \in \mathbb{R}_{>0} : \norm{f}_{W^{1,\infty}(K)} \leq
  C_K^2 \norm{f''}_{L^2(K)}.
\end{equation*}
% \proof
% With the fundamental theorem of calculus, if
% \(\norm{f}_{L^{\infty}(K)}<\infty\) we get
% \begin{equation}
%   \label{eq:f_f'}
%   \norm{f}_{L^{\infty}(K)} = \sup_{x \in K}\abs{\int_a^x f'(s) ds} \leq
%   \sup_{x \in K}\abs{\int_a^x \sup_{y \in K} \abs{f'(y)} ds} \leq \abs{b-a}
%   \sup_{y \in K}\abs{f'(y)}.
% \end{equation}
% Using this we can bound \(\norm{f}_{W^{1,\infty}(K)}\) by
% \[
%   \norm{f}_{W^{1,\infty}(K)} \stackrel{\text{Def~\ref{def:sobonorm}}}{=}
%   \max\left\{\norm{f}_{L^{\infty}(K)},
%     \norm{f'}_{L^{\infty}(K)}\right\}
%   \stackrel{(\ref{eq:f_f'})}{\leq} \max\left\{\abs{b-a},
%     1\right\}\norm{f'}_{L^{\infty}(K)}.
% \]
% With \(C_K^{\infty} \coloneqq \max\left\{\abs{b-a}, 1\right\}\) we
% get (\ref{eq:pti1}).
% By using the Hölder inequality, we can prove the second claim.
% \begin{align*}
%   \norm{f'}_{L^{\infty}(K)} &= \sup_{x \in K} \abs{\int_a^b f''(y)
%     \mathds{1}_{[a,x]}(y)dy} \leq \sup_{x \in
%     K}\norm{f''\mathds{1}_{[a,x]}}_{L^1(K)}\\
%   &\hspace{-6pt} \stackrel{\text{Hölder}}{\leq} \sup_{x
%     \in
%     K}\norm{f''}_{L^2(K)}\norm{\mathds{1}_{[a,x]}}_{L^2(K)}
%   = \abs{b-a}\norm{f''}_{L^2(K)}.
% \end{align*}
% Thus (\ref{eq:pti2}) follows with \(C_K^2 \coloneqq
%   \abs{b-a}C_K^{\infty}\).
% \qed
\end{Lemma}
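As a quick sanity check of these bounds (this example is not part of
the text and only illustrates the statement): on $K = [0,1]$ take
$f(x) = x(1-x)$, so that $f(0) = 0$,
$\norm{f}_{L^{\infty}(K)} = \nicefrac{1}{4}$,
$\norm{f'}_{L^{\infty}(K)} = 1$ and $\norm{f''}_{L^2(K)} = 2$. Then
$\norm{f}_{W^{1,\infty}(K)} = 1$, and both inequalities hold, e.g.
with $C_K^{\infty} = 1$ and $C_K^2 = 1$.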

\begin{Lemma}
  Let $\mathcal{RN}$ be a shallow Neural Network. For \(\varphi :
  \mathbb{R}^2 \to \mathbb{R}\) uniformly continuous such that
  \[
    \forall x \in \supp(g_{\xi}) : \mathbb{E}\left[\varphi(\xi, v)
      \frac{1}{n g_{\xi}(\xi)} \vert \xi = x \right] < \infty,
  \]
  it holds that
  \[
    \plimn \sum_{k \in \kappa : \xi_k < T} \varphi(\xi_k, v_k)
    \frac{\bar{h}_k}{2}
    = \int_{\max\left\{C_{g_{\xi}}^l,T\right\}}^{\min\left\{C_{g_{\xi}}^u,T\right\}}
    \mathbb{E}\left[\varphi(\xi, v) \vert \xi = x \right] dx
  \]
  uniformly in \(T \in K\).
% \proof
% For \(T \leq C_{g_{\xi}}^l\) both sides equal 0, so it is sufficient to
% consider \(T > C_{g_{\xi}}^l\). With \(\varphi\) and
% \(\nicefrac{1}{g_{\xi}}\) uniformly continuous in \(\xi\),
% \begin{equation}
%   \label{eq:psi_stet}
%   \forall \varepsilon > 0 : \exists \delta(\varepsilon) : \forall
%   \abs{\xi - \xi'} < \delta(\varepsilon) : \abs{\varphi(\xi, v)
%     \frac{1}{g_{\xi}(\xi)} - \varphi(\xi', v)
%     \frac{1}{g_{\xi}(\xi')}} < \varepsilon
% \end{equation}
% uniformly in \(v\). In order to
% save space we use the notation \((a \wedge b) \coloneqq \min\{a,b\}\) for $a$ and $b
% \in \mathbb{R}$. W.l.o.g. assume \(\supp(g_{\xi})\) is an
% interval. By splitting the interval in disjoint strips of length \(\delta
% \leq \delta(\varepsilon)\) we get:

% \[
%   \underbrace{\sum_{k \in \kappa : \xi_k < T} \varphi(\xi_k, v_k)
%     \frac{\bar{h}_k}{2}}_{\circled{1}} =
%   \underbrace{\sum_{l \in \mathbb{Z}:
%       \left[\delta l, \delta (l + 1)\right] \subseteq
%       \left[C_{g_{\xi}}^l, C_{g_{\xi}}^u \wedge T
%       \right]}}_{\coloneqq \, l \in I_{\delta}} \left( \, \sum_{\substack{k \in \kappa\\
%       \xi_k \in \left[\delta l, \delta (l + 1)\right]}}
%   \varphi\left(\xi_k, v_k\right)\frac{\bar{h}_k}{2} \right)
% \]
% Using (\ref{eq:psi_stet}) we can approximate $\circled{1}$ by
% \begin{align*}
%   \circled{1} & \approx \sum_{l \in I_{\delta}} \left( \, \sum_{\substack{k \in \kappa\\
%       \xi_k \in \left[\delta l, \delta (l + 1)\right]}}
%   \left(\varphi\left(l\delta, v_k\right)\frac{1}{g_{\xi}(l\delta)}
%     \pm \varepsilon\right)\frac{1}{n} \underbrace{\frac{\abs{\left\{m \in
%           \kappa : \xi_m \in [\delta l, \delta(l + 1)]\right\}}}{\abs{\left\{m \in
%           \kappa : \xi_m \in [\delta l, \delta(l + 1)]\right\}}}}_{=
%     1}\right) \\
%   % \intertext{}
%   &= \sum_{l \in I_{\delta}} \left( \frac{ \sum_{ \substack{k \in \kappa\\
%         \xi_k \in \left[\delta l, \delta (l + 1)\right]}}
%     \varphi\left(l\delta, v_k\right)}
%   {\abs{\left\{m \in
%         \kappa : \xi_m \in [\delta l, \delta(l + 1)]\right\}}}\frac{\abs{\left\{m \in
%         \kappa : \xi_m \in [\delta l, \delta(l +
%         1)]\right\}}}{ng_{\xi}(l\delta)}\right) \pm \varepsilon .\\
%   \intertext{We use the mean to approximate the number of kinks in
%     each $\delta$-strip; as it follows a binomial distribution this
%     amounts to
%     \[
%       \mathbb{E}\left[\abs{\left\{m \in \kappa : \xi_m \in [\delta l,
%             \delta(l + 1)]\right\}}\right] = n \int_{[\delta l, \delta (l +
%           1)]} g_{\xi}(x)dx \approx n (\delta g_{\xi}(l\delta) \pm
%       \tilde{\varepsilon}).
%     \]
%     Bla Bla Bla $v_k$}
%   \circled{1} & \approx
% \end{align*}
\end{Lemma}
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "main"
%%% End: