Anfang des roten Faden in theo 3.8

This commit is contained in:
Tobias Arndt 2020-05-08 18:17:48 +02:00
parent 5a33ed3c8e
commit 74113d5060
4 changed files with 268 additions and 39 deletions

1
.gitignore vendored
View File

@ -9,6 +9,7 @@ main-blx.bib
# emacs autosaves
*.tex~
*#*.tex*
# no pdfs
*.pdf

View File

@ -147,7 +147,7 @@ except for the input layer, which recieves the components of the input.
1.5mm] (i_4) at (0, -1.25) {};
\node [align=left, left] at (-0.125, -1.25) {\(i_m\)};
\draw[decoration={calligraphic brace,amplitude=5pt, mirror}, decorate, line width=1.25pt]
(-0.6,2.7) -- (-0.6,-1.45) node [black, midway, xshift=-0.6cm, left] {Input};
(-0.6,2.7) -- (-0.6,-1.45) node [black, midway, xshift=-0.6cm, left] {Inputs};
\node [align = center, above] at (1.25, 3) {Synaptic\\weights};
\node [every neuron] (w_1) at (1.25, 2.5) {\(w_{k, 1}\)};

View File

@ -27,6 +27,7 @@
\usepackage{dsfont}
\usepackage{tikz}
\usepackage{nicefrac}
\usepackage{enumitem}
\usetikzlibrary{matrix,chains,positioning,decorations.pathreplacing,arrows}
\usetikzlibrary{positioning,calc,calligraphy}
@ -59,14 +60,19 @@
\newtheorem{Lemma}[Theorem]{Lemma}
\newtheorem{Algorithm}[Theorem]{Algorithm}
\newtheorem{Example}[Theorem]{Example}
\newtheorem{Assumption}[Theorem]{Assumption}
\DeclareMathOperator*{\plim}{\mathbb{P}\text{-}\lim}
\DeclareMathOperator{\supp}{supp}
\DeclareMathOperator*{\argmin}{arg\,min}
\begin{document}
\newcommand{\plimn}[0]{\plim\limits_{n \to \infty}}
\newcommand{\norm}[1]{\left\lVert#1\right\rVert}
\newcommand*\circled[1]{\tikz[baseline=(char.base)]{
\node[shape=circle,draw,inner sep=2pt] (char) {#1};}}
\newcommand{\abs}[1]{\ensuremath{\left\vert#1\right\vert}}

View File

@ -5,8 +5,153 @@
%%% TeX-master: "main"
%%% End:
With the following Theorem we will have an explicit desrctiption for the
limes of RN as the amount of nodes is increased.
In this section we will analyze the connection of shallow Neural
Networks and regression splines. We will see that the punishment of
weight size in training the shallow Neural Network will result in a
function that minimizes the second derivative as the amount of hidden
nodes is grown to infinity. In order to properly formulate this relation we will
first need to introduce some definitions.
\begin{Definition}[Ridge penalized Neural Network]
Let $\mathcal{RN}_{w, \omega}$ be a randomized shallow neural
network, as introduced in ???. Then the optimal ridge penalized
network is given by
\[
\mathcal{RN}^{*, \tilde{\lambda}}_{\omega}(x) \coloneqq
\mathcal{RN}_{w^{*, \tilde{\lambda}}(\omega), \omega}(x)
\]
with
\[
w^{*,\tilde{\lambda}}(\omega) :\in \argmin_{w \in
\mathbb{R}^n} \underbrace{ \left\{\overbrace{\sum_{i = 1}^N \left(\mathcal{RN}_{w,
\omega}(x_i^{\text{train}}) -
y_i^{\text{train}}\right)^2}^{L(\mathcal{RN}_{w, \omega})} +
\tilde{\lambda} \norm{w}_2^2\right\}}_{\eqqcolon F_n^{\tilde{\lambda}}(\mathcal{RN}_{w,\omega})}.
\]
\label{def:rpnn}
\end{Definition}
In the ridge penalized Neural Network large weights are penalized, the
extent of which can be tuned with the parameter $\tilde{\lambda}$. If
$n$ is larger than the amount of training samples $N$ then for
$\tilde{\lambda} \to 0$ the network will interpolate the data while
having minimal weights, resulting in the \textit{minimum norm
network} $\mathcal{RN}_{w^{\text{min}}, \omega}$.
\[
\mathcal{RN}_{w^{\text{min}}, \omega} \text{ randomized shallow
Neural network with weights } w^{\text{min}}:
\]
\[
w^{\text{min}} \in \argmin_{w \in \mathbb{R}^n} \norm{w}, \text{
s.t. }
\mathcal{RN}_{w,\omega}(x_i^{train}) = y_i^{train}, \, \forall i \in
\left\{1,\dots,N\right\}.
\]
For $\tilde{\lambda} \to \infty$ the learned
function will resemble the data less and less with the weights
approaching $0$. Usually $\tilde{\lambda}$ lies between 0 and 1, as
for larger values the focus on weight reduction is larger than that on fitting
the data.\par
In order to make the notation more convenient, in the following the
$\omega$ used to express the realised random parameters will no longer
be explicitly mentioned.
\begin{Definition}
\label{def:kink}
Let $\mathcal{RN}_w$ be a randomized shallow Neural
Network according to Definition~\ref{def:rsnn}, then kinks depending on the random parameters can
be observed.
\[
\mathcal{RN}_w(x) = \sum_{k = 1}^n w_k \gamma(b_k + v_kx)
\]
Because we specified $\gamma(y) \coloneqq \max\left\{0, y\right\}$ a
kink in $\gamma$ can be observed at $\gamma(0) = 0$. As $b_k + v_kx = 0$ for $x
= -\frac{b_k}{v_k}$ we define the following:
\begin{enumerate}[label=(\alph*)]
\item Let $\xi_k \coloneqq -\frac{b_k}{v_k}$ be the k-th kink of $\mathcal{RN}_w$.
\item Let $g_{\xi}(\xi_k)$ be the density of the kinks $\xi_k =
- \frac{b_k}{v_k}$ in accordance to the distributions of $b_k$ and
$v_k$.
\item Let $h_{k,n} \coloneqq \frac{1}{n g_{\xi}(\xi_k)}$ be the
average estimated distance from kink $\xi_k$ to the next nearest
one.
\end{enumerate}
\end{Definition}
In order to later prove the connection between randomised shallow
Neural Networks and regression splines, we first take a look at a
smooth approximation of the RSNN.
\begin{Definition}[Smooth Approximation of Randomized Shallow Neural
Network]
\label{def:srsnn}
Let $\mathcal{RN}_{w}$ be a randomized shallow Neural Network according to
Definition~\ref{def:rsnn} with weights $w$ and kinks $\xi_k$ with
corresponding kink density $g_{\xi}$ as given by
Definition~\ref{def:kink}.
In order to smooth the RSNN consider the following kernel for every $x$:
\[
\kappa_x(s) \coloneqq \mathds{1}_{\left\{\abs{s} \leq \frac{1}{2 \sqrt{n}
g_{\xi}(x)}\right\}}(s)\sqrt{n} g_{\xi}(x), \, \forall s \in \mathbb{R}
\]
Using this kernel we define a smooth approximation of
$\mathcal{RN}_w$ by
\[
f^w(x) \coloneqq \int_{\mathds{R}} \mathcal{RN}_w(x-s) \kappa_x(s) ds.
\]
\end{Definition}
Note that the kernel introduced in Definition~\ref{def:srsnn}
satisfies $\int_{\mathbb{R}}\kappa_x(s) ds = 1$. While $f^w$ looks highly
similar to a convolution, it differs slightly as the kernel $\kappa_x(s)$
is dependent on $x$. Therefore only $f^w(x) = (\mathcal{RN}_w *
\kappa_x)(x)$ is well defined, while $\mathcal{RN}_w * \kappa$ is not.
Now we take a look at weighted regression splines. Later we will prove
that the ridge penalized neural network as defined in
Definition~\ref{def:rpnn} converges to a weighted regression spline, as
the amount of hidden nodes is grown to infinity.
\begin{Definition}[Weighted regression spline]
Let $x_i^{train}, y_i^{train} \in \mathbb{R}, i \in
\left\{1,\dots,N\right\}$ be training data. For a given $\lambda \in \mathbb{R}_{>0}$
and a function $g: \mathbb{R} \to \mathbb{R}_{>0}$ the weighted
regression spline $f^{*, \lambda}_g$ is given by
\[
f^{*, \lambda}_g :\in \argmin_{\substack{f \in \mathcal{C}^2(\mathbb{R})
\\ \supp(f) \subseteq \supp(g)}} \underbrace{\left\{ \overbrace{\sum_{i =
1}^N \left(f(x_i^{train}) - y_i^{train}\right)^2}^{L(f)} +
\lambda g(0) \int_{\supp(g)}\frac{\left(f''(x)\right)^2}{g(x)}
dx\right\}}_{\eqqcolon F^{\lambda, g}(f)}.
\]
\end{Definition}
Similarly to ridge weight penalized neural networks the parameter
$\lambda$ controls a trade-off between accuracy on the training data
and smoothness or low second derivative. For $g \equiv 1$ and $\lambda \to 0$ the
resulting function $f^{*, 0+}$ will interpolate the training data while minimizing
the second derivative. Such a function is known as smooth spline
interpolation or (cubic) smoothing spline.
\[
f^{*, 0+} \text{ smooth spline interpolation: }
\]
\[
f^{*, 0+} \coloneqq \lim_{\lambda \to 0+} f^{*, \lambda}_1 \in
\argmin_{\substack{f \in \mathcal{C}^2(\mathbb{R}), \\ f(x_i^{train}) =
y_i^{train}}} \left( \int _{\mathbb{R}} (f''(x))^2dx\right).
\]
\begin{Assumption}~
\begin{enumerate}[label=(\alph*)]
\item The probability density function of the kinks $\xi_k$, namely $g_\xi$,
has compact support $\supp(g_{\xi})$.
\item The density $g_{\xi}$ is uniformly continuous on $\supp(g_{\xi})$.
\item $g_{\xi}(0) \neq 0$.
\end{enumerate}
\end{Assumption}
\begin{Theorem}[Ridge weight penalty corresponds to adapted spline]
\label{theo:main1}
@ -28,7 +173,7 @@ limes of RN as the amount of nodes is increased.
as defined in ??? and ??? respectively.
\end{Theorem}
In order to prove Theorem~\ref{theo:main1} we need to prove a number of
auxilary Lemmata first.
auxiliary Lemmata first.
\begin{Definition}[Sobolev Norm]
\label{def:sobonorm}
@ -50,49 +195,126 @@ auxilary Lemmata first.
Let \(f:\mathbb{R} \to \mathbb{R}\) be differentiable with \(f' :
\mathbb{R} \to \mathbb{R}\) Lebesgue integrable. Then for \(K=[a,b]
\subset \mathbb{R}\) with \(f(a)=0\) it holds that
\begin{equation}
\begin{equation*}
\label{eq:pti1}
\exists C_K^{\infty} \in \mathbb{R}_{>0} :
\norm{f}_{W^{1,\infty}(K)} \leq C_K^{\infty}
\norm{f'}_{L^{\infty}(K)}.
\end{equation}
\end{equation*}
If additionally \(f'\) is differentiable with \(f'': \mathbb{R} \to
\mathbb{R}\) Lebesgue integrable then
\begin{equation}
\begin{equation*}
\label{eq:pti2}
\exists C_K^2 \in \mathbb{R}_{>0} : \norm{f}_{W^{1,\infty}(K)} \leq
C_K^2 \norm{f''}_{L^2(K)}.
\end{equation}
\proof
With the fundamental theorem of calculus, if
\(\norm{f}_{L^{\infty}(K)}<\infty\) we get
\begin{equation}
\label{eq:f_f'}
\norm{f}_{L^{\infty}(K)} = \sup_{x \in K}\abs{\int_a^x f'(s) ds} \leq
\sup_{x \in K}\abs{\int_a^x \sup_{y \in K} \abs{f'(y)} ds} \leq \abs{b-a}
\sup_{y \in K}\abs{f'(y)}.
\end{equation}
Using this we can bound \(\norm{f}_{w^{1,\infty}(K)}\) by
\end{equation*}
% \proof
% With the fundamental theorem of calculus, if
% \(\norm{f}_{L^{\infty}(K)}<\infty\) we get
% \begin{equation}
% \label{eq:f_f'}
% \norm{f}_{L^{\infty}(K)} = \sup_{x \in K}\abs{\int_a^x f'(s) ds} \leq
% \sup_{x \in K}\abs{\int_a^x \sup_{y \in K} \abs{f'(y)} ds} \leq \abs{b-a}
% \sup_{y \in K}\abs{f'(y)}.
% \end{equation}
% Using this we can bound \(\norm{f}_{w^{1,\infty}(K)}\) by
% \[
% \norm{f}_{w^{1,\infty}(K)} \stackrel{\text{Def~\ref{def:sobonorm}}}{=}
% \max\left\{\norm{f}_{L^{\infty}(K)},
% \norm{f'}_{L^{\infty}(K)}\right\}
% \stackrel{(\ref{eq:f_f'})}{\leq} max\left\{\abs{b-a},
% 1\right\}\norm{f'}_{L^{\infty}(K)}.
% \]
% With \(C_k^{\infty} \coloneqq max\left\{\abs{b-a}, 1\right\}\) we
% get (\ref{eq:pti1}).
% By using the Hölder inequality, we can proof the second claim.
% \begin{align*}
% \norm{f'}_{L^{\infty}(K)} &= \sup_{x \in K} \abs{\int_a^bf''(y)
% \mathds{1}_{[a,x]}(y)dy} \leq \sup_{x \in
% K}\norm{f''\mathds{1}_{[a,x]}}_{L^1(K)}\\
% &\hspace{-6pt} \stackrel{\text{Hölder}}{\leq} sup_{x
% \in
% K}\norm{f''}_{L^2(K)}\norm{\mathds{1}_{[a,x]}}_{L^2(K)}
% = \abs{b-a}\norm{f''}_{L^2(K)}.
% \end{align*}
% Thus (\ref{eq:pti2}) follows with \(C_K^2 \coloneqq
% \abs{b-a}C_K^{\infty}\).
% \qed
\end{Lemma}
\begin{Lemma}
Let $\mathcal{RN}$ be a shallow Neural network. For \(\varphi :
\mathbb{R}^2 \to \mathbb{R}\) uniformly continuous such that
\[
\norm{f}_{w^{1,\infty}(K)} \stackrel{\text{Def~\ref{def:sobonorm}}}{=}
\max\left\{\norm{f}_{L^{\infty}(K)},
\norm{f'}_{L^{\infty}(K)}\right\}
\stackrel{(\ref{eq:f_f'})}{\leq} max\left\{\abs{b-a},
1\right\}\norm{f'}_{L^{\infty}(K)}.
\forall x \in \supp(g_{\xi}) : \mathbb{E}\left[\varphi(\xi, v)
\frac{1}{n g_{\xi}(\xi)} \,\middle\vert\, \xi = x \right] < \infty,
\]
With \(C_k^{\infty} \coloneqq max\left\{\abs{b-a}, 1\right\}\) we
get (\ref{eq:pti1}).
By using the Hölder inequality, we can proof the second claim.
\begin{align*}
\norm{f'}_{L^{\infty}(K)} &= \sup_{x \in K} \abs{\int_a^bf''(y)
\mathds{1}_{[a,x]}(y)dy} \leq \sup_{x \in
K}\norm{f''\mathds{1}_{[a,x]}}_{L^1(K)}\\
&\hspace{-6pt} \stackrel{\text{Hölder}}{\leq} sup_{x
\in
K}\norm{f''}_{L^2(K)}\norm{\mathds{1}_{[a,x]}}_{L^2(K)}
= \abs{b-a}\norm{f''}_{L^2(K)}.
\end{align*}
Thus (\ref{eq:pti2}) follows with \(C_K^2 \coloneqq
\abs{b-a}C_K^{\infty}\).
\qed
\end{Lemma}
it holds, that
\[
\plimn \sum_{k \in \kappa : \xi_k < T} \varphi(\xi_k, v_k)
\frac{\bar{h}_k}{2}
=\int_{\min\left\{C_{g_{\xi}}^l,T\right\}}^{\min\left\{C_{g_{\xi}}^u,T\right\}}
\mathbb{E}\left[\varphi(\xi, v) \,\middle\vert\, \xi = x \right] dx
\]
uniformly in \(T \in K\).
% \proof
% For \(T \leq C_{g_{\xi}}^l\) both sides equal 0, so it is sufficient to
% consider \(T > C_{g_{\xi}}^l\). With \(\varphi\) and
% \(\nicefrac{1}{g_{\xi}}\) uniformly continuous in \(\xi\),
% \begin{equation}
% \label{eq:psi_stet}
% \forall \varepsilon > 0 : \exists \delta(\varepsilon) : \forall
% \abs{\xi - \xi'} < \delta(\varepsilon) : \abs{\varphi(\xi, v)
% \frac{1}{g_{\xi}(\xi)} - \varphi(\xi', v)
% \frac{1}{g_{\xi}(\xi')}} < \varepsilon
% \end{equation}
% uniformly in \(v\). In order to
% save space we use the notation \((a \wedge b) \coloneqq \min\{a,b\}\) for $a$ and $b
% \in \mathbb{R}$. W.l.o.g. assume \(\supp(g_{\xi})\) is an
% interval. By splitting the interval in disjoint strips of length \(\delta
% \leq \delta(\varepsilon)\) we get:
% \[
% \underbrace{\sum_{k \in \kappa : \xi_k < T} \varphi(\xi_k, v_k)
% \frac{\bar{h}_k}{2}}_{\circled{1}} =
% \underbrace{\sum_{l \in \mathbb{Z}:
% \left[\delta l, \delta (l + 1)\right] \subseteq
% \left[C_{g_{\xi}}^l, C_{g_{\xi}}^u \wedge T
% \right]}}_{\coloneqq \, l \in I_{\delta}} \left( \, \sum_{\substack{k \in \kappa\\
% \xi_k \in \left[\delta l, \delta (l + 1)\right]}}
% \varphi\left(\xi_k, v_k\right)\frac{\bar{h}_k}{2} \right)
% \]
% Using (\ref{eq:psi_stet}) we can approximate $\circled{1}$ by
% \begin{align*}
% \circled{1} & \approx \sum_{l \in I_{\delta}} \left( \, \sum_{\substack{k \in \kappa\\
% \xi_k \in \left[\delta l, \delta (l + 1)\right]}}
% \left(\varphi\left(l\delta, v_k\right)\frac{1}{g_{\xi}(l\delta)}
% \pm \varepsilon\right)\frac{1}{n} \underbrace{\frac{\abs{\left\{m \in
% \kappa : \xi_m \in [\delta l, \delta(l + 1)]\right\}}}{\abs{\left\{m \in
% \kappa : \xi_m \in [\delta l, \delta(l + 1)]\right\}}}}_{=
% 1}\right) \\
% % \intertext{}
% &= \sum_{l \in I_{\delta}} \left( \frac{ \sum_{ \substack{k \in \kappa\\
% \xi_k \in \left[\delta l, \delta (l + 1)\right]}}
% \varphi\left(l\delta, v_k\right)}
% {\abs{\left\{m \in
% \kappa : \xi_m \in [\delta l, \delta(l + 1)]\right\}}}\frac{\abs{\left\{m \in
% \kappa : \xi_m \in [\delta l, \delta(l +
% 1)]\right\}}}{ng_{\xi}(l\delta)}\right) \pm \varepsilon .\\
% \intertext{We use the mean to approximate the number of kinks in
% each $\delta$-strip, as it follows a binomial distribution this
% amounts to
% \[
% \mathbb{E}\left[\abs{\left\{m \in \kappa : \xi_m \in [\delta l,
% \delta(l + 1)]\right\}}\right] = n \int_{[\delta l, \delta (l +
% 1)]} g_{\xi}(x)dx \approx n (\delta g_{\xi}(l\delta) \pm
% \tilde{\varepsilon}).
% \]
% Bla Bla Bla $v_k$}
% \circled{1} & \approx
% \end{align*}
\end{Lemma}
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "main"
%%% End: