Start of the central thread in theo 3.8
This commit is contained in:
parent
5a33ed3c8e
commit
74113d5060
.gitignore (vendored)
@@ -9,6 +9,7 @@ main-blx.bib

# emacs autosaves
*.tex~
*#*.tex*

# no pdfs
*.pdf
@@ -147,7 +147,7 @@ except for the input layer, which receives the components of the input.
1.5mm] (i_4) at (0, -1.25) {};
\node [align=left, left] at (-0.125, -1.25) {\(i_m\)};
\draw[decoration={calligraphic brace,amplitude=5pt, mirror}, decorate, line width=1.25pt]
(-0.6,2.7) -- (-0.6,-1.45) node [black, midway, xshift=-0.6cm, left] {Inputs};

\node [align = center, above] at (1.25, 3) {Synaptic\\weights};
\node [every neuron] (w_1) at (1.25, 2.5) {\(w_{k, 1}\)};
@@ -27,6 +27,7 @@
\usepackage{dsfont}
\usepackage{tikz}
\usepackage{nicefrac}
\usepackage{enumitem}

\usetikzlibrary{matrix,chains,positioning,decorations.pathreplacing,arrows}
\usetikzlibrary{positioning,calc,calligraphy}
@@ -59,14 +60,19 @@
\newtheorem{Lemma}[Theorem]{Lemma}
\newtheorem{Algorithm}[Theorem]{Algorithm}
\newtheorem{Example}[Theorem]{Example}
\newtheorem{Assumption}[Theorem]{Assumption}


\DeclareMathOperator*{\plim}{\mathbb{P}\text{-}\lim}
\DeclareMathOperator{\supp}{supp}
\DeclareMathOperator*{\argmin}{arg\,min}
\begin{document}


\newcommand{\plimn}[0]{\plim\limits_{n \to \infty}}
\newcommand{\norm}[1]{\left\lVert#1\right\rVert}
\newcommand*\circled[1]{\tikz[baseline=(char.base)]{
\node[shape=circle,draw,inner sep=2pt] (char) {#1};}}


\newcommand{\abs}[1]{\ensuremath{\left\vert#1\right\vert}}
TeX/theo_3_8.tex
@@ -5,8 +5,153 @@
%%% TeX-master: "main"
%%% End:

With the following theorem we will have an explicit description of the
limit of the RN as the number of nodes is increased.
In this section we will analyze the connection between shallow Neural
Networks and regression splines. We will see that penalizing the
weight size in training the shallow Neural Network will result in a
function that minimizes the second derivative as the number of hidden
nodes grows to infinity. In order to properly formulate this relation we will
first need to introduce some definitions.

\begin{Definition}[Ridge penalized Neural Network]
  \label{def:rpnn}
  Let $\mathcal{RN}_{w, \omega}$ be a randomized shallow neural
  network, as introduced in ???. Then the optimal ridge penalized
  network is given by
  \[
    \mathcal{RN}^{*, \tilde{\lambda}}_{\omega}(x) \coloneqq
    \mathcal{RN}_{w^{*, \tilde{\lambda}}(\omega), \omega}(x)
  \]
  with
  \[
    w^{*,\tilde{\lambda}}(\omega) :\in \argmin_{w \in
      \mathbb{R}^n} \underbrace{\left\{\overbrace{\sum_{i = 1}^N \left(\mathcal{RN}_{w,
            \omega}(x_i^{\text{train}}) -
          y_i^{\text{train}}\right)^2}^{L(\mathcal{RN}_{w, \omega})} +
      \tilde{\lambda} \norm{w}_2^2\right\}}_{\eqqcolon F_n^{\tilde{\lambda}}(\mathcal{RN}_{w,\omega})}.
  \]
\end{Definition}
In the ridge penalized Neural Network large weights are penalized; the
extent of this penalty can be tuned with the parameter $\tilde{\lambda}$. If
$n$ is larger than the number of training samples $N$, then for
$\tilde{\lambda} \to 0$ the network will interpolate the data while
having minimal weights, resulting in the \textit{minimum norm
  network} $\mathcal{RN}_{w^{\text{min}}, \omega}$.
\[
  \mathcal{RN}_{w^{\text{min}}, \omega} \text{ randomized shallow
    Neural Network with weights } w^{\text{min}}:
\]
\[
  w^{\text{min}} \in \argmin_{w \in \mathbb{R}^n} \norm{w}, \text{
    s.t. }
  \mathcal{RN}_{w,\omega}(x_i^{\text{train}}) = y_i^{\text{train}}, \, \forall i \in
  \left\{1,\dots,N\right\}.
\]
For $\tilde{\lambda} \to \infty$ the learned
function will resemble the data less and less, with the weights
approaching $0$. Usually $\tilde{\lambda}$ lies between 0 and 1, as
for larger values the focus on weight reduction outweighs fitting
the data.\par
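As a minimal illustration of this trade-off (the setting $n = N = 1$
and the shorthand $a$ are chosen here purely for exposition and are
not part of the preceding definitions), write
$a \coloneqq \gamma(b_1 + v_1 x_1^{\text{train}})$. Then
\[
  F_1^{\tilde{\lambda}}(\mathcal{RN}_{w, \omega}) =
  \left(w_1 a - y_1^{\text{train}}\right)^2 + \tilde{\lambda} w_1^2
\]
is minimized by $w_1^{*, \tilde{\lambda}} = \frac{a\, y_1^{\text{train}}}{a^2 + \tilde{\lambda}}$,
so for $a \neq 0$ the limit $\tilde{\lambda} \to 0$ recovers the
interpolating weight $\nicefrac{y_1^{\text{train}}}{a}$, while
$\tilde{\lambda} \to \infty$ drives the weight to $0$.\par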
In order to make the notation more convenient, in the following the
$\omega$ used to express the realised random parameters will no longer
be explicitly mentioned.
\begin{Definition}
  \label{def:kink}
  Let $\mathcal{RN}_w$ be a randomized shallow Neural
  Network according to Definition~\ref{def:rsnn}. Then kinks depending on the random parameters can
  be observed:
  \[
    \mathcal{RN}_w(x) = \sum_{k = 1}^n w_k \gamma(b_k + v_kx).
  \]
  Because we specified $\gamma(y) \coloneqq \max\left\{0, y\right\}$, a
  kink in $\gamma$ can be observed at $\gamma(0) = 0$. As $b_k + v_kx = 0$ for $x
  = -\frac{b_k}{v_k}$, we define the following:
  \begin{enumerate}[label=(\alph*)]
  \item Let $\xi_k \coloneqq -\frac{b_k}{v_k}$ be the $k$-th kink of $\mathcal{RN}_w$.
  \item Let $g_{\xi}(\xi_k)$ be the density of the kinks $\xi_k =
    - \frac{b_k}{v_k}$ in accordance with the distributions of $b_k$ and
    $v_k$.
  \item Let $h_{k,n} \coloneqq \frac{1}{n g_{\xi}(\xi_k)}$ be the
    average estimated distance from kink $\xi_k$ to the nearest
    neighbouring kink.
  \end{enumerate}
\end{Definition}
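As a small illustration of Definition~\ref{def:kink} (the concrete
values below are chosen arbitrarily and are not part of the
definition): for $n = 2$ with $w = (1, -1)$, $b = (1, 1)$ and
$v = (1, 2)$ we get
\[
  \mathcal{RN}_w(x) = \max\left\{0, 1 + x\right\} - \max\left\{0, 1 + 2x\right\},
\]
a piecewise linear function with kinks at $\xi_1 = -1$ and
$\xi_2 = -\nicefrac{1}{2}$. If, for instance, the kinks were
distributed with the constant density $g_{\xi} \equiv \nicefrac{1}{2}$
on $[-1, 1]$, then $h_{k,n} = \nicefrac{2}{n}$, which matches the
expected spacing of $n$ kinks spread over an interval of length $2$.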

In order to later prove the connection between randomised shallow
Neural Networks and regression splines, we first take a look at a
smooth approximation of the RSNN.

\begin{Definition}[Smooth Approximation of Randomized Shallow Neural
  Network]
  \label{def:srsnn}
  Let $\mathcal{RN}_{w}$ be a randomized shallow Neural Network according to
  Definition~\ref{def:RSNN} with weights $w$ and kinks $\xi_k$ with
  corresponding kink density $g_{\xi}$ as given by
  Definition~\ref{def:kink}.
  In order to smooth the RSNN consider the following kernel for every $x$:

  \[
    \kappa_x(s) \coloneqq \mathds{1}_{\left\{\abs{s} \leq \frac{1}{2 \sqrt{n}
          g_{\xi}(x)}\right\}}(s)\sqrt{n} g_{\xi}(x), \, \forall s \in \mathbb{R}.
  \]

  Using this kernel we define a smooth approximation of
  $\mathcal{RN}_w$ by

  \[
    f^w(x) \coloneqq \int_{\mathbb{R}} \mathcal{RN}_w(x-s) \kappa_x(s) \, ds.
  \]
\end{Definition}

Note that the kernel introduced in Definition~\ref{def:srsnn}
satisfies $\int_{\mathbb{R}}\kappa_x(s) \, ds = 1$. While $f^w$ looks highly
similar to a convolution, it differs slightly as the kernel $\kappa_x(s)$
depends on $x$. Therefore only $f^w = (\mathcal{RN}_w *
\kappa_x)(x)$ is well defined, while $\mathcal{RN}_w * \kappa$ is not.
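The normalisation can be checked directly from Definition~\ref{def:srsnn},
since $\kappa_x$ is constant on a symmetric window of half-width
$\frac{1}{2 \sqrt{n} g_{\xi}(x)}$ around $0$:
\[
  \int_{\mathbb{R}} \kappa_x(s) \, ds = \sqrt{n} g_{\xi}(x) \cdot
  \frac{1}{\sqrt{n} g_{\xi}(x)} = 1.
\]
In particular, if no kink of $\mathcal{RN}_w$ lies within this window
around $x$, then $\mathcal{RN}_w$ is affine there and the symmetry of
$\kappa_x$ gives $f^w(x) = \mathcal{RN}_w(x)$; the smoothing therefore
only acts locally around the kinks.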

Now we take a look at weighted regression splines. Later we will prove
that the ridge penalized neural network as defined in
Definition~\ref{def:rpnn} converges to a weighted regression spline as
the number of hidden nodes grows to infinity.

\begin{Definition}[Weighted regression spline]
  Let $x_i^{\text{train}}, y_i^{\text{train}} \in \mathbb{R}, i \in
  \left\{1,\dots,N\right\}$ be training data. For a given $\lambda \in \mathbb{R}_{>0}$
  and a function $g: \mathbb{R} \to \mathbb{R}_{>0}$ the weighted
  regression spline $f^{*, \lambda}_g$ is given by

  \[
    f^{*, \lambda}_g :\in \argmin_{\substack{f \in \mathcal{C}^2(\mathbb{R})
        \\ \supp(f) \subseteq \supp(g)}} \underbrace{\left\{ \overbrace{\sum_{i =
            1}^N \left(f(x_i^{\text{train}}) - y_i^{\text{train}}\right)^2}^{L(f)} +
      \lambda g(0) \int_{\supp(g)}\frac{\left(f''(x)\right)^2}{g(x)}
      \, dx\right\}}_{\eqqcolon F^{\lambda, g}(f)}.
  \]
\end{Definition}
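For orientation (this is only the definition restated in a special
case): for the constant weight function $g \equiv 1$ we have
$\supp(g) = \mathbb{R}$ and $g(0) = 1$, so the functional reduces to
the classical smoothing spline objective
\[
  F^{\lambda, 1}(f) = \sum_{i = 1}^N \left(f(x_i^{\text{train}}) -
    y_i^{\text{train}}\right)^2 + \lambda \int_{\mathbb{R}}
  \left(f''(x)\right)^2 \, dx.
\]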

Similarly to ridge weight penalized neural networks, the parameter
$\lambda$ controls a trade-off between accuracy on the training data
and smoothness, i.e. a low second derivative. For $g \equiv 1$ and $\lambda \to 0$ the
resulting function $f^{*, 0+}$ will interpolate the training data while minimizing
the second derivative. Such a function is known as a smooth spline
interpolation or (cubic) smoothing spline.

\[
  f^{*, 0+} \text{ smooth spline interpolation: }
\]
\[
  f^{*, 0+} \coloneqq \lim_{\lambda \to 0+} f^{*, \lambda}_1 \in
  \argmin_{\substack{f \in \mathcal{C}^2(\mathbb{R}), \\ f(x_i^{\text{train}}) =
      y_i^{\text{train}}}} \left( \int_{\mathbb{R}} \left(f''(x)\right)^2 dx\right).
\]

\begin{Assumption}~
  \begin{enumerate}[label=(\alph*)]
  \item The probability density function of the kinks $\xi_k$,
    namely $g_{\xi}$, has compact support on $\supp(g_{\xi})$.
  \item The density $g_{\xi}$ is uniformly continuous on $\supp(g_{\xi})$.
  \item $g_{\xi}(0) \neq 0$.
  \end{enumerate}
\end{Assumption}
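Purely as an illustration (it is not required anywhere in the text),
the constant density $g_{\xi} \equiv \nicefrac{1}{2}$ on $[-1, 1]$
from the example after Definition~\ref{def:kink} fulfils (a)--(c):
its support $[-1, 1]$ is compact, the constant density is uniformly
continuous on it, and $g_{\xi}(0) = \nicefrac{1}{2} \neq 0$.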

\begin{Theorem}[Ridge weight penalty corresponds to adapted spline]
  \label{theo:main1}
@@ -28,7 +173,7 @@ limit of the RN as the number of nodes is increased.
as defined in ??? and ??? respectively.
\end{Theorem}
In order to prove Theorem~\ref{theo:main1} we need to prove a number of
auxiliary Lemmata first.

\begin{Definition}[Sobolev Norm]
  \label{def:sobonorm}
@@ -50,49 +195,126 @@ auxiliary Lemmata first.
Let \(f:\mathbb{R} \to \mathbb{R}\) be differentiable with \(f' :
\mathbb{R} \to \mathbb{R}\) Lebesgue integrable. Then for \(K=[a,b]
\subset \mathbb{R}\) with \(f(a)=0\) it holds that
\begin{equation*}
  \label{eq:pti1}
  \exists C_K^{\infty} \in \mathbb{R}_{>0} :
  \norm{f}_{W^{1,\infty}(K)} \leq C_K^{\infty}
  \norm{f'}_{L^{\infty}(K)}.
\end{equation*}
If additionally \(f'\) is differentiable with \(f'': \mathbb{R} \to
\mathbb{R}\) Lebesgue integrable, then
\begin{equation*}
  \label{eq:pti2}
  \exists C_K^2 \in \mathbb{R}_{>0} : \norm{f}_{W^{1,\infty}(K)} \leq
  C_K^2 \norm{f''}_{L^2(K)}.
\end{equation*}
% \proof
% With the fundamental theorem of calculus, if
% \(\norm{f}_{L^{\infty}(K)}<\infty\) we get
% \begin{equation}
%   \label{eq:f_f'}
%   \norm{f}_{L^{\infty}(K)} = \sup_{x \in K}\abs{\int_a^x f'(s) ds} \leq
%   \sup_{x \in K}\abs{\int_a^x \sup_{y \in K} \abs{f'(y)} ds} \leq \abs{b-a}
%   \sup_{y \in K}\abs{f'(y)}.
% \end{equation}
% Using this we can bound \(\norm{f}_{W^{1,\infty}(K)}\) by
% \[
%   \norm{f}_{W^{1,\infty}(K)} \stackrel{\text{Def~\ref{def:sobonorm}}}{=}
%   \max\left\{\norm{f}_{L^{\infty}(K)},
%     \norm{f'}_{L^{\infty}(K)}\right\}
%   \stackrel{(\ref{eq:f_f'})}{\leq} \max\left\{\abs{b-a},
%     1\right\}\norm{f'}_{L^{\infty}(K)}.
% \]
% With \(C_K^{\infty} \coloneqq \max\left\{\abs{b-a}, 1\right\}\) we
% get (\ref{eq:pti1}).
% By using the Hölder inequality, we can prove the second claim.
% \begin{align*}
%   \norm{f'}_{L^{\infty}(K)} &= \sup_{x \in K} \abs{\int_a^b f''(y)
%     \mathds{1}_{[a,x]}(y)dy} \leq \sup_{x \in
%     K}\norm{f''\mathds{1}_{[a,x]}}_{L^1(K)}\\
%   &\hspace{-6pt} \stackrel{\text{Hölder}}{\leq} \sup_{x
%     \in
%     K}\norm{f''}_{L^2(K)}\norm{\mathds{1}_{[a,x]}}_{L^2(K)}
%   = \abs{b-a}\norm{f''}_{L^2(K)}.
% \end{align*}
% Thus (\ref{eq:pti2}) follows with \(C_K^2 \coloneqq
%   \abs{b-a}C_K^{\infty}\).
% \qed
\end{Lemma}
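As a quick sanity check of these bounds (this example is not part of
the text and only illustrates the statement): on $K = [0,1]$ take
$f(x) = x(1-x)$, so that $f(0) = 0$,
$\norm{f}_{L^{\infty}(K)} = \nicefrac{1}{4}$,
$\norm{f'}_{L^{\infty}(K)} = 1$ and $\norm{f''}_{L^2(K)} = 2$. Then
$\norm{f}_{W^{1,\infty}(K)} = 1$, and both inequalities hold, e.g.
with $C_K^{\infty} = 1$ and $C_K^2 = 1$.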

\begin{Lemma}
  Let $\mathcal{RN}$ be a shallow Neural Network. For \(\varphi :
  \mathbb{R}^2 \to \mathbb{R}\) uniformly continuous such that
  \[
    \forall x \in \supp(g_{\xi}) : \mathbb{E}\left[\varphi(\xi, v)
      \frac{1}{n g_{\xi}(\xi)} \vert \xi = x \right] < \infty,
  \]
  it holds that
  \[
    \plimn \sum_{k \in \kappa : \xi_k < T} \varphi(\xi_k, v_k)
    \frac{\bar{h}_k}{2}
    = \int_{\max\left\{C_{g_{\xi}}^l,T\right\}}^{\min\left\{C_{g_{\xi}}^u,T\right\}}
    \mathbb{E}\left[\varphi(\xi, v) \vert \xi = x \right] dx
  \]
  uniformly in \(T \in K\).
% \proof
% For \(T \leq C_{g_{\xi}}^l\) both sides equal 0, so it is sufficient to
% consider \(T > C_{g_{\xi}}^l\). With \(\varphi\) and
% \(\nicefrac{1}{g_{\xi}}\) uniformly continuous in \(\xi\),
% \begin{equation}
%   \label{eq:psi_stet}
%   \forall \varepsilon > 0 : \exists \delta(\varepsilon) : \forall
%   \abs{\xi - \xi'} < \delta(\varepsilon) : \abs{\varphi(\xi, v)
%     \frac{1}{g_{\xi}(\xi)} - \varphi(\xi', v)
%     \frac{1}{g_{\xi}(\xi')}} < \varepsilon
% \end{equation}
% uniformly in \(v\). In order to
% save space we use the notation \((a \wedge b) \coloneqq \min\{a,b\}\) for $a$ and $b
% \in \mathbb{R}$. W.l.o.g. assume \(\supp(g_{\xi})\) is an
% interval. By splitting the interval in disjoint strips of length \(\delta
% \leq \delta(\varepsilon)\) we get:

% \[
%   \underbrace{\sum_{k \in \kappa : \xi_k < T} \varphi(\xi_k, v_k)
%     \frac{\bar{h}_k}{2}}_{\circled{1}} =
%   \underbrace{\sum_{l \in \mathbb{Z}:
%       \left[\delta l, \delta (l + 1)\right] \subseteq
%       \left[C_{g_{\xi}}^l, C_{g_{\xi}}^u \wedge T
%       \right]}}_{\coloneqq \, l \in I_{\delta}} \left( \, \sum_{\substack{k \in \kappa\\
%       \xi_k \in \left[\delta l, \delta (l + 1)\right]}}
%   \varphi\left(\xi_k, v_k\right)\frac{\bar{h}_k}{2} \right)
% \]
% Using (\ref{eq:psi_stet}) we can approximate $\circled{1}$ by
% \begin{align*}
%   \circled{1} & \approx \sum_{l \in I_{\delta}} \left( \, \sum_{\substack{k \in \kappa\\
%       \xi_k \in \left[\delta l, \delta (l + 1)\right]}}
%   \left(\varphi\left(l\delta, v_k\right)\frac{1}{g_{\xi}(l\delta)}
%     \pm \varepsilon\right)\frac{1}{n} \underbrace{\frac{\abs{\left\{m \in
%           \kappa : \xi_m \in [\delta l, \delta(l + 1)]\right\}}}{\abs{\left\{m \in
%           \kappa : \xi_m \in [\delta l, \delta(l + 1)]\right\}}}}_{=
%     1}\right) \\
%   % \intertext{}
%   &= \sum_{l \in I_{\delta}} \left( \frac{ \sum_{ \substack{k \in \kappa\\
%         \xi_k \in \left[\delta l, \delta (l + 1)\right]}}
%     \varphi\left(l\delta, v_k\right)}
%   {\abs{\left\{m \in
%         \kappa : \xi_m \in [\delta l, \delta(l + 1)]\right\}}}\frac{\abs{\left\{m \in
%         \kappa : \xi_m \in [\delta l, \delta(l +
%         1)]\right\}}}{ng_{\xi}(l\delta)}\right) \pm \varepsilon .\\
%   \intertext{We use the mean to approximate the number of kinks in
%     each $\delta$-strip; as it follows a binomial distribution this
%     amounts to
%     \[
%       \mathbb{E}\left[\abs{\left\{m \in \kappa : \xi_m \in [\delta l,
%             \delta(l + 1)]\right\}}\right] = n \int_{[\delta l, \delta (l +
%           1)]} g_{\xi}(x)dx \approx n (\delta g_{\xi}(l\delta) \pm
%       \tilde{\varepsilon}).
%     \]
%     Bla Bla Bla $v_k$}
%   \circled{1} & \approx
% \end{align*}
\end{Lemma}
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "main"
%%% End: