diff --git a/.gitignore b/.gitignore
index 52a1d96..1e8f2a4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,6 +9,7 @@ main-blx.bib
 # emacs autosaves
 *.tex~
+*#*.tex*
 
 # no pdfs
 *.pdf
diff --git a/TeX/introduction_nn.tex b/TeX/introduction_nn.tex
index f5472d4..d171dd3 100644
--- a/TeX/introduction_nn.tex
+++ b/TeX/introduction_nn.tex
@@ -147,7 +147,7 @@ except for the input layer, which recieves the components of the input.
     1.5mm] (i_4) at (0, -1.25) {};
   \node [align=left, left] at (-0.125, -1.25) {\(i_m\)};
   \draw[decoration={calligraphic brace,amplitude=5pt, mirror}, decorate, line width=1.25pt]
-  (-0.6,2.7) -- (-0.6,-1.45) node [black, midway, xshift=-0.6cm, left] {Input};
+  (-0.6,2.7) -- (-0.6,-1.45) node [black, midway, xshift=-0.6cm, left] {Inputs};
 
   \node [align = center, above] at (1.25, 3) {Synaptic\\weights};
 
   \node [every neuron] (w_1) at (1.25, 2.5) {\(w_{k, 1}\)};
diff --git a/TeX/main.tex b/TeX/main.tex
index 3438a5f..4a7f87c 100644
--- a/TeX/main.tex
+++ b/TeX/main.tex
@@ -27,6 +27,7 @@
 \usepackage{dsfont}
 \usepackage{tikz}
 \usepackage{nicefrac}
+\usepackage{enumitem}
 \usetikzlibrary{matrix,chains,positioning,decorations.pathreplacing,arrows}
 \usetikzlibrary{positioning,calc,calligraphy}
 
@@ -59,14 +60,19 @@
 \newtheorem{Lemma}[Theorem]{Lemma}
 \newtheorem{Algorithm}[Theorem]{Algorithm}
 \newtheorem{Example}[Theorem]{Example}
+\newtheorem{Assumption}[Theorem]{Assumption}
 
 \DeclareMathOperator*{\plim}{\mathbb{P}\text{-}\lim}
+\DeclareMathOperator{\supp}{supp}
+\DeclareMathOperator*{\argmin}{arg\,min}
 
 \begin{document}
 \newcommand{\plimn}[0]{\plim\limits_{n \to \infty}}
 \newcommand{\norm}[1]{\left\lVert#1\right\rVert}
+\newcommand*\circled[1]{\tikz[baseline=(char.base)]{
+  \node[shape=circle,draw,inner sep=2pt] (char) {#1};}}
 \newcommand{\abs}[1]{\ensuremath{\left\vert#1\right\vert}}
 
diff --git a/TeX/theo_3_8.tex b/TeX/theo_3_8.tex
index 41d82a2..ebd4c6e 100644
--- a/TeX/theo_3_8.tex
+++ b/TeX/theo_3_8.tex
@@ -5,8 +5,153 @@
 %%% TeX-master: "main"
 %%% End:
 
-With the following Theorem we will have an explicit desrctiption for the
-limes of RN as the amount of nodes is increased.
+In this section we will analyze the connection between shallow Neural
+Networks and regression splines. We will see that penalizing the
+weight size when training a shallow Neural Network results in a
+function that minimizes the second derivative as the number of hidden
+nodes grows to infinity. In order to properly formulate this relation
+we will first need to introduce some definitions.
+
+\begin{Definition}[Ridge penalized Neural Network]
+  \label{def:rpnn}
+  Let $\mathcal{RN}_{w, \omega}$ be a randomized shallow neural
+  network, as introduced in ???. Then the optimal ridge penalized
+  network is given by
+  \[
+    \mathcal{RN}^{*, \tilde{\lambda}}_{\omega}(x) \coloneqq
+    \mathcal{RN}_{w^{*, \tilde{\lambda}}(\omega), \omega}
+  \]
+  with
+  \[
+    w^{*,\tilde{\lambda}}(\omega) :\in \argmin_{w \in
+      \mathbb{R}^n} \underbrace{ \left\{\overbrace{\sum_{i = 1}^N \left(\mathcal{RN}_{w,
+            \omega}(x_i^{\text{train}}) -
+          y_i^{\text{train}}\right)^2}^{L(\mathcal{RN}_{w, \omega})} +
+      \tilde{\lambda} \norm{w}_2^2\right\}}_{\eqqcolon F_n^{\tilde{\lambda}}(\mathcal{RN}_{w,\omega})}.
+  \]
+\end{Definition}
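+Since for fixed $\omega$ the network $\mathcal{RN}_{w, \omega}$ is
+linear in the weights $w$, the minimization problem above is a
+standard ridge regression. As an illustration (using the feature
+matrix notation $\Phi \in \mathbb{R}^{N \times n}$ with $\Phi_{i, k}
+\coloneqq \gamma(b_k + v_k x_i^{\text{train}})$ and $y^{\text{train}}
+\coloneqq (y_1^{\text{train}}, \dots, y_N^{\text{train}})^{\top}$,
+notation introduced here only for this remark), the minimizer is
+unique for $\tilde{\lambda} > 0$ and admits the closed form
+\[
+  w^{*, \tilde{\lambda}}(\omega) = \left(\Phi^{\top} \Phi +
+    \tilde{\lambda} I_n\right)^{-1} \Phi^{\top} y^{\text{train}},
+\]
+which can be used to compute $\mathcal{RN}^{*, \tilde{\lambda}}_{\omega}$
+numerically.
+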
+In the ridge penalized Neural Network large weights are penalized, the
+extent of which can be tuned with the parameter $\tilde{\lambda}$. If
+$n$ is larger than the number of training samples $N$, then for
+$\tilde{\lambda} \to 0$ the network will interpolate the data while
+having minimal weights, resulting in the \textit{minimum norm
+  network} $\mathcal{RN}_{w^{\text{min}}, \omega}$.
+\[
+  \mathcal{RN}_{w^{\text{min}}, \omega} \text{ randomized shallow
+    Neural network with weights } w^{\text{min}}:
+\]
+\[
+  w^{\text{min}} \in \argmin_{w \in \mathbb{R}^n} \norm{w}, \text{
+    s.t. }
+  \mathcal{RN}_{w,\omega}(x_i^{train}) = y_i^{train}, \, \forall i \in
+  \left\{1,\dots,N\right\}.
+\]
+For $\tilde{\lambda} \to \infty$ the learned function will resemble
+the data less and less, with the weights approaching $0$. Usually
+$\tilde{\lambda}$ lies between 0 and 1, as for larger values the focus
+on weight reduction outweighs fitting the data.\par
+In order to make the notation more convenient, in the following the
+$\omega$ used to express the realized random parameters will no longer
+be explicitly mentioned.
+\begin{Definition}
+  \label{def:kink}
+  Let $\mathcal{RN}_w$ be a randomized shallow Neural Network
+  according to Definition~\ref{def:rsnn}, then kinks depending on the
+  random parameters can be observed:
+  \[
+    \mathcal{RN}_w(x) = \sum_{k = 1}^n w_k \gamma(b_k + v_kx).
+  \]
+  Because we specified $\gamma(y) \coloneqq \max\left\{0, y\right\}$, a
+  kink in $\gamma$ can be observed at $\gamma(0) = 0$. As $b_k + v_kx = 0$ for $x
+  = -\frac{b_k}{v_k}$, we define the following:
+  \begin{enumerate}[label=(\alph*)]
+  \item Let $\xi_k \coloneqq -\frac{b_k}{v_k}$ be the $k$-th kink of $\mathcal{RN}_w$.
+  \item Let $g_{\xi}(\xi_k)$ be the density of the kinks $\xi_k =
+    - \frac{b_k}{v_k}$ in accordance with the distributions of $b_k$ and
+    $v_k$.
+  \item Let $h_{k,n} \coloneqq \frac{1}{n g_{\xi}(\xi_k)}$ be the
+    average estimated distance from kink $\xi_k$ to its nearest
+    neighboring kink.
+  \end{enumerate}
+\end{Definition}
+
+In order to later prove the connection between randomized shallow
+Neural Networks and regression splines, we first take a look at a
+smooth approximation of the RSNN.
+
+\begin{Definition}[Smooth Approximation of Randomized Shallow Neural
+  Network]
+  \label{def:srsnn}
+  Let $\mathcal{RN}_w$ be a randomized shallow Neural Network according to
+  Definition~\ref{def:rsnn} with weights $w$ and kinks $\xi_k$ with
+  corresponding kink density $g_{\xi}$ as given by
+  Definition~\ref{def:kink}.
+  In order to smooth the RSNN, consider the following kernel for every $x$:
+  \[
+    \kappa_x(s) \coloneqq \mathds{1}_{\left\{\abs{s} \leq \frac{1}{2 \sqrt{n}
+          g_{\xi}(x)}\right\}}(s)\sqrt{n} g_{\xi}(x), \, \forall s \in \mathbb{R}.
+  \]
+  Using this kernel we define a smooth approximation of
+  $\mathcal{RN}_w$ by
+  \[
+    f^w(x) \coloneqq \int_{\mathds{R}} \mathcal{RN}_w(x-s) \kappa_x(s) ds.
+  \]
+\end{Definition}
+
+Note that the kernel introduced in Definition~\ref{def:srsnn}
+satisfies $\int_{\mathbb{R}}\kappa_x(s) \, ds = 1$. While $f^w$ looks highly
+similar to a convolution, it differs slightly as the kernel $\kappa_x(s)$
+depends on $x$. Therefore only $f^w = (\mathcal{RN}_w *
+\kappa_x)(x)$ is well defined, while $\mathcal{RN}_w * \kappa$ is not.
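+
+The normalization claimed above can be checked directly from
+Definition~\ref{def:srsnn}: for every $x$ with $g_{\xi}(x) > 0$ the
+kernel is the indicator of an interval of length
+$\nicefrac{1}{\sqrt{n} g_{\xi}(x)}$, scaled by the constant
+$\sqrt{n} g_{\xi}(x)$, and therefore
+\[
+  \int_{\mathbb{R}} \kappa_x(s) \, ds
+  = \sqrt{n} g_{\xi}(x) \int_{-\frac{1}{2 \sqrt{n}
+      g_{\xi}(x)}}^{\frac{1}{2 \sqrt{n} g_{\xi}(x)}} 1 \, ds
+  = \sqrt{n} g_{\xi}(x) \, \frac{1}{\sqrt{n} g_{\xi}(x)} = 1.
+\]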
+
+Now we take a look at weighted regression splines. Later we will
+prove that the ridge penalized neural network as defined in
+Definition~\ref{def:rpnn} converges to a weighted regression spline
+as the number of hidden nodes grows to infinity.
+
+\begin{Definition}[Weighted regression spline]
+  Let $x_i^{train}, y_i^{train} \in \mathbb{R}, i \in
+  \left\{1,\dots,N\right\}$ be training data. For a given $\lambda \in
+  \mathbb{R}_{>0}$ and a function $g: \mathbb{R} \to \mathbb{R}_{>0}$,
+  the weighted regression spline $f^{*, \lambda}_g$ is given by
+  \[
+    f^{*, \lambda}_g :\in \argmin_{\substack{f \in \mathcal{C}^2(\mathbb{R})
+        \\ \supp(f) \subseteq \supp(g)}} \underbrace{\left\{ \overbrace{\sum_{i =
+          1}^N \left(f(x_i^{train}) - y_i^{train}\right)^2}^{L(f)} +
+      \lambda g(0) \int_{\supp(g)}\frac{\left(f''(x)\right)^2}{g(x)}
+      dx\right\}}_{\eqqcolon F^{\lambda, g}(f)}.
+  \]
+\end{Definition}
+
+Similarly to ridge weight penalized neural networks, the parameter
+$\lambda$ controls a trade-off between accuracy on the training data
+and smoothness, i.e. a low second derivative. For $g \equiv 1$ and
+$\lambda \to 0$ the resulting function $f^{*, 0+}$ will interpolate
+the training data while minimizing the second derivative. Such a
+function is known as a smooth spline interpolation or (cubic)
+smoothing spline.
+
+\[
+  f^{*, 0+} \text{ smooth spline interpolation: }
+\]
+\[
+  f^{*, 0+} \coloneqq \lim_{\lambda \to 0+} f^{*, \lambda}_1 \in
+  \argmin_{\substack{f \in \mathcal{C}^2(\mathbb{R}), \\ f(x_i^{train}) =
+      y_i^{train}}} \left( \int_{\mathbb{R}} \left(f''(x)\right)^2 dx\right).
+\]
+
+\begin{Assumption}~
+  \begin{enumerate}[label=(\alph*)]
+  \item The probability density function of the kinks $\xi_k$,
+    namely $g_{\xi}$, has compact support.
+  \item The density $g_{\xi}$ is uniformly continuous on $\supp(g_{\xi})$.
+  \item $g_{\xi}(0) \neq 0$.
+  \end{enumerate}
+\end{Assumption}
 
 \begin{Theorem}[Ridge weight penaltiy corresponds to adapted spline]
   \label{theo:main1}
@@ -28,7 +173,7 @@ limes of RN as the amount of nodes is increased.
 as defined in ??? and ??? respectively.
 \end{Theorem}
 In order to proof Theo~\ref{theo:main1} we need to proof a number of
-auxilary Lemmata first.
+auxiliary Lemmata first.
 
 \begin{Definition}[Sobolev Norm]
   \label{def:sobonorm}
@@ -50,49 +195,126 @@ auxilary Lemmata first.
   Let \(f:\mathbb{R} \to \mathbb{R}\) differentiable with \(f' :
   \mathbb{R} \to \mathbb{R}\) Lesbeque integrable. Then for
   \(K=[a,b] \subset \mathbb{R}\) with \(f(a)=0\) it holds that
-  \begin{equation}
+  \begin{equation*}
     \label{eq:pti1}
     \exists C_K^{\infty} \in \mathbb{R}_{>0} : \norm{f}_{w^{1,\infty}(K)} \leq
     C_K^{\infty} \norm{f'}_{L^{\infty}(K)}.
-  \end{equation}
+  \end{equation*}
   If additionaly \(f'\) is differentiable with \(f'': \mathbb{R} \to
   \mathbb{R}\) Lesbeque integrable then additionally
-  \begin{equation}
+  \begin{equation*}
     \label{eq:pti2}
     \exists C_K^2 \in \mathbb{R}_{>0} : \norm{f}_{W^{1,\infty}(K)} \leq
     C_K^2 \norm{f''}_{L^2(K)}.
-  \end{equation}
-  \proof
-  With the fundamental theorem of calculus, if
-  \(\norm{f}_{L^{\infty}(K)}<\infty\) we get
-  \begin{equation}
-    \label{eq:f_f'}
-    \norm{f}_{L^{\infty}(K)} = \sup_{x \in K}\abs{\int_a^x f'(s) ds} \leq
-    \sup_{x \in K}\abs{\int_a^x \sup_{y \in K} \abs{f'(y)} ds} \leq \abs{b-a}
-    \sup_{y \in K}\abs{f'(y)}.
-  \end{equation}
-  Using this we can bound \(\norm{f}_{w^{1,\infty}(K)}\) by
+  \end{equation*}
+  % \proof
+  % With the fundamental theorem of calculus, if
+  % \(\norm{f}_{L^{\infty}(K)}<\infty\) we get
+  % \begin{equation}
+  %   \label{eq:f_f'}
+  %   \norm{f}_{L^{\infty}(K)} = \sup_{x \in K}\abs{\int_a^x f'(s) ds} \leq
+  %   \sup_{x \in K}\abs{\int_a^x \sup_{y \in K} \abs{f'(y)} ds} \leq \abs{b-a}
+  %   \sup_{y \in K}\abs{f'(y)}. 
+ % \end{equation} + % Using this we can bound \(\norm{f}_{w^{1,\infty}(K)}\) by + % \[ + % \norm{f}_{w^{1,\infty}(K)} \stackrel{\text{Def~\ref{def:sobonorm}}}{=} + % \max\left\{\norm{f}_{L^{\infty}(K)}, + % \norm{f'}_{L^{\infty}(K)}\right\} + % \stackrel{(\ref{eq:f_f'})}{\leq} max\left\{\abs{b-a}, + % 1\right\}\norm{f'}_{L^{\infty}(K)}. + % \] + % With \(C_k^{\infty} \coloneqq max\left\{\abs{b-a}, 1\right\}\) we + % get (\ref{eq:pti1}). + % By using the Hölder inequality, we can proof the second claim. + % \begin{align*} + % \norm{f'}_{L^{\infty}(K)} &= \sup_{x \in K} \abs{\int_a^bf''(y) + % \mathds{1}_{[a,x]}(y)dy} \leq \sup_{x \in + % K}\norm{f''\mathds{1}_{[a,x]}}_{L^1(K)}\\ + % &\hspace{-6pt} \stackrel{\text{Hölder}}{\leq} sup_{x + % \in + % K}\norm{f''}_{L^2(K)}\norm{\mathds{1}_{[a,x]}}_{L^2(K)} + % = \abs{b-a}\norm{f''}_{L^2(K)}. + % \end{align*} + % Thus (\ref{eq:pti2}) follows with \(C_K^2 \coloneqq + % \abs{b-a}C_K^{\infty}\). + % \qed +\end{Lemma} + +\begin{Lemma} + Let $\mathcal{RN}$ be a shallow Neural network. For \(\varphi : + \mathbb{R}^2 \to \mathbb{R}\) uniformly continous such that \[ - \norm{f}_{w^{1,\infty}(K)} \stackrel{\text{Def~\ref{def:sobonorm}}}{=} - \max\left\{\norm{f}_{L^{\infty}(K)}, - \norm{f'}_{L^{\infty}(K)}\right\} - \stackrel{(\ref{eq:f_f'})}{\leq} max\left\{\abs{b-a}, - 1\right\}\norm{f'}_{L^{\infty}(K)}. + \forall x \in \supp(g_{\xi}) : \mathbb{E}\left[\varphi(\xi, v) + \frac{1}{n g_{\xi}(\xi)} \vert \xi = x \right] < \infty, \] - With \(C_k^{\infty} \coloneqq max\left\{\abs{b-a}, 1\right\}\) we - get (\ref{eq:pti1}). - By using the Hölder inequality, we can proof the second claim. - \begin{align*} - \norm{f'}_{L^{\infty}(K)} &= \sup_{x \in K} \abs{\int_a^bf''(y) - \mathds{1}_{[a,x]}(y)dy} \leq \sup_{x \in - K}\norm{f''\mathds{1}_{[a,x]}}_{L^1(K)}\\ - &\hspace{-6pt} \stackrel{\text{Hölder}}{\leq} sup_{x - \in - K}\norm{f''}_{L^2(K)}\norm{\mathds{1}_{[a,x]}}_{L^2(K)} - = \abs{b-a}\norm{f''}_{L^2(K)}. - \end{align*} - Thus (\ref{eq:pti2}) follows with \(C_K^2 \coloneqq - \abs{b-a}C_K^{\infty}\). - \qed -\end{Lemma} \ No newline at end of file + it holds, that + \[ + \plimn \sum_{k \in \kappa : \xi_k < T} \varphi(\xi_k, v_k) + \frac{\bar{h}_k}{2} + =\int_{max\left\{C_{g_{\xi}}^l,T\right\}}^{min\left\{C_{g_{\xi}}^u,T\right\}} + \mathbb{E}\left[\varphi(\xi, v) \vert \xi = x \right] dx + \] + uniformly in \(T \in K\). + % \proof +% For \(T \leq C_{g_{\xi}}^l\) both sides equal 0, so it is sufficient to +% consider \(T > C_{g_{\xi}}^l\). With \(\varphi\) and +% \(\nicefrac{1}{g_{\xi}}\) uniformly continous in \(\xi\), +% \begin{equation} +% \label{eq:psi_stet} +% \forall \varepsilon > 0 : \exists \delta(\varepsilon) : \forall +% \abs{\xi - \xi'} < \delta(\varepsilon) : \abs{\varphi(\xi, v) +% \frac{1}{g_{\xi}(\xi)} - \varphi(\xi', v) +% \frac{1}{g_{\xi}(\xi')}} < \varepsilon +% \end{equation} +% uniformly in \(v\). In order to +% save space we use the notation \((a \wedge b) \coloneqq \min\{a,b\}\) for $a$ and $b +% \in \mathbb{R}$. W.l.o.g. assume \(\sup(g_{\xi})\) in an +% intervall. 
By splitting the interval in disjoint strips of length \(\delta +% \leq \delta(\varepsilon)\) we get: + +% \[ +% \underbrace{\sum_{k \in \kappa : \xi_k < T} \varphi(\xi_k, v_k) +% \frac{\bar{h}_k}{2}}_{\circled{1}} = +% \underbrace{\sum_{l \in \mathbb{Z}: +% \left[\delta l, \delta (l + 1)\right] \subseteq +% \left[C_{g_{\xi}}^l, C_{g_{\xi}}^u \wedge T +% \right]}}_{\coloneqq \, l \in I_{\delta}} \left( \, \sum_{\substack{k \in \kappa\\ +% \xi_k \in \left[\delta l, \delta (l + 1)\right]}} +% \varphi\left(\xi_k, v_k\right)\frac{\bar{h}_k}{2} \right) +% \] +% Using (\ref{eq:psi_stet}) we can approximate $\circled{1}$ by +% \begin{align*} +% \circled{1} & \approx \sum_{l \in I_{\delta}} \left( \, \sum_{\substack{k \in \kappa\\ +% \xi_k \in \left[\delta l, \delta (l + 1)\right]}} +% \left(\varphi\left(l\delta, v_k\right)\frac{1}{g_{\xi}(l\delta)} +% \pm \varepsilon\right)\frac{1}{n} \underbrace{\frac{\abs{\left\{m \in +% \kappa : \xi_m \in [\delta l, \delta(l + 1)]\right\}}}{\abs{\left\{m \in +% \kappa : \xi_m \in [\delta l, \delta(l + 1)]\right\}}}}_{= +% 1}\right) \\ +% % \intertext{} +% &= \sum_{l \in I_{\delta}} \left( \frac{ \sum_{ \substack{k \in \kappa\\ +% \xi_k \in \left[\delta l, \delta (l + 1)\right]}} +% \varphi\left(l\delta, v_k\right)} +% {\abs{\left\{m \in +% \kappa : \xi_m \in [\delta l, \delta(l + 1)]\right\}}}\frac{\abs{\left\{m \in +% \kappa : \xi_m \in [\delta l, \delta(l + +% 1)]\right\}}}{ng_{\xi}(l\delta)}\right) \pm \varepsilon .\\ +% \intertext{We use the mean to approximate the number of kinks in +% each $\delta$-strip, as it follows a bonomial distribution this +% amounts to +% \[ +% \mathbb{E}\left[\abs{\left\{m \in \kappa : \xi_m \in [\delta l, +% \delta(l + 1)]\right\}\right]} = n \int_{[\delta l, \delta (l + +% 1)]} g_{\xi}(x)dx \approx n (\delta g_{\xi}(l\delta) \pm +% \tilde{\varepsilon}). +% \] +% Bla Bla Bla $v_k$} +% \circled{1} & \approx +% \end{align*} +\end{Lemma} +%%% Local Variables: +%%% mode: latex +%%% TeX-master: "main" +%%% End: