From 5b83fdd7ed9ad5d37f10269e7121cc3d5083ad6e Mon Sep 17 00:00:00 2001
From: Anton Lydike <me@antonlydike.de>
Date: Mon, 1 Mar 2021 10:59:36 +0100
Subject: [PATCH] stand erstabgabe

---
 References.bib |  19 ++++++-
 paper.tex      | 149 +++++++++++++++++++++++++++++--------------------
 todo.md        |  16 ++----
 3 files changed, 110 insertions(+), 74 deletions(-)

diff --git a/References.bib b/References.bib
index f8fc030..706c3d4 100644
--- a/References.bib
+++ b/References.bib
@@ -23,7 +23,7 @@
   journal = {IEEE transactions on information forensics and security},
   number  = {3},
   volume  = {4},
-  title   = {Centered Hyperspherical and HyperellipsoidalOne-Class Support Vector Machines for AnomalyDetection in Sensor Networks},
+  title   = {Centered Hyperspherical and Hyperellipsoidal One-Class Support Vector Machines for Anomaly Detection in Sensor Networks},
   year    = {2010},
   pages   = {518-533}
 }
@@ -276,6 +276,23 @@ pages="140--149",
 abstract="Polymer dielectric-based humidity sensors used in the orchid greenhouse monitoring system usually work improperly after continuously being used in a high humid condition for some time (e.g., after eight months). This problem, called sensor drift, has been broadly observed. This paper proposes a simple data-driven technique based on a Kalman filter with an artificial neural network to detect the drift and correct data. The combination of two proposed measures based on the {\$}{\$}L^1{\$}{\$}L1distance and the cosine similarity is used to determine the sensor's status, which is later used to adjust the Kalman gain accordingly. That is, when the sensor malfunctions, the gain is biased toward the prediction. When the sensor is in the normal status, the gain is biased toward the measurement. When the sensor drift is detected, the gain varies in between the prediction and the measurement. The experimental results show that the proposed method could reduce the accumulated mean absolute deviation by approximately 55.66{\%}.",
 isbn="978-3-030-68133-3"
 }
+@inproceedings{kakanakova2017,
+  title={Outlier detection via deep learning architecture},
+  author={Kakanakova, Irina and Stoyanov, Stefan},
+  booktitle={Proceedings of the 18th International Conference on Computer Systems and Technologies},
+  pages={73--79},
+  year={2017}
+}
+
+@inproceedings{li2015drift,
+  title={Drift detection and calibration of sensor networks},
+  author={Li, Zhan and Wang, Yuzhi and Yang, Anqi and Yang, Huazhong},
+  booktitle={2015 International Conference on Wireless Communications \& Signal Processing (WCSP)},
+  pages={1--6},
+  year={2015},
+  organization={IEEE}
+}
+
 
 @article{mohanty2020,
   title={Deep learning with LSTM based distributed data mining model for energy efficient wireless sensor networks},
diff --git a/paper.tex b/paper.tex
index a272ee4..e9ef9c6 100644
--- a/paper.tex
+++ b/paper.tex
@@ -18,7 +18,7 @@
 \begin{document}
 
 
-\title{Anomaly detection in wireless sensor networks: A survey}
+\title{Anomaly Detection in Wireless Sensor Networks: A Survey}
 
 
 \seminar{SVS} % Selbstorganisation in verteilten Systemen
@@ -45,8 +45,8 @@
 
 There are many different approaches to anomaly detection, a common way to classify these is by their place of computation. An approach is considered centralized, when a large chunk of the computation is done at a single point, or at a later stage during analysis. A decentralized approach implies that a considerable amount of processing is done on the individual nodes, doing analysis while being deployed. It is also important to differentiate between online and offline detection. Online detection can run while the WSN is operating, while offline detection is done after the data is collected or during pauses of operation. Online detection often reduces mission duration due to increased power consumption, but can also have the opposite effect, if the analysis done can be used to reduce the amount of communication required for the WSN to function. 
 
-\subsection{Anomaly types}
-We need to clarify the different kinds of anomalies that can occur in WSN data sets. Bosman et al. \cite{bosman2017} proposes four different kinds of anomalies that occur in WSN (c.f. Figure~\ref{fig:noisetypes}):
+\subsection{Anomaly Types}
+We need to clarify the different kinds of anomalies that can occur in WSN data sets. Commonly, four different kinds of anomalies that occur in WSN are considered (cf.\ Figure~\ref{fig:noisetypes}):
 
 \begin{itemize}
   \item \emph{Spikes} are short changes with a large amplitude
@@ -63,11 +63,11 @@ We need to clarify the different kinds of anomalies that can occur in WSN data s
 
 We will first look into sensor self-calibration, which often removes or reduces drift and constant offsets. Then we will look into model based techniques for outlier detection, and then into machine learning based approaches. Outlier detection is able to detect spikes, noise and drift type anomalies, while it has difficulties detecting constant type anomalies. 
 
-A Noise anomaly is not the same as a noisy sensor, working with noisy data is a problem in WSN, but we will not focus on methods of cleaning noisy data, as it is not in the scope of this survey. Elnahrawy et al. \cite{elnahrawy2003} and Barcelo et al. \cite{barcelo2019} are a great places to start, if you are interested in this topic.
+A noise anomaly is not the same as a noisy sensor. Working with noisy data is a problem in WSN, but we will not focus on methods of cleaning noisy data, as it is not in the scope of this survey. Elnahrawy et al. \cite{elnahrawy2003} and Barcelo et al. \cite{barcelo2019} are great places to start a survey in this direction.
 
 A fifth anomaly type, \emph{sensor failure}, is commonly added to anomaly detection \cite{rajasegarar2008,chandola2009}. Since sensor failure often manifests in these four different ways mentioned above, and we are not interested in sensor fault prediction, detection and management here, faulty sensors will not be discussed further. 
  
-\section{Sensor drift and self-calibration}
+\section{Sensor Drift and Self-Calibration}
 Advancements in energy storage density, processing power and sensor availability have increased the possible length of deployment of many WSN. This increase in sensor lifetime, together with an increase in node count due to reduced part cost \cite{wang2016}, as well as the introduction of the Internet of Things (IoT) have brought forth new problems in sensor calibration and drift detection \cite{dehkordi2020}. Increasing the amount of collected data and the length of time over which it is collected introduces a need for better quality control of the sensors that data came from. Ni et al. \cite{ni2009} noticed drift as high as 200\% in soil CO$_2$ sensors, while Buonadonna et al. \cite{buonadonna2005} noticed that his light sensors (which were calibrated to the manufacturer's specification) were performing very poorly when measured against laboratory equipment. It is out of these circumstances, that the need arises for better and more frequent sensor calibration. 
 
 \begin{figure*}[ht]
@@ -88,20 +88,24 @@ The field of self-calibration in WSN quite broad, in order to get an overview ov
   \item \emph{Processing mode} divides the approaches into \emph{centralized} processing, meaining calibration parameters are calculated by a central node and then distributed over the network, and \emph{decentralized}, where a single node, or collection of nodes collaborate to calculate their calibration parameters.
 \end{itemize}
 
-This level of specialization requires it's own survey, which most recently was Barcelo-Ordinas et al. \cite{barcelo2019}. He categorizes 39 approaches into these attributes and discusses them in-depth. We will instead just look at some central problems and ideas to these approaches in detail:
+This level of specialization requires its own survey, which most recently was Barcelo-Ordinas et al. \cite{barcelo2019}. They categorize 39 approaches into these attributes and discuss them in-depth. We will instead just look at some central problems and ideas to these approaches, focusing especially on the aspects around ground truth:
 
-\subsection{Problems in blind self-calibration approaches}
+\subsection{Problems in Blind Self-Calibration Approaches}
 The central problem in self-calibration is predicting the error of a given sensor. Since this is such a broad problem, many different solutions exist. 
 
-Kumar et al. \cite{kumar2013} proposes a solution that uses no ground-truth sensors and can be used online in a distributed fashion. It uses spatial Kriging (gaussian interpolation) and Kalman filtering (a linear approximation model accounting for noise, explained in detail in \ref{sec:kalman}) on neighborhood data in order to reduce noise and remove drift. This solution suffers from accumulative error due to a missing ground truth, as the system has no point of reference or general model to rely on. The uncertainty of the model, and thereby the accumulative error can be reduced by increasing the number of sensors which are used. A common method for gaining more measurements is increasing network density \cite{wang2016}, or switching from a single-sensor approach to sensor fusion. Barcelo-Ordinas et al. \cite{barcelo2018} explores the possibility of adding multiple copies of the same kind of sensor to each node.
+Kumar et al. \cite{kumar2013} propose a solution that uses no ground-truth sensors and can be used online in a distributed fashion. It uses spatial Kriging (Gaussian interpolation) and Kalman filtering (a linear approximation model accounting for noise, explained in detail in Section~\ref{sec:kalman}) on neighborhood data in order to reduce noise and remove drift. This solution suffers from accumulative error due to a missing ground truth, as the system has no point of reference or general model to rely on. The uncertainty of the model, and thereby the accumulative error, can be reduced by increasing the number of sensors which are used. A common method for gaining more measurements is increasing network density \cite{wang2016}, or switching from a single-sensor approach to sensor fusion. Barcelo-Ordinas et al. \cite{barcelo2018} explore the possibility of adding multiple copies of the same kind of sensor to each node. All of these approaches are shown to reduce the accumulative error inherent in blind self-calibration approaches but cannot completely negate it. This is a problem for networks that are planned to operate over a large time span (e.g.\ multiple years). In those cases, non-blind calibration might be a better suited solution.
 
-\subsection{Non-blind self-calibration techniques}
-Non-blind, also known as reference-based calibration approached rely on known-good reference information. They often rely on data from much more expensive sensors, which often come with restrictions on their use. One type of non-blind calibration is done in a laboratory setting (see\cite{ramanathan2006}), a known-good sensor is used with in a controllable environment. Other approaches can calibrate instantly with a calibrated sensor nearby \cite{hasenfratz2012}, enabling calibration of multiple nodes in quick succession.
+\subsection{Non-Blind Self-Calibration Techniques}
+Non-blind, also known as reference-based, calibration approaches rely on known-good reference information. This data is often gathered from much more expensive sensors, which often come with restrictions on their use, e.g.\ local weather stations not reporting continuous data, and not at the exact location of the WSN.
 
-Maag et al. \cite{maag2017} proposes a hybrid solution, where calibrated sensor arrays can be used to calibrate other non-calibrated arrays in a local network of air pollution sensors over multiple hops with minimal accumulative errors. They show 16-60\% lower error rates than other approaches currently in use.
+Another method is simply calibrating the sensors in a laboratory setting (e.g.\ \cite{ramanathan2006}). A known-good sensor is used for calibration within a controlled environment pre- and/or post-deployment as per the manufacturer's specifications and the calibration parameters are applied to the collected data. While this improves the accuracy of the measured data, this is of limited usefulness if live readings from the network need to be accurate.
 
-\subsection{An example for blind calibration} \label{sec:kalman}
-Sirisanwannakul et al \cite{Sirisanwannakul2021} uses a blind centralized approach, where humidity sensors are calibrated using Kalman filtering in combination with a neural network to detect and counteract sensor drift. Kalman filtering consists of two phases, prediction and update. A Kalman filter can, given the previous state of knowledge at step $k-1$ consisting of an estimated system state and uncertainty, calculate a prediction for the next system state and it's uncertainty. This is called the prediction phase. Then,  a new (possibly skewed) measurement is observed and used to compute a prediction of the actual current state and uncertainty. This is called the update phase. The filter is recursive in nature and can be calculated with limited hardware in real-time, making it useful for many different anomaly detection applications.
+An approach by Hasenfratz et al. \cite{hasenfratz2012} can calibrate low-cost gas sensors instantly with a calibrated sensor nearby, enabling calibration in the field without the need for a controlled environment or laboratory setting. This of course comes with a tradeoff in accuracy, but they show that the calibration is as good as or better than the manufacturer's. An ozone sensor calibrated using this scheme is only off by $\pm 2$\,ppb (parts per billion) when compared to a calibrated ozone sensor, despite the manufacturer's claimed accuracy of $\pm 20$\,ppb. While these results are remarkable, it is not always feasible to visit every sensor in a WSN.
+
+Maag et al. \cite{maag2017} propose a solution to this problem. They formulate a hybrid solution, where calibrated sensor arrays can be used to calibrate other non-calibrated arrays in a local network of air pollution sensors over multiple hops with minimal accumulative errors. They show 16--60\% lower error rates than other iterative approaches currently in use.
+
+\subsection{An Example for Blind Calibration} \label{sec:kalman}
+Sirisanwannakul et al. \cite{Sirisanwannakul2021} use a blind centralized approach, where humidity sensors are calibrated using Kalman filtering in combination with a neural network to detect and counteract sensor drift. Kalman filtering consists of two phases, prediction and update. A Kalman filter can, given the previous state of knowledge at step $k-1$ consisting of an estimated system state and uncertainty, calculate a prediction for the next system state and its uncertainty. This is called the prediction phase. Then, a new (possibly skewed) measurement is observed and used to compute a prediction of the actual current state and uncertainty. This is called the update phase. The filter is recursive in nature and can be calculated with limited hardware in real-time, making it useful for many different anomaly detection applications.
 
 Kalman filters are based on a linear dynamical system on a discrete time domain. It represents the system state as vectors and matrices of real numbers. In order to use Kalman filters, the observed process must be modeled in a specific structure:
 
@@ -147,33 +151,35 @@ Sirisanwannakul et al. takes the computed Kalman gain and compares its bias. In
 \section{Outlier detection - model-based approaches}
 A centralized WSN is defined by the existence of a central entity, called the \emph{base station} or \emph{fusion centre}, where all data is delivered to and analyzed. It is often assumed, that the base station does not have limits on its processing power or storage. Centralized approaches are not optimal in hostile environments, but that is not our focus here. Since central anomaly detection is closely related to the general field of anomaly detection, we will not go into much detail on these solution, instead focusing on covering solutions more specific to the field of WSN.
 
-\subsection{Statistical analysis}
+\subsection{Statistical Analysis}
 Classical Statistical analysis is done by creating a model of the expected data and then finding the probability for each recorded data point. Improbable data points are then deemed outliers. The problem for many statistical approaches is finding this model of the expected data, as it is not always feasible to create it in advance. It also bears the problem of bad models or slow changes in the environment \cite{mcdonald2013}.
 
-Sheng et al. \cite{sheng2007} proposes a new approach, where histograms of each nodes sensors data are polled, combined, and then analyzed for outliers by looking at the maximum distance a data point can be away from his nearest neighbors. This solution has several problems, as it incurs a considerable communication overhead and fails to account for non gaussian distribution. Since the this approach uses fixed parameters, it also requires updating them every time the expected data changes. 
+Sheng et al. \cite{sheng2007} propose an approach to global outlier detection, meaning a data point is only regarded as an outlier if its value differs significantly from all values collected over a given time, not just from local sensors near the measured one. They propose that the base station requests bucketed histograms of each node's sensor data distribution to reduce the data transmitted. These histograms are polled, combined, and then used to analyze outliers by looking at the maximum distance a data point can be away from its nearest neighbors. This method bears some problems, as it fails to account for non-Gaussian distributions. Another problem is the use of fixed parameters for outlier detection, requiring prior knowledge of the data collected and anomaly density. These fixed parameters also require an update whenever the expected data changes. Due to the histograms used, this method cannot be used in a shifting network topology.
 
-Böhm et al. \cite{böhm2008} proposes a solution not only to non gaussian distributions, but also to noisy data. They define a general probability distribution function (PDF) with an exponential distribution function (EDF) as a basis, which is better suited to fitting around non gaussian data as seen in Figure~\ref{fig:probdistböhm}. He then outlines an algorithm where the data is split into clusters, for each cluster an EDF is fitted and outliers are discarded.
+Böhm et al. \cite{böhm2008} propose a solution not only to non-Gaussian distributions, but also to noisy data. They define a general probability distribution function (PDF) with an exponential distribution function (EDF) as a basis, which is better suited to fitting around non-Gaussian data as seen in Figure~\ref{fig:probdistböhm}. They then outline an algorithm where the data is split into clusters, for each cluster an EDF is fitted and outliers are discarded. This method does not require any prior parametrization and is therefore more robust to configuration error.
+
+Since this process not only detects outliers, but does a complete clustering of the given data, it is computationally much more expensive than other methods for detecting outliers. However, since this is a complete clustering algorithm, it can be used in offline analysis for clustering and will produce good results quicker than PCA or similar algorithms. Outlier detection is more a byproduct of clustering, than the end result.
 
 \begin{figure}
   \includegraphics[width=8.5cm]{img/probability-dist-böhm.png}
-  \caption{Difference of fitting a gaussian probability PDF and a customized exponential PDF. Image from \cite{böhm2008}.}
+  \caption{Difference of fitting a Gaussian PDF and a customized exponential PDF. Image from \cite{böhm2008}.}
   \label{fig:probdistböhm}
 \end{figure}
 
-\subsection{Density based analysis}
-Outliers can be selected by looking at the density of points as well. Breuning et al. \cite{breuning2000} proposes a method of calculating a local outlier factor (LOF) of each point based on the local density of its $n$ nearest neighbors. The problem lies in selecting good values for $n$. If $n$ is too small, clusters of outliers might not be detected, while a large $n$ might mark points as outliers, even if they are in a large cluster of less than $n$ points. This problem is further exasperated when we try to use this in a WSN setting, for example by streaming through the last $k$ points, as cluster size will not stay constant as incoming data might be delayed or lost in transit.
+\subsection{Density Based Analysis}
+Outliers can be selected by looking at the density of points as well. Breunig et al. \cite{breuning2000} propose a method of calculating a local outlier factor (LOF) of each point based on the local density of its $n$ nearest neighbors. The problem lies in selecting good values for $n$. If $n$ is too small, clusters of outliers might not be detected, while a large $n$ might mark points as outliers, even if they are in a large cluster of fewer than $n$ points. This problem is further exacerbated when we try to use this in a WSN setting, for example by streaming through the last $k$ points, as cluster size will not stay constant as incoming data might be delayed or lost in transit.
 
-Papadimitriou et al. \cite{papadimitriou2003} introduces a parameterless approach. They formulate a method using a local correlation integral (LOCI), which does not require parametrization. It uses a multi-granularity deviation factor (MDEF), which is the relative deviation for a point $p$ in a radius $r$. The MDEF is simply the number of nodes in an $r$-neighborhood divided by the sum of all points in the same neighborhood. LOCI provides an automated way to select good parameters for the MDEF and can detect outliers and outlier-clusters with comparable performance to other statistical approaches. They also formulate aLOCI, a linear approximation of LOCI, which also gives accurate results while reducing runtime.
+Papadimitriou et al. \cite{papadimitriou2003} introduce a parameterless approach. They formulate a method using a local correlation integral (LOCI), which does not require parametrization. It uses a multi-granularity deviation factor (MDEF), which is the relative deviation for a point $p$ in a radius $r$. The MDEF is simply the number of nodes in an $r$-neighborhood divided by the sum of all points in the same neighborhood. LOCI provides an automated way to select good parameters for the MDEF and can detect outliers and outlier-clusters with comparable performance to other statistical approaches. They also formulate aLOCI, a linear approximation of LOCI, which also gives accurate results while reducing runtime. This approach can be used centralized, decentralized or clustered, depending on the scale of the event of interest. aLOCI seems great for even running on the sensor nodes themselves, as it has relatively low computational complexity.
 
 
-\subsection{Principal component analysis}
+\subsection{Principal Component Analysis}
 Principal components of a point cloud in $\R^n$ are $n$ vectors $p_i$, where $p_i$ defines a line with minimal average square distance to the point cloud while lying orthogonal to all $p_j, j<i$. These $p_i$ define an orthogonal basis of $\R^n$. The length of each $p_i$ is directly proportionate to the variance of the data in that direction. Principal Component Analysis (PCA) uses these $p_i$ to perform a change of basis of each given data point. The most common algorithm to perform PCA relies on centering the data set around the mean and then finding the eigenvectors of the covariance matrix of the point cloud \cite{jolliffee2002, macua2010}.
 
 When using $\{p_1, \dots, p_k\}, k < n$ as the new orthogonal basis, the dimensional complexity can be reduced from $n$ to $k$ while retaining as much data as possible, as the dimensions with the lowest variance are discarded. PCA is rather complex, given a data matrix $X_{[n\times j]}$ ($j$ collections of $n$ measurements), the complexity is $\mathcal{O}(n^3)$, meaning it grows cubic with the number of measured attributes \cite{yu2017}. Most of this complexity stems from the eigenvalue decomposition used in PCA.
 
-Chan et al. \cite{chan2012} proposes a solution to this problem, he develops two methods to approximate the eigenvalue decomposition by updating the state recursively and reusing large parts of the already done calculation, which reduces the computational complexity. They simulate this algorithm on existing data sets and find it outperforms existing PCA based solutions such as \cite{li2000, tien2004}. 
+Chan et al. \cite{chan2012} propose a solution to this problem: they develop two methods to approximate the eigenvalue decomposition by updating the state recursively and reusing large parts of the already done calculation, which reduces the computational complexity. They simulate this algorithm on existing data sets and find it outperforms existing PCA based solutions such as \cite{li2000, tien2004}.
 
-Yu et al. \cite{yu2017} recognizes that this solution is performs well, but is to expensive to run on each individual node in a network. They propose a clustered and iterative way of doing PCA that reduces the complexity on each cluster head down to $\Oc(n^2t)$ where $t$ is recursion depth. He proposes clustering the nodes into groups with cluster heads which have more processing power. The leaf nodes send their samples to the cluster head, which then reorganizes and splits the sensor data, and after an initial PCA, can update his measured principal components and covariance matrices more efficiently. During this process, outliers are can be identified with relative ease using the known covariance of the data and the calculated principal components. Furthermore PCA is used to decrease the dimensional complexity of the sensor data. This compressed data is transmitted to the base station, together with the principal component vectors and covariance matrix. This allows for later reconstruction of data with high accuracy, with errors usually below 1\%, while reducing the amount of information send. 
+Yu et al. \cite{yu2017} recognize that this solution performs well, but is too expensive to run on each individual node in a network. They propose a clustered and iterative way of doing PCA that reduces the complexity on each cluster head down to $\Oc(n^2t)$ where $t$ is recursion depth. They propose clustering the nodes into groups with cluster heads which have more processing power. The leaf nodes send their samples to the cluster head, which then reorganizes and splits the sensor data, and after an initial PCA, can update its measured principal components and covariance matrices more efficiently. During this process, outliers can be identified with relative ease using the known covariance of the data and the calculated principal components. Furthermore, PCA is used to decrease the dimensional complexity of the sensor data. This compressed data is transmitted to the base station, together with the principal component vectors and covariance matrix. This allows for later reconstruction of data with high accuracy, with errors usually below 1\%, while reducing the amount of information sent.
 
 
 Macua et al. \cite{macua2010} propose a truly decentralized approach: Using consensus algorithms to calculate the sample mean, and then approximating the global data covariance matrix. Once a good enough approximation is found, each node can do PCA individually. This approach is not suited for deployment in low-power WSN, as it incurs considerable cost in forms of communication and especially processing power required.
@@ -181,42 +187,49 @@ Macua et al. \cite{macua2010} propose a truly decentralized approach: Using cons
 
 
 
-\section{Outlier detection - machine learning approaches}
+\section{Outlier Detection -- Machine Learning Approaches}
 Most machine learning approaches focus on outlier detection, which is a common problem in WSN, as an outlier is inherently an anomaly. Outlier detection is largely unable to detect drift and has difficulties wih noise, but excels at detecting data points or groups which appear to be inconsistent with the other data (spikes, noise, sometimes drift). A common problem is finding outliers in data with an inherently complex structure.
 
 Supervised learning is the process of training a neural network on a set of labeled data. Acquiring labeled data sets that are applicable to the given situation is often difficult, as it requires the existence of another classification method, or labeling by hand. Furthermore, even if a data set would exist, the class imbalance (total number of positive labels vs number of negative labels) would render such training data sub-optimal. And lastly, the data generated by a WSN might change over time without being anomalous, requiring frequent retraining \cite{ramotsoela2018}. Out of these circumstances arises the need for unsupervised or semi-supervised anomaly detection methods.
 
 We will look into a couple different approaches to outlier detection:
 
-\subsection{Support vector machines (SVMs)}
-Rajasegarar et al. \cite{rajasegarar2010} uses SVMs, which leverage a kernel function to map the input space to a higher dimensional feature space. This allows the SVM to then model highly nonlinear patterns of normal behavior in a flexible manner. This means, that patterns that are difficult to classify in the problem space, become more easily recognizable and therefore classifiable in the feature space. Once the data is mapped into the feature space, hyperelipsoids or other shapes are fitted to the data points to define regions of the feature space that classify the data as normal.
+\subsection{Support Vector Machines (SVMs)} \label{cap:svm}
+SVMs leverage a kernel function to map the input space to a higher-dimensional feature space. This allows the modeling of highly nonlinear patterns of normal behavior in a flexible manner. This means that patterns that are difficult to classify in the problem space become more easily recognizable and therefore classifiable in the feature space. Once the data is mapped into the feature space, hyperellipsoids or other shapes are fitted to the data points to define regions of the feature space that classify the data as normal or anomalous.
 
-While this approach works well to find outliers in the data, it is also computationally expensive and incurs a large communication overhead. In an attempt to decrease computational complexity, only a single hyperelipsoid is fitted to the data set. This method is called a one-class support vector machine. Originally Wang et al. \cite{wang2006} created a model of a one-class SVM (OCSVM), however the solution required the solution of a computationally complex second-order cone programming problem, making it unusable for distributed usage. Rajasegarar et al. \cite{rajasegarar2007, rajasegarar2010} improved on this OCSVM in a couple of ways.
+While this approach works well to find outliers in the data, it is also computationally expensive and incurs a large communication overhead. In an attempt to decrease computational complexity, only a single hyperellipsoid is fitted to the data set. This method is called a one-class support vector machine. Originally, Wang et al. \cite{wang2006} created a model of a one-class SVM (OCSVM); however, it required the solution of a computationally complex second-order cone programming problem, making it unusable for distributed usage. Rajasegarar et al. \cite{rajasegarar2007, rajasegarar2010} improved on this OCSVM in a couple of ways.
 
 They used the fact, that they could normalize numerical input data to lay in the vicinity of the origin inside the feature space, and furthermore the results of Laskov et al. \cite{laskov2004} which showed, that normalized numerical data is one-sided, always lying in the positive quadrants. This lead to the formulation of a centered-hyperelipsoidal SVM (CESVM) model, which vastly reduces computational complexity to a linear problem. Furthermore they introduce a one-class quarter-sphere SVM (QSSVM) which reduced the communication overhead. They conclude however, that the technique ist still unfit for decentralized use because of the large remaining communication overhead, as a consensus for the radiuses and other parameters is still required.
 
-The QSSVM was improved in 2012 by Shahid et al. \cite{shahid2012a, shahid2012b}, proposing three schemes that reduce communication overhead while maintaining detection performance. His propositions make use of the spatio-temporal \& attribute (STA) correlations in the measured data. These propositions accept worse consensus about the placement of the hypersphere among neighboring nodes in order to reduce the communication overhead. He then shows, that his approaches are comparable in performance to the QSSVM proposed by Rajasegarar et al. if the data correlates well enough inside each neighborhood. It is important to note, that this neighborhood information does not rely on nodes being stationary and is therefore usable in a shifting network topology.
+The QSSVM was further improved in 2012 by Shahid et al. \cite{shahid2012a, shahid2012b}, who proposed three schemes that reduce communication overhead while maintaining detection performance. Their propositions make use of the spatio-temporal and attribute (STA) correlations in the measured data. These propositions accept worse consensus about the placement of the hypersphere among neighboring nodes in order to reduce the communication overhead. They then show that these approaches are comparable in performance to the QSSVM proposed by Rajasegarar et al. if the data correlates well enough inside each neighborhood. It is important to note that this neighborhood information does not rely on nodes being stationary and is therefore usable in a shifting network topology.
 
 \subsection{Generalized Hebbian Algorithm}
-Ali et al. \cite{ali2015} proposes an approach to detect and identify events using Generalized Hebbian Algorithm (GHA). Event detection is important in anomaly detection, but event identification is almost equally as important, especially when a sensor network is used to detect an event spanning multiple nodes. They propose a combined algorithm to detect, identify and communicate events in a WSN to detect local and global events. This is achieved by calculating identification ratios, i.e. the percentage each attribute contributed to the event, before broadcasting the detected event. 
+Ali et al. \cite{ali2015} propose an approach to detect and identify events using the Generalized Hebbian Algorithm (GHA). Event detection is important in anomaly detection, but event identification is almost equally important, especially when a sensor network is used to detect an event spanning multiple nodes. They propose a combined algorithm to detect, identify and communicate events in a WSN to detect local and global events. This is achieved by calculating identification ratios, i.e. the percentage each attribute contributed to the event, before broadcasting the detected event.
 
-They start off with an outlier detection scheme using hyper-ellipsoids fitted around 98\% of their data points to detect outliers, using an iterative boundary estimation model based on the model formulated by by Moshtaghi et al. \cite{moshtaghi2011} called Forgetting Factor Iterative Data Capture Anomaly Detection (FFIDCAD). It can compute multidimensional boundaries of of the local model online in an iterative fashion, reducing the amount of required computation immensely, while also working in non-stationary environments and changing network topology due to its forgetting factor. A local event is declared, after observing more than $q$ outliers in a row, where $q$ is chosen depending on sampling rate and required temporal resolution.
+They start off with an outlier detection scheme using hyper-ellipsoids fitted around 98\% of their data points to detect outliers, using an iterative boundary estimation model based on the model formulated by Moshtaghi et al. \cite{moshtaghi2011} called Forgetting Factor Iterative Data Capture Anomaly Detection (FFIDCAD). It can compute multidimensional boundaries of the local model online in an iterative fashion, reducing the amount of required computation immensely, while also working in non-stationary environments and changing network topology due to the forgetting factor. A local event is declared after observing more than $q$ outliers in a row, where $q$ is chosen depending on sampling rate and required temporal resolution.
 
-Once an event is detected, Ali et al. proposes using a Generalized Hebbian Algorithm (GHA) to replace the Eigenvalue Decomposition (EVD) commonly used in offline identification schemes such as PCA. EVD requires large batches of measurements to accurately compute principal components, while GHA can work online in a streaming fashion. They further show, that their online GHA bases approach has similar accuracy to offline EVD based techniques, while vastly reducing computational complexity. Once the eigenvectors are calculated, the last measurement is projected onto the calculated eigenvectors and whitened, creating a vector containing the identification ratios for each attribute.
+Once an event is detected, Ali et al. propose using a Generalized Hebbian Algorithm (GHA) to replace the Eigenvalue Decomposition (EVD) commonly used in offline identification schemes such as PCA. EVD requires large batches of measurements to accurately compute principal components, while GHA can work online in a streaming fashion. They further show that their online GHA-based approach has similar accuracy to offline EVD-based techniques, while vastly reducing computational complexity. Once the eigenvectors are calculated, the last measurement is projected onto the calculated eigenvectors and whitened, creating a vector containing the identification ratios for each attribute.
 
-Ali et al. claim that their algorithm has complexity of $\mathcal{O}(nd^2)$, compared to $\mathcal{O}(n^2+nd^2)$ of common SVM based approaches \cite{shahid2012a,shahid2012b}. Here $n$ is the number of measurements and $d$ is the number of attributes. Furthermore, due to the online nature of this approach, communication overhead is much lower, as only detected local events have to be broadcast, instead of the ongoing exchange of support vectors that have to be broadcast in the aforementioned SVM approaches.
+Ali et al. claim that their algorithm has a complexity of $\mathcal{O}(nd^2)$, compared to $\mathcal{O}(n^2+nd^2)$ of common SVM-based approaches \cite{shahid2012a,shahid2012b}. Here $n$ is the number of measurements and $d$ is the number of attributes. Furthermore, due to the online nature of this approach, communication overhead is much lower, as only detected local events have to be broadcast, instead of the ongoing exchange of support vectors that have to be broadcast in the SVM approaches mentioned in Section~\ref{cap:svm}.
 
 
-\subsection{Extreme learning}
-When working decentralized in an environment, where data is funneled into sinks, it is still possible to obtain additional data without additional overhead just by listening to other nodes broadcasts. This data can be fed into various prediction models-
+\subsection{Extreme Learning}
+When working decentralized in an environment where data is funneled into sinks, it is still possible for nodes to obtain additional data without additional overhead just by listening to other nodes' broadcasts.
 
-Bosman et al. \cite{bosman2017} looks at the performance of recursive last squares (RLS) and the online sequential extreme learning machine (OS-ELM) approach to train a single-layer feed-forward neural network (SLFN). These are compared to first degree polynomial function approximation (FA) and sliding window mean prediction. The article shows, that incorporation neighborhood information improves anomaly detection only in cases where the data set is well-correlated and shows low spatial entropy, as is common in most natural monitoring applications. When the data set does not correlate well, or there is too much spatial entropy, the methods described in this paper fail to predict anomalies. It concludes, that neighborhood aggregation is not useful beyond 5 neighbors, as such a large data set will fail to meet the aforementioned conditions. The exact size of the optimal neighborhood will vary with network topology and sensor modality.
+Bosman et al. \cite{bosman2017,bosman2013} look at the performance of recursive least squares (RLS) and the online sequential extreme learning machine (OS-ELM) approach to train a single-layer feed-forward neural network (SLFN). These are compared to first degree polynomial function approximation (FA) and sliding window mean prediction. They show that incorporating neighborhood information improves anomaly detection only in cases where the data set is well-correlated and shows low spatial entropy, as is common in most natural monitoring applications. When the data set does not correlate well, or there is too much spatial entropy, the methods they describe fail to predict anomalies. They conclude that neighborhood aggregation is not useful beyond 5 neighbors, as such a large data set will fail to meet the aforementioned conditions. The exact size of the optimal neighborhood will vary with network topology and sensor modality.
 
-Here, all four types of anomalies were accounted for in the data set, but there was no analysis, how good the detection was for each kind of anomaly.
+Here, all four types of anomalies were accounted for in the data set, but there was no analysis of how good the detection was for each kind of anomaly.
 
 
 
-\subsection{Deep learning approaches}
+\begin{figure}
+  \includegraphics[width=8.5cm]{img/lstm_pump_predictions.png}
+  \caption{LSTM prediction results of water pump sensor data from Zhang et al. \cite{zhang2018}}
+  \label{fig:zhangpump}
+\end{figure}
+
+
+\subsection{Deep Learning Approaches}
 Deep learning techniques for solving anomaly detection in WSN aim at solving a slightly different problem than other methods mentioned thus far. As the amount of data increases that WSN produce, either by increasing node count, sensor count, or adding high output sensors such as cameras, traditional outlier detection algorithms might not be capable of keeping up \cite{chalapathy2019}. 
 
 In such environments, the analysis part is often moved to the cloud \cite{yu2017}, removing some of the restrictions originally introduced by WSN. While this paper will not discuss topics such as image recognition or anomaly detection in video \cite{kiran2018}, we will highlight some interesting results using deep neural networks to predict or detect anomalies in neural networks.
@@ -225,41 +238,53 @@ Zhang et al. \cite{zhang2018} uses LSTM neural networks to analyze and predict w
 
 They found, that they can not only predict future sensor measurements with high accuracy (root mean square error below $0.01$, even for complex sensor patterns) but can also identify and to en extend predict failures with their model (Figure~\ref{fig:zhangpump}).
 
-\begin{figure}
-  \includegraphics[width=8.5cm]{img/lstm_pump_predictions.png}
-  \caption{LSTM prediction results of water pump sensor data from Zhang et al. \cite{zhang2018}}
-  \label{fig:zhangpump}
-\end{figure}
+Kakanakova et al. \cite{kakanakova2017} look at a more generalized form of outlier detection using deep neural networks called Deep Belief Networks (DBN). DBN consist of a composition of so-called Restricted Boltzmann Machines (RBM), where the output of each RBM serves as the input for the next. The input of the first RBM serves as the input of the DBN, and the last RBM's output is the output of the whole DBN.
+
+An RBM is a graph of nodes connected by weights, consisting of two types of nodes, visible and hidden nodes. Weighted connections only span from hidden to visible nodes. The RBM has 
+an input node for each dimension of the input vector, plus two nodes for outlier flags and bias. During training of an RBM, the weights of the connections and values of the hidden nodes are changed to best fit the training data. 
+
+Training a DBN is done by training the first RBM, freezing its weights and using the values of the hidden nodes as inputs for the next RBM. Kakanakova et al. prove that this type of Deep Neural Network can learn behavior that is too complex even for SVM approaches, and show that DBN outperform SVM approaches on their synthetic data sets. They note that while a DBN can outperform these other methods in complex tasks, DBN are not suited for simpler problems, as training becomes less effective with lower complexity problems.
+
+
 
-\begin{table*}[ht]
+\section{Conclusion}
+
+\begin{table*}[h!]
   \begin{adjustbox}{max width=\textwidth}
     \begin{tabular}{ccccccccccccc}
-        Reference & Online/Offline & Centralized/Decentralized & Required topology & Communication & Recalibration & Basis \\ \toprule
-        \cite{Sirisanwannakul2021} & Online & Centralized & Static & Low & No & Kalman filter \\
-        \cite{sheng2007} & Online & Centralized & Any & High & No & Statistical analysis \\
-        \cite{böhm2008} & Online & Centralized & Any & Normal & No & Statistical analysis \\
-        \cite{breuning2000} & Online & Centralized & Static & Normal & Yes & Density \\
-        \cite{papadimitriou2003} & Online & Centralized & Any & Normal & No & Density \\
-        \cite{chan2012} & Online & Decentralized & Static & Low & No & PCA \\
-        \cite{yu2017} & Online & Clustered & Static & Low & No & PCA \\
-        \cite{macua2010} & Online & Decentralized & Any & High & No & Distributed PCA \\
-        \cite{rajasegarar2010} & Online & Decentralized & Any & Prohibitive & No & SVM \\
-        \cite{shahid2012b} & Online & Decentralized & Any & High/Normal & No & SVM   \\
-        \cite{ali2015} & Online & Decentralized & Any & Low & No & GHA \\
-        \cite{bosman2017} & Online & Decentralized & Any & Normal & No & OS-ELM \\
-        \cite{zhang2018} & Online & Centralized & Static & Normal & Yes & LSTM \\ \hline
+        Reference & Prior knowledge & Centralized/Decentralized & Required topology & Communication & Recalibration & Basis \\ \toprule 
+        \cite{Sirisanwannakul2021} & No & Centralized & Static & Low & No & Kalman filter \\
+        \cite{sheng2007} & Yes & Centralized & Any & Low & No & Statistical analysis \\
+        \cite{böhm2008} & Yes & Centralized & Any & Normal & No & Statistical analysis \\
+        \cite{breuning2000} & Yes & Centralized & Static & Normal & Yes & Density \\
+        \cite{papadimitriou2003} & No & Any & Any & Normal/Low & No & Density \\
+        \cite{chan2012} & No & Decentralized & Static & Low & No & PCA \\
+        \cite{yu2017} & No & Clustered & Static & Low & No & PCA \\
+        \cite{macua2010} & No & Decentralized & Any & High & No & Distributed PCA \\
+        \cite{wang2006} & Training & Decentralized & Any & Prohibitive & No & SVM \\
+        \cite{rajasegarar2010} & Training & Decentralized & Any & Prohibitive & No & SVM \\
+        \cite{shahid2012b} & Training & Decentralized & Any & High/Normal & No & SVM   \\
+        \cite{ali2015} & No & Decentralized & Any & Low & No & GHA \\
+        \cite{bosman2017} & No & Decentralized & Any & Normal & No & OS-ELM \\
+        \cite{zhang2018} & Training & Centralized & Static & Normal & Yes & LSTM \\ 
+        \cite{kakanakova2017} & Training & Centralized & Any & Normal & Yes & DBN \\
+        \hline
     \end{tabular}
   \end{adjustbox}
-  \caption{A comparison of approaches investigated in this survey. The column "Recalibration" indicates, if the model used requires recalibration or retraining upon a change in the environment. }
+  \caption{A comparison of approaches investigated in this survey. The column ``Recalibration'' indicates whether the model used requires recalibration or retraining upon a change in the environment. Communication cost is compared to ``normal'' behavior, where all data is transmitted to a base station. ``Low'' implies a reduction in communication, while ``Prohibitive'' marks approaches that require more communication than is feasible in most WSN.}
+
   \label{tbl:comparison}
 \end{table*}
 
-\section{Non-stationary data} \cite{oreilly2014}
+Anomaly detection in WSN is a relatively new addition to the general field of anomaly detection, but has already become a rather complex landscape of solutions, as many experts in their respective fields have used their knowledge to find solutions to these new problems. This survey attempts to capture this diversity in methods and introduces many fundamentally different approaches. In order to organize approaches, we first defined the four anomaly types that are expected in WSN, and then looked at methods that detect or remove these. 
 
-\section{Conclusion}
-Anomaly detection in WSN is a relatively new addition to the general field of anomaly detection, but has already become a rather complex landscape of solutions, as many experts in their respective fields have used their knowledge to find solutions to these new problems. This survey attempts to capture this diversity in methods and introduces many fundamentally different approaches. In order to organize approaches, we first defined the four anomaly types that are expected in WSNs, and then looked at methods that detect or remove these. 
+First, we looked at solutions for sensor drift and offset and found that while sensor calibration is an important step in preventing these, non-blind calibration adds a considerable amount of work in either extrapolating results into the WSN, or calibrating by using another high-quality sensor that often needs to be brought into close proximity of the sensor. We looked into a real-world application of blind sensor calibration and confirmed the problems with this approach, as accumulative errors cannot be corrected without ground truth.
+
+Then we looked at different ways of outlier detection using statistical or density-based approaches in a centralized manner, followed by a large number of decentralized approaches, using methods like PCA, SVM, GHA and ELM. We saw that SVMs are a great solution to outlier detection, due to their ability to model highly nonlinear but normal behavior, but require a lot more communication than other approaches such as PCA or GHA, while not performing much better in most cases. We saw how neighborhood data can be used to detect local anomalies using ELM, with performance directly proportional to the correlation inside the neighborhood.
+
+Finally, we took a look at some deep learning approaches, and the challenges that come with them. We saw great performance of LSTM- and DBN-based approaches and understand that their application is limited in conventional, low-power WSN.
 
-At first we looked at solutions for sensor drift and offset, and found, that while sensor calibration is an important step in preventing these, calibration in the field is often not feasible due to missing ground truth. We then looked at some other ways to compensate for sensor drift in data sets.
+These findings are summarized in Table~\ref{tbl:comparison}.
 
 
 
diff --git a/todo.md b/todo.md
index 3a25c00..af2d731 100644
--- a/todo.md
+++ b/todo.md
@@ -4,22 +4,16 @@
 
 * drift detection
   * maybe some more
-* extreme learning https://www.researchgate.net/profile/Giovanni-Iacca/publication/262274757_Online_Extreme_Learning_on_Fixed-Point_Sensor_Networks/links/53df8fbb0cf2aede4b490cb3/Online-Extreme-Learning-on-Fixed-Point-Sensor-Networks.pdf
-* non stationary data
+
 * more statistical methods
-* capitalization in subsectio headers
-* as it's => as it is
-* HE => THEY
-* schreiben warum non-blind calibration scheiße ist
-* the paper => they
-* Chan et al. \cite{chan2012} proposes a solution to this problem, he develops two methods to approximate the eigenvalue decomposition by updating the state recursively and reusing large parts of the already done calculation, which reduces the computational complexity. They simulate this algorithm on existing data sets and find it outperforms existing PCA based solutions such as \cite{li2000, tien2004}. 
-* GHA fehlt ergebnis
+
 * extreme learning erster absatz ist kaputt
 * extreme learning erweitern
-* SVM rajesagrar cites are weird
-* https://scihubtw.tw/10.1145/3134302.3134337
+* https://arxiv.org/pdf/1408.5823.pdf maybe improved D-PCA
 
 https://netlibrary.aau.at/obvuklhs/content/titleinfo/5395523/full.pdf
+
+https://scihubtw.tw/10.1109/WCSP.2015.7341138
 ## proposed structure
 
 * introduction