diff --git a/References.bib b/References.bib index e0bba1d..f3dabec 100644 --- a/References.bib +++ b/References.bib @@ -454,3 +454,34 @@ isbn="978-3-030-68133-3" year={2011}, publisher={Elsevier} } +@article{zhang2006detecting, + title={Detecting outlying subspaces for high-dimensional data: the new task, algorithms, and performance}, + author={Zhang, Ji and Wang, Hai}, + journal={Knowledge and Information Systems}, + volume={10}, + number={3}, + pages={333--355}, + year={2006}, + publisher={Springer} +} + +@article{huang2011extreme, + title={Extreme learning machine for regression and multiclass classification}, + author={Huang, Guang-Bin and Zhou, Hongming and Ding, Xiaojian and Zhang, Rui}, + journal={IEEE Transactions on Systems, Man, and Cybernetics, Part B (Cybernetics)}, + volume={42}, + number={2}, + pages={513--529}, + year={2011}, + publisher={IEEE} +} +@article{huang2015extreme, + title={What are extreme learning machines? Filling the gap between {Frank Rosenblatt}'s dream and {John von Neumann}'s puzzle}, + author={Huang, Guang-Bin}, + journal={Cognitive Computation}, + volume={7}, + number={3}, + pages={263--278}, + year={2015}, + publisher={Springer} +} diff --git a/img/density-problem.png b/img/density-problem.png new file mode 100644 index 0000000..c62530b Binary files /dev/null and b/img/density-problem.png differ diff --git a/img/pattern-anomaly.png b/img/pattern-anomaly.png index caf6b26..3acdb1f 100644 Binary files a/img/pattern-anomaly.png and b/img/pattern-anomaly.png differ diff --git a/paper.tex b/paper.tex index be90edb..5ef44c4 100644 --- a/paper.tex +++ b/paper.tex @@ -47,7 +47,7 @@ A Wireless Sensor Network (WSN) is commonly defined as a collection of battery p The power required to transmit data is often the largest contributing factor to the lifetime of each node, as it drains the battery \cite{sheng2007}. 
Especially if the network collects large amounts of data, or spans large areas, a lot of energy can be saved by reducing the number and size of the transmissions. An Ideal solution would be to not send the unimportant data at all, thus arises the need for anomaly detection in WSNs, enabling nodes to identify important data themselves. This is however not the only factor why anomaly detection is interesting. Some WSN are deployed to detect phenomena such as forest fires \cite{hefeeda2007wireless}, or monitor active volcanos \cite{werner2006deploying}. In these cases, anomaly detection is not only used to limit the required communication, but also to fulfill the core purpose of the network. -Not all approaches to anomaly detection in WSN are able to run directly on the node, therefore this survey will differentiate between \emph{decentralized} (algorithms running directly on the node) and \emph{centralized} (running at a central location) methods. It's not always beneficial to have a decentralized approach, as some networks are less restricted by their energy (for example by having a power supply) and would rather use greater computational power and a complete set of data (meaning data from all sensors, not just ones in a local area) to improve their detection and/or prediction accuracy. This is often encountered in industrial settings \cite{ramotsoela2018}. +Not all approaches to anomaly detection in WSN are able to run directly on the node, therefore this survey will differentiate between \emph{decentralized} (algorithms running directly on the node) and \emph{centralized} (running at a central location) methods. 
It's not always beneficial to have a decentralized approach, as some networks are less restricted by their energy (for example by having a power supply or being frequently serviced by personnel) and would rather use greater computational power and a complete set of data (meaning data from all sensors, not just ones in a local area) to improve their detection and/or prediction accuracy. This is often encountered in industrial settings \cite{ramotsoela2018}. Another factor for these models is the network topology. In a non-static WSN, a model using neighborhood information has to account for changes in the network topology surrounding it, as the number of neighbors changes, or the data they measured previously is not actually belonging to the current neighborhood. If the node keeps track of previous measurements, it also needs to take into account how it's changes in position might influence the measured data. @@ -73,13 +73,13 @@ In the general field of anomaly detection, more advanced definitions of anomalie \begin{figure} \includegraphics[width=8.5cm]{img/anomaly_types.png} - \caption{Spike, noise, constant and drift type anomalies in noisy linear data, image from Bosmal et al. \cite{bosman2013}} + \caption{Spike, noise, constant and drift type anomalies in noisy linear data, image from Bosman et al. \cite{bosman2013}.} \label{fig:noisetypes} \end{figure} \begin{figure} \includegraphics[width=8.5cm]{img/pattern-anomaly.png} - \caption{Pattern based anomaly corresponding to an Atrial Premature Contraction in an electrodiagram, image from Chandola et al. \cite{chandola2009}} + \caption{Pattern based anomaly corresponding to an Atrial Premature Contraction in an electrocardiogram, image from Chandola et al. 
\cite{chandola2009}.} \label{fig:patternanomaly} \end{figure} @@ -101,29 +101,24 @@ The problem of outlier detection in WSNs is the creation of a model which can us After the introduction and coverage of related work, we will look into sensor self-calibration, a method of improving sensor accuracy. Calibrating a sensor will remove constant offsets, enabling nodes to compare measurements between one another more easily. If a sensor is in use for a prolonged length of time, it might needs to be recalibrated, to remove sensor drift. -Then we will look into conventional, model based approaches to outlier detection, such as statistical, or density based models, followed by the more recent machine learning based models. Finally, all presented models are summarized in a table and evaluated based on their properties and requirements. +Afterwards we will look into a collection of different outlier detection methods, ranging from statistical methods to machine learning. \section{Related Work} -Chandola et al. \cite{chandola2009} provide a very comprehensive survey on outlier detection in general, not just focused on WSN. They introduce many key concepts and definitions, but focus more on outliers than anomalies in general. +Chandola et al. \cite{chandola2009} provide a very comprehensive survey on anomaly detection in general, not just focused on WSN. They introduce many key concepts and definitions, but focus more on outliers than anomalies in general. +O'Reilly et al. \cite{oreilly2014} look into anomaly detection in WSN in the specific context of non-stationary environments, meaning environments where the ``normal'' state evolves over time, and isn't static. Due to the nature of the problem, almost all approaches presented there had some machine-learning aspects to them, as they needed to first detect when a change of model was required, and then create a new model that conforms to the new data sensed by the network. +McDonald et al. 
\cite{mcdonald2013} survey methods of finding outliers in WSN, with a focus on distributed solutions. They go into a moderate amount of detail on most solutions, but skip over a lot of methods such as principal component analysis, and support vector machines, which were already maturing at that point in time. Instead they only present distance and density based approaches. -McDonald et al. \cite{mcdonald2013} survey methods of finding outliers in WSN, with a focus on distributed solutions. They go into a moderate amount of detail on most solutions, but skip over a lot of methods such as principal component ananlysis, and support vector machines, which were already maturing at that point in time. +Barcelo-Ordinas et al. \cite{barcelo2019} provide a very in-depth reference study for sensor self-calibration; they analyze 39 different approaches in several different categories. This survey is covered further in the section covering sensor self-calibration. +Ramotsoela et al. \cite{ramotsoela2018} survey anomaly detection in industrial settings, where machine learning is preferred due to the observed phenomena being more complex. The survey covers both intrusion detection and outlier detection methods, and compiles a table of 17 different approaches to anomaly detection. They look at six fundamentally different approaches and score them based on accuracy, prior knowledge, complexity and data prediction. They look more closely at k-nearest neighbor models but find similar problems as mentioned in chapter \ref{sec:distance} and \ref{sec:density}. -Ramotsoela et al. \cite{ramotsoela2018} - -Chalapathy et al. \cite{chalapathy2019} - - -Kakanakova et el. \cite{kakanakova2017} - - -Barcelo-Ordinas et al. \cite{barcelo2019} survey self-calibation methods for WSNs, +Further information concerning advanced machine learning models such as Deep Learning techniques is covered by Chalapathy et al. \cite{chalapathy2019} and Kakanakova et al. \cite{kakanakova2017}. 
Both of these surveys do not focus on WSN, but propose methods which are applicable to the general field. @@ -208,8 +203,16 @@ After the update phase, we obtain $\hat{x}_{k|k}$, which is our best approximati Sirisanwannakul et al. takes the computed Kalman gain and compares its bias. In normal operation, the gain is biased towards the measurement. If the sensor malfunctions, the bias is towards the prediction. But if the gains bias is between prediction and measurement, the system assumes sensor drift and corrects automatically. Since this approach lacks a ground truth measurement it cannot recalibrate the sensor, but the paper shows that accumulative error can be reduced by more than 50\%. -\section{Outlier detection - Classical Approaches} -We consider a classical approach to be anything that uses conventional (non-machine learning) models or algorithms to perform outlier detection. This chapter will first look at +\section{Outlier detection} +This chapter will analyze a couple of fundamentally different approaches to outlier detection. The approaches are roughly ordered by age, where newer approaches come last. We will start with basic methods that are used outside of WSN and transition to more specific applications. All approaches covered here are listed in Table~\ref{tbl:comparison} at the end of the survey and analyzed by a couple of key metrics: + +\begin{itemize} + \item \emph{Prior knowledge}: Does an approach require any prior knowledge, for example for constructing models beforehand, or training machine learning models. + \item \emph{Centralized/Decentralized}: Is the outlier detection performed on individual nodes, or at a centralized sink. Some methods work both ways, and some work in a clustered approach. + \item \emph{Required topology}: If an approach requires a static topology, nodes must be stationary. + \item \emph{Communication}: How much communication is required by this approach. 
``Normal'' means about the same as streaming all data to the sink, ``Prohibitive'' means that the approach is not usable and requires some optimization. + \item \emph{Recalibration}: Does the model need recalibration or updates when the environment changes around it. +\end{itemize} \subsection{Statistical Analysis} Classical Statistical analysis is done by creating a statistical model of the expected data and then finding the probability for each recorded data point. Improbable data points are then deemed outliers. The problem for many statistical approaches is finding this model of the expected data, as it is not always feasible to create it in advance, when the nature of the phenomena is not well known in advance, or if the expected data is too complex. It is also not very robust to changes in the environment \cite{mcdonald2013}, requiring frequent updates to the model if the environment changes in ways not forseen by the model. @@ -226,59 +229,68 @@ Since this process not only detects outliers, but does a complete clustering of \label{fig:probdistböhm} \end{figure} -\subsection{Density Based Analysis} -Outliers can be selected by looking at the density of points as well. Breuning et al. \cite{breuning2000} propose a method of calculating a local outlier factor (LOF) of each point based on the local density of its $n$ nearest neighbors. The problem lies in selecting good values for $n$. If $n$ is too small, clusters of outliers might not be detected, while a large $n$ might mark points as outliers, even if they are in a large cluster of less than $n$ points. This problem is further exasperated when we try to use this in a WSN setting, for example by streaming through the last $k$ points, as cluster size will not stay constant as incoming data might be delayed or lost in transit. -Papadimitriou et al. \cite{papadimitriou2003} introduces a parameterless approach. 
They formulate a method using a local correlation integral (LOCI), which does not require parametrization. It uses a multi-granularity deviation factor (MDEF), which is the relative deviation for a point $p$ in a radius $r$. The MDEF is simply the number of nodes in an $r$-neighborhood divided by the sum of all points in the same neighborhood. LOCI provides an automated way to select good parameters for the MDEF and can detect outliers and outlier-clusters with comparable performance to other statistical approaches. They also formulate aLOCI, a linear approximation of LOCI, which also gives accurate results while reducing runtime. This approach can be used centralized, decentralized or clustered, depending on the scale of the event of interest. aLOCI seems great for even running on the sensor nodes itself, as it has relatively low computational complexity. +\subsection{Distance Based Analysis} \label{sec:distance} +An older solution to finding outliers in data is the distance based approach; it assigns an anomaly score to each data point, based on the distance to its $k$ nearest neighbors \cite{zhang2006detecting}. This approach however fails at detecting outliers in a system with two or more clusters that do not have the same density. Figure \ref{fig:densityproblem} shows two clusters $C_1$, $C_2$ with varying density. The point $p_1$ will either be incorrectly identified as a non-outlier, or the whole set of $C_1$ will be identified as outliers together with $p_1$. +\begin{figure} + \includegraphics[width=6.5cm]{img/density-problem.png} + \caption{Two sets of clusters $C_1$ and $C_2$, and two outliers $p_1$ and $p_2$. Image from Chandola et al. \cite{chandola2009}.} + \label{fig:densityproblem} +\end{figure} -\subsection{Distance Based Approaches} +\subsection{Density Based Analysis} \label{sec:density} +Outliers can be selected by looking at the density of points as well. If done correctly, the problem described above can be prevented. Breunig et al. 
\cite{breuning2000} propose a method of calculating a local outlier factor (LOF) of each point based on the local density of its $n$ nearest neighbors. The problem lies in selecting good values for $n$. If $n$ is too small, clusters of outliers might not be detected, while a large $n$ might mark points as outliers, even if they are in a large cluster of less than $n$ points. This problem is further exacerbated when we try to use this in a WSN setting, for example by streaming through the last $k$ points, as cluster size will not stay constant when incoming data is delayed or lost in transit. + + +Papadimitriou et al. \cite{papadimitriou2003} introduce a parameterless approach. They formulate a method using a local correlation integral (LOCI), which does not require parametrization. It uses a multi-granularity deviation factor (MDEF), which is the relative deviation for a point $p$ in a radius $r$. The MDEF is simply the number of nodes in an $r$-neighborhood divided by the sum of all points in the same neighborhood. LOCI provides an automated way to select good parameters for the MDEF and can detect outliers and outlier-clusters with comparable performance to other statistical approaches. They also formulate aLOCI, a linear approximation of LOCI, which also gives accurate results while reducing runtime. This approach can be used centralized, decentralized or clustered, depending on the scale of the event of interest. aLOCI seems great for even running on the sensor nodes themselves, as it has relatively low computational complexity. \subsection{Principal Component Analysis} +Another way of detecting outliers is by computing the Principal Component Analysis (PCA) of the collected data. This way one can find the variance of the collected data in each axis. If a measured data point is far outside the expected variance ranges, it can be flagged as anomalous. 
PCA can also be used to reduce the number of dimensions a set of data contains while minimizing the loss of meaningful information. + \begin{figure*}[ht] \includegraphics[width=0.8\textwidth]{img/PCA.png} - \caption{An Example of reducing a 3 Dimensional dataset to two dimensions using PCA to minimize loss of information. The PCA Vectors are marked red.} + \caption{An example of reducing a 3 dimensional dataset to two dimensions using PCA to minimize loss of information. PCA Vectors are marked red.} \label{fig:pca} \end{figure*} -Principal components of a point cloud in $\R^n$ are $n$ vectors $p_i$, where $p_i$ defines a line with minimal average square distance to the point cloud while lying orthogonal to all $p_j, j